Merge remote branch 'origin/master' into pipe-video
author: Christian König <deathsimple@vodafone.de>
Sat, 8 Jan 2011 12:24:36 +0000 (13:24 +0100)
committer: Christian König <deathsimple@vodafone.de>
Sat, 8 Jan 2011 12:24:36 +0000 (13:24 +0100)
Conflicts:
configure.ac
src/gallium/drivers/r600/eg_asm.c
src/gallium/drivers/r600/r600_asm.c
src/gallium/drivers/r600/r600_asm.h
src/gallium/include/pipe/p_format.h
src/gallium/targets/dri-nouveau/Makefile

13 files changed:
1  2 
configure.ac
src/gallium/auxiliary/util/u_format.csv
src/gallium/drivers/nvfx/nvfx_screen.c
src/gallium/drivers/r600/eg_asm.c
src/gallium/drivers/r600/r600_asm.c
src/gallium/drivers/r600/r600_asm.h
src/gallium/drivers/r600/r600_pipe.c
src/gallium/drivers/r600/r600_shader.c
src/gallium/drivers/r600/r600_sq.h
src/gallium/drivers/r600/r600_state.c
src/gallium/drivers/r600/r600_texture.c
src/gallium/include/pipe/p_format.h
src/gallium/targets/dri-nouveau/Makefile

diff --cc configure.ac
@@@ -1700,27 -1683,8 +1693,27 @@@ AC_ARG_ENABLE([gallium-nouveau]
      [enable_gallium_nouveau="$enableval"],
      [enable_gallium_nouveau=no])
  if test "x$enable_gallium_nouveau" = xyes; then
-     GALLIUM_DRIVERS_DIRS="$GALLIUM_DRIVERS_DIRS nouveau nvfx nv50"
+     GALLIUM_DRIVERS_DIRS="$GALLIUM_DRIVERS_DIRS nouveau nvfx nv50 nvc0"
 -    gallium_check_st "nouveau/drm" "dri-nouveau" "xorg-nouveau"
 +    gallium_check_st "nouveau/drm" "dri-nouveau" "xorg-nouveau" "xvmc-nouveau"
 +fi
 +
 +dnl
 +dnl Gallium G3DVL configuration
 +dnl
 +AC_ARG_ENABLE([gallium-g3dvl],
 +    [AS_HELP_STRING([--enable-gallium-g3dvl],
 +        [build gallium g3dvl @<:@default=disabled@:>@])],
 +    [enable_gallium_g3dvl="$enableval"],
 +    [enable_gallium_g3dvl=no])
 +if test "x$enable_gallium_g3dvl" = xyes; then
 +    case "$mesa_driver" in
 +    xlib)
 +        GALLIUM_TARGET_DIRS="$GALLIUM_TARGET_DIRS xvmc-softpipe"
 +        ;;
 +    dri)
 +        GALLIUM_WINSYS_DIRS="$GALLIUM_WINSYS_DIRS g3dvl/dri"
 +        ;;
 +    esac
  fi
  
  dnl
diff --cc src/gallium/drivers/r600/eg_asm.c
@@@ -37,17 -36,16 +37,17 @@@ int eg_bc_cf_build(struct r600_bc *bc, 
        switch (cf->inst) {
        case (EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
        case (EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
 +              assert(!end_of_program);
                bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
-                       S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache0_mode) |
-                       S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache0_bank) |
-                       S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache1_bank);
+                       S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
+                       S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
+                       S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache[1].bank);
                bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(cf->inst >> 3) |
-                       S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache1_mode) |
-                       S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache0_addr) |
-                       S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache1_addr) |
-                                       S_SQ_CF_ALU_WORD1_BARRIER(cf->barrier) |
-                                       S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
+                       S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache[1].mode) |
+                       S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache[0].addr) |
+                       S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache[1].addr) |
 -                                      S_SQ_CF_ALU_WORD1_BARRIER(1) |
 -                                      S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
++                      S_SQ_CF_ALU_WORD1_BARRIER(cf->barrier) |
++                      S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
                break;
        case EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX:
        case EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX:
diff --cc src/gallium/drivers/r600/r600_asm.c
@@@ -441,111 -239,110 +444,122 @@@ static int reserve_gpr(struct alu_bank_
        return 0;
  }
  
 -static int cycle_for_scalar_bank_swizzle(const int swiz, const int sel, unsigned *p_cycle)
 -{
 -      int table[3];
 -      int ret = 0;
 -      switch (swiz) {
 -      case SQ_ALU_SCL_210:
 -              table[0] = 2; table[1] = 1; table[2] = 0;
 -                *p_cycle = table[sel];
 -                break;
 -      case SQ_ALU_SCL_122:
 -              table[0] = 1; table[1] = 2; table[2] = 2;
 -                *p_cycle = table[sel];
 -                break;
 -      case SQ_ALU_SCL_212:
 -              table[0] = 2; table[1] = 1; table[2] = 2;
 -                *p_cycle = table[sel];
 -                break;
 -      case SQ_ALU_SCL_221:
 -              table[0] = 2; table[1] = 2; table[2] = 1;
 -              *p_cycle = table[sel];
 -                break;
 -              break;
 -      default:
 -              R600_ERR("bad scalar bank swizzle value\n");
 -              ret = -1;
 -              break;
 +static int reserve_cfile(struct alu_bank_swizzle *bs, unsigned sel, unsigned chan)
 +{
 +      int res, resmatch = -1, resempty = -1;
 +      for (res = 3; res >= 0; --res) {
 +              if (bs->hw_cfile_addr[res] == -1)
 +                      resempty = res;
 +              else if (bs->hw_cfile_addr[res] == sel &&
 +                      bs->hw_cfile_elem[res] == chan)
 +                      resmatch = res;
        }
 -      return ret;
 -}
 -
 -static int cycle_for_vector_bank_swizzle(const int swiz, const int sel, unsigned *p_cycle)
 -{
 -      int table[3];
 -      int ret;
 -
 -      switch (swiz) {
 -      case SQ_ALU_VEC_012:
 -              table[0] = 0; table[1] = 1; table[2] = 2;
 -                *p_cycle = table[sel];
 -                break;
 -      case SQ_ALU_VEC_021:
 -              table[0] = 0; table[1] = 2; table[2] = 1;
 -                *p_cycle = table[sel];
 -                break;
 -      case SQ_ALU_VEC_120:
 -              table[0] = 1; table[1] = 2; table[2] = 0;
 -                *p_cycle = table[sel];
 -                break;
 -      case SQ_ALU_VEC_102:
 -              table[0] = 1; table[1] = 0; table[2] = 2;
 -                *p_cycle = table[sel];
 -                break;
 -      case SQ_ALU_VEC_201:
 -              table[0] = 2; table[1] = 0; table[2] = 1;
 -                *p_cycle = table[sel];
 -                break;
 -      case SQ_ALU_VEC_210:
 -              table[0] = 2; table[1] = 1; table[2] = 0;
 -                *p_cycle = table[sel];
 -                break;
 -      default:
 -              R600_ERR("bad vector bank swizzle value\n");
 -              ret = -1;
 -              break;
 +      if (resmatch != -1)
 +              return 0; // Read for this scalar element already reserved, nothing to do here.
 +      else if (resempty != -1) {
 +              bs->hw_cfile_addr[resempty] = sel;
 +              bs->hw_cfile_elem[resempty] = chan;
 +      } else {
 +              // All cfile read ports are used, cannot reference vector element
 +              return -1;
        }
 -      return ret;
 +      return 0;
  }
  
 +static int is_gpr(unsigned sel)
 +{
 +      return (sel >= 0 && sel <= 127);
 +}
  
 +static int is_cfile(unsigned sel)
 +{
 +      return (sel > 255 && sel < 512);
 +}
  
 -static void update_chan_counter(struct r600_bc_alu *alu, int *chan_counter)
++/* CB constants start at 512, and get translated to a kcache index when ALU
++ * clauses are constructed. Note that we handle kcache constants the same way
++ * as (the now gone) cfile constants, is that really required? */
++static int is_cb_const(int sel)
+ {
 -      int num_src;
 -      int i;
 -      int channel_swizzle;
++      if (sel > 511 && sel < 4607)
++              return 1;
++      return 0;
++}
 -      num_src = r600_bc_get_num_operands(alu);
 +static int is_const(int sel)
 +{
 +      return is_cfile(sel) ||
++              is_cb_const(sel) ||
 +              (sel >= V_SQ_ALU_SRC_0 &&
 +              sel <= V_SQ_ALU_SRC_LITERAL);
 +}
 +
 +static int check_vector(struct r600_bc_alu *alu, struct alu_bank_swizzle *bs, int bank_swizzle)
 +{
 +      int r, src, num_src, sel, elem, cycle;
  
 -      for (i = 0; i < num_src; i++) {
 -              channel_swizzle = alu->src[i].chan;
 -              if ((alu->src[i].sel > 0 && alu->src[i].sel < 128) && channel_swizzle <= 3)
 -                      chan_counter[channel_swizzle]++;
 +      num_src = r600_bc_get_num_operands(alu);
 +      for (src = 0; src < num_src; src++) {
 +              sel = alu->src[src].sel;
 +              elem = alu->src[src].chan;
 +              if (is_gpr(sel)) {
 +                      cycle = cycle_for_bank_swizzle_vec[bank_swizzle][src];
 +                      if (src == 1 && sel == alu->src[0].sel && elem == alu->src[0].chan)
 +                              // Nothing to do; special-case optimization,
 +                              // second source uses first source’s reservation
 +                              continue;
 +                      else {
 +                              r = reserve_gpr(bs, sel, elem, cycle);
 +                              if (r)
 +                                      return r;
 +                      }
 +              } else if (is_cfile(sel)) {
 +                      r = reserve_cfile(bs, sel, elem);
 +                      if (r)
 +                              return r;
 +              }
 +              // No restrictions on PV, PS, literal or special constants
        }
 +      return 0;
  }
  
 -/* we need something like this I think - but this is bogus */
 -int check_read_slots(struct r600_bc *bc, struct r600_bc_alu *alu_first)
 +static int check_scalar(struct r600_bc_alu *alu, struct alu_bank_swizzle *bs, int bank_swizzle)
  {
 -      struct r600_bc_alu *alu;
 -      int chan_counter[4]  = { 0 };
 -
 -      update_chan_counter(alu_first, chan_counter);
 +      int r, src, num_src, const_count, sel, elem, cycle;
  
 -      LIST_FOR_EACH_ENTRY(alu, &alu_first->bs_list, bs_list) {
 -              update_chan_counter(alu, chan_counter);
 +      num_src = r600_bc_get_num_operands(alu);
 +      for (const_count = 0, src = 0; src < num_src; ++src) {
 +              sel = alu->src[src].sel;
 +              elem = alu->src[src].chan;
 +              if (is_const(sel)) { // Any constant, including literal and inline constants
 +                      if (const_count >= 2)
 +                              // More than two references to a constant in
 +                              // transcendental operation.
 +                              return -1;
 +                      else
 +                              const_count++;
 +              }
 +              if (is_cfile(sel)) {
 +                      r = reserve_cfile(bs, sel, elem);
 +                      if (r)
 +                              return r;
 +              }
        }
 -
 -      if (chan_counter[0] > 3 ||
 -          chan_counter[1] > 3 ||
 -          chan_counter[2] > 3 ||
 -          chan_counter[3] > 3) {
 -              R600_ERR("needed to split instruction for input ran out of banks %x %d %d %d %d\n",
 -                       alu_first->inst, chan_counter[0], chan_counter[1], chan_counter[2], chan_counter[3]);
 -              return -1;
 +      for (src = 0; src < num_src; ++src) {
 +              sel = alu->src[src].sel;
 +              elem = alu->src[src].chan;
 +              if (is_gpr(sel)) {
 +                      cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
 +                      if (cycle < const_count)
 +                              // Cycle for GPR load conflicts with
 +                              // constant load in transcendental operation.
 +                              return -1;
 +                      r = reserve_gpr(bs, sel, elem, cycle);
 +                      if (r)
 +                              return r;
 +              }
 +              // Constants already processed
 +              // No restrictions on PV, PS
        }
        return 0;
  }
@@@ -868,55 -547,58 +996,61 @@@ int r600_bc_add_alu_type(struct r600_b
                        free(nalu);
                        return r;
                }
 -              bc->cf_last->inst = (type << 3);
        }
 +      bc->cf_last->inst = (type << 3);
+       /* Setup the kcache for this ALU instruction. This will start a new
+        * ALU clause if needed. */
+       if ((r = r600_bc_alloc_kcache_lines(bc, nalu, type))) {
+               free(nalu);
+               return r;
+       }
        if (!bc->cf_last->curr_bs_head) {
                bc->cf_last->curr_bs_head = nalu;
 -              LIST_INITHEAD(&nalu->bs_list);
 -      } else {
 -              LIST_ADDTAIL(&nalu->bs_list, &bc->cf_last->curr_bs_head->bs_list);
        }
 -      /* at most 128 slots, one add alu can add 4 slots + 4 constants(2 slots)
 +      /* at most 128 slots, one add alu can add 5 slots + 4 constants(2 slots)
         * worst case */
-       if (alu->last && (bc->cf_last->ndw >> 1) >= 120) {
+       if (nalu->last && (bc->cf_last->ndw >> 1) >= 120) {
                bc->force_add_cf = 1;
        }
 -      /* number of gpr == the last gpr used in any alu */
 +      /* replace special constants */
        for (i = 0; i < 3; i++) {
 -              if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 128) {
 -                      bc->ngpr = nalu->src[i].sel + 1;
 -              }
 -              /* compute how many literal are needed
 -               * either 2 or 4 literals
 -               */
 -              if (nalu->src[i].sel == 253) {
 -                      if (((nalu->src[i].chan + 2) & 0x6) > nalu->nliteral) {
 -                              nalu->nliteral = (nalu->src[i].chan + 2) & 0x6;
 -                      }
 -              }
 -      }
 -      if (!LIST_IS_EMPTY(&bc->cf_last->alu)) {
 -              lalu = LIST_ENTRY(struct r600_bc_alu, bc->cf_last->alu.prev, list);
 -              if (!lalu->last && lalu->nliteral > nalu->nliteral) {
 -                      nalu->nliteral = lalu->nliteral;
 -              }
 -      }
 -      if (nalu->dst.sel >= bc->ngpr) {
 -              bc->ngpr = nalu->dst.sel + 1;
 +              if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL)
 +                      r600_bc_special_constants(
 +                              nalu->src[i].value[nalu->src[i].chan],
 +                              &nalu->src[i].sel, &nalu->src[i].neg);
        }
        LIST_ADDTAIL(&nalu->list, &bc->cf_last->alu);
        /* each alu use 2 dwords */
        bc->cf_last->ndw += 2;
        bc->ndw += 2;
  
-       bc->cf_last->kcache0_mode = 2;
        /* process cur ALU instructions for bank swizzle */
-       if (alu->last) {
+       if (nalu->last) {
 -              check_and_set_bank_swizzle(bc, bc->cf_last->curr_bs_head);
 +              struct r600_bc_alu *slots[5];
 +              r = assign_alu_units(bc->cf_last->curr_bs_head, slots);
 +              if (r)
 +                      return r;
 +
 +              if (bc->cf_last->prev_bs_head) {
 +                      r = merge_inst_groups(bc, slots, bc->cf_last->prev_bs_head);
 +                      if (r)
 +                              return r;
 +              }
 +
 +              if (bc->cf_last->prev_bs_head) {
 +                      r = replace_gpr_with_pv_ps(slots, bc->cf_last->prev_bs_head);
 +                      if (r)
 +                              return r;
 +              }
 +
 +              r = check_and_set_bank_swizzle(slots);
 +              if (r)
 +                      return r;
 +
 +              bc->cf_last->prev2_bs_head = bc->cf_last->prev_bs_head;
 +              bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head;
                bc->cf_last->curr_bs_head = NULL;
        }
        return 0;
@@@ -1170,841 -908,20 +1304,841 @@@ static enum cf_class get_cf_class(struc
                R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
                return -EINVAL;
        }
 -      return 0;
  }
  
 -int r600_bc_build(struct r600_bc *bc)
 +/* common for r600/r700 - eg in eg_asm.c */
 +static int r600_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf)
  {
 -      struct r600_bc_cf *cf;
 -      struct r600_bc_alu *alu;
 -      struct r600_bc_vtx *vtx;
 -      struct r600_bc_tex *tex;
 -      unsigned addr;
 -      int r;
 +      unsigned id = cf->id;
 +      unsigned end_of_program = bc->cf.prev == &cf->list;
  
 -      if (bc->callstack[0].max > 0)
 -              bc->nstack = ((bc->callstack[0].max + 3) >> 2) + 2;
 +      switch (get_cf_class(cf)) {
 +      case CF_CLASS_ALU:
 +              assert(!end_of_program);
 +              bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
-                       S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache0_mode) |
-                       S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache0_bank) |
-                       S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache1_bank);
++                      S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
++                      S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
++                      S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache[1].bank);
 +
 +              bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(cf->inst >> 3) |
-                       S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache1_mode) |
-                       S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache0_addr) |
-                       S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache1_addr) |
++                      S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache[1].mode) |
++                      S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache[0].addr) |
++                      S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache[1].addr) |
 +                      S_SQ_CF_ALU_WORD1_BARRIER(cf->barrier) |
 +                      S_SQ_CF_ALU_WORD1_USES_WATERFALL(bc->chiprev == CHIPREV_R600 ? cf->r6xx_uses_waterfall : 0) |
 +                      S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
 +              break;
 +      case CF_CLASS_TEXTURE:
 +      case CF_CLASS_VERTEX:
 +              bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->addr >> 1);
 +              bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(cf->inst) |
 +                      S_SQ_CF_WORD1_BARRIER(cf->barrier) |
 +                      S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1) |
 +                      S_SQ_CF_WORD1_END_OF_PROGRAM(end_of_program);
 +              break;
 +      case CF_CLASS_EXPORT:
 +              bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
 +                      S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
 +                      S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
 +                      S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
 +              bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
 +                      S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(cf->output.swizzle_x) |
 +                      S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(cf->output.swizzle_y) |
 +                      S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(cf->output.swizzle_z) |
 +                      S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(cf->output.swizzle_w) |
 +                      S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->barrier) |
 +                      S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) |
 +                      S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(end_of_program);
 +              break;
 +      case CF_CLASS_OTHER:
 +              bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1);
 +              bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(cf->inst) |
 +                      S_SQ_CF_WORD1_BARRIER(cf->barrier) |
 +                      S_SQ_CF_WORD1_COND(cf->cond) |
 +                      S_SQ_CF_WORD1_POP_COUNT(cf->pop_count) |
 +                      S_SQ_CF_WORD1_END_OF_PROGRAM(end_of_program);
 +
 +              break;
 +      default:
 +              R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
 +              return -EINVAL;
 +      }
 +      return 0;
 +}
 +
 +struct gpr_usage_range {
 +      int     replacement;
 +      int32_t start;
 +      int32_t end;
 +};
 +
 +struct gpr_usage {
 +      unsigned                channels:4;
 +      int32_t                 first_write;
 +      int32_t                 last_write[4];
 +      unsigned                nranges;
 +      struct gpr_usage_range  *ranges;
 +};
 +
 +static struct gpr_usage_range* add_gpr_usage_range(struct gpr_usage *usage)
 +{
 +      usage->nranges++;
 +      usage->ranges = realloc(usage->ranges, usage->nranges * sizeof(struct gpr_usage_range));
 +      if (!usage->ranges)
 +              return NULL;
 +      return &usage->ranges[usage->nranges-1];
 +}
 +
 +static void notice_gpr_read(struct gpr_usage *usage, int32_t id, unsigned chan)
 +{
 +        usage->channels |= 1 << chan;
 +        usage->first_write = -1;
 +        if (!usage->nranges) {
 +              struct gpr_usage_range* range = add_gpr_usage_range(usage);
 +              range->replacement = -1;
 +                range->start = -1;
 +                range->end = -1;
 +        }
 +        if (usage->ranges[usage->nranges-1].end < id)
 +              usage->ranges[usage->nranges-1].end = id;
 +}
 +
 +static void notice_gpr_rel_read(struct gpr_usage usage[128], int32_t id, unsigned chan)
 +{
 +      unsigned i;
 +      for (i = 0; i < 128; ++i)
 +              notice_gpr_read(&usage[i], id, chan);
 +}
 +
 +static void notice_gpr_last_write(struct gpr_usage *usage, int32_t id, unsigned chan)
 +{
 +        usage->last_write[chan] = id;
 +}
 +
 +static void notice_gpr_write(struct gpr_usage *usage, int32_t id, unsigned chan,
 +                              int predicate, int prefered_replacement)
 +{
 +      int32_t start = usage->first_write != -1 ? usage->first_write : id;
 +      usage->channels &= ~(1 << chan);
 +      if (usage->channels) {
 +              if (usage->first_write == -1)
 +                      usage->first_write = id;
 +      } else if (!usage->nranges || (usage->ranges[usage->nranges-1].start != start && !predicate)) {
 +              usage->first_write = start;
 +              struct gpr_usage_range* range = add_gpr_usage_range(usage);
 +              range->replacement = prefered_replacement;
 +                range->start = start;
 +                range->end = -1;
 +        } else if (usage->ranges[usage->nranges-1].start == start && prefered_replacement != -1) {
 +              usage->ranges[usage->nranges-1].replacement = prefered_replacement;
 +        }
 +        notice_gpr_last_write(usage, id, chan);
 +}
 +
 +static void notice_gpr_rel_last_write(struct gpr_usage usage[128], int32_t id, unsigned chan)
 +{
 +      unsigned i;
 +      for (i = 0; i < 128; ++i)
 +              notice_gpr_last_write(&usage[i], id, chan);
 +}
 +
 +static void notice_gpr_rel_write(struct gpr_usage usage[128], int32_t id, unsigned chan)
 +{
 +      unsigned i;
 +      for (i = 0; i < 128; ++i)
 +              notice_gpr_write(&usage[i], id, chan, 1, -1);
 +}
 +
 +static void notice_alu_src_gprs(struct r600_bc_alu *alu, struct gpr_usage usage[128], int32_t id)
 +{
 +      unsigned src, num_src;
 +
 +      num_src = r600_bc_get_num_operands(alu);
 +      for (src = 0; src < num_src; ++src) {
 +              // constants doesn't matter
 +              if (!is_gpr(alu->src[src].sel))
 +                      continue;
 +
 +              if (alu->src[src].rel)
 +                      notice_gpr_rel_read(usage, id, alu->src[src].chan);
 +              else
 +                      notice_gpr_read(&usage[alu->src[src].sel], id, alu->src[src].chan);
 +      }
 +}
 +
 +static void notice_alu_dst_gprs(struct r600_bc_alu *alu_first, struct gpr_usage usage[128],
 +                              int32_t id, int predicate)
 +{
 +      struct r600_bc_alu *alu;
 +      for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bc_alu, alu->list.next, list)) {
 +              if (alu->dst.write) {
 +                      if (alu->dst.rel)
 +                              notice_gpr_rel_write(usage, id, alu->dst.chan);
 +                      else if (alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV && is_gpr(alu->src[0].sel))
 +                              notice_gpr_write(&usage[alu->dst.sel], id, alu->dst.chan,
 +                                              predicate, alu->src[0].sel);
 +                      else
 +                              notice_gpr_write(&usage[alu->dst.sel], id, alu->dst.chan, predicate, -1);
 +              }
 +
 +              if (alu->last)
 +                      break;
 +      }
 +}
 +
 +static void notice_tex_gprs(struct r600_bc_tex *tex, struct gpr_usage usage[128],
 +                              int32_t id, int predicate)
 +{
 +      if (tex->src_rel) {
 +                if (tex->src_sel_x < 4)
 +                      notice_gpr_rel_read(usage, id, tex->src_sel_x);
 +              if (tex->src_sel_y < 4)
 +                      notice_gpr_rel_read(usage, id, tex->src_sel_y);
 +              if (tex->src_sel_z < 4)
 +                      notice_gpr_rel_read(usage, id, tex->src_sel_z);
 +              if (tex->src_sel_w < 4)
 +                      notice_gpr_rel_read(usage, id, tex->src_sel_w);
 +        } else {
 +              if (tex->src_sel_x < 4)
 +                      notice_gpr_read(&usage[tex->src_gpr], id, tex->src_sel_x);
 +              if (tex->src_sel_y < 4)
 +                      notice_gpr_read(&usage[tex->src_gpr], id, tex->src_sel_y);
 +              if (tex->src_sel_z < 4)
 +                      notice_gpr_read(&usage[tex->src_gpr], id, tex->src_sel_z);
 +              if (tex->src_sel_w < 4)
 +                      notice_gpr_read(&usage[tex->src_gpr], id, tex->src_sel_w);
 +      }
 +      if (tex->dst_rel) {
 +              if (tex->dst_sel_x != 7)
 +                      notice_gpr_rel_write(usage, id, 0);
 +              if (tex->dst_sel_y != 7)
 +                      notice_gpr_rel_write(usage, id, 1);
 +              if (tex->dst_sel_z != 7)
 +                      notice_gpr_rel_write(usage, id, 2);
 +              if (tex->dst_sel_w != 7)
 +                      notice_gpr_rel_write(usage, id, 3);
 +      } else {
 +              if (tex->dst_sel_x != 7)
 +                      notice_gpr_write(&usage[tex->dst_gpr], id, 0, predicate, -1);
 +              if (tex->dst_sel_y != 7)
 +                      notice_gpr_write(&usage[tex->dst_gpr], id, 1, predicate, -1);
 +              if (tex->dst_sel_z != 7)
 +                      notice_gpr_write(&usage[tex->dst_gpr], id, 2, predicate, -1);
 +              if (tex->dst_sel_w != 7)
 +                      notice_gpr_write(&usage[tex->dst_gpr], id, 3, predicate, -1);
 +      }
 +}
 +
 +static void notice_vtx_gprs(struct r600_bc_vtx *vtx, struct gpr_usage usage[128],
 +                              int32_t id, int predicate)
 +{
 +      notice_gpr_read(&usage[vtx->src_gpr], id, vtx->src_sel_x);
 +
 +      if (vtx->dst_sel_x != 7)
 +              notice_gpr_write(&usage[vtx->dst_gpr], id, 0, predicate, -1);
 +      if (vtx->dst_sel_y != 7)
 +              notice_gpr_write(&usage[vtx->dst_gpr], id, 1, predicate, -1);
 +      if (vtx->dst_sel_z != 7)
 +              notice_gpr_write(&usage[vtx->dst_gpr], id, 2, predicate, -1);
 +      if (vtx->dst_sel_w != 7)
 +              notice_gpr_write(&usage[vtx->dst_gpr], id, 3, predicate, -1);
 +}
 +
 +static void notice_export_gprs(struct r600_bc_cf *cf, struct gpr_usage usage[128],
 +                              struct r600_bc_cf *export_cf[128], int32_t export_remap[128])
 +{
 +      //TODO handle other memory operations
 +      struct gpr_usage *output = &usage[cf->output.gpr];
 +      int32_t id = (output->last_write[0] + 0x100) & ~0xFF;
 +
 +      export_cf[cf->output.gpr] = cf;
 +      export_remap[cf->output.gpr] = id;
 +      if (cf->output.swizzle_x < 4)
 +              notice_gpr_read(output, id, cf->output.swizzle_x);
 +      if (cf->output.swizzle_y < 4)
 +              notice_gpr_read(output, id, cf->output.swizzle_y);
 +      if (cf->output.swizzle_z < 4)
 +              notice_gpr_read(output, id, cf->output.swizzle_z);
 +      if (cf->output.swizzle_w < 4)
 +              notice_gpr_read(output, id, cf->output.swizzle_w);
 +}
 +
 +static struct gpr_usage_range *find_src_range(struct gpr_usage *usage, int32_t id)
 +{
 +      unsigned i;
 +      for (i = 0; i < usage->nranges; ++i) {
 +              struct gpr_usage_range* range = &usage->ranges[i];
 +
 +              if (range->start < id && id <= range->end)
 +                      return range;
 +      }
 +      return NULL;
 +}
 +
 +static struct gpr_usage_range *find_dst_range(struct gpr_usage *usage, int32_t id)
 +{
 +      unsigned i;
 +      for (i = 0; i < usage->nranges; ++i) {
 +              struct gpr_usage_range* range = &usage->ranges[i];
 +              int32_t end = range->end;
 +
 +              if (range->start <= id && (id < end || end == -1))
 +                      return range;
 +      }
 +      assert(0); /* should not happen */
 +      return NULL;
 +}
 +
 +static int is_barrier_needed(struct gpr_usage *usage, int32_t id, unsigned chan, int32_t last_barrier)
 +{
 +      if (usage->last_write[chan] != (id & ~0xFF))
 +              return usage->last_write[chan] >= last_barrier;
 +      else
 +              return 0;
 +}
 +
 +static int is_intersection(struct gpr_usage_range* a, struct gpr_usage_range* b)
 +{
 +      return a->start <= b->end && b->start < a->end;
 +}
 +
 +static int rate_replacement(struct gpr_usage *usage, struct gpr_usage_range* range)
 +{
 +      unsigned i;
 +      int32_t best_start = 0x3FFFFFFF, best_end = 0x3FFFFFFF;
 +
 +      for (i = 0; i < usage->nranges; ++i) {
 +              if (usage->ranges[i].replacement != -1)
 +                      continue; /* ignore already remapped ranges */
 +
 +              if (is_intersection(&usage->ranges[i], range))
 +                      return -1; /* forget it if usages overlap */
 +
 +              if (range->start >= usage->ranges[i].end)
 +                      best_start = MIN2(best_start, range->start - usage->ranges[i].end);
 +
 +              if (range->end != -1 && range->end <= usage->ranges[i].start)
 +                      best_end = MIN2(best_end, usage->ranges[i].start - range->end);
 +      }
 +      return best_start + best_end;
 +}
 +
 +/*
 + * Choose a replacement register for one usage range of GPR `current`.
 + *
 + * Candidates are rated with rate_replacement(); -1 means the candidate's
 + * ranges overlap with `range`, otherwise the lowest rate wins.
 + * Candidates are tried in this order:
 + *   1. the hint already stored in range->replacement,
 + *   2. registers 127..124, if the range starts and ends inside the same
 + *      0x100-aligned block of ids,
 + *   3. registers 0..current-1.
 + *
 + * The chosen register (or -1 if none fits) is written to
 + * range->replacement. On success the range is also added to the chosen
 + * register's usage so that later calls see it as occupied.
 + */
 +static void find_replacement(struct gpr_usage usage[128], unsigned current,
 +                              struct gpr_usage_range *range, int is_export)
 +{
 +      unsigned i;
 +      int best_gpr = -1, best_rate = 0x7FFFFFFF;
 +
 +      /* the hinted register may itself have been replaced, follow that */
 +      if (range->replacement != -1 && range->replacement <= current) {
 +              struct gpr_usage_range *other = find_src_range(&usage[range->replacement], range->start);
 +              if (other && other->replacement != -1)
 +                      range->replacement = other->replacement;
 +      }
 +
 +      if (range->replacement != -1 && range->replacement < current) {
 +              int rate = rate_replacement(&usage[range->replacement], range);
 +
 +              /* check if preferred replacement can be used */
 +              if (rate != -1) {
 +                      best_rate = rate;
 +                      best_gpr = range->replacement;
 +              }
 +      }
 +
 +      if (best_gpr == -1 && (range->start & ~0xFF) == (range->end & ~0xFF)) {
 +              /* register is just used inside one ALU clause */
 +              /* try to use clause temporaries for it */
 +              for (i = 127; i > 123; --i) {
 +                      int rate = rate_replacement(&usage[i], range);
 +
 +                      if (rate == -1) /* can't be used because ranges overlap */
 +                              continue;
 +
 +                      if (rate < best_rate) {
 +                              best_rate = rate;
 +                              best_gpr = i;
 +
 +                              /* can't get better than this */
 +                              /* (for exports the first fitting temporary is taken) */
 +                              if (rate == 0 || is_export)
 +                                      break;
 +                      }
 +              }
 +      }
 +
 +      if (best_gpr == -1) {
 +              /* fall back to any register below the current one */
 +              for (i = 0; i < current; ++i) {
 +                      int rate = rate_replacement(&usage[i], range);
 +
 +                      if (rate == -1) /* can't be used because ranges overlap */
 +                              continue;
 +
 +                      if (rate < best_rate) {
 +                              best_rate = rate;
 +                              best_gpr = i;
 +
 +                              /* can't get better than this */
 +                              if (rate == 0)
 +                                      break;
 +                      }
 +              }
 +      }
 +
 +      range->replacement = best_gpr;
 +      if (best_gpr != -1) {
 +              /* mark the range as occupied on the replacement register */
 +              struct gpr_usage_range *reservation = add_gpr_usage_range(&usage[best_gpr]);
 +              reservation->replacement = -1;
 +              reservation->start = range->start;
 +              reservation->end = range->end;
 +      }
 +}
 +
 +static void find_export_replacement(struct gpr_usage usage[128],
 +                              struct gpr_usage_range *range, struct r600_bc_cf *current,
 +                              struct r600_bc_cf *next, int32_t next_id)
 +{
 +      if (!next || next_id <= range->start || next_id > range->end)
 +              return;
 +
 +      if (current->output.type != next->output.type)
 +              return;
 +
 +      if ((current->output.array_base + 1) != next->output.array_base)
 +              return;
 +
 +      find_src_range(&usage[next->output.gpr], next_id)->replacement = range->replacement + 1;
 +}
 +
 +static void replace_alu_gprs(struct r600_bc_alu *alu, struct gpr_usage usage[128],
 +                              int32_t id, int32_t last_barrier, unsigned *barrier)
 +{
 +      struct gpr_usage *cur_usage;
 +      struct gpr_usage_range *range;
 +      unsigned src, num_src;
 +
 +      num_src = r600_bc_get_num_operands(alu);
 +      for (src = 0; src < num_src; ++src) {
 +              // constants doesn't matter
 +              if (!is_gpr(alu->src[src].sel))
 +                      continue;
 +
 +              cur_usage = &usage[alu->src[src].sel];
 +              range = find_src_range(cur_usage, id);
 +              if (range->replacement != -1)
 +                      alu->src[src].sel = range->replacement;
 +
 +              *barrier |= is_barrier_needed(cur_usage, id, alu->src[src].chan, last_barrier);
 +      }
 +
 +      if (alu->dst.write) {
 +              cur_usage = &usage[alu->dst.sel];
 +              range = find_dst_range(cur_usage, id);
 +              if (range->replacement == alu->dst.sel) {
 +                      if (!alu->is_op3)
 +                              alu->dst.write = 0;
 +                      else
 +                              /*TODO: really check that register 123 is useable */
 +                              alu->dst.sel = 123;
 +              } else if (range->replacement != -1) {
 +                      alu->dst.sel = range->replacement;
 +              }
 +              if (alu->dst.rel)
 +                      notice_gpr_rel_last_write(usage, id, alu->dst.chan);
 +              else
 +                      notice_gpr_last_write(cur_usage, id, alu->dst.chan);
 +      }
 +}
 +
 +static void replace_tex_gprs(struct r600_bc_tex *tex, struct gpr_usage usage[128],
 +                              int32_t id, int32_t last_barrier, unsigned *barrier)
 +{
 +      struct gpr_usage *cur_usage = &usage[tex->src_gpr];
 +      struct gpr_usage_range *range = find_src_range(cur_usage, id);
 +
 +      if (tex->src_rel) {
 +              *barrier = 1;
 +        } else {
 +              if (tex->src_sel_x < 4)
 +                      *barrier |= is_barrier_needed(cur_usage, id, tex->src_sel_x, last_barrier);
 +              if (tex->src_sel_y < 4)
 +                      *barrier |= is_barrier_needed(cur_usage, id, tex->src_sel_y, last_barrier);
 +              if (tex->src_sel_z < 4)
 +                      *barrier |= is_barrier_needed(cur_usage, id, tex->src_sel_z, last_barrier);
 +              if (tex->src_sel_w < 4)
 +                      *barrier |= is_barrier_needed(cur_usage, id, tex->src_sel_w, last_barrier);
 +      }
 +
 +      if (range->replacement != -1)
 +              tex->src_gpr = range->replacement;
 +
 +      cur_usage = &usage[tex->dst_gpr];
 +      range = find_dst_range(cur_usage, id);
 +      if (range->replacement != -1)
 +              tex->dst_gpr = range->replacement;
 +
 +      if (tex->dst_rel) {
 +              if (tex->dst_sel_x != 7)
 +                      notice_gpr_rel_last_write(usage, id, tex->dst_sel_x);
 +              if (tex->dst_sel_y != 7)
 +                      notice_gpr_rel_last_write(usage, id, tex->dst_sel_y);
 +              if (tex->dst_sel_z != 7)
 +                      notice_gpr_rel_last_write(usage, id, tex->dst_sel_z);
 +              if (tex->dst_sel_w != 7)
 +                      notice_gpr_rel_last_write(usage, id, tex->dst_sel_w);
 +      } else {
 +              if (tex->dst_sel_x != 7)
 +                      notice_gpr_last_write(cur_usage, id, tex->dst_sel_x);
 +              if (tex->dst_sel_y != 7)
 +                      notice_gpr_last_write(cur_usage, id, tex->dst_sel_y);
 +              if (tex->dst_sel_z != 7)
 +                      notice_gpr_last_write(cur_usage, id, tex->dst_sel_z);
 +              if (tex->dst_sel_w != 7)
 +                      notice_gpr_last_write(cur_usage, id, tex->dst_sel_w);
 +      }
 +}
 +
 +static void replace_vtx_gprs(struct r600_bc_vtx *vtx, struct gpr_usage usage[128],
 +                              int32_t id, int32_t last_barrier, unsigned *barrier)
 +{
 +      struct gpr_usage *cur_usage = &usage[vtx->src_gpr];
 +      struct gpr_usage_range *range = find_src_range(cur_usage, id);
 +
 +      *barrier |= is_barrier_needed(cur_usage, id, vtx->src_sel_x, last_barrier);
 +
 +      if (range->replacement != -1)
 +              vtx->src_gpr = range->replacement;
 +
 +      cur_usage = &usage[vtx->dst_gpr];
 +      range = find_dst_range(cur_usage, id);
 +      if (range->replacement != -1)
 +              vtx->dst_gpr = range->replacement;
 +
 +      if (vtx->dst_sel_x != 7)
 +              notice_gpr_last_write(cur_usage, id, vtx->dst_sel_x);
 +      if (vtx->dst_sel_y != 7)
 +              notice_gpr_last_write(cur_usage, id, vtx->dst_sel_y);
 +      if (vtx->dst_sel_z != 7)
 +              notice_gpr_last_write(cur_usage, id, vtx->dst_sel_z);
 +      if (vtx->dst_sel_w != 7)
 +              notice_gpr_last_write(cur_usage, id, vtx->dst_sel_w);
 +}
 +
 +static void replace_export_gprs(struct r600_bc_cf *cf, struct gpr_usage usage[128],
 +                              int32_t id, int32_t last_barrier)
 +{
 +      //TODO handle other memory operations
 +      struct gpr_usage *cur_usage = &usage[cf->output.gpr];
 +      struct gpr_usage_range *range = find_src_range(cur_usage, id);
 +
 +      cf->barrier = 0;
 +      if (cf->output.swizzle_x < 4)
 +              cf->barrier |= is_barrier_needed(cur_usage, -1, cf->output.swizzle_x, last_barrier);
 +      if (cf->output.swizzle_y < 4)
 +              cf->barrier |= is_barrier_needed(cur_usage, -1, cf->output.swizzle_y, last_barrier);
 +      if (cf->output.swizzle_z < 4)
 +              cf->barrier |= is_barrier_needed(cur_usage, -1, cf->output.swizzle_z, last_barrier);
 +      if (cf->output.swizzle_w < 4)
 +              cf->barrier |= is_barrier_needed(cur_usage, -1, cf->output.swizzle_w, last_barrier);
 +
 +      if (range->replacement != -1)
 +              cf->output.gpr = range->replacement;
 +}
 +
 +static void optimize_alu_inst(struct r600_bc_cf *cf, struct r600_bc_alu *alu)
 +{
 +      struct r600_bc_alu *alu_next;
 +      unsigned chan;
 +      unsigned src, num_src;
 +
 +      /* check if a MOV could be optimized away */
 +      if (alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV) {
 +
 +              /* destination equals source? */
 +              if (alu->dst.sel != alu->src[0].sel ||
 +                      alu->dst.chan != alu->src[0].chan)
 +                      return;
 +
 +              /* any special handling for the source? */
 +              if (alu->src[0].rel || alu->src[0].neg || alu->src[0].abs)
 +                      return;
 +
 +              /* any special handling for destination? */
 +              if (alu->dst.rel || alu->dst.clamp)
 +                      return;
 +
 +              /* ok find next instruction group and check if ps/pv is used */
 +              for (alu_next = alu; !alu_next->last; alu_next = NEXT_ALU(alu_next));
 +
 +              if (alu_next->list.next != &cf->alu) {
 +                      chan = is_alu_reduction_inst(alu) ? 0 : alu->dst.chan;
 +                      for (alu_next = NEXT_ALU(alu_next); alu_next; alu_next = NEXT_ALU(alu_next)) {
 +                              num_src = r600_bc_get_num_operands(alu_next);
 +                              for (src = 0; src < num_src; ++src) {
 +                                      if (alu_next->src[src].sel == V_SQ_ALU_SRC_PV &&
 +                                              alu_next->src[src].chan == chan)
 +                                              return;
 +
 +                                      if (alu_next->src[src].sel == V_SQ_ALU_SRC_PS)
 +                                              return;
 +                              }
 +
 +                              if (alu_next->last)
 +                                      break;
 +                      }
 +              }
 +
 +              r600_bc_remove_alu(cf, alu);
 +      }
 +}
 +
 +static void optimize_export_inst(struct r600_bc *bc, struct r600_bc_cf *cf)
 +{
 +      struct r600_bc_cf *prev = LIST_ENTRY(struct r600_bc_cf, cf->list.prev, list);
 +      if (&prev->list == &bc->cf ||
 +              prev->inst != cf->inst ||
 +              prev->output.type != cf->output.type ||
 +              prev->output.elem_size != cf->output.elem_size ||
 +              prev->output.swizzle_x != cf->output.swizzle_x ||
 +              prev->output.swizzle_y != cf->output.swizzle_y ||
 +              prev->output.swizzle_z != cf->output.swizzle_z ||
 +              prev->output.swizzle_w != cf->output.swizzle_w)
 +              return;
 +
 +      if ((prev->output.burst_count + cf->output.burst_count) > 16)
 +              return;
 +
 +      if ((prev->output.gpr + prev->output.burst_count) == cf->output.gpr &&
 +              (prev->output.array_base + prev->output.burst_count) == cf->output.array_base) {
 +
 +              prev->output.burst_count += cf->output.burst_count;
 +              r600_bc_remove_cf(bc, cf);
 +
 +      } else if (prev->output.gpr == (cf->output.gpr + cf->output.burst_count) &&
 +              prev->output.array_base == (cf->output.array_base + cf->output.burst_count)) {
 +
 +              cf->output.burst_count += prev->output.burst_count;
 +              r600_bc_remove_cf(bc, prev);
 +      }
 +}
 +
 +static void r600_bc_optimize(struct r600_bc *bc)
 +{
 +      struct r600_bc_cf *cf, *next_cf;
 +      struct r600_bc_alu *first, *next_alu;
 +      struct r600_bc_alu *alu;
 +      struct r600_bc_vtx *vtx;
 +      struct r600_bc_tex *tex;
 +      struct gpr_usage usage[128];
 +
 +      /* assume that each gpr is exported only once */
 +      struct r600_bc_cf *export_cf[128] = { NULL };
 +      int32_t export_remap[128];
 +
 +      int32_t id, barrier[bc->nstack];
 +      unsigned i, j, stack, predicate, old_stack;
 +
 +      memset(&usage, 0, sizeof(usage));
 +      for (i = 0; i < 128; ++i) {
 +              usage[i].first_write = -1;
 +              usage[i].last_write[0] = -1;
 +              usage[i].last_write[1] = -1;
 +              usage[i].last_write[2] = -1;
 +              usage[i].last_write[3] = -1;
 +      }
 +
 +      /* first gather some informations about the gpr usage */
 +      id = 0; stack = 0;
 +      LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
 +              switch (get_cf_class(cf)) {
 +              case CF_CLASS_ALU:
 +                      predicate = 0;
 +                      first = NULL;
 +                      LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
 +                              if (!first)
 +                                      first = alu;
 +                              notice_alu_src_gprs(alu, usage, id);
 +                              if (alu->last) {
 +                                      notice_alu_dst_gprs(first, usage, id, predicate || stack > 0);
 +                                      first = NULL;
 +                                      ++id;
 +                              }
 +                              if (is_alu_pred_inst(alu))
 +                                      predicate++;
 +                      }
 +                      if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3)
 +                              stack += predicate;
 +                      else if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3)
 +                              stack -= 1;
 +                      else if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3)
 +                              stack -= 2;
 +                      break;
 +              case CF_CLASS_TEXTURE:
 +                      LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
 +                              notice_tex_gprs(tex, usage, id++, stack > 0);
 +                      }
 +                      break;
 +              case CF_CLASS_VERTEX:
 +                      LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
 +                              notice_vtx_gprs(vtx, usage, id++, stack > 0);
 +                      }
 +                      break;
 +              case CF_CLASS_EXPORT:
 +                      notice_export_gprs(cf, usage, export_cf, export_remap);
 +                      continue; // don't increment id
 +              case CF_CLASS_OTHER:
 +                      switch (cf->inst) {
 +                      case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
 +                      case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
 +                      case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
 +                              break;
 +
 +                      case V_SQ_CF_WORD1_SQ_CF_INST_POP:
 +                              stack -= cf->pop_count;
 +                              break;
 +
 +                      default:
 +                              // TODO implement loop handling
 +                              goto out;
 +                      }
 +              }
 +              id += 0x100;
 +              id &= ~0xFF;
 +      }
 +      assert(stack == 0);
 +
 +      /* try to optimize gpr usage */
 +      for (i = 0; i < 124; ++i) {
 +              for (j = 0; j < usage[i].nranges; ++j) {
 +                      struct gpr_usage_range *range = &usage[i].ranges[j];
 +                      int is_export = export_cf[i] && export_cf[i + 1] &&
 +                              range->start < export_remap[i] &&
 +                              export_remap[i] <= range->end;
 +
 +                      if (range->start == -1)
 +                              range->replacement = -1;
 +                      else if (range->end == -1)
 +                              range->replacement = i;
 +                      else
 +                              find_replacement(usage, i, range, is_export);
 +
 +                      if (range->replacement == -1)
 +                              bc->ngpr = i;
 +                      else if (range->replacement < i && range->replacement > bc->ngpr)
 +                              bc->ngpr = range->replacement;
 +
 +                      if (is_export && range->replacement != -1) {
 +                              find_export_replacement(usage, range, export_cf[i],
 +                                                      export_cf[i + 1], export_remap[i + 1]);
 +                      }
 +              }
 +      }
 +      bc->ngpr++;
 +
 +      /* apply the changes */
 +      for (i = 0; i < 128; ++i) {
 +              usage[i].last_write[0] = -1;
 +              usage[i].last_write[1] = -1;
 +              usage[i].last_write[2] = -1;
 +              usage[i].last_write[3] = -1;
 +      }
 +      barrier[0] = 0;
 +      id = 0; stack = 0;
 +      LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) {
 +              old_stack = stack;
 +              switch (get_cf_class(cf)) {
 +              case CF_CLASS_ALU:
 +                      predicate = 0;
 +                      first = NULL;
 +                      cf->barrier = 0;
 +                      LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) {
 +                              replace_alu_gprs(alu, usage, id, barrier[stack], &cf->barrier);
 +                              if (alu->last)
 +                                      ++id;
 +
 +                              if (is_alu_pred_inst(alu))
 +                                      predicate++;
 +
 +                              if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3)
 +                                      optimize_alu_inst(cf, alu);
 +                      }
 +                      if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3)
 +                              stack += predicate;
 +                      else if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3)
 +                              stack -= 1;
 +                      else if (cf->inst == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3)
 +                              stack -= 2;
 +                      if (LIST_IS_EMPTY(&cf->alu)) {
 +                              r600_bc_remove_cf(bc, cf);
 +                              cf = NULL;
 +                      }
 +                      break;
 +              case CF_CLASS_TEXTURE:
 +                      cf->barrier = 0;
 +                      LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
 +                              replace_tex_gprs(tex, usage, id++, barrier[stack], &cf->barrier);
 +                      }
 +                      break;
 +              case CF_CLASS_VERTEX:
 +                      cf->barrier = 0;
 +                      LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
 +                              replace_vtx_gprs(vtx, usage, id++, barrier[stack], &cf->barrier);
 +                      }
 +                      break;
 +              case CF_CLASS_EXPORT:
 +                      continue; // don't increment id
 +              case CF_CLASS_OTHER:
 +                      if (cf->inst == V_SQ_CF_WORD1_SQ_CF_INST_POP) {
 +                              cf->barrier = 0;
 +                              stack -= cf->pop_count;
 +                      }
 +                      break;
 +              }
 +
 +              id &= ~0xFF;
 +              if (cf && cf->barrier)
 +                      barrier[old_stack] = id;
 +
 +              for (i = old_stack + 1; i <= stack; ++i)
 +                      barrier[i] = barrier[old_stack];
 +
 +              id += 0x100;
 +              if (stack != 0) /* ensue exports are placed outside of conditional blocks */
 +                      continue;
 +
 +              for (i = 0; i < 128; ++i) {
 +                      if (!export_cf[i] || id < export_remap[i])
 +                              continue;
 +
 +                      r600_bc_move_cf(bc, export_cf[i], next_cf);
 +                      replace_export_gprs(export_cf[i], usage, export_remap[i], barrier[stack]);
 +                      if (export_cf[i]->barrier)
 +                              barrier[stack] = id - 1;
 +                      next_cf = LIST_ENTRY(struct r600_bc_cf, export_cf[i]->list.next, list);
 +                      optimize_export_inst(bc, export_cf[i]);
 +                      export_cf[i] = NULL;
 +              }
 +      }
 +      assert(stack == 0);
 +
 +out:
 +      for (i = 0; i < 128; ++i) {
 +              free(usage[i].ranges);
 +      }
 +}
 +
 +int r600_bc_build(struct r600_bc *bc)
 +{
 +      struct r600_bc_cf *cf;
 +      struct r600_bc_alu *alu;
 +      struct r600_bc_vtx *vtx;
 +      struct r600_bc_tex *tex;
 +      struct r600_bc_cf *exports[4] = { NULL };
 +      uint32_t literal[4];
 +      unsigned nliteral;
 +      unsigned addr;
 +      int i, r;
 +
 +      if (bc->callstack[0].max > 0)
 +              bc->nstack = ((bc->callstack[0].max + 3) >> 2) + 2;
        if (bc->type == TGSI_PROCESSOR_VERTEX && !bc->nstack) {
                bc->nstack = 1;
        }
@@@ -114,9 -122,15 +114,15 @@@ struct r600_bc_output 
        unsigned                        swizzle_y;
        unsigned                        swizzle_z;
        unsigned                        swizzle_w;
 -      unsigned                        barrier;
 +      unsigned                        burst_count;
  };
  
+ struct r600_bc_kcache {
+       unsigned                        bank;
+       unsigned                        mode;
+       unsigned                        addr;
+ };
  struct r600_bc_cf {
        struct list_head                list;
        unsigned                        inst;
        unsigned                        cond;
        unsigned                        pop_count;
        unsigned                        cf_addr; /* control flow addr */
-       unsigned                        kcache0_mode;
-       unsigned                        kcache1_mode;
-       unsigned                        kcache0_addr;
-       unsigned                        kcache1_addr;
-       unsigned                        kcache0_bank;
-       unsigned                        kcache1_bank;
 +      unsigned                        barrier;
+       struct r600_bc_kcache           kcache[2];
        unsigned                        r6xx_uses_waterfall;
        struct list_head                alu;
        struct list_head                tex;
Simple merge
@@@ -541,9 -542,11 +543,11 @@@ int r600_shader_from_tgsi(const struct 
        ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
                                                ctx.info.file_count[TGSI_FILE_OUTPUT];
  
-       ctx.file_offset[TGSI_FILE_CONSTANT] = 128;
+       /* Outside the GPR range. This will be translated to one of the
+        * kcache banks later. */
+       ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
  
 -      ctx.file_offset[TGSI_FILE_IMMEDIATE] = 253;
 +      ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
        ctx.temp_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
                        ctx.info.file_count[TGSI_FILE_TEMPORARY];
  
                        r = ctx.inst_info->process(&ctx);
                        if (r)
                                goto out_err;
 -                      r = r600_bc_add_literal(ctx.bc, ctx.value);
 -                      if (r)
 -                              goto out_err;
                        break;
+               case TGSI_TOKEN_TYPE_PROPERTY:
+                       break;
                default:
                        R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
                        r = -EINVAL;
Simple merge
Simple merge
@@@ -186,22 -186,16 +186,28 @@@ enum pipe_format 
     PIPE_FORMAT_R8G8B8X8_UNORM          = 134,
     PIPE_FORMAT_B4G4R4X4_UNORM          = 135,
  
 +   PIPE_FORMAT_YV12                  = 136,
 +   PIPE_FORMAT_YV16                  = 137,
 +   PIPE_FORMAT_IYUV                  = 138,  /**< aka I420 */
 +   PIPE_FORMAT_NV12                  = 139,
 +   PIPE_FORMAT_NV21                  = 140,
 +   PIPE_FORMAT_AYUV                  = PIPE_FORMAT_A8R8G8B8_UNORM,
 +   PIPE_FORMAT_VUYA                  = PIPE_FORMAT_B8G8R8A8_UNORM,
 +   PIPE_FORMAT_XYUV                  = PIPE_FORMAT_X8R8G8B8_UNORM,
 +   PIPE_FORMAT_VUYX                  = PIPE_FORMAT_B8G8R8X8_UNORM,
 +   PIPE_FORMAT_IA44                  = 141,
 +   PIPE_FORMAT_AI44                  = 142,
 +
     /* some stencil samplers formats */
 -   PIPE_FORMAT_X24S8_USCALED           = 136,
 -   PIPE_FORMAT_S8X24_USCALED           = 137,
 -   PIPE_FORMAT_X32_S8X24_USCALED       = 138,
 +   PIPE_FORMAT_X24S8_USCALED           = 143,
 +   PIPE_FORMAT_S8X24_USCALED           = 144,
 +   PIPE_FORMAT_X32_S8X24_USCALED       = 145,
 -   PIPE_FORMAT_B2G3R3_UNORM            = 139,
 -   PIPE_FORMAT_L16A16_UNORM            = 140,
 -   PIPE_FORMAT_A16_UNORM               = 141,
 -   PIPE_FORMAT_I16_UNORM               = 142,
++   PIPE_FORMAT_B2G3R3_UNORM            = 146,
++   PIPE_FORMAT_L16A16_UNORM            = 147,
++   PIPE_FORMAT_A16_UNORM               = 148,
++   PIPE_FORMAT_I16_UNORM               = 149,
     PIPE_FORMAT_COUNT
  };
  
@@@ -10,7 -10,7 +10,8 @@@ PIPE_DRIVERS = 
        $(TOP)/src/gallium/drivers/rbug/librbug.a \
        $(TOP)/src/gallium/drivers/nvfx/libnvfx.a \
        $(TOP)/src/gallium/drivers/nv50/libnv50.a \
+       $(TOP)/src/gallium/drivers/nvc0/libnvc0.a \
 +      $(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \
        $(TOP)/src/gallium/drivers/nouveau/libnouveau.a
  
  C_SOURCES = \