This patch replaces the a2xx TGSI compiler with a NIR compiler.
It also adds several new features:
- gl_FrontFacing, gl_FragCoord, gl_PointCoord, gl_PointSize
- control flow (including loops)
- texture-related features (LOD/bias, cubemaps)
- filling the scalar ALU slot when possible
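For context, the new compile path roughly follows the sketch below. The helper
name create_shader_common() is illustrative only; in the patch this logic lives
in fd2_fp_state_create()/fd2_vp_state_create(), error handling is omitted here,
and the vertex shader additionally keeps its NIR around so fd2_program_emit()
can compile linkage-specific variants later:

  static void *
  create_shader_common(struct pipe_context *pctx,
          const struct pipe_shader_state *cso, gl_shader_stage type)
  {
          struct fd2_shader_stateobj *so = create_shader(pctx, type);

          if (cso->type == PIPE_SHADER_IR_NIR) {
                  /* gallium hands us NIR directly */
                  so->nir = cso->ir.nir;
                  NIR_PASS_V(so->nir, nir_lower_io, nir_var_all,
                             ir2_glsl_type_size, (nir_lower_io_options)0);
          } else {
                  /* otherwise translate TGSI to NIR */
                  so->nir = ir2_tgsi_to_nir(cso->tokens);
          }

          ir2_optimize_nir(so->nir, true);   /* error handling omitted */
          so->first_immediate = so->nir->num_uniforms;

          /* variant 0: the only FS variant / the VS binning variant */
          ir2_compile(so, 0, NULL);
          return so;
  }
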
Signed-off-by: Jonathan Marek <jonathan@marek.ca>
a2xx/disasm-a2xx.c \
a2xx/fd2_blend.c \
a2xx/fd2_blend.h \
- a2xx/fd2_compiler.c \
- a2xx/fd2_compiler.h \
a2xx/fd2_context.c \
a2xx/fd2_context.h \
a2xx/fd2_draw.c \
a2xx/fd2_zsa.c \
a2xx/fd2_zsa.h \
a2xx/instr-a2xx.h \
- a2xx/ir-a2xx.c \
- a2xx/ir-a2xx.h
+ a2xx/ir2.c \
+ a2xx/ir2.h \
+ a2xx/ir2_assemble.c \
+ a2xx/ir2_nir.c \
+ a2xx/ir2_private.h \
+ a2xx/ir2_ra.c
a3xx_SOURCES := \
a3xx/fd3_blend.c \
+++ /dev/null
-/*
- * Copyright (C) 2012 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- * Rob Clark <robclark@freedesktop.org>
- */
-
-#include "pipe/p_state.h"
-#include "util/u_string.h"
-#include "util/u_memory.h"
-#include "util/u_inlines.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_ureg.h"
-#include "tgsi/tgsi_info.h"
-#include "tgsi/tgsi_strings.h"
-#include "tgsi/tgsi_dump.h"
-
-#include "fd2_compiler.h"
-#include "fd2_program.h"
-#include "fd2_util.h"
-
-#include "instr-a2xx.h"
-#include "ir-a2xx.h"
-
-struct fd2_compile_context {
- struct fd_program_stateobj *prog;
- struct fd2_shader_stateobj *so;
-
- struct tgsi_parse_context parser;
- unsigned type;
-
- /* predicate stack: */
- int pred_depth;
- enum ir2_pred pred_stack[8];
-
- /* Internal-Temporary and Predicate register assignment:
- *
- * Some TGSI instructions which translate into multiple actual
- * instructions need one or more temporary registers, which are not
- * assigned from TGSI perspective (ie. not TGSI_FILE_TEMPORARY).
- * And some instructions (texture fetch) cannot write directly to
- * output registers. We could be more clever and re-use dst or a
- * src register in some cases. But for now don't try to be clever.
- * Eventually we should implement an optimization pass that re-
- * juggles the register usage and gets rid of unneeded temporaries.
- *
- * The predicate register must be valid across multiple TGSI
- * instructions, but internal temporary's do not. For this reason,
- * once the predicate register is requested, until it is no longer
- * needed, it gets the first register slot after after the TGSI
- * assigned temporaries (ie. num_regs[TGSI_FILE_TEMPORARY]), and the
- * internal temporaries get the register slots above this.
- */
-
- int pred_reg;
- int num_internal_temps;
-
- uint8_t num_regs[TGSI_FILE_COUNT];
-
- /* maps input register idx to prog->export_linkage idx: */
- uint8_t input_export_idx[64];
-
- /* maps output register idx to prog->export_linkage idx: */
- uint8_t output_export_idx[64];
-
- /* idx/slot for last compiler generated immediate */
- unsigned immediate_idx;
-
- // TODO we can skip emit exports in the VS that the FS doesn't need..
- // and get rid perhaps of num_param..
- unsigned num_position, num_param;
- unsigned position, psize;
-
- uint64_t need_sync;
-};
-
-static int
-semantic_idx(struct tgsi_declaration_semantic *semantic)
-{
- int idx = semantic->Name;
- if (idx == TGSI_SEMANTIC_GENERIC)
- idx = TGSI_SEMANTIC_COUNT + semantic->Index;
- return idx;
-}
-
-/* assign/get the input/export register # for given semantic idx as
- * returned by semantic_idx():
- */
-static int
-export_linkage(struct fd2_compile_context *ctx, int idx)
-{
- struct fd_program_stateobj *prog = ctx->prog;
-
- /* if first time we've seen this export, assign the next available slot: */
- if (prog->export_linkage[idx] == 0xff)
- prog->export_linkage[idx] = prog->num_exports++;
-
- return prog->export_linkage[idx];
-}
-
-static unsigned
-compile_init(struct fd2_compile_context *ctx, struct fd_program_stateobj *prog,
- struct fd2_shader_stateobj *so)
-{
- unsigned ret;
-
- ctx->prog = prog;
- ctx->so = so;
- ctx->pred_depth = 0;
-
- ret = tgsi_parse_init(&ctx->parser, so->tokens);
- if (ret != TGSI_PARSE_OK)
- return ret;
-
- ctx->type = ctx->parser.FullHeader.Processor.Processor;
- ctx->position = ~0;
- ctx->psize = ~0;
- ctx->num_position = 0;
- ctx->num_param = 0;
- ctx->need_sync = 0;
- ctx->immediate_idx = 0;
- ctx->pred_reg = -1;
- ctx->num_internal_temps = 0;
-
- memset(ctx->num_regs, 0, sizeof(ctx->num_regs));
- memset(ctx->input_export_idx, 0, sizeof(ctx->input_export_idx));
- memset(ctx->output_export_idx, 0, sizeof(ctx->output_export_idx));
-
- /* do first pass to extract declarations: */
- while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
- tgsi_parse_token(&ctx->parser);
-
- switch (ctx->parser.FullToken.Token.Type) {
- case TGSI_TOKEN_TYPE_DECLARATION: {
- struct tgsi_full_declaration *decl =
- &ctx->parser.FullToken.FullDeclaration;
- if (decl->Declaration.File == TGSI_FILE_OUTPUT) {
- unsigned name = decl->Semantic.Name;
-
- assert(decl->Declaration.Semantic); // TODO is this ever not true?
-
- ctx->output_export_idx[decl->Range.First] =
- semantic_idx(&decl->Semantic);
-
- if (ctx->type == PIPE_SHADER_VERTEX) {
- switch (name) {
- case TGSI_SEMANTIC_POSITION:
- ctx->position = ctx->num_regs[TGSI_FILE_OUTPUT];
- ctx->num_position++;
- break;
- case TGSI_SEMANTIC_PSIZE:
- ctx->psize = ctx->num_regs[TGSI_FILE_OUTPUT];
- ctx->num_position++;
- break;
- case TGSI_SEMANTIC_COLOR:
- case TGSI_SEMANTIC_GENERIC:
- ctx->num_param++;
- break;
- default:
- DBG("unknown VS semantic name: %s",
- tgsi_semantic_names[name]);
- assert(0);
- }
- } else {
- switch (name) {
- case TGSI_SEMANTIC_COLOR:
- case TGSI_SEMANTIC_GENERIC:
- ctx->num_param++;
- break;
- default:
- DBG("unknown PS semantic name: %s",
- tgsi_semantic_names[name]);
- assert(0);
- }
- }
- } else if (decl->Declaration.File == TGSI_FILE_INPUT) {
- ctx->input_export_idx[decl->Range.First] =
- semantic_idx(&decl->Semantic);
- }
- ctx->num_regs[decl->Declaration.File] =
- MAX2(ctx->num_regs[decl->Declaration.File], decl->Range.Last + 1);
- break;
- }
- case TGSI_TOKEN_TYPE_IMMEDIATE: {
- struct tgsi_full_immediate *imm =
- &ctx->parser.FullToken.FullImmediate;
- unsigned n = ctx->so->num_immediates++;
- memcpy(ctx->so->immediates[n].val, imm->u, 16);
- break;
- }
- default:
- break;
- }
- }
-
- /* TGSI generated immediates are always entire vec4's, ones we
- * generate internally are not:
- */
- ctx->immediate_idx = ctx->so->num_immediates * 4;
-
- ctx->so->first_immediate = ctx->num_regs[TGSI_FILE_CONSTANT];
-
- tgsi_parse_free(&ctx->parser);
-
- return tgsi_parse_init(&ctx->parser, so->tokens);
-}
-
-static void
-compile_free(struct fd2_compile_context *ctx)
-{
- tgsi_parse_free(&ctx->parser);
-}
-
-static void
-compile_vtx_fetch(struct fd2_compile_context *ctx)
-{
- struct ir2_instruction **vfetch_instrs = ctx->so->vfetch_instrs;
- int i;
- for (i = 0; i < ctx->num_regs[TGSI_FILE_INPUT]; i++) {
- struct ir2_instruction *instr = ir2_instr_create(
- ctx->so->ir, IR2_FETCH);
- instr->fetch.opc = VTX_FETCH;
-
- ctx->need_sync |= 1 << (i+1);
-
- ir2_dst_create(instr, i+1, "xyzw", 0);
- ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
-
- if (i == 0)
- instr->sync = true;
-
- vfetch_instrs[i] = instr;
- }
- ctx->so->num_vfetch_instrs = i;
-}
-
-/*
- * For vertex shaders (VS):
- * --- ------ -------------
- *
- * Inputs: R1-R(num_input)
- * Constants: C0-C(num_const-1)
- * Immediates: C(num_const)-C(num_const+num_imm-1)
- * Outputs: export0-export(n) and export62, export63
- * n is # of outputs minus gl_Position (export62) and gl_PointSize (export63)
- * Temps: R(num_input+1)-R(num_input+num_temps)
- *
- * R0 could be clobbered after the vertex fetch instructions.. so we
- * could use it for one of the temporaries.
- *
- * TODO: maybe the vertex fetch part could fetch first input into R0 as
- * the last vtx fetch instruction, which would let us use the same
- * register layout in either case.. although this is not what the blob
- * compiler does.
- *
- *
- * For frag shaders (PS):
- * --- ---- -------------
- *
- * Inputs: R0-R(num_input-1)
- * Constants: same as VS
- * Immediates: same as VS
- * Outputs: export0-export(num_outputs)
- * Temps: R(num_input)-R(num_input+num_temps-1)
- *
- * In either case, immediates are are postpended to the constants
- * (uniforms).
- *
- */
-
-static unsigned
-get_temp_gpr(struct fd2_compile_context *ctx, int idx)
-{
- unsigned num = idx + ctx->num_regs[TGSI_FILE_INPUT];
- if (ctx->type == PIPE_SHADER_VERTEX)
- num++;
- return num;
-}
-
-static struct ir2_dst_register *
-add_dst_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
- const struct tgsi_dst_register *dst)
-{
- unsigned flags = 0, num = 0;
- char swiz[5];
-
- switch (dst->File) {
- case TGSI_FILE_OUTPUT:
- flags |= IR2_REG_EXPORT;
- if (ctx->type == PIPE_SHADER_VERTEX) {
- if (dst->Index == ctx->position) {
- num = 62;
- } else if (dst->Index == ctx->psize) {
- num = 63;
- } else {
- num = export_linkage(ctx,
- ctx->output_export_idx[dst->Index]);
- }
- } else {
- num = dst->Index;
- }
- break;
- case TGSI_FILE_TEMPORARY:
- num = get_temp_gpr(ctx, dst->Index);
- break;
- default:
- DBG("unsupported dst register file: %s",
- tgsi_file_name(dst->File));
- assert(0);
- break;
- }
-
- swiz[0] = (dst->WriteMask & TGSI_WRITEMASK_X) ? 'x' : '_';
- swiz[1] = (dst->WriteMask & TGSI_WRITEMASK_Y) ? 'y' : '_';
- swiz[2] = (dst->WriteMask & TGSI_WRITEMASK_Z) ? 'z' : '_';
- swiz[3] = (dst->WriteMask & TGSI_WRITEMASK_W) ? 'w' : '_';
- swiz[4] = '\0';
-
- return ir2_dst_create(alu, num, swiz, flags);
-}
-
-static struct ir2_src_register *
-add_src_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
- const struct tgsi_src_register *src)
-{
- static const char swiz_vals[] = {
- 'x', 'y', 'z', 'w',
- };
- char swiz[5];
- unsigned flags = 0, num = 0;
-
- switch (src->File) {
- case TGSI_FILE_CONSTANT:
- num = src->Index;
- flags |= IR2_REG_CONST;
- break;
- case TGSI_FILE_INPUT:
- if (ctx->type == PIPE_SHADER_VERTEX) {
- num = src->Index + 1;
- } else {
- flags |= IR2_REG_INPUT;
- num = export_linkage(ctx,
- ctx->input_export_idx[src->Index]);
- }
- break;
- case TGSI_FILE_TEMPORARY:
- num = get_temp_gpr(ctx, src->Index);
- break;
- case TGSI_FILE_IMMEDIATE:
- num = src->Index + ctx->num_regs[TGSI_FILE_CONSTANT];
- flags |= IR2_REG_CONST;
- break;
- default:
- DBG("unsupported src register file: %s",
- tgsi_file_name(src->File));
- assert(0);
- break;
- }
-
- if (src->Absolute)
- flags |= IR2_REG_ABS;
- if (src->Negate)
- flags |= IR2_REG_NEGATE;
-
- swiz[0] = swiz_vals[src->SwizzleX];
- swiz[1] = swiz_vals[src->SwizzleY];
- swiz[2] = swiz_vals[src->SwizzleZ];
- swiz[3] = swiz_vals[src->SwizzleW];
- swiz[4] = '\0';
-
- if ((ctx->need_sync & ((uint64_t)1 << num)) &&
- !(flags & IR2_REG_CONST)) {
- alu->sync = true;
- ctx->need_sync &= ~((uint64_t)1 << num);
- }
-
- return ir2_reg_create(alu, num, swiz, flags);
-}
-
-static void
-add_vector_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
-{
- if (inst->Instruction.Saturate) {
- alu->alu_vector.clamp = true;
- }
-}
-
-static void
-add_scalar_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
-{
- if (inst->Instruction.Saturate) {
- alu->alu_scalar.clamp = true;
- }
-}
-
-static void
-add_regs_vector_1(struct fd2_compile_context *ctx,
- struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
-{
- assert(inst->Instruction.NumSrcRegs == 1);
- assert(inst->Instruction.NumDstRegs == 1);
-
- add_dst_reg(ctx, alu, &inst->Dst[0].Register);
- add_src_reg(ctx, alu, &inst->Src[0].Register);
- add_src_reg(ctx, alu, &inst->Src[0].Register);
- add_vector_clamp(inst, alu);
-}
-
-static void
-add_regs_vector_2(struct fd2_compile_context *ctx,
- struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
-{
- assert(inst->Instruction.NumSrcRegs == 2);
- assert(inst->Instruction.NumDstRegs == 1);
-
- add_dst_reg(ctx, alu, &inst->Dst[0].Register);
- add_src_reg(ctx, alu, &inst->Src[0].Register);
- add_src_reg(ctx, alu, &inst->Src[1].Register);
- add_vector_clamp(inst, alu);
-}
-
-static void
-add_regs_vector_3(struct fd2_compile_context *ctx,
- struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
-{
- assert(inst->Instruction.NumSrcRegs == 3);
- assert(inst->Instruction.NumDstRegs == 1);
-
- add_dst_reg(ctx, alu, &inst->Dst[0].Register);
- add_src_reg(ctx, alu, &inst->Src[0].Register);
- add_src_reg(ctx, alu, &inst->Src[1].Register);
- add_src_reg(ctx, alu, &inst->Src[2].Register);
- add_vector_clamp(inst, alu);
-}
-
-static void
-add_regs_scalar_1(struct fd2_compile_context *ctx,
- struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
-{
- assert(inst->Instruction.NumSrcRegs == 1);
- assert(inst->Instruction.NumDstRegs == 1);
-
- add_dst_reg(ctx, alu, &inst->Dst[0].Register);
- add_src_reg(ctx, alu, &inst->Src[0].Register);
- add_scalar_clamp(inst, alu);
-}
-
-/*
- * Helpers for TGSI instructions that don't map to a single shader instr:
- */
-
-static void
-src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
-{
- src->File = dst->File;
- src->Indirect = dst->Indirect;
- src->Dimension = dst->Dimension;
- src->Index = dst->Index;
- src->Absolute = 0;
- src->Negate = 0;
- src->SwizzleX = TGSI_SWIZZLE_X;
- src->SwizzleY = TGSI_SWIZZLE_Y;
- src->SwizzleZ = TGSI_SWIZZLE_Z;
- src->SwizzleW = TGSI_SWIZZLE_W;
-}
-
-/* Get internal-temp src/dst to use for a sequence of instructions
- * generated by a single TGSI op.
- */
-static void
-get_internal_temp(struct fd2_compile_context *ctx,
- struct tgsi_dst_register *tmp_dst,
- struct tgsi_src_register *tmp_src)
-{
- int n;
-
- tmp_dst->File = TGSI_FILE_TEMPORARY;
- tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
- tmp_dst->Indirect = 0;
- tmp_dst->Dimension = 0;
-
- /* assign next temporary: */
- n = ctx->num_internal_temps++;
- if (ctx->pred_reg != -1)
- n++;
-
- tmp_dst->Index = ctx->num_regs[TGSI_FILE_TEMPORARY] + n;
-
- src_from_dst(tmp_src, tmp_dst);
-}
-
-static void
-get_predicate(struct fd2_compile_context *ctx, struct tgsi_dst_register *dst,
- struct tgsi_src_register *src)
-{
- assert(ctx->pred_reg != -1);
-
- dst->File = TGSI_FILE_TEMPORARY;
- dst->WriteMask = TGSI_WRITEMASK_W;
- dst->Indirect = 0;
- dst->Dimension = 0;
- dst->Index = get_temp_gpr(ctx, ctx->pred_reg);
-
- if (src) {
- src_from_dst(src, dst);
- src->SwizzleX = TGSI_SWIZZLE_W;
- src->SwizzleY = TGSI_SWIZZLE_W;
- src->SwizzleZ = TGSI_SWIZZLE_W;
- src->SwizzleW = TGSI_SWIZZLE_W;
- }
-}
-
-static void
-push_predicate(struct fd2_compile_context *ctx, struct tgsi_src_register *src)
-{
- struct ir2_instruction *alu;
- struct tgsi_dst_register pred_dst;
-
- if (ctx->pred_depth == 0) {
- /* assign predicate register: */
- ctx->pred_reg = ctx->num_regs[TGSI_FILE_TEMPORARY];
-
- get_predicate(ctx, &pred_dst, NULL);
-
- alu = ir2_instr_create_alu_s(ctx->so->ir, PRED_SETNEs);
- add_dst_reg(ctx, alu, &pred_dst);
- add_src_reg(ctx, alu, src);
- } else {
- struct tgsi_src_register pred_src;
-
- get_predicate(ctx, &pred_dst, &pred_src);
-
- alu = ir2_instr_create_alu_v(ctx->so->ir, MULv);
- add_dst_reg(ctx, alu, &pred_dst);
- add_src_reg(ctx, alu, &pred_src);
- add_src_reg(ctx, alu, src);
-
- // XXX need to make PRED_SETE_PUSHv IR2_PRED_NONE.. but need to make
- // sure src reg is valid if it was calculated with a predicate
- // condition..
- alu->pred = IR2_PRED_NONE;
- }
-
- /* save previous pred state to restore in pop_predicate(): */
- ctx->pred_stack[ctx->pred_depth++] = ctx->so->ir->pred;
-}
-
-static void
-pop_predicate(struct fd2_compile_context *ctx)
-{
- /* restore previous predicate state: */
- ctx->so->ir->pred = ctx->pred_stack[--ctx->pred_depth];
-
- if (ctx->pred_depth != 0) {
- struct ir2_instruction *alu;
- struct tgsi_dst_register pred_dst;
- struct tgsi_src_register pred_src;
-
- get_predicate(ctx, &pred_dst, &pred_src);
-
- alu = ir2_instr_create_alu_s(ctx->so->ir, PRED_SET_POPs);
- add_dst_reg(ctx, alu, &pred_dst);
- add_src_reg(ctx, alu, &pred_src);
- alu->pred = IR2_PRED_NONE;
- } else {
- /* predicate register no longer needed: */
- ctx->pred_reg = -1;
- }
-}
-
-static void
-get_immediate(struct fd2_compile_context *ctx,
- struct tgsi_src_register *reg, uint32_t val)
-{
- unsigned neg, swiz, idx, i;
- /* actually maps 1:1 currently.. not sure if that is safe to rely on: */
- static const unsigned swiz2tgsi[] = {
- TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
- };
-
- for (i = 0; i < ctx->immediate_idx; i++) {
- swiz = i % 4;
- idx = i / 4;
-
- if (ctx->so->immediates[idx].val[swiz] == val) {
- neg = 0;
- break;
- }
-
- if (ctx->so->immediates[idx].val[swiz] == -val) {
- neg = 1;
- break;
- }
- }
-
- if (i == ctx->immediate_idx) {
- /* need to generate a new immediate: */
- swiz = i % 4;
- idx = i / 4;
- neg = 0;
- ctx->so->immediates[idx].val[swiz] = val;
- ctx->so->num_immediates = idx + 1;
- ctx->immediate_idx++;
- }
-
- reg->File = TGSI_FILE_IMMEDIATE;
- reg->Indirect = 0;
- reg->Dimension = 0;
- reg->Index = idx;
- reg->Absolute = 0;
- reg->Negate = neg;
- reg->SwizzleX = swiz2tgsi[swiz];
- reg->SwizzleY = swiz2tgsi[swiz];
- reg->SwizzleZ = swiz2tgsi[swiz];
- reg->SwizzleW = swiz2tgsi[swiz];
-}
-
-/* POW(a,b) = EXP2(b * LOG2(a)) */
-static void
-translate_pow(struct fd2_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- struct tgsi_dst_register tmp_dst;
- struct tgsi_src_register tmp_src;
- struct ir2_instruction *alu;
-
- get_internal_temp(ctx, &tmp_dst, &tmp_src);
-
- alu = ir2_instr_create_alu_s(ctx->so->ir, LOG_CLAMP);
- add_dst_reg(ctx, alu, &tmp_dst);
- add_src_reg(ctx, alu, &inst->Src[0].Register);
-
- alu = ir2_instr_create_alu_v(ctx->so->ir, MULv);
- add_dst_reg(ctx, alu, &tmp_dst);
- add_src_reg(ctx, alu, &tmp_src);
- add_src_reg(ctx, alu, &inst->Src[1].Register);
-
- /* NOTE: some of the instructions, like EXP_IEEE, seem hard-
- * coded to take their input from the w component.
- */
- switch(inst->Dst[0].Register.WriteMask) {
- case TGSI_WRITEMASK_X:
- tmp_src.SwizzleW = TGSI_SWIZZLE_X;
- break;
- case TGSI_WRITEMASK_Y:
- tmp_src.SwizzleW = TGSI_SWIZZLE_Y;
- break;
- case TGSI_WRITEMASK_Z:
- tmp_src.SwizzleW = TGSI_SWIZZLE_Z;
- break;
- case TGSI_WRITEMASK_W:
- tmp_src.SwizzleW = TGSI_SWIZZLE_W;
- break;
- default:
- DBG("invalid writemask!");
- assert(0);
- break;
- }
-
- alu = ir2_instr_create_alu_s(ctx->so->ir, EXP_IEEE);
- add_dst_reg(ctx, alu, &inst->Dst[0].Register);
- add_src_reg(ctx, alu, &tmp_src);
- add_scalar_clamp(inst, alu);
-}
-
-static void
-translate_tex(struct fd2_compile_context *ctx,
- struct tgsi_full_instruction *inst, unsigned opc)
-{
- struct ir2_instruction *instr;
- struct ir2_src_register *reg;
- struct tgsi_dst_register tmp_dst;
- struct tgsi_src_register tmp_src;
- const struct tgsi_src_register *coord;
- bool using_temp = (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) ||
- inst->Instruction.Saturate;
- int idx;
-
- if (using_temp || (opc == TGSI_OPCODE_TXP))
- get_internal_temp(ctx, &tmp_dst, &tmp_src);
-
- if (opc == TGSI_OPCODE_TXP) {
- static const char *swiz[] = {
- [TGSI_SWIZZLE_X] = "xxxx",
- [TGSI_SWIZZLE_Y] = "yyyy",
- [TGSI_SWIZZLE_Z] = "zzzz",
- [TGSI_SWIZZLE_W] = "wwww",
- };
-
- /* TXP - Projective Texture Lookup:
- *
- * coord.x = src0.x / src.w
- * coord.y = src0.y / src.w
- * coord.z = src0.z / src.w
- * coord.w = src0.w
- * bias = 0.0
- *
- * dst = texture_sample(unit, coord, bias)
- */
-
- instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv);
- add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "___w";
- add_src_reg(ctx, instr, &inst->Src[0].Register);
- add_src_reg(ctx, instr, &inst->Src[0].Register);
-
- instr = ir2_instr_create_alu_s(ctx->so->ir, RECIP_IEEE);
- add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "x___";
- memcpy(add_src_reg(ctx, instr, &inst->Src[0].Register)->swizzle,
- swiz[inst->Src[0].Register.SwizzleW], 4);
-
- instr = ir2_instr_create_alu_v(ctx->so->ir, MULv);
- add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "xyz_";
- add_src_reg(ctx, instr, &tmp_src)->swizzle = "xxxx";
- add_src_reg(ctx, instr, &inst->Src[0].Register);
-
- coord = &tmp_src;
- } else {
- coord = &inst->Src[0].Register;
- }
-
- instr = ir2_instr_create(ctx->so->ir, IR2_FETCH);
- instr->fetch.opc = TEX_FETCH;
- instr->fetch.is_cube = (inst->Texture.Texture == TGSI_TEXTURE_3D);
- instr->fetch.is_rect = (inst->Texture.Texture == TGSI_TEXTURE_RECT);
- assert(inst->Texture.NumOffsets <= 1); // TODO what to do in other cases?
-
- /* save off the tex fetch to be patched later with correct const_idx: */
- idx = ctx->so->num_tfetch_instrs++;
- ctx->so->tfetch_instrs[idx].samp_id = inst->Src[1].Register.Index;
- ctx->so->tfetch_instrs[idx].instr = instr;
-
- add_dst_reg(ctx, instr, using_temp ? &tmp_dst : &inst->Dst[0].Register);
- reg = add_src_reg(ctx, instr, coord);
-
- /* blob compiler always sets 3rd component to same as 1st for 2d: */
- if (inst->Texture.Texture == TGSI_TEXTURE_2D || inst->Texture.Texture == TGSI_TEXTURE_RECT)
- reg->swizzle[2] = reg->swizzle[0];
-
- /* dst register needs to be marked for sync: */
- ctx->need_sync |= 1 << instr->dst_reg.num;
-
- /* TODO we need some way to know if the tex fetch needs to sync on alu pipe.. */
- instr->sync = true;
-
- if (using_temp) {
- /* texture fetch can't write directly to export, so if tgsi
- * is telling us the dst register is in output file, we load
- * the texture to a temp and the use ALU instruction to move
- * to output
- */
- instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv);
-
- add_dst_reg(ctx, instr, &inst->Dst[0].Register);
- add_src_reg(ctx, instr, &tmp_src);
- add_src_reg(ctx, instr, &tmp_src);
- add_vector_clamp(inst, instr);
- }
-}
-
-/* SGE(a,b) = GTE((b - a), 1.0, 0.0) */
-/* SLT(a,b) = GTE((b - a), 0.0, 1.0) */
-/* SEQ(a,b) = EQU((b - a), 1.0, 0.0) */
-/* SNE(a,b) = EQU((b - a), 0.0, 1.0) */
-static void
-translate_sge_slt_seq_sne(struct fd2_compile_context *ctx,
- struct tgsi_full_instruction *inst, unsigned opc)
-{
- struct ir2_instruction *instr;
- struct tgsi_dst_register tmp_dst;
- struct tgsi_src_register tmp_src;
- struct tgsi_src_register tmp_const;
- float c0, c1;
- instr_vector_opc_t vopc;
-
- switch (opc) {
- default:
- assert(0);
- case TGSI_OPCODE_SGE:
- c0 = 1.0;
- c1 = 0.0;
- vopc = CNDGTEv;
- break;
- case TGSI_OPCODE_SLT:
- c0 = 0.0;
- c1 = 1.0;
- vopc = CNDGTEv;
- break;
- case TGSI_OPCODE_SEQ:
- c0 = 0.0;
- c1 = 1.0;
- vopc = CNDEv;
- break;
- case TGSI_OPCODE_SNE:
- c0 = 1.0;
- c1 = 0.0;
- vopc = CNDEv;
- break;
- }
-
- get_internal_temp(ctx, &tmp_dst, &tmp_src);
-
- instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv);
- add_dst_reg(ctx, instr, &tmp_dst);
- add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= IR2_REG_NEGATE;
- add_src_reg(ctx, instr, &inst->Src[1].Register);
-
- instr = ir2_instr_create_alu_v(ctx->so->ir, vopc);
- add_dst_reg(ctx, instr, &inst->Dst[0].Register);
- add_src_reg(ctx, instr, &tmp_src);
- get_immediate(ctx, &tmp_const, fui(c1));
- add_src_reg(ctx, instr, &tmp_const);
- get_immediate(ctx, &tmp_const, fui(c0));
- add_src_reg(ctx, instr, &tmp_const);
-}
-
-/* LRP(a,b,c) = (a * b) + ((1 - a) * c) */
-static void
-translate_lrp(struct fd2_compile_context *ctx,
- struct tgsi_full_instruction *inst,
- unsigned opc)
-{
- struct ir2_instruction *instr;
- struct tgsi_dst_register tmp_dst1, tmp_dst2;
- struct tgsi_src_register tmp_src1, tmp_src2;
- struct tgsi_src_register tmp_const;
-
- get_internal_temp(ctx, &tmp_dst1, &tmp_src1);
- get_internal_temp(ctx, &tmp_dst2, &tmp_src2);
-
- get_immediate(ctx, &tmp_const, fui(1.0));
-
- /* tmp1 = (a * b) */
- instr = ir2_instr_create_alu_v(ctx->so->ir, MULv);
- add_dst_reg(ctx, instr, &tmp_dst1);
- add_src_reg(ctx, instr, &inst->Src[0].Register);
- add_src_reg(ctx, instr, &inst->Src[1].Register);
-
- /* tmp2 = (1 - a) */
- instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv);
- add_dst_reg(ctx, instr, &tmp_dst2);
- add_src_reg(ctx, instr, &tmp_const);
- add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= IR2_REG_NEGATE;
-
- /* tmp2 = tmp2 * c */
- instr = ir2_instr_create_alu_v(ctx->so->ir, MULv);
- add_dst_reg(ctx, instr, &tmp_dst2);
- add_src_reg(ctx, instr, &tmp_src2);
- add_src_reg(ctx, instr, &inst->Src[2].Register);
-
- /* dst = tmp1 + tmp2 */
- instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv);
- add_dst_reg(ctx, instr, &inst->Dst[0].Register);
- add_src_reg(ctx, instr, &tmp_src1);
- add_src_reg(ctx, instr, &tmp_src2);
-}
-
-static void
-translate_trig(struct fd2_compile_context *ctx,
- struct tgsi_full_instruction *inst,
- unsigned opc)
-{
- struct ir2_instruction *instr;
- struct tgsi_dst_register tmp_dst;
- struct tgsi_src_register tmp_src;
- struct tgsi_src_register tmp_const;
- instr_scalar_opc_t op;
-
- switch (opc) {
- default:
- assert(0);
- case TGSI_OPCODE_SIN:
- op = SIN;
- break;
- case TGSI_OPCODE_COS:
- op = COS;
- break;
- }
-
- get_internal_temp(ctx, &tmp_dst, &tmp_src);
-
- tmp_dst.WriteMask = TGSI_WRITEMASK_X;
- tmp_src.SwizzleX = tmp_src.SwizzleY =
- tmp_src.SwizzleZ = tmp_src.SwizzleW = TGSI_SWIZZLE_X;
-
- instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv);
- add_dst_reg(ctx, instr, &tmp_dst);
- add_src_reg(ctx, instr, &inst->Src[0].Register);
- get_immediate(ctx, &tmp_const, fui(0.159155));
- add_src_reg(ctx, instr, &tmp_const);
- get_immediate(ctx, &tmp_const, fui(0.5));
- add_src_reg(ctx, instr, &tmp_const);
-
- instr = ir2_instr_create_alu_v(ctx->so->ir, FRACv);
- add_dst_reg(ctx, instr, &tmp_dst);
- add_src_reg(ctx, instr, &tmp_src);
- add_src_reg(ctx, instr, &tmp_src);
-
- instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv);
- add_dst_reg(ctx, instr, &tmp_dst);
- add_src_reg(ctx, instr, &tmp_src);
- get_immediate(ctx, &tmp_const, fui(6.283185));
- add_src_reg(ctx, instr, &tmp_const);
- get_immediate(ctx, &tmp_const, fui(-3.141593));
- add_src_reg(ctx, instr, &tmp_const);
-
- instr = ir2_instr_create_alu_s(ctx->so->ir, op);
- add_dst_reg(ctx, instr, &inst->Dst[0].Register);
- add_src_reg(ctx, instr, &tmp_src);
-}
-
-static void
-translate_dp2(struct fd2_compile_context *ctx,
- struct tgsi_full_instruction *inst,
- unsigned opc)
-{
- struct tgsi_src_register tmp_const;
- struct ir2_instruction *instr;
- /* DP2ADD c,a,b -> dot2(a,b) + c */
- /* for c we use the constant 0.0 */
- instr = ir2_instr_create_alu_v(ctx->so->ir, DOT2ADDv);
- add_dst_reg(ctx, instr, &inst->Dst[0].Register);
- add_src_reg(ctx, instr, &inst->Src[0].Register);
- add_src_reg(ctx, instr, &inst->Src[1].Register);
- get_immediate(ctx, &tmp_const, fui(0.0f));
- add_src_reg(ctx, instr, &tmp_const);
- add_vector_clamp(inst, instr);
-}
-
-/*
- * Main part of compiler/translator:
- */
-
-static void
-translate_instruction(struct fd2_compile_context *ctx,
- struct tgsi_full_instruction *inst)
-{
- unsigned opc = inst->Instruction.Opcode;
- struct ir2_instruction *instr;
-
- if (opc == TGSI_OPCODE_END)
- return;
-
- /* TODO turn this into a table: */
- switch (opc) {
- case TGSI_OPCODE_MOV:
- instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv);
- add_regs_vector_1(ctx, inst, instr);
- break;
- case TGSI_OPCODE_RCP:
- instr = ir2_instr_create_alu_s(ctx->so->ir, RECIP_IEEE);
- add_regs_scalar_1(ctx, inst, instr);
- break;
- case TGSI_OPCODE_RSQ:
- instr = ir2_instr_create_alu_s(ctx->so->ir, RECIPSQ_IEEE);
- add_regs_scalar_1(ctx, inst, instr);
- break;
- case TGSI_OPCODE_SQRT:
- instr = ir2_instr_create_alu_s(ctx->so->ir, SQRT_IEEE);
- add_regs_scalar_1(ctx, inst, instr);
- break;
- case TGSI_OPCODE_MUL:
- instr = ir2_instr_create_alu_v(ctx->so->ir, MULv);
- add_regs_vector_2(ctx, inst, instr);
- break;
- case TGSI_OPCODE_ADD:
- instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv);
- add_regs_vector_2(ctx, inst, instr);
- break;
- case TGSI_OPCODE_DP2:
- translate_dp2(ctx, inst, opc);
- break;
- case TGSI_OPCODE_DP3:
- instr = ir2_instr_create_alu_v(ctx->so->ir, DOT3v);
- add_regs_vector_2(ctx, inst, instr);
- break;
- case TGSI_OPCODE_DP4:
- instr = ir2_instr_create_alu_v(ctx->so->ir, DOT4v);
- add_regs_vector_2(ctx, inst, instr);
- break;
- case TGSI_OPCODE_MIN:
- instr = ir2_instr_create_alu_v(ctx->so->ir, MINv);
- add_regs_vector_2(ctx, inst, instr);
- break;
- case TGSI_OPCODE_MAX:
- instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv);
- add_regs_vector_2(ctx, inst, instr);
- break;
- case TGSI_OPCODE_SLT:
- case TGSI_OPCODE_SGE:
- case TGSI_OPCODE_SEQ:
- case TGSI_OPCODE_SNE:
- translate_sge_slt_seq_sne(ctx, inst, opc);
- break;
- case TGSI_OPCODE_MAD:
- instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv);
- add_regs_vector_3(ctx, inst, instr);
- break;
- case TGSI_OPCODE_LRP:
- translate_lrp(ctx, inst, opc);
- break;
- case TGSI_OPCODE_FRC:
- instr = ir2_instr_create_alu_v(ctx->so->ir, FRACv);
- add_regs_vector_1(ctx, inst, instr);
- break;
- case TGSI_OPCODE_FLR:
- instr = ir2_instr_create_alu_v(ctx->so->ir, FLOORv);
- add_regs_vector_1(ctx, inst, instr);
- break;
- case TGSI_OPCODE_EX2:
- instr = ir2_instr_create_alu_s(ctx->so->ir, EXP_IEEE);
- add_regs_scalar_1(ctx, inst, instr);
- break;
- case TGSI_OPCODE_POW:
- translate_pow(ctx, inst);
- break;
- case TGSI_OPCODE_COS:
- case TGSI_OPCODE_SIN:
- translate_trig(ctx, inst, opc);
- break;
- case TGSI_OPCODE_TEX:
- case TGSI_OPCODE_TXP:
- translate_tex(ctx, inst, opc);
- break;
- case TGSI_OPCODE_CMP:
- instr = ir2_instr_create_alu_v(ctx->so->ir, CNDGTEv);
- add_regs_vector_3(ctx, inst, instr);
- instr->src_reg[0].flags ^= IR2_REG_NEGATE; /* src1 */
- break;
- case TGSI_OPCODE_IF:
- push_predicate(ctx, &inst->Src[0].Register);
- ctx->so->ir->pred = IR2_PRED_EQ;
- break;
- case TGSI_OPCODE_ELSE:
- ctx->so->ir->pred = IR2_PRED_NE;
- break;
- case TGSI_OPCODE_ENDIF:
- pop_predicate(ctx);
- break;
- case TGSI_OPCODE_F2I:
- instr = ir2_instr_create_alu_v(ctx->so->ir, TRUNCv);
- add_regs_vector_1(ctx, inst, instr);
- break;
- default:
- DBG("unknown TGSI opc: %s", tgsi_get_opcode_name(opc));
- tgsi_dump(ctx->so->tokens, 0);
- assert(0);
- break;
- }
-
- /* internal temporaries are only valid for the duration of a single
- * TGSI instruction:
- */
- ctx->num_internal_temps = 0;
-}
-
-static void
-compile_instructions(struct fd2_compile_context *ctx)
-{
- while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
- tgsi_parse_token(&ctx->parser);
-
- switch (ctx->parser.FullToken.Token.Type) {
- case TGSI_TOKEN_TYPE_INSTRUCTION:
- translate_instruction(ctx,
- &ctx->parser.FullToken.FullInstruction);
- break;
- default:
- break;
- }
- }
-}
-
-int
-fd2_compile_shader(struct fd_program_stateobj *prog,
- struct fd2_shader_stateobj *so)
-{
- struct fd2_compile_context ctx;
-
- ir2_shader_destroy(so->ir);
- so->ir = ir2_shader_create();
- so->num_vfetch_instrs = so->num_tfetch_instrs = so->num_immediates = 0;
-
- if (compile_init(&ctx, prog, so) != TGSI_PARSE_OK)
- return -1;
-
- if (ctx.type == PIPE_SHADER_VERTEX) {
- compile_vtx_fetch(&ctx);
- } else if (ctx.type == PIPE_SHADER_FRAGMENT) {
- prog->num_exports = 0;
- memset(prog->export_linkage, 0xff,
- sizeof(prog->export_linkage));
- }
-
- compile_instructions(&ctx);
-
- compile_free(&ctx);
-
- return 0;
-}
-
+++ /dev/null
-/*
- * Copyright (C) 2012 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- * Rob Clark <robclark@freedesktop.org>
- */
-
-#ifndef FD2_COMPILER_H_
-#define FD2_COMPILER_H_
-
-#include "fd2_program.h"
-#include "fd2_util.h"
-
-int fd2_compile_shader(struct fd_program_stateobj *prog,
- struct fd2_shader_stateobj *so);
-
-#endif /* FD2_COMPILER_H_ */
OUT_RING(ring, 0x0000028f);
}
- fd2_program_emit(ring, &ctx->solid_prog);
+ fd2_program_emit(ctx, ring, &ctx->solid_prog);
OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1);
OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE);
OUT_RING(ring, fui(ctx->viewport.translate[1])); /* PA_CL_VPORT_YOFFSET */
OUT_RING(ring, fui(ctx->viewport.scale[2])); /* PA_CL_VPORT_ZSCALE */
OUT_RING(ring, fui(ctx->viewport.translate[2])); /* PA_CL_VPORT_ZOFFSET */
- }
- if (dirty & (FD_DIRTY_PROG | FD_DIRTY_VTXSTATE | FD_DIRTY_TEXSTATE)) {
- fd2_program_validate(ctx);
- fd2_program_emit(ring, &ctx->prog);
+ /* set viewport in C65/C66, for a20x hw binning and fragcoord.z */
+ OUT_PKT3(ring, CP_SET_CONSTANT, 9);
+ OUT_RING(ring, 0x00000184);
+
+ OUT_RING(ring, fui(ctx->viewport.translate[0]));
+ OUT_RING(ring, fui(ctx->viewport.translate[1]));
+ OUT_RING(ring, fui(ctx->viewport.translate[2]));
+ OUT_RING(ring, fui(0.0f));
+
+ OUT_RING(ring, fui(ctx->viewport.scale[0]));
+ OUT_RING(ring, fui(ctx->viewport.scale[1]));
+ OUT_RING(ring, fui(ctx->viewport.scale[2]));
+ OUT_RING(ring, fui(0.0f));
}
+ if (dirty & (FD_DIRTY_PROG | FD_DIRTY_VTXSTATE | FD_DIRTY_TEXSTATE))
+ fd2_program_emit(ctx, ring, &ctx->prog);
+
if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONST)) {
emit_constants(ring, VS_CONST_BASE * 4,
&ctx->constbuf[PIPE_SHADER_VERTEX],
OUT_RING(ring, 0x0000028f);
}
- fd2_program_emit(ring, &ctx->solid_prog);
+ fd2_program_emit(ctx, ring, &ctx->solid_prog);
OUT_PKT3(ring, CP_SET_CONSTANT, 2);
OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_MASK));
OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET));
OUT_RING(ring, 0);
- fd2_program_emit(ring, &ctx->blit_prog[0]);
+ fd2_program_emit(ctx, ring, &ctx->blit_prog[0]);
OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1);
OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE);
OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_OFFSET));
OUT_RING(ring, A2XX_PA_SC_WINDOW_OFFSET_X(-tile->xoff) |
A2XX_PA_SC_WINDOW_OFFSET_Y(-tile->yoff));
+
+ /* tile offset for gl_FragCoord on a20x (C64 in fragment shader) */
+ if (is_a20x(batch->ctx->screen)) {
+ OUT_PKT3(ring, CP_SET_CONSTANT, 5);
+ OUT_RING(ring, 0x00000580);
+ OUT_RING(ring, fui(tile->xoff));
+ OUT_RING(ring, fui(tile->yoff));
+ OUT_RING(ring, fui(0.0f));
+ OUT_RING(ring, fui(0.0f));
+ }
}
void
*
* Authors:
* Rob Clark <robclark@freedesktop.org>
+ * Jonathan Marek <jonathan@marek.ca>
*/
#include "pipe/p_state.h"
#include "freedreno_program.h"
+#include "ir2.h"
#include "fd2_program.h"
-#include "fd2_compiler.h"
#include "fd2_texture.h"
#include "fd2_util.h"
+#include "instr-a2xx.h"
static struct fd2_shader_stateobj *
-create_shader(gl_shader_stage type)
+create_shader(struct pipe_context *pctx, gl_shader_stage type)
{
struct fd2_shader_stateobj *so = CALLOC_STRUCT(fd2_shader_stateobj);
if (!so)
return NULL;
so->type = type;
+ so->is_a20x = is_a20x(fd_context(pctx)->screen);
return so;
}
{
if (!so)
return;
- ir2_shader_destroy(so->ir);
- free(so->tokens);
- free(so->bin);
+ ralloc_free(so->nir);
+ for (int i = 0; i < ARRAY_SIZE(so->variant); i++)
+ free(so->variant[i].info.dwords);
free(so);
}
-static struct fd2_shader_stateobj *
-assemble(struct fd2_shader_stateobj *so)
+static void
+emit(struct fd_ringbuffer *ring, gl_shader_stage type,
+ struct ir2_shader_info *info)
{
- free(so->bin);
- so->bin = ir2_shader_assemble(so->ir, &so->info);
- if (!so->bin)
- goto fail;
+ unsigned i;
- if (fd_mesa_debug & FD_DBG_DISASM) {
- DBG("disassemble: type=%d", so->type);
- disasm_a2xx(so->bin, so->info.sizedwords, 0, so->type);
- }
+ assert(info->sizedwords);
- return so;
+ OUT_PKT3(ring, CP_IM_LOAD_IMMEDIATE, 2 + info->sizedwords);
+ OUT_RING(ring, type == MESA_SHADER_FRAGMENT);
+ OUT_RING(ring, info->sizedwords);
+ for (i = 0; i < info->sizedwords; i++)
+ OUT_RING(ring, info->dwords[i]);
+}
-fail:
- debug_error("assemble failed!");
- delete_shader(so);
- return NULL;
+static int
+ir2_glsl_type_size(const struct glsl_type *type)
+{
+ return glsl_count_attribute_slots(type, false);
}
-static struct fd2_shader_stateobj *
-compile(struct fd_program_stateobj *prog, struct fd2_shader_stateobj *so)
+static void *
+fd2_fp_state_create(struct pipe_context *pctx,
+ const struct pipe_shader_state *cso)
{
- int ret;
+ struct fd2_shader_stateobj *so = create_shader(pctx, MESA_SHADER_FRAGMENT);
+ if (!so)
+ return NULL;
- if (fd_mesa_debug & FD_DBG_DISASM) {
- DBG("dump tgsi: type=%d", so->type);
- tgsi_dump(so->tokens, 0);
+ if (cso->type == PIPE_SHADER_IR_NIR) {
+ so->nir = cso->ir.nir;
+ NIR_PASS_V(so->nir, nir_lower_io, nir_var_all, ir2_glsl_type_size,
+ (nir_lower_io_options)0);
+ } else {
+ assert(cso->type == PIPE_SHADER_IR_TGSI);
+ so->nir = ir2_tgsi_to_nir(cso->tokens);
}
- ret = fd2_compile_shader(prog, so);
- if (ret)
+ if (ir2_optimize_nir(so->nir, true))
goto fail;
- /* NOTE: we don't assemble yet because for VS we don't know the
- * type information for vertex fetch yet.. so those need to be
- * patched up later before assembling.
- */
+ so->first_immediate = so->nir->num_uniforms;
- so->info.sizedwords = 0;
+ ir2_compile(so, 0, NULL);
+ ralloc_free(so->nir);
+ so->nir = NULL;
return so;
fail:
- debug_error("compile failed!");
delete_shader(so);
return NULL;
}
static void
-emit(struct fd_ringbuffer *ring, struct fd2_shader_stateobj *so)
-{
- unsigned i;
-
- if (so->info.sizedwords == 0)
- assemble(so);
-
- OUT_PKT3(ring, CP_IM_LOAD_IMMEDIATE, 2 + so->info.sizedwords);
- OUT_RING(ring, (so->type == MESA_SHADER_VERTEX) ? 0 : 1);
- OUT_RING(ring, so->info.sizedwords);
- for (i = 0; i < so->info.sizedwords; i++)
- OUT_RING(ring, so->bin[i]);
-}
-
-static void *
-fd2_fp_state_create(struct pipe_context *pctx,
- const struct pipe_shader_state *cso)
-{
- struct fd2_shader_stateobj *so = create_shader(MESA_SHADER_FRAGMENT);
- if (!so)
- return NULL;
- so->tokens = tgsi_dup_tokens(cso->tokens);
- return so;
-}
-
-static void
fd2_fp_state_delete(struct pipe_context *pctx, void *hwcso)
{
struct fd2_shader_stateobj *so = hwcso;
fd2_vp_state_create(struct pipe_context *pctx,
const struct pipe_shader_state *cso)
{
- struct fd2_shader_stateobj *so = create_shader(MESA_SHADER_VERTEX);
+ struct fd2_shader_stateobj *so = create_shader(pctx, MESA_SHADER_VERTEX);
if (!so)
return NULL;
- so->tokens = tgsi_dup_tokens(cso->tokens);
+
+ if (cso->type == PIPE_SHADER_IR_NIR) {
+ so->nir = cso->ir.nir;
+ NIR_PASS_V(so->nir, nir_lower_io, nir_var_all, ir2_glsl_type_size,
+ (nir_lower_io_options)0);
+ } else {
+ assert(cso->type == PIPE_SHADER_IR_TGSI);
+ so->nir = ir2_tgsi_to_nir(cso->tokens);
+ }
+
+ if (ir2_optimize_nir(so->nir, true))
+ goto fail;
+
+ so->first_immediate = so->nir->num_uniforms;
+
+ /* compile binning variant now */
+ ir2_compile(so, 0, NULL);
+
return so;
+
+fail:
+ delete_shader(so);
+ return NULL;
}
static void
}
static void
-patch_vtx_fetches(struct fd_context *ctx, struct fd2_shader_stateobj *so,
- struct fd_vertex_stateobj *vtx)
+patch_vtx_fetch(struct fd_context *ctx, struct pipe_vertex_element *elem,
+ instr_fetch_vtx_t *instr, uint16_t dst_swiz)
{
- unsigned i;
-
- assert(so->num_vfetch_instrs == vtx->num_elements);
-
- /* update vtx fetch instructions: */
- for (i = 0; i < so->num_vfetch_instrs; i++) {
- struct ir2_instruction *instr = so->vfetch_instrs[i];
- struct pipe_vertex_element *elem = &vtx->pipe[i];
- struct pipe_vertex_buffer *vb =
+ struct pipe_vertex_buffer *vb =
&ctx->vtx.vertexbuf.vb[elem->vertex_buffer_index];
- enum pipe_format format = elem->src_format;
- const struct util_format_description *desc =
- util_format_description(format);
- unsigned j;
-
- /* Find the first non-VOID channel. */
- for (j = 0; j < 4; j++)
- if (desc->channel[j].type != UTIL_FORMAT_TYPE_VOID)
- break;
-
- /* CI/CIS can probably be set in compiler instead: */
- instr->fetch.const_idx = 20 + (i / 3);
- instr->fetch.const_idx_sel = i % 3;
-
- instr->fetch.fmt = fd2_pipe2surface(format);
- instr->fetch.is_normalized = desc->channel[j].normalized;
- instr->fetch.is_signed =
- desc->channel[j].type == UTIL_FORMAT_TYPE_SIGNED;
- instr->fetch.stride = vb->stride ? : 1;
- instr->fetch.offset = elem->src_offset;
-
- for (j = 0; j < 4; j++)
- instr->dst_reg.swizzle[j] = "xyzw01__"[desc->swizzle[j]];
-
- assert(instr->fetch.fmt != ~0);
-
- DBG("vtx[%d]: %s (%d), ci=%d, cis=%d, id=%d, swizzle=%s, "
- "stride=%d, offset=%d",
- i, util_format_name(format),
- instr->fetch.fmt,
- instr->fetch.const_idx,
- instr->fetch.const_idx_sel,
- elem->instance_divisor,
- instr->dst_reg.swizzle,
- instr->fetch.stride,
- instr->fetch.offset);
+ enum pipe_format format = elem->src_format;
+ const struct util_format_description *desc =
+ util_format_description(format);
+ unsigned j;
+
+ /* Find the first non-VOID channel. */
+ for (j = 0; j < 4; j++)
+ if (desc->channel[j].type != UTIL_FORMAT_TYPE_VOID)
+ break;
+
+ instr->format = fd2_pipe2surface(format);
+ instr->num_format_all = !desc->channel[j].normalized;
+ instr->format_comp_all = desc->channel[j].type == UTIL_FORMAT_TYPE_SIGNED;
+ instr->stride = vb->stride;
+ instr->offset = elem->src_offset;
+
+ unsigned swiz = 0;
+ for (int i = 0; i < 4; i++) {
+ unsigned s = dst_swiz >> i*3 & 7;
+ swiz |= (s >= 4 ? s : desc->swizzle[s]) << i*3;
}
-
- /* trigger re-assemble: */
- so->info.sizedwords = 0;
+ instr->dst_swiz = swiz;
}
static void
-patch_tex_fetches(struct fd_context *ctx, struct fd2_shader_stateobj *so,
- struct fd_texture_stateobj *tex)
+patch_fetches(struct fd_context *ctx, struct ir2_shader_info *info,
+ struct fd_vertex_stateobj *vtx, struct fd_texture_stateobj *tex)
{
- unsigned i;
-
- /* update tex fetch instructions: */
- for (i = 0; i < so->num_tfetch_instrs; i++) {
- struct ir2_instruction *instr = so->tfetch_instrs[i].instr;
- unsigned samp_id = so->tfetch_instrs[i].samp_id;
- unsigned const_idx = fd2_get_const_idx(ctx, tex, samp_id);
+ for (int i = 0; i < info->num_fetch_instrs; i++) {
+ struct ir2_fetch_info *fi = &info->fetch_info[i];
+
+ instr_fetch_t *instr = (instr_fetch_t*) &info->dwords[fi->offset];
+ if (instr->opc == VTX_FETCH) {
+ unsigned idx = (instr->vtx.const_index - 20) * 3 +
+ instr->vtx.const_index_sel;
+ patch_vtx_fetch(ctx, &vtx->pipe[idx], &instr->vtx, fi->vtx.dst_swiz);
+ continue;
+ }
- if (const_idx != instr->fetch.const_idx) {
- instr->fetch.const_idx = const_idx;
- /* trigger re-assemble: */
- so->info.sizedwords = 0;
+ assert(instr->opc == TEX_FETCH);
+ instr->tex.const_idx = fd2_get_const_idx(ctx, tex, fi->tex.samp_id);
+ instr->tex.src_swiz = fi->tex.src_swiz;
+ if (fd2_texture_swap_xy(tex, fi->tex.samp_id)) {
+ unsigned x = instr->tex.src_swiz;
+ instr->tex.src_swiz = (x & 0x30) | (x & 3) << 2 | (x >> 2 & 3);
}
}
}
void
-fd2_program_validate(struct fd_context *ctx)
+fd2_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
+ struct fd_program_stateobj *prog)
{
- struct fd_program_stateobj *prog = &ctx->prog;
- bool dirty_fp = !!(ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_PROG);
- bool dirty_vp = !!(ctx->dirty_shader[PIPE_SHADER_VERTEX] & FD_DIRTY_SHADER_PROG);
-
- /* if vertex or frag shader is dirty, we may need to recompile. Compile
- * frag shader first, as that assigns the register slots for exports
- * from the vertex shader. And therefore if frag shader has changed we
- * need to recompile both vert and frag shader.
- */
- if (dirty_fp)
- compile(prog, prog->fp);
-
- if (dirty_fp || dirty_vp)
- compile(prog, prog->vp);
-
- /* if necessary, fix up vertex fetch instructions: */
- if (ctx->dirty & (FD_DIRTY_VTXSTATE | FD_DIRTY_PROG))
- patch_vtx_fetches(ctx, prog->vp, ctx->vtx.vtx);
-
- /* if necessary, fix up texture fetch instructions: */
- if (ctx->dirty & (FD_DIRTY_TEXSTATE | FD_DIRTY_PROG)) {
- patch_tex_fetches(ctx, prog->vp, &ctx->tex[PIPE_SHADER_VERTEX]);
- patch_tex_fetches(ctx, prog->fp, &ctx->tex[PIPE_SHADER_FRAGMENT]);
+ struct fd2_shader_stateobj *fp = NULL, *vp;
+ struct ir2_shader_info *fpi, *vpi;
+ struct ir2_frag_linkage *f;
+ uint8_t vs_gprs, fs_gprs = 0, vs_export = 0;
+ enum a2xx_sq_ps_vtx_mode mode = POSITION_1_VECTOR;
+ bool binning = (ctx->batch && ring == ctx->batch->binning);
+ unsigned variant = 0;
+
+ vp = prog->vp;
+
+ /* find variant matching the linked fragment shader */
+ if (!binning) {
+ fp = prog->fp;
+ for (variant = 1; variant < ARRAY_SIZE(vp->variant); variant++) {
+ /* if checked all variants, compile a new variant */
+ if (!vp->variant[variant].info.sizedwords) {
+ ir2_compile(vp, variant, fp);
+ break;
+ }
+
+ /* check if fragment shader linkage matches */
+ if (!memcmp(&vp->variant[variant].f, &fp->variant[0].f,
+ sizeof(struct ir2_frag_linkage)))
+ break;
+ }
+ assert(variant < ARRAY_SIZE(vp->variant));
}
-}
-void
-fd2_program_emit(struct fd_ringbuffer *ring,
- struct fd_program_stateobj *prog)
-{
- struct ir2_shader_info *vsi =
- &((struct fd2_shader_stateobj *)prog->vp)->info;
- struct ir2_shader_info *fsi =
- &((struct fd2_shader_stateobj *)prog->fp)->info;
- uint8_t vs_gprs, fs_gprs, vs_export;
+ vpi = &vp->variant[variant].info;
+ fpi = &fp->variant[0].info;
+ f = &fp->variant[0].f;
+
+ /* clear/gmem2mem/mem2gmem need to be changed to remove this condition */
+ if (prog != &ctx->solid_prog && prog != &ctx->blit_prog[0]) {
+ patch_fetches(ctx, vpi, ctx->vtx.vtx, &ctx->tex[PIPE_SHADER_VERTEX]);
+ if (fp)
+ patch_fetches(ctx, fpi, NULL, &ctx->tex[PIPE_SHADER_FRAGMENT]);
+ }
- emit(ring, prog->vp);
- emit(ring, prog->fp);
+ emit(ring, MESA_SHADER_VERTEX, vpi);
- vs_gprs = (vsi->max_reg < 0) ? 0x80 : vsi->max_reg;
- fs_gprs = (fsi->max_reg < 0) ? 0x80 : fsi->max_reg;
- vs_export = MAX2(1, prog->num_exports) - 1;
+ if (fp) {
+ emit(ring, MESA_SHADER_FRAGMENT, fpi);
+ fs_gprs = (fpi->max_reg < 0) ? 0x80 : fpi->max_reg;
+ vs_export = MAX2(1, f->inputs_count) - 1;
+ }
+
+ vs_gprs = (vpi->max_reg < 0) ? 0x80 : vpi->max_reg;
+
+ if (vp->writes_psize && !binning)
+ mode = POSITION_2_VECTORS_SPRITE;
+
+ /* set register to use for param (fragcoord/pointcoord/frontfacing) */
+ OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+ OUT_RING(ring, CP_REG(REG_A2XX_SQ_CONTEXT_MISC));
+ OUT_RING(ring, A2XX_SQ_CONTEXT_MISC_SC_SAMPLE_CNTL(CENTERS_ONLY) |
+ COND(fp, A2XX_SQ_CONTEXT_MISC_PARAM_GEN_POS(f->inputs_count)) |
+ /* we need SCREEN_XY for both fragcoord and frontfacing */
+ A2XX_SQ_CONTEXT_MISC_SC_OUTPUT_SCREEN_XY);
OUT_PKT3(ring, CP_SET_CONSTANT, 2);
OUT_RING(ring, CP_REG(REG_A2XX_SQ_PROGRAM_CNTL));
- OUT_RING(ring, A2XX_SQ_PROGRAM_CNTL_PS_EXPORT_MODE(POSITION_2_VECTORS_SPRITE) |
+ OUT_RING(ring, A2XX_SQ_PROGRAM_CNTL_PS_EXPORT_MODE(2) |
+ A2XX_SQ_PROGRAM_CNTL_VS_EXPORT_MODE(mode) |
A2XX_SQ_PROGRAM_CNTL_VS_RESOURCE |
A2XX_SQ_PROGRAM_CNTL_PS_RESOURCE |
A2XX_SQ_PROGRAM_CNTL_VS_EXPORT_COUNT(vs_export) |
A2XX_SQ_PROGRAM_CNTL_PS_REGS(fs_gprs) |
- A2XX_SQ_PROGRAM_CNTL_VS_REGS(vs_gprs));
-}
-
-/* Creates shader:
- * EXEC ADDR(0x2) CNT(0x1)
- * (S)FETCH: SAMPLE R0.xyzw = R0.xyx CONST(0) LOCATION(CENTER)
- * ALLOC PARAM/PIXEL SIZE(0x0)
- * EXEC_END ADDR(0x3) CNT(0x1)
- * ALU: MAXv export0 = R0, R0 ; gl_FragColor
- * NOP
- */
-static struct fd2_shader_stateobj *
-create_blit_fp(void)
-{
- struct fd2_shader_stateobj *so = create_shader(MESA_SHADER_FRAGMENT);
- struct ir2_instruction *instr;
-
- if (!so)
- return NULL;
-
- so->ir = ir2_shader_create();
-
- instr = ir2_instr_create_tex_fetch(so->ir, 0);
- ir2_dst_create(instr, 0, "xyzw", 0);
- ir2_reg_create(instr, 0, "xyx", IR2_REG_INPUT);
- instr->sync = true;
-
- instr = ir2_instr_create_alu_v(so->ir, MAXv);
- ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT);
- ir2_reg_create(instr, 0, NULL, 0);
- ir2_reg_create(instr, 0, NULL, 0);
-
- return assemble(so);
-}
-
-/* Creates shader:
-* EXEC ADDR(0x3) CNT(0x2)
-* FETCH: VERTEX R1.xy01 = R0.x FMT_32_32_FLOAT UNSIGNED STRIDE(8) CONST(26, 1)
-* FETCH: VERTEX R2.xyz1 = R0.x FMT_32_32_32_FLOAT UNSIGNED STRIDE(12) CONST(26, 0)
-* ALLOC POSITION SIZE(0x0)
-* EXEC ADDR(0x5) CNT(0x1)
-* ALU: MAXv export62 = R2, R2 ; gl_Position
-* ALLOC PARAM/PIXEL SIZE(0x0)
-* EXEC_END ADDR(0x6) CNT(0x1)
-* ALU: MAXv export0 = R1, R1
-* NOP
- */
-static struct fd2_shader_stateobj *
-create_blit_vp(void)
-{
- struct fd2_shader_stateobj *so = create_shader(MESA_SHADER_VERTEX);
- struct ir2_instruction *instr;
-
- if (!so)
- return NULL;
-
- so->ir = ir2_shader_create();
-
- instr = ir2_instr_create_vtx_fetch(so->ir, 26, 1, FMT_32_32_FLOAT, false, 8);
- instr->fetch.is_normalized = true;
- ir2_dst_create(instr, 1, "xy01", 0);
- ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
-
- instr = ir2_instr_create_vtx_fetch(so->ir, 26, 0, FMT_32_32_32_FLOAT, false, 12);
- instr->fetch.is_normalized = true;
- ir2_dst_create(instr, 2, "xyz1", 0);
- ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
-
- instr = ir2_instr_create_alu_v(so->ir, MAXv);
- ir2_dst_create(instr, 62, NULL, IR2_REG_EXPORT);
- ir2_reg_create(instr, 2, NULL, 0);
- ir2_reg_create(instr, 2, NULL, 0);
-
- instr = ir2_instr_create_alu_v(so->ir, MAXv);
- ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT);
- ir2_reg_create(instr, 1, NULL, 0);
- ir2_reg_create(instr, 1, NULL, 0);
-
- return assemble(so);
-}
-
-/* Creates shader:
- * ALLOC PARAM/PIXEL SIZE(0x0)
- * EXEC_END ADDR(0x1) CNT(0x1)
- * ALU: MAXv export0 = C0, C0 ; gl_FragColor
- */
-static struct fd2_shader_stateobj *
-create_solid_fp(void)
-{
- struct fd2_shader_stateobj *so = create_shader(MESA_SHADER_FRAGMENT);
- struct ir2_instruction *instr;
-
- if (!so)
- return NULL;
-
- so->ir = ir2_shader_create();
-
- instr = ir2_instr_create_alu_v(so->ir, MAXv);
- ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT);
- ir2_reg_create(instr, 0, NULL, IR2_REG_CONST);
- ir2_reg_create(instr, 0, NULL, IR2_REG_CONST);
-
- return assemble(so);
-}
-
-/* Creates shader:
- * EXEC ADDR(0x3) CNT(0x1)
- * (S)FETCH: VERTEX R1.xyz1 = R0.x FMT_32_32_32_FLOAT
- * UNSIGNED STRIDE(12) CONST(26, 0)
- * ALLOC POSITION SIZE(0x0)
- * EXEC ADDR(0x4) CNT(0x1)
- * ALU: MAXv export62 = R1, R1 ; gl_Position
- * ALLOC PARAM/PIXEL SIZE(0x0)
- * EXEC_END ADDR(0x5) CNT(0x0)
- */
-static struct fd2_shader_stateobj *
-create_solid_vp(void)
-{
- struct fd2_shader_stateobj *so = create_shader(MESA_SHADER_VERTEX);
- struct ir2_instruction *instr;
-
- if (!so)
- return NULL;
-
- so->ir = ir2_shader_create();
-
- instr = ir2_instr_create_vtx_fetch(so->ir, 26, 0, FMT_32_32_32_FLOAT, false, 12);
- ir2_dst_create(instr, 1, "xyz1", 0);
- ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
-
- instr = ir2_instr_create_alu_v(so->ir, MAXv);
- ir2_dst_create(instr, 62, NULL, IR2_REG_EXPORT);
- ir2_reg_create(instr, 1, NULL, 0);
- ir2_reg_create(instr, 1, NULL, 0);
-
-
- return assemble(so);
+ A2XX_SQ_PROGRAM_CNTL_VS_REGS(vs_gprs) |
+ COND(fp && fp->need_param, A2XX_SQ_PROGRAM_CNTL_PARAM_GEN) |
+ COND(!fp, A2XX_SQ_PROGRAM_CNTL_GEN_INDEX_VTX));
}
void
fd2_prog_init(struct pipe_context *pctx)
{
struct fd_context *ctx = fd_context(pctx);
+ struct fd_program_stateobj *prog;
+ struct fd2_shader_stateobj *so;
+ struct ir2_shader_info *info;
+ instr_fetch_vtx_t *instr;
pctx->create_fs_state = fd2_fp_state_create;
pctx->delete_fs_state = fd2_fp_state_delete;
fd_prog_init(pctx);
- ctx->solid_prog.fp = create_solid_fp();
- ctx->solid_prog.vp = create_solid_vp();
- ctx->blit_prog[0].fp = create_blit_fp();
- ctx->blit_prog[0].vp = create_blit_vp();
+ /* XXX maybe it's possible to reuse patch_vtx_fetch somehow? */
+
+ prog = &ctx->solid_prog;
+ so = prog->vp;
+ ir2_compile(prog->vp, 1, prog->fp);
+
+#define IR2_FETCH_SWIZ_XY01 0xb08
+#define IR2_FETCH_SWIZ_XYZ1 0xa88
+
+ info = &so->variant[1].info;
+
+ instr = (instr_fetch_vtx_t*) &info->dwords[info->fetch_info[0].offset];
+ instr->const_index = 26;
+ instr->const_index_sel = 0;
+ instr->format = FMT_32_32_32_FLOAT;
+ instr->format_comp_all = false;
+ instr->stride = 12;
+ instr->num_format_all = true;
+ instr->dst_swiz = IR2_FETCH_SWIZ_XYZ1;
+
+ prog = &ctx->blit_prog[0];
+ so = prog->vp;
+ ir2_compile(prog->vp, 1, prog->fp);
+
+ info = &so->variant[1].info;
+
+ instr = (instr_fetch_vtx_t*) &info->dwords[info->fetch_info[0].offset];
+ instr->const_index = 26;
+ instr->const_index_sel = 1;
+ instr->format = FMT_32_32_FLOAT;
+ instr->format_comp_all = false;
+ instr->stride = 8;
+ instr->num_format_all = false;
+ instr->dst_swiz = IR2_FETCH_SWIZ_XY01;
+
+ instr = (instr_fetch_vtx_t*) &info->dwords[info->fetch_info[1].offset];
+ instr->const_index = 26;
+ instr->const_index_sel = 0;
+ instr->format = FMT_32_32_32_FLOAT;
+ instr->format_comp_all = false;
+ instr->stride = 12;
+ instr->num_format_all = false;
+ instr->dst_swiz = IR2_FETCH_SWIZ_XYZ1;
}
#include "freedreno_context.h"
-#include "ir-a2xx.h"
+#include "ir2.h"
#include "disasm.h"
struct fd2_shader_stateobj {
+ nir_shader *nir;
gl_shader_stage type;
+ bool is_a20x;
- uint32_t *bin;
-
- struct tgsi_token *tokens;
-
- /* note that we defer compiling shader until we know both vs and ps..
- * and if one changes, we potentially need to recompile in order to
- * get varying linkages correct:
- */
- struct ir2_shader_info info;
- struct ir2_shader *ir;
-
- /* for vertex shaders, the fetch instructions which need to be
- * patched up before assembly:
- */
- unsigned num_vfetch_instrs;
- struct ir2_instruction *vfetch_instrs[64];
-
- /* for all shaders, any tex fetch instructions which need to be
- * patched before assembly:
+ /* note: the same set of immediates is used for all variants;
+ * it doesn't matter, other than a slightly larger command stream
*/
- unsigned num_tfetch_instrs;
- struct {
- unsigned samp_id;
- struct ir2_instruction *instr;
- } tfetch_instrs[64];
-
unsigned first_immediate; /* const reg # of first immediate */
unsigned num_immediates;
struct {
uint32_t val[4];
+ unsigned ncomp;
} immediates[64];
+
+ bool writes_psize;
+ bool need_param;
+
+ /* note:
+ * fragment shader only has one variant
+ * first vertex shader variant is always binning shader
+ * we should use a dynamic array but in the normal case there are
+ * only 2 variants (and 3 sometimes with GALLIUM_HUD)
+ */
+ struct ir2_shader_variant variant[8];
};
-void fd2_program_emit(struct fd_ringbuffer *ring,
+void fd2_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
struct fd_program_stateobj *prog);
-void fd2_program_validate(struct fd_context *ctx);
void fd2_prog_init(struct pipe_context *pctx);
SIN = 48,
COS = 49,
RETAIN_PREV = 50,
+ SCALAR_NONE = 63,
} instr_scalar_opc_t;
typedef enum {
KILLNEv = 27,
DSTv = 28,
MOVAv = 29,
+ VECTOR_NONE = 31,
} instr_vector_opc_t;
typedef struct PACKED {
};
/* constants have full 8-bit index */
struct {
- uint8_t src3_reg_const : 8;
- uint8_t src2_reg_const : 8;
- uint8_t src1_reg_const : 8;
+ uint8_t src3_reg_byte : 8;
+ uint8_t src2_reg_byte : 8;
+ uint8_t src1_reg_byte : 8;
};
};
instr_vector_opc_t vector_opc : 5;
instr_fetch_opc_t opc : 5;
uint32_t dummy0 : 27;
/* dword1: */
- uint32_t dummy1 : 32;
+ uint32_t dummy1 : 31;
+ uint8_t pred_select : 1;
/* dword2: */
- uint32_t dummy2 : 32;
+ uint32_t dummy2 : 31;
+ uint8_t pred_condition : 1;
};
} instr_fetch_t;
+typedef union PACKED {
+ instr_alu_t alu;
+ instr_fetch_t fetch;
+} instr_t;
+
#endif /* INSTR_H_ */
+++ /dev/null
-/*
- * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ir-a2xx.h"
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-
-#include "freedreno_util.h"
-#include "instr-a2xx.h"
-
-#define DEBUG_MSG(f, ...) do { if (0) DBG(f, ##__VA_ARGS__); } while (0)
-#define WARN_MSG(f, ...) DBG("WARN: "f, ##__VA_ARGS__)
-#define ERROR_MSG(f, ...) DBG("ERROR: "f, ##__VA_ARGS__)
-
-static int instr_emit(struct ir2_instruction *instr, uint32_t *dwords,
- uint32_t idx, struct ir2_shader_info *info);
-
-static uint32_t reg_fetch_src_swiz(struct ir2_src_register *reg, uint32_t n);
-static uint32_t reg_fetch_dst_swiz(struct ir2_dst_register *reg);
-static uint32_t reg_alu_dst_swiz(struct ir2_dst_register *reg);
-static uint32_t reg_alu_src_swiz(struct ir2_src_register *reg);
-
-/* simple allocator to carve allocations out of an up-front allocated heap,
- * so that we can free everything easily in one shot.
- */
-static void * ir2_alloc(struct ir2_shader *shader, int sz)
-{
- void *ptr = &shader->heap[shader->heap_idx];
- shader->heap_idx += align(sz, 4) / 4;
- return ptr;
-}
-
-static char * ir2_strdup(struct ir2_shader *shader, const char *str)
-{
- char *ptr = NULL;
- if (str) {
- int len = strlen(str);
- ptr = ir2_alloc(shader, len+1);
- memcpy(ptr, str, len);
- ptr[len] = '\0';
- }
- return ptr;
-}
-
-struct ir2_shader * ir2_shader_create(void)
-{
- DEBUG_MSG("");
- struct ir2_shader *shader = calloc(1, sizeof(struct ir2_shader));
- shader->max_reg = -1;
- return shader;
-}
-
-void ir2_shader_destroy(struct ir2_shader *shader)
-{
- DEBUG_MSG("");
- free(shader);
-}
-
-/* check if an instruction is a simple MOV
- */
-static struct ir2_instruction * simple_mov(struct ir2_instruction *instr,
- bool output)
-{
- struct ir2_src_register *src_reg = instr->src_reg;
- struct ir2_dst_register *dst_reg = &instr->dst_reg;
- struct ir2_register *reg;
- unsigned i;
-
- /* MAXv used for MOV */
- if (instr->instr_type != IR2_ALU_VECTOR ||
- instr->alu_vector.opc != MAXv)
- return NULL;
-
- /* non identical srcs */
- if (src_reg[0].num != src_reg[1].num)
- return NULL;
-
- /* flags */
- int flags = IR2_REG_NEGATE | IR2_REG_ABS;
- if (output)
- flags |= IR2_REG_INPUT | IR2_REG_CONST;
- if ((src_reg[0].flags & flags) || (src_reg[1].flags & flags))
- return NULL;
-
- /* clamping */
- if (instr->alu_vector.clamp)
- return NULL;
-
- /* swizzling */
- for (i = 0; i < 4; i++) {
- char swiz = (dst_reg->swizzle ? dst_reg->swizzle : "xyzw")[i];
- if (swiz == '_')
- continue;
-
- if (swiz != (src_reg[0].swizzle ? src_reg[0].swizzle : "xyzw")[i] ||
- swiz != (src_reg[1].swizzle ? src_reg[1].swizzle : "xyzw")[i])
- return NULL;
- }
-
- if (output)
- reg = &instr->shader->reg[src_reg[0].num];
- else
- reg = &instr->shader->reg[dst_reg->num];
-
- assert(reg->write_idx >= 0);
- if (reg->write_idx != reg->write_idx2)
- return NULL;
-
- if (!output)
- return instr;
-
- instr = instr->shader->instr[reg->write_idx];
- return instr->instr_type != IR2_ALU_VECTOR ? NULL : instr;
-}
-
-static int src_to_reg(struct ir2_instruction *instr,
- struct ir2_src_register *reg)
-{
- if (reg->flags & IR2_REG_CONST)
- return reg->num;
-
- return instr->shader->reg[reg->num].reg;
-}
-
-static int dst_to_reg(struct ir2_instruction *instr,
- struct ir2_dst_register *reg)
-{
- if (reg->flags & IR2_REG_EXPORT)
- return reg->num;
-
- return instr->shader->reg[reg->num].reg;
-}
-
-static bool mask_get(uint32_t *mask, unsigned index)
-{
- return !!(mask[index / 32] & 1 << index % 32);
-}
-
-static void mask_set(uint32_t *mask, struct ir2_register *reg, int index)
-{
- if (reg) {
- unsigned i;
- for (i = 0; i < ARRAY_SIZE(reg->regmask); i++)
- mask[i] |= reg->regmask[i];
- }
- if (index >= 0)
- mask[index / 32] |= 1 << index % 32;
-}
-
-static bool sets_pred(struct ir2_instruction *instr)
-{
- return instr->instr_type == IR2_ALU_SCALAR &&
- instr->alu_scalar.opc >= PRED_SETEs &&
- instr->alu_scalar.opc <= PRED_SET_RESTOREs;
-}
-
-
-
-void* ir2_shader_assemble(struct ir2_shader *shader,
- struct ir2_shader_info *info)
-{
- /* NOTES
- * blob compiler seems to always puts PRED_* instrs in a CF by
- * themselves, and wont combine EQ/NE in the same CF
- * (not doing this - doesn't seem to make a difference)
- *
- * TODO: implement scheduling for combining vector+scalar instructions
- * -some vector instructions can be replaced by scalar
- */
-
- /* first step:
- * 1. remove "NOP" MOV instructions generated by TGSI for input/output:
- * 2. track information for register allocation, and to remove
- * the dead code when some exports are not needed
- * 3. add additional instructions for a20x hw binning if needed
- * NOTE: modifies the shader instrs
- * this step could be done as instructions are added by compiler instead
- */
-
- /* mask of exports that must be generated
- * used to avoid calculating ps exports with hw binning
- */
- uint64_t export = ~0ull;
- /* bitmask of variables required for exports defined by "export" */
- uint32_t export_mask[REG_MASK/32+1] = {};
-
- unsigned idx, reg_idx;
- unsigned max_input = 0;
- int export_size = -1;
-
- for (idx = 0; idx < shader->instr_count; idx++) {
- struct ir2_instruction *instr = shader->instr[idx], *prev;
- struct ir2_dst_register dst_reg = instr->dst_reg;
-
- if (dst_reg.flags & IR2_REG_EXPORT) {
- if (dst_reg.num < 32)
- export_size++;
-
- if ((prev = simple_mov(instr, true))) {
- /* copy instruction but keep dst */
- *instr = *prev;
- instr->dst_reg = dst_reg;
- }
- }
-
- for (reg_idx = 0; reg_idx < instr->src_reg_count; reg_idx++) {
- struct ir2_src_register *src_reg = &instr->src_reg[reg_idx];
- struct ir2_register *reg;
- int num;
-
- if (src_reg->flags & IR2_REG_CONST)
- continue;
-
- num = src_reg->num;
- reg = &shader->reg[num];
- reg->read_idx = idx;
-
- if (src_reg->flags & IR2_REG_INPUT) {
- max_input = MAX2(max_input, num);
- } else {
- /* bypass simple mov used to set src_reg */
- assert(reg->write_idx >= 0);
- prev = shader->instr[reg->write_idx];
- if (simple_mov(prev, false)) {
- *src_reg = prev->src_reg[0];
- /* process same src_reg again */
- reg_idx -= 1;
- continue;
- }
- }
-
- /* update dependencies */
- uint32_t *mask = (dst_reg.flags & IR2_REG_EXPORT) ?
- export_mask : shader->reg[dst_reg.num].regmask;
- mask_set(mask, reg, num);
- if (sets_pred(instr))
- mask_set(export_mask, reg, num);
- }
- }
-
- /* second step:
- * emit instructions (with CFs) + RA
- */
- instr_cf_t cfs[128], *cf = cfs;
- uint32_t alufetch[3*256], *af = alufetch;
-
- /* RA is done on write, so inputs must be allocated here */
- for (reg_idx = 0; reg_idx <= max_input; reg_idx++)
- shader->reg[reg_idx].reg = reg_idx;
- info->max_reg = max_input;
-
- /* CF instr state */
- instr_cf_exec_t exec = { .opc = EXEC };
- instr_cf_alloc_t alloc = { .opc = ALLOC };
- bool need_alloc = 0;
- bool pos_export = 0;
-
- export_size = MAX2(export_size, 0);
-
- for (idx = 0; idx < shader->instr_count; idx++) {
- struct ir2_instruction *instr = shader->instr[idx];
- struct ir2_dst_register *dst_reg = &instr->dst_reg;
- unsigned num = dst_reg->num;
- struct ir2_register *reg;
-
- /* a2xx only has 64 registers, so we can use a single 64-bit mask */
- uint64_t regmask = 0ull;
-
- /* compute the current regmask */
- for (reg_idx = 0; (int) reg_idx <= shader->max_reg; reg_idx++) {
- reg = &shader->reg[reg_idx];
- if ((int) idx > reg->write_idx && idx < reg->read_idx)
- regmask |= (1ull << reg->reg);
- }
-
- if (dst_reg->flags & IR2_REG_EXPORT) {
- /* skip if export is not needed */
- if (!(export & (1ull << num)))
- continue;
-
- /* ALLOC CF:
- * want to alloc all < 32 at once
- * 32/33 and 62/63 come in pairs
- * XXX assuming all 3 types are never interleaved
- */
- if (num < 32) {
- alloc.size = export_size;
- alloc.buffer_select = SQ_PARAMETER_PIXEL;
- need_alloc = export_size >= 0;
- export_size = -1;
- } else if (num == 32 || num == 33) {
- alloc.size = 0;
- alloc.buffer_select = SQ_MEMORY;
- need_alloc = num != 33;
- } else {
- alloc.size = 0;
- alloc.buffer_select = SQ_POSITION;
- need_alloc = !pos_export;
- pos_export = true;
- }
-
- } else {
- /* skip if dst register not needed to compute exports */
- if (!mask_get(export_mask, num))
- continue;
-
- /* RA on first write */
- reg = &shader->reg[num];
- if (reg->write_idx == idx) {
- reg->reg = ffsll(~regmask) - 1;
- info->max_reg = MAX2(info->max_reg, reg->reg);
- }
- }
-
- if (exec.count == 6 || (exec.count && need_alloc)) {
- *cf++ = *(instr_cf_t*) &exec;
- exec.address += exec.count;
- exec.serialize = 0;
- exec.count = 0;
- }
-
- if (need_alloc) {
- *cf++ = *(instr_cf_t*) &alloc;
- need_alloc = false;
- }
-
- int ret = instr_emit(instr, af, idx, info); af += 3;
- assert(!ret);
-
- if (instr->instr_type == IR2_FETCH)
- exec.serialize |= 0x1 << exec.count * 2;
- if (instr->sync)
- exec.serialize |= 0x2 << exec.count * 2;
- exec.count += 1;
- }
-
-
- exec.opc = !export_size ? EXEC : EXEC_END;
- *cf++ = *(instr_cf_t*) &exec;
- exec.address += exec.count;
- exec.serialize = 0;
- exec.count = 0;
-
- /* GPU will hang without at least one pixel alloc */
- if (!export_size) {
- alloc.size = 0;
- alloc.buffer_select = SQ_PARAMETER_PIXEL;
- *cf++ = *(instr_cf_t*) &alloc;
-
- exec.opc = EXEC_END;
- *cf++ = *(instr_cf_t*) &exec;
- }
-
- unsigned num_cfs = cf - cfs;
-
- /* insert nop to get an even # of CFs */
- if (num_cfs % 2) {
- *cf++ = (instr_cf_t) { .opc = NOP };
- num_cfs++;
- }
-
- /* offset cf addrs */
- for (idx = 0; idx < num_cfs; idx++) {
- switch (cfs[idx].opc) {
- case EXEC:
- case EXEC_END:
- cfs[idx].exec.address += num_cfs / 2;
- break;
- default:
- break;
- /* XXX and any other address using cf that gets implemented */
- }
- }
-
- /* concatenate cfs+alufetchs */
- uint32_t cfdwords = num_cfs / 2 * 3;
- uint32_t alufetchdwords = exec.address * 3;
- info->sizedwords = cfdwords + alufetchdwords;
- uint32_t *dwords = malloc(info->sizedwords * 4);
- assert(dwords);
- memcpy(dwords, cfs, cfdwords * 4);
- memcpy(&dwords[cfdwords], alufetch, alufetchdwords * 4);
- return dwords;
-}
-
-struct ir2_instruction * ir2_instr_create(struct ir2_shader *shader,
- int instr_type)
-{
- struct ir2_instruction *instr =
- ir2_alloc(shader, sizeof(struct ir2_instruction));
- DEBUG_MSG("%d", instr_type);
- instr->shader = shader;
- instr->idx = shader->instr_count;
- instr->pred = shader->pred;
- instr->instr_type = instr_type;
- shader->instr[shader->instr_count++] = instr;
- return instr;
-}
-
-
-/*
- * FETCH instructions:
- */
-
-static int instr_emit_fetch(struct ir2_instruction *instr,
- uint32_t *dwords, uint32_t idx,
- struct ir2_shader_info *info)
-{
- instr_fetch_t *fetch = (instr_fetch_t *)dwords;
- struct ir2_dst_register *dst_reg = &instr->dst_reg;
- struct ir2_src_register *src_reg = &instr->src_reg[0];
-
- memset(fetch, 0, sizeof(*fetch));
-
- fetch->opc = instr->fetch.opc;
-
- if (instr->fetch.opc == VTX_FETCH) {
- instr_fetch_vtx_t *vtx = &fetch->vtx;
-
- assert(instr->fetch.stride <= 0xff);
- assert(instr->fetch.fmt <= 0x3f);
- assert(instr->fetch.const_idx <= 0x1f);
- assert(instr->fetch.const_idx_sel <= 0x3);
-
- vtx->src_reg = src_to_reg(instr, src_reg);
- vtx->src_swiz = reg_fetch_src_swiz(src_reg, 1);
- vtx->dst_reg = dst_to_reg(instr, dst_reg);
- vtx->dst_swiz = reg_fetch_dst_swiz(dst_reg);
- vtx->must_be_one = 1;
- vtx->const_index = instr->fetch.const_idx;
- vtx->const_index_sel = instr->fetch.const_idx_sel;
- vtx->format_comp_all = !!instr->fetch.is_signed;
- vtx->num_format_all = !instr->fetch.is_normalized;
- vtx->format = instr->fetch.fmt;
- vtx->stride = instr->fetch.stride;
- vtx->offset = instr->fetch.offset;
-
- if (instr->pred != IR2_PRED_NONE) {
- vtx->pred_select = 1;
- vtx->pred_condition = (instr->pred == IR2_PRED_EQ) ? 1 : 0;
- }
-
- /* XXX seems like every FETCH but the first has
- * this bit set:
- */
- vtx->reserved3 = (idx > 0) ? 0x1 : 0x0;
- vtx->reserved0 = (idx > 0) ? 0x2 : 0x3;
- } else if (instr->fetch.opc == TEX_FETCH) {
- instr_fetch_tex_t *tex = &fetch->tex;
-
- assert(instr->fetch.const_idx <= 0x1f);
-
- tex->src_reg = src_to_reg(instr, src_reg);
- tex->src_swiz = reg_fetch_src_swiz(src_reg, 3);
- tex->dst_reg = dst_to_reg(instr, dst_reg);
- tex->dst_swiz = reg_fetch_dst_swiz(dst_reg);
- tex->const_idx = instr->fetch.const_idx;
- tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
- tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
- tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
- tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
- tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
- tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
- tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
- tex->use_comp_lod = 1;
- tex->use_reg_lod = !instr->fetch.is_cube;
- tex->sample_location = SAMPLE_CENTER;
- tex->tx_coord_denorm = instr->fetch.is_rect;
-
- if (instr->pred != IR2_PRED_NONE) {
- tex->pred_select = 1;
- tex->pred_condition = (instr->pred == IR2_PRED_EQ) ? 1 : 0;
- }
-
- } else {
- ERROR_MSG("invalid fetch opc: %d\n", instr->fetch.opc);
- return -1;
- }
-
- return 0;
-}
-
-/*
- * ALU instructions:
- */
-
-static int instr_emit_alu(struct ir2_instruction *instr_v,
- struct ir2_instruction *instr_s, uint32_t *dwords,
- struct ir2_shader_info *info)
-{
- instr_alu_t *alu = (instr_alu_t *)dwords;
- struct ir2_dst_register *vdst_reg, *sdst_reg;
- struct ir2_src_register *src1_reg, *src2_reg, *src3_reg;
- struct ir2_shader *shader = instr_v ? instr_v->shader : instr_s->shader;
- enum ir2_pred pred = IR2_PRED_NONE;
-
- memset(alu, 0, sizeof(*alu));
-
- vdst_reg = NULL;
- sdst_reg = NULL;
- src1_reg = NULL;
- src2_reg = NULL;
- src3_reg = NULL;
-
- if (instr_v) {
- vdst_reg = &instr_v->dst_reg;
- assert(instr_v->src_reg_count >= 2);
- src1_reg = &instr_v->src_reg[0];
- src2_reg = &instr_v->src_reg[1];
- if (instr_v->src_reg_count > 2)
- src3_reg = &instr_v->src_reg[2];
- pred = instr_v->pred;
- }
-
- if (instr_s) {
- sdst_reg = &instr_s->dst_reg;
- assert(instr_s->src_reg_count == 1);
- assert(!instr_v || vdst_reg->flags == sdst_reg->flags);
- assert(!instr_v || pred == instr_s->pred);
- if (src3_reg) {
- assert(src3_reg->flags == instr_s->src_reg[0].flags);
- assert(src3_reg->num == instr_s->src_reg[0].num);
- assert(!strcmp(src3_reg->swizzle, instr_s->src_reg[0].swizzle));
- }
- src3_reg = &instr_s->src_reg[0];
- pred = instr_s->pred;
- }
-
- if (vdst_reg) {
- assert((vdst_reg->flags & ~IR2_REG_EXPORT) == 0);
- assert(!vdst_reg->swizzle || (strlen(vdst_reg->swizzle) == 4));
- alu->vector_opc = instr_v->alu_vector.opc;
- alu->vector_write_mask = reg_alu_dst_swiz(vdst_reg);
- alu->vector_dest = dst_to_reg(instr_v, vdst_reg);
- } else {
- alu->vector_opc = MAXv;
- }
-
- if (sdst_reg) {
- alu->scalar_opc = instr_s->alu_scalar.opc;
- alu->scalar_write_mask = reg_alu_dst_swiz(sdst_reg);
- alu->scalar_dest = dst_to_reg(instr_s, sdst_reg);
- } else {
- /* not sure if this is required, but adreno compiler seems
- * to always set scalar opc to MAXs if it is not used:
- */
- alu->scalar_opc = MAXs;
- }
-
- alu->export_data =
- !!((instr_v ? vdst_reg : sdst_reg)->flags & IR2_REG_EXPORT);
-
- /* export32 has this bit set.. it seems to do more than just set
- * the base address of the constants used to zero
- * TODO make this less of a hack
- */
- if (alu->export_data && alu->vector_dest == 32) {
- assert(!instr_s);
- alu->relative_addr = 1;
- }
-
- if (src1_reg) {
- if (src1_reg->flags & IR2_REG_CONST) {
- assert(!(src1_reg->flags & IR2_REG_ABS));
- alu->src1_reg_const = src1_reg->num;
- } else {
- alu->src1_reg = shader->reg[src1_reg->num].reg;
- alu->src1_reg_abs = !!(src1_reg->flags & IR2_REG_ABS);
- }
- alu->src1_swiz = reg_alu_src_swiz(src1_reg);
- alu->src1_reg_negate = !!(src1_reg->flags & IR2_REG_NEGATE);
- alu->src1_sel = !(src1_reg->flags & IR2_REG_CONST);
- } else {
- alu->src1_sel = 1;
- }
-
- if (src2_reg) {
- if (src2_reg->flags & IR2_REG_CONST) {
- assert(!(src2_reg->flags & IR2_REG_ABS));
- alu->src2_reg_const = src2_reg->num;
- } else {
- alu->src2_reg = shader->reg[src2_reg->num].reg;
- alu->src2_reg_abs = !!(src2_reg->flags & IR2_REG_ABS);
- }
- alu->src2_swiz = reg_alu_src_swiz(src2_reg);
- alu->src2_reg_negate = !!(src2_reg->flags & IR2_REG_NEGATE);
- alu->src2_sel = !(src2_reg->flags & IR2_REG_CONST);
- } else {
- alu->src2_sel = 1;
- }
-
- if (src3_reg) {
- if (src3_reg->flags & IR2_REG_CONST) {
- assert(!(src3_reg->flags & IR2_REG_ABS));
- alu->src3_reg_const = src3_reg->num;
- } else {
- alu->src3_reg = shader->reg[src3_reg->num].reg;
- alu->src3_reg_abs = !!(src3_reg->flags & IR2_REG_ABS);
- }
- alu->src3_swiz = reg_alu_src_swiz(src3_reg);
- alu->src3_reg_negate = !!(src3_reg->flags & IR2_REG_NEGATE);
- alu->src3_sel = !(src3_reg->flags & IR2_REG_CONST);
- } else {
- /* not sure if this is required, but adreno compiler seems
- * to always set register bank for 3rd src if unused:
- */
- alu->src3_sel = 1;
- }
-
- alu->vector_clamp = instr_v ? instr_v->alu_vector.clamp : 0;
- alu->scalar_clamp = instr_s ? instr_s->alu_scalar.clamp : 0;
-
- if (pred != IR2_PRED_NONE)
- alu->pred_select = (pred == IR2_PRED_EQ) ? 3 : 2;
-
- return 0;
-}
-
-static int instr_emit(struct ir2_instruction *instr, uint32_t *dwords,
- uint32_t idx, struct ir2_shader_info *info)
-{
- switch (instr->instr_type) {
- case IR2_FETCH: return instr_emit_fetch(instr, dwords, idx, info);
- case IR2_ALU_VECTOR: return instr_emit_alu(instr, NULL, dwords, info);
- case IR2_ALU_SCALAR: return instr_emit_alu(NULL, instr, dwords, info);
- }
- return -1;
-}
-
-struct ir2_dst_register * ir2_dst_create(struct ir2_instruction *instr,
- int num, const char *swizzle, int flags)
-{
- if (!(flags & IR2_REG_EXPORT)) {
- struct ir2_register *reg = &instr->shader->reg[num];
-
- unsigned i;
- for (i = instr->shader->max_reg + 1; i <= num; i++)
- instr->shader->reg[i].write_idx = -1;
- instr->shader->max_reg = i - 1;
-
- if (reg->write_idx < 0)
- reg->write_idx = instr->idx;
- reg->write_idx2 = instr->idx;
- }
-
- struct ir2_dst_register *reg = &instr->dst_reg;
- reg->flags = flags;
- reg->num = num;
- reg->swizzle = ir2_strdup(instr->shader, swizzle);
- return reg;
-}
-
-struct ir2_src_register * ir2_reg_create(struct ir2_instruction *instr,
- int num, const char *swizzle, int flags)
-{
- assert(instr->src_reg_count + 1 <= ARRAY_SIZE(instr->src_reg));
- if (!(flags & IR2_REG_CONST)) {
- struct ir2_register *reg = &instr->shader->reg[num];
-
- reg->read_idx = instr->idx;
-
- unsigned i;
- for (i = instr->shader->max_reg + 1; i <= num; i++)
- instr->shader->reg[i].write_idx = -1;
- instr->shader->max_reg = i - 1;
- }
-
- struct ir2_src_register *reg = &instr->src_reg[instr->src_reg_count++];
- reg->flags = flags;
- reg->num = num;
- reg->swizzle = ir2_strdup(instr->shader, swizzle);
- return reg;
-}
-
-static uint32_t reg_fetch_src_swiz(struct ir2_src_register *reg, uint32_t n)
-{
- uint32_t swiz = 0;
- int i;
-
- assert((reg->flags & ~IR2_REG_INPUT) == 0);
- assert(reg->swizzle);
-
- DEBUG_MSG("fetch src R%d.%s", reg->num, reg->swizzle);
-
- for (i = n-1; i >= 0; i--) {
- swiz <<= 2;
- switch (reg->swizzle[i]) {
- default:
- ERROR_MSG("invalid fetch src swizzle: %s", reg->swizzle);
- case 'x': swiz |= 0x0; break;
- case 'y': swiz |= 0x1; break;
- case 'z': swiz |= 0x2; break;
- case 'w': swiz |= 0x3; break;
- }
- }
-
- return swiz;
-}
-
-static uint32_t reg_fetch_dst_swiz(struct ir2_dst_register *reg)
-{
- uint32_t swiz = 0;
- int i;
-
- assert(reg->flags == 0);
- assert(!reg->swizzle || (strlen(reg->swizzle) == 4));
-
- DEBUG_MSG("fetch dst R%d.%s", reg->num, reg->swizzle);
-
- if (reg->swizzle) {
- for (i = 3; i >= 0; i--) {
- swiz <<= 3;
- switch (reg->swizzle[i]) {
- default:
- ERROR_MSG("invalid dst swizzle: %s", reg->swizzle);
- case 'x': swiz |= 0x0; break;
- case 'y': swiz |= 0x1; break;
- case 'z': swiz |= 0x2; break;
- case 'w': swiz |= 0x3; break;
- case '0': swiz |= 0x4; break;
- case '1': swiz |= 0x5; break;
- case '_': swiz |= 0x7; break;
- }
- }
- } else {
- swiz = 0x688;
- }
-
- return swiz;
-}
-
-/* actually, a write-mask */
-static uint32_t reg_alu_dst_swiz(struct ir2_dst_register *reg)
-{
- uint32_t swiz = 0;
- int i;
-
- assert((reg->flags & ~IR2_REG_EXPORT) == 0);
- assert(!reg->swizzle || (strlen(reg->swizzle) == 4));
-
- DEBUG_MSG("alu dst R%d.%s", reg->num, reg->swizzle);
-
- if (reg->swizzle) {
- for (i = 3; i >= 0; i--) {
- swiz <<= 1;
- if (reg->swizzle[i] == "xyzw"[i]) {
- swiz |= 0x1;
- } else if (reg->swizzle[i] != '_') {
- ERROR_MSG("invalid dst swizzle: %s", reg->swizzle);
- break;
- }
- }
- } else {
- swiz = 0xf;
- }
-
- return swiz;
-}
-
-static uint32_t reg_alu_src_swiz(struct ir2_src_register *reg)
-{
- uint32_t swiz = 0;
- int i;
-
- assert(!reg->swizzle || (strlen(reg->swizzle) == 4));
-
- DEBUG_MSG("vector src R%d.%s", reg->num, reg->swizzle);
-
- if (reg->swizzle) {
- for (i = 3; i >= 0; i--) {
- swiz <<= 2;
- switch (reg->swizzle[i]) {
- default:
- ERROR_MSG("invalid vector src swizzle: %s", reg->swizzle);
- case 'x': swiz |= (0x0 - i) & 0x3; break;
- case 'y': swiz |= (0x1 - i) & 0x3; break;
- case 'z': swiz |= (0x2 - i) & 0x3; break;
- case 'w': swiz |= (0x3 - i) & 0x3; break;
- }
- }
- } else {
- swiz = 0x0;
- }
-
- return swiz;
-}
+++ /dev/null
-/*
- * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef IR2_H_
-#define IR2_H_
-
-#include <stdint.h>
-#include <stdbool.h>
-
-#include "instr-a2xx.h"
-
-/* low level intermediate representation of an adreno a2xx shader program */
-
-struct ir2_shader;
-
-#define REG_MASK 0xff
-
-struct ir2_shader_info {
- uint16_t sizedwords;
- int8_t max_reg; /* highest GPR # used by shader */
-};
-
-struct ir2_register {
- int16_t write_idx, write_idx2, read_idx, reg;
- /* bitmask of variables on which this one depends
- * XXX: use bitmask util?
- */
- uint32_t regmask[REG_MASK/32+1];
-};
-
-struct ir2_src_register {
- enum {
- IR2_REG_INPUT = 0x1,
- IR2_REG_CONST = 0x2,
- IR2_REG_NEGATE = 0x4,
- IR2_REG_ABS = 0x8,
- } flags;
- int num;
- char *swizzle;
-};
-
-struct ir2_dst_register {
- enum {
- IR2_REG_EXPORT = 0x1,
- } flags;
- int num;
- char *swizzle;
-};
-
-enum ir2_pred {
- IR2_PRED_NONE = 0,
- IR2_PRED_EQ = 1,
- IR2_PRED_NE = 2,
-};
-
-struct ir2_instruction {
- struct ir2_shader *shader;
- unsigned idx;
- enum {
- IR2_FETCH,
- IR2_ALU_VECTOR,
- IR2_ALU_SCALAR,
- } instr_type;
- enum ir2_pred pred;
- int sync;
- unsigned src_reg_count;
- struct ir2_dst_register dst_reg;
- struct ir2_src_register src_reg[3];
- union {
- /* FETCH specific: */
- struct {
- instr_fetch_opc_t opc;
- unsigned const_idx;
- /* texture fetch specific: */
- bool is_cube : 1;
- bool is_rect : 1;
- /* vertex fetch specific: */
- unsigned const_idx_sel;
- enum a2xx_sq_surfaceformat fmt;
- bool is_signed : 1;
- bool is_normalized : 1;
- uint32_t stride;
- uint32_t offset;
- } fetch;
- /* ALU-Vector specific: */
- struct {
- instr_vector_opc_t opc;
- bool clamp;
- } alu_vector;
- /* ALU-Scalar specific: */
- struct {
- instr_scalar_opc_t opc;
- bool clamp;
- } alu_scalar;
- };
-};
-
-struct ir2_shader {
- unsigned instr_count;
- int max_reg;
- struct ir2_register reg[REG_MASK+1];
-
- struct ir2_instruction *instr[0x200];
- uint32_t heap[100 * 4096];
- unsigned heap_idx;
-
- enum ir2_pred pred; /* pred inherited by newly created instrs */
-};
-
-struct ir2_shader * ir2_shader_create(void);
-void ir2_shader_destroy(struct ir2_shader *shader);
-void * ir2_shader_assemble(struct ir2_shader *shader,
- struct ir2_shader_info *info);
-
-struct ir2_instruction * ir2_instr_create(struct ir2_shader *shader,
- int instr_type);
-
-struct ir2_dst_register * ir2_dst_create(struct ir2_instruction *instr,
- int num, const char *swizzle, int flags);
-struct ir2_src_register * ir2_reg_create(struct ir2_instruction *instr,
- int num, const char *swizzle, int flags);
-
-/* some helper fxns: */
-
-static inline struct ir2_instruction *
-ir2_instr_create_alu_v(struct ir2_shader *shader, instr_vector_opc_t vop)
-{
- struct ir2_instruction *instr = ir2_instr_create(shader, IR2_ALU_VECTOR);
- if (!instr)
- return instr;
- instr->alu_vector.opc = vop;
- return instr;
-}
-
-static inline struct ir2_instruction *
-ir2_instr_create_alu_s(struct ir2_shader *shader, instr_scalar_opc_t sop)
-{
- struct ir2_instruction *instr = ir2_instr_create(shader, IR2_ALU_SCALAR);
- if (!instr)
- return instr;
- instr->alu_scalar.opc = sop;
- return instr;
-}
-
-static inline struct ir2_instruction *
-ir2_instr_create_vtx_fetch(struct ir2_shader *shader, int ci, int cis,
- enum a2xx_sq_surfaceformat fmt, bool is_signed, int stride)
-{
- struct ir2_instruction *instr = ir2_instr_create(shader, IR2_FETCH);
- instr->fetch.opc = VTX_FETCH;
- instr->fetch.const_idx = ci;
- instr->fetch.const_idx_sel = cis;
- instr->fetch.fmt = fmt;
- instr->fetch.is_signed = is_signed;
- instr->fetch.stride = stride;
- return instr;
-}
-static inline struct ir2_instruction *
-ir2_instr_create_tex_fetch(struct ir2_shader *shader, int ci)
-{
- struct ir2_instruction *instr = ir2_instr_create(shader, IR2_FETCH);
- instr->fetch.opc = TEX_FETCH;
- instr->fetch.const_idx = ci;
- return instr;
-}
-
-
-#endif /* IR2_H_ */
--- /dev/null
+/*
+ * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Jonathan Marek <jonathan@marek.ca>
+ */
+
+#include "ir2_private.h"
+
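+/* scheduling notes:
+ * each a2xx ALU instruction word has both a vector and a scalar slot
+ * (see instr_alu_t), so the scheduler below tries to pair a vector
+ * instruction with a compatible instruction that has a scalar opcode
+ * and a single source component, filling the otherwise unused slot
+ */
+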
+static bool scalar_possible(struct ir2_instr *instr)
+{
+ if (instr->alu.scalar_opc == SCALAR_NONE)
+ return false;
+
+ return src_ncomp(instr) == 1;
+}
+
+static bool is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b)
+{
+ if (!a)
+ return true;
+
+	/* don't use the same instruction twice */
+ if (a == b)
+ return false;
+
+ /* PRED_SET must be alone */
+ if (b->alu.scalar_opc >= PRED_SETEs &&
+ b->alu.scalar_opc <= PRED_SET_RESTOREs)
+ return false;
+
+ /* must write to same export (issues otherwise?) */
+ return a->alu.export == b->alu.export;
+}
+
+/* priority of vector instruction for scheduling (lower=higher prio) */
+static unsigned alu_vector_prio(struct ir2_instr *instr)
+{
+ if (instr->alu.vector_opc == VECTOR_NONE)
+ return ~0u;
+
+ if (is_export(instr))
+ return 4;
+
+ /* TODO check src type and ncomps */
+ if (instr->src_count == 3)
+ return 0;
+
+ if (!scalar_possible(instr))
+ return 1;
+
+ return instr->src_count == 2 ? 2 : 3;
+}
+
+/* priority of scalar instruction for scheduling (lower=higher prio) */
+static unsigned alu_scalar_prio(struct ir2_instr *instr)
+{
+ if (!scalar_possible(instr))
+ return ~0u;
+
+ /* this case is dealt with later */
+ if (instr->src_count > 1)
+ return ~0u;
+
+ if (is_export(instr))
+ return 4;
+
+ /* PRED to end of block */
+ if (instr->alu.scalar_opc >= PRED_SETEs &&
+ instr->alu.scalar_opc <= PRED_SET_RESTOREs)
+ return 5;
+
+	/* scalar-only instructions get the highest priority */
+ return instr->alu.vector_opc == VECTOR_NONE ? 0 : 3;
+}
+
+/* fill sched with next fetch or (vector and/or scalar) alu instruction */
+static int sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched)
+{
+ struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL;
+ unsigned avail_count = 0;
+
+ instr_alloc_type_t export = ~0u;
+ int block_idx = -1;
+
+ /* XXX merge this loop with the other one somehow? */
+ ir2_foreach_instr(instr, ctx) {
+ if (!instr->need_emit)
+ continue;
+ if (is_export(instr))
+ export = MIN2(export, export_buf(instr->alu.export));
+ }
+
+ ir2_foreach_instr(instr, ctx) {
+ if (!instr->need_emit)
+ continue;
+
+		/* don't mix exports */
+ if (is_export(instr) && export_buf(instr->alu.export) != export)
+ continue;
+
+ if (block_idx < 0)
+ block_idx = instr->block_idx;
+ else if (block_idx != instr->block_idx || /* must be same block */
+ instr->type == IR2_CF || /* CF/MEM must be alone */
+ (is_export(instr) && export == SQ_MEMORY))
+ break;
+		/* this works because IR2_CF is always at the end of a block;
+		 * MEM exports follow roughly the same idea: they might not be alone,
+		 * but they will at least end up in order
+		 */
+
+ /* check if dependencies are satisfied */
+ bool is_ok = true;
+ ir2_foreach_src(src, instr) {
+ if (src->type == IR2_SRC_REG) {
+ /* need to check if all previous instructions in the block
+ * which write the reg have been emitted
+ * slow..
+ * XXX: check components instead of whole register
+ */
+ struct ir2_reg *reg = get_reg_src(ctx, src);
+ ir2_foreach_instr(p, ctx) {
+ if (!p->is_ssa && p->reg == reg && p->idx < instr->idx)
+ is_ok &= !p->need_emit;
+ }
+ } else if (src->type == IR2_SRC_SSA) {
+ is_ok &= !ctx->instr[src->num].need_emit;
+ }
+ }
+ if (!is_ok)
+ continue;
+
+ avail[avail_count++] = instr;
+ }
+
+ if (!avail_count) {
+ assert(block_idx == -1);
+ return -1;
+ }
+
+ /* priority to FETCH instructions */
+ ir2_foreach_avail(instr) {
+ if (instr->type == IR2_ALU)
+ continue;
+
+ ra_src_free(ctx, instr);
+ ra_reg(ctx, get_reg(instr), -1, false, 0);
+
+ instr->need_emit = false;
+ sched->instr = instr;
+ sched->instr_s = NULL;
+ return block_idx;
+ }
+
+ /* TODO precompute priorities */
+
+ unsigned prio_v = ~0u, prio_s = ~0u, prio;
+ ir2_foreach_avail(instr) {
+ prio = alu_vector_prio(instr);
+ if (prio < prio_v) {
+ instr_v = instr;
+ prio_v = prio;
+ }
+ }
+
+ /* TODO can still insert scalar if src_count=3, if smart about it */
+ if (!instr_v || instr_v->src_count < 3) {
+ ir2_foreach_avail(instr) {
+ bool compat = is_alu_compatible(instr_v, instr);
+
+ prio = alu_scalar_prio(instr);
+ if (prio >= prio_v && !compat)
+ continue;
+
+ if (prio < prio_s) {
+ instr_s = instr;
+ prio_s = prio;
+ if (!compat)
+ instr_v = NULL;
+ }
+ }
+ }
+
+ assert(instr_v || instr_s);
+
+ /* free src registers */
+ if (instr_v) {
+ instr_v->need_emit = false;
+ ra_src_free(ctx, instr_v);
+ }
+
+ if (instr_s) {
+ instr_s->need_emit = false;
+ ra_src_free(ctx, instr_s);
+ }
+
+ /* allocate dst registers */
+ if (instr_v)
+ ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v), instr_v->alu.write_mask);
+
+ if (instr_s)
+ ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s), instr_s->alu.write_mask);
+
+ sched->instr = instr_v;
+ sched->instr_s = instr_s;
+ return block_idx;
+}
+
+/* scheduling: determine order of instructions */
+static void schedule_instrs(struct ir2_context *ctx)
+{
+ struct ir2_sched_instr *sched;
+ int block_idx;
+
+ /* allocate input registers */
+ for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++)
+ if (ctx->input[idx].initialized)
+ ra_reg(ctx, &ctx->input[idx], idx, false, 0);
+
+ for (;;) {
+ sched = &ctx->instr_sched[ctx->instr_sched_count++];
+ block_idx = sched_next(ctx, sched);
+ if (block_idx < 0)
+ break;
+ memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state));
+
+ /* catch texture fetch after scheduling and insert the
+ * SET_TEX_LOD right before it if necessary
+ * TODO clean this up
+ */
+ struct ir2_instr *instr = sched->instr, *tex_lod;
+ if (instr && instr->type == IR2_FETCH &&
+ instr->fetch.opc == TEX_FETCH && instr->src_count == 2) {
+ /* generate the SET_LOD instruction */
+ tex_lod = &ctx->instr[ctx->instr_count++];
+ tex_lod->type = IR2_FETCH;
+ tex_lod->block_idx = instr->block_idx;
+ tex_lod->pred = instr->pred;
+ tex_lod->fetch.opc = TEX_SET_TEX_LOD;
+ tex_lod->src[0] = instr->src[1];
+ tex_lod->src_count = 1;
+
+ sched[1] = sched[0];
+ sched->instr = tex_lod;
+ ctx->instr_sched_count++;
+ }
+
+ bool free_block = true;
+ ir2_foreach_instr(instr, ctx)
+ free_block &= instr->block_idx != block_idx;
+ if (free_block)
+ ra_block_free(ctx, block_idx);
+	}
+ ctx->instr_sched_count--;
+}
+
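+/* compile one variant of a shader state object.  for a vertex shader,
+ * fp == NULL means the binning variant is compiled; otherwise the
+ * fragment shader linkage (fp->variant[0].f) is copied in so that the
+ * VS outputs line up with the FS inputs
+ */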
+void
+ir2_compile(struct fd2_shader_stateobj *so, unsigned variant,
+ struct fd2_shader_stateobj *fp)
+{
+ struct ir2_context ctx = { };
+ bool binning = !fp && so->type == MESA_SHADER_VERTEX;
+
+ if (fp)
+ so->variant[variant].f = fp->variant[0].f;
+
+ ctx.so = so;
+ ctx.info = &so->variant[variant].info;
+ ctx.f = &so->variant[variant].f;
+ ctx.info->max_reg = -1;
+
+ /* convert nir to internal representation */
+ ir2_nir_compile(&ctx, binning);
+
+ /* get ref_counts and kill non-needed instructions */
+ ra_count_refs(&ctx);
+
+ /* instruction order.. and vector->scalar conversions */
+ schedule_instrs(&ctx);
+
+ /* finally, assemble to bitcode */
+ assemble(&ctx, binning);
+}
--- /dev/null
+/*
+ * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Jonathan Marek <jonathan@marek.ca>
+ */
+
+#ifndef IR2_H_
+#define IR2_H_
+
+#include "compiler/nir/nir.h"
+
+struct ir2_fetch_info {
+ /* dword offset of the fetch instruction */
+ uint16_t offset;
+ union {
+ /* swizzle to merge with tgsi swizzle */
+ struct {
+ uint16_t dst_swiz;
+ } vtx;
+ /* sampler id to patch const_idx */
+ struct {
+ uint16_t samp_id;
+ uint8_t src_swiz;
+ } tex;
+ };
+};
+
+struct ir2_shader_info {
+	/* compiled shader binary */
+ uint32_t *dwords;
+
+ /* size of the compiled shader in dwords */
+ uint16_t sizedwords;
+
+ /* highest GPR # used by shader */
+ int8_t max_reg;
+
+ /* offset in dwords of first MEMORY export CF (for a20x hw binning) */
+ int16_t mem_export_ptr;
+
+ /* fetch instruction info for patching */
+ uint16_t num_fetch_instrs;
+ struct ir2_fetch_info fetch_info[64];
+};
+
+struct ir2_frag_linkage {
+ unsigned inputs_count;
+ struct {
+ uint8_t slot;
+ uint8_t ncomp;
+ } inputs[16];
+
+ /* driver_location of fragcoord.zw, -1 if not used */
+ int fragcoord;
+};
+
+struct ir2_shader_variant {
+ struct ir2_shader_info info;
+ struct ir2_frag_linkage f;
+};
+
+struct fd2_shader_stateobj;
+struct tgsi_token;
+
+void ir2_compile(struct fd2_shader_stateobj *so, unsigned variant,
+ struct fd2_shader_stateobj *fp);
+
+struct nir_shader *ir2_tgsi_to_nir(const struct tgsi_token *tokens);
+
+const nir_shader_compiler_options *ir2_get_compiler_options(void);
+
+int ir2_optimize_nir(nir_shader *s, bool lower);
+
+#endif /* IR2_H_ */
--- /dev/null
+/*
+ * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Jonathan Marek <jonathan@marek.ca>
+ */
+
+#include "ir2_private.h"
+
+static unsigned
+src_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
+{
+ struct ir2_reg_component *comps;
+ unsigned swiz = 0;
+
+ switch (src->type) {
+ case IR2_SRC_SSA:
+ case IR2_SRC_REG:
+ break;
+ default:
+ return src->swizzle;
+ }
+ /* we need to take into account where the components were allocated */
+ comps = get_reg_src(ctx, src)->comp;
+ for (int i = 0; i < ncomp; i++) {
+ swiz |= swiz_set(comps[swiz_get(src->swizzle, i)].c, i);
+ }
+ return swiz;
+}
+
+/* ALU instructions need to take into account how the output components are allocated */
+
+/* scalar doesn't need to take into account dest swizzle */
+
+static unsigned
+alu_swizzle_scalar(struct ir2_context *ctx, struct ir2_src *reg)
+{
+ /* hardware seems to take from W, but swizzle everywhere just in case */
+ return swiz_merge(src_swizzle(ctx, reg, 1), IR2_SWIZZLE_XXXX);
+}
+
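+/* compose the source swizzle with the destination allocation: the dest
+ * components may live in arbitrary hw components, so each source
+ * component is routed to the hw component that the write mask actually
+ * writes.  reductions (DOT2ADD/DOT3/DOT4/CUBE) keep the plain source
+ * swizzle and the PRED_*_PUSH ops use the replicated scalar swizzle
+ */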
+static unsigned
+alu_swizzle(struct ir2_context *ctx, struct ir2_instr *instr, struct ir2_src *src)
+{
+ struct ir2_reg_component *comp = get_reg(instr)->comp;
+ unsigned swiz0 = src_swizzle(ctx, src, src_ncomp(instr));
+ unsigned swiz = 0;
+
+ /* non per component special cases */
+ switch (instr->alu.vector_opc) {
+ case PRED_SETE_PUSHv ... PRED_SETGTE_PUSHv:
+ return alu_swizzle_scalar(ctx, src);
+ case DOT2ADDv:
+ case DOT3v:
+ case DOT4v:
+ case CUBEv:
+ return swiz0;
+ default:
+ break;
+ }
+
+ for (int i = 0, j = 0; i < dst_ncomp(instr); j++) {
+ if (instr->alu.write_mask & 1 << j) {
+ if (comp[j].c != 7)
+ swiz |= swiz_set(i, comp[j].c);
+ i++;
+ }
+ }
+ return swiz_merge(swiz0, swiz);
+}
+
+static unsigned
+alu_swizzle_scalar2(struct ir2_context *ctx, struct ir2_src *src, unsigned s1)
+{
+ /* hardware seems to take from ZW, but swizzle everywhere (ABAB) */
+ unsigned s0 = swiz_get(src_swizzle(ctx, src, 1), 0);
+ return swiz_merge(swiz_set(s0, 0) | swiz_set(s1, 1), IR2_SWIZZLE_XYXY);
+}
+
+/* write_mask needs to be transformed by allocation information */
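+/* e.g. if a value's two components were allocated to the hardware z/w
+ * components (comp[0].c = 2, comp[1].c = 3), a logical write mask of
+ * 0b0011 becomes 0b1100 in the encoded instruction
+ */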
+
+static unsigned
+alu_write_mask(struct ir2_context *ctx, struct ir2_instr *instr)
+{
+ struct ir2_reg_component *comp = get_reg(instr)->comp;
+ unsigned write_mask = 0;
+
+ for (int i = 0; i < 4; i++) {
+ if (instr->alu.write_mask & 1 << i)
+ write_mask |= 1 << comp[i].c;
+ }
+
+ return write_mask;
+}
+
+/* fetch instructions can swizzle dest, but src swizzle needs conversion */
+
+static unsigned
+fetch_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
+{
+ unsigned alu_swiz = src_swizzle(ctx, src, ncomp);
+ unsigned swiz = 0;
+ for (int i = 0; i < ncomp; i++)
+ swiz |= swiz_get(alu_swiz, i) << i * 2;
+ return swiz;
+}
+
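+/* the fetch dst swizzle has a 3-bit field per destination component:
+ * values 0-3 pick a fetched component and 7 leaves the component
+ * unwritten.  e.g. a two-component result allocated to hw components
+ * x and z gives dst_swiz = 0xe78 (x gets result 0, z gets result 1,
+ * y/w are masked)
+ */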
+static unsigned
+fetch_dst_swiz(struct ir2_context *ctx, struct ir2_instr *instr)
+{
+ struct ir2_reg_component *comp = get_reg(instr)->comp;
+ unsigned dst_swiz = 0xfff;
+ for (int i = 0; i < dst_ncomp(instr); i++) {
+ dst_swiz &= ~(7 << comp[i].c * 3);
+ dst_swiz |= i << comp[i].c * 3;
+ }
+ return dst_swiz;
+}
+
+/* register / export # for instr */
+static unsigned
+dst_to_reg(struct ir2_context *ctx, struct ir2_instr *instr)
+{
+ if (is_export(instr))
+ return instr->alu.export;
+
+ return get_reg(instr)->idx;
+}
+
+/* register # for src */
+static unsigned src_to_reg(struct ir2_context *ctx, struct ir2_src *src)
+{
+ return get_reg_src(ctx, src)->idx;
+}
+
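+/* "reg byte" of a source: GPR sources pack the abs flag into bit 7 of
+ * the register number, while constants use the full 8-bit constant
+ * index (so there is no abs bit for const sources)
+ */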
+static unsigned src_reg_byte(struct ir2_context *ctx, struct ir2_src *src)
+{
+ if (src->type == IR2_SRC_CONST) {
+ assert(!src->abs); /* no abs bit for const */
+ return src->num;
+ }
+ return src_to_reg(ctx, src) | (src->abs ? 0x80 : 0);
+}
+
+/* produce the 12 byte binary instruction for a given sched_instr */
+static void
+fill_instr(struct ir2_context *ctx, struct ir2_sched_instr *sched,
+ instr_t *bc, bool * is_fetch)
+{
+ struct ir2_instr *instr = sched->instr, *instr_s, *instr_v;
+
+ *bc = (instr_t) {};
+
+ if (instr && instr->type == IR2_FETCH) {
+ *is_fetch = true;
+
+ bc->fetch.opc = instr->fetch.opc;
+ bc->fetch.pred_select = !!instr->pred;
+ bc->fetch.pred_condition = instr->pred & 1;
+
+ struct ir2_src *src = instr->src;
+
+ if (instr->fetch.opc == VTX_FETCH) {
+ instr_fetch_vtx_t *vtx = &bc->fetch.vtx;
+
+ assert(instr->fetch.vtx.const_idx <= 0x1f);
+ assert(instr->fetch.vtx.const_idx_sel <= 0x3);
+
+ vtx->src_reg = src_to_reg(ctx, src);
+ vtx->src_swiz = fetch_swizzle(ctx, src, 1);
+ vtx->dst_reg = dst_to_reg(ctx, instr);
+ vtx->dst_swiz = fetch_dst_swiz(ctx, instr);
+
+ vtx->must_be_one = 1;
+ vtx->const_index = instr->fetch.vtx.const_idx;
+ vtx->const_index_sel = instr->fetch.vtx.const_idx_sel;
+
+ /* other fields will be patched */
+
+ /* XXX seems like every FETCH but the first has
+ * this bit set:
+ */
+ vtx->reserved3 = instr->idx ? 0x1 : 0x0;
+ vtx->reserved0 = instr->idx ? 0x2 : 0x3;
+ } else if (instr->fetch.opc == TEX_FETCH) {
+ instr_fetch_tex_t *tex = &bc->fetch.tex;
+
+ tex->src_reg = src_to_reg(ctx, src);
+ tex->src_swiz = fetch_swizzle(ctx, src, 3);
+ tex->dst_reg = dst_to_reg(ctx, instr);
+ tex->dst_swiz = fetch_dst_swiz(ctx, instr);
+ /* tex->const_idx = patch_fetches */
+ tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
+ tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
+ tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
+ tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
+ tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
+ tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
+ tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
+ tex->use_comp_lod = ctx->so->type == MESA_SHADER_FRAGMENT;
+ tex->use_reg_lod = instr->src_count == 2;
+ tex->sample_location = SAMPLE_CENTER;
+ tex->tx_coord_denorm = instr->fetch.tex.is_rect;
+ } else if (instr->fetch.opc == TEX_SET_TEX_LOD) {
+ instr_fetch_tex_t *tex = &bc->fetch.tex;
+
+ tex->src_reg = src_to_reg(ctx, src);
+ tex->src_swiz = fetch_swizzle(ctx, src, 1);
+ tex->dst_reg = 0;
+ tex->dst_swiz = 0xfff;
+
+ tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
+ tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
+ tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
+ tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
+ tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
+ tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
+ tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
+ tex->use_comp_lod = 1;
+ tex->use_reg_lod = 0;
+ tex->sample_location = SAMPLE_CENTER;
+ } else {
+ assert(0);
+ }
+ return;
+ }
+
+ instr_v = sched->instr;
+ instr_s = sched->instr_s;
+
+ if (instr_v) {
+ struct ir2_src src1, src2, *src3;
+
+ src1 = instr_v->src[0];
+ src2 = instr_v->src[instr_v->src_count > 1];
+ src3 = instr_v->src_count == 3 ? &instr_v->src[2] : NULL;
+
+ bc->alu.vector_opc = instr_v->alu.vector_opc;
+ bc->alu.vector_write_mask = alu_write_mask(ctx, instr_v);
+ bc->alu.vector_dest = dst_to_reg(ctx, instr_v);
+ bc->alu.vector_clamp = instr_v->alu.saturate;
+ bc->alu.export_data = instr_v->alu.export >= 0;
+
+ /* single operand SETEv, use 0.0f as src2 */
+ if (instr_v->src_count == 1 &&
+ (bc->alu.vector_opc == SETEv ||
+ bc->alu.vector_opc == SETNEv ||
+ bc->alu.vector_opc == SETGTv ||
+ bc->alu.vector_opc == SETGTEv))
+ src2 = ir2_zero(ctx);
+
+ /* export32 instr for a20x hw binning has this bit set..
+ * it seems to do more than change the base address of constants
+ * XXX this is a hack
+ */
+ bc->alu.relative_addr =
+ (bc->alu.export_data && bc->alu.vector_dest == 32);
+
+ bc->alu.src1_reg_byte = src_reg_byte(ctx, &src1);
+ bc->alu.src1_swiz = alu_swizzle(ctx, instr_v, &src1);
+ bc->alu.src1_reg_negate = src1.negate;
+ bc->alu.src1_sel = src1.type != IR2_SRC_CONST;
+
+ bc->alu.src2_reg_byte = src_reg_byte(ctx, &src2);
+ bc->alu.src2_swiz = alu_swizzle(ctx, instr_v, &src2);
+ bc->alu.src2_reg_negate = src2.negate;
+ bc->alu.src2_sel = src2.type != IR2_SRC_CONST;
+
+ if (src3) {
+ bc->alu.src3_reg_byte = src_reg_byte(ctx, src3);
+ bc->alu.src3_swiz = alu_swizzle(ctx, instr_v, src3);
+ bc->alu.src3_reg_negate = src3->negate;
+ bc->alu.src3_sel = src3->type != IR2_SRC_CONST;
+ }
+
+ bc->alu.pred_select = instr_v->pred;
+ }
+
+ if (instr_s) {
+ struct ir2_src *src = instr_s->src;
+
+ bc->alu.scalar_opc = instr_s->alu.scalar_opc;
+ bc->alu.scalar_write_mask = alu_write_mask(ctx, instr_s);
+ bc->alu.scalar_dest = dst_to_reg(ctx, instr_s);
+ bc->alu.scalar_clamp = instr_s->alu.saturate;
+ bc->alu.export_data = instr_s->alu.export >= 0;
+
+ if (instr_s->src_count == 1) {
+ bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
+ bc->alu.src3_swiz = alu_swizzle_scalar(ctx, src);
+ bc->alu.src3_reg_negate = src->negate;
+ bc->alu.src3_sel = src->type != IR2_SRC_CONST;
+ } else {
+ assert(instr_s->src_count == 2);
+
+ bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
+ bc->alu.src3_swiz = alu_swizzle_scalar2(ctx, src, instr_s->alu.src1_swizzle);
+ bc->alu.src3_reg_negate = src->negate;
+			bc->alu.src3_sel = src->type != IR2_SRC_CONST;
+ }
+
+ if (instr_v)
+ assert(instr_s->pred == instr_v->pred);
+ bc->alu.pred_select = instr_s->pred;
+ }
+
+ *is_fetch = false;
+ return;
+}
+
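+/* flush the pending alloc CF (if any) followed by the pending exec CF.
+ * CF instructions are 48 bits, packed two per 3 dwords, hence the
+ * cf_idx / 2 * 3 dword offsets and the NOP padding to an even CF count
+ * in assemble()
+ */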
+static unsigned
+write_cfs(struct ir2_context *ctx, instr_cf_t * cfs, unsigned cf_idx,
+ instr_cf_alloc_t *alloc, instr_cf_exec_t *exec)
+{
+ assert(exec->count);
+
+ if (alloc)
+ cfs[cf_idx++].alloc = *alloc;
+
+	/* record the dword offset of the memory alloc CF for later patching */
+ if (alloc && alloc->buffer_select == SQ_MEMORY &&
+ ctx->info->mem_export_ptr == -1)
+ ctx->info->mem_export_ptr = cf_idx / 2 * 3;
+
+ cfs[cf_idx++].exec = *exec;
+ exec->address += exec->count;
+ exec->serialize = 0;
+ exec->count = 0;
+
+ return cf_idx;
+}
+
+/* assemble the final shader */
+void assemble(struct ir2_context *ctx, bool binning)
+{
+ /* hw seems to have a limit of 384 (num_cf/2+num_instr <= 384)
+ * address is 9 bits so could it be 512 ?
+ */
+ instr_cf_t cfs[384];
+ instr_t bytecode[384], bc;
+ unsigned block_addr[128];
+ unsigned num_cf = 0;
+
+ /* CF instr state */
+ instr_cf_exec_t exec = {.opc = EXEC};
+ instr_cf_alloc_t alloc = {.opc = ALLOC};
+
+ int sync_id, sync_id_prev = -1;
+ bool is_fetch = false;
+ bool need_sync = true;
+ bool need_alloc = false;
+ unsigned block_idx = 0;
+
+ ctx->info->mem_export_ptr = -1;
+ ctx->info->num_fetch_instrs = 0;
+
+	/* a vertex shader always needs to allocate at least one parameter;
+	 * if that will never happen (no FS inputs), emit the alloc CF here
+	 */
+ if (ctx->so->type == MESA_SHADER_VERTEX && ctx->f->inputs_count == 0) {
+ alloc.buffer_select = SQ_PARAMETER_PIXEL;
+ cfs[num_cf++].alloc = alloc;
+ }
+
+ block_addr[0] = 0;
+
+ for (int i = 0, j = 0; j < ctx->instr_sched_count; j++) {
+ struct ir2_instr *instr = ctx->instr_sched[j].instr;
+
+ /* catch IR2_CF since it isn't a regular instruction */
+ if (instr && instr->type == IR2_CF) {
+ assert(!need_alloc); /* XXX */
+
+ /* flush any exec cf before inserting jmp */
+ if (exec.count)
+ num_cf = write_cfs(ctx, cfs, num_cf, NULL, &exec);
+
+ cfs[num_cf++].jmp_call = (instr_cf_jmp_call_t) {
+ .opc = COND_JMP,
+ .address = instr->cf.block_idx, /* will be fixed later */
+ .force_call = !instr->pred,
+ .predicated_jmp = 1,
+ .direction = instr->cf.block_idx > instr->block_idx,
+ .condition = instr->pred & 1,
+ };
+ continue;
+ }
+
+ /* fill the 3 dwords for the instruction */
+ fill_instr(ctx, &ctx->instr_sched[j], &bc, &is_fetch);
+
+ /* we need to sync between ALU/VTX_FETCH/TEX_FETCH types */
+ sync_id = 0;
+ if (is_fetch)
+ sync_id = bc.fetch.opc == VTX_FETCH ? 1 : 2;
+
+ need_sync = sync_id != sync_id_prev;
+ sync_id_prev = sync_id;
+
+ unsigned block;
+ {
+
+ if (ctx->instr_sched[j].instr)
+ block = ctx->instr_sched[j].instr->block_idx;
+ else
+ block = ctx->instr_sched[j].instr_s->block_idx;
+
+ assert(block_idx <= block);
+ }
+
+ /* info for patching */
+ if (is_fetch) {
+ struct ir2_fetch_info *info =
+ &ctx->info->fetch_info[ctx->info->num_fetch_instrs++];
+ info->offset = i * 3; /* add cf offset later */
+
+ if (bc.fetch.opc == VTX_FETCH) {
+ info->vtx.dst_swiz = bc.fetch.vtx.dst_swiz;
+ } else if (bc.fetch.opc == TEX_FETCH) {
+ info->tex.samp_id = instr->fetch.tex.samp_id;
+ info->tex.src_swiz = bc.fetch.tex.src_swiz;
+ } else {
+ ctx->info->num_fetch_instrs--;
+ }
+ }
+
+ /* exec cf after 6 instr or when switching between fetch / alu */
+ if (exec.count == 6 || (exec.count && (need_sync || block != block_idx))) {
+ num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
+ need_alloc = false;
+ }
+
+ /* update block_addrs for jmp patching */
+ while (block_idx < block)
+ block_addr[++block_idx] = num_cf;
+
+ /* export - fill alloc cf */
+ if (!is_fetch && bc.alu.export_data) {
+ /* get the export buffer from either vector/scalar dest */
+ instr_alloc_type_t buffer =
+ export_buf(bc.alu.vector_dest);
+ if (bc.alu.scalar_write_mask) {
+ if (bc.alu.vector_write_mask)
+ assert(buffer == export_buf(bc.alu.scalar_dest));
+ buffer = export_buf(bc.alu.scalar_dest);
+ }
+
+ /* flush previous alloc if the buffer changes */
+ bool need_new_alloc = buffer != alloc.buffer_select;
+
+ /* memory export always in 32/33 pair, new alloc on 32 */
+ if (bc.alu.vector_dest == 32)
+ need_new_alloc = true;
+
+ if (need_new_alloc && exec.count) {
+ num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
+ need_alloc = false;
+ }
+
+ need_alloc |= need_new_alloc;
+
+ alloc.size = 0;
+ alloc.buffer_select = buffer;
+
+ if (buffer == SQ_PARAMETER_PIXEL && ctx->so->type == MESA_SHADER_VERTEX)
+ alloc.size = ctx->f->inputs_count - 1;
+
+ if (buffer == SQ_POSITION)
+ alloc.size = ctx->so->writes_psize;
+ }
+
+ if (is_fetch)
+ exec.serialize |= 0x1 << exec.count * 2;
+ if (need_sync)
+ exec.serialize |= 0x2 << exec.count * 2;
+
+ need_sync = false;
+ exec.count += 1;
+ bytecode[i++] = bc;
+ }
+
+ /* final exec cf */
+ exec.opc = EXEC_END;
+ num_cf =
+ write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
+
+ /* insert nop to get an even # of CFs */
+ if (num_cf % 2)
+ cfs[num_cf++] = (instr_cf_t) {
+ .opc = NOP};
+
+ /* patch cf addrs */
+ for (int idx = 0; idx < num_cf; idx++) {
+ switch (cfs[idx].opc) {
+ case NOP:
+ case ALLOC:
+ break;
+ case EXEC:
+ case EXEC_END:
+ cfs[idx].exec.address += num_cf / 2;
+ break;
+ case COND_JMP:
+ cfs[idx].jmp_call.address = block_addr[cfs[idx].jmp_call.address];
+ break;
+ default:
+ assert(0);
+ }
+ }
+
+ /* concatenate cfs and alu/fetch */
+ uint32_t cfdwords = num_cf / 2 * 3;
+ uint32_t alufetchdwords = exec.address * 3;
+ uint32_t sizedwords = cfdwords + alufetchdwords;
+ uint32_t *dwords = malloc(sizedwords * 4);
+ assert(dwords);
+ memcpy(dwords, cfs, cfdwords * 4);
+ memcpy(&dwords[cfdwords], bytecode, alufetchdwords * 4);
+
+ /* finalize ir2_shader_info */
+ ctx->info->dwords = dwords;
+ ctx->info->sizedwords = sizedwords;
+ for (int i = 0; i < ctx->info->num_fetch_instrs; i++)
+ ctx->info->fetch_info[i].offset += cfdwords;
+
+ if (fd_mesa_debug & FD_DBG_DISASM) {
+ DBG("disassemble: type=%d", ctx->so->type);
+ disasm_a2xx(dwords, sizedwords, 0, ctx->so->type);
+ }
+}
--- /dev/null
+/*
+ * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Jonathan Marek <jonathan@marek.ca>
+ */
+
+#include "ir2_private.h"
+#include "nir/tgsi_to_nir.h"
+
+#include "freedreno_util.h"
+#include "fd2_program.h"
+
+static const nir_shader_compiler_options options = {
+ .lower_fpow = true,
+ .lower_flrp32 = true,
+ .lower_fmod32 = true,
+ .lower_fdiv = true,
+ .lower_fceil = true,
+ .fuse_ffma = true,
+	/* .fdot_replicates = true, the hw does replicate, but enabling it makes things worse */
+ .lower_all_io_to_temps = true,
+	.vertex_id_zero_based = true, /* it's not implemented anyway */
+};
+
+struct nir_shader *
+ir2_tgsi_to_nir(const struct tgsi_token *tokens)
+{
+ return tgsi_to_nir(tokens, &options);
+}
+
+const nir_shader_compiler_options *
+ir2_get_compiler_options(void)
+{
+ return &options;
+}
+
+#define OPT(nir, pass, ...) ({ \
+ bool this_progress = false; \
+ NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
+ this_progress; \
+})
+#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
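+
+/* OPT runs a pass and returns whether it made progress;
+ * OPT_V runs a pass without tracking progress
+ */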
+
+static void
+ir2_optimize_loop(nir_shader *s)
+{
+ bool progress;
+ do {
+ progress = false;
+
+ OPT_V(s, nir_lower_vars_to_ssa);
+ progress |= OPT(s, nir_opt_copy_prop_vars);
+ progress |= OPT(s, nir_copy_prop);
+ progress |= OPT(s, nir_opt_dce);
+ progress |= OPT(s, nir_opt_cse);
+ /* progress |= OPT(s, nir_opt_gcm, true); */
+ progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
+ progress |= OPT(s, nir_opt_intrinsics);
+ progress |= OPT(s, nir_opt_algebraic);
+ progress |= OPT(s, nir_opt_constant_folding);
+ progress |= OPT(s, nir_opt_dead_cf);
+ if (OPT(s, nir_opt_trivial_continues)) {
+ progress |= true;
+ /* If nir_opt_trivial_continues makes progress, then we need to clean
+ * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
+ * to make progress.
+ */
+ OPT(s, nir_copy_prop);
+ OPT(s, nir_opt_dce);
+ }
+ progress |= OPT(s, nir_opt_loop_unroll, nir_var_all);
+ progress |= OPT(s, nir_opt_if);
+ progress |= OPT(s, nir_opt_remove_phis);
+ progress |= OPT(s, nir_opt_undef);
+
+ }
+ while (progress);
+}
+
+/* the trig workarounds are the same as for ir3, but we don't want to include ir3 */
+bool ir3_nir_apply_trig_workarounds(nir_shader * shader);
+
+int
+ir2_optimize_nir(nir_shader *s, bool lower)
+{
+ struct nir_lower_tex_options tex_options = {
+ .lower_txp = ~0u,
+ .lower_rect = 0,
+ };
+
+ if (fd_mesa_debug & FD_DBG_DISASM) {
+ debug_printf("----------------------\n");
+ nir_print_shader(s, stdout);
+ debug_printf("----------------------\n");
+ }
+
+ OPT_V(s, nir_opt_global_to_local);
+ OPT_V(s, nir_lower_regs_to_ssa);
+ OPT_V(s, nir_lower_vars_to_ssa);
+ OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out);
+
+ if (lower) {
+ OPT_V(s, ir3_nir_apply_trig_workarounds);
+ OPT_V(s, nir_lower_tex, &tex_options);
+ }
+
+ ir2_optimize_loop(s);
+
+ OPT_V(s, nir_remove_dead_variables, nir_var_function_temp);
+ OPT_V(s, nir_move_load_const);
+
+	/* TODO: we don't want shaders writing to depth for depth textures */
+ if (s->info.stage == MESA_SHADER_FRAGMENT) {
+ nir_foreach_variable(var, &s->outputs) {
+ if (var->data.location == FRAG_RESULT_DEPTH)
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static struct ir2_src
+load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp)
+{
+ struct fd2_shader_stateobj *so = ctx->so;
+ unsigned imm_ncomp, swiz, idx, i, j;
+ uint32_t *value = (uint32_t*) value_f;
+
+ /* try to merge with existing immediate (TODO: try with neg) */
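+	/* each immediate is a vec4 constant slot starting at so->first_immediate;
+	 * a slot can be reused when every requested component either matches one
+	 * of its values or can still be appended, with the swizzle selecting them
+	 */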
+ for (idx = 0; idx < so->num_immediates; idx++) {
+ swiz = 0;
+ imm_ncomp = so->immediates[idx].ncomp;
+ for (i = 0; i < ncomp; i++) {
+ for (j = 0; j < imm_ncomp; j++) {
+ if (value[i] == so->immediates[idx].val[j])
+ break;
+ }
+ if (j == imm_ncomp) {
+ if (j == 4)
+ break;
+ so->immediates[idx].val[imm_ncomp++] = value[i];
+ }
+ swiz |= swiz_set(j, i);
+ }
+ /* matched all components */
+ if (i == ncomp)
+ break;
+ }
+
+ /* need to allocate new immediate */
+ if (idx == so->num_immediates) {
+ swiz = 0;
+ imm_ncomp = 0;
+ for (i = 0; i < ncomp; i++) {
+ for (j = 0; j < imm_ncomp; j++) {
+ if (value[i] == ctx->so->immediates[idx].val[j])
+ break;
+ }
+ if (j == imm_ncomp) {
+ so->immediates[idx].val[imm_ncomp++] = value[i];
+ }
+ swiz |= swiz_set(j, i);
+ }
+ so->num_immediates++;
+ }
+ so->immediates[idx].ncomp = imm_ncomp;
+
+ if (ncomp == 1)
+ swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX);
+
+ return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST);
+}
+
+struct ir2_src
+ir2_zero(struct ir2_context *ctx)
+{
+ return load_const(ctx, (float[]) {0.0f}, 1);
+}
+
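+/* track how long a value stays live: a value defined outside a loop but used
+ * inside it must stay allocated until that loop ends (block_idx_free), while
+ * other values can be freed as soon as their ref_count drops to zero (-1)
+ */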
+static void
+update_range(struct ir2_context *ctx, struct ir2_reg *reg)
+{
+ if (!reg->initialized) {
+ reg->initialized = true;
+ reg->loop_depth = ctx->loop_depth;
+ }
+
+ if (ctx->loop_depth > reg->loop_depth) {
+ reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1];
+ } else {
+ reg->loop_depth = ctx->loop_depth;
+ reg->block_idx_free = -1;
+ }
+
+	/* registers used inside a loop are freed at the end of that loop in any case
+	 * XXX don't do this for ssa
+	 */
+ if (reg->loop_depth)
+ reg->block_idx_free = ctx->loop_last_block[reg->loop_depth];
+}
+
+static struct ir2_src
+make_src(struct ir2_context *ctx, nir_src src)
+{
+ struct ir2_src res = {};
+ struct ir2_reg *reg;
+
+ nir_const_value *const_value = nir_src_as_const_value(src);
+
+ if (const_value) {
+ assert(src.is_ssa);
+ return load_const(ctx, &const_value->f32[0], src.ssa->num_components);
+ }
+
+ if (!src.is_ssa) {
+ res.num = src.reg.reg->index;
+ res.type = IR2_SRC_REG;
+ reg = &ctx->reg[res.num];
+ } else {
+ assert(ctx->ssa_map[src.ssa->index] >= 0);
+ res.num = ctx->ssa_map[src.ssa->index];
+ res.type = IR2_SRC_SSA;
+ reg = &ctx->instr[res.num].ssa;
+ }
+
+ update_range(ctx, reg);
+ return res;
+}
+
+static void
+set_index(struct ir2_context *ctx, nir_dest * dst,
+ struct ir2_instr *instr)
+{
+ struct ir2_reg *reg = &instr->ssa;
+
+ if (dst->is_ssa) {
+ ctx->ssa_map[dst->ssa.index] = instr->idx;
+ } else {
+ assert(instr->is_ssa);
+ reg = &ctx->reg[dst->reg.reg->index];
+
+ instr->is_ssa = false;
+ instr->reg = reg;
+ }
+ update_range(ctx, reg);
+}
+
+static struct ir2_instr *
+ir2_instr_create(struct ir2_context *ctx, int type)
+{
+ struct ir2_instr *instr;
+
+ instr = &ctx->instr[ctx->instr_count++];
+ instr->idx = ctx->instr_count - 1;
+ instr->type = type;
+ instr->block_idx = ctx->block_idx;
+ instr->pred = ctx->pred;
+ instr->is_ssa = true;
+ return instr;
+}
+
+static struct ir2_instr *
+instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp)
+{
+ /* emit_alu will fixup instrs that don't map directly */
+ static const struct ir2_opc {
+ int8_t scalar, vector;
+ } nir_ir2_opc[nir_num_opcodes+1] = {
+ [0 ... nir_num_opcodes - 1] = {-1, -1},
+
+ [nir_op_fmov] = {MAXs, MAXv},
+ [nir_op_fsign] = {-1, CNDGTEv},
+ [nir_op_fnot] = {SETEs, SETEv},
+ [nir_op_f2b32] = {SETNEs, SETNEv},
+ [nir_op_for] = {MAXs, MAXv},
+ [nir_op_fand] = {MINs, MINv},
+ [nir_op_fxor] = {-1, SETNEv},
+ [nir_op_fadd] = {ADDs, ADDv},
+ [nir_op_fsub] = {ADDs, ADDv},
+ [nir_op_fmul] = {MULs, MULv},
+ [nir_op_ffma] = {-1, MULADDv},
+ [nir_op_fmax] = {MAXs, MAXv},
+ [nir_op_fmin] = {MINs, MINv},
+ [nir_op_ffloor] = {FLOORs, FLOORv},
+ [nir_op_ffract] = {FRACs, FRACv},
+ [nir_op_ftrunc] = {TRUNCs, TRUNCv},
+ [nir_op_fdot2] = {-1, DOT2ADDv},
+ [nir_op_fdot3] = {-1, DOT3v},
+ [nir_op_fdot4] = {-1, DOT4v},
+ [nir_op_sge] = {-1, SETGTEv},
+ [nir_op_slt] = {-1, SETGTv},
+ [nir_op_sne] = {-1, SETNEv},
+ [nir_op_seq] = {-1, SETEv},
+ [nir_op_fcsel] = {-1, CNDEv},
+ [nir_op_frsq] = {RECIPSQ_IEEE, -1},
+ [nir_op_frcp] = {RECIP_IEEE, -1},
+ [nir_op_flog2] = {LOG_IEEE, -1},
+ [nir_op_fexp2] = {EXP_IEEE, -1},
+ [nir_op_fsqrt] = {SQRT_IEEE, -1},
+ [nir_op_fcos] = {COS, -1},
+ [nir_op_fsin] = {SIN, -1},
+ /* no fsat, fneg, fabs since source mods deal with those */
+
+ /* some nir passes still generate nir_op_imov */
+ [nir_op_imov] = {MAXs, MAXv},
+
+ /* so we can use this function with non-nir op */
+#define ir2_op_cube nir_num_opcodes
+ [ir2_op_cube] = {-1, CUBEv},
+ };
+
+ struct ir2_opc op = nir_ir2_opc[opcode];
+ assert(op.vector >= 0 || op.scalar >= 0);
+
+ struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU);
+ instr->alu.vector_opc = op.vector;
+ instr->alu.scalar_opc = op.scalar;
+ instr->alu.export = -1;
+ instr->alu.write_mask = (1 << ncomp) - 1;
+ instr->src_count = opcode == ir2_op_cube ? 2 :
+ nir_op_infos[opcode].num_inputs;
+ instr->ssa.ncomp = ncomp;
+ return instr;
+}
+
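+/* variant of instr_create_alu that writes to a non-ssa register, optionally
+ * sharing the register of a previous instruction; used by multi-instruction
+ * expansions (fragcoord, cube texturing) that build up a value piecewise
+ */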
+static struct ir2_instr *
+instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode,
+ uint8_t write_mask, struct ir2_instr *share_reg)
+{
+ struct ir2_instr *instr;
+ struct ir2_reg *reg;
+ unsigned ncomp, max_comp;
+
+ reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++];
+ reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1);
+
+ instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask));
+ instr->alu.write_mask = write_mask;
+ instr->reg = reg;
+ instr->is_ssa = false;
+ return instr;
+}
+
+
+static struct ir2_instr *
+instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst)
+{
+ struct ir2_instr *instr;
+ instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst));
+ set_index(ctx, dst, instr);
+ return instr;
+}
+
+static struct ir2_instr *
+ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst,
+ instr_fetch_opc_t opc)
+{
+ struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH);
+ instr->fetch.opc = opc;
+ instr->src_count = 1;
+ instr->ssa.ncomp = nir_dest_num_components(*dst);
+ set_index(ctx, dst, instr);
+ return instr;
+}
+
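+/* like make_src, but inserts a mov when the value is a constant, for
+ * fetch/cube sources which need a register rather than a constant
+ */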
+static struct ir2_src
+make_src_noconst(struct ir2_context *ctx, nir_src src)
+{
+ struct ir2_instr *instr;
+
+ if (nir_src_as_const_value(src)) {
+ assert(src.is_ssa);
+ instr = instr_create_alu(ctx, nir_op_fmov, src.ssa->num_components);
+ instr->src[0] = make_src(ctx, src);
+ return ir2_src(instr->idx, 0, IR2_SRC_SSA);
+ }
+
+ return make_src(ctx, src);
+}
+
+static void
+emit_alu(struct ir2_context *ctx, nir_alu_instr * alu)
+{
+ const nir_op_info *info = &nir_op_infos[alu->op];
+ nir_dest *dst = &alu->dest.dest;
+ struct ir2_instr *instr;
+ struct ir2_src tmp;
+ unsigned ncomp;
+
+ /* get the number of dst components */
+ if (dst->is_ssa) {
+ ncomp = dst->ssa.num_components;
+ } else {
+ ncomp = 0;
+ for (int i = 0; i < 4; i++)
+ ncomp += !!(alu->dest.write_mask & 1 << i);
+ }
+
+ instr = instr_create_alu(ctx, alu->op, ncomp);
+ set_index(ctx, dst, instr);
+ instr->alu.saturate = alu->dest.saturate;
+ instr->alu.write_mask = alu->dest.write_mask;
+
+ for (int i = 0; i < info->num_inputs; i++) {
+ nir_alu_src *src = &alu->src[i];
+
+ /* compress swizzle with writemask when applicable */
+ unsigned swiz = 0, j = 0;
+ for (int i = 0; i < 4; i++) {
+ if (!(alu->dest.write_mask & 1 << i) && !info->output_size)
+ continue;
+ swiz |= swiz_set(src->swizzle[i], j++);
+ }
+
+ instr->src[i] = make_src(ctx, src->src);
+ instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz);
+ instr->src[i].negate = src->negate;
+ instr->src[i].abs = src->abs;
+ }
+
+ /* workarounds for NIR ops that don't map directly to a2xx ops */
+ switch (alu->op) {
+ case nir_op_slt:
+ tmp = instr->src[0];
+ instr->src[0] = instr->src[1];
+ instr->src[1] = tmp;
+ break;
+ case nir_op_fcsel:
+ case nir_op_bcsel:
+ tmp = instr->src[1];
+ instr->src[1] = instr->src[2];
+ instr->src[2] = tmp;
+ break;
+ case nir_op_fsub:
+ instr->src[1].negate = !instr->src[1].negate;
+ break;
+ case nir_op_fdot2:
+ instr->src_count = 3;
+ instr->src[2] = ir2_zero(ctx);
+ break;
+ case nir_op_fsign: {
+ /* we need an extra instruction to deal with the zero case */
+ struct ir2_instr *tmp;
+
+ /* tmp = x == 0 ? 0 : 1 */
+ tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp);
+ tmp->src[0] = instr->src[0];
+ tmp->src[1] = ir2_zero(ctx);
+ tmp->src[2] = load_const(ctx, (float[]) {1.0f}, 1);
+
+ /* result = x >= 0 ? tmp : -tmp */
+ instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
+ instr->src[2] = instr->src[1];
+ instr->src[2].negate = true;
+ instr->src_count = 3;
+ } break;
+ default:
+ break;
+ }
+}
+
+static void
+load_input(struct ir2_context *ctx, nir_dest *dst, unsigned idx)
+{
+ struct ir2_instr *instr;
+ int slot = -1;
+
+ if (ctx->so->type == MESA_SHADER_VERTEX) {
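+		/* vertex attributes come from a vtx fetch; the fetch constants are
+		 * packed three per slot starting at index 20 (const_idx, const_idx_sel)
+		 */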
+ instr = ir2_instr_create_fetch(ctx, dst, 0);
+ instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT);
+ instr->fetch.vtx.const_idx = 20 + (idx / 3);
+ instr->fetch.vtx.const_idx_sel = idx % 3;
+ return;
+ }
+
+ /* get slot from idx */
+ nir_foreach_variable(var, &ctx->nir->inputs) {
+ if (var->data.driver_location == idx) {
+ slot = var->data.location;
+ break;
+ }
+ }
+ assert(slot >= 0);
+
+ switch (slot) {
+ case VARYING_SLOT_PNTC:
+ /* need to extract with abs and invert y */
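+		/* i.e. pointcoord = |param.zw| * (1, -1) + (0, 1): x = |z|, y = 1 - |w| */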
+ instr = instr_create_alu_dest(ctx, nir_op_ffma, dst);
+ instr->src[0] = ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT);
+ instr->src[0].abs = true;
+ instr->src[1] = load_const(ctx, (float[]) {1.0f, -1.0f}, 2);
+ instr->src[2] = load_const(ctx, (float[]) {0.0f, 1.0f}, 2);
+ break;
+ case VARYING_SLOT_POS:
+		/* xy is extracted with abs (plus the tile offset on a20x),
+		 * zw comes from the fragcoord input (w is inverted in the fragment shader)
+		 * TODO: only emit the components required by the fragment shader
+		 */
+ instr = instr_create_alu_reg(ctx,
+ ctx->so->is_a20x ? nir_op_fadd : nir_op_fmov, 3, NULL);
+ instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
+ instr->src[0].abs = true;
+ /* on a20x, C64 contains the tile offset */
+ instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST);
+
+ instr = instr_create_alu_reg(ctx, nir_op_fmov, 4, instr);
+ instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT);
+
+ instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr);
+ instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT);
+
+ unsigned reg_idx = instr->reg - ctx->reg; /* XXX */
+ instr = instr_create_alu_dest(ctx, nir_op_fmov, dst);
+ instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
+ break;
+ default:
+ instr = instr_create_alu_dest(ctx, nir_op_fmov, dst);
+ instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT);
+ break;
+ }
+}
+
+static unsigned
+output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)
+{
+ int slot = -1;
+ unsigned idx = nir_intrinsic_base(intr);
+ nir_foreach_variable(var, &ctx->nir->outputs) {
+ if (var->data.driver_location == idx) {
+ slot = var->data.location;
+ break;
+ }
+ }
+ assert(slot != -1);
+ return slot;
+}
+
+static void
+store_output(struct ir2_context *ctx, nir_src src, unsigned slot, unsigned ncomp)
+{
+ struct ir2_instr *instr;
+ unsigned idx = 0;
+
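+	/* vertex exports 62/63 are the position/point size buffers (SQ_POSITION);
+	 * other vertex outputs export to the slot of the matching fragment input
+	 */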
+ if (ctx->so->type == MESA_SHADER_VERTEX) {
+ switch (slot) {
+ case VARYING_SLOT_POS:
+ ctx->position = make_src(ctx, src);
+ idx = 62;
+ break;
+ case VARYING_SLOT_PSIZ:
+ ctx->so->writes_psize = true;
+ idx = 63;
+ break;
+ default:
+ /* find matching slot from fragment shader input */
+ for (idx = 0; idx < ctx->f->inputs_count; idx++)
+ if (ctx->f->inputs[idx].slot == slot)
+ break;
+ if (idx == ctx->f->inputs_count)
+ return;
+ }
+ } else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) {
+ /* only color output is implemented */
+ return;
+ }
+
+ instr = instr_create_alu(ctx, nir_op_fmov, ncomp);
+ instr->src[0] = make_src(ctx, src);
+ instr->alu.export = idx;
+}
+
+static void
+emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr)
+{
+ struct ir2_instr *instr;
+ nir_const_value *const_offset;
+ nir_deref_instr *deref;
+ unsigned idx;
+
+ switch (intr->intrinsic) {
+ case nir_intrinsic_load_input:
+ load_input(ctx, &intr->dest, nir_intrinsic_base(intr));
+ break;
+ case nir_intrinsic_store_output:
+ store_output(ctx, intr->src[0], output_slot(ctx, intr), intr->num_components);
+ break;
+ case nir_intrinsic_load_deref:
+ deref = nir_src_as_deref(intr->src[0]);
+ assert(deref->deref_type == nir_deref_type_var);
+ load_input(ctx, &intr->dest, deref->var->data.driver_location);
+ break;
+ case nir_intrinsic_store_deref:
+ deref = nir_src_as_deref(intr->src[0]);
+ assert(deref->deref_type == nir_deref_type_var);
+ store_output(ctx, intr->src[1], deref->var->data.location, intr->num_components);
+ break;
+ case nir_intrinsic_load_uniform:
+ const_offset = nir_src_as_const_value(intr->src[0]);
+ assert(const_offset); /* TODO can be false in ES2? */
+ idx = nir_intrinsic_base(intr);
+ idx += (uint32_t) nir_src_as_const_value(intr->src[0])->f32[0];
+ instr = instr_create_alu_dest(ctx, nir_op_fmov, &intr->dest);
+ instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST);
+ break;
+ case nir_intrinsic_discard:
+ case nir_intrinsic_discard_if:
+ instr = ir2_instr_create(ctx, IR2_ALU);
+ instr->alu.vector_opc = VECTOR_NONE;
+ if (intr->intrinsic == nir_intrinsic_discard_if) {
+ instr->alu.scalar_opc = KILLNEs;
+ instr->src[0] = make_src(ctx, intr->src[0]);
+ } else {
+ instr->alu.scalar_opc = KILLEs;
+ instr->src[0] = ir2_zero(ctx);
+ }
+ instr->alu.export = -1;
+ instr->src_count = 1;
+ break;
+ case nir_intrinsic_load_front_face:
+ /* gl_FrontFacing is in the sign of param.x
+ * rcp required because otherwise we can't differentiate -0.0 and +0.0
+ */
+ ctx->so->need_param = true;
+
+ struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1);
+ tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
+
+ instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest);
+ instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
+ instr->src[1] = ir2_zero(ctx);
+ break;
+ default:
+ compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic);
+ break;
+ }
+}
+
+static void
+emit_tex(struct ir2_context *ctx, nir_tex_instr * tex)
+{
+ bool is_rect = false, is_cube = false;
+ struct ir2_instr *instr;
+ nir_src *coord, *lod_bias;
+
+ coord = lod_bias = NULL;
+
+ for (unsigned i = 0; i < tex->num_srcs; i++) {
+ switch (tex->src[i].src_type) {
+ case nir_tex_src_coord:
+ coord = &tex->src[i].src;
+ break;
+ case nir_tex_src_bias:
+ case nir_tex_src_lod:
+ assert(!lod_bias);
+ lod_bias = &tex->src[i].src;
+ break;
+ default:
+ compile_error(ctx, "Unhandled NIR tex src type: %d\n",
+ tex->src[i].src_type);
+ return;
+ }
+ }
+
+ switch (tex->op) {
+ case nir_texop_tex:
+ case nir_texop_txb:
+ case nir_texop_txl:
+ break;
+ default:
+ compile_error(ctx, "unimplemented texop %d\n", tex->op);
+ return;
+ }
+
+ switch (tex->sampler_dim) {
+ case GLSL_SAMPLER_DIM_2D:
+ break;
+ case GLSL_SAMPLER_DIM_RECT:
+ is_rect = true;
+ break;
+ case GLSL_SAMPLER_DIM_CUBE:
+ is_cube = true;
+ break;
+ default:
+ compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim);
+ return;
+ }
+
+ struct ir2_src src_coord = make_src_noconst(ctx, *coord);
+
+ /* for cube maps
+ * tmp = cube(coord)
+ * tmp.xy = tmp.xy / |tmp.z| + 1.5
+ * coord = tmp.xyw
+ */
+ if (is_cube) {
+ struct ir2_instr *rcp, *coord_xy;
+ unsigned reg_idx;
+
+ instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL);
+ instr->src[0] = src_coord;
+ instr->src[0].swizzle = IR2_SWIZZLE_ZZXY;
+ instr->src[1] = src_coord;
+ instr->src[1].swizzle = IR2_SWIZZLE_YXZZ;
+
+ reg_idx = instr->reg - ctx->reg; /* hacky */
+
+ rcp = instr_create_alu(ctx, nir_op_frcp, 1);
+ rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG);
+ rcp->src[0].abs = true;
+
+ coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr);
+ coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
+ coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
+ coord_xy->src[2] = load_const(ctx, (float[]) {1.5f}, 1);
+
+ src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG);
+ /* TODO: lod/bias transformed by src_coord.z ? */
+ }
+
+ instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH);
+ instr->src[0] = src_coord;
+ instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_XYW : 0;
+ instr->fetch.tex.is_cube = is_cube;
+ instr->fetch.tex.is_rect = is_rect;
+ instr->fetch.tex.samp_id = tex->sampler_index;
+
+ /* for lod/bias, we insert an extra src for the backend to deal with */
+ if (lod_bias) {
+ instr->src[1] = make_src_noconst(ctx, *lod_bias);
+ /* backend will use 2-3 components so apply swizzle */
+ swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX);
+ instr->src_count = 2;
+ }
+}
+
+static void
+setup_input(struct ir2_context *ctx, nir_variable * in)
+{
+ struct fd2_shader_stateobj *so = ctx->so;
+ unsigned array_len = MAX2(glsl_get_length(in->type), 1);
+ unsigned n = in->data.driver_location;
+ unsigned slot = in->data.location;
+
+ assert(array_len == 1);
+
+ /* handle later */
+ if (ctx->so->type == MESA_SHADER_VERTEX)
+ return;
+
+ if (ctx->so->type != MESA_SHADER_FRAGMENT)
+ compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
+
+ if (slot == VARYING_SLOT_PNTC) {
+ so->need_param = true;
+ return;
+ }
+
+ n = ctx->f->inputs_count++;
+
+ /* half of fragcoord from param reg, half from a varying */
+ if (slot == VARYING_SLOT_POS) {
+ ctx->f->fragcoord = n;
+ so->need_param = true;
+ }
+
+ ctx->f->inputs[n].slot = slot;
+ ctx->f->inputs[n].ncomp = glsl_get_components(in->type);
+
+	/* in->data.interpolation?
+	 * OpenGL ES 2.0 can't do flat interpolation, but we still get it from GALLIUM_HUD
+	 */
+}
+
+static void
+emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr * undef)
+{
+ /* TODO we don't want to emit anything for undefs */
+
+ struct ir2_instr *instr;
+
+ instr = instr_create_alu_dest(ctx, nir_op_fmov,
+ &(nir_dest) {.ssa = undef->def,.is_ssa = true});
+ instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST);
+}
+
+static void
+emit_instr(struct ir2_context *ctx, nir_instr * instr)
+{
+ switch (instr->type) {
+ case nir_instr_type_alu:
+ emit_alu(ctx, nir_instr_as_alu(instr));
+ break;
+ case nir_instr_type_deref:
+ /* ignored, handled as part of the intrinsic they are src to */
+ break;
+ case nir_instr_type_intrinsic:
+ emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
+ break;
+ case nir_instr_type_load_const:
+ /* dealt with when using nir_src */
+ break;
+ case nir_instr_type_tex:
+ emit_tex(ctx, nir_instr_as_tex(instr));
+ break;
+ case nir_instr_type_jump:
+ ctx->block_has_jump[ctx->block_idx] = true;
+ break;
+ case nir_instr_type_ssa_undef:
+ emit_undef(ctx, nir_instr_as_ssa_undef(instr));
+ break;
+ default:
+ break;
+ }
+}
+
+/* fragcoord.zw and a20x hw binning outputs */
+static void
+extra_position_exports(struct ir2_context *ctx, bool binning)
+{
+ struct ir2_instr *instr, *rcp, *sc, *wincoord, *off;
+
+ if (ctx->f->fragcoord < 0 && !binning)
+ return;
+
+ instr = instr_create_alu(ctx, nir_op_fmax, 1);
+ instr->src[0] = ctx->position;
+ instr->src[0].swizzle = IR2_SWIZZLE_W;
+ instr->src[1] = ir2_zero(ctx);
+
+ rcp = instr_create_alu(ctx, nir_op_frcp, 1);
+ rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA);
+
+ sc = instr_create_alu(ctx, nir_op_fmul, 4);
+ sc->src[0] = ctx->position;
+ sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
+
+ wincoord = instr_create_alu(ctx, nir_op_ffma, 4);
+ wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST);
+ wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA);
+ wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST);
+
+ /* fragcoord z/w */
+ if (ctx->f->fragcoord >= 0 && !binning) {
+ instr = instr_create_alu(ctx, nir_op_fmov, 1);
+ instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA);
+ instr->alu.export = ctx->f->fragcoord;
+
+ instr = instr_create_alu(ctx, nir_op_fmov, 1);
+ instr->src[0] = ctx->position;
+ instr->src[0].swizzle = IR2_SWIZZLE_W;
+ instr->alu.export = ctx->f->fragcoord;
+ instr->alu.write_mask = 2;
+ }
+
+ if (!binning)
+ return;
+
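+	/* the remaining exports are the a20x hw binning outputs: exports 32/33
+	 * go to the memory buffer (see export_buf in ir2_private.h)
+	 */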
+ off = instr_create_alu(ctx, nir_op_fadd, 1);
+ off->src[0] = ir2_src(64, 0, IR2_SRC_CONST);
+ off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT);
+
+	/* the maximum of 8 is set in freedreno_screen; unneeded instrs are patched out */
+ for (int i = 0; i < 8; i++) {
+ instr = instr_create_alu(ctx, nir_op_ffma, 4);
+ instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST);
+ instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
+ instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST);
+ instr->alu.export = 32;
+
+ instr = instr_create_alu(ctx, nir_op_ffma, 4);
+ instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST);
+ instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA);
+ instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST);
+ instr->alu.export = 33;
+ }
+}
+
+static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list);
+
+static bool
+emit_block(struct ir2_context *ctx, nir_block * block)
+{
+ struct ir2_instr *instr;
+ nir_block *succs = block->successors[0];
+
+ ctx->block_idx = block->index;
+
+ nir_foreach_instr(instr, block)
+ emit_instr(ctx, instr);
+
+ if (!succs || !succs->index)
+ return false;
+
+	/* ideally we would always jump and let the backend clean up,
+	 * but we don't, so there are only two cases where a jump is needed:
+	 *  loops (successor index is lower than the current block)
+	 *  jumps (a jump instruction was seen in the block)
+	 */
+ if (succs->index > block->index && !ctx->block_has_jump[block->index])
+ return false;
+
+ assert(block->successors[1] == NULL);
+
+ instr = ir2_instr_create(ctx, IR2_CF);
+ instr->cf.block_idx = succs->index;
+ /* XXX can't jump to a block with different predicate */
+ return true;
+}
+
+static void
+emit_if(struct ir2_context *ctx, nir_if * nif)
+{
+ unsigned pred = ctx->pred, pred_idx = ctx->pred_idx;
+ struct ir2_instr *instr;
+
+ /* XXX: blob seems to always use same register for condition */
+
+ instr = ir2_instr_create(ctx, IR2_ALU);
+ instr->src[0] = make_src(ctx, nif->condition);
+ instr->src_count = 1;
+ instr->ssa.ncomp = 1;
+ instr->alu.vector_opc = VECTOR_NONE;
+ instr->alu.scalar_opc = SCALAR_NONE;
+ instr->alu.export = -1;
+ instr->alu.write_mask = 1;
+ instr->pred = 0;
+
+ /* if nested, use PRED_SETNE_PUSHv */
+ if (pred) {
+ instr->alu.vector_opc = PRED_SETNE_PUSHv;
+ instr->src[1] = instr->src[0];
+ instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA);
+ instr->src[0].swizzle = IR2_SWIZZLE_XXXX;
+ instr->src[1].swizzle = IR2_SWIZZLE_XXXX;
+ instr->src_count = 2;
+ } else {
+ instr->alu.scalar_opc = PRED_SETNEs;
+ }
+
+ ctx->pred_idx = instr->idx;
+ ctx->pred = 3;
+
+ emit_cf_list(ctx, &nif->then_list);
+
+	/* TODO: if there is no else branch we don't need this,
+	 * and if the else branch is simple, we can just flip ctx->pred instead
+	 */
+ instr = ir2_instr_create(ctx, IR2_ALU);
+ instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
+ instr->src_count = 1;
+ instr->ssa.ncomp = 1;
+ instr->alu.vector_opc = VECTOR_NONE;
+ instr->alu.scalar_opc = PRED_SET_INVs;
+ instr->alu.export = -1;
+ instr->alu.write_mask = 1;
+ instr->pred = 0;
+ ctx->pred_idx = instr->idx;
+
+ emit_cf_list(ctx, &nif->else_list);
+
+ /* restore predicate for nested predicates */
+ if (pred) {
+ instr = ir2_instr_create(ctx, IR2_ALU);
+ instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
+ instr->src_count = 1;
+ instr->ssa.ncomp = 1;
+ instr->alu.vector_opc = VECTOR_NONE;
+ instr->alu.scalar_opc = PRED_SET_POPs;
+ instr->alu.export = -1;
+ instr->alu.write_mask = 1;
+ instr->pred = 0;
+ ctx->pred_idx = instr->idx;
+ }
+
+ /* restore ctx->pred */
+ ctx->pred = pred;
+}
+
+/* get the highest block idx in the loop, so we know when
+ * we can free registers that are allocated outside the loop
+ */
+static unsigned
+loop_last_block(struct exec_list *list)
+{
+ nir_cf_node *node =
+ exec_node_data(nir_cf_node, exec_list_get_tail(list), node);
+ switch (node->type) {
+ case nir_cf_node_block:
+ return nir_cf_node_as_block(node)->index;
+ case nir_cf_node_if:
+ assert(0); /* XXX could this ever happen? */
+ return 0;
+ case nir_cf_node_loop:
+ return loop_last_block(&nir_cf_node_as_loop(node)->body);
+ default:
+ compile_error(ctx, "Not supported\n");
+ return 0;
+ }
+}
+
+static void
+emit_loop(struct ir2_context *ctx, nir_loop *nloop)
+{
+ ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body);
+ emit_cf_list(ctx, &nloop->body);
+ ctx->loop_depth--;
+}
+
+static bool
+emit_cf_list(struct ir2_context *ctx, struct exec_list *list)
+{
+ bool ret = false;
+ foreach_list_typed(nir_cf_node, node, node, list) {
+ ret = false;
+ switch (node->type) {
+ case nir_cf_node_block:
+ ret = emit_block(ctx, nir_cf_node_as_block(node));
+ break;
+ case nir_cf_node_if:
+ emit_if(ctx, nir_cf_node_as_if(node));
+ break;
+ case nir_cf_node_loop:
+ emit_loop(ctx, nir_cf_node_as_loop(node));
+ break;
+ case nir_cf_node_function:
+ compile_error(ctx, "Not supported\n");
+ break;
+ }
+ }
+ return ret;
+}
+
+static void cleanup_binning(struct ir2_context *ctx)
+{
+ assert(ctx->so->type == MESA_SHADER_VERTEX);
+
+ /* kill non-position outputs for binning variant */
+ nir_foreach_block(block, nir_shader_get_entrypoint(ctx->nir)) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+ unsigned slot;
+ switch (intr->intrinsic) {
+ case nir_intrinsic_store_deref: {
+ nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
+ assert(deref->deref_type == nir_deref_type_var);
+ slot = deref->var->data.location;
+ } break;
+ case nir_intrinsic_store_output:
+ slot = output_slot(ctx, intr);
+ break;
+ default:
+ continue;
+ }
+
+ if (slot != VARYING_SLOT_POS)
+ nir_instr_remove(instr);
+ }
+ }
+
+ ir2_optimize_nir(ctx->nir, false);
+}
+
+void
+ir2_nir_compile(struct ir2_context *ctx, bool binning)
+{
+ struct fd2_shader_stateobj *so = ctx->so;
+
+ memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map));
+
+ ctx->nir = nir_shader_clone(NULL, so->nir);
+
+ if (binning)
+ cleanup_binning(ctx);
+
+ /* postprocess */
+ OPT_V(ctx->nir, nir_opt_algebraic_late);
+
+ OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods);
+ OPT_V(ctx->nir, nir_copy_prop);
+ OPT_V(ctx->nir, nir_opt_dce);
+ OPT_V(ctx->nir, nir_opt_move_comparisons);
+
+ OPT_V(ctx->nir, nir_lower_bool_to_float);
+
+ OPT_V(ctx->nir, nir_lower_locals_to_regs);
+
+ OPT_V(ctx->nir, nir_convert_from_ssa, true);
+
+ OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest);
+ OPT_V(ctx->nir, nir_lower_vec_to_movs);
+
+ OPT_V(ctx->nir, nir_opt_dce);
+
+ nir_sweep(ctx->nir);
+
+ if (fd_mesa_debug & FD_DBG_DISASM) {
+ debug_printf("----------------------\n");
+ nir_print_shader(ctx->nir, stdout);
+ debug_printf("----------------------\n");
+ }
+
+ /* fd2_shader_stateobj init */
+ if (so->type == MESA_SHADER_FRAGMENT) {
+ ctx->f->fragcoord = -1;
+ ctx->f->inputs_count = 0;
+ memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs));
+ }
+
+ /* Setup inputs: */
+ nir_foreach_variable(in, &ctx->nir->inputs)
+ setup_input(ctx, in);
+
+ if (so->type == MESA_SHADER_FRAGMENT) {
+ unsigned idx;
+ for (idx = 0; idx < ctx->f->inputs_count; idx++) {
+ ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp;
+ update_range(ctx, &ctx->input[idx]);
+ }
+ /* assume we have param input and kill it later if not */
+ ctx->input[idx].ncomp = 4;
+ update_range(ctx, &ctx->input[idx]);
+ } else {
+ ctx->input[0].ncomp = 1;
+ ctx->input[2].ncomp = 1;
+ update_range(ctx, &ctx->input[0]);
+ update_range(ctx, &ctx->input[2]);
+ }
+
+ /* And emit the body: */
+ nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir);
+
+ nir_foreach_register(reg, &fxn->registers) {
+ ctx->reg[reg->index].ncomp = reg->num_components;
+ ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1);
+ }
+
+ nir_metadata_require(fxn, nir_metadata_block_index);
+ emit_cf_list(ctx, &fxn->body);
+ /* TODO emit_block(ctx, fxn->end_block); */
+
+ if (so->type == MESA_SHADER_VERTEX)
+ extra_position_exports(ctx, binning);
+
+ ralloc_free(ctx->nir);
+
+ /* kill unused param input */
+ if (so->type == MESA_SHADER_FRAGMENT && !so->need_param)
+ ctx->input[ctx->f->inputs_count].initialized = false;
+}
--- /dev/null
+/*
+ * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Jonathan Marek <jonathan@marek.ca>
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ir2.h"
+#include "fd2_program.h"
+#include "instr-a2xx.h"
+
+enum ir2_src_type {
+ IR2_SRC_SSA,
+ IR2_SRC_REG,
+ IR2_SRC_INPUT,
+ IR2_SRC_CONST,
+};
+
+struct ir2_src {
+ /* num can mean different things
+ * ssa: index of instruction
+ * reg: index in ctx->reg array
+ * input: index in ctx->input array
+ * const: constant index (C0, C1, etc)
+ */
+ uint16_t num;
+ uint8_t swizzle;
+ enum ir2_src_type type : 2;
+ uint8_t abs : 1;
+ uint8_t negate : 1;
+ uint8_t : 4;
+};
+
+struct ir2_reg_component {
+	uint8_t c : 3; /* assigned x/y/z/w (7=don't write, for fetch instr) */
+ bool alloc : 1; /* is it currently allocated */
+ uint8_t ref_count; /* for ra */
+};
+
+struct ir2_reg {
+ uint8_t idx; /* assigned hardware register */
+ uint8_t ncomp;
+
+ uint8_t loop_depth;
+ bool initialized;
+ /* block_idx to free on (-1 = free on ref_count==0) */
+ int block_idx_free;
+ struct ir2_reg_component comp[4];
+};
+
+struct ir2_instr {
+ unsigned idx;
+
+ unsigned block_idx;
+
+ enum {
+ IR2_NONE,
+ IR2_FETCH,
+ IR2_ALU,
+ IR2_CF,
+ } type : 2;
+
+ /* instruction needs to be emitted (for scheduling) */
+ bool need_emit : 1;
+
+ /* predicate value - (usually) same for entire block */
+ uint8_t pred : 2;
+
+ /* src */
+ uint8_t src_count;
+ struct ir2_src src[4];
+
+ /* dst */
+ bool is_ssa;
+ union {
+ struct ir2_reg ssa;
+ struct ir2_reg *reg;
+ };
+
+ /* type-specific */
+ union {
+ struct {
+ instr_fetch_opc_t opc : 5;
+ union {
+ struct {
+ uint8_t const_idx;
+ uint8_t const_idx_sel;
+ } vtx;
+ struct {
+ bool is_cube : 1;
+ bool is_rect : 1;
+ uint8_t samp_id;
+ } tex;
+ };
+ } fetch;
+ struct {
+			/* store both possible opcs, so we can later choose vector or scalar instr */
+ instr_scalar_opc_t scalar_opc : 6;
+ instr_vector_opc_t vector_opc : 5;
+ /* same as nir */
+ uint8_t write_mask : 4;
+ bool saturate : 1;
+
+ /* export idx (-1 no export) */
+ int8_t export;
+
+ /* for scalarized 2 src instruction */
+ uint8_t src1_swizzle;
+ } alu;
+ struct {
+ /* jmp dst block_idx */
+ uint8_t block_idx;
+ } cf;
+ };
+};
+
+struct ir2_sched_instr {
+ uint32_t reg_state[8];
+ struct ir2_instr *instr, *instr_s;
+};
+
+struct ir2_context {
+ struct fd2_shader_stateobj *so;
+
+ unsigned block_idx, pred_idx;
+ uint8_t pred;
+ bool block_has_jump[64];
+
+ unsigned loop_last_block[64];
+ unsigned loop_depth;
+
+ nir_shader *nir;
+
+ /* ssa index of position output */
+ struct ir2_src position;
+
+ /* to translate SSA ids to instruction ids */
+ int16_t ssa_map[1024];
+
+ struct ir2_shader_info *info;
+ struct ir2_frag_linkage *f;
+
+ int prev_export;
+
+ /* RA state */
+ struct ir2_reg* live_regs[64];
+ uint32_t reg_state[256/32]; /* 64*4 bits */
+
+ /* inputs */
+ struct ir2_reg input[16 + 1]; /* 16 + param */
+
+ /* non-ssa regs */
+ struct ir2_reg reg[64];
+ unsigned reg_count;
+
+ struct ir2_instr instr[0x300];
+ unsigned instr_count;
+
+ struct ir2_sched_instr instr_sched[0x180];
+ unsigned instr_sched_count;
+};
+
+void assemble(struct ir2_context *ctx, bool binning);
+
+void ir2_nir_compile(struct ir2_context *ctx, bool binning);
+
+void ra_count_refs(struct ir2_context *ctx);
+void ra_reg(struct ir2_context *ctx, struct ir2_reg *reg, int force_idx,
+ bool export, uint8_t export_writemask);
+void ra_src_free(struct ir2_context *ctx, struct ir2_instr *instr);
+void ra_block_free(struct ir2_context *ctx, unsigned block);
+
+/* utils */
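+/* the swizzle values below are in the hardware's relative encoding; see
+ * swiz_set()/swiz_get() further down for how the 2-bit fields are built
+ */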
+enum {
+ IR2_SWIZZLE_Y = 1 << 0,
+ IR2_SWIZZLE_Z = 2 << 0,
+ IR2_SWIZZLE_W = 3 << 0,
+
+ IR2_SWIZZLE_ZW = 2 << 0 | 2 << 2,
+
+ IR2_SWIZZLE_XYW = 0 << 0 | 0 << 2 | 1 << 4,
+
+ IR2_SWIZZLE_XXXX = 0 << 0 | 3 << 2 | 2 << 4 | 1 << 6,
+ IR2_SWIZZLE_YYYY = 1 << 0 | 0 << 2 | 3 << 4 | 2 << 6,
+ IR2_SWIZZLE_ZZZZ = 2 << 0 | 1 << 2 | 0 << 4 | 3 << 6,
+ IR2_SWIZZLE_WWWW = 3 << 0 | 2 << 2 | 1 << 4 | 0 << 6,
+ IR2_SWIZZLE_WYWW = 3 << 0 | 0 << 2 | 1 << 4 | 0 << 6,
+ IR2_SWIZZLE_XYXY = 0 << 0 | 0 << 2 | 2 << 4 | 2 << 6,
+ IR2_SWIZZLE_ZZXY = 2 << 0 | 1 << 2 | 2 << 4 | 2 << 6,
+ IR2_SWIZZLE_YXZZ = 1 << 0 | 3 << 2 | 0 << 4 | 3 << 6,
+};
+
+#define compile_error(ctx, args...) ({ \
+ printf(args); \
+ assert(0); \
+})
+
+static inline struct ir2_src
+ir2_src(uint16_t num, uint8_t swizzle, enum ir2_src_type type)
+{
+ return (struct ir2_src) {
+ .num = num,
+ .swizzle = swizzle,
+ .type = type
+ };
+}
+
+/* ir2_assemble uses it .. */
+struct ir2_src ir2_zero(struct ir2_context *ctx);
+
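+/* iterate all instructions, skipping ones that were killed (IR2_NONE) */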
+#define ir2_foreach_instr(it, ctx) \
+ for (struct ir2_instr *it = (ctx)->instr; ({ \
+ while (it != &(ctx)->instr[(ctx)->instr_count] && it->type == IR2_NONE) it++; \
+ it != &(ctx)->instr[(ctx)->instr_count]; }); it++)
+
+#define ir2_foreach_live_reg(it, ctx) \
+ for (struct ir2_reg **__ptr = (ctx)->live_regs, *it; ({ \
+ while (__ptr != &(ctx)->live_regs[64] && *__ptr == NULL) __ptr++; \
+ __ptr != &(ctx)->live_regs[64] ? (it=*__ptr) : NULL; }); it++)
+
+#define ir2_foreach_avail(it) \
+ for (struct ir2_instr **__instrp = avail, *it; \
+ it = *__instrp, __instrp != &avail[avail_count]; __instrp++)
+
+#define ir2_foreach_src(it, instr) \
+ for (struct ir2_src *it = instr->src; \
+ it != &instr->src[instr->src_count]; it++)
+
+/* mask for register allocation
+ * 64 registers with 4 components each = 256 bits
+ */
+/* typedef struct {
+ uint64_t data[4];
+} regmask_t; */
+
+static inline bool mask_isset(uint32_t * mask, unsigned num)
+{
+	return !!(mask[num / 32] & 1 << num % 32);
+}
+
+static inline void mask_set(uint32_t * mask, unsigned num)
+{
+ mask[num / 32] |= 1 << num % 32;
+}
+
+static inline void mask_unset(uint32_t * mask, unsigned num)
+{
+ mask[num / 32] &= ~(1 << num % 32);
+}
+
+static inline unsigned mask_reg(uint32_t * mask, unsigned num)
+{
+ return mask[num / 8] >> num % 8 * 4 & 0xf;
+}
+
+static inline bool is_export(struct ir2_instr *instr)
+{
+ return instr->type == IR2_ALU && instr->alu.export >= 0;
+}
+
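+/* map an export index to its buffer: 0..31 are parameter/pixel exports,
+ * 62/63 are position/point size, and the rest are memory exports (hw binning)
+ */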
+static inline instr_alloc_type_t export_buf(unsigned num)
+{
+ return num < 32 ? SQ_PARAMETER_PIXEL :
+ num >= 62 ? SQ_POSITION : SQ_MEMORY;
+}
+
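+/* a swizzle is stored as four 2-bit fields, each holding
+ * (source component - channel index) & 3; e.g. IR2_SWIZZLE_XXXX stores
+ * 0,3,2,1 because every channel reads component X
+ */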
+/* component c for channel i */
+static inline unsigned swiz_set(unsigned c, unsigned i)
+{
+ return ((c - i) & 3) << i * 2;
+}
+
+/* get swizzle in channel i */
+static inline unsigned swiz_get(unsigned swiz, unsigned i)
+{
+ return ((swiz >> i * 2) + i) & 3;
+}
+
+static inline unsigned swiz_merge(unsigned swiz0, unsigned swiz1)
+{
+ unsigned swiz = 0;
+ for (int i = 0; i < 4; i++)
+ swiz |= swiz_set(swiz_get(swiz0, swiz_get(swiz1, i)), i);
+ return swiz;
+}
+
+static inline void swiz_merge_p(uint8_t *swiz0, unsigned swiz1)
+{
+ unsigned swiz = 0;
+ for (int i = 0; i < 4; i++)
+ swiz |= swiz_set(swiz_get(*swiz0, swiz_get(swiz1, i)), i);
+ *swiz0 = swiz;
+}
+
+static inline struct ir2_reg * get_reg(struct ir2_instr *instr)
+{
+ return instr->is_ssa ? &instr->ssa : instr->reg;
+}
+
+static inline struct ir2_reg *
+get_reg_src(struct ir2_context *ctx, struct ir2_src *src)
+{
+ switch (src->type) {
+ case IR2_SRC_INPUT:
+ return &ctx->input[src->num];
+ case IR2_SRC_SSA:
+ return &ctx->instr[src->num].ssa;
+ case IR2_SRC_REG:
+ return &ctx->reg[src->num];
+ default:
+ return NULL;
+ }
+}
+
+/* gets a ncomp value for the dst */
+static inline unsigned dst_ncomp(struct ir2_instr *instr)
+{
+ if (instr->is_ssa)
+ return instr->ssa.ncomp;
+
+ if (instr->type == IR2_FETCH)
+ return instr->reg->ncomp;
+
+ assert(instr->type == IR2_ALU);
+
+ unsigned ncomp = 0;
+ for (int i = 0; i < instr->reg->ncomp; i++)
+ ncomp += !!(instr->alu.write_mask & 1 << i);
+ return ncomp;
+}
+
+/* gets a ncomp value for the src registers */
+static inline unsigned src_ncomp(struct ir2_instr *instr)
+{
+ if (instr->type == IR2_FETCH) {
+ switch (instr->fetch.opc) {
+ case VTX_FETCH:
+ return 1;
+ case TEX_FETCH:
+ return instr->fetch.tex.is_cube ? 3 : 2;
+ case TEX_SET_TEX_LOD:
+ return 1;
+ default:
+ assert(0);
+ }
+ }
+
+ switch (instr->alu.scalar_opc) {
+ case PRED_SETEs ... KILLONEs:
+ return 1;
+ default:
+ break;
+ }
+
+ switch (instr->alu.vector_opc) {
+ case DOT2ADDv:
+ return 2;
+ case DOT3v:
+ return 3;
+ case DOT4v:
+ case CUBEv:
+ case PRED_SETE_PUSHv:
+ return 4;
+ default:
+ return dst_ncomp(instr);
+ }
+}
--- /dev/null
+/*
+ * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Jonathan Marek <jonathan@marek.ca>
+ */
+
+#include "ir2_private.h"
+
+/* if an instruction has side effects, we should never kill it */
+static bool has_side_effects(struct ir2_instr *instr)
+{
+ if (instr->type == IR2_CF)
+ return true;
+ else if (instr->type == IR2_FETCH)
+ return false;
+
+ switch (instr->alu.scalar_opc) {
+ case PRED_SETEs ... KILLONEs:
+ return true;
+ default:
+ break;
+ }
+
+ switch (instr->alu.vector_opc) {
+ case PRED_SETE_PUSHv ... KILLNEv:
+ return true;
+ default:
+ break;
+ }
+
+ return instr->alu.export >= 0;
+}
+
+/* mark an instruction as required, and all its sources recursively */
+static void set_need_emit(struct ir2_context *ctx, struct ir2_instr *instr)
+{
+ struct ir2_reg *reg;
+
+ /* don't repeat work already done */
+ if (instr->need_emit)
+ return;
+
+ instr->need_emit = true;
+
+ ir2_foreach_src(src, instr) {
+ switch (src->type) {
+ case IR2_SRC_SSA:
+ set_need_emit(ctx, &ctx->instr[src->num]);
+ break;
+ case IR2_SRC_REG:
+ /* slow .. */
+ reg = get_reg_src(ctx, src);
+ ir2_foreach_instr(instr, ctx) {
+ if (!instr->is_ssa && instr->reg == reg)
+ set_need_emit(ctx, instr);
+ }
+ default:
+ break;
+ }
+ }
+}
+
+/* get current bit mask of allocated components for a register */
+static unsigned reg_mask(struct ir2_context *ctx, unsigned idx)
+{
+ return ctx->reg_state[idx/8] >> idx%8*4 & 0xf;
+}
+
+static void reg_setmask(struct ir2_context *ctx, unsigned idx, unsigned c)
+{
+ idx = idx * 4 + c;
+ ctx->reg_state[idx/32] |= 1 << idx%32;
+}
+
+static void reg_freemask(struct ir2_context *ctx, unsigned idx, unsigned c)
+{
+ idx = idx * 4 + c;
+ ctx->reg_state[idx/32] &= ~(1 << idx%32);
+}
+
+void ra_count_refs(struct ir2_context *ctx)
+{
+ struct ir2_reg *reg;
+
+	/* mark which instructions are needed:
+	 * the "substitutions" pass leaves behind many movs that are no longer needed
+	 */
+ ir2_foreach_instr(instr, ctx) {
+ if (has_side_effects(instr))
+ set_need_emit(ctx, instr);
+ }
+
+ /* compute ref_counts */
+ ir2_foreach_instr(instr, ctx) {
+ /* kill non-needed so they can be skipped */
+ if (!instr->need_emit) {
+ instr->type = IR2_NONE;
+ continue;
+ }
+
+ ir2_foreach_src(src, instr) {
+ if (src->type == IR2_SRC_CONST)
+ continue;
+
+ reg = get_reg_src(ctx, src);
+ for (int i = 0; i < src_ncomp(instr); i++)
+ reg->comp[swiz_get(src->swizzle, i)].ref_count++;
+ }
+ }
+}
+
+void ra_reg(struct ir2_context *ctx, struct ir2_reg *reg, int force_idx,
+ bool export, uint8_t export_writemask)
+{
+ /* for export, don't allocate anything but set component layout */
+ if (export) {
+ for (int i = 0; i < 4; i++)
+ reg->comp[i].c = i;
+ return;
+ }
+
+ unsigned idx = force_idx;
+
+	/* TODO: allocate into the same register if there's room
+ * note: the blob doesn't do it, so verify that it is indeed better
+ * also, doing it would conflict with scalar mov insertion
+ */
+
+ /* check if already allocated */
+ for (int i = 0; i < reg->ncomp; i++) {
+ if (reg->comp[i].alloc)
+ return;
+ }
+
+ if (force_idx < 0) {
+ for (idx = 0; idx < 64; idx++) {
+ if (reg_mask(ctx, idx) == 0)
+ break;
+ }
+ }
+ assert(idx != 64); /* TODO ran out of register space.. */
+
+ /* update max_reg value */
+ ctx->info->max_reg = MAX2(ctx->info->max_reg, (int) idx);
+
+ unsigned mask = reg_mask(ctx, idx);
+
+ for (int i = 0; i < reg->ncomp; i++) {
+ /* don't allocate never used values */
+ if (reg->comp[i].ref_count == 0) {
+ reg->comp[i].c = 7;
+ continue;
+ }
+
+ /* TODO */
+ unsigned c = 1 ? i : (ffs(~mask) - 1);
+ mask |= 1 << c;
+ reg->comp[i].c = c;
+ reg_setmask(ctx, idx, c);
+ reg->comp[i].alloc = true;
+ }
+
+ reg->idx = idx;
+ ctx->live_regs[reg->idx] = reg;
+}
+
+/* reduce srcs ref_count and free if needed */
+void ra_src_free(struct ir2_context *ctx, struct ir2_instr *instr)
+{
+ struct ir2_reg *reg;
+ struct ir2_reg_component *comp;
+
+ ir2_foreach_src(src, instr) {
+ if (src->type == IR2_SRC_CONST)
+ continue;
+
+ reg = get_reg_src(ctx, src);
+ /* XXX use before write case */
+
+ for (int i = 0; i < src_ncomp(instr); i++) {
+			comp = &reg->comp[swiz_get(src->swizzle, i)];
+ if (!--comp->ref_count && reg->block_idx_free < 0) {
+ reg_freemask(ctx, reg->idx, comp->c);
+ comp->alloc = false;
+ }
+ }
+ }
+}
+
+/* free any regs left for a block */
+void ra_block_free(struct ir2_context *ctx, unsigned block)
+{
+ ir2_foreach_live_reg(reg, ctx) {
+ if (reg->block_idx_free != block)
+ continue;
+
+ for (int i = 0; i < reg->ncomp; i++) {
+ if (!reg->comp[i].alloc) /* XXX should never be true? */
+ continue;
+
+ reg_freemask(ctx, reg->idx, reg->comp[i].c);
+ reg->comp[i].alloc = false;
+ }
+ ctx->live_regs[reg->idx] = NULL;
+ }
+}
struct fd_program_stateobj {
void *vp, *fp;
-
- /* rest only used by fd2.. split out: */
- uint8_t num_exports;
- /* Indexed by semantic name or TGSI_SEMANTIC_COUNT + semantic index
- * for TGSI_SEMANTIC_GENERIC. Special vs exports (position and point-
- * size) are not included in this
- */
- uint8_t export_linkage[63];
};
struct fd_constbuf_stateobj {
pctx->bind_fs_state = fd_fp_state_bind;
pctx->bind_vs_state = fd_vp_state_bind;
- // XXX for now, let a2xx keep it's own hand-rolled shaders
- // for solid and blit progs:
- if (ctx->screen->gpu_id < 300)
- return;
-
ctx->solid_prog.fp = assemble_tgsi(pctx, solid_fp, true);
ctx->solid_prog.vp = assemble_tgsi(pctx, solid_vp, false);
ctx->blit_prog[0].vp = assemble_tgsi(pctx, blit_vp, false);
ctx->blit_prog[0].fp = fd_prog_blit(pctx, 1, false);
+
+ if (ctx->screen->gpu_id < 300)
+ return;
+
for (i = 1; i < ctx->screen->max_rts; i++) {
ctx->blit_prog[i].vp = ctx->blit_prog[0].vp;
ctx->blit_prog[i].fp = fd_prog_blit(pctx, i + 1, false);
#include "ir3/ir3_nir.h"
+#include "a2xx/ir2.h"
/* XXX this should go away */
#include "state_tracker/drm_driver.h"
case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
return 16;
case PIPE_SHADER_CAP_PREFERRED_IR:
- if (is_ir3(screen))
- return PIPE_SHADER_IR_NIR;
- return PIPE_SHADER_IR_TGSI;
+ return PIPE_SHADER_IR_NIR;
case PIPE_SHADER_CAP_SUPPORTED_IRS:
- if (is_ir3(screen)) {
- return (1 << PIPE_SHADER_IR_NIR) | (1 << PIPE_SHADER_IR_TGSI);
- } else {
- return (1 << PIPE_SHADER_IR_TGSI);
- }
- return 0;
+ return (1 << PIPE_SHADER_IR_NIR) | (1 << PIPE_SHADER_IR_TGSI);
case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
return 32;
case PIPE_SHADER_CAP_SCALAR_ISA:
if (is_ir3(screen))
return ir3_get_compiler_options(screen->compiler);
- return NULL;
+ return ir2_get_compiler_options();
}
boolean
'a2xx/disasm-a2xx.c',
'a2xx/fd2_blend.c',
'a2xx/fd2_blend.h',
- 'a2xx/fd2_compiler.c',
- 'a2xx/fd2_compiler.h',
'a2xx/fd2_context.c',
'a2xx/fd2_context.h',
'a2xx/fd2_draw.c',
'a2xx/fd2_zsa.c',
'a2xx/fd2_zsa.h',
'a2xx/instr-a2xx.h',
- 'a2xx/ir-a2xx.c',
- 'a2xx/ir-a2xx.h',
+ 'a2xx/ir2.c',
+ 'a2xx/ir2.h',
+ 'a2xx/ir2_assemble.c',
+ 'a2xx/ir2_nir.c',
+ 'a2xx/ir2_private.h',
+ 'a2xx/ir2_ra.c',
'a3xx/fd3_blend.c',
'a3xx/fd3_blend.h',
'a3xx/fd3_context.c',