r300/compiler: Fix nested flow control in r500 vertex shaders
author Tom Stellard <tstellar@gmail.com>
Wed, 21 Sep 2011 04:05:55 +0000 (21:05 -0700)
committer Tom Stellard <thomas.stellard@amd.com>
Sat, 14 Apr 2012 02:24:16 +0000 (22:24 -0400)
src/gallium/drivers/r300/Makefile.sources
src/gallium/drivers/r300/compiler/r3xx_vertprog.c
src/gallium/drivers/r300/compiler/r3xx_vertprog_dump.c
src/gallium/drivers/r300/compiler/radeon_code.h
src/gallium/drivers/r300/compiler/radeon_compiler.h
src/gallium/drivers/r300/compiler/radeon_opcodes.c
src/gallium/drivers/r300/compiler/radeon_opcodes.h
src/gallium/drivers/r300/compiler/radeon_program.h
src/gallium/drivers/r300/compiler/radeon_program_constants.h
src/gallium/drivers/r300/compiler/radeon_program_print.c
src/gallium/drivers/r300/compiler/radeon_vert_fc.c [new file with mode: 0644]

index e27b14e..1e7d31b 100644 (file)
@@ -46,6 +46,7 @@ C_SOURCES := \
        compiler/radeon_optimize.c \
        compiler/radeon_remove_constants.c \
        compiler/radeon_rename_regs.c \
+       compiler/radeon_vert_fc.c \
        compiler/radeon_variable.c \
        compiler/r3xx_fragprog.c \
        compiler/r300_fragprog.c \
index a8d8ebc..94733d7 100644 (file)
 
 #include "radeon_compiler_util.h"
 #include "radeon_dataflow.h"
+#include "radeon_program.h"
 #include "radeon_program_alu.h"
 #include "radeon_swizzle.h"
 #include "radeon_emulate_branches.h"
 #include "radeon_emulate_loops.h"
 #include "radeon_remove_constants.h"
 
-struct loop {
-       int BgnLoop;
-
-};
-
 /*
  * Take an already-setup and valid source then swizzle it appropriately to
  * obtain a constant ZERO or ONE source.
@@ -359,140 +355,13 @@ static void ei_pow(struct r300_vertex_program_code *vp,
        inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
 }
 
-static void mark_write(void * userdata,        struct rc_instruction * inst,
-               rc_register_file file,  unsigned int index, unsigned int mask)
-{
-       unsigned int * writemasks = userdata;
-
-       if (file != RC_FILE_TEMPORARY)
-               return;
-
-       if (index >= R300_VS_MAX_TEMPS)
-               return;
-
-       writemasks[index] |= mask;
-}
-
-static unsigned long t_pred_src(struct r300_vertex_program_compiler * compiler)
-{
-       return PVS_SRC_OPERAND(compiler->PredicateIndex,
-               t_swizzle(RC_SWIZZLE_ZERO),
-               t_swizzle(RC_SWIZZLE_ZERO),
-               t_swizzle(RC_SWIZZLE_ZERO),
-               t_swizzle(RC_SWIZZLE_W),
-               t_src_class(RC_FILE_TEMPORARY),
-               0);
-}
-
-static unsigned long t_pred_dst(struct r300_vertex_program_compiler * compiler,
-                                       unsigned int hw_opcode, int is_math)
-{
-       return PVS_OP_DST_OPERAND(hw_opcode,
-            is_math,
-            0,
-            compiler->PredicateIndex,
-            RC_MASK_W,
-            t_dst_class(RC_FILE_TEMPORARY));
-
-}
-
-static void ei_if(struct r300_vertex_program_compiler * compiler,
-                                       struct rc_instruction *rci,
-                                       unsigned int * inst,
-                                       unsigned int branch_depth)
-{
-       unsigned int predicate_opcode;
-       int is_math = 0;
-
-       if (!compiler->Base.is_r500) {
-               rc_error(&compiler->Base,"Opcode IF not supported\n");
-               return;
-       }
-
-       /* Reserve a temporary to use as our predicate stack counter, if we
-        * don't already have one. */
-       if (!compiler->PredicateMask) {
-               unsigned int writemasks[RC_REGISTER_MAX_INDEX];
-               struct rc_instruction * inst;
-               unsigned int i;
-               memset(writemasks, 0, sizeof(writemasks));
-               for(inst = compiler->Base.Program.Instructions.Next;
-                               inst != &compiler->Base.Program.Instructions;
-                                                       inst = inst->Next) {
-                       rc_for_all_writes_mask(inst, mark_write, writemasks);
-               }
-               for(i = 0; i < compiler->Base.max_temp_regs; i++) {
-                       unsigned int mask = ~writemasks[i] & RC_MASK_XYZW;
-                       /* Only the W component can be used fo the predicate
-                        * stack counter. */
-                       if (mask & RC_MASK_W) {
-                               compiler->PredicateMask = RC_MASK_W;
-                               compiler->PredicateIndex = i;
-                               break;
-                       }
-               }
-               if (i == compiler->Base.max_temp_regs) {
-                       rc_error(&compiler->Base, "No free temporary to use for"
-                                       " predicate stack counter.\n");
-                       return;
-               }
-       }
-       predicate_opcode =
-                       branch_depth ? VE_PRED_SET_NEQ_PUSH : ME_PRED_SET_NEQ;
-
-       rci->U.I.SrcReg[0].Swizzle = RC_MAKE_SWIZZLE_SMEAR(GET_SWZ(rci->U.I.SrcReg[0].Swizzle,0));
-       if (branch_depth == 0) {
-               is_math = 1;
-               predicate_opcode = ME_PRED_SET_NEQ;
-               inst[1] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
-               inst[2] = 0;
-       } else {
-               predicate_opcode = VE_PRED_SET_NEQ_PUSH;
-               inst[1] = t_pred_src(compiler);
-               inst[2] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
-       }
-
-       inst[0] = t_pred_dst(compiler, predicate_opcode, is_math);
-       inst[3] = 0;
-
-}
-
-static void ei_else(struct r300_vertex_program_compiler * compiler,
-                                                       unsigned int * inst)
-{
-       if (!compiler->Base.is_r500) {
-               rc_error(&compiler->Base,"Opcode ELSE not supported\n");
-               return;
-       }
-       inst[0] = t_pred_dst(compiler, ME_PRED_SET_INV, 1);
-       inst[1] = t_pred_src(compiler);
-       inst[2] = 0;
-       inst[3] = 0;
-}
-
-static void ei_endif(struct r300_vertex_program_compiler *compiler,
-                                                       unsigned int * inst)
-{
-       if (!compiler->Base.is_r500) {
-               rc_error(&compiler->Base,"Opcode ENDIF not supported\n");
-               return;
-       }
-       inst[0] = t_pred_dst(compiler, ME_PRED_SET_POP, 1);
-       inst[1] = t_pred_src(compiler);
-       inst[2] = 0;
-       inst[3] = 0;
-}
-
 static void translate_vertex_program(struct radeon_compiler *c, void *user)
 {
        struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
        struct rc_instruction *rci;
 
-       struct loop * loops = NULL;
-       int current_loop_depth = 0;
-       int loops_reserved = 0;
-
-       unsigned int branch_depth = 0;
+       unsigned loops[R500_PVS_MAX_LOOP_DEPTH];
+       unsigned loop_depth = 0;
 
        compiler->code->pos_end = 0;    /* Not supported yet */
        compiler->code->length = 0;
@@ -532,12 +401,9 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
                case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
                case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
                case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
-               case RC_OPCODE_ELSE: ei_else(compiler, inst); break;
-               case RC_OPCODE_ENDIF: ei_endif(compiler, inst); branch_depth--; break;
                case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
                case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
                case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
-               case RC_OPCODE_IF: ei_if(compiler, rci, inst, branch_depth); branch_depth++; break;
                case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
                case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
                case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
@@ -556,37 +422,27 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
                case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
                case RC_OPCODE_BGNLOOP:
                {
-                       struct loop * l;
-
                        if ((!compiler->Base.is_r500
-                               && loops_reserved >= R300_VS_MAX_LOOP_DEPTH)
-                               || loops_reserved >= R500_VS_MAX_FC_DEPTH) {
+                               && loop_depth >= R300_VS_MAX_LOOP_DEPTH)
+                               || loop_depth >= R500_PVS_MAX_LOOP_DEPTH) {
                                rc_error(&compiler->Base,
                                                "Loops are nested too deep.");
                                return;
                        }
-                       memory_pool_array_reserve(&compiler->Base.Pool,
-                                       struct loop, loops, current_loop_depth,
-                                       loops_reserved, 1);
-                       l = &loops[current_loop_depth++];
-                       memset(l , 0, sizeof(struct loop));
-                       l->BgnLoop = (compiler->code->length / 4);
-                       continue;
+                       loops[loop_depth++] = ((compiler->code->length)/ 4) + 1;
+                       break;
                }
                case RC_OPCODE_ENDLOOP:
                {
-                       struct loop * l;
                        unsigned int act_addr;
                        unsigned int last_addr;
                        unsigned int ret_addr;
 
-                       assert(loops);
-                       l = &loops[current_loop_depth - 1];
-                       act_addr = l->BgnLoop - 1;
+                       ret_addr = loops[--loop_depth];
+                       act_addr = ret_addr - 1;
                        last_addr = (compiler->code->length / 4) - 1;
-                       ret_addr = l->BgnLoop;
 
-                       if (loops_reserved >= R300_VS_MAX_FC_OPS) {
+                       if (loop_depth >= R300_VS_MAX_FC_OPS) {
                                rc_error(&compiler->Base,
                                        "Too many flow control instructions.");
                                return;
@@ -595,7 +451,7 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
                                compiler->code->fc_op_addrs.r500
                                        [compiler->code->num_fc_ops].lw =
                                        R500_PVS_FC_ACT_ADRS(act_addr)
-                                       | R500_PVS_FC_LOOP_CNT_JMP_INST(0xffff)
+                                       | R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff)
                                        ;
                                compiler->code->fc_op_addrs.r500
                                        [compiler->code->num_fc_ops].uw =
@@ -618,26 +474,51 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
                        compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
                                                compiler->code->num_fc_ops);
                        compiler->code->num_fc_ops++;
-                       current_loop_depth--;
-                       continue;
+
+                       break;
                }
 
+               case RC_ME_PRED_SET_CLR:
+                       ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst);
+                       break;
+
+               case RC_ME_PRED_SET_INV:
+                       ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst);
+                       break;
+
+               case RC_ME_PRED_SET_POP:
+                       ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst);
+                       break;
+
+               case RC_ME_PRED_SET_RESTORE:
+                       ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst);
+                       break;
+
+               case RC_ME_PRED_SEQ:
+                       ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst);
+                       break;
+
+               case RC_ME_PRED_SNEQ:
+                       ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst);
+                       break;
+
+               case RC_VE_PRED_SNEQ_PUSH:
+                       ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH,
+                                                               vpi, inst);
+                       break;
+
                default:
                        rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
                        return;
                }
 
-               /* Non-flow control instructions that are inside an if statement
-                * need to pay attention to the predicate bit. */
-               if (branch_depth
-                       && vpi->Opcode != RC_OPCODE_IF
-                       && vpi->Opcode != RC_OPCODE_ELSE
-                       && vpi->Opcode != RC_OPCODE_ENDIF) {
-
+               if (vpi->DstReg.Pred != RC_PRED_DISABLED) {
                        inst[0] |= (PVS_DST_PRED_ENABLE_MASK
                                                << PVS_DST_PRED_ENABLE_SHIFT);
-                       inst[0] |= (PVS_DST_PRED_SENSE_MASK
+                       if (vpi->DstReg.Pred == RC_PRED_SET) {
+                               inst[0] |= (PVS_DST_PRED_SENSE_MASK
                                                << PVS_DST_PRED_SENSE_SHIFT);
+                       }
                }
 
                /* Update the number of temporaries. */
@@ -650,10 +531,6 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
                            vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
                                compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
 
-               if (compiler->PredicateMask)
-                       if (compiler->PredicateIndex >= compiler->code->num_temporaries)
-                               compiler->code->num_temporaries = compiler->PredicateIndex + 1;
-
                if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
                        rc_error(&compiler->Base, "Too many temporaries.\n");
                        return;
@@ -1018,7 +895,6 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
        struct radeon_compiler_pass vs_list[] = {
                /* NAME                         DUMP PREDICATE  FUNCTION                        PARAM */
                {"add artificial outputs",      0, 1,           rc_vs_add_artificial_outputs,   NULL},
-               {"transform loops",             1, 1,           rc_transform_loops,             NULL},
                {"emulate branches",            1, !is_r500,    rc_emulate_branches,            NULL},
                {"emulate negative addressing", 1, 1,           rc_emulate_negative_addressing, NULL},
                {"native rewrite",              1, is_r500,     rc_local_transform,             alu_rewrite_r500},
@@ -1030,6 +906,7 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
                {"source conflict resolve",     1, 1,           rc_local_transform,             resolve_src_conflicts},
                {"register allocation",         1, opt,         allocate_temporary_registers,   NULL},
                {"dead constants",              1, 1,           rc_remove_unused_constants,     &c->code->constants_remap_table},
+               {"lower control flow opcodes",  1, is_r500,     rc_vert_fc,                     NULL},
                {"final code validation",       0, 1,           rc_validate_final_shader,       NULL},
                {"machine code generation",     0, 1,           translate_vertex_program,       NULL},
                {"dump machine code",           0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump,        NULL},
index 2bc0a87..a41559c 100644 (file)
@@ -190,16 +190,25 @@ void r300_vertex_program_dump(struct radeon_compiler *compiler, void *user)
 
        fprintf(stderr, "Flow Control Ops: 0x%08x\n",vs->fc_ops);
        for(i = 0; i < vs->num_fc_ops; i++) {
+               unsigned is_loop = 0;
                switch((vs->fc_ops >> (i * 2)) & 0x3 ) {
                case 0: fprintf(stderr, "NOP"); break;
                case 1: fprintf(stderr, "JUMP"); break;
-               case 2: fprintf(stderr, "LOOP"); break;
+               case 2: fprintf(stderr, "LOOP"); is_loop = 1; break;
                case 3: fprintf(stderr, "JSR"); break;
                }
                if (c->Base.is_r500) {
-                       fprintf(stderr,": uw-> 0x%08x lw-> 0x%08x\n",
+                       fprintf(stderr,": uw-> 0x%08x lw-> 0x%08x "
+                                                       "loop data->0x%08x\n",
                                vs->fc_op_addrs.r500[i].uw,
-                               vs->fc_op_addrs.r500[i].lw);
+                               vs->fc_op_addrs.r500[i].lw,
+                               vs->fc_loop_index[i]);
+                       if (is_loop) {
+                               fprintf(stderr, "Before = %u First = %u Last = %u\n",
+                                       vs->fc_op_addrs.r500[i].lw & 0xffff,
+                                       (vs->fc_op_addrs.r500[i].uw >> 16) & 0xffff,
+                                       vs->fc_op_addrs.r500[i].uw  & 0xffff);
+                       }
                } else {
                        fprintf(stderr,": 0x%08x\n", vs->fc_op_addrs.r300[i]);
                }
index 4280d66..44d5500 100644 (file)
@@ -40,6 +40,9 @@
 #define R500_PFS_MAX_BRANCH_DEPTH_FULL 32
 #define R500_PFS_MAX_BRANCH_DEPTH_PARTIAL 4
 
+/* The r500 maximum depth is not just for loops, but any combination of loops
+ * and subroutine jumps. */
+#define R500_PVS_MAX_LOOP_DEPTH 8
 
 #define STATE_R300_WINDOW_DIMENSION (STATE_INTERNAL_DRIVER+0)
 
@@ -262,9 +265,6 @@ struct rX00_fragment_program_code {
 #define R300_VS_MAX_TEMPS      32
 /* This is the max for all chipsets (r300-r500) */
 #define R300_VS_MAX_FC_OPS 16
-/* The r500 maximum depth is not just for loops, but any combination of loops
- * and subroutine jumps. */
-#define R500_VS_MAX_FC_DEPTH 8
 #define R300_VS_MAX_LOOP_DEPTH 1
 
 #define VSF_MAX_INPUTS 32
index e7ccbb7..d42cee9 100644 (file)
@@ -137,11 +137,10 @@ struct r300_vertex_program_compiler {
        void * UserData;
        void (*SetHwInputOutput)(struct r300_vertex_program_compiler * c);
 
-       int PredicateIndex;
-       unsigned int PredicateMask;
 };
 
 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* c);
+void rc_vert_fc(struct radeon_compiler *compiler, void *user);
 void r300_vertex_program_dump(struct radeon_compiler *compiler, void *user);
 
 struct radeon_compiler_pass {
index 3b49ad7..9bcb3c9 100644 (file)
@@ -437,6 +437,78 @@ struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = {
        {
                .Opcode = RC_OPCODE_KILP,
                .Name = "KILP",
+       },
+       {
+               .Opcode = RC_ME_PRED_SEQ,
+               .Name = "ME_PRED_SEQ",
+               .NumSrcRegs = 1,
+               .HasDstReg = 1
+       },
+       {
+               .Opcode = RC_ME_PRED_SGT,
+               .Name = "ME_PRED_SGT",
+               .NumSrcRegs = 1,
+               .HasDstReg = 1
+       },
+       {
+               .Opcode = RC_ME_PRED_SGE,
+               .Name = "ME_PRED_SGE",
+               .NumSrcRegs = 1,
+               .HasDstReg = 1
+       },
+       {
+               .Opcode = RC_ME_PRED_SNEQ,
+               .Name = "ME_PRED_SNEQ",
+               .NumSrcRegs = 1,
+               .HasDstReg = 1
+       },
+       {
+               .Opcode = RC_ME_PRED_SET_CLR,
+               .Name = "ME_PRED_SET_CLEAR",
+               .NumSrcRegs = 1,
+               .HasDstReg = 1
+       },
+       {
+               .Opcode = RC_ME_PRED_SET_INV,
+               .Name = "ME_PRED_SET_INV",
+               .NumSrcRegs = 1,
+               .HasDstReg = 1
+       },
+       {
+               .Opcode = RC_ME_PRED_SET_POP,
+               .Name = "ME_PRED_SET_POP",
+               .NumSrcRegs = 1,
+               .HasDstReg = 1
+       },
+       {
+               .Opcode = RC_ME_PRED_SET_RESTORE,
+               .Name = "ME_PRED_SET_RESTORE",
+               .NumSrcRegs = 1,
+               .HasDstReg = 1
+       },
+       {
+               .Opcode = RC_VE_PRED_SEQ_PUSH,
+               .Name = "VE_PRED_SEQ_PUSH",
+               .NumSrcRegs = 2,
+               .HasDstReg = 1
+       },
+       {
+               .Opcode = RC_VE_PRED_SGT_PUSH,
+               .Name = "VE_PRED_SGT_PUSH",
+               .NumSrcRegs = 2,
+               .HasDstReg = 1
+       },
+       {
+               .Opcode = RC_VE_PRED_SGE_PUSH,
+               .Name = "VE_PRED_SGE_PUSH",
+               .NumSrcRegs = 2,
+               .HasDstReg = 1
+       },
+       {
+               .Opcode = RC_VE_PRED_SNEQ_PUSH,
+               .Name = "VE_PRED_SNEQ_PUSH",
+               .NumSrcRegs = 2,
+               .HasDstReg = 1
        }
 };
 
index 0b881c2..9c4b456 100644 (file)
@@ -217,6 +217,21 @@ typedef enum {
        /** Stop execution of the shader (GLSL discard) */
        RC_OPCODE_KILP,
 
+       /* Vertex shader CF Instructions */
+       RC_ME_PRED_SEQ,
+       RC_ME_PRED_SGT,
+       RC_ME_PRED_SGE,
+       RC_ME_PRED_SNEQ,
+       RC_ME_PRED_SET_CLR,
+       RC_ME_PRED_SET_INV,
+       RC_ME_PRED_SET_POP,
+       RC_ME_PRED_SET_RESTORE,
+
+       RC_VE_PRED_SEQ_PUSH,
+       RC_VE_PRED_SGT_PUSH,
+       RC_VE_PRED_SGE_PUSH,
+       RC_VE_PRED_SNEQ_PUSH,
+
        MAX_RC_OPCODE
 } rc_opcode;
 
index e68be93..67be1b9 100644 (file)
@@ -58,6 +58,7 @@ struct rc_dst_register {
        unsigned int File:3;
        unsigned int Index:RC_REGISTER_INDEX_BITS;
        unsigned int WriteMask:4;
+       unsigned int Pred:2;
 };
 
 struct rc_presub_instruction {
index c07c492..4dbf649 100644 (file)
@@ -203,4 +203,10 @@ static inline int rc_presubtract_src_reg_count(rc_presubtract_op op){
 #define RC_SOURCE_RGB   0x1
 #define RC_SOURCE_ALPHA 0x2
 
+typedef enum { /* Predication mode stored in rc_dst_register.Pred */
+       RC_PRED_DISABLED, /* vertex code generation leaves the predicate bits clear */
+       RC_PRED_SET, /* vertex code generation sets the predicate enable and sense bits */
+       RC_PRED_INV /* vertex code generation sets the predicate enable bit only */
+} rc_predicate_mode;
+
 #endif /* RADEON_PROGRAM_CONSTANTS_H */
index e3d2104..29a349e 100644 (file)
@@ -329,6 +329,12 @@ static void rc_print_normal_instruction(FILE * f, struct rc_instruction * inst,
                fprintf(f, ")]");
        }
 
+       if (inst->U.I.DstReg.Pred == RC_PRED_SET) {
+               fprintf(f, " PRED_SET");
+       } else if (inst->U.I.DstReg.Pred == RC_PRED_INV) {
+               fprintf(f, " PRED_INV");
+       }
+
        fprintf(f, "\n");
 }
 
diff --git a/src/gallium/drivers/r300/compiler/radeon_vert_fc.c b/src/gallium/drivers/r300/compiler/radeon_vert_fc.c
new file mode 100644 (file)
index 0000000..3568b23
--- /dev/null
@@ -0,0 +1,274 @@
+
+#include "radeon_compiler.h"
+#include "radeon_compiler_util.h"
+#include "radeon_dataflow.h"
+#include "radeon_program.h"
+#include "radeon_program_constants.h"
+
+struct vert_fc_state { /* Bookkeeping carried from instruction to instruction by rc_vert_fc() */
+       struct radeon_compiler *C; /* compiler whose program is being lowered */
+       unsigned BranchDepth; /* IFs currently open: IF increments, ENDIF decrements */
+       unsigned LoopDepth; /* loops currently open: BGNLOOP increments, ENDLOOP decrements */
+       unsigned LoopsReserved; /* zeroed when the pass starts; never written afterwards in this file */
+       int PredStack[R500_PVS_MAX_LOOP_DEPTH]; /* PredicateReg values saved by lower_bgnloop(), read back by lower_endloop() */
+       int PredicateReg; /* index of the temporary used as predicate register; -1 until one is reserved */
+       unsigned InCFBreak; /* set by lower_if() when the IF is directly followed by BRK; cleared at ENDIF */
+};
+
+/* Make src read the current predicate register: temporary number
+ * fc_state->PredicateReg, with W selected in the last swizzle slot and the
+ * first three slots marked unused. */
+static void build_pred_src(
+       struct rc_src_register * src,
+       struct vert_fc_state * fc_state)
+{
+       src->File = RC_FILE_TEMPORARY;
+       src->Index = fc_state->PredicateReg;
+       src->Swizzle = RC_MAKE_SWIZZLE(RC_SWIZZLE_UNUSED,
+                                       RC_SWIZZLE_UNUSED,
+                                       RC_SWIZZLE_UNUSED,
+                                       RC_SWIZZLE_W);
+}
+
+/* Make dst write the W channel of the current predicate register (temporary
+ * number fc_state->PredicateReg). Only File, Index and WriteMask are touched;
+ * any other field of dst keeps its value. */
+static void build_pred_dst(
+       struct rc_dst_register * dst,
+       struct vert_fc_state * fc_state)
+{
+       dst->File = RC_FILE_TEMPORARY;
+       dst->Index = fc_state->PredicateReg;
+       dst->WriteMask = RC_MASK_W;
+}
+
+/* Callback handed to rc_for_all_writes_mask(): userdata is an array of write
+ * masks indexed by temporary register number. Writes to temporaries with an
+ * index below R300_VS_MAX_TEMPS are OR-ed into that array; everything else is
+ * ignored. */
+static void mark_write(void * userdata, struct rc_instruction * inst,
+               rc_register_file file, unsigned int index, unsigned int mask)
+{
+       unsigned int * masks_by_temp = userdata;
+
+       if (file == RC_FILE_TEMPORARY && index < R300_VS_MAX_TEMPS) {
+               masks_by_temp[index] |= mask;
+       }
+}
+
+/* Pick a temporary register that no instruction in the program writes and
+ * record it in fc_state->PredicateReg.
+ *
+ * Returns 1 on success. If every temporary below max_temp_regs is written
+ * somewhere, an error is reported through rc_error() and -1 is returned. */
+static int reserve_predicate_reg(struct vert_fc_state * fc_state)
+{
+       struct radeon_compiler * c = fc_state->C;
+       unsigned int written[RC_REGISTER_MAX_INDEX];
+       struct rc_instruction * cur;
+       int reg;
+
+       /* Collect, per temporary, the union of all channels written anywhere
+        * in the program. */
+       memset(written, 0, sizeof(written));
+       for (cur = c->Program.Instructions.Next;
+                               cur != &c->Program.Instructions;
+                               cur = cur->Next) {
+               rc_for_all_writes_mask(cur, mark_write, written);
+       }
+
+       /* Most of the control flow instructions only write the W component
+        * of the Predicate Register, but the docs say that ME_PRED_SET_CLR
+        * and ME_PRED_SET_RESTORE write all components of the register, so
+        * only a register with all of its components free will do. */
+       for (reg = 0; reg < c->max_temp_regs; reg++) {
+               if (written[reg] == 0) {
+                       fc_state->PredicateReg = reg;
+                       return 1;
+               }
+       }
+
+       rc_error(c, "No free temporary to use for"
+                       " predicate stack counter.\n");
+       return -1;
+}
+
+/* Prepare the predicate register for a loop by inserting one instruction in
+ * front of the BGNLOOP at inst (the BGNLOOP itself is left in place).
+ *
+ * - Outside all control flow: reserve a predicate register if none exists yet
+ *   and emit ME_PRED_SEQ on a zero source to initialize the predicate.
+ * - Inside an enclosing loop or branch: remember the predicate register in
+ *   use in PredStack[LoopDepth], reserve a fresh one for this loop and emit
+ *   an ADD (old predicate + 0) that copies the old value into it.
+ *
+ * fc_state->LoopDepth must still hold the depth *outside* this loop; the
+ * caller increments it afterwards. Failures are reported through rc_error()
+ * and the function returns without completing the lowering. */
+static void lower_bgnloop(
+       struct rc_instruction * inst,
+       struct vert_fc_state * fc_state)
+{
+       struct rc_instruction * new_inst;
+
+       /* Bound the nesting by the live loop depth. LoopsReserved is never
+        * updated, so testing it could not reject anything and
+        * PredStack[LoopDepth] below could be written out of bounds. */
+       if ((!fc_state->C->is_r500
+               && fc_state->LoopDepth >= R300_VS_MAX_LOOP_DEPTH)
+            || fc_state->LoopDepth >= R500_PVS_MAX_LOOP_DEPTH) {
+               rc_error(fc_state->C, "Loops are nested too deep.");
+               return;
+       }
+
+       /* Insert only once the loop is known to be acceptable, so the error
+        * path above does not leave an unfilled instruction behind. */
+       new_inst = rc_insert_new_instruction(fc_state->C, inst->Prev);
+
+       if (fc_state->LoopDepth == 0 && fc_state->BranchDepth == 0) {
+               if (fc_state->PredicateReg == -1) {
+                       if (reserve_predicate_reg(fc_state) == -1) {
+                               return;
+                       }
+               }
+
+               /* Initialize the predicate bit to true. */
+               new_inst->U.I.Opcode = RC_ME_PRED_SEQ;
+               build_pred_dst(&new_inst->U.I.DstReg, fc_state);
+               new_inst->U.I.SrcReg[0].Index = 0;
+               new_inst->U.I.SrcReg[0].File = RC_FILE_NONE;
+               new_inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000;
+       } else {
+               /* Remember the enclosing predicate register so that
+                * lower_endloop() can switch back to it. */
+               fc_state->PredStack[fc_state->LoopDepth] =
+                                               fc_state->PredicateReg;
+               /* Copy the current predicate value to this loop's
+                * predicate register */
+
+               /* Use the old predicate value for src0 */
+               build_pred_src(&new_inst->U.I.SrcReg[0], fc_state);
+
+               /* Reserve this loop's predicate register */
+               if (reserve_predicate_reg(fc_state) == -1) {
+                       return;
+               }
+
+               /* Copy the old predicate value to the new register */
+               new_inst->U.I.Opcode = RC_OPCODE_ADD;
+               build_pred_dst(&new_inst->U.I.DstReg, fc_state);
+               new_inst->U.I.SrcReg[1].Index = 0;
+               new_inst->U.I.SrcReg[1].File = RC_FILE_NONE;
+               new_inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_0000;
+       }
+
+}
+
+/* Rewrite a BRK in place as a write to the W channel of the predicate
+ * register.
+ *
+ * - Loop depth other than 1: ME_PRED_SET_CLR, executed with RC_PRED_SET.
+ * - Loop depth exactly 1: RCP of a zero-swizzled RC_FILE_NONE source,
+ *   executed with RC_PRED_INV. */
+static void lower_brk(
+       struct rc_instruction * inst,
+       struct vert_fc_state * fc_state)
+{
+       /* The destination is the same in both forms. */
+       build_pred_dst(&inst->U.I.DstReg, fc_state);
+
+       if (fc_state->LoopDepth != 1) {
+               inst->U.I.Opcode = RC_ME_PRED_SET_CLR;
+               inst->U.I.DstReg.Pred = RC_PRED_SET;
+               return;
+       }
+
+       inst->U.I.Opcode = RC_OPCODE_RCP;
+       inst->U.I.DstReg.Pred = RC_PRED_INV;
+       inst->U.I.SrcReg[0].File = RC_FILE_NONE;
+       inst->U.I.SrcReg[0].Index = 0;
+       inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000;
+}
+
+/* Insert a ME_PRED_SET_RESTORE right after the ENDLOOP at inst.
+ *
+ * The statement order matters: the destination is built while PredicateReg
+ * still names the register used inside the loop, then PredicateReg is switched
+ * back to the value saved in PredStack for this loop, and only then is the
+ * source built, so it names the restored (outer) register. */
+static void lower_endloop(
+       struct rc_instruction * inst,
+       struct vert_fc_state * fc_state)
+{
+       struct rc_instruction * restore =
+                       rc_insert_new_instruction(fc_state->C, inst);
+
+       restore->U.I.Opcode = RC_ME_PRED_SET_RESTORE;
+
+       /* Destination: the predicate register of the loop being closed. */
+       build_pred_dst(&restore->U.I.DstReg, fc_state);
+
+       /* From here on the enclosing scope's predicate register is current. */
+       fc_state->PredicateReg = fc_state->PredStack[fc_state->LoopDepth - 1];
+
+       /* Source: the predicate register that was just made current again. */
+       build_pred_src(&restore->U.I.SrcReg[0], fc_state);
+}
+
+static void lower_if(
+       struct rc_instruction * inst,
+       struct vert_fc_state * fc_state)
+{
+       /* Lower an IF in place: the instruction is turned into a predicate
+        * opcode whose destination is the W channel of the predicate register.
+        *
+        * Two shapes are produced:
+        *  - with no branch and no loop open, or at loop depth 1 while
+        *    InCFBreak is set: ME_PRED_SNEQ (or ME_PRED_SEQ executed with
+        *    RC_PRED_SET when InCFBreak is set), keeping the IF's own source
+        *    operand as src0;
+        *  - otherwise: VE_PRED_SNEQ_PUSH, with the predicate register as src0
+        *    and the IF's condition moved to src1.
+        *
+        * Reserve a temporary to use as our predicate stack counter, if we
+        * don't already have one. */
+       if (fc_state->PredicateReg == -1) {
+               /* If we are inside a loop, the Predicate Register should
+                * have already been defined. */
+               assert(fc_state->LoopDepth == 0);
+
+               if (reserve_predicate_reg(fc_state) == -1) {
+                       return;
+               }
+       }
+
+       if (inst->Next->U.I.Opcode == RC_OPCODE_BRK) {
+               fc_state->InCFBreak = 1; /* cleared again when rc_vert_fc() reaches an ENDIF */
+       }
+       if ((fc_state->BranchDepth == 0 && fc_state->LoopDepth == 0)
+                       || (fc_state->LoopDepth == 1 && fc_state->InCFBreak)) {
+               if (fc_state->InCFBreak) {
+                       inst->U.I.Opcode = RC_ME_PRED_SEQ;
+                       inst->U.I.DstReg.Pred = RC_PRED_SET;
+               } else {
+                       inst->U.I.Opcode = RC_ME_PRED_SNEQ;
+               }
+       } else {
+               unsigned swz;
+               inst->U.I.Opcode = RC_VE_PRED_SNEQ_PUSH;
+               memcpy(&inst->U.I.SrcReg[1], &inst->U.I.SrcReg[0],
+                                               sizeof(inst->U.I.SrcReg[1]));
+               swz = rc_get_scalar_src_swz(inst->U.I.SrcReg[1].Swizzle);
+               /* VE_PRED_SNEQ_PUSH needs the branch condition to be in the
+                * w component, so the swizzle value returned by
+                * rc_get_scalar_src_swz() is placed in the w slot of src1 and
+                * the other three slots are marked unused. */
+               inst->U.I.SrcReg[1].Swizzle = RC_MAKE_SWIZZLE(RC_SWIZZLE_UNUSED,
+                               RC_SWIZZLE_UNUSED, RC_SWIZZLE_UNUSED, swz);
+               build_pred_src(&inst->U.I.SrcReg[0], fc_state);
+       }
+       build_pred_dst(&inst->U.I.DstReg, fc_state); /* both shapes write predicate register .w */
+}
+
+/* Compiler pass: walk the program once and lower structured flow control
+ * (BGNLOOP / BRK / ENDLOOP / IF / ELSE / ENDIF) into predicate instructions.
+ * Every other instruction that sits inside a branch or a loop is marked to
+ * execute under RC_PRED_SET.
+ *
+ * c    - compiler whose program is rewritten in place.
+ * user - pass parameter; not used by this pass.
+ *
+ * The walk stops as soon as an error has been reported on the compiler. */
+void rc_vert_fc(struct radeon_compiler *c, void *user)
+{
+       struct rc_instruction * inst;
+       struct vert_fc_state fc_state;
+
+       memset(&fc_state, 0, sizeof(fc_state));
+       fc_state.PredicateReg = -1;
+       fc_state.C = c;
+
+       for(inst = c->Program.Instructions.Next;
+                                       inst != &c->Program.Instructions;
+                                       inst = inst->Next) {
+
+               switch (inst->U.I.Opcode) {
+
+               case RC_OPCODE_BGNLOOP:
+                       lower_bgnloop(inst, &fc_state);
+                       fc_state.LoopDepth++;
+                       break;
+
+               case RC_OPCODE_BRK:
+                       lower_brk(inst, &fc_state);
+                       break;
+
+               case RC_OPCODE_ENDLOOP:
+                       /* The outermost loop outside any branch has no outer
+                        * predicate to restore, so nothing is inserted for
+                        * it. */
+                       if (fc_state.BranchDepth != 0
+                                       || fc_state.LoopDepth != 1) {
+                               lower_endloop(inst, &fc_state);
+                               /* Skip the PRED_RESTORE that was just
+                                * inserted after this ENDLOOP. Doing this
+                                * only when it exists keeps the walk from
+                                * stepping over an ordinary instruction. */
+                               inst = inst->Next;
+                       }
+                       fc_state.LoopDepth--;
+                       break;
+
+               case RC_OPCODE_IF:
+                       lower_if(inst, &fc_state);
+                       fc_state.BranchDepth++;
+                       break;
+
+               case RC_OPCODE_ELSE:
+                       inst->U.I.Opcode = RC_ME_PRED_SET_INV;
+                       build_pred_dst(&inst->U.I.DstReg, &fc_state);
+                       build_pred_src(&inst->U.I.SrcReg[0], &fc_state);
+                       break;
+
+               case RC_OPCODE_ENDIF:
+                       if (fc_state.LoopDepth == 1 && fc_state.InCFBreak) {
+                               /* This ENDIF is dropped rather than lowered.
+                                * Step back first so that the loop's
+                                * inst->Next lands on the instruction that
+                                * followed it. */
+                               struct rc_instruction * to_delete = inst;
+                               inst = inst->Prev;
+                               rc_remove_instruction(to_delete);
+                       } else {
+                               inst->U.I.Opcode = RC_ME_PRED_SET_POP;
+                               build_pred_dst(&inst->U.I.DstReg, &fc_state);
+                               build_pred_src(&inst->U.I.SrcReg[0], &fc_state);
+                       }
+                       fc_state.InCFBreak = 0;
+                       fc_state.BranchDepth--;
+                       break;
+
+               default:
+                       if (fc_state.BranchDepth || fc_state.LoopDepth) {
+                               inst->U.I.DstReg.Pred = RC_PRED_SET;
+                       }
+                       break;
+               }
+
+               /* The lowering helpers return early after reporting an
+                * error; do not keep rewriting on top of that state. */
+               if (c->Error) {
+                       return;
+               }
+       }
+}