Fix xyz/w interaction (needs a cleanup still..)
author Ben Skeggs <darktama@iinet.net.au>
Sun, 5 Jun 2005 08:25:54 +0000 (08:25 +0000)
committer Ben Skeggs <darktama@iinet.net.au>
Sun, 5 Jun 2005 08:25:54 +0000 (08:25 +0000)
Use SRC0A instead of the WZY/XXX combination for W in XYZ positions.
Remove the dodgy hack from the POW opcode; it now works correctly without it.

src/mesa/drivers/dri/r300/r300_context.c
src/mesa/drivers/dri/r300/r300_fragprog.c

index f4ed715..f949e92 100644 (file)
@@ -74,9 +74,10 @@ static const char *const card_extensions[] = {
        "GL_ARB_texture_border_clamp",
        "GL_ARB_texture_compression",
 /* disable until we support it, fixes a few things in ut2004 */
-//     "GL_ARB_texture_cube_map", 
+/*     "GL_ARB_texture_cube_map", */
        "GL_ARB_texture_env_add",
        "GL_ARB_texture_env_combine",
+       "GL_ARB_texture_env_crossbar",
        "GL_ARB_texture_env_dot3",
        "GL_ARB_texture_mirrored_repeat",
        "GL_ARB_vertex_buffer_object",
@@ -104,7 +105,6 @@ static const char *const card_extensions[] = {
        "GL_NV_blend_square",
        "GL_NV_vertex_program",
        "GL_SGIS_generate_mipmap",
-       "GL_ARB_texture_env_crossbar",
        NULL
 };
 
index de6a36f..e6a68ab 100644 (file)
@@ -40,8 +40,6 @@
  *   fglrx does (see r300_reg.h).
  * - Verify results of opcodes for accuracy, I've only checked them
  *   in specific cases.
- * - Learn more about interaction between xyz/w units.. A few bugs are
- *   caused by something I'm missing..
  * - and more...
  */
 
@@ -112,11 +110,13 @@ static const struct r300_pfv_swizzle {
        { "xxx", MAKE_SWZ3(X, X, X), GL_TRUE, R300_FPI0_ARGC_SRC0C_XXX, 4, GL_FALSE },
        { "yyy", MAKE_SWZ3(Y, Y, Y), GL_TRUE, R300_FPI0_ARGC_SRC0C_YYY, 4, GL_FALSE },
        { "zzz", MAKE_SWZ3(Z, Z, Z), GL_TRUE, R300_FPI0_ARGC_SRC0C_ZZZ, 4, GL_FALSE },
+       { "www", MAKE_SWZ3(W, W, W), GL_TRUE, R300_FPI0_ARGC_SRC0A, 1, GL_TRUE },
        { "yzx", MAKE_SWZ3(Y, Z, X), GL_TRUE, R300_FPI0_ARGC_SRC0C_YZX, 1, GL_FALSE },
        { "zxy", MAKE_SWZ3(Z, X, Y), GL_TRUE, R300_FPI0_ARGC_SRC0C_ZXY, 1, GL_FALSE },
-       { "wzy", MAKE_SWZ3(W, Z, Y), GL_TRUE, R300_FPI0_ARGC_SRC0CA_WZY, 1, GL_TRUE },
+/* disable this for now, until I find a clean way of making sure xyz/w streams
+ * have a source in the same register slot.. */
+//     { "wzy", MAKE_SWZ3(W, Z, Y), GL_TRUE, R300_FPI0_ARGC_SRC0CA_WZY, 1, GL_TRUE },
 /* special cases */
-       { NULL, MAKE_SWZ3(W, W, W), GL_FALSE, 0, 0, GL_FALSE},
        { NULL, MAKE_SWZ3(ONE, ONE, ONE), GL_FALSE, R300_FPI0_ARGC_ONE, 0, GL_FALSE},
        { NULL, MAKE_SWZ3(ZERO, ZERO, ZERO), GL_FALSE, R300_FPI0_ARGC_ZERO, 0, GL_FALSE},
        { NULL, PFS_INVAL, GL_FALSE, R300_FPI0_ARGC_HALF, 0, GL_FALSE},
@@ -124,10 +124,10 @@ static const struct r300_pfv_swizzle {
 };
 #define SWIZZLE_XYZ            0
 #define SWIZZLE_XXX            1
-#define SWIZZLE_WZY            6
-#define SWIZZLE_111            8
-#define SWIZZLE_000            9
-#define SWIZZLE_HHH            10
+#define SWIZZLE_WWW            4
+#define SWIZZLE_111            7
+#define SWIZZLE_000            8
+#define SWIZZLE_HHH            9
 
 #define SWZ_X_MASK (7 << 0)
 #define SWZ_Y_MASK (7 << 3)
@@ -320,30 +320,6 @@ static int swz_special_case(struct r300_fragment_program *rp,
        pfs_reg_t ssrc = pfs_default_reg;
 
        switch(GET_SWZ(v_swiz[src.v_swz].hash, 0)) {
-       case SWIZZLE_W:
-               ssrc = get_temp_reg(rp);
-               src.v_swz = SWIZZLE_WZY;
-               if (s_mask[mask].count == 3) {
-                       emit_arith(rp, PFS_OP_MAD, ssrc, WRITEMASK_XW, src, pfs_one, pfs_zero, 0);
-                       *r = ssrc;
-                       r->v_swz = SWIZZLE_XXX;
-                       r->s_swz = SWIZZLE_W;
-               } else if (mc + s_mask[mask].count == 3) {
-                       if (!r->valid)
-                               *r = get_temp_reg(rp);
-                       emit_arith(rp, PFS_OP_MAD, ssrc, WRITEMASK_XW, src, pfs_one, pfs_zero, 0);
-                       ssrc.v_swz = SWIZZLE_XXX;
-                       emit_arith(rp, PFS_OP_MAD, *r, s_mask[mask].mask|WRITEMASK_W, ssrc, pfs_one, pfs_zero, 0);
-                       free_temp(rp, ssrc);
-               } else {
-                       if (!r->valid)
-                               *r = get_temp_reg(rp);
-                       emit_arith(rp, PFS_OP_MAD, ssrc, WRITEMASK_X, src, pfs_one, pfs_zero, 0);
-                       ssrc.v_swz = SWIZZLE_XXX;
-                       emit_arith(rp, PFS_OP_MAD, *r, s_mask[mask].mask, ssrc, pfs_one, pfs_zero, 0);
-                       free_temp(rp, ssrc);
-               }
-               break;
        case SWIZZLE_ONE:
        case SWIZZLE_ZERO:
                if (!r->valid)
@@ -472,16 +448,16 @@ static void sync_streams(struct r300_fragment_program *rp) {
        /* Bring vector/scalar streams into sync, inserting nops into
         * whatever stream is lagging behind
         *
-        * I'm using "MAD t0, t0, 1.0, 0.0" as a NOP
+        * Using NOP == MAD out.none, 0, 0, 0
         */
        while (rp->v_pos != rp->s_pos) {
                if (rp->s_pos > rp->v_pos) {
-                       rp->alu.inst[rp->v_pos].inst0 = 0x00050A80;
-                       rp->alu.inst[rp->v_pos].inst1 = 0x03820800;
+                       rp->alu.inst[rp->v_pos].inst0 = 0x00050A14;
+                       rp->alu.inst[rp->v_pos].inst1 = 0x00020820;
                        rp->v_pos++;
                } else {
-                       rp->alu.inst[rp->s_pos].inst2 = 0x00040889;
-                       rp->alu.inst[rp->s_pos].inst3 = 0x00820800;
+                       rp->alu.inst[rp->s_pos].inst2 = 0x00040810;
+                       rp->alu.inst[rp->s_pos].inst3 = 0x00020820;
                        rp->s_pos++;
                }
        }       
@@ -550,25 +526,68 @@ static void emit_tex(struct r300_fragment_program *rp,
        rp->node[rp->cur_node].tex_end++;
 }
 
+#define ARG_NEG        (1<<5)
+#define ARG_ABS (1<<6)
+#define SRC_CONST (1<<5)
+#define SRC_STRIDE 6
+
+static int t_hw_src(struct r300_fragment_program *rp, pfs_reg_t src)
+{
+       int idx;
+
+       switch (src.type) {
+       case REG_TYPE_TEMP:
+               idx = rp->temps[src.index];
+               break;
+       case REG_TYPE_INPUT:
+               idx = rp->inputs[src.index];
+               break;
+       case REG_TYPE_CONST:
+               return (src.index | SRC_CONST);
+       default:
+               ERROR("Invalid type for source reg\n");
+               return (0 | SRC_CONST);
+       }
+
+       rp->used_in_node |= (1 << idx);
+       return idx;
+}
+
+/* Add sources to FPI1/FPI3 lists.  If source is already on list,
+ * reuse the index instead of wasting a source.
+ */
+static inline int add_src(int src[3], int *cnt, int reg) {
+       int i;
+
+       for (i=0;i<*cnt;i++)
+               if (src[i] == reg) return i;
+       
+       if (*cnt == 3) assert(0); /* I don't *think* this can happen */
+
+       src[*cnt] = reg;
+       return (*cnt)++;
+}
+
 static void emit_arith(struct r300_fragment_program *rp, int op,
                                pfs_reg_t dest, int mask,
                                pfs_reg_t src0, pfs_reg_t src1, pfs_reg_t src2,
                                int flags)
 {
        pfs_reg_t src[3] = { src0, src1, src2 };
+       /* XYZ/W emit control */
+       int v_idx = rp->v_pos, s_idx = rp->s_pos;
+       GLboolean emit_v = GL_FALSE, emit_s = GL_FALSE;
+       /* INST1/INST3 sources */
+       int vsrc[3], ssrc[3];
+       int nvs = 0, nss = 0;
+       /* INST0/INST2 sources */
+       int vswz[3], sswz[3];
+       /* temp stuff */
        int hwdest, hwsrc;
        int argc;
-       int v_idx = rp->v_pos, s_idx = rp->s_pos;
-       GLuint inst[4] = { 0, 0, 0, 0 }; 
        int vop, sop;
        int i;
-
-#define ARG_NEG        (1<<5)
-#define ARG_ABS (1<<6)
-#define ARG_STRIDE 7
-#define SRC_CONST (1<<5)
-#define SRC_STRIDE 6
-
+       
        if (!dest.valid || !src0.valid || !src1.valid || !src2.valid) {
                ERROR("invalid register.  dest/src0/src1/src2 valid = %d/%d/%d/%d\n",
                                                dest.valid, src0.valid, src1.valid, src2.valid);
@@ -598,96 +617,91 @@ static void emit_arith(struct r300_fragment_program *rp, int op,
                ERROR("invalid dest reg type %d\n", dest.type);
                return;
        }
-
-       /* grab hwregs of sources */
+       
+       int str;
        for (i=0;i<3;i++) {
                if (i<argc) {
-                       /* Decide on hardware source index */
-                       switch (src[i].type) {
-                       case REG_TYPE_INPUT:
-                               hwsrc = rp->inputs[src[i].index];
-                               rp->used_in_node |= (1 << hwsrc);
-
-                               inst[1] |= hwsrc << (i * SRC_STRIDE);
-                               inst[3] |= hwsrc << (i * SRC_STRIDE);
-                               break;
-                       case REG_TYPE_TEMP:
-                               /* make sure insn ordering is right... */
-                               if ((v_swiz[src[i].v_swz].dep_sca && v_idx < s_idx) ||
-                                       (s_swiz[src[i].s_swz].dep_vec && s_idx < v_idx)) {
+                       hwsrc = t_hw_src(rp, src[i]);
+                       if (mask & WRITEMASK_XYZ && vop != R300_FPI0_OUTC_REPL_ALPHA) {
+                               if (v_swiz[src[i].v_swz].dep_sca) {
                                        sync_streams(rp);
                                        v_idx = s_idx = rp->v_pos;
-                               }
+                                       emit_s = GL_TRUE;
+                                       str = add_src(ssrc, &nss, hwsrc);
+                               } else
+                                       str = add_src(vsrc, &nvs, hwsrc);
+                               vswz[i] = v_swiz[src[i].v_swz].base + (str * v_swiz[src[i].v_swz].stride);
+                       } else
+                               vswz[i] = R300_FPI0_ARGC_ZERO;
+
+                       if (mask & WRITEMASK_W || vop == R300_FPI0_OUTC_REPL_ALPHA) {
+                               if (s_swiz[src[i].s_swz].dep_vec) {
+                                       sync_streams(rp);
+                                       v_idx = s_idx = rp->v_pos;
+                                       emit_v = GL_TRUE;
+                                       str = add_src(vsrc, &nvs, hwsrc);
+                               } else
+                                       str = add_src(ssrc, &nss, hwsrc);
+                               sswz[i] = s_swiz[src[i].s_swz].base + (str * s_swiz[src[i].s_swz].stride);
+                       } else
+                               sswz[i] = R300_FPI2_ARGA_ZERO;
                
-                               hwsrc = rp->temps[src[i].index];
-                               rp->used_in_node |= (1 << hwsrc);
-
-                               inst[1] |= hwsrc << (i * SRC_STRIDE);
-                               inst[3] |= hwsrc << (i * SRC_STRIDE);
-                               break;
-                       case REG_TYPE_CONST:
-                               hwsrc = src[i].index;
-
-                               inst[1] |= ((hwsrc | SRC_CONST) << (i * SRC_STRIDE));
-                               inst[3] |= ((hwsrc | SRC_CONST) << (i * SRC_STRIDE));
-                               break;
-                       default:
-                               ERROR("invalid source reg\n");
-                               return;
-                       }
-
-                       /* Swizzling/Negation */
-                       if (vop == R300_FPI0_OUTC_REPL_ALPHA)
-                               inst[0] |= R300_FPI0_ARGC_ZERO << (i * ARG_STRIDE);
-                       else
-                               inst[0] |= (v_swiz[src[i].v_swz].base + (i * v_swiz[src[i].v_swz].stride)) << (i*ARG_STRIDE);
-                       inst[2] |= (s_swiz[src[i].s_swz].base + (i * s_swiz[src[i].s_swz].stride)) << (i*ARG_STRIDE);
-
                        if (src[i].negate) {
-                               inst[0] |= ARG_NEG << (i * ARG_STRIDE);
-                               inst[2] |= ARG_NEG << (i * ARG_STRIDE);
+                               vswz[i] |= ARG_NEG;
+                               sswz[i] |= ARG_NEG;
                        }
-                       
+
                        if (flags & PFS_FLAG_ABS) {
-                               inst[0] |= ARG_ABS << (i * ARG_STRIDE);
-                               inst[2] |= ARG_ABS << (i * ARG_STRIDE); 
+                               vswz[i] |= ARG_ABS;
+                               sswz[i] |= ARG_ABS;
                        }
                } else {
-                       /* read constant 0, use zero swizzle aswell */
-                       inst[0] |= R300_FPI0_ARGC_ZERO << (i*ARG_STRIDE);
-                       inst[1] |= SRC_CONST << (i*SRC_STRIDE);
-                       inst[2] |= R300_FPI2_ARGA_ZERO << (i*ARG_STRIDE);
-                       inst[3] |= SRC_CONST << (i*SRC_STRIDE);
+                       vswz[i] = R300_FPI0_ARGC_ZERO;
+                       sswz[i] = R300_FPI2_ARGA_ZERO;
                }
        }
+       /* Unused sources, read constant reg 0 */
+       for (i=nvs;i<3;i++)
+               vsrc[i] = 0 | SRC_CONST;
+       for (i=nss;i<3;i++)
+               ssrc[i] = 0 | SRC_CONST;
 
        if (flags & PFS_FLAG_SAT) {
                vop |= R300_FPI0_OUTC_SAT;
                sop |= R300_FPI2_OUTA_SAT;
        }
-               
-       if (mask & WRITEMASK_XYZ) {
+
+       if (mask & WRITEMASK_XYZ || emit_v) {
                if (r300_fpop[op].v_op == R300_FPI0_OUTC_REPL_ALPHA) {
                        sync_streams(rp);
                        s_idx = v_idx = rp->v_pos;
                }
-               rp->alu.inst[v_idx].inst0 = inst[0] | vop;
-               rp->alu.inst[v_idx].inst1 = inst[1] |
-                               (hwdest << R300_FPI1_DSTC_SHIFT) |
+               rp->alu.inst[v_idx].inst0 = vop |
+                               vswz[0] << R300_FPI0_ARG0C_SHIFT |
+                               vswz[1] << R300_FPI0_ARG1C_SHIFT |
+                               vswz[2] << R300_FPI0_ARG2C_SHIFT;
+               rp->alu.inst[v_idx].inst1 = hwdest << R300_FPI1_DSTC_SHIFT |
+                               vsrc[0] << R300_FPI1_SRC0C_SHIFT |
+                               vsrc[1] << R300_FPI1_SRC1C_SHIFT |
+                               vsrc[2] << R300_FPI1_SRC2C_SHIFT |
                                ((mask & WRITEMASK_XYZ) << (dest.type == REG_TYPE_OUTPUT ? 26 : 23));
                rp->v_pos = v_idx + 1;
        }
-       
-       if ((mask & WRITEMASK_W) || r300_fpop[op].v_op == R300_FPI0_OUTC_REPL_ALPHA) {
-               rp->alu.inst[s_idx].inst2 = inst[2] | sop;
-               rp->alu.inst[s_idx].inst3 = inst[3] |
-                               (hwdest << R300_FPI3_DSTA_SHIFT) |
+
+       if (mask & WRITEMASK_W || emit_s || vop == R300_FPI0_OUTC_REPL_ALPHA) {
+               rp->alu.inst[s_idx].inst2 = sop |
+                               sswz[0] << R300_FPI2_ARG0A_SHIFT |
+                               sswz[1] << R300_FPI2_ARG1A_SHIFT |
+                               sswz[2] << R300_FPI2_ARG2A_SHIFT;
+               rp->alu.inst[s_idx].inst3 = hwdest << R300_FPI3_DSTA_SHIFT |
+                               ssrc[0] << R300_FPI3_SRC0A_SHIFT |
+                               ssrc[1] << R300_FPI3_SRC1A_SHIFT |
+                               ssrc[2] << R300_FPI3_SRC2A_SHIFT |
                                (((mask & WRITEMASK_W)?1:0) << (dest.type == REG_TYPE_OUTPUT ? 24 : 23));
                rp->s_pos = s_idx + 1;
        }
 
-/* Force this for now */
-       sync_streams(rp);
+/*     sync_streams(rp); */
        return;
 };
        
@@ -791,17 +805,14 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
                                                        flags);
                        break;
                case FP_OPCODE_POW:
-                       /* I don't like this, and it's probably wrong in some
-                        * circumstances... Needs checking */
                        src0 = t_src(rp, fpi->SrcReg[0]);
                        src1 = t_src(rp, fpi->SrcReg[1]);
                        dest = t_dst(rp, fpi->DstReg);
                        temp = get_temp_reg(rp);
-                       temp.s_swz = SWIZZLE_X; /* cheat, bypass swizzle code */
 
-                       emit_arith(rp, PFS_OP_LG2, temp, WRITEMASK_X,
+                       emit_arith(rp, PFS_OP_LG2, temp, WRITEMASK_W,
                                                        src0, pfs_zero, pfs_zero, 0);
-                       emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X,
+                       emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W,
                                                        temp, src1, pfs_zero, 0);
                        emit_arith(rp, PFS_OP_EX2, dest, fpi->DstReg.WriteMask,
                                                        temp, pfs_zero, pfs_zero, 0);
@@ -969,12 +980,12 @@ void translate_fragment_shader(struct r300_fragment_program *rp)
 
        if (!rp->translated) {
                init_program(rp);
-       
+
                if (parse_program(rp) == GL_FALSE) {
                        dump_program(rp);
                        return;
                }
-
+               
                /* Finish off */
                sync_streams(rp);
                rp->node[rp->cur_node].alu_end  = rp->v_pos - 1;