From 53d13e014fbe29a3789f7845443c858e94430097 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 25 May 2005 06:46:10 +0000 Subject: [PATCH] - Remove one of the loops in emit_arith - Handle REPL_ALPHA in emit_arith (possibly incorrect for some things) - Start on getting demos/arbfplight.c to look right. Won't be animated yet, need to re-work const emit so we can update consts without re-translating the entire program. Assertion in r300_state.c::setup_rs_unit needs to be disabled for it to work. --- src/mesa/drivers/dri/r300/r300_fragprog.c | 179 +++++++++++++++++++++--------- src/mesa/drivers/dri/r300/r300_fragprog.h | 6 +- src/mesa/drivers/dri/r300/r300_reg.h | 3 + 3 files changed, 133 insertions(+), 55 deletions(-) diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index dbc650d..fc10e36 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -38,6 +38,8 @@ * - Reuse input/temp regs, if they're no longer needed. * - Find out whether there's any benifit in ordering registers the way * fglrx does (see r300_reg.h). + * - Verify results of opcodes for accuracy, I've only checked them + * in specific cases. * - and more... */ @@ -78,11 +80,11 @@ const struct { { "MAX", 2, R300_FPI0_OUTC_MAX, R300_FPI2_OUTA_MAX }, { "CMP", 3, R300_FPI0_OUTC_CMP, R300_FPI2_OUTA_CMP }, { "FRC", 1, R300_FPI0_OUTC_FRC, R300_FPI2_OUTA_FRC }, -/* should the vector insns below be REPL_ALPHA? 
*/ - { "EX2", 1, PFS_INVAL, R300_FPI2_OUTA_EX2 }, - { "LG2", 1, PFS_INVAL, R300_FPI2_OUTA_LG2 }, - { "RCP", 1, PFS_INVAL, R300_FPI2_OUTA_RCP }, - { "RSQ", 1, PFS_INVAL, R300_FPI2_OUTA_RSQ }, + { "EX2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_EX2 }, + { "LG2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_LG2 }, + { "RCP", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RCP }, + { "RSQ", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RSQ }, + { "REPL_ALPHA", 1, R300_FPI0_OUTC_REPL_ALPHA, PFS_INVAL } }; #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \ @@ -545,12 +547,19 @@ static void emit_arith(struct r300_fragment_program *rp, int op, int flags) { pfs_reg_t src[3] = { src0, src1, src2 }; - int hwdest, hwsrc[3]; + int hwdest, hwsrc; int argc; int v_idx = rp->v_pos, s_idx = rp->s_pos; GLuint inst[4] = { 0, 0, 0, 0 }; + int vop, sop; int i; +#define ARG_NEG (1<<5) +#define ARG_ABS (1<<6) +#define ARG_STRIDE 7 +#define SRC_CONST (1<<5) +#define SRC_STRIDE 6 + if (!dest.valid || !src0.valid || !src1.valid || !src2.valid) { ERROR("invalid register. dest/src0/src1/src2 valid = %d/%d/%d/%d\n", dest.valid, src0.valid, src1.valid, src2.valid); @@ -563,34 +572,9 @@ static void emit_arith(struct r300_fragment_program *rp, int op, return; } argc = r300_fpop[op].argc; + vop = r300_fpop[op].v_op; + sop = r300_fpop[op].s_op; - /* grab hwregs of sources */ - for (i=0;i<argc;i++) { - switch (src[i].type) { - case REG_TYPE_INPUT: - hwsrc[i] = rp->inputs[src[i].index]; - rp->used_in_node |= (1 << hwsrc[i]); - break; - case REG_TYPE_TEMP: - /* make sure insn ordering is right... 
*/ - if ((src[i].vcross && v_idx < s_idx) || - (src[i].scross && s_idx < v_idx)) { - sync_streams(rp); - v_idx = s_idx = rp->v_pos; - } - - hwsrc[i] = rp->temps[src[i].index]; - rp->used_in_node |= (1 << hwsrc[i]); - break; - case REG_TYPE_CONST: - hwsrc[i] = src[i].index; - break; - default: - ERROR("invalid source reg\n"); - return; - } - } - /* grab hwregs of dest */ switch (dest.type) { case REG_TYPE_TEMP: @@ -606,42 +590,90 @@ static void emit_arith(struct r300_fragment_program *rp, int op, return; } + /* grab hwregs of sources */ for (i=0;i<3;i++) { - if (i < argc) { - inst[0] |= (v_swiz[src[i].v_swz].base + (i * v_swiz[src[i].v_swz].stride)) << (i * 7); - inst[2] |= (s_swiz[src[i].s_swz].base + (i * s_swiz[src[i].s_swz].stride)) << (i * 7); + if (i<argc) { + switch (src[i].type) { + case REG_TYPE_INPUT: + hwsrc = rp->inputs[src[i].index]; + rp->used_in_node |= (1 << hwsrc); + + inst[1] |= hwsrc << (i * SRC_STRIDE); + inst[3] |= hwsrc << (i * SRC_STRIDE); + break; + case REG_TYPE_TEMP: + /* make sure insn ordering is right... */ + if ((src[i].vcross && v_idx < s_idx) || + (src[i].scross && s_idx < v_idx)) { + sync_streams(rp); + v_idx = s_idx = rp->v_pos; + } + + hwsrc = rp->temps[src[i].index]; + rp->used_in_node |= (1 << hwsrc); + + inst[1] |= hwsrc << (i * SRC_STRIDE); + inst[3] |= hwsrc << (i * SRC_STRIDE); + break; + case REG_TYPE_CONST: + hwsrc = src[i].index; + + inst[1] |= ((hwsrc | SRC_CONST) << (i * SRC_STRIDE)); + inst[3] |= ((hwsrc | SRC_CONST) << (i * SRC_STRIDE)); + break; + default: + ERROR("invalid source reg\n"); + return; + } + + /* Swizzling/Negation */ + if (vop == R300_FPI0_OUTC_REPL_ALPHA) + inst[0] |= R300_FPI0_ARGC_ZERO << (i * ARG_STRIDE); + else + inst[0] |= (v_swiz[src[i].v_swz].base + (i * v_swiz[src[i].v_swz].stride)) << (i*ARG_STRIDE); + inst[2] |= (s_swiz[src[i].s_swz].base + (i * s_swiz[src[i].s_swz].stride)) << (i*ARG_STRIDE); + if (src[i].negate) { - inst[0] |= (1<<5) << (i*7); - inst[2] |= (1<<5) << (i*7); + inst[0] |= ARG_NEG << (i * ARG_STRIDE); + inst[2] |= ARG_NEG << (i * ARG_STRIDE); } - 
inst[1] |= hwsrc[i] << (i*6); - inst[3] |= hwsrc[i] << (i*6); - if (src[i].type == REG_TYPE_CONST) { - inst[1] |= (1<<5) << (i*6); - inst[3] |= (1<<5) << (i*6); + + if (flags & PFS_FLAG_ABS) { + inst[0] |= ARG_ABS << (i * ARG_STRIDE); + inst[2] |= ARG_ABS << (i * ARG_STRIDE); } } else { - /* read constant zero, may aswell use a ZERO swizzle aswell.. */ - inst[0] |= R300_FPI0_ARGC_ZERO << (i*7); - inst[2] |= R300_FPI2_ARGA_ZERO << (i*7); - inst[1] |= (1<<5) << (i*6); - inst[3] |= (1<<5) << (i*6); + /* read constant 0, use zero swizzle aswell */ + inst[0] |= R300_FPI0_ARGC_ZERO << (i*ARG_STRIDE); + inst[1] |= SRC_CONST << (i*SRC_STRIDE); + inst[2] |= R300_FPI2_ARGA_ZERO << (i*ARG_STRIDE); + inst[3] |= SRC_CONST << (i*SRC_STRIDE); } } + if (flags & PFS_FLAG_SAT) { + vop |= R300_FPI0_OUTC_SAT; + sop |= R300_FPI2_OUTA_SAT; + } + if (mask & WRITEMASK_XYZ) { - rp->alu.inst[v_idx].inst0 = inst[0] | r300_fpop[op].v_op | flags; + if (r300_fpop[op].v_op == R300_FPI0_OUTC_REPL_ALPHA) { + sync_streams(rp); + s_idx = v_idx = rp->v_pos; + } + rp->alu.inst[v_idx].inst0 = inst[0] | vop; rp->alu.inst[v_idx].inst1 = inst[1] | (hwdest << R300_FPI1_DSTC_SHIFT) | ((mask & WRITEMASK_XYZ) << (dest.type == REG_TYPE_OUTPUT ? 26 : 23)); rp->v_pos = v_idx + 1; } - if (mask & WRITEMASK_W) { - rp->alu.inst[s_idx].inst2 = inst[2] | r300_fpop[op].s_op | flags; + if ((mask & WRITEMASK_W) || r300_fpop[op].v_op == R300_FPI0_OUTC_REPL_ALPHA) { + rp->alu.inst[s_idx].inst2 = inst[2] | sop; rp->alu.inst[s_idx].inst3 = inst[3] | (hwdest << R300_FPI3_DSTA_SHIFT) | - (1 << (dest.type == REG_TYPE_OUTPUT ? 24 : 23)); + (((mask & WRITEMASK_W)?1:0) << (dest.type == REG_TYPE_OUTPUT ? 
24 : 23)); rp->s_pos = s_idx + 1; } @@ -663,7 +695,9 @@ static GLboolean parse_program(struct r300_fragment_program *rp) } for (fpi=mp->Instructions; fpi->Opcode != FP_OPCODE_END; fpi++) { - if (inst->Saturate) flags = R300_FPI0_OUTC_SAT; /* same for OUTA */ + if (fpi->Saturate) { + flags = PFS_FLAG_SAT; + } switch (fpi->Opcode) { case FP_OPCODE_ABS: @@ -681,6 +715,20 @@ static GLboolean parse_program(struct r300_fragment_program *rp) ERROR("unknown fpi->Opcode %d\n", fpi->Opcode); break; case FP_OPCODE_DP3: + dest = t_dst(rp, fpi->DstReg); + if (fpi->DstReg.WriteMask & WRITEMASK_W) { + /* I assume these need to share the same alu slot */ + sync_streams(rp); + emit_arith(rp, PFS_OP_DP4, dest, WRITEMASK_W, + pfs_zero, pfs_zero, pfs_zero, + flags); + } + emit_arith(rp, PFS_OP_DP3, t_dst(rp, fpi->DstReg), + fpi->DstReg.WriteMask & WRITEMASK_XYZ, + t_src(rp, fpi->SrcReg[0]), + t_src(rp, fpi->SrcReg[1]), + pfs_zero, flags); + break; case FP_OPCODE_DP4: case FP_OPCODE_DPH: case FP_OPCODE_DST: @@ -732,8 +780,31 @@ static GLboolean parse_program(struct r300_fragment_program *rp) flags); break; case FP_OPCODE_POW: + /* I don't like this, and it's probably wrong in some + * circumstances... 
Needs checking */ + src0 = t_src(rp, fpi->SrcReg[0]); + src1 = t_src(rp, fpi->SrcReg[1]); + dest = t_dst(rp, fpi->DstReg); + temp = get_temp_reg(rp); + temp.s_swz = SWIZZLE_X; /* cheat, bypass swizzle code */ + + emit_arith(rp, PFS_OP_LG2, temp, WRITEMASK_X, + src0, pfs_zero, pfs_zero, 0); + emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X, + temp, src1, pfs_zero, 0); + emit_arith(rp, PFS_OP_EX2, dest, fpi->DstReg.WriteMask, + temp, pfs_zero, pfs_zero, 0); + free_temp(rp, temp); + break; case FP_OPCODE_RCP: + ERROR("unknown fpi->Opcode %d\n", fpi->Opcode); + break; case FP_OPCODE_RSQ: + emit_arith(rp, PFS_OP_RSQ, t_dst(rp, fpi->DstReg), + fpi->DstReg.WriteMask, + t_src(rp, fpi->SrcReg[0]), pfs_zero, pfs_zero, + flags | PFS_FLAG_ABS); + break; case FP_OPCODE_SCS: case FP_OPCODE_SGE: case FP_OPCODE_SIN: @@ -873,7 +944,7 @@ void init_program(struct r300_fragment_program *rp) void translate_fragment_shader(struct r300_fragment_program *rp) { int i; - + init_program(rp); if (parse_program(rp) == GL_FALSE) { diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.h b/src/mesa/drivers/dri/r300/r300_fragprog.h index b98c6c0..26e4ae5 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.h +++ b/src/mesa/drivers/dri/r300/r300_fragprog.h @@ -39,8 +39,12 @@ typedef struct _pfs_reg_t { #define PFS_OP_LG2 8 #define PFS_OP_RCP 9 #define PFS_OP_RSQ 10 -#define MAX_PFS_OP 10 +#define PFS_OP_REPL_ALPHA 11 +#define MAX_PFS_OP 11 #define OP(n) PFS_OP_##n +#define PFS_FLAG_SAT (1 << 0) +#define PFS_FLAG_ABS (1 << 1) + #endif diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h index 3d090c3..0beef34 100644 --- a/src/mesa/drivers/dri/r300/r300_reg.h +++ b/src/mesa/drivers/dri/r300/r300_reg.h @@ -1000,12 +1000,15 @@ I am fairly certain that they are correct unless stated otherwise in comments. 
# define R300_FPI2_ARG0A_SHIFT 0 # define R300_FPI2_ARG0A_MASK (31 << 0) # define R300_FPI2_ARG0A_NEG (1 << 5) +# define R300_FPI2_ARG0A_ABS (1 << 6) /* GUESS */ # define R300_FPI2_ARG1A_SHIFT 7 # define R300_FPI2_ARG1A_MASK (31 << 7) # define R300_FPI2_ARG1A_NEG (1 << 12) +# define R300_FPI2_ARG1A_ABS (1 << 13) /* GUESS */ # define R300_FPI2_ARG2A_SHIFT 14 # define R300_FPI2_ARG2A_MASK (31 << 14) # define R300_FPI2_ARG2A_NEG (1 << 19) +# define R300_FPI2_ARG2A_ABS (1 << 20) /* GUESS */ # define R300_FPI2_SPECIAL_LRP (1 << 21) # define R300_FPI2_OUTA_MAD (0 << 23) # define R300_FPI2_OUTA_DP4 (1 << 23) -- 2.7.4