From 8e366dc365d01213b71b87ace47d30938db74845 Mon Sep 17 00:00:00 2001 From: Vadim Girlin Date: Sun, 13 Nov 2011 22:08:33 +0400 Subject: [PATCH] r600g: lazy load for AR register Emit MOVA* instruction only when AR is used. Signed-off-by: Vadim Girlin --- src/gallium/drivers/r600/r600_asm.c | 35 +++++++++++++++++ src/gallium/drivers/r600/r600_asm.h | 2 + src/gallium/drivers/r600/r600_shader.c | 70 ++++++++++++---------------------- 3 files changed, 61 insertions(+), 46 deletions(-) diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c index 75dfdb4..0a1717c 100644 --- a/src/gallium/drivers/r600/r600_asm.c +++ b/src/gallium/drivers/r600/r600_asm.c @@ -255,6 +255,7 @@ static int r600_bytecode_add_cf(struct r600_bytecode *bc) bc->ncf++; bc->ndw += 2; bc->force_add_cf = 0; + bc->ar_loaded = 0; return 0; } @@ -1203,6 +1204,32 @@ static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, struct r60 return 0; } +/* load AR register from gpr (bc->ar_reg) with MOVA_INT */ +static int load_ar(struct r600_bytecode *bc) +{ + struct r600_bytecode_alu alu; + int r; + + if (bc->ar_loaded) + return 0; + + /* hack to avoid making MOVA the last instruction in the clause */ + if ((bc->cf_last->ndw>>1) >= 110) + bc->force_add_cf = 1; + + memset(&alu, 0, sizeof(alu)); + alu.inst = BC_INST(bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT); + alu.src[0].sel = bc->ar_reg; + alu.last = 1; + r = r600_bytecode_add_alu(bc, &alu); + if (r) + return r; + + bc->cf_last->r6xx_uses_waterfall = 1; + bc->ar_loaded = 1; + return 0; +} + int r600_bytecode_add_alu_type(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu, int type) { struct r600_bytecode_alu *nalu = r600_bytecode_alu(); @@ -1237,6 +1264,14 @@ int r600_bytecode_add_alu_type(struct r600_bytecode *bc, const struct r600_bytec } bc->cf_last->inst = (type << 3); + /* Check AR usage and load it if required */ + for (i = 0; i < 3; i++) + if (nalu->src[i].rel && !bc->ar_loaded) + load_ar(bc); + + if (nalu->dst.rel && !bc->ar_loaded) + load_ar(bc); + /* Setup the kcache for this ALU instruction. This will start a new * ALU clause if needed. */ if ((r = r600_bytecode_alloc_kcache_lines(bc, nalu, type))) { diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h index 61caa4b..f4a6cfd 100644 --- a/src/gallium/drivers/r600/r600_asm.h +++ b/src/gallium/drivers/r600/r600_asm.h @@ -184,6 +184,8 @@ struct r600_bytecode { struct r600_cf_stack_entry fc_stack[32]; unsigned call_sp; struct r600_cf_callstack callstack[SQ_MAX_CALL_DEPTH]; + unsigned ar_loaded; + unsigned ar_reg; }; /* eg_asm.c */ diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 5ce464d..48a2f7b 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -166,7 +166,6 @@ struct r600_shader_ctx { unsigned type; unsigned file_offset[TGSI_FILE_COUNT]; unsigned temp_reg; - unsigned ar_reg; struct r600_shader_tgsi_instruction *inst_info; struct r600_bytecode *bc; struct r600_shader *shader; @@ -553,7 +552,7 @@ static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx, unsigned int offset memset(&alu, 0, sizeof(alu)); alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT); - alu.src[0].sel = ctx->ar_reg; + alu.src[0].sel = ctx->bc->ar_reg; alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; alu.src[1].value = offset; @@ -567,7 +566,7 @@ static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx, unsigned int offset ar_reg = dst_reg; } else { - ar_reg = ctx->ar_reg; + ar_reg = ctx->bc->ar_reg; } memset(&vtx, 0, sizeof(vtx)); @@ -750,9 +749,9 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi ctx.file_offset[TGSI_FILE_CONSTANT] = 512; ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL; - ctx.ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] + + ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] + ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1; - ctx.temp_reg = ctx.ar_reg + 1; + ctx.temp_reg = ctx.bc->ar_reg + 1; ctx.nliterals = 0; ctx.literals = NULL; @@ -2942,45 +2941,26 @@ static int tgsi_eg_arl(struct r600_shader_ctx *ctx) alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT; break; case TGSI_OPCODE_UARL: + alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV; break; default: assert(0); return -1; } - if (alu.inst) { - r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); - alu.last = 1; - alu.dst.sel = ctx->ar_reg; - alu.dst.write = 1; - r = r600_bytecode_add_alu(ctx->bc, &alu); - if (r) - return r; - } - - /* TODO: Note that the MOVA can be avoided if we never use AR for - * indexing non-CB registers in the current ALU clause. Similarly, we - * need to load AR from ar_reg again if we started a new clause - * between ARL and AR usage. The easy way to do that is to remove - * the MOVA here, and load it for the first AR access after ar_reg - * has been modified in each clause. */ - memset(&alu, 0, sizeof(struct r600_bytecode_alu)); - alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT; - if (inst->Instruction.Opcode == TGSI_OPCODE_UARL) - r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); - else { - alu.src[0].sel = ctx->ar_reg; - alu.src[0].chan = 0; - } + r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); alu.last = 1; + alu.dst.sel = ctx->bc->ar_reg; + alu.dst.write = 1; r = r600_bytecode_add_alu(ctx->bc, &alu); if (r) return r; + + ctx->bc->ar_loaded = 0; return 0; } static int tgsi_r600_arl(struct r600_shader_ctx *ctx) { - /* TODO from r600c, ar values don't persist between clauses */ struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; struct r600_bytecode_alu alu; int r; @@ -2990,7 +2970,7 @@ static int tgsi_r600_arl(struct r600_shader_ctx *ctx) memset(&alu, 0, sizeof(alu)); alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR; r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); - alu.dst.sel = ctx->ar_reg; + alu.dst.sel = ctx->bc->ar_reg; alu.dst.write = 1; alu.last = 1; @@ -2999,8 +2979,8 @@ static int tgsi_r600_arl(struct r600_shader_ctx *ctx) memset(&alu, 0, sizeof(alu)); alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT; - alu.src[0].sel = ctx->ar_reg; - alu.dst.sel = ctx->ar_reg; + alu.src[0].sel = ctx->bc->ar_reg; + alu.dst.sel = ctx->bc->ar_reg; alu.dst.write = 1; alu.last = 1; @@ -3011,7 +2991,7 @@ static int tgsi_r600_arl(struct r600_shader_ctx *ctx) memset(&alu, 0, sizeof(alu)); alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT; r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); - alu.dst.sel = ctx->ar_reg; + alu.dst.sel = ctx->bc->ar_reg; alu.dst.write = 1; alu.last = 1; @@ -3019,24 +2999,22 @@ static int tgsi_r600_arl(struct r600_shader_ctx *ctx) return r; break; case TGSI_OPCODE_UARL: + memset(&alu, 0, sizeof(alu)); + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV; + r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); + alu.dst.sel = ctx->bc->ar_reg; + alu.dst.write = 1; + alu.last = 1; + + if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) + return r; break; default: assert(0); return -1; } - memset(&alu, 0, sizeof(alu)); - alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT; - if (inst->Instruction.Opcode == TGSI_OPCODE_UARL) - r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); - else - alu.src[0].sel = ctx->ar_reg; - alu.last = 1; - - r = r600_bytecode_add_alu(ctx->bc, &alu); - if (r) - return r; - ctx->bc->cf_last->r6xx_uses_waterfall = 1; + ctx->bc->ar_loaded = 0; return 0; } -- 2.7.4