From b4827cb58bef287d341e6f7518d3354550d0e3b9 Mon Sep 17 00:00:00 2001 From: David Schleef Date: Mon, 28 Jun 2010 22:29:36 -0700 Subject: [PATCH] neon: Fix accumulator opcodes --- orc/orcprogram-neon.c | 35 +++++++++++++++++++-- orc/orcrules-neon.c | 86 ++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 97 insertions(+), 24 deletions(-) diff --git a/orc/orcprogram-neon.c b/orc/orcprogram-neon.c index a6a7fed..7dfb726 100644 --- a/orc/orcprogram-neon.c +++ b/orc/orcprogram-neon.c @@ -217,7 +217,7 @@ orc_compiler_neon_init (OrcCompiler *compiler) } void -orc_neon_load_constants (OrcCompiler *compiler) +orc_neon_load_constants_outer (OrcCompiler *compiler) { int i; for(i=0;ivars[i].alloc, 0); + break; + case ORC_VAR_TYPE_TEMP: + break; + default: + ORC_PROGRAM_ERROR(compiler,"bad vartype"); + break; + } + } +} + +void +orc_neon_load_constants_inner (OrcCompiler *compiler) +{ + int i; + for(i=0;ivars[i].name == NULL) continue; + + switch (compiler->vars[i].vartype) { + case ORC_VAR_TYPE_CONST: + break; + case ORC_VAR_TYPE_PARAM: + break; + case ORC_VAR_TYPE_SRC: + case ORC_VAR_TYPE_DEST: orc_arm_emit_load_reg (compiler, compiler->vars[i].ptr_register, compiler->exec_reg, ORC_STRUCT_OFFSET(OrcExecutor, arrays[i])); break; case ORC_VAR_TYPE_ACCUMULATOR: - orc_neon_emit_loadil (compiler, compiler->vars[i].alloc, 0); break; case ORC_VAR_TYPE_TEMP: break; @@ -540,6 +566,8 @@ orc_compiler_neon_assemble (OrcCompiler *compiler) orc_neon_emit_prologue (compiler); + orc_neon_load_constants_outer (compiler); + if (compiler->program->is_2d) { if (compiler->program->constant_m > 0) { orc_arm_emit_load_imm (compiler, ORC_ARM_A3, compiler->program->constant_m); @@ -601,7 +629,7 @@ orc_compiler_neon_assemble (OrcCompiler *compiler) orc_arm_emit_label (compiler, 7); } - orc_neon_load_constants (compiler); + orc_neon_load_constants_inner (compiler); if (compiler->loop_shift > 0) { int save_loop_shift = compiler->loop_shift; @@ -702,6 +730,7 @@ orc_neon_emit_loop 
(OrcCompiler *compiler) OrcStaticOpcode *opcode; OrcRule *rule; + orc_compiler_append_code(compiler,"# LOOP shift %d\n", compiler->loop_shift); for(j=0;j<compiler->n_insns;j++){ compiler->insn_index = j; insn = compiler->insns + j; diff --git a/orc/orcrules-neon.c b/orc/orcrules-neon.c index 0454413..c0e24d9 100644 --- a/orc/orcrules-neon.c +++ b/orc/orcrules-neon.c @@ -1157,10 +1157,27 @@ UNARY(convlf,"vcvt.f32.s32",0xf3bb0600, 1) static void orc_neon_rule_accw (OrcCompiler *p, void *user, OrcInstruction *insn) { - orc_neon_emit_binary (p, "vadd.i16", 0xf2100800, - p->vars[insn->dest_args[0]].alloc, - p->vars[insn->dest_args[0]].alloc, - p->vars[insn->src_args[0]].alloc); + unsigned int code; + + if (p->loop_shift < 2) { + ORC_ASM_CODE(p," vshl.i64 %s, %s, #%d\n", + orc_neon_reg_name (p->tmpreg), + orc_neon_reg_name (p->vars[insn->src_args[0]].alloc), 48); + code = NEON_BINARY(0xf2a00590, p->tmpreg, 0, + p->vars[insn->src_args[0]].alloc); + code |= (48) << 16; + orc_arm_emit (p, code); + + orc_neon_emit_binary (p, "vadd.i16", 0xf2100800, + p->vars[insn->dest_args[0]].alloc, + p->vars[insn->dest_args[0]].alloc, + p->tmpreg); + } else { + orc_neon_emit_binary (p, "vadd.i16", 0xf2100800, + p->vars[insn->dest_args[0]].alloc, + p->vars[insn->dest_args[0]].alloc, + p->vars[insn->src_args[0]].alloc); + } } static void @@ -1255,23 +1272,50 @@ static void orc_neon_rule_accsadubl (OrcCompiler *p, void *user, OrcInstruction *insn) { orc_uint32 x; - - x = 0xf3800700; - ORC_ASM_CODE(p," vabdl.u8 %s, %s, %s\n", - orc_neon_reg_name_quad (p->tmpreg), - orc_neon_reg_name (p->vars[insn->src_args[0]].alloc), - orc_neon_reg_name (p->vars[insn->src_args[1]].alloc)); - x |= (p->tmpreg&0xf)<<12; - x |= ((p->tmpreg>>4)&0x1)<<22; - x |= (p->vars[insn->src_args[0]].alloc&0xf)<<16; - x |= ((p->vars[insn->src_args[0]].alloc>>4)&0x1)<<7; - x |= (p->vars[insn->src_args[1]].alloc&0xf)<<0; - x |= ((p->vars[insn->src_args[1]].alloc>>4)&0x1)<<5; - orc_arm_emit (p, x); - - orc_neon_emit_unary (p, 
"vpadal.u16", 0xf3b40680, - p->vars[insn->dest_args[0]].alloc, - p->tmpreg); + unsigned int code; + + if (p->loop_shift < 2) { + x = 0xf3800700; + ORC_ASM_CODE(p," vabdl.u8 %s, %s, %s\n", + orc_neon_reg_name_quad (p->tmpreg), + orc_neon_reg_name (p->vars[insn->src_args[0]].alloc), + orc_neon_reg_name (p->vars[insn->src_args[1]].alloc)); + x |= (p->tmpreg&0xf)<<12; + x |= ((p->tmpreg>>4)&0x1)<<22; + x |= (p->vars[insn->src_args[0]].alloc&0xf)<<16; + x |= ((p->vars[insn->src_args[0]].alloc>>4)&0x1)<<7; + x |= (p->vars[insn->src_args[1]].alloc&0xf)<<0; + x |= ((p->vars[insn->src_args[1]].alloc>>4)&0x1)<<5; + orc_arm_emit (p, x); + + ORC_ASM_CODE(p," vshl.i64 %s, %s, #%d\n", + orc_neon_reg_name (p->tmpreg), + orc_neon_reg_name (p->tmpreg), 64 - (16<<p->loop_shift)); + code = NEON_BINARY(0xf2a00590, p->tmpreg, 0, p->tmpreg); + code |= (64 - (16<<p->loop_shift)) << 16; + orc_arm_emit (p, code); + + orc_neon_emit_unary (p, "vpadal.u16", 0xf3b40680, + p->vars[insn->dest_args[0]].alloc, + p->tmpreg); + } else { + x = 0xf3800700; + ORC_ASM_CODE(p," vabdl.u8 %s, %s, %s\n", + orc_neon_reg_name_quad (p->tmpreg), + orc_neon_reg_name (p->vars[insn->src_args[0]].alloc), + orc_neon_reg_name (p->vars[insn->src_args[1]].alloc)); + x |= (p->tmpreg&0xf)<<12; + x |= ((p->tmpreg>>4)&0x1)<<22; + x |= (p->vars[insn->src_args[0]].alloc&0xf)<<16; + x |= ((p->vars[insn->src_args[0]].alloc>>4)&0x1)<<7; + x |= (p->vars[insn->src_args[1]].alloc&0xf)<<0; + x |= ((p->vars[insn->src_args[1]].alloc>>4)&0x1)<<5; + orc_arm_emit (p, x); + + orc_neon_emit_unary (p, "vpadal.u16", 0xf3b40680, + p->vars[insn->dest_args[0]].alloc, + p->tmpreg); + } } static void -- 2.7.4