From ecaa85842f728de83a98414a53dd3116955ae1c3 Mon Sep 17 00:00:00 2001 From: David Schleef Date: Sat, 16 May 2009 19:10:09 -0700 Subject: [PATCH] neon: add accumulator and merge opcodes --- orc-test/orctest.c | 4 +- orc/arm.c | 61 +++++++++++---- orc/arm.h | 3 + orc/orcprogram-neon.c | 124 ++++++++++++++++++++++++++++--- orc/orcrules-neon.c | 169 +++++++++++++++++++++++++++++++++++++++--- testsuite/test_compile_neon.c | 23 ++++-- 6 files changed, 343 insertions(+), 41 deletions(-) diff --git a/orc-test/orctest.c b/orc-test/orctest.c index e8b65a4..db0ef4c 100644 --- a/orc-test/orctest.c +++ b/orc-test/orctest.c @@ -74,6 +74,7 @@ orc_test_gcc_compile (OrcProgram *p) ret = system (cmd); if (ret != 0) { ORC_ERROR ("gcc failed"); + printf("%s\n", orc_program_get_asm_code (p)); return ORC_TEST_FAILED; } @@ -151,9 +152,10 @@ orc_test_gcc_compile_neon (OrcProgram *p) ret = fwrite(p->code, p->code_size, 1, file); fclose (file); - ret = system (PREFIX "gcc -mcpu=cortex-a8 -mfpu=neon -Wall -c tmp.s"); + ret = system (PREFIX "gcc -march=armv6t2 -mcpu=cortex-a8 -mfpu=neon -Wall -c tmp.s"); if (ret != 0) { printf("gcc failed\n"); + printf("%s\n", orc_program_get_asm_code (p)); return FALSE; } diff --git a/orc/arm.c b/orc/arm.c index c81a3d8..399da22 100644 --- a/orc/arm.c +++ b/orc/arm.c @@ -166,28 +166,29 @@ arm_emit_load_imm (OrcCompiler *compiler, int dest, int imm) { uint32_t code; int shift2; + unsigned int x; - shift2 = 0; -#if 0 - while (imm && ((imm&3)==0)) { - imm >>= 2; - shift2++; - } -#endif - while (imm && imm > 0xffff) { - if ((imm&3) != 0) { - ORC_ERROR("bad immediate value"); + if ((imm & 0xff) == imm) { + shift2 = 0; + x = imm; + } else { + shift2 = 0; + x = imm & 0xffffffff; + while ((x & 3) == 0) { + x >>= 2; + shift2++; + } + if (x > 0xff) { + ORC_PROGRAM_ERROR(compiler, "bad immediate value"); } - imm >>= 2; - shift2++; } code = 0xe3a00000; code |= (dest&0xf) << 12; code |= (((16-shift2)&0xf) << 8); - code |= (imm&0xff); + code |= (x&0xff); - ORC_ASM_CODE(compiler," mov %s, #0x%08x\n", arm_reg_name (dest), imm << (shift2*2)); + ORC_ASM_CODE(compiler," mov %s, #0x%08x\n", arm_reg_name (dest), imm); arm_emit (compiler, code); } @@ -226,8 +227,9 @@ arm_emit_sub (OrcCompiler *compiler, int dest, int src1, int src2) } void -arm_emit_add_imm (OrcCompiler *compiler, int dest, int src1, int value) +arm_emit_add_imm (OrcCompiler *compiler, int dest, int src1, int imm) { +#if 0 uint32_t code; code = 0xe2800000; @@ -240,6 +242,35 @@ arm_emit_add_imm (OrcCompiler *compiler, int dest, int src1, int value) arm_reg_name (src1), value); arm_emit (compiler, code); +#endif + uint32_t code; + int shift2; + unsigned int x; + + if ((imm & 0xff) == imm) { + shift2 = 0; + x = imm; + } else { + shift2 = 0; + x = imm & 0xffffffff; + while ((x & 3) == 0) { + x >>= 2; + shift2++; + } + if (x > 0xff) { + ORC_PROGRAM_ERROR(compiler, "bad immediate value"); + } + } + + code = 0xe2800000; + code |= (src1&0xf) << 16; + code |= (dest&0xf) << 12; + code |= (((16-shift2)&0xf) << 8); + code |= (x&0xff); + + ORC_ASM_CODE(compiler," add %s, %s, #0x%08x\n", arm_reg_name (dest), + arm_reg_name(src1), imm); + arm_emit (compiler, code); } void diff --git a/orc/arm.h b/orc/arm.h index f804297..662ae04 100644 --- a/orc/arm.h +++ b/orc/arm.h @@ -93,6 +93,9 @@ void arm_emit_store_reg (OrcCompiler *compiler, int src, int dest, int offset); void arm_do_fixups (OrcCompiler *compiler); +const char *neon_reg_name (int reg); +const char *neon_reg_name_quad (int reg); +void neon_emit_mov (OrcCompiler *compiler, int src, int dest); #endif diff --git a/orc/orcprogram-neon.c b/orc/orcprogram-neon.c index c78eeda..8b7f178 100644 --- a/orc/orcprogram-neon.c +++ b/orc/orcprogram-neon.c @@ -29,6 +29,7 @@ void orc_compiler_neon_assemble (OrcCompiler *compiler); void orc_compiler_rewrite_vars (OrcCompiler *compiler); void orc_compiler_dump (OrcCompiler *compiler); +void neon_save_accumulators (OrcCompiler *compiler); void @@ -134,6 +135,8 @@ orc_compiler_neon_init (OrcCompiler *compiler) compiler->used_regs[i] = 0; } + compiler->exec_reg = ARM_R0; + compiler->gp_tmpreg = ARM_A2; compiler->tmpreg = ORC_VEC_REG_BASE + 0; compiler->valid_regs[compiler->tmpreg] = 0; @@ -193,7 +196,13 @@ neon_load_constants (OrcCompiler *compiler) compiler->vars[i].ptr_register, neon_exec_ptr, ORC_STRUCT_OFFSET(OrcExecutor, arrays[i])); break; + case ORC_VAR_TYPE_ACCUMULATOR: + neon_emit_loadil (compiler, compiler->vars[i].alloc, 0); + break; + case ORC_VAR_TYPE_TEMP: + break; default: + ORC_PROGRAM_ERROR(compiler,"bad vartype"); break; } } @@ -277,16 +286,27 @@ get_shift (int size) return -1; } +static int +get_align_var (OrcCompiler *compiler) +{ + if (compiler->vars[ORC_VAR_D1].size) return ORC_VAR_D1; + if (compiler->vars[ORC_VAR_S1].size) return ORC_VAR_S1; + + ORC_PROGRAM_ERROR(compiler, "could not find alignment variable"); + + return -1; +} + void orc_compiler_neon_assemble (OrcCompiler *compiler) { - int dest_var; - int dest_shift; + int align_var; + int align_shift; - dest_var = orc_compiler_get_dest (compiler); - dest_shift = get_shift (compiler->vars[dest_var].size); + align_var = get_align_var (compiler); + align_shift = get_shift (compiler->vars[align_var].size); - compiler->vars[dest_var].is_aligned = FALSE; + compiler->vars[align_var].is_aligned = FALSE; neon_emit_prologue (compiler); @@ -298,11 +318,11 @@ orc_compiler_neon_assemble (OrcCompiler *compiler) arm_emit_load_reg (compiler, ARM_A3, neon_exec_ptr, (int)ORC_STRUCT_OFFSET(OrcExecutor,n)); arm_emit_load_reg (compiler, ARM_A2, neon_exec_ptr, - (int)ORC_STRUCT_OFFSET(OrcExecutor,arrays[dest_var])); + (int)ORC_STRUCT_OFFSET(OrcExecutor,arrays[align_var])); arm_emit_sub (compiler, ARM_IP, ARM_IP, ARM_A2); arm_emit_and_imm (compiler, ARM_IP, ARM_IP, (1< 0) { - arm_emit_asr_imm (compiler, ARM_IP, ARM_IP, dest_shift); + if (align_shift > 0) { + arm_emit_asr_imm (compiler, ARM_IP, ARM_IP, align_shift); } arm_emit_cmp (compiler, ARM_A3, ARM_IP); @@ -355,7 +375,7 @@ orc_compiler_neon_assemble (OrcCompiler *compiler) arm_emit_label (compiler, 1); compiler->loop_shift = save_loop_shift; - compiler->vars[dest_var].is_aligned = TRUE; + compiler->vars[align_var].is_aligned = TRUE; } if (compiler->loop_shift > 0) { @@ -380,7 +400,7 @@ orc_compiler_neon_assemble (OrcCompiler *compiler) int save_loop_shift = compiler->loop_shift; compiler->loop_shift = 0; - compiler->vars[dest_var].is_aligned = FALSE; + compiler->vars[align_var].is_aligned = FALSE; arm_emit_load_reg (compiler, ARM_IP, neon_exec_ptr, (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3)); @@ -398,6 +418,8 @@ orc_compiler_neon_assemble (OrcCompiler *compiler) compiler->loop_shift = save_loop_shift; } + neon_save_accumulators (compiler); + neon_emit_epilogue (compiler); arm_do_fixups (compiler); @@ -451,11 +473,13 @@ neon_emit_loop (OrcCompiler *compiler) rule = insn->rule; if (rule && rule->emit) { +#if 0 if (compiler->vars[insn->dest_args[0]].alloc != compiler->vars[insn->src_args[0]].alloc) { - arm_emit_mov (compiler, compiler->vars[insn->src_args[0]].alloc, + neon_emit_mov (compiler, compiler->vars[insn->src_args[0]].alloc, compiler->vars[insn->dest_args[0]].alloc); } +#endif rule->emit (compiler, rule->emit_user, insn); } else { orc_compiler_append_code(compiler,"No rule for: %s\n", opcode->name); @@ -496,3 +520,81 @@ neon_emit_loop (OrcCompiler *compiler) #endif } +void +neon_save_accumulators (OrcCompiler *compiler) +{ + int i; + int src; + unsigned int code; + + for(i=0;ivars + i; + + if (compiler->vars[i].name == NULL) continue; + switch (compiler->vars[i].vartype) { + case ORC_VAR_TYPE_ACCUMULATOR: + src = compiler->vars[i].alloc; + + arm_emit_load_imm (compiler, compiler->gp_tmpreg, + ORC_STRUCT_OFFSET(OrcExecutor, accumulators[i-ORC_VAR_A1])); + switch (var->size) { + case 2: + ORC_ASM_CODE(compiler," vpaddl.u16 %s, %s\n", + neon_reg_name (src), + neon_reg_name (src)); + code = 0xf3b40080; + code |= (src&0xf) << 16; + code |= (src&0xf) << 12; + code |= ((src>>4)&0x1) << 22; + arm_emit (compiler, code); + + ORC_ASM_CODE(compiler," vpaddl.u32 %s, %s\n", + neon_reg_name (src), + neon_reg_name (src)); + code = 0xf3b40080; + code |= (src&0xf) << 16; + code |= (src&0xf) << 12; + code |= ((src>>4)&0x1) << 22; + arm_emit (compiler, code); + + ORC_ASM_CODE(compiler," vst1.16 %s[%d], [%s], %s\n", + neon_reg_name (src), 0, + arm_reg_name (compiler->gp_tmpreg), + arm_reg_name (compiler->exec_reg)); + code = 0xf4800400; + code |= (compiler->gp_tmpreg&0xf) << 16; + code |= (src&0xf) << 12; + code |= ((src>>4)&0x1) << 22; + arm_emit (compiler, code); + break; + case 4: + ORC_ASM_CODE(compiler," vpaddl.u32 %s, %s\n", + neon_reg_name (src), + neon_reg_name (src)); + code = 0xf3b40080; + code |= (src&0xf) << 16; + code |= (src&0xf) << 12; + code |= ((src>>4)&0x1) << 22; + arm_emit (compiler, code); + + ORC_ASM_CODE(compiler," vst1.32 %s[%d], [%s], %s\n", + neon_reg_name (src), 0, + arm_reg_name (compiler->gp_tmpreg), + arm_reg_name (compiler->exec_reg)); + code = 0xf4800800; + code |= (compiler->gp_tmpreg&0xf) << 16; + code |= (src&0xf) << 12; + code |= ((src>>4)&0x1) << 22; + arm_emit (compiler, code); + break; + default: + ORC_ERROR("bad size"); + } + + break; + default: + break; + } + } +} + diff --git a/orc/orcrules-neon.c b/orc/orcrules-neon.c index 41cba65..1b4b564 100644 --- a/orc/orcrules-neon.c +++ b/orc/orcrules-neon.c @@ -57,17 +57,23 @@ const char *neon_reg_name_quad (int reg) return vec_regs[reg&0x1f]; } -#if 0 void -neon_emit_mov (OrcCompiler *compiler, uint32_t code, int src, int dest) +neon_emit_mov (OrcCompiler *compiler, int src, int dest) { - code |= (src&0xf) << 12; - code |= ((src>>4)&0x1) << 22; + uint32_t code; + + ORC_ASM_CODE(compiler," vmov %s, %s\n", + neon_reg_name (dest), + neon_reg_name (src)); + code = 0xf2200110; code |= (dest&0xf) << 16; code |= ((dest>>4)&0x1) << 7; + code |= (src&0xf) << 12; + code |= ((src>>4)&0x1) << 22; + code |= (src&0xf) << 0; + code |= ((src>>4)&0x1) << 5; arm_emit (compiler, code); } -#endif void neon_loadb (OrcCompiler *compiler, int dest, int src1, int update, int is_aligned) @@ -297,7 +303,7 @@ neon_emit_loadib (OrcCompiler *compiler, int reg, int value) if (value == 0) { ORC_ASM_CODE(compiler," veor %s, %s, %s\n", neon_reg_name (reg), neon_reg_name (reg), neon_reg_name (reg)); - code = 0xee000b30; + code = 0xf3000110; code |= (reg&0xf) << 16; code |= (reg&0xf) << 12; code |= (reg&0xf) << 0; @@ -321,7 +327,7 @@ neon_emit_loadiw (OrcCompiler *compiler, int reg, int value) if (value == 0) { ORC_ASM_CODE(compiler," veor %s, %s, %s\n", neon_reg_name (reg), neon_reg_name (reg), neon_reg_name (reg)); - code = 0xee000b30; + code = 0xf3000110; code |= (reg&0xf) << 16; code |= (reg&0xf) << 12; code |= (reg&0xf) << 0; @@ -345,7 +351,7 @@ neon_emit_loadil (OrcCompiler *compiler, int reg, int value) if (value == 0) { ORC_ASM_CODE(compiler," veor %s, %s, %s\n", neon_reg_name (reg), neon_reg_name (reg), neon_reg_name (reg)); - code = 0xee000b30; + code = 0xf3000110; code |= (reg&0xf) << 16; code |= (reg&0xf) << 12; code |= (reg&0xf) << 0; @@ -844,13 +850,146 @@ BINARY_LONG(mulubw,"vmull.u8",0xf3800c00) BINARY_LONG(mulswl,"vmull.s16",0xf2900c00) BINARY_LONG(muluwl,"vmull.u16",0xf3900c00) +UNARY(swapw,"vrev16.i8",0xf3b00100) +UNARY(swapl,"vrev32.i8",0xf3b00080) + +UNARY_NARROW(select0lw,"vmovn.i32",0xf3b60200) +UNARY_NARROW(select0wb,"vmovn.i16",0xf3b20200) + +UNARY(mergebw,"vzip.8",0xf3b20180) +UNARY(mergewl,"vzip.16",0xf3b60180) + +static void +neon_emit_binary (OrcCompiler *p, const char *name, unsigned int code, + int dest, int src1, int src2) +{ + ORC_ASM_CODE(p," %s %s, %s, %s\n", name, + neon_reg_name (dest), neon_reg_name (src1), neon_reg_name (src2)); + code |= (dest&0xf)<<16; + code |= ((dest>>4)&0x1)<<7; + code |= (src1&0xf)<<12; + code |= ((src1>>4)&0x1)<<22; + code |= (src2&0xf)<<0; + code |= ((src2>>4)&0x1)<<5; + arm_emit (p, code); + +} + +static void +neon_rule_accw (OrcCompiler *p, void *user, OrcInstruction *insn) +{ + neon_emit_binary (p, "vadd.i16", 0xf2100800, + p->vars[insn->dest_args[0]].alloc, + p->vars[insn->dest_args[0]].alloc, + p->vars[insn->src_args[0]].alloc); +} + +static void +neon_rule_accl (OrcCompiler *p, void *user, OrcInstruction *insn) +{ + neon_emit_binary (p, "vadd.i32", 0xf2200800, + p->vars[insn->dest_args[0]].alloc, + p->vars[insn->dest_args[0]].alloc, + p->vars[insn->src_args[0]].alloc); +} + +static void +neon_rule_select1wb (OrcCompiler *p, void *user, OrcInstruction *insn) +{ + uint32_t x; + + x = 0xf3b00100; + ORC_ASM_CODE(p," vrev16.i8 %s, %s\n", + neon_reg_name (p->vars[insn->dest_args[0]].alloc), + neon_reg_name (p->vars[insn->src_args[0]].alloc)); + x |= (p->vars[insn->dest_args[0]].alloc&0xf)<<12; + x |= ((p->vars[insn->dest_args[0]].alloc>>4)&0x1)<<22; + //x |= (p->vars[insn->src_args[0]].alloc&0xf)<<16; + //x |= ((p->vars[insn->src_args[0]].alloc>>4)&0x1)<<7; + x |= (p->vars[insn->src_args[0]].alloc&0xf)<<0; + x |= ((p->vars[insn->src_args[0]].alloc>>4)&0x1)<<5; + arm_emit (p, x); + + x = 0xf3b20200; + ORC_ASM_CODE(p," vmovn.i16 %s, %s\n", + neon_reg_name (p->vars[insn->dest_args[0]].alloc), + neon_reg_name_quad (p->vars[insn->src_args[0]].alloc)); + x |= (p->vars[insn->dest_args[0]].alloc&0xf)<<12; + x |= ((p->vars[insn->dest_args[0]].alloc>>4)&0x1)<<22; + //x |= (p->vars[insn->src_args[0]].alloc&0xf)<<16; + //x |= ((p->vars[insn->src_args[0]].alloc>>4)&0x1)<<7; + x |= (p->vars[insn->src_args[0]].alloc&0xf)<<0; + x |= ((p->vars[insn->src_args[0]].alloc>>4)&0x1)<<5; + arm_emit (p, x); +} + +static void +neon_rule_select1lw (OrcCompiler *p, void *user, OrcInstruction *insn) +{ + uint32_t x; + + x = 0xf3b40080; + ORC_ASM_CODE(p," vrev32.i16 %s, %s\n", + neon_reg_name (p->vars[insn->dest_args[0]].alloc), + neon_reg_name (p->vars[insn->src_args[0]].alloc)); + x |= (p->vars[insn->dest_args[0]].alloc&0xf)<<12; + x |= ((p->vars[insn->dest_args[0]].alloc>>4)&0x1)<<22; + //x |= (p->vars[insn->src_args[1]].alloc&0xf)<<16; + //x |= ((p->vars[insn->src_args[1]].alloc>>4)&0x1)<<7; + x |= (p->vars[insn->src_args[0]].alloc&0xf)<<0; + x |= ((p->vars[insn->src_args[0]].alloc>>4)&0x1)<<5; + arm_emit (p, x); + + x = 0xf3b60200; + ORC_ASM_CODE(p," vmovn.i32 %s, %s\n", + neon_reg_name (p->vars[insn->dest_args[0]].alloc), + neon_reg_name_quad (p->vars[insn->src_args[0]].alloc)); + x |= (p->vars[insn->dest_args[0]].alloc&0xf)<<12; + x |= ((p->vars[insn->dest_args[0]].alloc>>4)&0x1)<<22; + //x |= (p->vars[insn->src_args[0]].alloc&0xf)<<16; + //x |= ((p->vars[insn->src_args[0]].alloc>>4)&0x1)<<7; + x |= (p->vars[insn->src_args[0]].alloc&0xf)<<0; + x |= ((p->vars[insn->src_args[0]].alloc>>4)&0x1)<<5; + arm_emit (p, x); +} + +static void +neon_rule_accsadubl (OrcCompiler *p, void *user, OrcInstruction *insn) +{ + uint32_t x; + + x = 0xf3840700; + ORC_ASM_CODE(p," vabdl.u8 %s, %s, %s\n", + neon_reg_name_quad (p->tmpreg), + neon_reg_name (p->vars[insn->src_args[0]].alloc), + neon_reg_name (p->vars[insn->src_args[1]].alloc)); + x |= (p->tmpreg&0xf)<<12; + x |= ((p->tmpreg>>4)&0x1)<<22; + x |= (p->vars[insn->src_args[0]].alloc&0xf)<<16; + x |= ((p->vars[insn->src_args[0]].alloc>>4)&0x1)<<7; + x |= (p->vars[insn->src_args[1]].alloc&0xf)<<0; + x |= ((p->vars[insn->src_args[1]].alloc>>4)&0x1)<<5; + arm_emit (p, x); + + x = 0xf3b40680; + ORC_ASM_CODE(p," vpadal.u16 %s, %s\n", + neon_reg_name (p->vars[insn->dest_args[0]].alloc), + neon_reg_name (p->tmpreg)); + x |= (p->vars[insn->dest_args[0]].alloc&0xf)<<12; + x |= ((p->vars[insn->dest_args[0]].alloc>>4)&0x1)<<22; + //x |= (p->vars[insn->src_args[0]].alloc&0xf)<<16; + //x |= ((p->vars[insn->src_args[0]].alloc>>4)&0x1)<<7; + x |= (p->tmpreg&0xf)<<0; + x |= ((p->tmpreg>>4)&0x1)<<5; + arm_emit (p, x); +} void orc_compiler_neon_register_rules (OrcTarget *target) { OrcRuleSet *rule_set; - rule_set = orc_rule_set_new (orc_opcode_set_get("sys"), target); + rule_set = orc_rule_set_new (orc_opcode_set_get("sys"), target, 0); #define REG(x) \ orc_rule_register (rule_set, #x , neon_rule_ ## x, NULL) @@ -948,6 +1087,18 @@ orc_compiler_neon_register_rules (OrcTarget *target) REG(mulswl); REG(muluwl); + REG(accw); + REG(accl); + REG(accsadubl); + REG(swapw); + REG(swapl); + REG(select0wb); + REG(select1wb); + REG(select0lw); + REG(select1lw); + REG(mergebw); + REG(mergewl); + orc_rule_register (rule_set, "shlb", neon_rule_shift, (void *)0); orc_rule_register (rule_set, "shrsb", neon_rule_shift, (void *)1); orc_rule_register (rule_set, "shrub", neon_rule_shift, (void *)2); diff --git a/testsuite/test_compile_neon.c b/testsuite/test_compile_neon.c index b188915..5b4a9c6 100644 --- a/testsuite/test_compile_neon.c +++ b/testsuite/test_compile_neon.c @@ -73,17 +73,30 @@ test_opcode (OrcStaticOpcode *opcode) OrcProgram *p; char s[40]; - if (opcode->src_size[1] == 0) { - p = orc_program_new_ds (opcode->dest_size[0], opcode->src_size[0]); + if (opcode->flags & ORC_STATIC_OPCODE_ACCUMULATOR) { + if (opcode->src_size[1] == 0) { + p = orc_program_new_as (opcode->dest_size[0], opcode->src_size[0]); + } else { + p = orc_program_new_ass (opcode->dest_size[0], opcode->src_size[0], + opcode->src_size[1]); + } } else { - p = orc_program_new_dss (opcode->dest_size[0], opcode->src_size[0], - opcode->src_size[1]); + if (opcode->src_size[1] == 0) { + p = orc_program_new_ds (opcode->dest_size[0], opcode->src_size[0]); + } else { + p = orc_program_new_dss (opcode->dest_size[0], opcode->src_size[0], + opcode->src_size[1]); + } } sprintf(s, "test_%s", opcode->name); orc_program_set_name (p, s); - orc_program_append_str (p, opcode->name, "d1", "s1", "s2"); + if (opcode->flags & ORC_STATIC_OPCODE_ACCUMULATOR) { + orc_program_append_str (p, opcode->name, "a1", "s1", "s2"); + } else { + orc_program_append_str (p, opcode->name, "d1", "s1", "s2"); + } orc_test_gcc_compile_neon (p); -- 2.7.4