From: David Schleef Date: Thu, 8 Jul 2010 23:53:29 +0000 (-0700) Subject: neon: Add short-array unaligned speedup X-Git-Tag: orc-0.4.6~10 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=8f807687c895ff388b2d69a15cca37766eb2a119;p=platform%2Fupstream%2Forc.git neon: Add short-array unaligned speedup --- diff --git a/orc/orcprogram-neon.c b/orc/orcprogram-neon.c index 38defdd..8161f12 100644 --- a/orc/orcprogram-neon.c +++ b/orc/orcprogram-neon.c @@ -595,6 +595,8 @@ get_align_var (OrcCompiler *compiler) enum { LABEL_ONE_REGION = 1, LABEL_ONE_REGION_AFTER, + LABEL_REGION0_LOOP, + LABEL_REGION0_SKIP, LABEL_REGION1_LOOP, LABEL_REGION1_SKIP, LABEL_REGION2_LOOP_SMALL, @@ -643,6 +645,42 @@ orc_compiler_neon_assemble (OrcCompiler *compiler) orc_arm_emit_label (compiler, LABEL_OUTER_LOOP); } +#define ORC_NEON_ALIGNED_DEST_CUTOFF 64 + + if (compiler->loop_shift > 0) { + orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,n)); + orc_arm_emit_cmp_imm (compiler, ORC_ARM_A3, ORC_NEON_ALIGNED_DEST_CUTOFF); + orc_arm_emit_branch (compiler, ORC_ARM_COND_GT, LABEL_REGION0_SKIP); + + orc_arm_emit_asr_imm (compiler, ORC_ARM_A2, ORC_ARM_A3, + compiler->loop_shift); + orc_arm_emit_store_reg (compiler, ORC_ARM_A2, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2)); + + orc_arm_emit_and_imm (compiler, ORC_ARM_A3, ORC_ARM_A3, + (1<<compiler->loop_shift)-1); + orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3)); + + orc_neon_load_constants_inner (compiler); + orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg, + (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2)); + orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0); + orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_SKIP); + + compiler->size_region = 0; + orc_arm_emit_label (compiler, LABEL_REGION0_LOOP); + orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); + orc_neon_emit_loop 
(compiler, -1); + orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION0_LOOP); + + + orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP); + + orc_arm_emit_label (compiler, LABEL_REGION0_SKIP); + } + if (compiler->loop_shift > 0) { orc_arm_emit_load_imm (compiler, ORC_ARM_IP, 1<>4)&0x1) << 22; code |= (!update) << 1; orc_arm_emit (compiler, code); + } else if (compiler->loop_shift == 5) { + ORC_ASM_CODE(compiler," vst1.8 { %s, %s, %s, %s }, [%s]%s\n", + orc_neon_reg_name (src1), + orc_neon_reg_name (src1+1), + orc_neon_reg_name (src1+2), + orc_neon_reg_name (src1+3), + orc_arm_reg_name (dest), + update ? "!" : ""); + code = 0xf400020d; + code |= (dest&0xf) << 16; + code |= (src1&0xf) << 12; + code |= ((src1>>4)&0x1) << 22; + code |= (!update) << 1; + orc_arm_emit (compiler, code); } else if (is_aligned && compiler->loop_shift == 4) { ORC_ASM_CODE(compiler," vst1.8 { %s, %s }, [%s,:128]%s\n", orc_neon_reg_name (src1), @@ -603,6 +617,18 @@ orc_neon_storeb (OrcCompiler *compiler, int dest, int update, int src1, int is_a code |= ((src1>>4)&0x1) << 22; code |= (!update) << 1; orc_arm_emit (compiler, code); + } else if (compiler->loop_shift == 4) { + ORC_ASM_CODE(compiler," vst1.8 { %s, %s }, [%s]%s\n", + orc_neon_reg_name (src1), + orc_neon_reg_name (src1+1), + orc_arm_reg_name (dest), + update ? "!" : ""); + code = 0xf4000a0d; + code |= (dest&0xf) << 16; + code |= (src1&0xf) << 12; + code |= ((src1>>4)&0x1) << 22; + code |= (!update) << 1; + orc_arm_emit (compiler, code); } else if (is_aligned && compiler->loop_shift == 3) { ORC_ASM_CODE(compiler," vst1.8 %s, [%s,:64]%s\n", orc_neon_reg_name (src1),