enum {
LABEL_ONE_REGION = 1,
LABEL_ONE_REGION_AFTER,
+ LABEL_REGION0_LOOP,
+ LABEL_REGION0_SKIP,
LABEL_REGION1_LOOP,
LABEL_REGION1_SKIP,
LABEL_REGION2_LOOP_SMALL,
orc_arm_emit_label (compiler, LABEL_OUTER_LOOP);
}
+#define ORC_NEON_ALIGNED_DEST_CUTOFF 64
+ /* Short-run path: generated only when loop_shift > 0, and executed only when the executor's n is <= ORC_NEON_ALIGNED_DEST_CUTOFF. */
+ if (compiler->loop_shift > 0) {
+ orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,n)); /* A3 = executor->n */
+ orc_arm_emit_cmp_imm (compiler, ORC_ARM_A3, ORC_NEON_ALIGNED_DEST_CUTOFF);
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_GT, LABEL_REGION0_SKIP); /* n above the cutoff: jump past this whole path */
+ /* counter2 = n >> loop_shift (arithmetic shift), kept in A2 and stored to the executor. */
+ orc_arm_emit_asr_imm (compiler, ORC_ARM_A2, ORC_ARM_A3,
+ compiler->loop_shift);
+ orc_arm_emit_store_reg (compiler, ORC_ARM_A2, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
+ /* counter3 = n & ((1 << loop_shift) - 1), i.e. the low loop_shift bits of n. */
+ orc_arm_emit_and_imm (compiler, ORC_ARM_A3, ORC_ARM_A3,
+ (1<<compiler->loop_shift)-1);
+ orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
+ /* Reload counter2 into IP as the loop counter; if it is zero, branch to LABEL_REGION2_SKIP without running the loop. */
+ orc_neon_load_constants_inner (compiler);
+ orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
+ (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
+ orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0);
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_SKIP);
+ /* Counted loop: IP is decremented first, the loop body is emitted next, and the NE branch closes the loop. */
+ compiler->size_region = 0;
+ orc_arm_emit_label (compiler, LABEL_REGION0_LOOP);
+ orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); /* presumably TRUE requests a flag-setting subtract for the NE branch below; confirm against orc_arm_emit_sub_imm */
+ orc_neon_emit_loop (compiler, -1); /* NOTE(review): meaning of -1 is not visible here; the body emitted here must leave the condition flags intact for the branch below; verify */
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION0_LOOP);
+
+ /* Loop finished: branch unconditionally to LABEL_REGION2_SKIP, bypassing the code emitted after LABEL_REGION0_SKIP. */
+ orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP);
+
+ orc_arm_emit_label (compiler, LABEL_REGION0_SKIP); /* target of the n > cutoff branch above */
+ }
+
if (compiler->loop_shift > 0) {
orc_arm_emit_load_imm (compiler, ORC_ARM_IP, 1<<align_shift);
orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION1_SKIP);
orc_arm_emit_label (compiler, LABEL_REGION1_LOOP);
- orc_neon_emit_loop (compiler, 0);
orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
+ orc_neon_emit_loop (compiler, -1);
orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION1_LOOP);
orc_arm_emit_label (compiler, LABEL_REGION1_SKIP);
orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION3_SKIP);
orc_arm_emit_label (compiler, LABEL_REGION3_LOOP);
- orc_neon_emit_loop (compiler, 0);
orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
+ orc_neon_emit_loop (compiler, -1);
orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION3_LOOP);
orc_arm_emit_label (compiler, LABEL_REGION3_SKIP);
code |= ((src1>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
+ } else if (compiler->loop_shift == 5) { /* loop_shift == 5 with no is_aligned test: four-register vst1.8 printed without an alignment qualifier on the address */
+ ORC_ASM_CODE(compiler," vst1.8 { %s, %s, %s, %s }, [%s]%s\n",
+ orc_neon_reg_name (src1),
+ orc_neon_reg_name (src1+1),
+ orc_neon_reg_name (src1+2),
+ orc_neon_reg_name (src1+3),
+ orc_arm_reg_name (dest),
+ update ? "!" : ""); /* "!" suffix is printed only when update is non-zero */
+ code = 0xf400020d; /* base opcode; presumably VST1 with a four-register list and no alignment hint; confirm against the ARM encoding tables */
+ code |= (dest&0xf) << 16; /* low 4 bits of dest go to bits 16-19 */
+ code |= (src1&0xf) << 12; /* low 4 bits of src1 go to bits 12-15 */
+ code |= ((src1>>4)&0x1) << 22; /* bit 4 of src1 goes to bit 22 */
+ code |= (!update) << 1; /* when update is zero, sets bit 1, turning the low nibble 0xd into 0xf */
+ orc_arm_emit (compiler, code);
} else if (is_aligned && compiler->loop_shift == 4) {
ORC_ASM_CODE(compiler," vst1.8 { %s, %s }, [%s,:128]%s\n",
orc_neon_reg_name (src1),
code |= ((src1>>4)&0x1) << 22;
code |= (!update) << 1;
orc_arm_emit (compiler, code);
+ } else if (compiler->loop_shift == 4) { /* loop_shift == 4 when the preceding is_aligned branch did not match: two-register vst1.8 printed without the :128 qualifier */
+ ORC_ASM_CODE(compiler," vst1.8 { %s, %s }, [%s]%s\n",
+ orc_neon_reg_name (src1),
+ orc_neon_reg_name (src1+1),
+ orc_arm_reg_name (dest),
+ update ? "!" : ""); /* "!" suffix is printed only when update is non-zero */
+ code = 0xf4000a0d; /* base opcode; presumably VST1 with a two-register list and no alignment hint; confirm against the ARM encoding tables */
+ code |= (dest&0xf) << 16; /* low 4 bits of dest go to bits 16-19 */
+ code |= (src1&0xf) << 12; /* low 4 bits of src1 go to bits 12-15 */
+ code |= ((src1>>4)&0x1) << 22; /* bit 4 of src1 goes to bit 22 */
+ code |= (!update) << 1; /* when update is zero, sets bit 1, turning the low nibble 0xd into 0xf */
+ orc_arm_emit (compiler, code);
} else if (is_aligned && compiler->loop_shift == 3) {
ORC_ASM_CODE(compiler," vst1.8 %s, [%s,:64]%s\n",
orc_neon_reg_name (src1),