From 8ef6d657ccc1e98476ebc5d101c6729e6b916f78 Mon Sep 17 00:00:00 2001 From: Gaetan Bahl Date: Mon, 22 Aug 2022 19:32:29 +0200 Subject: [PATCH] neon: Fix discrepancy when using loadupdb in 32-bit Fix shifted outputs when output array is 8-byte aligned but not 16-byte aligned and loop shift is 1. Fixes #32 Signed-off-by: Gaetan Bahl --- orc/orcprogram-neon.c | 12 ++++++------ orc/orcrules-neon.c | 39 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/orc/orcprogram-neon.c b/orc/orcprogram-neon.c index de24bdb..5236b9f 100644 --- a/orc/orcprogram-neon.c +++ b/orc/orcprogram-neon.c @@ -909,9 +909,9 @@ orc_compiler_neon_assemble (OrcCompiler *compiler) compiler->size_region = 0; orc_arm_emit_label (compiler, LABEL_REGION0_LOOP); - orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); orc_neon_emit_loop (compiler, -1); + orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION0_LOOP); orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP); orc_arm_emit_label (compiler, LABEL_REGION0_SKIP); @@ -976,8 +976,8 @@ orc_compiler_neon_assemble (OrcCompiler *compiler) orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION1_SKIP); orc_arm_emit_label (compiler, LABEL_REGION1_LOOP); - orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); orc_neon_emit_loop (compiler, -1); + orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION1_LOOP); orc_arm_emit_label (compiler, LABEL_REGION1_SKIP); @@ -1004,10 +1004,10 @@ orc_compiler_neon_assemble (OrcCompiler *compiler) /* N is larger than L2 cache size */ compiler->size_region = 3; orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_LARGE); - orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); for(i=0;i<(1<<compiler->unroll_shift);i++){ orc_neon_emit_loop (compiler, i); } + orc_arm_emit_sub_imm (compiler, 
ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_LARGE); orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP); @@ -1020,10 +1020,10 @@ orc_compiler_neon_assemble (OrcCompiler *compiler) /* N is smaller than L2 cache size */ compiler->size_region = 2; orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_MEDIUM); - orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); for(i=0;i<(1<<compiler->unroll_shift);i++){ orc_neon_emit_loop (compiler, i); } + orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_MEDIUM); orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP); @@ -1031,10 +1031,10 @@ orc_compiler_neon_assemble (OrcCompiler *compiler) /* N is smaller than L2 cache size */ compiler->size_region = 1; orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_SMALL); - orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); for(i=0;i<(1<<compiler->unroll_shift);i++){ orc_neon_emit_loop (compiler, i); } + orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_SMALL); orc_arm_emit_label (compiler, LABEL_REGION2_SKIP); @@ -1053,8 +1053,8 @@ orc_compiler_neon_assemble (OrcCompiler *compiler) orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION3_SKIP); orc_arm_emit_label (compiler, LABEL_REGION3_LOOP); - orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); orc_neon_emit_loop (compiler, -1); + orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE); orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION3_LOOP); orc_arm_emit_label (compiler, LABEL_REGION3_SKIP); diff --git a/orc/orcrules-neon.c b/orc/orcrules-neon.c index c4146fa..570199e 100644 --- a/orc/orcrules-neon.c +++ b/orc/orcrules-neon.c @@ -1198,7 +1198,7 @@ neon_rule_loadupdb (OrcCompiler *compiler, void *user, OrcInstruction *insn) } else { ptr_reg = 
src->ptr_register; } - if (size > 8) { + if (size >= 8) { if (src->is_aligned) { if (size == 32) { ORC_ASM_CODE(compiler," vld1.64 { %s, %s, %s, %s }, [%s,:256]\n", @@ -1280,6 +1280,43 @@ neon_rule_loadupdb (OrcCompiler *compiler, void *user, OrcInstruction *insn) orc_neon_emit_unary (compiler, "vzip.8", 0xf3b20180, compiler->vars[insn->dest_args[0]].alloc, compiler->vars[insn->dest_args[0]].alloc + 1); + + if (compiler->loop_shift == 1) { + /* When the loop_shift is 1, it is possible that one iteration of shift 0 + has already been performed if the destination array is 8-byte aligned + (but not 16-byte aligned). + In this case, the output offset has been incremented by 1, and we need to + shift the outputs of loadupdb.*/ + + // set temp reg to 0 + orc_arm_emit_eor_r(compiler, ORC_ARM_COND_AL, 0, + compiler->gp_tmpreg, compiler->gp_tmpreg, compiler->gp_tmpreg); + // test if input offset is odd + orc_arm_emit_tst_i(compiler, ORC_ARM_COND_AL, src->ptr_offset, 0x1); + // if yes, tmpreg = 0xff + orc_arm_emit_mov_i(compiler, ORC_ARM_COND_NE, 0, compiler->gp_tmpreg, 0xff); + + // fill a simd reg with value of tmpreg (0xff or 0x0) + ORC_ASM_CODE(compiler," %s %s, %s\n", "vdup.8", + orc_neon_reg_name (dest->alloc+3), orc_arm_reg_name (compiler->gp_tmpreg)); + code = 0xeec00b10; + code |= ((compiler->vars[insn->dest_args[0]].alloc+3)&0xf)<<16; // Vd + code |= (compiler->gp_tmpreg&0xf) << 12; // Rt + code |= (((compiler->vars[insn->dest_args[0]].alloc+3)>>4)&0x1) << 7; // D + orc_arm_emit (compiler, code); + + // vext.8 with #imm=1 to create shifted output + orc_neon_emit_binary (compiler, "vext.8", 0xf2b00100, + compiler->vars[insn->dest_args[0]].alloc+1, + compiler->vars[insn->dest_args[0]].alloc, + compiler->vars[insn->dest_args[0]].alloc+1); + + // select shifted output or not + orc_neon_emit_binary(compiler, "vbit.8", 0xf3200110, + compiler->vars[insn->dest_args[0]].alloc, + compiler->vars[insn->dest_args[0]].alloc+1, + compiler->vars[insn->dest_args[0]].alloc+3); + } 
break; case 2: orc_neon_emit_binary (compiler, "vorr", 0xf2200110, -- 2.7.4