neon: Fix discrepancy when using loadupdb in 32-bit

author Gaetan Bahl <gaetan.bahl@nxp.com>

Mon, 22 Aug 2022 17:32:29 +0000 (19:32 +0200)

committer Gaetan Bahl <gaetan.bahl@nxp.com>

Mon, 22 Aug 2022 17:32:29 +0000 (19:32 +0200)
author Gaetan Bahl <gaetan.bahl@nxp.com>
Mon, 22 Aug 2022 17:32:29 +0000 (19:32 +0200)
committer Gaetan Bahl <gaetan.bahl@nxp.com>
Mon, 22 Aug 2022 17:32:29 +0000 (19:32 +0200)
diff --git a/orc/orcprogram-neon.c b/orc/orcprogram-neon.c

index de24bdb..5236b9f 100644 (file)
--- a/orc/orcprogram-neon.c
+++ b/orc/orcprogram-neon.c
@@ -909,9 +909,9 @@ orc_compiler_neon_assemble (OrcCompiler *compiler)
  
        compiler->size_region = 0;
        orc_arm_emit_label (compiler, LABEL_REGION0_LOOP);
-      orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
  
        orc_neon_emit_loop (compiler, -1);
+      orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
        orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION0_LOOP);
        orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP);
        orc_arm_emit_label (compiler, LABEL_REGION0_SKIP);
@@ -976,8 +976,8 @@ orc_compiler_neon_assemble (OrcCompiler *compiler)
        orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION1_SKIP);
  
        orc_arm_emit_label (compiler, LABEL_REGION1_LOOP);
-      orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
        orc_neon_emit_loop (compiler, -1);
+      orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
        orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION1_LOOP);
        orc_arm_emit_label (compiler, LABEL_REGION1_SKIP);
  
@@ -1004,10 +1004,10 @@ orc_compiler_neon_assemble (OrcCompiler *compiler)
      /* N is larger than L2 cache size */
      compiler->size_region = 3;
      orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_LARGE);
-    orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
      for(i=0;i<(1<<compiler->unroll_shift);i++){
        orc_neon_emit_loop (compiler, i);
      }
+    orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
      orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_LARGE);
      orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP);
  
@@ -1020,10 +1020,10 @@ orc_compiler_neon_assemble (OrcCompiler *compiler)
      /* N is smaller than L2 cache size */
      compiler->size_region = 2;
      orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_MEDIUM);
-    orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
      for(i=0;i<(1<<compiler->unroll_shift);i++){
        orc_neon_emit_loop (compiler, i);
      }
+    orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
      orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_MEDIUM);
      orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP);
  
@@ -1031,10 +1031,10 @@ orc_compiler_neon_assemble (OrcCompiler *compiler)
      /* N is smaller than L2 cache size */
      compiler->size_region = 1;
      orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_SMALL);
-    orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
      for(i=0;i<(1<<compiler->unroll_shift);i++){
        orc_neon_emit_loop (compiler, i);
      }
+    orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
      orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_SMALL);
  
      orc_arm_emit_label (compiler, LABEL_REGION2_SKIP);
@@ -1053,8 +1053,8 @@ orc_compiler_neon_assemble (OrcCompiler *compiler)
        orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION3_SKIP);
  
        orc_arm_emit_label (compiler, LABEL_REGION3_LOOP);
-      orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
        orc_neon_emit_loop (compiler, -1);
+      orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
        orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION3_LOOP);
        orc_arm_emit_label (compiler, LABEL_REGION3_SKIP);
  
diff --git a/orc/orcrules-neon.c b/orc/orcrules-neon.c

index c4146fa..570199e 100644 (file)
--- a/orc/orcrules-neon.c
+++ b/orc/orcrules-neon.c
@@ -1198,7 +1198,7 @@ neon_rule_loadupdb (OrcCompiler *compiler, void *user, OrcInstruction *insn)
      } else {
        ptr_reg = src->ptr_register;
      }
-    if (size > 8) {
+    if (size >= 8) {
        if (src->is_aligned) {
          if (size == 32) {
            ORC_ASM_CODE(compiler,"  vld1.64 { %s, %s, %s, %s }, [%s,:256]\n",
@@ -1280,6 +1280,43 @@ neon_rule_loadupdb (OrcCompiler *compiler, void *user, OrcInstruction *insn)
          orc_neon_emit_unary (compiler, "vzip.8", 0xf3b20180,
            compiler->vars[insn->dest_args[0]].alloc,
            compiler->vars[insn->dest_args[0]].alloc + 1);
+
+        if (compiler->loop_shift == 1) {
+          /* When the loop_shift is 1, it is possible that one iteration of shift 0
+          has already been performed if the destination array is 8-byte aligned
+          (but not 16-byte aligned).
+          In this case, the output offset has been incremented by 1, and we need to
+          shift the outputs of loadupdb.*/
+
+          // set temp reg to 0
+          orc_arm_emit_eor_r(compiler, ORC_ARM_COND_AL, 0,
+            compiler->gp_tmpreg, compiler->gp_tmpreg, compiler->gp_tmpreg);
+          // test if input offset is odd
+          orc_arm_emit_tst_i(compiler, ORC_ARM_COND_AL, src->ptr_offset, 0x1);
+          // if yes, tmpreg = 0xff
+          orc_arm_emit_mov_i(compiler, ORC_ARM_COND_NE, 0, compiler->gp_tmpreg, 0xff);
+
+          // fill a simd reg with value of tmpreg (0xff or 0x0)
+          ORC_ASM_CODE(compiler,"  %s %s, %s\n", "vdup.8",
+            orc_neon_reg_name (dest->alloc+3), orc_arm_reg_name (compiler->gp_tmpreg));
+          code = 0xeec00b10;
+          code |= ((compiler->vars[insn->dest_args[0]].alloc+3)&0xf)<<16; // Vd
+          code |= (compiler->gp_tmpreg&0xf) << 12; // Rt
+          code |= (((compiler->vars[insn->dest_args[0]].alloc+3)>>4)&0x1) << 7; // D
+          orc_arm_emit (compiler, code);
+
+          // vext.8 with #imm=1 to create shifted output
+          orc_neon_emit_binary (compiler, "vext.8", 0xf2b00100,
+            compiler->vars[insn->dest_args[0]].alloc+1,
+            compiler->vars[insn->dest_args[0]].alloc,
+            compiler->vars[insn->dest_args[0]].alloc+1);
+
+          // select shifted output or not
+          orc_neon_emit_binary(compiler, "vbit.8", 0xf3200110,
+            compiler->vars[insn->dest_args[0]].alloc,
+            compiler->vars[insn->dest_args[0]].alloc+1,
+            compiler->vars[insn->dest_args[0]].alloc+3);
+        }
          break;
        case 2:
          orc_neon_emit_binary (compiler, "vorr", 0xf2200110,
author	Gaetan Bahl <gaetan.bahl@nxp.com>
	Mon, 22 Aug 2022 17:32:29 +0000 (19:32 +0200)
committer	Gaetan Bahl <gaetan.bahl@nxp.com>
	Mon, 22 Aug 2022 17:32:29 +0000 (19:32 +0200)
orc/orcprogram-neon.c		patch | blob | history
orc/orcrules-neon.c		patch | blob | history