neon: Fix unsigned-only implementation of loadoffb, loadoffw and loadoffl
author: Knobe, Daniel <daniel-knobe@web.de>
Tue, 7 Dec 2021 09:22:15 +0000 (10:22 +0100)
committer: Knobe, Daniel <daniel-knobe@web.de>
Tue, 7 Dec 2021 09:22:15 +0000 (10:22 +0100)
This has a direct impact on bayer2rgb performance. Tested on i.MX8mm (aarch64): speed boost of ~17%.
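Reason:
The line `loadoffw t, s, -1` results in a silent orc compile error: the negative offset was passed unchanged to the add-immediate emitter. Negative offsets are now emitted as a subtract-immediate of the absolute value instead.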
Pipeline:
gst-launch-1.0 -v videotestsrc ! video/x-bayer,width=1920,height=1080 ! bayer2rgb ! fpsdisplaysink video-sink=fakesink sync=0
Average performance with fix: 25.21 fps
Average performance without fix: 21.60 fps

Part-of: <https://gitlab.freedesktop.org/gstreamer/orc/-/merge_requests/62>

orc/orcrules-neon.c

index 726f0d4..a9c6bb0 100644 (file)
@@ -1363,20 +1363,38 @@ neon_rule_loadX (OrcCompiler *compiler, void *user, OrcInstruction *insn)
   if (src->vartype == ORC_VAR_TYPE_DEST) update = FALSE;
 
   if (type == 1) {
-    if (compiler->vars[insn->src_args[1]].vartype != ORC_VAR_TYPE_CONST) {
+    OrcVariable *src2 = compiler->vars + insn->src_args[1];
+
+    if (src2->vartype != ORC_VAR_TYPE_CONST) {
       ORC_PROGRAM_ERROR(compiler,"unimplemented");
       return;
     }
 
     ptr_register = compiler->gp_tmpreg;
     if (compiler->is_64bit) {
+      if (src2->value.i < 0) {
+        orc_arm64_emit_sub_imm (compiler, 64, ptr_register,
+            src->ptr_register,
+            src2->value.i * src->size * -1);
+      }
+      else
+      {
         orc_arm64_emit_add_imm (compiler, 64, ptr_register,
             src->ptr_register,
-            compiler->vars[insn->src_args[1]].value.i * src->size);
+            src2->value.i * src->size);
+      }
     } else {
+      if (src2->value.i < 0) {
+        orc_arm_emit_sub_imm (compiler, ptr_register,
+            src->ptr_register,
+            src2->value.i * src->size * -1, TRUE);
+      }
+      else
+      {
         orc_arm_emit_add_imm (compiler, ptr_register,
             src->ptr_register,
-            compiler->vars[insn->src_args[1]].value.i * src->size);
+            src2->value.i * src->size);
+      }
     }
 
     update = FALSE;