aarch64: implement emits for some vector instructions and ORC ops (add)
author: Dongju Chae <dongju.chae@samsung.com>
Mon, 21 Oct 2019 11:18:35 +0000 (20:18 +0900)
committer: Marek Vasut <marex@denx.de>
Wed, 16 Sep 2020 12:27:27 +0000 (14:27 +0200)
This commit implements vector load/store/add instructions and ORC 'add' ops.
They reuse existing macros like 'BINARY', and aarch64-specific arguments are added.

Signed-off-by: Dongju Chae <dongju.chae@samsung.com>
orc/orcrules-neon.c

index 11e0a20..0609382 100644 (file)
@@ -56,6 +56,253 @@ const char *orc_neon_reg_name_quad (int reg)
   return vec_regs[reg&0x1f];
 }
 
+/** the names of the SIMD registers when used in a scalar way */
+const char *orc_neon64_reg_name_scalar (int reg, int size)
+{
+  static const char *vec_regs[5][32] = {
+    { /** 8-bit */
+      "b0", "b1", "b2", "b3",
+      "b4", "b5", "b6", "b7",
+      "b8", "b9", "b10", "b11",
+      "b12", "b13", "b14", "b15",
+      "b16", "b17", "b18", "b19",
+      "b20", "b21", "b22", "b23",
+      "b24", "b25", "b26", "b27",
+      "b28", "b29", "b30", "b31"
+    },
+    { /** 16-bit */
+      "h0", "h1", "h2", "h3",
+      "h4", "h5", "h6", "h7",
+      "h8", "h9", "h10", "h11",
+      "h12", "h13", "h14", "h15",
+      "h16", "h17", "h18", "h19",
+      "h20", "h21", "h22", "h23",
+      "h24", "h25", "h26", "h27",
+      "h28", "h29", "h30", "h31"
+    },
+    { /** 32-bit */
+      "s0", "s1", "s2", "s3",
+      "s4", "s5", "s6", "s7",
+      "s8", "s9", "s10", "s11",
+      "s12", "s13", "s14", "s15",
+      "s16", "s17", "s18", "s19",
+      "s20", "s21", "s22", "s23",
+      "s24", "s25", "s26", "s27",
+      "s28", "s29", "s30", "s31"
+    },
+    { /** 64-bit */
+      "d0", "d1", "d2", "d3",
+      "d4", "d5", "d6", "d7",
+      "d8", "d9", "d10", "d11",
+      "d12", "d13", "d14", "d15",
+      "d16", "d17", "d18", "d19",
+      "d20", "d21", "d22", "d23",
+      "d24", "d25", "d26", "d27",
+      "d28", "d29", "d30", "d31"
+    },
+    { /** 128-bit */
+      "q0", "q1", "q2", "q3",
+      "q4", "q5", "q6", "q7",
+      "q8", "q9", "q10", "q11",
+      "q12", "q13", "q14", "q15",
+      "q16", "q17", "q18", "q19",
+      "q20", "q21", "q22", "q23",
+      "q24", "q25", "q26", "q27",
+      "q28", "q29", "q30", "q31"
+    }
+  };
+  int size_idx;
+
+  if (reg < ORC_VEC_REG_BASE || reg >= ORC_VEC_REG_BASE+32) {
+    return "ERROR";
+  }
+
+  size_idx = -1;
+  while (size) {
+    size_idx++;
+    size >>= 1;
+  }
+
+  if (size_idx < 0 || size_idx >= 5) {
+    return "ERROR";
+  }
+
+  return vec_regs[size_idx][reg&0x1f];
+}
+
+/** the names of the SIMD vector registers when used for vectorization */
+const char *orc_neon64_reg_name_vector (int reg, int size, int quad)
+{
+  static const char *vec_regs[8][32] = {
+    {
+      "v0.8b", "v1.8b", "v2.8b", "v3.8b",
+      "v4.8b", "v5.8b", "v6.8b", "v7.8b",
+      "v8.8b", "v9.8b", "v10.8b", "v11.8b",
+      "v12.8b", "v13.8b", "v14.8b", "v15.8b",
+      "v16.8b", "v17.8b", "v18.8b", "v19.8b",
+      "v20.8b", "v21.8b", "v22.8b", "v23.8b",
+      "v24.8b", "v25.8b", "v26.8b", "v27.8b",
+      "v28.8b", "v29.8b", "v30.8b", "v31.8b"
+    },
+    {
+      "v0.16b", "v1.16b", "v2.16b", "v3.16b",
+      "v4.16b", "v5.16b", "v6.16b", "v7.16b",
+      "v8.16b", "v9.16b", "v10.16b", "v11.16b",
+      "v12.16b", "v13.16b", "v14.16b", "v15.16b",
+      "v16.16b", "v17.16b", "v18.16b", "v19.16b",
+      "v20.16b", "v21.16b", "v22.16b", "v23.16b",
+      "v24.16b", "v25.16b", "v26.16b", "v27.16b",
+      "v28.16b", "v29.16b", "v30.16b", "v31.16b"
+    },
+    {
+      "v0.4h", "v1.4h", "v2.4h", "v3.4h",
+      "v4.4h", "v5.4h", "v6.4h", "v7.4h",
+      "v8.4h", "v9.4h", "v10.4h", "v11.4h",
+      "v12.4h", "v13.4h", "v14.4h", "v15.4h",
+      "v16.4h", "v17.4h", "v18.4h", "v19.4h",
+      "v20.4h", "v21.4h", "v22.4h", "v23.4h",
+      "v24.4h", "v25.4h", "v26.4h", "v27.4h",
+      "v28.4h", "v29.4h", "v30.4h", "v31.4h"
+    },
+    {
+      "v0.8h", "v1.8h", "v2.8h", "v3.8h",
+      "v4.8h", "v5.8h", "v6.8h", "v7.8h",
+      "v8.8h", "v9.8h", "v10.8h", "v11.8h",
+      "v12.8h", "v13.8h", "v14.8h", "v15.8h",
+      "v16.8h", "v17.8h", "v18.8h", "v19.8h",
+      "v20.8h", "v21.8h", "v22.8h", "v23.8h",
+      "v24.8h", "v25.8h", "v26.8h", "v27.8h",
+      "v28.8h", "v29.8h", "v30.8h", "v31.8h"
+    },
+    {
+      "v0.2s", "v1.2s", "v2.2s", "v3.2s",
+      "v4.2s", "v5.2s", "v6.2s", "v7.2s",
+      "v8.2s", "v9.2s", "v10.2s", "v11.2s",
+      "v12.2s", "v13.2s", "v14.2s", "v15.2s",
+      "v16.2s", "v17.2s", "v18.2s", "v19.2s",
+      "v20.2s", "v21.2s", "v22.2s", "v23.2s",
+      "v24.2s", "v25.2s", "v26.2s", "v27.2s",
+      "v28.2s", "v29.2s", "v30.2s", "v31.2s"
+    },
+    {
+      "v0.4s", "v1.4s", "v2.4s", "v3.4s",
+      "v4.4s", "v5.4s", "v6.4s", "v7.4s",
+      "v8.4s", "v9.4s", "v10.4s", "v11.4s",
+      "v12.4s", "v13.4s", "v14.4s", "v15.4s",
+      "v16.4s", "v17.4s", "v18.4s", "v19.4s",
+      "v20.4s", "v21.4s", "v22.4s", "v23.4s",
+      "v24.4s", "v25.4s", "v26.4s", "v27.4s",
+      "v28.4s", "v29.4s", "v30.4s", "v31.4s"
+    },
+    {
+      "v0.1d", "v1.1d", "v2.1d", "v3.1d",
+      "v4.1d", "v5.1d", "v6.1d", "v7.1d",
+      "v8.1d", "v9.1d", "v10.1d", "v11.1d",
+      "v12.1d", "v13.1d", "v14.1d", "v15.1d",
+      "v16.1d", "v17.1d", "v18.1d", "v19.1d",
+      "v20.1d", "v21.1d", "v22.1d", "v23.1d",
+      "v24.1d", "v25.1d", "v26.1d", "v27.1d",
+      "v28.1d", "v29.1d", "v30.1d", "v31.1d"
+    },
+    {
+      "v0.2d", "v1.2d", "v2.2d", "v3.2d",
+      "v4.2d", "v5.2d", "v6.2d", "v7.2d",
+      "v8.2d", "v9.2d", "v10.2d", "v11.2d",
+      "v12.2d", "v13.2d", "v14.2d", "v15.2d",
+      "v16.2d", "v17.2d", "v18.2d", "v19.2d",
+      "v20.2d", "v21.2d", "v22.2d", "v23.2d",
+      "v24.2d", "v25.2d", "v26.2d", "v27.2d",
+      "v28.2d", "v29.2d", "v30.2d", "v31.2d"
+    }
+  };
+  int size_idx;
+
+  if (reg < ORC_VEC_REG_BASE || reg >= ORC_VEC_REG_BASE+32) {
+    return "ERROR";
+  }
+
+  size_idx = -1;
+  while (size) {
+    size_idx++;
+    size >>= 1;
+  }
+
+  if (size_idx < 0 || size_idx >= 4) {
+    return "ERROR";
+  }
+
+  if (quad != 0 && quad != 1) {
+    return "ERROR";
+  }
+
+  return vec_regs[size_idx*2+quad][reg&0x1f];
+}
+
+/** a single element from a SIMD vector register as a scalar operand */
+const char *orc_neon64_reg_name_vector_single (int reg, int size)
+{
+  static const char *vec_regs[4][32] = {
+    {
+      "v0.b", "v1.b", "v2.b", "v3.b",
+      "v4.b", "v5.b", "v6.b", "v7.b",
+      "v8.b", "v9.b", "v10.b", "v11.b",
+      "v12.b", "v13.b", "v14.b", "v15.b",
+      "v16.b", "v17.b", "v18.b", "v19.b",
+      "v20.b", "v21.b", "v22.b", "v23.b",
+      "v24.b", "v25.b", "v26.b", "v27.b",
+      "v28.b", "v29.b", "v30.b", "v31.b"
+    },
+    {
+      "v0.h", "v1.h", "v2.h", "v3.h",
+      "v4.h", "v5.h", "v6.h", "v7.h",
+      "v8.h", "v9.h", "v10.h", "v11.h",
+      "v12.h", "v13.h", "v14.h", "v15.h",
+      "v16.h", "v17.h", "v18.h", "v19.h",
+      "v20.h", "v21.h", "v22.h", "v23.h",
+      "v24.h", "v25.h", "v26.h", "v27.h",
+      "v28.h", "v29.h", "v30.h", "v31.h"
+    },
+    {
+      "v0.s", "v1.s", "v2.s", "v3.s",
+      "v4.s", "v5.s", "v6.s", "v7.s",
+      "v8.s", "v9.s", "v10.s", "v11.s",
+      "v12.s", "v13.s", "v14.s", "v15.s",
+      "v16.s", "v17.s", "v18.s", "v19.s",
+      "v20.s", "v21.s", "v22.s", "v23.s",
+      "v24.s", "v25.s", "v26.s", "v27.s",
+      "v28.s", "v29.s", "v30.s", "v31.s"
+    },
+    {
+      "v0.d", "v1.d", "v2.d", "v3.d",
+      "v4.d", "v5.d", "v6.d", "v7.d",
+      "v8.d", "v9.d", "v10.d", "v11.d",
+      "v12.d", "v13.d", "v14.d", "v15.d",
+      "v16.d", "v17.d", "v18.d", "v19.d",
+      "v20.d", "v21.d", "v22.d", "v23.d",
+      "v24.d", "v25.d", "v26.d", "v27.d",
+      "v28.d", "v29.d", "v30.d", "v31.d"
+    },
+  };
+
+  int size_idx;
+
+  if (reg < ORC_VEC_REG_BASE || reg >= ORC_VEC_REG_BASE+32) {
+    return "ERROR";
+  }
+
+  size_idx = -1;
+  while (size) {
+    size_idx++;
+    size >>= 1;
+  }
+
+  if (size_idx < 0 || size_idx >= 4) {
+    return "ERROR";
+  }
+
+  return vec_regs[size_idx][reg&0x1f];
+}
+
 static void
 orc_neon_emit_binary (OrcCompiler *p, const char *name, unsigned int code,
     int dest, int src1, int src2)
@@ -74,6 +321,30 @@ orc_neon_emit_binary (OrcCompiler *p, const char *name, unsigned int code,
   orc_arm_emit (p, code);
 }
 
+static void
+orc_neon64_emit_binary (OrcCompiler *p, const char *name, unsigned int code,
+    OrcVariable dest, OrcVariable src1, OrcVariable src2, int vec_shift)
+{
+  int is_quad = 0;
+
+  if (p->insn_shift == vec_shift + 1) {
+    is_quad = 1;
+  } else if (p->insn_shift > vec_shift + 1) {
+    ORC_COMPILER_ERROR(p, "out-of-shift");
+    return;
+  }
+
+  ORC_ASM_CODE(p,"  %s %s, %s, %s\n", name,
+      orc_neon64_reg_name_vector (dest.alloc, dest.size, is_quad),
+      orc_neon64_reg_name_vector (src1.alloc, src1.size, is_quad),
+      orc_neon64_reg_name_vector (src2.alloc, src2.size, is_quad));
+  code |= (is_quad&0x1)<<30;
+  code |= (src2.alloc&0x1f)<<16;
+  code |= (src1.alloc&0x1f)<<5;
+  code |= (dest.alloc&0x1f);
+  orc_arm_emit (p, code);
+}
+
 #define NEON_BINARY(code,a,b,c) \
   ((code) | \
    (((a)&0xf)<<12) | \
@@ -845,85 +1116,178 @@ neon_rule_loadX (OrcCompiler *compiler, void *user, OrcInstruction *insn)
     ptr_register = src->ptr_register;
   }
 
-  if (size >= 8) {
-    if (is_aligned) {
-      if (size == 32) {
-        ORC_ASM_CODE(compiler,"  vld1.64 { %s, %s, %s, %s }, [%s,:256]%s\n",
-            orc_neon_reg_name (dest->alloc),
-            orc_neon_reg_name (dest->alloc + 1),
-            orc_neon_reg_name (dest->alloc + 2),
-            orc_neon_reg_name (dest->alloc + 3),
-            orc_arm_reg_name (ptr_register),
-            update ? "!" : "");
-        code = 0xf42002dd;
-      } else if (size == 16) {
-        ORC_ASM_CODE(compiler,"  vld1.64 { %s, %s }, [%s,:128]%s\n",
-            orc_neon_reg_name (dest->alloc),
-            orc_neon_reg_name (dest->alloc + 1),
-            orc_arm_reg_name (ptr_register),
-            update ? "!" : "");
-        code = 0xf4200aed;
-      } else if (size == 8) {
-        ORC_ASM_CODE(compiler,"  vld1.64 %s, [%s]%s\n",
-            orc_neon_reg_name (dest->alloc),
-            orc_arm_reg_name (ptr_register),
-            update ? "!" : "");
-        code = 0xf42007cd;
+  if (compiler->is_64bit) {
+    int opcode, flag;
+
+    if (size >= 16) {
+      /** load multiple single-element structures to one, two, three, or four registers */
+      char vt_str[64];
+
+      memset(vt_str, '\x00', 64);
+
+      if (is_aligned) {
+        if (size == 64) {
+          snprintf(vt_str, 64, "%s, %s, %s, %s",
+              orc_neon64_reg_name_vector (dest->alloc, 8, 1),
+              orc_neon64_reg_name_vector (dest->alloc + 1, 8, 1),
+              orc_neon64_reg_name_vector (dest->alloc + 2, 8, 1),
+              orc_neon64_reg_name_vector (dest->alloc + 3, 8, 1));
+          opcode = 2;
+        } else if (size == 32) {
+          snprintf(vt_str, 64, "%s, %s",
+              orc_neon64_reg_name_vector (dest->alloc, 8, 1),
+              orc_neon64_reg_name_vector (dest->alloc + 1, 8, 1));
+          opcode = 10;
+        } else if (size == 16) {
+          snprintf(vt_str, 64, "%s",
+              orc_neon64_reg_name_vector (dest->alloc, 8, 1));
+          opcode = 7;
+        } else {
+          ORC_COMPILER_ERROR(compiler,"bad aligned load size %d",
+              src->size << compiler->insn_shift);
+          return;
+        }
+        flag = 7;
       } else {
-        ORC_COMPILER_ERROR(compiler,"bad aligned load size %d",
-            src->size << compiler->insn_shift);
+        if (size == 64) {
+          snprintf(vt_str, 64, "%s, %s, %s, %s",
+              orc_neon64_reg_name_vector (dest->alloc, 1, 1),
+              orc_neon64_reg_name_vector (dest->alloc + 1, 1, 1),
+              orc_neon64_reg_name_vector (dest->alloc + 2, 1, 1),
+              orc_neon64_reg_name_vector (dest->alloc + 3, 1, 1));
+          opcode = 2;
+        } else if (size == 32) {
+          snprintf(vt_str, 64, "%s, %s",
+              orc_neon64_reg_name_vector (dest->alloc, 1, 1),
+              orc_neon64_reg_name_vector (dest->alloc + 1, 1, 1));
+          opcode = 10;
+        } else if (size == 16) {
+          snprintf(vt_str, 64, "%s",
+              orc_neon64_reg_name_vector (dest->alloc, 1, 1));
+          opcode = 7;
+        } else {
+          ORC_COMPILER_ERROR(compiler,"bad unaligned load size %d",
+              src->size << compiler->insn_shift);
+          return;
+        }
+        flag = 1;
       }
+      ORC_ASM_CODE(compiler,"  ld1 { %s }, [%s]\n",
+          vt_str, orc_arm64_reg_name (ptr_register, 64));
+      code = 0x0c400000;
+      code |= (flag&0x1) << 30;
+      code |= (flag&0x3) << 10;
+      code |= (opcode&0xf) << 12;
     } else {
-      if (size == 32) {
-        ORC_ASM_CODE(compiler,"  vld1.8 { %s, %s, %s, %s }, [%s]%s\n",
-            orc_neon_reg_name (dest->alloc),
-            orc_neon_reg_name (dest->alloc + 1),
-            orc_neon_reg_name (dest->alloc + 2),
-            orc_neon_reg_name (dest->alloc + 3),
-            orc_arm_reg_name (ptr_register),
-            update ? "!" : "");
-        code = 0xf420020d;
-      } else if (size == 16) {
-        ORC_ASM_CODE(compiler,"  vld1.8 { %s, %s }, [%s]%s\n",
-            orc_neon_reg_name (dest->alloc),
-            orc_neon_reg_name (dest->alloc + 1),
-            orc_arm_reg_name (ptr_register),
-            update ? "!" : "");
-        code = 0xf4200a0d;
-      } else if (size == 8) {
-        ORC_ASM_CODE(compiler,"  vld1.8 %s, [%s]%s\n",
-            orc_neon_reg_name (dest->alloc),
-            orc_arm_reg_name (ptr_register),
-            update ? "!" : "");
-        code = 0xf420070d;
+      /** load one single-element structure to one lane of one register */
+      flag = 0;
+      if (size == 8) {
+        opcode = 4;
+        flag = 1;
+      } else if (size == 4) {
+        opcode = 4;
+      } else if (size == 2) {
+        opcode = 2;
+      } else if (size == 1) {
+        opcode = 1;
       } else {
         ORC_COMPILER_ERROR(compiler,"bad unaligned load size %d",
             src->size << compiler->insn_shift);
+        return;
       }
+      ORC_ASM_CODE(compiler,"  ld1 { %s }[0], [%s]\n",
+          orc_neon64_reg_name_vector_single (dest->alloc, size),
+          orc_arm64_reg_name (ptr_register, 64));
+      code = 0x0d400000;
+      code |= (opcode&0x7) << 13;
+      code |= (flag&0x3) << 10;
     }
+
+    code |= (ptr_register&0x1f) << 5;
+    code |= (dest->alloc&0x1f);
+
+    orc_arm_emit (compiler, code);
   } else {
-    int shift;
-    if (size == 4) {
-      shift = 2;
-    } else if (size == 2) {
-      shift = 1;
+    if (size >= 8) {
+      if (is_aligned) {
+        if (size == 32) {
+          ORC_ASM_CODE(compiler,"  vld1.64 { %s, %s, %s, %s }, [%s,:256]%s\n",
+              orc_neon_reg_name (dest->alloc),
+              orc_neon_reg_name (dest->alloc + 1),
+              orc_neon_reg_name (dest->alloc + 2),
+              orc_neon_reg_name (dest->alloc + 3),
+              orc_arm_reg_name (ptr_register),
+              update ? "!" : "");
+          code = 0xf42002dd;
+        } else if (size == 16) {
+          ORC_ASM_CODE(compiler,"  vld1.64 { %s, %s }, [%s,:128]%s\n",
+              orc_neon_reg_name (dest->alloc),
+              orc_neon_reg_name (dest->alloc + 1),
+              orc_arm_reg_name (ptr_register),
+              update ? "!" : "");
+          code = 0xf4200aed;
+        } else if (size == 8) {
+          ORC_ASM_CODE(compiler,"  vld1.64 %s, [%s]%s\n",
+              orc_neon_reg_name (dest->alloc),
+              orc_arm_reg_name (ptr_register),
+              update ? "!" : "");
+          code = 0xf42007cd;
+        } else {
+          ORC_COMPILER_ERROR(compiler,"bad aligned load size %d",
+              src->size << compiler->insn_shift);
+        }
+      } else {
+        if (size == 32) {
+          ORC_ASM_CODE(compiler,"  vld1.8 { %s, %s, %s, %s }, [%s]%s\n",
+              orc_neon_reg_name (dest->alloc),
+              orc_neon_reg_name (dest->alloc + 1),
+              orc_neon_reg_name (dest->alloc + 2),
+              orc_neon_reg_name (dest->alloc + 3),
+              orc_arm_reg_name (ptr_register),
+              update ? "!" : "");
+          code = 0xf420020d;
+        } else if (size == 16) {
+          ORC_ASM_CODE(compiler,"  vld1.8 { %s, %s }, [%s]%s\n",
+              orc_neon_reg_name (dest->alloc),
+              orc_neon_reg_name (dest->alloc + 1),
+              orc_arm_reg_name (ptr_register),
+              update ? "!" : "");
+          code = 0xf4200a0d;
+        } else if (size == 8) {
+          ORC_ASM_CODE(compiler,"  vld1.8 %s, [%s]%s\n",
+              orc_neon_reg_name (dest->alloc),
+              orc_arm_reg_name (ptr_register),
+              update ? "!" : "");
+          code = 0xf420070d;
+        } else {
+          ORC_COMPILER_ERROR(compiler,"bad unaligned load size %d",
+              src->size << compiler->insn_shift);
+        }
+      }
     } else {
-      shift = 0;
+      int shift;
+      if (size == 4) {
+        shift = 2;
+      } else if (size == 2) {
+        shift = 1;
+      } else {
+        shift = 0;
+      }
+      ORC_ASM_CODE(compiler,"  vld1.%d %s[0], [%s]%s\n",
+          8<<shift,
+          orc_neon_reg_name (dest->alloc),
+          orc_arm_reg_name (ptr_register),
+          update ? "!" : "");
+      code = 0xf4a0000d;
+      code |= shift<<10;
+      code |= (0&7)<<5;
     }
-    ORC_ASM_CODE(compiler,"  vld1.%d %s[0], [%s]%s\n",
-        8<<shift,
-        orc_neon_reg_name (dest->alloc),
-        orc_arm_reg_name (ptr_register),
-        update ? "!" : "");
-    code = 0xf4a0000d;
-    code |= shift<<10;
-    code |= (0&7)<<5;
+    code |= (ptr_register&0xf) << 16;
+    code |= (dest->alloc&0xf) << 12;
+    code |= ((dest->alloc>>4)&0x1) << 22;
+    code |= (!update) << 1;
+    orc_arm_emit (compiler, code);
   }
-  code |= (ptr_register&0xf) << 16;
-  code |= (dest->alloc&0xf) << 12;
-  code |= ((dest->alloc>>4)&0x1) << 22;
-  code |= (!update) << 1;
-  orc_arm_emit (compiler, code);
 }
 
 static void
@@ -935,86 +1299,178 @@ neon_rule_storeX (OrcCompiler *compiler, void *user, OrcInstruction *insn)
   unsigned int code = 0;
   int size = dest->size << compiler->insn_shift;
 
-  if (size >= 8) {
-    if (dest->is_aligned) {
-      if (size == 32) {
-        ORC_ASM_CODE(compiler,"  vst1.64 { %s, %s, %s, %s }, [%s,:256]%s\n",
-            orc_neon_reg_name (src->alloc),
-            orc_neon_reg_name (src->alloc + 1),
-            orc_neon_reg_name (src->alloc + 2),
-            orc_neon_reg_name (src->alloc + 3),
-            orc_arm_reg_name (dest->ptr_register),
-            update ? "!" : "");
-        code = 0xf40002dd;
-      } else if (size == 16) {
-        ORC_ASM_CODE(compiler,"  vst1.64 { %s, %s }, [%s,:128]%s\n",
-            orc_neon_reg_name (src->alloc),
-            orc_neon_reg_name (src->alloc + 1),
-            orc_arm_reg_name (dest->ptr_register),
-            update ? "!" : "");
-        code = 0xf4000aed;
-      } else if (size == 8) {
-        ORC_ASM_CODE(compiler,"  vst1.64 %s, [%s]%s\n",
-            orc_neon_reg_name (src->alloc),
-            orc_arm_reg_name (dest->ptr_register),
-            update ? "!" : "");
-        code = 0xf40007cd;
+  if (compiler->is_64bit) {
+    int opcode, flag;
+
+    if (size >= 16) {
+      /** store multiple single-element structures to one, two, three, or four registers */
+      char vt_str[64];
+
+      memset(vt_str, '\x00', 64);
+
+      if (dest->is_aligned) {
+        if (size == 64) {
+          snprintf(vt_str, 64, "%s, %s, %s, %s",
+              orc_neon64_reg_name_vector (src->alloc, 8, 1),
+              orc_neon64_reg_name_vector (src->alloc + 1, 8, 1),
+              orc_neon64_reg_name_vector (src->alloc + 2, 8, 1),
+              orc_neon64_reg_name_vector (src->alloc + 3, 8, 1));
+          opcode = 2;
+        } else if (size == 32) {
+          snprintf(vt_str, 64, "%s, %s",
+              orc_neon64_reg_name_vector (src->alloc, 8, 1),
+              orc_neon64_reg_name_vector (src->alloc + 1, 8, 1));
+          opcode = 10;
+        } else if (size == 16) {
+          snprintf(vt_str, 64, "%s",
+              orc_neon64_reg_name_vector (src->alloc, 8, 1));
+          opcode = 7;
+        } else {
+          ORC_COMPILER_ERROR(compiler,"bad aligned store size %d",
+              dest->size << compiler->insn_shift);
+          return;
+        }
+        flag = 7;
       } else {
-        ORC_COMPILER_ERROR(compiler,"bad aligned store size %d", size);
+        if (size == 64) {
+          snprintf(vt_str, 64, "%s, %s, %s, %s",
+              orc_neon64_reg_name_vector (src->alloc, 1, 1),
+              orc_neon64_reg_name_vector (src->alloc + 1, 1, 1),
+              orc_neon64_reg_name_vector (src->alloc + 2, 1, 1),
+              orc_neon64_reg_name_vector (src->alloc + 3, 1, 1));
+          opcode = 2;
+        } else if (size == 32) {
+          snprintf(vt_str, 64, "%s, %s",
+              orc_neon64_reg_name_vector (src->alloc, 1, 1),
+              orc_neon64_reg_name_vector (src->alloc + 1, 1, 1));
+          opcode = 10;
+        } else if (size == 16) {
+          snprintf(vt_str, 64, "%s",
+              orc_neon64_reg_name_vector (src->alloc, 1, 1));
+          opcode = 7;
+        } else {
+          ORC_COMPILER_ERROR(compiler,"bad unaligned store size %d",
+              dest->size << compiler->insn_shift);
+          return;
+        }
+        flag = 1;
       }
+      ORC_ASM_CODE(compiler,"  st1 { %s }, [%s]\n",
+          vt_str, orc_arm64_reg_name (dest->ptr_register, 64));
+      code = 0x0c000000;
+      code |= (flag&0x1) << 30;
+      code |= (flag&0x3) << 10;
+      code |= (opcode&0xf) << 12;
     } else {
-      if (size == 32) {
-        ORC_ASM_CODE(compiler,"  vst1.8 { %s, %s, %s, %s }, [%s]%s\n",
-            orc_neon_reg_name (src->alloc),
-            orc_neon_reg_name (src->alloc + 1),
-            orc_neon_reg_name (src->alloc + 2),
-            orc_neon_reg_name (src->alloc + 3),
-            orc_arm_reg_name (dest->ptr_register),
-            update ? "!" : "");
-        code = 0xf400020d;
-      } else if (size == 16) {
-        ORC_ASM_CODE(compiler,"  vst1.8 { %s, %s }, [%s]%s\n",
-            orc_neon_reg_name (src->alloc),
-            orc_neon_reg_name (src->alloc + 1),
-            orc_arm_reg_name (dest->ptr_register),
-            update ? "!" : "");
-        code = 0xf4000a0d;
-      } else if (size == 8) {
-        ORC_ASM_CODE(compiler,"  vst1.8 %s, [%s]%s\n",
-            orc_neon_reg_name (src->alloc),
-            orc_arm_reg_name (dest->ptr_register),
-            update ? "!" : "");
-        code = 0xf400070d;
+      /** store one single-element structure to one lane of one register */
+      flag = 0;
+      if (size == 8) {
+        opcode = 4;
+        flag = 1;
+      } else if (size == 4) {
+        opcode = 4;
+      } else if (size == 2) {
+        opcode = 2;
+      } else if (size == 1) {
+        opcode = 1;
       } else {
-        ORC_COMPILER_ERROR(compiler,"bad aligned store size %d", size);
+        ORC_COMPILER_ERROR(compiler,"bad unaligned store size %d",
+            dest->size << compiler->insn_shift);
+        return;
       }
+      ORC_ASM_CODE(compiler,"  st1 { %s }[0], [%s]\n",
+          orc_neon64_reg_name_vector_single (src->alloc, size),
+          orc_arm64_reg_name (dest->ptr_register, 64));
+      code = 0x0d000000;
+      code |= (opcode&0x7) << 13;
+      code |= (flag&0x3) << 10;
     }
+
+    code |= (dest->ptr_register&0x1f) << 5;
+    code |= (src->alloc&0x1f);
+
+    orc_arm_emit (compiler, code);
   } else {
-    int shift;
-    if (size == 4) {
-      shift = 2;
-    } else if (size == 2) {
-      shift = 1;
+    if (size >= 8) {
+      if (dest->is_aligned) {
+        if (size == 32) {
+          ORC_ASM_CODE(compiler,"  vst1.64 { %s, %s, %s, %s }, [%s,:256]%s\n",
+              orc_neon_reg_name (src->alloc),
+              orc_neon_reg_name (src->alloc + 1),
+              orc_neon_reg_name (src->alloc + 2),
+              orc_neon_reg_name (src->alloc + 3),
+              orc_arm_reg_name (dest->ptr_register),
+              update ? "!" : "");
+          code = 0xf40002dd;
+        } else if (size == 16) {
+          ORC_ASM_CODE(compiler,"  vst1.64 { %s, %s }, [%s,:128]%s\n",
+              orc_neon_reg_name (src->alloc),
+              orc_neon_reg_name (src->alloc + 1),
+              orc_arm_reg_name (dest->ptr_register),
+              update ? "!" : "");
+          code = 0xf4000aed;
+        } else if (size == 8) {
+          ORC_ASM_CODE(compiler,"  vst1.64 %s, [%s]%s\n",
+              orc_neon_reg_name (src->alloc),
+              orc_arm_reg_name (dest->ptr_register),
+              update ? "!" : "");
+          code = 0xf40007cd;
+        } else {
+          ORC_COMPILER_ERROR(compiler,"bad aligned store size %d", size);
+        }
+      } else {
+        if (size == 32) {
+          ORC_ASM_CODE(compiler,"  vst1.8 { %s, %s, %s, %s }, [%s]%s\n",
+              orc_neon_reg_name (src->alloc),
+              orc_neon_reg_name (src->alloc + 1),
+              orc_neon_reg_name (src->alloc + 2),
+              orc_neon_reg_name (src->alloc + 3),
+              orc_arm_reg_name (dest->ptr_register),
+              update ? "!" : "");
+          code = 0xf400020d;
+        } else if (size == 16) {
+          ORC_ASM_CODE(compiler,"  vst1.8 { %s, %s }, [%s]%s\n",
+              orc_neon_reg_name (src->alloc),
+              orc_neon_reg_name (src->alloc + 1),
+              orc_arm_reg_name (dest->ptr_register),
+              update ? "!" : "");
+          code = 0xf4000a0d;
+        } else if (size == 8) {
+          ORC_ASM_CODE(compiler,"  vst1.8 %s, [%s]%s\n",
+              orc_neon_reg_name (src->alloc),
+              orc_arm_reg_name (dest->ptr_register),
+              update ? "!" : "");
+          code = 0xf400070d;
+        } else {
+          ORC_COMPILER_ERROR(compiler,"bad aligned store size %d", size);
+        }
+      }
     } else {
-      shift = 0;
+      int shift;
+      if (size == 4) {
+        shift = 2;
+      } else if (size == 2) {
+        shift = 1;
+      } else {
+        shift = 0;
+      }
+      ORC_ASM_CODE(compiler,"  vst1.%d %s[0], [%s]%s\n",
+          8<<shift,
+          orc_neon_reg_name (src->alloc),
+          orc_arm_reg_name (dest->ptr_register),
+          update ? "!" : "");
+      code = 0xf480000d;
+      code |= shift<<10;
+      code |= (0&7)<<5;
     }
-    ORC_ASM_CODE(compiler,"  vst1.%d %s[0], [%s]%s\n",
-        8<<shift,
-        orc_neon_reg_name (src->alloc),
-        orc_arm_reg_name (dest->ptr_register),
-        update ? "!" : "");
-    code = 0xf480000d;
-    code |= shift<<10;
-    code |= (0&7)<<5;
+    code |= (dest->ptr_register&0xf) << 16;
+    code |= (src->alloc&0xf) << 12;
+    code |= ((src->alloc>>4)&0x1) << 22;
+    code |= (!update) << 1;
+    orc_arm_emit (compiler, code);
   }
-  code |= (dest->ptr_register&0xf) << 16;
-  code |= (src->alloc&0xf) << 12;
-  code |= ((src->alloc>>4)&0x1) << 22;
-  code |= (!update) << 1;
-  orc_arm_emit (compiler, code);
 }
 
-
 #if 0
 static int
 orc_neon_get_const_shift (unsigned int value)
@@ -1333,7 +1789,7 @@ orc_neon_emit_loadpq (OrcCompiler *compiler, int dest, int param)
   orc_arm_emit (compiler, code);
 }
 
-#define UNARY(opcode,insn_name,code,vec_shift) \
+#define UNARY(opcode,insn_name,code,insn_name64,code64,vec_shift) \
 static void \
 orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
 { \
@@ -1350,7 +1806,7 @@ orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
   } \
 }
 
-#define UNARY_LONG(opcode,insn_name,code,vec_shift) \
+#define UNARY_LONG(opcode,insn_name,code,insn_name64,code64,vec_shift) \
 static void \
 orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
 { \
@@ -1363,7 +1819,7 @@ orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
   } \
 }
 
-#define UNARY_NARROW(opcode,insn_name,code,vec_shift) \
+#define UNARY_NARROW(opcode,insn_name,code,insn_name64,code64,vec_shift) \
 static void \
 orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
 { \
@@ -1376,26 +1832,37 @@ orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
   } \
 }
 
-#define BINARY(opcode,insn_name,code,vec_shift) \
+#define BINARY(opcode,insn_name,code,insn_name64,code64,vec_shift) \
 static void \
 orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
 { \
-  if (p->insn_shift <= vec_shift) { \
-    orc_neon_emit_binary (p, insn_name, code, \
-        p->vars[insn->dest_args[0]].alloc, \
-        p->vars[insn->src_args[0]].alloc, \
-        p->vars[insn->src_args[1]].alloc); \
-  } else if (p->insn_shift == vec_shift + 1) { \
-    orc_neon_emit_binary_quad (p, insn_name, code, \
-        p->vars[insn->dest_args[0]].alloc, \
-        p->vars[insn->src_args[0]].alloc, \
-        p->vars[insn->src_args[1]].alloc); \
+  if (p->is_64bit) { \
+    if (insn_name64) { \
+      orc_neon64_emit_binary (p, insn_name64, code64, \
+          p->vars[insn->dest_args[0]], \
+          p->vars[insn->src_args[0]], \
+          p->vars[insn->src_args[1]], vec_shift); \
+    } else { \
+      ORC_COMPILER_ERROR(p, "not supported in AArch64 yet"); \
+    } \
   } else { \
-    ORC_COMPILER_ERROR(p, "shift too large"); \
+    if (p->insn_shift <= vec_shift) { \
+      orc_neon_emit_binary (p, insn_name, code, \
+          p->vars[insn->dest_args[0]].alloc, \
+          p->vars[insn->src_args[0]].alloc, \
+          p->vars[insn->src_args[1]].alloc); \
+    } else if (p->insn_shift == vec_shift + 1) { \
+      orc_neon_emit_binary_quad (p, insn_name, code, \
+          p->vars[insn->dest_args[0]].alloc, \
+          p->vars[insn->src_args[0]].alloc, \
+          p->vars[insn->src_args[1]].alloc); \
+    } else { \
+      ORC_COMPILER_ERROR(p, "shift too large"); \
+    } \
   } \
 }
 
-#define BINARY_LONG(opcode,insn_name,code,vec_shift) \
+#define BINARY_LONG(opcode,insn_name,code,insn_name64,code64,vec_shift) \
 static void \
 orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
 { \
@@ -1409,7 +1876,7 @@ orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
   } \
 }
 
-#define BINARY_NARROW(opcode,insn_name,code,vec_shift) \
+#define BINARY_NARROW(opcode,insn_name,code,insn_name64,code64,vec_shift) \
 static void \
 orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
 { \
@@ -1423,7 +1890,7 @@ orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
   } \
 }
 
-#define MOVE(opcode,insn_name,code,vec_shift) \
+#define MOVE(opcode,insn_name,code,insn_name64,code64,vec_shift) \
 static void \
 orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
 { \
@@ -1445,7 +1912,6 @@ orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
   } \
 }
 
-
 typedef struct {
   orc_uint32 code;
   char *name;
@@ -1643,148 +2109,148 @@ orc_neon_rule_andn (OrcCompiler *p, void *user, OrcInstruction *insn)
 
 
 
-UNARY(absb,"vabs.s8",0xf3b10300, 3)
-BINARY(addb,"vadd.i8",0xf2000800, 3)
-BINARY(addssb,"vqadd.s8",0xf2000010, 3)
-BINARY(addusb,"vqadd.u8",0xf3000010, 3)
-BINARY(andb,"vand",0xf2000110, 3)
-/* BINARY(andnb,"vbic",0xf2100110, 3) */
-BINARY(avgsb,"vrhadd.s8",0xf2000100, 3)
-BINARY(avgub,"vrhadd.u8",0xf3000100, 3)
-BINARY(cmpeqb,"vceq.i8",0xf3000810, 3)
-BINARY(cmpgtsb,"vcgt.s8",0xf2000300, 3)
-MOVE(copyb,"vmov",0xf2200110, 3)
-BINARY(maxsb,"vmax.s8",0xf2000600, 3)
-BINARY(maxub,"vmax.u8",0xf3000600, 3)
-BINARY(minsb,"vmin.s8",0xf2000610, 3)
-BINARY(minub,"vmin.u8",0xf3000610, 3)
-BINARY(mullb,"vmul.i8",0xf2000910, 3)
-BINARY(orb,"vorr",0xf2200110, 3)
-/* LSHIFT(shlb,"vshl.i8",0xf2880510, 3) */
-/* RSHIFT(shrsb,"vshr.s8",0xf2880010,8, 3) */
-/* RSHIFT(shrub,"vshr.u8",0xf3880010,8, 3) */
-BINARY(subb,"vsub.i8",0xf3000800, 3)
-BINARY(subssb,"vqsub.s8",0xf2000210, 3)
-BINARY(subusb,"vqsub.u8",0xf3000210, 3)
-BINARY(xorb,"veor",0xf3000110, 3)
-
-UNARY(absw,"vabs.s16",0xf3b50300, 2)
-BINARY(addw,"vadd.i16",0xf2100800, 2)
-BINARY(addssw,"vqadd.s16",0xf2100010, 2)
-BINARY(addusw,"vqadd.u16",0xf3100010, 2)
-BINARY(andw,"vand",0xf2000110, 2)
-/* BINARY(andnw,"vbic",0xf2100110, 2) */
-BINARY(avgsw,"vrhadd.s16",0xf2100100, 2)
-BINARY(avguw,"vrhadd.u16",0xf3100100, 2)
-BINARY(cmpeqw,"vceq.i16",0xf3100810, 2)
-BINARY(cmpgtsw,"vcgt.s16",0xf2100300, 2)
-MOVE(copyw,"vmov",0xf2200110, 2)
-BINARY(maxsw,"vmax.s16",0xf2100600, 2)
-BINARY(maxuw,"vmax.u16",0xf3100600, 2)
-BINARY(minsw,"vmin.s16",0xf2100610, 2)
-BINARY(minuw,"vmin.u16",0xf3100610, 2)
-BINARY(mullw,"vmul.i16",0xf2100910, 2)
-BINARY(orw,"vorr",0xf2200110, 2)
-/* LSHIFT(shlw,"vshl.i16",0xf2900510, 2) */
-/* RSHIFT(shrsw,"vshr.s16",0xf2900010,16, 2) */
-/* RSHIFT(shruw,"vshr.u16",0xf3900010,16, 2) */
-BINARY(subw,"vsub.i16",0xf3100800, 2)
-BINARY(subssw,"vqsub.s16",0xf2100210, 2)
-BINARY(subusw,"vqsub.u16",0xf3100210, 2)
-BINARY(xorw,"veor",0xf3000110, 2)
-
-UNARY(absl,"vabs.s32",0xf3b90300, 1)
-BINARY(addl,"vadd.i32",0xf2200800, 1)
-BINARY(addssl,"vqadd.s32",0xf2200010, 1)
-BINARY(addusl,"vqadd.u32",0xf3200010, 1)
-BINARY(andl,"vand",0xf2000110, 1)
-/* BINARY(andnl,"vbic",0xf2100110, 1) */
-BINARY(avgsl,"vrhadd.s32",0xf2200100, 1)
-BINARY(avgul,"vrhadd.u32",0xf3200100, 1)
-BINARY(cmpeql,"vceq.i32",0xf3200810, 1)
-BINARY(cmpgtsl,"vcgt.s32",0xf2200300, 1)
-MOVE(copyl,"vmov",0xf2200110, 1)
-BINARY(maxsl,"vmax.s32",0xf2200600, 1)
-BINARY(maxul,"vmax.u32",0xf3200600, 1)
-BINARY(minsl,"vmin.s32",0xf2200610, 1)
-BINARY(minul,"vmin.u32",0xf3200610, 1)
-BINARY(mulll,"vmul.i32",0xf2200910, 1)
-BINARY(orl,"vorr",0xf2200110, 1)
-/* LSHIFT(shll,"vshl.i32",0xf2a00510, 1) */
-/* RSHIFT(shrsl,"vshr.s32",0xf2a00010,32, 1) */
-/* RSHIFT(shrul,"vshr.u32",0xf3a00010,32, 1) */
-BINARY(subl,"vsub.i32",0xf3200800, 1)
-BINARY(subssl,"vqsub.s32",0xf2200210, 1)
-BINARY(subusl,"vqsub.u32",0xf3200210, 1)
-BINARY(xorl,"veor",0xf3000110, 1)
-
-/* UNARY(absq,"vabs.s64",0xf3b10300, 0) */
-BINARY(addq,"vadd.i64",0xf2300800, 0)
-/* BINARY(addssq,"vqadd.s64",0xf2000010, 0) */
-/* BINARY(addusq,"vqadd.u64",0xf3000010, 0) */
-BINARY(andq,"vand",0xf2000110, 0)
-/* BINARY(avgsq,"vrhadd.s64",0xf2000100, 0) */
-/* BINARY(avguq,"vrhadd.u64",0xf3000100, 0) */
-/* BINARY(cmpeqq,"vceq.i64",0xf3000810, 0) */
-/* BINARY(cmpgtsq,"vcgt.s64",0xf2000300, 0) */
-MOVE(copyq,"vmov",0xf2200110, 0)
-/* BINARY(maxsq,"vmax.s64",0xf2000600, 0) */
-/* BINARY(maxuq,"vmax.u64",0xf3000600, 0) */
-/* BINARY(minsq,"vmin.s64",0xf2000610, 0) */
-/* BINARY(minuq,"vmin.u64",0xf3000610, 0) */
-/* BINARY(mullq,"vmul.i64",0xf2000910, 0) */
-BINARY(orq,"vorr",0xf2200110, 0)
-BINARY(subq,"vsub.i64",0xf3300800, 0)
-/* BINARY(subssq,"vqsub.s64",0xf2000210, 0) */
-/* BINARY(subusq,"vqsub.u64",0xf3000210, 0) */
-BINARY(xorq,"veor",0xf3000110, 0)
-
-UNARY_LONG(convsbw,"vmovl.s8",0xf2880a10, 3)
-UNARY_LONG(convubw,"vmovl.u8",0xf3880a10, 3)
-UNARY_LONG(convswl,"vmovl.s16",0xf2900a10, 2)
-UNARY_LONG(convuwl,"vmovl.u16",0xf3900a10, 2)
-UNARY_LONG(convslq,"vmovl.s32",0xf2a00a10, 1)
-UNARY_LONG(convulq,"vmovl.u32",0xf3a00a10, 1)
-UNARY_NARROW(convwb,"vmovn.i16",0xf3b20200, 3)
-UNARY_NARROW(convssswb,"vqmovn.s16",0xf3b20280, 3)
-UNARY_NARROW(convsuswb,"vqmovun.s16",0xf3b20240, 3)
-UNARY_NARROW(convuuswb,"vqmovn.u16",0xf3b202c0, 3)
-UNARY_NARROW(convlw,"vmovn.i32",0xf3b60200, 2)
-UNARY_NARROW(convql,"vmovn.i64",0xf3ba0200, 1)
-UNARY_NARROW(convssslw,"vqmovn.s32",0xf3b60280, 2)
-UNARY_NARROW(convsuslw,"vqmovun.s32",0xf3b60240, 2)
-UNARY_NARROW(convuuslw,"vqmovn.u32",0xf3b602c0, 2)
-UNARY_NARROW(convsssql,"vqmovn.s64",0xf3ba0280, 1)
-UNARY_NARROW(convsusql,"vqmovun.s64",0xf3ba0240, 1)
-UNARY_NARROW(convuusql,"vqmovn.u64",0xf3ba02c0, 1)
-
-BINARY_LONG(mulsbw,"vmull.s8",0xf2800c00, 3)
-BINARY_LONG(mulubw,"vmull.u8",0xf3800c00, 3)
-BINARY_LONG(mulswl,"vmull.s16",0xf2900c00, 2)
-BINARY_LONG(muluwl,"vmull.u16",0xf3900c00, 2)
-
-UNARY(swapw,"vrev16.i8",0xf3b00100, 2)
-UNARY(swapl,"vrev32.i8",0xf3b00080, 1)
-UNARY(swapq,"vrev64.i8",0xf3b00000, 0)
-UNARY(swapwl,"vrev32.i16",0xf3b40080, 1)
-UNARY(swaplq,"vrev64.i32",0xf3b80000, 0)
-
-UNARY_NARROW(select0ql,"vmovn.i64",0xf3ba0200, 1)
-UNARY_NARROW(select0lw,"vmovn.i32",0xf3b60200, 2)
-UNARY_NARROW(select0wb,"vmovn.i16",0xf3b20200, 3)
-
-BINARY(addf,"vadd.f32",0xf2000d00, 1)
-BINARY(subf,"vsub.f32",0xf2200d00, 1)
-BINARY(mulf,"vmul.f32",0xf3000d10, 1)
-BINARY(maxf,"vmax.f32",0xf2000f00, 1)
-BINARY(minf,"vmin.f32",0xf2200f00, 1)
-BINARY(cmpeqf,"vceq.f32",0xf2000e00, 1)
-/* BINARY_R(cmpltf,"vclt.f32",0xf3200e00, 1) */
-/* BINARY_R(cmplef,"vcle.f32",0xf3000e00, 1) */
-UNARY(convfl,"vcvt.s32.f32",0xf3bb0700, 1)
-UNARY(convlf,"vcvt.f32.s32",0xf3bb0600, 1)
-
-#define UNARY_VFP(opcode,insn_name,code,vec_shift) \
+UNARY(absb,"vabs.s8",0xf3b10300, NULL, 0, 3)
+BINARY(addb,"vadd.i8",0xf2000800, "add", 0x0e208400, 3)
+BINARY(addssb,"vqadd.s8",0xf2000010, "sqadd", 0x0e200c00, 3)
+BINARY(addusb,"vqadd.u8",0xf3000010, "uqadd", 0x2e200c00, 3)
+BINARY(andb,"vand",0xf2000110, NULL, 0, 3)
+/* BINARY(andnb,"vbic",0xf2100110, NULL, 0, 3) */
+BINARY(avgsb,"vrhadd.s8",0xf2000100, NULL, 0, 3)
+BINARY(avgub,"vrhadd.u8",0xf3000100, NULL, 0, 3)
+BINARY(cmpeqb,"vceq.i8",0xf3000810, NULL, 0, 3)
+BINARY(cmpgtsb,"vcgt.s8",0xf2000300, NULL, 0, 3)
+MOVE(copyb,"vmov",0xf2200110, NULL, 0, 3)
+BINARY(maxsb,"vmax.s8",0xf2000600, NULL, 0, 3)
+BINARY(maxub,"vmax.u8",0xf3000600, NULL, 0, 3)
+BINARY(minsb,"vmin.s8",0xf2000610, NULL, 0, 3)
+BINARY(minub,"vmin.u8",0xf3000610, NULL, 0, 3)
+BINARY(mullb,"vmul.i8",0xf2000910, NULL, 0, 3)
+BINARY(orb,"vorr",0xf2200110, NULL, 0, 3)
+/* LSHIFT(shlb,"vshl.i8",0xf2880510, NULL, 0, 3) */
+/* RSHIFT(shrsb,"vshr.s8",0xf2880010,8, NULL, 0, 3) */
+/* RSHIFT(shrub,"vshr.u8",0xf3880010,8, NULL, 0, 3) */
+BINARY(subb,"vsub.i8",0xf3000800, NULL, 0, 3)
+BINARY(subssb,"vqsub.s8",0xf2000210, NULL, 0, 3)
+BINARY(subusb,"vqsub.u8",0xf3000210, NULL, 0, 3)
+BINARY(xorb,"veor",0xf3000110, NULL, 0, 3)
+
+UNARY(absw,"vabs.s16",0xf3b50300, NULL, 0, 2)
+BINARY(addw,"vadd.i16",0xf2100800, "add", 0x0e608400, 2)
+BINARY(addssw,"vqadd.s16",0xf2100010, "sqadd", 0x0e600c00, 2)
+BINARY(addusw,"vqadd.u16",0xf3100010, "uqadd", 0x2e600c00, 2)
+BINARY(andw,"vand",0xf2000110, NULL, 0, 2)
+/* BINARY(andnw,"vbic",0xf2100110, NULL, 0, 2) */
+BINARY(avgsw,"vrhadd.s16",0xf2100100, NULL, 0, 2)
+BINARY(avguw,"vrhadd.u16",0xf3100100, NULL, 0, 2)
+BINARY(cmpeqw,"vceq.i16",0xf3100810, NULL, 0, 2)
+BINARY(cmpgtsw,"vcgt.s16",0xf2100300, NULL, 0, 2)
+MOVE(copyw,"vmov",0xf2200110, NULL, 0, 2)
+BINARY(maxsw,"vmax.s16",0xf2100600, NULL, 0, 2)
+BINARY(maxuw,"vmax.u16",0xf3100600, NULL, 0, 2)
+BINARY(minsw,"vmin.s16",0xf2100610, NULL, 0, 2)
+BINARY(minuw,"vmin.u16",0xf3100610, NULL, 0, 2)
+BINARY(mullw,"vmul.i16",0xf2100910, NULL, 0, 2)
+BINARY(orw,"vorr",0xf2200110, NULL, 0, 2)
+/* LSHIFT(shlw,"vshl.i16",0xf2900510, NULL, 0, 2) */
+/* RSHIFT(shrsw,"vshr.s16",0xf2900010,16, NULL, 0, 2) */
+/* RSHIFT(shruw,"vshr.u16",0xf3900010,16, NULL, 0, 2) */
+BINARY(subw,"vsub.i16",0xf3100800, NULL, 0, 2)
+BINARY(subssw,"vqsub.s16",0xf2100210, NULL, 0, 2)
+BINARY(subusw,"vqsub.u16",0xf3100210, NULL, 0, 2)
+BINARY(xorw,"veor",0xf3000110, NULL, 0, 2)
+
+UNARY(absl,"vabs.s32",0xf3b90300, NULL, 0, 1)
+BINARY(addl,"vadd.i32",0xf2200800, "add", 0x0ea08400, 1)
+BINARY(addssl,"vqadd.s32",0xf2200010, "sqadd", 0x0ea00c00, 1)
+BINARY(addusl,"vqadd.u32",0xf3200010, "uqadd", 0x2ea00c00, 1)
+BINARY(andl,"vand",0xf2000110, NULL, 0, 1)
+/* BINARY(andnl,"vbic",0xf2100110, NULL, 0, 1) */
+BINARY(avgsl,"vrhadd.s32",0xf2200100, NULL, 0, 1)
+BINARY(avgul,"vrhadd.u32",0xf3200100, NULL, 0, 1)
+BINARY(cmpeql,"vceq.i32",0xf3200810, NULL, 0, 1)
+BINARY(cmpgtsl,"vcgt.s32",0xf2200300, NULL, 0, 1)
+MOVE(copyl,"vmov",0xf2200110, NULL, 0, 1)
+BINARY(maxsl,"vmax.s32",0xf2200600, NULL, 0, 1)
+BINARY(maxul,"vmax.u32",0xf3200600, NULL, 0, 1)
+BINARY(minsl,"vmin.s32",0xf2200610, NULL, 0, 1)
+BINARY(minul,"vmin.u32",0xf3200610, NULL, 0, 1)
+BINARY(mulll,"vmul.i32",0xf2200910, NULL, 0, 1)
+BINARY(orl,"vorr",0xf2200110, NULL, 0, 1)
+/* LSHIFT(shll,"vshl.i32",0xf2a00510, NULL, 0, 1) */
+/* RSHIFT(shrsl,"vshr.s32",0xf2a00010,32, NULL, 0, 1) */
+/* RSHIFT(shrul,"vshr.u32",0xf3a00010,32, NULL, 0, 1) */
+BINARY(subl,"vsub.i32",0xf3200800, NULL, 0, 1)
+BINARY(subssl,"vqsub.s32",0xf2200210, NULL, 0, 1)
+BINARY(subusl,"vqsub.u32",0xf3200210, NULL, 0, 1)
+BINARY(xorl,"veor",0xf3000110, NULL, 0, 1)
+
+/* UNARY(absq,"vabs.s64",0xf3b10300, NULL, 0, 0) */
+BINARY(addq,"vadd.i64",0xf2300800, "add", 0x0ee08400, 0)
+/* BINARY(addssq,"vqadd.s64",0xf2000010, "sqadd", 0x0ee00c00, 0) */
+/* BINARY(addusq,"vqadd.u64",0xf3000010, "uqadd", 0x2ee00c00, 0) */
+BINARY(andq,"vand",0xf2000110, NULL, 0, 0)
+/* BINARY(avgsq,"vrhadd.s64",0xf2000100, NULL, 0, 0) */
+/* BINARY(avguq,"vrhadd.u64",0xf3000100, NULL, 0, 0) */
+/* BINARY(cmpeqq,"vceq.i64",0xf3000810, NULL, 0, 0) */
+/* BINARY(cmpgtsq,"vcgt.s64",0xf2000300, NULL, 0, 0) */
+MOVE(copyq,"vmov",0xf2200110, NULL, 0, 0)
+/* BINARY(maxsq,"vmax.s64",0xf2000600, NULL, 0, 0) */
+/* BINARY(maxuq,"vmax.u64",0xf3000600, NULL, 0, 0) */
+/* BINARY(minsq,"vmin.s64",0xf2000610, NULL, 0, 0) */
+/* BINARY(minuq,"vmin.u64",0xf3000610, NULL, 0, 0) */
+/* BINARY(mullq,"vmul.i64",0xf2000910, NULL, 0, 0) */
+BINARY(orq,"vorr",0xf2200110, NULL, 0, 0)
+BINARY(subq,"vsub.i64",0xf3300800, NULL, 0, 0)
+/* BINARY(subssq,"vqsub.s64",0xf2000210, NULL, 0, 0) */
+/* BINARY(subusq,"vqsub.u64",0xf3000210, NULL, 0, 0) */
+BINARY(xorq,"veor",0xf3000110, NULL, 0, 0)
+
+UNARY_LONG(convsbw,"vmovl.s8",0xf2880a10, NULL, 0, 3)
+UNARY_LONG(convubw,"vmovl.u8",0xf3880a10, NULL, 0, 3)
+UNARY_LONG(convswl,"vmovl.s16",0xf2900a10, NULL, 0, 2)
+UNARY_LONG(convuwl,"vmovl.u16",0xf3900a10, NULL, 0, 2)
+UNARY_LONG(convslq,"vmovl.s32",0xf2a00a10, NULL, 0, 1)
+UNARY_LONG(convulq,"vmovl.u32",0xf3a00a10, NULL, 0, 1)
+UNARY_NARROW(convwb,"vmovn.i16",0xf3b20200, NULL, 0, 3)
+UNARY_NARROW(convssswb,"vqmovn.s16",0xf3b20280, NULL, 0, 3)
+UNARY_NARROW(convsuswb,"vqmovun.s16",0xf3b20240, NULL, 0, 3)
+UNARY_NARROW(convuuswb,"vqmovn.u16",0xf3b202c0, NULL, 0, 3)
+UNARY_NARROW(convlw,"vmovn.i32",0xf3b60200, NULL, 0, 2)
+UNARY_NARROW(convql,"vmovn.i64",0xf3ba0200, NULL, 0, 1)
+UNARY_NARROW(convssslw,"vqmovn.s32",0xf3b60280, NULL, 0, 2)
+UNARY_NARROW(convsuslw,"vqmovun.s32",0xf3b60240, NULL, 0, 2)
+UNARY_NARROW(convuuslw,"vqmovn.u32",0xf3b602c0, NULL, 0, 2)
+UNARY_NARROW(convsssql,"vqmovn.s64",0xf3ba0280, NULL, 0, 1)
+UNARY_NARROW(convsusql,"vqmovun.s64",0xf3ba0240, NULL, 0, 1)
+UNARY_NARROW(convuusql,"vqmovn.u64",0xf3ba02c0, NULL, 0, 1)
+
+BINARY_LONG(mulsbw,"vmull.s8",0xf2800c00, NULL, 0, 3)
+BINARY_LONG(mulubw,"vmull.u8",0xf3800c00, NULL, 0, 3)
+BINARY_LONG(mulswl,"vmull.s16",0xf2900c00, NULL, 0, 2)
+BINARY_LONG(muluwl,"vmull.u16",0xf3900c00, NULL, 0, 2)
+
+UNARY(swapw,"vrev16.i8",0xf3b00100, NULL, 0, 2)
+UNARY(swapl,"vrev32.i8",0xf3b00080, NULL, 0, 1)
+UNARY(swapq,"vrev64.i8",0xf3b00000, NULL, 0, 0)
+UNARY(swapwl,"vrev32.i16",0xf3b40080, NULL, 0, 1)
+UNARY(swaplq,"vrev64.i32",0xf3b80000, NULL, 0, 0)
+
+UNARY_NARROW(select0ql,"vmovn.i64",0xf3ba0200, NULL, 0, 1)
+UNARY_NARROW(select0lw,"vmovn.i32",0xf3b60200, NULL, 0, 2)
+UNARY_NARROW(select0wb,"vmovn.i16",0xf3b20200, NULL, 0, 3)
+
+BINARY(addf,"vadd.f32",0xf2000d00, NULL, 0, 1)
+BINARY(subf,"vsub.f32",0xf2200d00, NULL, 0, 1)
+BINARY(mulf,"vmul.f32",0xf3000d10, NULL, 0, 1)
+BINARY(maxf,"vmax.f32",0xf2000f00, NULL, 0, 1)
+BINARY(minf,"vmin.f32",0xf2200f00, NULL, 0, 1)
+BINARY(cmpeqf,"vceq.f32",0xf2000e00, NULL, 0, 1)
+/* BINARY_R(cmpltf,"vclt.f32",0xf3200e00, NULL, 0, 1) */
+/* BINARY_R(cmplef,"vcle.f32",0xf3000e00, NULL, 0, 1) */
+UNARY(convfl,"vcvt.s32.f32",0xf3bb0700, NULL, 0, 1)
+UNARY(convlf,"vcvt.f32.s32",0xf3bb0600, NULL, 0, 1)
+
+#define UNARY_VFP(opcode,insn_name,code,insn_name64,code64,vec_shift) \
 static void \
 orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
 { \
@@ -1800,7 +2266,7 @@ orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
   } \
 }
 
-#define BINARY_VFP(opcode,insn_name,code,vec_shift) \
+#define BINARY_VFP(opcode,insn_name,code,insn_name64,code64,vec_shift) \
 static void \
 orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
 { \
@@ -1818,14 +2284,14 @@ orc_neon_rule_ ## opcode (OrcCompiler *p, void *user, OrcInstruction *insn) \
   } \
 }
 
-BINARY_VFP(addd,"vadd.f64",0xee300b00, 0)
-BINARY_VFP(subd,"vsub.f64",0xee300b40, 0)
-BINARY_VFP(muld,"vmul.f64",0xee200b00, 0)
-BINARY_VFP(divd,"vdiv.f64",0xee800b00, 0)
-UNARY_VFP(sqrtd,"vsqrt.f64",0xeeb10b00, 0)
-/* BINARY_VFP(cmpeqd,"vcmpe.f64",0xee000000, 0) */
-UNARY_VFP(convdf,"vcvt.f64.f32",0xee200b00, 0)
-UNARY_VFP(convfd,"vcvt.f32.f64",0xee200b00, 0)
+BINARY_VFP(addd,"vadd.f64",0xee300b00, NULL, 0, 0)
+BINARY_VFP(subd,"vsub.f64",0xee300b40, NULL, 0, 0)
+BINARY_VFP(muld,"vmul.f64",0xee200b00, NULL, 0, 0)
+BINARY_VFP(divd,"vdiv.f64",0xee800b00, NULL, 0, 0)
+UNARY_VFP(sqrtd,"vsqrt.f64",0xeeb10b00, NULL, 0, 0)
+/* BINARY_VFP(cmpeqd,"vcmpe.f64",0xee000000, NULL, 0, 0) */
+UNARY_VFP(convdf,"vcvt.f64.f32",0xee200b00, NULL, 0, 0)
+UNARY_VFP(convfd,"vcvt.f32.f64",0xee200b00, NULL, 0, 0)
 
 #if 1
 #define NUM_ITERS_DIVF 2