aarch64: orcprogram-neon porting to aarch64
author Dongju Chae <dongju.chae@samsung.com>
Mon, 21 Oct 2019 11:21:47 +0000 (20:21 +0900)
committer Marek Vasut <marex@denx.de>
Wed, 16 Sep 2020 12:27:27 +0000 (14:27 +0200)
This PR ports orcprogram-neon.c to AArch64.
It makes the orc compiler generate AArch64 assembly code.

Currently, example1 successfully runs on aarch64 (tested on an Android device).

Signed-off-by: Dongju Chae <dongju.chae@samsung.com>
orc/orcprogram-neon.c

index cf87249..6793622 100644 (file)
@@ -34,12 +34,15 @@ orc_neon_emit_prologue (OrcCompiler *compiler)
 {
   unsigned int regs = 0;
   orc_uint32 vregs = 0;
+  int num_gregs;
   int i;
 
   orc_compiler_append_code(compiler,".global %s\n", compiler->program->name);
   orc_compiler_append_code(compiler,"%s:\n", compiler->program->name);
 
-  for(i=0;i<16;i++){
+  num_gregs = compiler->is_64bit ? 32 : 16;
+
+  for(i=0;i<num_gregs;i++){
     if (compiler->used_regs[ORC_GP_REG_BASE + i] &&
         compiler->save_regs[ORC_GP_REG_BASE + i]) {
       regs |= (1<<i);
@@ -82,10 +85,13 @@ static void
 orc_neon_emit_epilogue (OrcCompiler *compiler)
 {
   int i;
+  int num_gregs;
   unsigned int regs = 0;
   orc_uint32 vregs = 0;
 
-  for(i=0;i<16;i++){
+  num_gregs = compiler->is_64bit ? 32 : 16;
+
+  for(i=0;i<num_gregs;i++){
     if (compiler->used_regs[ORC_GP_REG_BASE + i] &&
         compiler->save_regs[ORC_GP_REG_BASE + i]) {
       regs |= (1<<i);
@@ -157,36 +163,78 @@ orc_compiler_neon_init (OrcCompiler *compiler)
   int i;
   int loop_shift;
 
-  for(i=ORC_GP_REG_BASE;i<ORC_GP_REG_BASE+16;i++){
-    compiler->valid_regs[i] = 1;
+  if (compiler->target_flags & ORC_TARGET_NEON_64BIT) {
+    compiler->is_64bit = TRUE;
   }
-  for(i=ORC_VEC_REG_BASE+0;i<ORC_VEC_REG_BASE+32;i+=2){
-    compiler->valid_regs[i] = 1;
-  }
-  /* compiler->valid_regs[ORC_ARM_SB] = 0; */
-  compiler->valid_regs[ORC_ARM_IP] = 0;
-  compiler->valid_regs[ORC_ARM_SP] = 0;
-  compiler->valid_regs[ORC_ARM_LR] = 0;
-  compiler->valid_regs[ORC_ARM_PC] = 0;
-  for(i=4;i<12;i++) {
-    compiler->save_regs[ORC_GP_REG_BASE+i] = 1;
+
+  if (compiler->is_64bit) {
+    /** AArch64
+     * 31 64-bit generic-purpose registers (R0-R30) and SP
+     * 32 128-bit vector registers (do not overlap multiple registers in a narrower view)
+     * Note that PC is not a generic-purpose register in AArch64
+     */
+    for(i=ORC_GP_REG_BASE;i<ORC_GP_REG_BASE+32;i++){
+      compiler->valid_regs[i] = 1;
+    }
+    for(i=ORC_VEC_REG_BASE+0;i<ORC_VEC_REG_BASE+32;i++){
+      compiler->valid_regs[i] = 1;
+    }
+
+    compiler->valid_regs[ORC_ARM64_IP0] = 0;
+    compiler->valid_regs[ORC_ARM64_IP1] = 0;
+
+    compiler->valid_regs[ORC_ARM64_FP] = 0;
+    compiler->valid_regs[ORC_ARM64_LR] = 0;
+    compiler->valid_regs[ORC_ARM64_SP] = 0;
+
+    /** r19 to r29 are callee-saved */
+    for(i=19;i<29;i++) {
+      compiler->save_regs[ORC_GP_REG_BASE+i] = 1;
+    }
+  } else {
+    /** AArch32
+     * 16 32-bit generic-purpose registers (R0-R15)
+     * 32 64-bit vector registers (smaller registers are packed into larger ones)
+     */
+    for(i=ORC_GP_REG_BASE;i<ORC_GP_REG_BASE+16;i++){
+      compiler->valid_regs[i] = 1;
+    }
+    for(i=ORC_VEC_REG_BASE+0;i<ORC_VEC_REG_BASE+32;i+=2){
+      compiler->valid_regs[i] = 1;
+    }
+    /* compiler->valid_regs[ORC_ARM_SB] = 0; */
+    compiler->valid_regs[ORC_ARM_IP] = 0;
+    compiler->valid_regs[ORC_ARM_SP] = 0;
+    compiler->valid_regs[ORC_ARM_LR] = 0;
+    compiler->valid_regs[ORC_ARM_PC] = 0;
+
+    for(i=4;i<12;i++) {
+      compiler->save_regs[ORC_GP_REG_BASE+i] = 1;
+    }
   }
+
+  /** Both architectures have 8 callee-saved SIMD registers (v8-v15) */
   for(i=8;i<16;i++) {
     compiler->save_regs[ORC_VEC_REG_BASE+i] = 1;
   }
-  
+
   for(i=0;i<ORC_N_REGS;i++){
     compiler->alloc_regs[i] = 0;
     compiler->used_regs[i] = 0;
   }
 
   compiler->exec_reg = ORC_ARM_A1;
-  compiler->valid_regs[compiler->exec_reg] = 0;
   compiler->gp_tmpreg = ORC_ARM_A2;
+  if (compiler->is_64bit) {
+    compiler->tmpreg = ORC_VEC_REG_BASE + 0;
+    compiler->tmpreg2 = ORC_VEC_REG_BASE + 1;
+  } else {
+    compiler->tmpreg = ORC_VEC_REG_BASE + 0;
+    compiler->tmpreg2 = ORC_VEC_REG_BASE + 2;
+  }
+  compiler->valid_regs[compiler->exec_reg] = 0;
   compiler->valid_regs[compiler->gp_tmpreg] = 0;
-  compiler->tmpreg = ORC_VEC_REG_BASE + 0;
   compiler->valid_regs[compiler->tmpreg] = 0;
-  compiler->tmpreg2 = ORC_VEC_REG_BASE + 2;
   compiler->valid_regs[compiler->tmpreg2] = 0;
 
   loop_shift = 0;
@@ -311,9 +359,15 @@ orc_neon_load_constants_inner (OrcCompiler *compiler)
         break;
       case ORC_VAR_TYPE_SRC:
       case ORC_VAR_TYPE_DEST:
-        orc_arm_emit_load_reg (compiler, 
-            compiler->vars[i].ptr_register,
-            compiler->exec_reg, ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]));
+        if (compiler->is_64bit) {
+          orc_arm64_emit_load_reg (compiler, 32,
+              compiler->vars[i].ptr_register,
+              compiler->exec_reg, ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]));
+        } else {
+          orc_arm_emit_load_reg (compiler,
+              compiler->vars[i].ptr_register,
+              compiler->exec_reg, ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]));
+        }
         break;
       case ORC_VAR_TYPE_ACCUMULATOR:
         break;
@@ -476,194 +530,160 @@ enum {
   LABEL_L1L2_AFTER,
 };
 
+#define ORC_NEON_ALIGNED_DEST_CUTOFF 64
+
 static void
-orc_compiler_neon_assemble (OrcCompiler *compiler)
+orc_neon64_loop_shift (OrcCompiler *compiler)
 {
-  int align_var;
-  int align_shift;
-  int var_size_shift;
-  int i;
-  int set_fpscr = FALSE;
-  
-  align_var = get_align_var (compiler);
-  if (compiler->error) return;
-
-  var_size_shift = get_shift (compiler->vars[align_var].size);
-  align_shift = 4;
-
-  compiler->vars[align_var].is_aligned = FALSE;
-
-  orc_neon_emit_prologue (compiler);
-
-  if (orc_program_has_float (compiler)) {
-    set_fpscr = TRUE;
-    ORC_ASM_CODE (compiler,"  vmrs %s, fpscr\n", orc_arm_reg_name (compiler->gp_tmpreg));
-    orc_arm_emit (compiler, 0xeef10a10 | ((compiler->gp_tmpreg&0xf)<<12));
-    ORC_ASM_CODE (compiler,"  push %s\n", orc_arm_reg_name (compiler->gp_tmpreg));
-    orc_arm_emit (compiler, 0xe52d0004 | ((compiler->gp_tmpreg&0xf)<<12));
-
-    orc_arm_emit_load_imm (compiler, compiler->gp_tmpreg, 1<<24);
-    ORC_ASM_CODE (compiler,"  vmsr fpscr, %s\n", orc_arm_reg_name (compiler->gp_tmpreg));
-    orc_arm_emit (compiler, 0xeee10a10 | ((compiler->gp_tmpreg&0xf)<<12));
-  }
-
-  orc_neon_load_constants_outer (compiler);
-
-  if (compiler->program->is_2d) {
-    if (compiler->program->constant_m > 0) {
-      orc_arm_emit_load_imm (compiler, ORC_ARM_A3, compiler->program->constant_m);
-      orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
-          (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2]));
-    } else {
-      orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
-          (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A1]));
-      orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
-          (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2]));
-    }
-
-    orc_arm_emit_label (compiler, LABEL_OUTER_LOOP);
-  }
+  int align_var = get_align_var (compiler);
+  int var_size_shift = get_shift (compiler->vars[align_var].size);
+  int align_shift = 4;
 
-#define ORC_NEON_ALIGNED_DEST_CUTOFF 64
-
-  if (compiler->loop_shift > 0 && compiler->n_insns < 5) {
-    orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+  if (compiler->n_insns < 5) {
+    /** Get the number of loops (N) from OrcExecutor */
+    orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg,
         (int)ORC_STRUCT_OFFSET(OrcExecutor,n));
-    orc_arm_emit_cmp_imm (compiler, ORC_ARM_A3, ORC_NEON_ALIGNED_DEST_CUTOFF);
+
+    /** if N > ORC_NEON_ALIGNED_DEST_CUTOFF, go to LABEL_REGION0_SKIP */
+    orc_arm64_emit_cmp_imm (compiler, 32, ORC_ARM64_R2, ORC_NEON_ALIGNED_DEST_CUTOFF);
     orc_arm_emit_branch (compiler, ORC_ARM_COND_GT, LABEL_REGION0_SKIP);
 
-    orc_arm_emit_asr_imm (compiler, ORC_ARM_A2, ORC_ARM_A3,
+    /** counter2 = N >> loop shift */
+    orc_arm64_emit_asr_imm (compiler, 32, ORC_ARM64_R1, ORC_ARM64_R2,
         compiler->loop_shift);
-    orc_arm_emit_store_reg (compiler, ORC_ARM_A2, compiler->exec_reg,
+    orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R1, compiler->exec_reg,
         (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
 
-    orc_arm_emit_and_imm (compiler, ORC_ARM_A3, ORC_ARM_A3,
+    /** counter3 = N & loop shift */
+    orc_arm64_emit_and_imm (compiler, 32, ORC_ARM64_R2, ORC_ARM64_R2,
         (1<<compiler->loop_shift)-1);
-    orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+    orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg,
         (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
 
+    /** load function arguments */
     orc_neon_load_constants_inner (compiler);
-    orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
+
+    /** if counter2 == zero, go to LABEL_REGION2_SKIP */
+    orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_IP0, compiler->exec_reg,
         (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
-    orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0);
+    orc_arm64_emit_cmp_imm (compiler, 32, ORC_ARM64_IP0, 0);
     orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_SKIP);
 
+    /** vector calculation loop */
     compiler->size_region = 0;
     orc_arm_emit_label (compiler, LABEL_REGION0_LOOP);
-    orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
+    orc_arm64_emit_subs_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, 1);
+
+    /** vector instructions: @todo port to aarch64 */
     orc_neon_emit_loop (compiler, -1);
-    orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION0_LOOP);
-    
 
+    /** if counter2 != zero, repeat loop */
+    orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION0_LOOP);
+    /** else go to LABEL_REGION2_SKIP */
     orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP);
-
     orc_arm_emit_label (compiler, LABEL_REGION0_SKIP);
   }
 
-  if (compiler->loop_shift > 0) {
-    orc_arm_emit_load_imm (compiler, ORC_ARM_IP, 1<<align_shift);
-
-    orc_arm_emit_load_reg (compiler, ORC_ARM_A2, compiler->exec_reg,
-        (int)ORC_STRUCT_OFFSET(OrcExecutor,arrays[align_var]));
-    orc_arm_emit_sub (compiler, ORC_ARM_IP, ORC_ARM_IP, ORC_ARM_A2);
-    orc_arm_emit_and_imm (compiler, ORC_ARM_IP, ORC_ARM_IP,
-        (1<<align_shift)-1);
-    if (var_size_shift > 0) {
-      orc_arm_emit_asr_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, var_size_shift);
-    }
-
-    orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
-        (int)ORC_STRUCT_OFFSET(OrcExecutor,n));
-    orc_arm_emit_cmp (compiler, ORC_ARM_A3, ORC_ARM_IP);
-    orc_arm_emit_branch (compiler, ORC_ARM_COND_LE, LABEL_ONE_REGION);
-
-    orc_arm_emit_store_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
-        (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1));
-    orc_arm_emit_sub (compiler, ORC_ARM_A2, ORC_ARM_A3, ORC_ARM_IP);
-
-    orc_arm_emit_asr_imm (compiler, ORC_ARM_A3, ORC_ARM_A2,
-        compiler->loop_shift + compiler->unroll_shift);
-    orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
-        (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
-
-    orc_arm_emit_and_imm (compiler, ORC_ARM_A3, ORC_ARM_A2,
-        (1<<(compiler->loop_shift + compiler->unroll_shift))-1);
-    orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
-        (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
-
-    orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_ONE_REGION_AFTER);
-    orc_arm_emit_label (compiler, LABEL_ONE_REGION);
-
-    orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
-        (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1));
-
-    orc_arm_emit_load_imm (compiler, ORC_ARM_A3, 0);
-    orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
-        (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
-    orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
-        (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
-
-    orc_arm_emit_label (compiler, LABEL_ONE_REGION_AFTER);
+  /** IP0 = 1 << align_shift */
+  orc_arm64_emit_mov_imm (compiler, 32, ORC_ARM64_IP0, 1<<align_shift);
+
+  /** r1 == ORC_VAR_D1 */
+  orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_R1, compiler->exec_reg,
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,arrays[align_var]));
+  /** IP0 = IP0 - r1 */
+  orc_arm64_emit_sub (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, ORC_ARM64_R1);
+  /** IP0 = IP0 & ((1 << aligned_shift) -1) */
+  orc_arm64_emit_and_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0,
+      (1<<align_shift)-1);
+  if (var_size_shift > 0) {
+    /** IP0 = IP0 >> var_size_shift */
+    orc_arm64_emit_asr_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, var_size_shift);
   }
 
-  orc_neon_load_constants_inner (compiler);
-
-  if (compiler->loop_shift > 0) {
-    int save_loop_shift = compiler->loop_shift;
-    compiler->loop_shift = 0;
-
-    orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
-        (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1));
-
-    orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0);
-    orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION1_SKIP);
-
-    orc_arm_emit_label (compiler, LABEL_REGION1_LOOP);
-    orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
-    orc_neon_emit_loop (compiler, -1);
-    orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION1_LOOP);
-    orc_arm_emit_label (compiler, LABEL_REGION1_SKIP);
-
-    compiler->loop_shift = save_loop_shift;
-    compiler->vars[align_var].is_aligned = TRUE;
-  }
+  /** r2 = N */
+  orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg,
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,n));
+  /** N <= IP0, go to LABEL_ONE_REGION */
+  orc_arm64_emit_cmp (compiler, 32, ORC_ARM64_R2, ORC_ARM64_IP0);
+  orc_arm_emit_branch (compiler, ORC_ARM_COND_LE, LABEL_ONE_REGION);
+
+  /** counter1 = IP0 */
+  orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_IP0, compiler->exec_reg,
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1));
+  /** r1 = r2 - IP0 */
+  orc_arm64_emit_sub (compiler, 32, ORC_ARM64_R1, ORC_ARM64_R2, ORC_ARM64_IP0);
+
+  /** r2 = r1 >> (loop_shift + unroll_shift) */
+  orc_arm64_emit_asr_imm (compiler, 32, ORC_ARM64_R2, ORC_ARM64_R1,
+      compiler->loop_shift + compiler->unroll_shift);
+  /** counter2 = r2 */
+  orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg,
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
+
+  /** r2 = r1 & ((1<<(loop_shift + unroll_shift))-1) */
+  orc_arm64_emit_and_imm (compiler, 32, ORC_ARM64_R2, ORC_ARM64_R1,
+      (1<<(compiler->loop_shift + compiler->unroll_shift))-1);
+  /** counter3 = r2 */
+  orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg,
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
+
+  /** go to LABEL_ONE_REGION_AFTER */
+  orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_ONE_REGION_AFTER);
+  orc_arm_emit_label (compiler, LABEL_ONE_REGION);
+
+  /** counter1 = r2 */
+  orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg,
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1));
+  /** counter2 = counter3 = 0 */
+  orc_arm64_emit_mov_uimm (compiler, 32, ORC_ARM64_R2, 0);
+  orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg,
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
+  orc_arm64_emit_store_reg (compiler, 32, ORC_ARM64_R2, compiler->exec_reg,
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
+
+  orc_arm_emit_label (compiler, LABEL_ONE_REGION_AFTER);
+}
 
-  if (compiler->loop_shift > 0) {
-    orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
-        (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
-  } else {
-    orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
-        (int)ORC_STRUCT_OFFSET(OrcExecutor,n));
-  }
+static void
+orc_neon64_loop_caches (OrcCompiler *compiler)
+{
+  int align_var = get_align_var (compiler);
+  int var_size_shift = get_shift (compiler->vars[align_var].size);
+  int i;
 
-  orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0);
+  /** if IP0 == 0, go to LABEL_REGION2_SKIP */
+  orc_arm64_emit_cmp_imm (compiler, 32, ORC_ARM64_IP0, 0);
   orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_SKIP);
 
-  orc_arm_emit_asr_imm (compiler, compiler->gp_tmpreg, ORC_ARM_IP,
+  /** r1 = IP0 >> (17 + var_size_shift - compiler->loop_shift - compiler->unroll_shift) */
+  orc_arm64_emit_asr_imm (compiler, 32, compiler->gp_tmpreg, ORC_ARM64_IP0,
       17 + var_size_shift - compiler->loop_shift - compiler->unroll_shift);
-  orc_arm_emit_cmp_imm (compiler, compiler->gp_tmpreg, 0);
+
+  /** if r1 == 0, go to LABEL_REGION2_MEDIUM */
+  orc_arm64_emit_cmp_imm (compiler, 32, compiler->gp_tmpreg, 0);
   orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_MEDIUM);
 
-  /* N is larger than L2 cache size */
+  /** N is larger than L2 cache size */
   compiler->size_region = 3;
   orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_LARGE);
-  orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
+  orc_arm64_emit_subs_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, 1);
   for(i=0;i<(1<<compiler->unroll_shift);i++){
     orc_neon_emit_loop (compiler, i);
   }
   orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_LARGE);
+  /** DONE, let's finish */
   orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP);
 
   orc_arm_emit_label (compiler, LABEL_REGION2_MEDIUM);
-  orc_arm_emit_asr_imm (compiler, compiler->gp_tmpreg, ORC_ARM_IP,
+  orc_arm64_emit_asr_imm (compiler, 32, compiler->gp_tmpreg, ORC_ARM64_IP0,
       13 + var_size_shift - compiler->loop_shift - compiler->unroll_shift);
-  orc_arm_emit_cmp_imm (compiler, compiler->gp_tmpreg, 0);
+  orc_arm64_emit_cmp_imm (compiler, 32, compiler->gp_tmpreg, 0);
   orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_SMALL);
 
   /* N is smaller than L2 cache size */
   compiler->size_region = 2;
   orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_MEDIUM);
-  orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
+  orc_arm64_emit_subs_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, 1);
   for(i=0;i<(1<<compiler->unroll_shift);i++){
     orc_neon_emit_loop (compiler, i);
   }
@@ -671,48 +691,299 @@ orc_compiler_neon_assemble (OrcCompiler *compiler)
   orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP);
 
   orc_arm_emit_label (compiler, LABEL_REGION2_SMALL);
-  /* N is smaller than L2 cache size */
+  /* N is smaller than L1 cache size */
   compiler->size_region = 1;
   orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_SMALL);
-  orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
+  orc_arm64_emit_subs_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, 1);
   for(i=0;i<(1<<compiler->unroll_shift);i++){
     orc_neon_emit_loop (compiler, i);
   }
   orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_SMALL);
 
   orc_arm_emit_label (compiler, LABEL_REGION2_SKIP);
+}
+
+#define orc_neon64_loop_shift_remainder(compiler,counter,label_loop,label_skip) \
+{ \
+  int save_loop_shift = compiler->loop_shift; \
+  compiler->loop_shift = 0; \
+  orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_IP0, compiler->exec_reg, \
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,counter)); \
+  orc_arm64_emit_cmp_imm (compiler, 32, ORC_ARM64_IP0, 0); \
+  orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, label_skip); \
+  orc_arm_emit_label (compiler, label_loop); \
+  orc_arm64_emit_subs_imm (compiler, 32, ORC_ARM64_IP0, ORC_ARM64_IP0, 1); \
+  orc_neon_emit_loop (compiler, -1); \
+  orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, label_loop); \
+  orc_arm_emit_label (compiler, label_skip); \
+  compiler->loop_shift = save_loop_shift; \
+}
 
-  if (compiler->loop_shift > 0) {
-    int save_loop_shift = compiler->loop_shift;
+static void
+orc_compiler_neon_assemble (OrcCompiler *compiler)
+{
+  int align_var;
+  int align_shift;
+  int var_size_shift;
+  int i;
+       int set_fpscr = FALSE;
 
-    compiler->loop_shift = 0;
+  align_var = get_align_var (compiler);
+  if (compiler->error) return;
 
-    compiler->vars[align_var].is_aligned = FALSE;
+  var_size_shift = get_shift (compiler->vars[align_var].size);
+  align_shift = 4;
 
-    orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
-        (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
+  compiler->vars[align_var].is_aligned = FALSE;
+
+  orc_neon_emit_prologue (compiler);
+
+       if (!compiler->is_64bit && orc_program_has_float (compiler)) {
+    set_fpscr = TRUE;
+    ORC_ASM_CODE (compiler,"  vmrs %s, fpscr\n", orc_arm_reg_name (compiler->gp_tmpreg));
+    orc_arm_emit (compiler, 0xeef10a10 | ((compiler->gp_tmpreg&0xf)<<12));
+    ORC_ASM_CODE (compiler,"  push %s\n", orc_arm_reg_name (compiler->gp_tmpreg));
+    orc_arm_emit (compiler, 0xe52d0004 | ((compiler->gp_tmpreg&0xf)<<12));
+
+    orc_arm_emit_load_imm (compiler, compiler->gp_tmpreg, 1<<24);
+    ORC_ASM_CODE (compiler,"  vmsr fpscr, %s\n", orc_arm_reg_name (compiler->gp_tmpreg));
+    orc_arm_emit (compiler, 0xeee10a10 | ((compiler->gp_tmpreg&0xf)<<12));
+  }
+
+  orc_neon_load_constants_outer (compiler);
+
+  if (compiler->is_64bit) {
+    /** @todo not supported yet */
+    if (compiler->program->is_2d) return;
+
+    if (compiler->loop_shift > 0) {
+      orc_neon64_loop_shift (compiler);
+
+      orc_neon_load_constants_inner (compiler);
+
+      orc_neon64_loop_shift_remainder (compiler, counter1,
+          LABEL_REGION1_LOOP, LABEL_REGION1_SKIP);
+      compiler->vars[align_var].is_aligned = TRUE;
+
+      orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_IP0, compiler->exec_reg,
+          (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
+
+      orc_neon64_loop_caches (compiler);
+
+      compiler->vars[align_var].is_aligned = FALSE;
+      orc_neon64_loop_shift_remainder (compiler, counter3,
+          LABEL_REGION3_LOOP, LABEL_REGION3_SKIP);
+    } else {
+      orc_neon_load_constants_inner (compiler);
+
+      orc_arm64_emit_load_reg (compiler, 32, ORC_ARM64_IP0, compiler->exec_reg,
+          (int)ORC_STRUCT_OFFSET(OrcExecutor,n));
+
+      orc_neon64_loop_caches (compiler);
+    }
+  } else {
+    if (compiler->program->is_2d) {
+      if (compiler->program->constant_m > 0) {
+        orc_arm_emit_load_imm (compiler, ORC_ARM_A3, compiler->program->constant_m);
+        orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+            (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2]));
+      } else {
+        orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+            (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A1]));
+        orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+            (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2]));
+      }
+
+      orc_arm_emit_label (compiler, LABEL_OUTER_LOOP);
+    }
+
+    if (compiler->loop_shift > 0 && compiler->n_insns < 5) {
+      orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+          (int)ORC_STRUCT_OFFSET(OrcExecutor,n));
+
+      orc_arm_emit_cmp_imm (compiler, ORC_ARM_A3, ORC_NEON_ALIGNED_DEST_CUTOFF);
+      orc_arm_emit_branch (compiler, ORC_ARM_COND_GT, LABEL_REGION0_SKIP);
+
+      orc_arm_emit_asr_imm (compiler, ORC_ARM_A2, ORC_ARM_A3,
+          compiler->loop_shift);
+      orc_arm_emit_store_reg (compiler, ORC_ARM_A2, compiler->exec_reg,
+          (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
+
+      orc_arm_emit_and_imm (compiler, ORC_ARM_A3, ORC_ARM_A3,
+          (1<<compiler->loop_shift)-1);
+      orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+          (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
+
+      orc_neon_load_constants_inner (compiler);
+      orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
+          (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
+      orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0);
+      orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_SKIP);
+
+      compiler->size_region = 0;
+      orc_arm_emit_label (compiler, LABEL_REGION0_LOOP);
+      orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
+
+      orc_neon_emit_loop (compiler, -1);
+      orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION0_LOOP);
+      orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP);
+      orc_arm_emit_label (compiler, LABEL_REGION0_SKIP);
+    }
+
+    if (compiler->loop_shift > 0) {
+      orc_arm_emit_load_imm (compiler, ORC_ARM_IP, 1<<align_shift);
+
+      orc_arm_emit_load_reg (compiler, ORC_ARM_A2, compiler->exec_reg,
+          (int)ORC_STRUCT_OFFSET(OrcExecutor,arrays[align_var]));
+      orc_arm_emit_sub (compiler, ORC_ARM_IP, ORC_ARM_IP, ORC_ARM_A2);
+      orc_arm_emit_and_imm (compiler, ORC_ARM_IP, ORC_ARM_IP,
+          (1<<align_shift)-1);
+      if (var_size_shift > 0) {
+        orc_arm_emit_asr_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, var_size_shift);
+      }
+
+      orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+          (int)ORC_STRUCT_OFFSET(OrcExecutor,n));
+      orc_arm_emit_cmp (compiler, ORC_ARM_A3, ORC_ARM_IP);
+      orc_arm_emit_branch (compiler, ORC_ARM_COND_LE, LABEL_ONE_REGION);
+
+      orc_arm_emit_store_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
+          (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1));
+      orc_arm_emit_sub (compiler, ORC_ARM_A2, ORC_ARM_A3, ORC_ARM_IP);
+
+      orc_arm_emit_asr_imm (compiler, ORC_ARM_A3, ORC_ARM_A2,
+          compiler->loop_shift + compiler->unroll_shift);
+      orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+          (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
+
+      orc_arm_emit_and_imm (compiler, ORC_ARM_A3, ORC_ARM_A2,
+          (1<<(compiler->loop_shift + compiler->unroll_shift))-1);
+      orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+          (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
+
+      orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_ONE_REGION_AFTER);
+      orc_arm_emit_label (compiler, LABEL_ONE_REGION);
+
+      orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+          (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1));
+
+      orc_arm_emit_load_imm (compiler, ORC_ARM_A3, 0);
+      orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+          (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
+      orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+          (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
+
+      orc_arm_emit_label (compiler, LABEL_ONE_REGION_AFTER);
+    }
+
+    orc_neon_load_constants_inner (compiler);
+
+    if (compiler->loop_shift > 0) {
+      int save_loop_shift = compiler->loop_shift;
+      compiler->loop_shift = 0;
+
+      orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
+          (int)ORC_STRUCT_OFFSET(OrcExecutor,counter1));
+
+      orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0);
+      orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION1_SKIP);
+
+      orc_arm_emit_label (compiler, LABEL_REGION1_LOOP);
+      orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
+      orc_neon_emit_loop (compiler, -1);
+      orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION1_LOOP);
+      orc_arm_emit_label (compiler, LABEL_REGION1_SKIP);
+
+      compiler->loop_shift = save_loop_shift;
+      compiler->vars[align_var].is_aligned = TRUE;
+    }
+
+    if (compiler->loop_shift > 0) {
+      orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
+          (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2));
+    } else {
+      orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
+          (int)ORC_STRUCT_OFFSET(OrcExecutor,n));
+    }
 
     orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0);
-    orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION3_SKIP);
+    orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_SKIP);
+
+    orc_arm_emit_asr_imm (compiler, compiler->gp_tmpreg, ORC_ARM_IP,
+        17 + var_size_shift - compiler->loop_shift - compiler->unroll_shift);
+    orc_arm_emit_cmp_imm (compiler, compiler->gp_tmpreg, 0);
+    orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_MEDIUM);
 
-    orc_arm_emit_label (compiler, LABEL_REGION3_LOOP);
+    /* N is larger than L2 cache size */
+    compiler->size_region = 3;
+    orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_LARGE);
     orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
-    orc_neon_emit_loop (compiler, -1);
-    orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION3_LOOP);
-    orc_arm_emit_label (compiler, LABEL_REGION3_SKIP);
+    for(i=0;i<(1<<compiler->unroll_shift);i++){
+      orc_neon_emit_loop (compiler, i);
+    }
+    orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_LARGE);
+    orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP);
 
-    compiler->loop_shift = save_loop_shift;
-  }
+    orc_arm_emit_label (compiler, LABEL_REGION2_MEDIUM);
+    orc_arm_emit_asr_imm (compiler, compiler->gp_tmpreg, ORC_ARM_IP,
+        13 + var_size_shift - compiler->loop_shift - compiler->unroll_shift);
+    orc_arm_emit_cmp_imm (compiler, compiler->gp_tmpreg, 0);
+    orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION2_SMALL);
+
+    /* N is smaller than L2 cache size */
+    compiler->size_region = 2;
+    orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_MEDIUM);
+    orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
+    for(i=0;i<(1<<compiler->unroll_shift);i++){
+      orc_neon_emit_loop (compiler, i);
+    }
+    orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_MEDIUM);
+    orc_arm_emit_branch (compiler, ORC_ARM_COND_AL, LABEL_REGION2_SKIP);
+
+    orc_arm_emit_label (compiler, LABEL_REGION2_SMALL);
+    /* N is smaller than L2 cache size */
+    compiler->size_region = 1;
+    orc_arm_emit_label (compiler, LABEL_REGION2_LOOP_SMALL);
+    orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
+    for(i=0;i<(1<<compiler->unroll_shift);i++){
+      orc_neon_emit_loop (compiler, i);
+    }
+    orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION2_LOOP_SMALL);
+
+    orc_arm_emit_label (compiler, LABEL_REGION2_SKIP);
 
-  if (compiler->program->is_2d) {
-    neon_add_strides (compiler);
+    if (compiler->loop_shift > 0) {
+      int save_loop_shift = compiler->loop_shift;
 
-    orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
-        (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A2]));
-    orc_arm_emit_sub_imm (compiler, ORC_ARM_A3, ORC_ARM_A3, 1, TRUE);
-    orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
-        (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2]));
-    orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_OUTER_LOOP);
+      compiler->loop_shift = 0;
+
+      compiler->vars[align_var].is_aligned = FALSE;
+
+      orc_arm_emit_load_reg (compiler, ORC_ARM_IP, compiler->exec_reg,
+          (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3));
+
+      orc_arm_emit_cmp_imm (compiler, ORC_ARM_IP, 0);
+      orc_arm_emit_branch (compiler, ORC_ARM_COND_EQ, LABEL_REGION3_SKIP);
+
+      orc_arm_emit_label (compiler, LABEL_REGION3_LOOP);
+      orc_arm_emit_sub_imm (compiler, ORC_ARM_IP, ORC_ARM_IP, 1, TRUE);
+      orc_neon_emit_loop (compiler, -1);
+      orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_REGION3_LOOP);
+      orc_arm_emit_label (compiler, LABEL_REGION3_SKIP);
+
+      compiler->loop_shift = save_loop_shift;
+    }
+
+    if (compiler->program->is_2d) {
+      neon_add_strides (compiler);
+
+      orc_arm_emit_load_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+          (int)ORC_STRUCT_OFFSET(OrcExecutor, params[ORC_VAR_A2]));
+      orc_arm_emit_sub_imm (compiler, ORC_ARM_A3, ORC_ARM_A3, 1, TRUE);
+      orc_arm_emit_store_reg (compiler, ORC_ARM_A3, compiler->exec_reg,
+          (int)ORC_STRUCT_OFFSET(OrcExecutor,params[ORC_VAR_A2]));
+      orc_arm_emit_branch (compiler, ORC_ARM_COND_NE, LABEL_OUTER_LOOP);
+    }
   }
 
   orc_neon_save_accumulators (compiler);
@@ -729,11 +1000,13 @@ orc_compiler_neon_assemble (OrcCompiler *compiler)
 
   orc_arm_emit_align (compiler, 4);
 
-  orc_arm_emit_label (compiler, 20);
-  orc_arm_emit_data (compiler, 0x07060706);
-  orc_arm_emit_data (compiler, 0x07060706);
-  orc_arm_emit_data (compiler, 0x0f0e0f0e);
-  orc_arm_emit_data (compiler, 0x0f0e0f0e);
+  if (!compiler->is_64bit) {
+    orc_arm_emit_label (compiler, 20);
+    orc_arm_emit_data (compiler, 0x07060706);
+    orc_arm_emit_data (compiler, 0x07060706);
+    orc_arm_emit_data (compiler, 0x0f0e0f0e);
+    orc_arm_emit_data (compiler, 0x0f0e0f0e);
+  }
 
   orc_arm_do_fixups (compiler);
 }
@@ -830,10 +1103,16 @@ orc_neon_emit_loop (OrcCompiler *compiler, int unroll_index)
     if (compiler->vars[k].vartype == ORC_VAR_TYPE_SRC ||
         compiler->vars[k].vartype == ORC_VAR_TYPE_DEST) {
       if (compiler->vars[k].ptr_register) {
-        orc_arm_emit_add_imm (compiler,
-            compiler->vars[k].ptr_register,
-            compiler->vars[k].ptr_register,
-            compiler->vars[k].size << compiler->loop_shift);
+        if (compiler->is_64bit)
+          orc_arm64_emit_add_imm (compiler, 32,
+              compiler->vars[k].ptr_register,
+              compiler->vars[k].ptr_register,
+              compiler->vars[k].size << compiler->loop_shift);
+        else
+          orc_arm_emit_add_imm (compiler,
+              compiler->vars[k].ptr_register,
+              compiler->vars[k].ptr_register,
+              compiler->vars[k].size << compiler->loop_shift);
       } else {
         /* arm_emit_add_imm_memoffset (compiler, arm_ptr_size, */
         /*     compiler->vars[k].size << compiler->loop_shift, */