From f7ec3aa04cde041914865c29c885abe2fb093b58 Mon Sep 17 00:00:00 2001 From: David Schleef Date: Tue, 16 Jun 2009 18:40:22 -0700 Subject: [PATCH] neon: interleave aligned loads --- orc/orccompiler.c | 1 + orc/orcprogram-neon.c | 31 ++++++++++++++++++++++ orc/orcprogram.h | 1 + orc/orcrules-neon.c | 72 +++++++++++++++------------------------------------ 4 files changed, 54 insertions(+), 51 deletions(-) diff --git a/orc/orccompiler.c b/orc/orccompiler.c index 2adaf9d..d34f119 100644 --- a/orc/orccompiler.c +++ b/orc/orccompiler.c @@ -402,6 +402,7 @@ orc_compiler_global_reg_alloc (OrcCompiler *compiler) var->mask_alloc = orc_compiler_allocate_register (compiler, TRUE); var->ptr_offset = orc_compiler_allocate_register (compiler, FALSE); var->ptr_register = orc_compiler_allocate_register (compiler, FALSE); + var->aligned_data = orc_compiler_allocate_register (compiler, TRUE); break; case ORC_VAR_TYPE_DEST: var->ptr_register = orc_compiler_allocate_register (compiler, FALSE); diff --git a/orc/orcprogram-neon.c b/orc/orcprogram-neon.c index eae6bfe..940646e 100644 --- a/orc/orcprogram-neon.c +++ b/orc/orcprogram-neon.c @@ -287,6 +287,32 @@ orc_neon_load_alignment_masks (OrcCompiler *compiler) size - 1); orc_arm_emit_sub (compiler, var->ptr_register, var->ptr_register, var->ptr_offset); + + if (size == 4) { + int update = 1; + ORC_ASM_CODE(compiler," vld1.32 %s[1], [%s]%s\n", + orc_neon_reg_name (var->aligned_data), + orc_arm_reg_name (var->ptr_register), + update ? "!" : ""); + code = 0xf4a0088d; + code |= (var->ptr_register&0xf) << 16; + code |= ((var->aligned_data)&0xf) << 12; + code |= (((var->aligned_data)>>4)&0x1) << 22; + code |= (!update) << 1; + orc_arm_emit (compiler, code); + } else { + int update = 1; + ORC_ASM_CODE(compiler," vld1.64 %s, [%s]%s\n", + orc_neon_reg_name (var->aligned_data + 1), + orc_arm_reg_name (var->ptr_register), + update ? "!" : ""); + code = 0xf42007cd; + code |= (var->ptr_register&0xf) << 16; + code |= ((var->aligned_data+1)&0xf) << 12; + code |= (((var->aligned_data+1)>>4)&0x1) << 22; + code |= (!update) << 1; + orc_arm_emit (compiler, code); + } } break; @@ -308,6 +334,7 @@ void orc_neon_restore_unalignment (OrcCompiler *compiler) { int i; + int size; for(i=0;ivars[i]; @@ -319,8 +346,12 @@ orc_neon_restore_unalignment (OrcCompiler *compiler) if (var->is_aligned) continue; if (compiler->loop_shift > 1) { + size = var->size << compiler->loop_shift; + orc_arm_emit_add (compiler, var->ptr_register, var->ptr_register, var->ptr_offset); + orc_arm_emit_sub_imm (compiler, var->ptr_register, var->ptr_register, + size); } break; case ORC_VAR_TYPE_DEST: diff --git a/orc/orcprogram.h b/orc/orcprogram.h index 5b4389e..3ef7eb0 100644 --- a/orc/orcprogram.h +++ b/orc/orcprogram.h @@ -186,6 +186,7 @@ struct _OrcVariable { int ptr_register; int ptr_offset; + int aligned_data; }; /** diff --git a/orc/orcrules-neon.c b/orc/orcrules-neon.c index d4532d1..1170d3f 100644 --- a/orc/orcrules-neon.c +++ b/orc/orcrules-neon.c @@ -229,42 +229,27 @@ orc_neon_load_vec_unaligned (OrcCompiler *compiler, OrcVariable *var, { uint32_t code; - //orc_arm_emit_sub (compiler, var->ptr_register, var->ptr_register, - // var->ptr_offset); + orc_neon_emit_mov (compiler, var->aligned_data, var->aligned_data + 1); ORC_ASM_CODE(compiler," vld1.64 %s, [%s]%s\n", - orc_neon_reg_name (var->alloc), + orc_neon_reg_name (var->aligned_data + 1), orc_arm_reg_name (var->ptr_register), update ? "!" : ""); code = 0xf42007cd; code |= (var->ptr_register&0xf) << 16; - code |= (var->alloc&0xf) << 12; - code |= ((var->alloc>>4)&0x1) << 22; - code |= (!update) << 1; - orc_arm_emit (compiler, code); - - update = 0; - ORC_ASM_CODE(compiler," vld1.64 %s, [%s]%s\n", - orc_neon_reg_name (var->alloc + 1), - orc_arm_reg_name (var->ptr_register), - update ? "!" : ""); - code = 0xf42007cd; - code |= (var->ptr_register&0xf) << 16; - code |= ((var->alloc+1)&0xf) << 12; - code |= (((var->alloc+1)>>4)&0x1) << 22; + code |= ((var->aligned_data+1)&0xf) << 12; + code |= (((var->aligned_data+1)>>4)&0x1) << 22; code |= (!update) << 1; orc_arm_emit (compiler, code); ORC_ASM_CODE(compiler," vtbl.8 %s, {%s,%s}, %s\n", orc_neon_reg_name (var->alloc), - orc_neon_reg_name (var->alloc), - orc_neon_reg_name (var->alloc + 1), + orc_neon_reg_name (var->aligned_data), + orc_neon_reg_name (var->aligned_data+1), orc_neon_reg_name (var->mask_alloc)); - code = NEON_BINARY(0xf3b00900, var->alloc, var->alloc, var->mask_alloc); + code = NEON_BINARY(0xf3b00900, var->alloc, var->aligned_data, + var->mask_alloc); orc_arm_emit (compiler, code); - - //orc_arm_emit_add (compiler, var->ptr_register, var->ptr_register, - // var->ptr_offset); } void @@ -273,42 +258,27 @@ orc_neon_load_halfvec_unaligned (OrcCompiler *compiler, OrcVariable *var, { uint32_t code; - //orc_arm_emit_sub (compiler, var->ptr_register, var->ptr_register, - // var->ptr_offset); + orc_neon_emit_unary (compiler, "vrev64.i32", 0xf3b80000, + var->aligned_data, var->aligned_data); - ORC_ASM_CODE(compiler," vld1.32 %s[0], [%s]%s\n", - orc_neon_reg_name (var->alloc), - orc_arm_reg_name (var->ptr_register), - update ? "!" : ""); - code = 0xf4a0080d; - code |= (var->ptr_register&0xf) << 16; - code |= (var->alloc&0xf) << 12; - code |= ((var->alloc>>4)&0x1) << 22; - code |= (!update) << 1; - orc_arm_emit (compiler, code); - - update = 0; ORC_ASM_CODE(compiler," vld1.32 %s[1], [%s]%s\n", - orc_neon_reg_name (var->alloc), + orc_neon_reg_name (var->aligned_data), orc_arm_reg_name (var->ptr_register), update ? "!" : ""); - code = 0xf4a0088f; + code = 0xf4a0088d; code |= (var->ptr_register&0xf) << 16; - code |= ((var->alloc)&0xf) << 12; - code |= (((var->alloc)>>4)&0x1) << 22; + code |= ((var->aligned_data)&0xf) << 12; + code |= (((var->aligned_data)>>4)&0x1) << 22; code |= (!update) << 1; orc_arm_emit (compiler, code); ORC_ASM_CODE(compiler," vtbl.8 %s, {%s,%s}, %s\n", orc_neon_reg_name (var->alloc), - orc_neon_reg_name (var->alloc), - orc_neon_reg_name (var->alloc + 1), + orc_neon_reg_name (var->aligned_data), + orc_neon_reg_name (var->aligned_data + 1), orc_neon_reg_name (var->mask_alloc)); - code = NEON_BINARY(0xf3b00900, var->alloc, var->alloc, var->mask_alloc); + code = NEON_BINARY(0xf3b00900, var->alloc, var->aligned_data, var->mask_alloc); orc_arm_emit (compiler, code); - - //orc_arm_emit_add (compiler, var->ptr_register, var->ptr_register, - // var->ptr_offset); } void @@ -324,7 +294,7 @@ orc_neon_loadb (OrcCompiler *compiler, OrcVariable *var, int update) } else if (compiler->loop_shift == 2) { orc_neon_load_halfvec_unaligned (compiler, var, update); } else { - if (compiler->loop_shift > 0) { + if (compiler->loop_shift > 1) { ORC_ERROR("slow load"); } for(i=0;i<(1<loop_shift);i++){ @@ -352,7 +322,7 @@ orc_neon_loadw (OrcCompiler *compiler, OrcVariable *var, int update) } else if (compiler->loop_shift == 2) { orc_neon_load_vec_unaligned (compiler, var, update); } else { - if (compiler->loop_shift > 0) { + if (compiler->loop_shift > 1) { ORC_ERROR("slow load"); } for(i=0;i<(1<loop_shift);i++){ @@ -1149,7 +1119,7 @@ orc_neon_rule_select1wb (OrcCompiler *p, void *user, OrcInstruction *insn) p->vars[insn->src_args[0]].alloc); orc_neon_emit_unary_narrow (p, "vmovn.i16", 0xf3b20200, p->vars[insn->dest_args[0]].alloc, - p->vars[insn->src_args[0]].alloc); + p->vars[insn->dest_args[0]].alloc); } static void @@ -1160,7 +1130,7 @@ orc_neon_rule_select1lw (OrcCompiler *p, void *user, OrcInstruction *insn) p->vars[insn->src_args[0]].alloc); orc_neon_emit_unary_narrow (p, "vmovn.i32", 0xf3b60200, p->vars[insn->dest_args[0]].alloc, - p->vars[insn->src_args[0]].alloc); + p->vars[insn->dest_args[0]].alloc); } static void -- 2.7.4