From 04c1f87dcb7b92e515b1807a621e583a7a45cae2 Mon Sep 17 00:00:00 2001
From: David Schleef
Date: Thu, 5 Aug 2010 14:21:34 -0700
Subject: [PATCH] sse: implement loadoff, loadup opcodes

---
Note: the line breaks and indentation of this patch were restored from a
copy in which the text had been collapsed onto a few long lines. Hunk line
counts and the diffstat were re-checked against the restored text, but the
leading whitespace is inferred from the nesting of the code; if a hunk is
rejected on whitespace alone, retry with `git apply --ignore-whitespace`.

 orc/orcprogram-sse.c |  57 ++++++++++++++++++++---
 orc/orcrules-sse.c   | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 180 insertions(+), 6 deletions(-)

diff --git a/orc/orcprogram-sse.c b/orc/orcprogram-sse.c
index 4ff17d2..d7925d3 100644
--- a/orc/orcprogram-sse.c
+++ b/orc/orcprogram-sse.c
@@ -442,7 +442,7 @@ get_shift (int size)
 
 
 static void
-orc_emit_split_n_regions (OrcCompiler *compiler)
+orc_emit_split_3_regions (OrcCompiler *compiler)
 {
   int align_var;
   int align_shift;
@@ -508,6 +508,35 @@ orc_emit_split_n_regions (OrcCompiler *compiler)
   orc_x86_emit_label (compiler, 7);
 }
 
+static void
+orc_emit_split_2_regions (OrcCompiler *compiler)
+{
+  int align_var;
+  int align_shift;
+  int var_size_shift;
+
+  align_var = get_align_var (compiler);
+  var_size_shift = get_shift (compiler->vars[align_var].size);
+  align_shift = var_size_shift + compiler->loop_shift;
+
+  /* Calculate n2 */
+  orc_x86_emit_mov_memoffset_reg (compiler, 4,
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg,
+      compiler->gp_tmpreg);
+  orc_x86_emit_mov_reg_reg (compiler, 4, compiler->gp_tmpreg, X86_EAX);
+  orc_x86_emit_sar_imm_reg (compiler, 4,
+      compiler->loop_shift + compiler->unroll_shift,
+      compiler->gp_tmpreg);
+  orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg);
+
+  /* Calculate n3 */
+  orc_x86_emit_and_imm_reg (compiler, 4,
+      (1<<(compiler->loop_shift + compiler->unroll_shift))-1, X86_EAX);
+  orc_x86_emit_mov_reg_memoffset (compiler, 4, X86_EAX,
+      (int)ORC_STRUCT_OFFSET(OrcExecutor,counter3), compiler->exec_reg);
+}
+
 #ifndef MMX
 static int
 orc_program_has_float (OrcCompiler *compiler)
@@ -585,12 +614,17 @@ orc_compiler_sse_assemble (OrcCompiler *compiler)
       compiler->program->constant_n <= ORC_SSE_ALIGNED_DEST_CUTOFF) {
     /* don't need to load n */
   } else if (compiler->loop_shift > 0) {
-    /* split n into three regions, with center region being aligned */
-    orc_emit_split_n_regions (compiler);
+    if (!compiler->has_iterator_opcode) {
+      /* split n into three regions, with center region being aligned */
+      orc_emit_split_3_regions (compiler);
+    } else {
+      orc_emit_split_2_regions (compiler);
+    }
   } else {
     /* loop shift is 0, no need to split */
     orc_x86_emit_mov_memoffset_reg (compiler, 4,
-        (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg, compiler->gp_tmpreg);
+        (int)ORC_STRUCT_OFFSET(OrcExecutor,n), compiler->exec_reg,
+        compiler->gp_tmpreg);
     orc_x86_emit_mov_reg_memoffset (compiler, 4, compiler->gp_tmpreg,
         (int)ORC_STRUCT_OFFSET(OrcExecutor,counter2), compiler->exec_reg);
   }
@@ -623,10 +657,21 @@ orc_compiler_sse_assemble (OrcCompiler *compiler)
       }
     }
     compiler->loop_shift = save_loop_shift;
+
   } else {
     int ui, ui_max;
+    int emit_region1 = TRUE;
+    int emit_region3 = TRUE;
+
+    if (compiler->has_iterator_opcode) {
+      emit_region1 = FALSE;
+    }
+    if (compiler->loop_shift == 0) {
+      emit_region1 = FALSE;
+      emit_region3 = FALSE;
+    }
 
-    if (compiler->loop_shift > 0) {
+    if (emit_region1) {
       int save_loop_shift;
       int l;
 
@@ -680,7 +725,7 @@ orc_compiler_sse_assemble (OrcCompiler *compiler)
     orc_x86_emit_jne (compiler, LABEL_INNER_LOOP_START);
     orc_x86_emit_label (compiler, LABEL_REGION2_SKIP);
 
-    if (compiler->loop_shift > 0) {
+    if (emit_region3) {
       int save_loop_shift;
       int l;
 
diff --git a/orc/orcrules-sse.c b/orc/orcrules-sse.c
index 328a564..ae8311e 100644
--- a/orc/orcrules-sse.c
+++ b/orc/orcrules-sse.c
@@ -122,6 +122,131 @@ sse_rule_loadX (OrcCompiler *compiler, void *user, OrcInstruction *insn)
 }
 
 static void
+sse_rule_loadoffX (OrcCompiler *compiler, void *user, OrcInstruction *insn)
+{
+  OrcVariable *src = compiler->vars + insn->src_args[0];
+  OrcVariable *dest = compiler->vars + insn->dest_args[0];
+  int ptr_reg;
+  int offset = 0;
+
+  if (compiler->vars[insn->src_args[1]].vartype != ORC_VAR_TYPE_CONST) {
+    ORC_COMPILER_ERROR(compiler, "Rule only works with consts");
+    return;
+  }
+
+  offset = (compiler->offset + compiler->vars[insn->src_args[1]].value) *
+    src->size;
+  if (src->ptr_register == 0) {
+    int i = insn->src_args[0];
+    orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4,
+        (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]),
+        compiler->exec_reg, compiler->gp_tmpreg);
+    ptr_reg = compiler->gp_tmpreg;
+  } else {
+    ptr_reg = src->ptr_register;
+  }
+  switch (src->size << compiler->loop_shift) {
+    case 1:
+      orc_x86_emit_mov_memoffset_reg (compiler, 1, offset, ptr_reg,
+          compiler->gp_tmpreg);
+      orc_x86_emit_mov_reg_sse (compiler, compiler->gp_tmpreg, dest->alloc);
+      break;
+    case 2:
+      orc_x86_emit_mov_memoffset_reg (compiler, 2, offset, ptr_reg,
+          compiler->gp_tmpreg);
+      orc_x86_emit_mov_reg_sse (compiler, compiler->gp_tmpreg, dest->alloc);
+      break;
+    case 4:
+      orc_x86_emit_mov_memoffset_sse (compiler, 4, offset, ptr_reg,
+          dest->alloc, src->is_aligned);
+      break;
+    case 8:
+      orc_x86_emit_mov_memoffset_sse (compiler, 8, offset, ptr_reg,
+          dest->alloc, src->is_aligned);
+      break;
+    case 16:
+      orc_x86_emit_mov_memoffset_sse (compiler, 16, offset, ptr_reg,
+          dest->alloc, src->is_aligned);
+      break;
+    default:
+      ORC_COMPILER_ERROR(compiler,"bad load size %d",
+          src->size << compiler->loop_shift);
+      break;
+  }
+}
+
+static void
+sse_rule_loadupdb (OrcCompiler *compiler, void *user, OrcInstruction *insn)
+{
+  OrcVariable *src = compiler->vars + insn->src_args[0];
+  OrcVariable *dest = compiler->vars + insn->dest_args[0];
+  int ptr_reg;
+  int offset = 0;
+
+  offset = compiler->offset * src->size;
+  if (src->ptr_register == 0) {
+    int i = insn->src_args[0];
+    orc_x86_emit_mov_memoffset_reg (compiler, compiler->is_64bit ? 8 : 4,
+        (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[i]),
+        compiler->exec_reg, compiler->gp_tmpreg);
+    ptr_reg = compiler->gp_tmpreg;
+  } else {
+    ptr_reg = src->ptr_register;
+  }
+  switch (src->size << compiler->loop_shift) {
+    case 1:
+    case 2:
+      orc_x86_emit_mov_memoffset_reg (compiler, 1, offset, ptr_reg,
+          compiler->gp_tmpreg);
+      orc_x86_emit_mov_reg_sse (compiler, compiler->gp_tmpreg, dest->alloc);
+      break;
+    case 4:
+      orc_x86_emit_mov_memoffset_reg (compiler, 2, offset, ptr_reg,
+          compiler->gp_tmpreg);
+      orc_x86_emit_mov_reg_sse (compiler, compiler->gp_tmpreg, dest->alloc);
+      break;
+    case 8:
+      orc_x86_emit_mov_memoffset_sse (compiler, 4, offset, ptr_reg,
+          dest->alloc, src->is_aligned);
+      break;
+    case 16:
+      orc_x86_emit_mov_memoffset_sse (compiler, 8, offset, ptr_reg,
+          dest->alloc, src->is_aligned);
+      break;
+    case 32:
+      orc_x86_emit_mov_memoffset_sse (compiler, 16, offset, ptr_reg,
+          dest->alloc, src->is_aligned);
+      break;
+    default:
+      ORC_COMPILER_ERROR(compiler,"bad load size %d",
+          src->size << compiler->loop_shift);
+      break;
+  }
+  switch (src->size) {
+    case 1:
+      orc_sse_emit_punpcklbw (compiler, dest->alloc, dest->alloc);
+      break;
+    case 2:
+      orc_sse_emit_punpcklwd (compiler, dest->alloc, dest->alloc);
+      break;
+    case 4:
+      orc_sse_emit_punpckldq (compiler, dest->alloc, dest->alloc);
+      break;
+  }
+  /* FIXME hack */
+  if (src->ptr_register) {
+    orc_x86_emit_add_imm_reg (compiler, compiler->is_64bit ? 8 : 4,
+        -(src->size << compiler->loop_shift)>>1,
+        src->ptr_register, FALSE);
+  } else {
+    orc_x86_emit_add_imm_memoffset (compiler, compiler->is_64bit ? 8 : 4,
+        -(src->size << compiler->loop_shift)>>1,
+        (int)ORC_STRUCT_OFFSET(OrcExecutor, arrays[insn->src_args[0]]),
+        compiler->exec_reg);
+  }
+}
+
+static void
 sse_rule_storeX (OrcCompiler *compiler, void *user, OrcInstruction *insn)
 {
   OrcVariable *src = compiler->vars + insn->src_args[0];
@@ -1267,6 +1392,10 @@ orc_compiler_sse_register_rules (OrcTarget *target)
   orc_rule_register (rule_set, "loadb", sse_rule_loadX, NULL);
   orc_rule_register (rule_set, "loadw", sse_rule_loadX, NULL);
   orc_rule_register (rule_set, "loadl", sse_rule_loadX, NULL);
+  orc_rule_register (rule_set, "loadoffb", sse_rule_loadoffX, NULL);
+  orc_rule_register (rule_set, "loadoffw", sse_rule_loadoffX, NULL);
+  orc_rule_register (rule_set, "loadoffl", sse_rule_loadoffX, NULL);
+  orc_rule_register (rule_set, "loadupdb", sse_rule_loadupdb, NULL);
   orc_rule_register (rule_set, "loadpb", sse_rule_loadpX, NULL);
   orc_rule_register (rule_set, "loadpw", sse_rule_loadpX, NULL);
   orc_rule_register (rule_set, "loadpl", sse_rule_loadpX, NULL);
-- 
2.7.4