From: David Schleef Date: Mon, 9 Aug 2010 23:47:38 +0000 (-0700) Subject: Add divluw opcode X-Git-Tag: orc-0.4.7~53 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=840a11432443019826ca99836edf4cf53ec2bcce;p=platform%2Fupstream%2Forc.git Add divluw opcode --- diff --git a/orc/orcemulateopcodes.c b/orc/orcemulateopcodes.c index 98b2e73..ffe7383 100644 --- a/orc/orcemulateopcodes.c +++ b/orc/orcemulateopcodes.c @@ -1212,6 +1212,34 @@ emulate_div255w (OrcOpcodeExecutor *ex, int offset, int n) } void +emulate_divluw (OrcOpcodeExecutor *ex, int offset, int n) +{ + int i; + orc_int16 * ptr0; + const orc_int16 * ptr4; + const orc_int16 * ptr5; + orc_int16 var32; + orc_int16 var33; + orc_int16 var34; + + ptr0 = (orc_int16 *)ex->dest_ptrs[0]; + ptr4 = (orc_int16 *)ex->src_ptrs[0]; + ptr5 = (orc_int16 *)ex->src_ptrs[1]; + + for (i = 0; i < n; i++) { + /* 0: loadw */ + var32 = ptr4[i]; + /* 1: loadw */ + var33 = ptr5[i]; + /* 2: divluw */ + var34 = ((var33&0xff) == 0) ? 255 : ORC_CLAMP_UB(((uint16_t)var32)/((uint16_t)var33&0xff)); + /* 3: storew */ + ptr0[i] = var34; + } + +} + +void emulate_loadw (OrcOpcodeExecutor *ex, int offset, int n) { int i; diff --git a/orc/orcemulateopcodes.h b/orc/orcemulateopcodes.h index 354c8b2..7a93b61 100644 --- a/orc/orcemulateopcodes.h +++ b/orc/orcemulateopcodes.h @@ -49,6 +49,7 @@ void emulate_cmpeqw (OrcOpcodeExecutor *ex, int i, int n); void emulate_cmpgtsw (OrcOpcodeExecutor *ex, int i, int n); void emulate_copyw (OrcOpcodeExecutor *ex, int i, int n); void emulate_div255w (OrcOpcodeExecutor *ex, int i, int n); +void emulate_divluw (OrcOpcodeExecutor *ex, int i, int n); void emulate_loadw (OrcOpcodeExecutor *ex, int i, int n); void emulate_loadoffw (OrcOpcodeExecutor *ex, int i, int n); void emulate_loadpw (OrcOpcodeExecutor *ex, int i, int n); diff --git a/orc/orcopcodes.c b/orc/orcopcodes.c index 18a25d3..697ab44 100644 --- a/orc/orcopcodes.c +++ b/orc/orcopcodes.c @@ -332,6 +332,7 @@ static OrcStaticOpcode opcodes[] = { { 
"cmpgtsw", 0, { 2 }, { 2, 2 }, emulate_cmpgtsw }, { "copyw", 0, { 2 }, { 2 }, emulate_copyw }, { "div255w", 0, { 2 }, { 2 }, emulate_div255w }, + { "divluw", 0, { 2 }, { 2, 2 }, emulate_divluw }, { "loadw", ORC_STATIC_OPCODE_LOAD, { 2 }, { 2 }, emulate_loadw }, { "loadoffw", ORC_STATIC_OPCODE_LOAD|ORC_STATIC_OPCODE_SCALAR, { 2 }, { 2, 4 }, emulate_loadoffw }, { "loadpw", ORC_STATIC_OPCODE_LOAD|ORC_STATIC_OPCODE_SCALAR|ORC_STATIC_OPCODE_INVARIANT, { 2 }, { 2 }, emulate_loadpw }, diff --git a/orc/orcprogram-c.c b/orc/orcprogram-c.c index 0b152ad..5cd2ebe 100644 --- a/orc/orcprogram-c.c +++ b/orc/orcprogram-c.c @@ -837,6 +837,20 @@ c_rule_div255w (OrcCompiler *p, void *user, OrcInstruction *insn) dest, src, src); } +static void +c_rule_divluw (OrcCompiler *p, void *user, OrcInstruction *insn) +{ + char dest[20], src1[20], src2[20]; + + c_get_name_int (dest, p, insn->dest_args[0]); + c_get_name_int (src1, p, insn->src_args[0]); + c_get_name_int (src2, p, insn->src_args[1]); + + ORC_ASM_CODE(p, + " %s = ((%s&0xff) == 0) ? 
255 : ORC_CLAMP_UB(((uint16_t)%s)/((uint16_t)%s&0xff));\n", + dest, src2, src1, src2); +} + static OrcTarget c_target = { "c", FALSE, @@ -917,5 +931,6 @@ orc_c_init (void) orc_rule_register (rule_set, "splatbl", c_rule_splatbl, NULL); orc_rule_register (rule_set, "splatw0q", c_rule_splatw0q, NULL); orc_rule_register (rule_set, "div255w", c_rule_div255w, NULL); + orc_rule_register (rule_set, "divluw", c_rule_divluw, NULL); } diff --git a/orc/orcrules-sse.c b/orc/orcrules-sse.c index c0afb22..50fb034 100644 --- a/orc/orcrules-sse.c +++ b/orc/orcrules-sse.c @@ -992,6 +992,117 @@ sse_rule_div255w (OrcCompiler *p, void *user, OrcInstruction *insn) orc_sse_emit_psrlw (p, 8, dest); } +#if 1 +static void +sse_rule_divluw (OrcCompiler *p, void *user, OrcInstruction *insn) +{ + /* About 5.9 cycles per array member on ginger */ + int src = p->vars[insn->src_args[1]].alloc; + int dest = p->vars[insn->dest_args[0]].alloc; + int divisor = p->tmpreg; + int rem = X86_XMM7; + int tmp2 = X86_XMM6; + int tmp3 = X86_XMM5; + int tmp4 = X86_XMM4; + int i; + + orc_sse_emit_movdqa (p, src, divisor); + orc_sse_emit_psllw (p, 8, divisor); + + orc_sse_emit_movdqa (p, dest, tmp4); + orc_sse_emit_psrlw (p, 1, divisor); + orc_sse_emit_psrlw (p, 1, tmp4); + orc_sse_emit_pcmpgtw (p, divisor, tmp4); + orc_sse_emit_psrlw (p, 8, tmp4); + + orc_sse_emit_movdqa (p, dest, rem); + orc_sse_emit_psrlw (p, 1, divisor); + orc_sse_emit_psrlw (p, 1, rem); + orc_sse_emit_movdqa (p, rem, tmp2); + orc_sse_emit_pcmpgtw (p, divisor, tmp2); + orc_sse_emit_psllw (p, 1, divisor); + orc_sse_emit_movdqa (p, dest, rem); + orc_sse_emit_movdqa (p, divisor, tmp3); + orc_sse_emit_pand (p, tmp2, tmp3); + orc_sse_emit_psubw (p, tmp3, rem); + orc_sse_emit_psrlw (p, 15, tmp2); + orc_sse_emit_psllw (p, 7, tmp2); + orc_sse_emit_movdqa (p, tmp4, dest); + orc_sse_emit_por (p, tmp2, dest); + + orc_x86_emit_mov_imm_reg (p, 4, 0x00010001, p->gp_tmpreg); + orc_x86_emit_mov_reg_sse (p, p->gp_tmpreg, tmp4); + orc_sse_emit_pshufd (p, 0, 
tmp4, tmp4); + orc_sse_emit_paddw (p, tmp4, rem); + + for(i=6;i>=0;i--){ + orc_sse_emit_psrlw (p, 1, divisor); + orc_sse_emit_movdqa (p, rem, tmp2); + orc_sse_emit_pcmpgtw (p, divisor, tmp2); + orc_sse_emit_movdqa (p, divisor, tmp3); + orc_sse_emit_pand (p, tmp2, tmp3); + orc_sse_emit_psubw (p, tmp3, rem); + orc_sse_emit_psrlw (p, 15, tmp2); + orc_sse_emit_psllw (p, i, tmp2); + orc_sse_emit_por (p, tmp2, dest); + } +} +#else +static void +sse_rule_divluw (OrcCompiler *p, void *user, OrcInstruction *insn) +{ + /* About 40.7 cycles per array member on ginger. I.e., really slow */ + int i; + int regsize = p->is_64bit ? 8 : 4; + + orc_x86_emit_add_imm_reg (p, regsize, -32 - 2*regsize, X86_ESP, FALSE); + orc_x86_emit_mov_sse_memoffset (p, 16, p->vars[insn->src_args[0]].alloc, + 0, X86_ESP, FALSE, FALSE); + orc_x86_emit_mov_sse_memoffset (p, 16, p->vars[insn->src_args[1]].alloc, + 16, X86_ESP, FALSE, FALSE); + orc_x86_emit_mov_reg_memoffset (p, 4, X86_EAX, 32, X86_ESP); + orc_x86_emit_mov_reg_memoffset (p, 4, X86_EDX, 32 + regsize, X86_ESP); + + for(i=0;i<(1<<p->loop_shift);i++) { + int label = p->label_index++; + + orc_x86_emit_mov_memoffset_reg (p, 2, 16 + 2*i, X86_ESP, X86_ECX); + orc_x86_emit_mov_imm_reg (p, 4, 0, X86_EDX); + orc_x86_emit_mov_imm_reg (p, 2, 0x00ff, X86_EAX); + orc_x86_emit_and_imm_reg (p, 2, 0x00ff, X86_ECX); + orc_x86_emit_je (p, label); + orc_x86_emit_mov_memoffset_reg (p, 2, 2*i, X86_ESP, X86_EAX); + + ORC_ASM_CODE(p," div %%cx\n"); + *p->codeptr++ = 0x66; + *p->codeptr++ = 0xf7; + orc_x86_emit_modrm_reg (p, X86_ECX, 6); + + ORC_ASM_CODE(p," testw $0xff00, %%ax\n"); + *p->codeptr++ = 0x66; + *p->codeptr++ = 0xa9; + //*p->codeptr++ = 0xf7; + //orc_x86_emit_modrm_reg (p, X86_EAX, 0); + *p->codeptr++ = 0x00; + *p->codeptr++ = 0xff; + orc_x86_emit_je (p, label); + + orc_x86_emit_mov_imm_reg (p, 2, 0x00ff, X86_EAX); + + orc_x86_emit_label (p, label); + + orc_x86_emit_mov_reg_memoffset (p, 2, X86_EAX, 2*i, X86_ESP); + } + + orc_x86_emit_mov_memoffset_sse 
(p, 16, 0, X86_ESP, + p->vars[insn->dest_args[0]].alloc, FALSE); + orc_x86_emit_mov_memoffset_reg (p, 4, 32, X86_ESP, X86_EAX); + orc_x86_emit_mov_memoffset_reg (p, 4, 32 + regsize, X86_ESP, X86_EDX); + + orc_x86_emit_add_imm_reg (p, regsize, 32 + 2*regsize, X86_ESP, FALSE); +} +#endif + static void sse_rule_mulsbw (OrcCompiler *p, void *user, OrcInstruction *insn) { @@ -2194,6 +2305,7 @@ orc_compiler_sse_register_rules (OrcTarget *target) orc_rule_register (rule_set, "splatbw", sse_rule_splatbw, NULL); orc_rule_register (rule_set, "splatbl", sse_rule_splatbl, NULL); orc_rule_register (rule_set, "div255w", sse_rule_div255w, NULL); + orc_rule_register (rule_set, "divluw", sse_rule_divluw, NULL); /* SSE 3 -- no rules */