From 4b6619f8d21feabf88bc07323bf51b54c4c77e5c Mon Sep 17 00:00:00 2001
From: David Schleef
Date: Fri, 6 Aug 2010 18:43:41 -0700
Subject: [PATCH] Add low-level opcodes for compositing

Add six opcodes:

  convhwb   high byte of a 16-bit word:   ((orc_uint16)x) >> 8
  convhlw   high word of a 32-bit long:   ((orc_uint32)x) >> 16
  splatbw   copy a byte into both bytes of a word
  splatbl   copy a byte into all four bytes of a long
  splatw0q  copy the low 16-bit word of a quad into all four words
  div255w   ((x+128) + ((x+128)>>8)) >> 8, evaluated in 16 bits

All six get an emulation function, an opcode table entry and an SSE
rule.  convhwb and convhlw are described to the C backend through the
templates in opcodes.h; the other four get explicit C backend rules.
orc_sse_emit_pshufhw() is added for the splatw0q SSE rule.

---
Review notes:

 * The copy of this patch that was reviewed had its line breaks and
   indentation collapsed.  They are restored here (two-space
   indentation; leading blanks inside string literals and blank context
   lines are reconstructed).  If a hunk is rejected on whitespace
   alone, retry with "git apply --ignore-whitespace".  The "index"
   lines are those of the original posting.
 * opcodes.h: convhlw is declared with UNARY_LW instead of UNARY_WB,
   matching its { 2 }, { 4 } entry in orcopcodes.c and the 32-bit
   source / 16-bit destination used by emulate_convhlw().
 * div255w (emulation and C rule): uint16_t is replaced by orc_uint16,
   the type the rest of the same expression already uses.
 * splatbl (emulation and C rule): the byte is converted to orc_uint32
   before the shift by 24, so a byte >= 0x80 is no longer shifted into
   the sign bit of an int.
 * NOTE(review): sse_rule_div255w() overwrites the register returned by
   orc_compiler_get_constant().  Confirm that this register is not
   shared with other users of the same constant.

 orc/opcodes.h           |   2 +
 orc/orcemulateopcodes.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++
 orc/orcemulateopcodes.h |   6 +++
 orc/orcopcodes.c        |   7 +++
 orc/orcprogram-c.c      |  54 +++++++++++++++++++
 orc/orcrules-sse.c      |  94 +++++++++++++++++++++++++++++++++
 orc/orcsse.c            |  14 +++++
 orc/orcsse.h            |   1 +
 8 files changed, 316 insertions(+)

diff --git a/orc/opcodes.h b/orc/opcodes.h
index 4db81a8..b324d5d 100644
--- a/orc/opcodes.h
+++ b/orc/opcodes.h
@@ -90,6 +90,7 @@ UNARY_SQ(loadpq, "%s")
 UNARY_BW(convsbw, "%s")
 UNARY_BW(convubw, "(orc_uint8)%s")
 UNARY_WB(convwb, "%s")
+UNARY_WB(convhwb, "((orc_uint16)%s)>>8")
 UNARY_WB(convssswb, "ORC_CLAMP_SB(%s)")
 UNARY_WB(convsuswb, "ORC_CLAMP_UB(%s)")
 UNARY_WB(convusswb, "ORC_CLAMP_SB((orc_uint16)%s)")
@@ -98,6 +99,7 @@ UNARY_WB(convuuswb, "ORC_CLAMP_UB((orc_uint16)%s)")
 UNARY_WL(convswl, "%s")
 UNARY_WL(convuwl, "(orc_uint16)%s")
 UNARY_LW(convlw, "%s")
+UNARY_LW(convhlw, "((orc_uint32)%s)>>16")
 UNARY_LW(convssslw, "ORC_CLAMP_SW(%s)")
 UNARY_LW(convsuslw, "ORC_CLAMP_UW(%s)")
 UNARY_LW(convusslw, "ORC_CLAMP_SW((orc_uint32)%s)")
diff --git a/orc/orcemulateopcodes.c b/orc/orcemulateopcodes.c
index 5865d0d..f8cac0a 100644
--- a/orc/orcemulateopcodes.c
+++ b/orc/orcemulateopcodes.c
@@ -1189,6 +1189,29 @@ emulate_copyw (OrcOpcodeExecutor *ex, int offset, int n)
 }
 
 void
+emulate_div255w (OrcOpcodeExecutor *ex, int offset, int n)
+{
+  int i;
+  orc_int16 * ptr0;
+  const orc_int16 * ptr4;
+  orc_int16 var32;
+  orc_int16 var33;
+
+  ptr0 = (orc_int16 *)ex->dest_ptrs[0];
+  ptr4 = (orc_int16 *)ex->src_ptrs[0];
+
+  for (i = 0; i < n; i++) {
+    /* 0: loadw */
+    var32 = ptr4[i];
+    /* 1: div255w */
+    var33 = ((orc_uint16)(((orc_uint16)(var32+128)) + (((orc_uint16)(var32+128))>>8)))>>8;
+    /* 2: storew */
+    ptr0[i] = var33;
+  }
+
+}
+
+void
 emulate_loadw (OrcOpcodeExecutor *ex, int offset, int n)
 {
   int i;
@@ -2568,6 +2591,29 @@ emulate_storeq (OrcOpcodeExecutor *ex, int offset, int n)
 }
 
 void
+emulate_splatw0q (OrcOpcodeExecutor *ex, int offset, int n)
+{
+  int i;
+  orc_union64 * ptr0;
+  const orc_union64 * ptr4;
+  orc_union64 var32;
+  orc_union64 var33;
+
+  ptr0 = (orc_union64 *)ex->dest_ptrs[0];
+  ptr4 = (orc_union64 *)ex->src_ptrs[0];
+
+  for (i = 0; i < n; i++) {
+    /* 0: loadq */
+    var32 = ptr4[i];
+    /* 1: splatw0q */
+    var33.i = ((orc_uint64)(var32.i&0xffff) << 48) | ((orc_uint64)(var32.i&0xffff)<<32) | ((var32.i&0xffff) << 16) | (var32.i&0xffff);
+    /* 2: storeq */
+    ptr0[i] = var33;
+  }
+
+}
+
+void
 emulate_convsbw (OrcOpcodeExecutor *ex, int offset, int n)
 {
   int i;
@@ -2614,6 +2660,52 @@ emulate_convubw (OrcOpcodeExecutor *ex, int offset, int n)
 }
 
 void
+emulate_splatbw (OrcOpcodeExecutor *ex, int offset, int n)
+{
+  int i;
+  orc_int16 * ptr0;
+  const orc_int8 * ptr4;
+  orc_int8 var32;
+  orc_int16 var33;
+
+  ptr0 = (orc_int16 *)ex->dest_ptrs[0];
+  ptr4 = (orc_int8 *)ex->src_ptrs[0];
+
+  for (i = 0; i < n; i++) {
+    /* 0: loadb */
+    var32 = ptr4[i];
+    /* 1: splatbw */
+    var33 = ((var32&0xff) << 8) | (var32&0xff);
+    /* 2: storew */
+    ptr0[i] = var33;
+  }
+
+}
+
+void
+emulate_splatbl (OrcOpcodeExecutor *ex, int offset, int n)
+{
+  int i;
+  orc_union32 * ptr0;
+  const orc_int8 * ptr4;
+  orc_int8 var32;
+  orc_union32 var33;
+
+  ptr0 = (orc_union32 *)ex->dest_ptrs[0];
+  ptr4 = (orc_int8 *)ex->src_ptrs[0];
+
+  for (i = 0; i < n; i++) {
+    /* 0: loadb */
+    var32 = ptr4[i];
+    /* 1: splatbl */
+    var33.i = ((orc_uint32)(var32&0xff) << 24) | ((var32&0xff)<<16) | ((var32&0xff) << 8) | (var32&0xff);
+    /* 2: storel */
+    ptr0[i] = var33;
+  }
+
+}
+
+void
 emulate_convswl (OrcOpcodeExecutor *ex, int offset, int n)
 {
   int i;
@@ -2729,6 +2821,29 @@ emulate_convwb (OrcOpcodeExecutor *ex, int offset, int n)
 }
 
 void
+emulate_convhwb (OrcOpcodeExecutor *ex, int offset, int n)
+{
+  int i;
+  orc_int8 * ptr0;
+  const orc_int16 * ptr4;
+  orc_int16 var32;
+  orc_int8 var33;
+
+  ptr0 = (orc_int8 *)ex->dest_ptrs[0];
+  ptr4 = (orc_int16 *)ex->src_ptrs[0];
+
+  for (i = 0; i < n; i++) {
+    /* 0: loadw */
+    var32 = ptr4[i];
+    /* 1: convhwb */
+    var33 = ((orc_uint16)var32)>>8;
+    /* 2: storeb */
+    ptr0[i] = var33;
+  }
+
+}
+
+void
 emulate_convssswb (OrcOpcodeExecutor *ex, int offset, int n)
 {
   int i;
@@ -2844,6 +2959,29 @@ emulate_convlw (OrcOpcodeExecutor *ex, int offset, int n)
 }
 
 void
+emulate_convhlw (OrcOpcodeExecutor *ex, int offset, int n)
+{
+  int i;
+  orc_int16 * ptr0;
+  const orc_union32 * ptr4;
+  orc_union32 var32;
+  orc_int16 var33;
+
+  ptr0 = (orc_int16 *)ex->dest_ptrs[0];
+  ptr4 = (orc_union32 *)ex->src_ptrs[0];
+
+  for (i = 0; i < n; i++) {
+    /* 0: loadl */
+    var32 = ptr4[i];
+    /* 1: convhlw */
+    var33 = ((orc_uint32)var32.i)>>16;
+    /* 2: storew */
+    ptr0[i] = var33;
+  }
+
+}
+
+void
 emulate_convssslw (OrcOpcodeExecutor *ex, int offset, int n)
 {
   int i;
diff --git a/orc/orcemulateopcodes.h b/orc/orcemulateopcodes.h
index 49d2e2d..354c8b2 100644
--- a/orc/orcemulateopcodes.h
+++ b/orc/orcemulateopcodes.h
@@ -48,6 +48,7 @@ void emulate_avguw (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_cmpeqw (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_cmpgtsw (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_copyw (OrcOpcodeExecutor *ex, int i, int n);
+void emulate_div255w (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_loadw (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_loadoffw (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_loadpw (OrcOpcodeExecutor *ex, int i, int n);
@@ -102,18 +103,23 @@ void emulate_xorl (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_loadq (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_loadpq (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_storeq (OrcOpcodeExecutor *ex, int i, int n);
+void emulate_splatw0q (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_convsbw (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_convubw (OrcOpcodeExecutor *ex, int i, int n);
+void emulate_splatbw (OrcOpcodeExecutor *ex, int i, int n);
+void emulate_splatbl (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_convswl (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_convuwl (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_convslq (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_convulq (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_convwb (OrcOpcodeExecutor *ex, int i, int n);
+void emulate_convhwb (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_convssswb (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_convsuswb (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_convusswb (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_convuuswb (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_convlw (OrcOpcodeExecutor *ex, int i, int n);
+void emulate_convhlw (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_convssslw (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_convsuslw (OrcOpcodeExecutor *ex, int i, int n);
 void emulate_convusslw (OrcOpcodeExecutor *ex, int i, int n);
diff --git a/orc/orcopcodes.c b/orc/orcopcodes.c
index b31df2c..18a25d3 100644
--- a/orc/orcopcodes.c
+++ b/orc/orcopcodes.c
@@ -331,6 +331,7 @@ static OrcStaticOpcode opcodes[] = {
   { "cmpeqw", 0, { 2 }, { 2, 2 }, emulate_cmpeqw },
   { "cmpgtsw", 0, { 2 }, { 2, 2 }, emulate_cmpgtsw },
   { "copyw", 0, { 2 }, { 2 }, emulate_copyw },
+  { "div255w", 0, { 2 }, { 2 }, emulate_div255w },
   { "loadw", ORC_STATIC_OPCODE_LOAD, { 2 }, { 2 }, emulate_loadw },
   { "loadoffw", ORC_STATIC_OPCODE_LOAD|ORC_STATIC_OPCODE_SCALAR, { 2 }, { 2, 4 }, emulate_loadoffw },
   { "loadpw", ORC_STATIC_OPCODE_LOAD|ORC_STATIC_OPCODE_SCALAR|ORC_STATIC_OPCODE_INVARIANT, { 2 }, { 2 }, emulate_loadpw },
@@ -388,21 +389,27 @@ static OrcStaticOpcode opcodes[] = {
   { "loadq", ORC_STATIC_OPCODE_LOAD, { 8 }, { 8 }, emulate_loadq },
   { "loadpq", ORC_STATIC_OPCODE_LOAD|ORC_STATIC_OPCODE_SCALAR|ORC_STATIC_OPCODE_INVARIANT, { 8 }, { 8 }, emulate_loadpq },
   { "storeq", ORC_STATIC_OPCODE_STORE, { 8 }, { 8 }, emulate_storeq },
+  { "splatw0q", 0, { 8 }, { 8 }, emulate_splatw0q },
 
   { "convsbw", 0, { 2 }, { 1 }, emulate_convsbw },
   { "convubw", 0, { 2 }, { 1 }, emulate_convubw },
+  { "splatbw", 0, { 2 }, { 1 }, emulate_splatbw },
+  { "splatbl", 0, { 4 }, { 1 }, emulate_splatbl },
+
   { "convswl", 0, { 4 }, { 2 }, emulate_convswl },
   { "convuwl", 0, { 4 }, { 2 }, emulate_convuwl },
   { "convslq", 0, { 8 }, { 4 }, emulate_convslq },
   { "convulq", 0, { 8 }, { 4 }, emulate_convulq },
 
   { "convwb", 0, { 1 }, { 2 }, emulate_convwb },
+  { "convhwb", 0, { 1 }, { 2 }, emulate_convhwb },
   { "convssswb", 0, { 1 }, { 2 }, emulate_convssswb },
   { "convsuswb", 0, { 1 }, { 2 }, emulate_convsuswb },
   { "convusswb", 0, { 1 }, { 2 }, emulate_convusswb },
   { "convuuswb", 0, { 1 }, { 2 }, emulate_convuuswb },
 
   { "convlw", 0, { 2 }, { 4 }, emulate_convlw },
+  { "convhlw", 0, { 2 }, { 4 }, emulate_convhlw },
   { "convssslw", 0, { 2 }, { 4 }, emulate_convssslw },
   { "convsuslw", 0, { 2 }, { 4 }, emulate_convsuslw },
   { "convusslw", 0, { 2 }, { 4 }, emulate_convusslw },
diff --git a/orc/orcprogram-c.c b/orc/orcprogram-c.c
index 9fce6ac..09e550a 100644
--- a/orc/orcprogram-c.c
+++ b/orc/orcprogram-c.c
@@ -782,6 +782,56 @@ c_rule_splitwb (OrcCompiler *p, void *user, OrcInstruction *insn)
   ORC_ASM_CODE(p,"    %s = %s & 0xff;\n", dest2, src);
 }
 
+static void
+c_rule_splatbw (OrcCompiler *p, void *user, OrcInstruction *insn)
+{
+  char dest[20], src[20];
+
+  c_get_name_int (dest, p, insn->dest_args[0]);
+  c_get_name_int (src, p, insn->src_args[0]);
+
+  ORC_ASM_CODE(p,"    %s = ((%s&0xff) << 8) | (%s&0xff);\n", dest, src, src);
+}
+
+static void
+c_rule_splatbl (OrcCompiler *p, void *user, OrcInstruction *insn)
+{
+  char dest[20], src[20];
+
+  c_get_name_int (dest, p, insn->dest_args[0]);
+  c_get_name_int (src, p, insn->src_args[0]);
+
+  ORC_ASM_CODE(p,
+      "    %s = ((orc_uint32)(%s&0xff) << 24) | ((%s&0xff)<<16) | ((%s&0xff) << 8) | (%s&0xff);\n",
+      dest, src, src, src, src);
+}
+
+static void
+c_rule_splatw0q (OrcCompiler *p, void *user, OrcInstruction *insn)
+{
+  char dest[20], src[20];
+
+  c_get_name_int (dest, p, insn->dest_args[0]);
+  c_get_name_int (src, p, insn->src_args[0]);
+
+  ORC_ASM_CODE(p,
+      "    %s = ((orc_uint64)(%s&0xffff) << 48) | ((orc_uint64)(%s&0xffff)<<32) | ((%s&0xffff) << 16) | (%s&0xffff);\n",
+      dest, src, src, src, src);
+}
+
+static void
+c_rule_div255w (OrcCompiler *p, void *user, OrcInstruction *insn)
+{
+  char dest[20], src[20];
+
+  c_get_name_int (dest, p, insn->dest_args[0]);
+  c_get_name_int (src, p, insn->src_args[0]);
+
+  ORC_ASM_CODE(p,
+      "    %s = ((orc_uint16)(((orc_uint16)(%s+128)) + (((orc_uint16)(%s+128))>>8)))>>8;\n",
+      dest, src, src);
+}
+
 static OrcTarget c_target = {
   "c",
   FALSE,
@@ -858,5 +908,9 @@ orc_c_init (void)
   orc_rule_register (rule_set, "accsadubl", c_rule_accsadubl, NULL);
   orc_rule_register (rule_set, "splitlw", c_rule_splitlw, NULL);
   orc_rule_register (rule_set, "splitwb", c_rule_splitwb, NULL);
+  orc_rule_register (rule_set, "splatbw", c_rule_splatbw, NULL);
+  orc_rule_register (rule_set, "splatbl", c_rule_splatbl, NULL);
+  orc_rule_register (rule_set, "splatw0q", c_rule_splatw0q, NULL);
+  orc_rule_register (rule_set, "div255w", c_rule_div255w, NULL);
 }
 
diff --git a/orc/orcrules-sse.c b/orc/orcrules-sse.c
index 0f514b1..8dfde43 100644
--- a/orc/orcrules-sse.c
+++ b/orc/orcrules-sse.c
@@ -802,6 +802,22 @@ sse_rule_convwb (OrcCompiler *p, void *user, OrcInstruction *insn)
 }
 
 static void
+sse_rule_convhwb (OrcCompiler *p, void *user, OrcInstruction *insn)
+{
+  int src = p->vars[insn->src_args[0]].alloc;
+  int dest = p->vars[insn->dest_args[0]].alloc;
+
+  /* FIXME slow */
+
+  if (dest != src) {
+    orc_sse_emit_movdqa (p, src, dest);
+  }
+
+  orc_sse_emit_psrlw (p, 8, dest);
+  orc_sse_emit_packuswb (p, dest, dest);
+}
+
+static void
 sse_rule_convswl (OrcCompiler *p, void *user, OrcInstruction *insn)
 {
   int src = p->vars[insn->src_args[0]].alloc;
@@ -849,6 +865,22 @@ sse_rule_convlw (OrcCompiler *p, void *user, OrcInstruction *insn)
 }
 
 static void
+sse_rule_convhlw (OrcCompiler *p, void *user, OrcInstruction *insn)
+{
+  int src = p->vars[insn->src_args[0]].alloc;
+  int dest = p->vars[insn->dest_args[0]].alloc;
+
+  /* FIXME slow */
+
+  if (dest != src) {
+    orc_sse_emit_movdqa (p, src, dest);
+  }
+
+  orc_sse_emit_psrad (p, 16, dest);
+  orc_sse_emit_packssdw (p, dest, dest);
+}
+
+static void
 sse_rule_convssslw (OrcCompiler *p, void *user, OrcInstruction *insn)
 {
   int src = p->vars[insn->src_args[0]].alloc;
@@ -907,6 +939,62 @@ sse_rule_convql (OrcCompiler *p, void *user, OrcInstruction *insn)
 }
 
 static void
+sse_rule_splatw0q (OrcCompiler *p, void *user, OrcInstruction *insn)
+{
+  int src = p->vars[insn->src_args[0]].alloc;
+  int dest = p->vars[insn->dest_args[0]].alloc;
+
+  if (src != dest) {
+    orc_sse_emit_movdqa (p, src, dest);
+  }
+  orc_sse_emit_pshuflw (p, ORC_SSE_SHUF(0,0,0,0), dest, dest);
+  orc_sse_emit_pshufhw (p, ORC_SSE_SHUF(0,0,0,0), dest, dest);
+}
+
+static void
+sse_rule_splatbw (OrcCompiler *p, void *user, OrcInstruction *insn)
+{
+  int src = p->vars[insn->src_args[0]].alloc;
+  int dest = p->vars[insn->dest_args[0]].alloc;
+
+  if (src != dest) {
+    orc_sse_emit_movdqa (p, src, dest);
+  }
+  orc_sse_emit_punpcklbw (p, dest, dest);
+}
+
+static void
+sse_rule_splatbl (OrcCompiler *p, void *user, OrcInstruction *insn)
+{
+  int src = p->vars[insn->src_args[0]].alloc;
+  int dest = p->vars[insn->dest_args[0]].alloc;
+
+  if (src != dest) {
+    orc_sse_emit_movdqa (p, src, dest);
+  }
+  orc_sse_emit_punpcklbw (p, dest, dest);
+  orc_sse_emit_punpcklwd (p, dest, dest);
+}
+
+static void
+sse_rule_div255w (OrcCompiler *p, void *user, OrcInstruction *insn)
+{
+  int src = p->vars[insn->src_args[0]].alloc;
+  int dest = p->vars[insn->dest_args[0]].alloc;
+  int tmp;
+
+  if (src != dest) {
+    orc_sse_emit_movdqa (p, src, dest);
+  }
+  tmp = orc_compiler_get_constant (p, 2, 0x0080);
+  orc_sse_emit_paddw (p, tmp, dest);
+  orc_sse_emit_movdqa (p, dest, tmp);
+  orc_sse_emit_psrlw (p, 8, tmp);
+  orc_sse_emit_paddw (p, tmp, dest);
+  orc_sse_emit_psrlw (p, 8, dest);
+}
+
+static void
 sse_rule_mulsbw (OrcCompiler *p, void *user, OrcInstruction *insn)
 {
   int src = p->vars[insn->src_args[1]].alloc;
@@ -2102,6 +2190,12 @@ orc_compiler_sse_register_rules (OrcTarget *target)
   orc_rule_register (rule_set, "subssl", sse_rule_subssl_slow, NULL);
   orc_rule_register (rule_set, "addusl", sse_rule_addusl_slow, NULL);
   orc_rule_register (rule_set, "subusl", sse_rule_subusl_slow, NULL);
+  orc_rule_register (rule_set, "convhwb", sse_rule_convhwb, NULL);
+  orc_rule_register (rule_set, "convhlw", sse_rule_convhlw, NULL);
+  orc_rule_register (rule_set, "splatw0q", sse_rule_splatw0q, NULL);
+  orc_rule_register (rule_set, "splatbw", sse_rule_splatbw, NULL);
+  orc_rule_register (rule_set, "splatbl", sse_rule_splatbl, NULL);
+  orc_rule_register (rule_set, "div255w", sse_rule_div255w, NULL);
 
   /* SSE 3 -- no rules */
 
diff --git a/orc/orcsse.c b/orc/orcsse.c
index b6edd27..89f0dfd 100644
--- a/orc/orcsse.c
+++ b/orc/orcsse.c
@@ -110,6 +110,20 @@ orc_sse_emit_pshufd (OrcCompiler *p, int shuf, int src, int dest)
 }
 
 void
+orc_sse_emit_pshufhw (OrcCompiler *p, int shuf, int src, int dest)
+{
+  ORC_ASM_CODE(p,"  pshufhw $0x%04x, %%%s, %%%s\n", shuf,
+      orc_x86_get_regname_sse(src),
+      orc_x86_get_regname_sse(dest));
+  *p->codeptr++ = 0xf3;
+  orc_x86_emit_rex (p, 0, dest, 0, src);
+  *p->codeptr++ = 0x0f;
+  *p->codeptr++ = 0x70;
+  orc_x86_emit_modrm_reg (p, src, dest);
+  *p->codeptr++ = shuf;
+}
+
+void
 orc_sse_emit_pshuflw (OrcCompiler *p, int shuf, int src, int dest)
 {
   ORC_ASM_CODE(p,"  pshuflw $0x%04x, %%%s, %%%s\n", shuf,
diff --git a/orc/orcsse.h b/orc/orcsse.h
index a32ab13..c7583bd 100644
--- a/orc/orcsse.h
+++ b/orc/orcsse.h
@@ -64,6 +64,7 @@ void orc_sse_emit_0f (OrcCompiler *p, const char *insn_name, int code,
     int src, int dest);
 void orc_sse_emit_pshufd (OrcCompiler *p, int shuf, int src, int dest);
 void orc_sse_emit_pshuflw (OrcCompiler *p, int shuf, int src, int dest);
+void orc_sse_emit_pshufhw (OrcCompiler *p, int shuf, int src, int dest);
 void orc_sse_emit_shiftimm (OrcCompiler *p, const char *insn_name, int code,
     int modrm_code, int shift, int reg);
 
-- 
2.7.4