From 4eb3225b38ce12cb34ab3d90804c9683bd7b4ed3 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Jos=C3=A9=20Fonseca?= Date: Tue, 8 Nov 2011 00:10:47 +0000 Subject: [PATCH] Remove tgsi_sse2. tgsi_exec is simple. llvm is fast. tgsi_sse2 ends up being neither. --- src/gallium/auxiliary/Makefile.sources | 5 - src/gallium/auxiliary/draw/draw_private.h | 4 - src/gallium/auxiliary/draw/draw_vs.c | 27 +- src/gallium/auxiliary/draw/draw_vs.h | 20 - src/gallium/auxiliary/draw/draw_vs_aos.c | 2267 ---------------- src/gallium/auxiliary/draw/draw_vs_aos.h | 255 -- src/gallium/auxiliary/draw/draw_vs_aos_io.c | 460 ---- src/gallium/auxiliary/draw/draw_vs_aos_machine.c | 328 --- src/gallium/auxiliary/draw/draw_vs_ppc.c | 7 +- src/gallium/auxiliary/draw/draw_vs_sse.c | 225 -- src/gallium/auxiliary/tgsi/tgsi_sse2.c | 3106 ---------------------- src/gallium/auxiliary/tgsi/tgsi_sse2.h | 80 - src/gallium/drivers/softpipe/Android.mk | 1 - src/gallium/drivers/softpipe/Makefile | 1 - src/gallium/drivers/softpipe/SConscript | 1 - src/gallium/drivers/softpipe/sp_context.c | 6 - src/gallium/drivers/softpipe/sp_context.h | 1 - src/gallium/drivers/softpipe/sp_fs.h | 4 - src/gallium/drivers/softpipe/sp_fs_sse.c | 248 -- src/gallium/drivers/softpipe/sp_state_shader.c | 5 +- 20 files changed, 3 insertions(+), 7048 deletions(-) delete mode 100644 src/gallium/auxiliary/draw/draw_vs_aos.c delete mode 100644 src/gallium/auxiliary/draw/draw_vs_aos.h delete mode 100644 src/gallium/auxiliary/draw/draw_vs_aos_io.c delete mode 100644 src/gallium/auxiliary/draw/draw_vs_aos_machine.c delete mode 100644 src/gallium/auxiliary/draw/draw_vs_sse.c delete mode 100644 src/gallium/auxiliary/tgsi/tgsi_sse2.c delete mode 100644 src/gallium/auxiliary/tgsi/tgsi_sse2.h delete mode 100644 src/gallium/drivers/softpipe/sp_fs_sse.c diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources index 766beb0..baded90 100644 --- a/src/gallium/auxiliary/Makefile.sources +++ b/src/gallium/auxiliary/Makefile.sources @@ -33,12 +33,8 @@ C_SOURCES := \ draw/draw_pt_vsplit.c \ draw/draw_vertex.c \ draw/draw_vs.c \ - draw/draw_vs_aos.c \ - draw/draw_vs_aos_io.c \ - draw/draw_vs_aos_machine.c \ draw/draw_vs_exec.c \ draw/draw_vs_ppc.c \ - draw/draw_vs_sse.c \ draw/draw_vs_variant.c \ os/os_misc.c \ os/os_stream.c \ @@ -83,7 +79,6 @@ C_SOURCES := \ tgsi/tgsi_ppc.c \ tgsi/tgsi_sanity.c \ tgsi/tgsi_scan.c \ - tgsi/tgsi_sse2.c \ tgsi/tgsi_text.c \ tgsi/tgsi_transform.c \ tgsi/tgsi_ureg.c \ diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h index b84d2b7..3521a03 100644 --- a/src/gallium/auxiliary/draw/draw_private.h +++ b/src/gallium/auxiliary/draw/draw_private.h @@ -237,10 +237,6 @@ struct draw_context uint num_samplers; struct tgsi_sampler **samplers; - /* Here's another one: - */ - struct aos_machine *aos_machine; - const void *aligned_constants[PIPE_MAX_CONSTANT_BUFFERS]; diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c index 1763dbc..957bbe5 100644 --- a/src/gallium/auxiliary/draw/draw_vs.c +++ b/src/gallium/auxiliary/draw/draw_vs.c @@ -81,14 +81,12 @@ draw_vs_set_constants(struct draw_context *draw, } draw->vs.aligned_constants[slot] = constants; - draw_vs_aos_machine_constants(draw->vs.aos_machine, slot, constants); } void draw_vs_set_viewport( struct draw_context *draw, const struct pipe_viewport_state *viewport ) { - draw_vs_aos_machine_viewport( draw->vs.aos_machine, viewport ); } @@ -103,22 +101,8 @@ draw_create_vertex_shader(struct draw_context *draw, tgsi_dump(shader->tokens, 0); } - if (!draw->pt.middle.llvm) { -#if 0 -/* these paths don't support vertex clamping - * TODO: either add it, or remove them completely - * use LLVM instead if you want performance - * use exec instead if you want debugging/more correctness - */ -#if defined(PIPE_ARCH_X86) - vs = draw_create_vs_sse( draw, shader ); -#elif defined(PIPE_ARCH_PPC) - vs = draw_create_vs_ppc( draw, shader ); -#endif -#endif - } #if HAVE_LLVM - else { + if (draw->pt.middle.llvm) { vs = draw_create_vs_llvm(draw, shader); } #endif @@ -199,12 +183,6 @@ draw_vs_init( struct draw_context *draw ) if (!draw->vs.fetch_cache) return FALSE; - draw->vs.aos_machine = draw_vs_aos_machine(); -#ifdef PIPE_ARCH_X86 - if (!draw->vs.aos_machine) - return FALSE; -#endif - return TRUE; } @@ -219,9 +197,6 @@ draw_vs_destroy( struct draw_context *draw ) if (draw->vs.emit_cache) translate_cache_destroy(draw->vs.emit_cache); - if (draw->vs.aos_machine) - draw_vs_aos_machine_destroy(draw->vs.aos_machine); - for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { if (draw->vs.aligned_constant_storage[i]) { align_free((void *)draw->vs.aligned_constant_storage[i]); diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h index e6d187e..49229f8 100644 --- a/src/gallium/auxiliary/draw/draw_vs.h +++ b/src/gallium/auxiliary/draw/draw_vs.h @@ -159,10 +159,6 @@ draw_create_vs_exec(struct draw_context *draw, const struct pipe_shader_state *templ); struct draw_vertex_shader * -draw_create_vs_sse(struct draw_context *draw, - const struct pipe_shader_state *templ); - -struct draw_vertex_shader * draw_create_vs_ppc(struct draw_context *draw, const struct pipe_shader_state *templ); @@ -170,10 +166,6 @@ draw_create_vs_ppc(struct draw_context *draw, struct draw_vs_variant_key; struct draw_vertex_shader; -struct draw_vs_variant * -draw_vs_create_variant_aos_sse( struct draw_vertex_shader *vs, - const struct draw_vs_variant_key *key ); - #if HAVE_LLVM struct draw_vertex_shader * draw_create_vs_llvm(struct draw_context *draw, @@ -214,18 +206,6 @@ static INLINE int draw_vs_variant_key_compare( const struct draw_vs_variant_key } -struct aos_machine *draw_vs_aos_machine( void ); -void draw_vs_aos_machine_destroy( struct aos_machine *machine ); - -void -draw_vs_aos_machine_constants(struct aos_machine *machine, - unsigned slot, - const void *constants); - -void draw_vs_aos_machine_viewport( struct aos_machine *machine, - const struct pipe_viewport_state *viewport ); - - #define MAX_TGSI_VERTICES 4 diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c deleted file mode 100644 index 7b90dba..0000000 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ /dev/null @@ -1,2267 +0,0 @@ -/* - * Mesa 3-D graphics library - * Version: 6.3 - * - * Copyright (C) 1999-2004 Brian Paul All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN - * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code - * using the rtasm runtime assembler. Based on the old - * t_vb_arb_program_sse.c - */ - - -#include "util/u_memory.h" -#include "util/u_math.h" -#include "pipe/p_shader_tokens.h" -#include "util/u_debug.h" -#include "tgsi/tgsi_parse.h" -#include "tgsi/tgsi_util.h" -#include "tgsi/tgsi_exec.h" -#include "tgsi/tgsi_dump.h" - -#include "draw_vs.h" -#include "draw_vs_aos.h" - -#include "rtasm/rtasm_x86sse.h" - -#ifdef PIPE_ARCH_X86 -#define DISASSEM 0 -#define FAST_MATH 1 - -static const char *files[] = -{ - "NULL", - "CONST", - "IN", - "OUT", - "TEMP", - "SAMP", - "ADDR", - "IMM", - "INTERNAL", -}; - -static INLINE boolean eq( struct x86_reg a, - struct x86_reg b ) -{ - return (a.file == b.file && - a.idx == b.idx && - a.mod == b.mod && - a.disp == b.disp); -} - -struct x86_reg aos_get_x86( struct aos_compilation *cp, - unsigned which_reg, /* quick hack */ - unsigned value ) -{ - struct x86_reg reg; - - if (which_reg == 0) - reg = cp->temp_EBP; - else - reg = cp->tmp_EAX; - - if (cp->x86_reg[which_reg] != value) { - unsigned offset; - - switch (value) { - case X86_IMMEDIATES: - assert(which_reg == 0); - offset = Offset(struct aos_machine, immediates); - break; - case X86_CONSTANTS: - assert(which_reg == 1); - offset = Offset(struct aos_machine, constants); - break; - case X86_BUFFERS: - assert(which_reg == 0); - offset = Offset(struct aos_machine, buffer); - break; - default: - assert(0); - offset = 0; - } - - - x86_mov(cp->func, reg, - x86_make_disp(cp->machine_EDX, offset)); - - cp->x86_reg[which_reg] = value; - } - - return reg; -} - - -static struct x86_reg get_reg_ptr(struct aos_compilation *cp, - unsigned file, - unsigned idx ) -{ - struct x86_reg ptr = cp->machine_EDX; - - switch (file) { - case TGSI_FILE_INPUT: - assert(idx < MAX_INPUTS); - return x86_make_disp(ptr, Offset(struct aos_machine, input[idx])); - - case TGSI_FILE_OUTPUT: - return x86_make_disp(ptr, Offset(struct aos_machine, output[idx])); - - case TGSI_FILE_TEMPORARY: - assert(idx < MAX_TEMPS); - return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx])); - - case AOS_FILE_INTERNAL: - assert(idx < MAX_INTERNALS); - return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx])); - - case TGSI_FILE_IMMEDIATE: - assert(idx < MAX_IMMEDIATES); /* just a sanity check */ - return x86_make_disp(aos_get_x86(cp, 0, X86_IMMEDIATES), idx * 4 * sizeof(float)); - - case TGSI_FILE_CONSTANT: - assert(idx < MAX_CONSTANTS); /* just a sanity check */ - return x86_make_disp(aos_get_x86(cp, 1, X86_CONSTANTS), idx * 4 * sizeof(float)); - - default: - AOS_ERROR(cp, "unknown reg file"); - return x86_make_reg(0,0); - } -} - - - -#define X87_CW_EXCEPTION_INV_OP (1<<0) -#define X87_CW_EXCEPTION_DENORM_OP (1<<1) -#define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2) -#define X87_CW_EXCEPTION_OVERFLOW (1<<3) -#define X87_CW_EXCEPTION_UNDERFLOW (1<<4) -#define X87_CW_EXCEPTION_PRECISION (1<<5) -#define X87_CW_PRECISION_SINGLE (0<<8) -#define X87_CW_PRECISION_RESERVED (1<<8) -#define X87_CW_PRECISION_DOUBLE (2<<8) -#define X87_CW_PRECISION_DOUBLE_EXT (3<<8) -#define X87_CW_PRECISION_MASK (3<<8) -#define X87_CW_ROUND_NEAREST (0<<10) -#define X87_CW_ROUND_DOWN (1<<10) -#define X87_CW_ROUND_UP (2<<10) -#define X87_CW_ROUND_ZERO (3<<10) -#define X87_CW_ROUND_MASK (3<<10) -#define X87_CW_INFINITY (1<<12) - - - - -static void spill( struct aos_compilation *cp, unsigned idx ) -{ - if (!cp->xmm[idx].dirty || - (cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */ - cp->xmm[idx].file != TGSI_FILE_OUTPUT && - cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) { - AOS_ERROR(cp, "invalid spill"); - return; - } - else { - struct x86_reg oldval = get_reg_ptr(cp, - cp->xmm[idx].file, - cp->xmm[idx].idx); - - if (0) debug_printf("\nspill %s[%d]", - files[cp->xmm[idx].file], - cp->xmm[idx].idx); - - assert(cp->xmm[idx].dirty); - sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx)); - cp->xmm[idx].dirty = 0; - } -} - - -void aos_spill_all( struct aos_compilation *cp ) -{ - unsigned i; - - for (i = 0; i < 8; i++) { - if (cp->xmm[i].dirty) - spill(cp, i); - aos_release_xmm_reg(cp, i); - } -} - - -static struct x86_reg get_xmm_writable( struct aos_compilation *cp, - struct x86_reg reg ) -{ - if (reg.file != file_XMM || - cp->xmm[reg.idx].file != TGSI_FILE_NULL) - { - struct x86_reg tmp = aos_get_xmm_reg(cp); - sse_movaps(cp->func, tmp, reg); - reg = tmp; - } - - cp->xmm[reg.idx].last_used = cp->insn_counter; - return reg; -} - -static struct x86_reg get_xmm( struct aos_compilation *cp, - struct x86_reg reg ) -{ - if (reg.file != file_XMM) - { - struct x86_reg tmp = aos_get_xmm_reg(cp); - sse_movaps(cp->func, tmp, reg); - reg = tmp; - } - - cp->xmm[reg.idx].last_used = cp->insn_counter; - return reg; -} - - -/* Allocate an empty xmm register, either as a temporary or later to - * "adopt" as a shader reg. - */ -struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp ) -{ - unsigned i; - unsigned oldest = 0; - boolean found = FALSE; - - for (i = 0; i < 8; i++) - if (cp->xmm[i].last_used != cp->insn_counter && - cp->xmm[i].file == TGSI_FILE_NULL) { - oldest = i; - found = TRUE; - } - - if (!found) { - for (i = 0; i < 8; i++) - if (cp->xmm[i].last_used < cp->xmm[oldest].last_used) - oldest = i; - } - - /* Need to write out the old value? - */ - if (cp->xmm[oldest].dirty) - spill(cp, oldest); - - assert(cp->xmm[oldest].last_used != cp->insn_counter); - - cp->xmm[oldest].file = TGSI_FILE_NULL; - cp->xmm[oldest].idx = 0; - cp->xmm[oldest].dirty = 0; - cp->xmm[oldest].last_used = cp->insn_counter; - return x86_make_reg(file_XMM, oldest); -} - -void aos_release_xmm_reg( struct aos_compilation *cp, - unsigned idx ) -{ - cp->xmm[idx].file = TGSI_FILE_NULL; - cp->xmm[idx].idx = 0; - cp->xmm[idx].dirty = 0; - cp->xmm[idx].last_used = 0; -} - - -static void aos_soft_release_xmm( struct aos_compilation *cp, - struct x86_reg reg ) -{ - if (reg.file == file_XMM) { - assert(cp->xmm[reg.idx].last_used == cp->insn_counter); - cp->xmm[reg.idx].last_used = cp->insn_counter - 1; - } -} - - - -/* Mark an xmm reg as holding the current copy of a shader reg. - */ -void aos_adopt_xmm_reg( struct aos_compilation *cp, - struct x86_reg reg, - unsigned file, - unsigned idx, - unsigned dirty ) -{ - unsigned i; - - if (reg.file != file_XMM) { - assert(0); - return; - } - - - /* If any xmm reg thinks it holds this shader reg, break the - * illusion. - */ - for (i = 0; i < 8; i++) { - if (cp->xmm[i].file == file && - cp->xmm[i].idx == idx) - { - /* If an xmm reg is already holding this shader reg, take into account its - * dirty flag... - */ - dirty |= cp->xmm[i].dirty; - aos_release_xmm_reg(cp, i); - } - } - - cp->xmm[reg.idx].file = file; - cp->xmm[reg.idx].idx = idx; - cp->xmm[reg.idx].dirty = dirty; - cp->xmm[reg.idx].last_used = cp->insn_counter; -} - - -/* Return a pointer to the in-memory copy of the reg, making sure it is uptodate. - */ -static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp, - unsigned file, - unsigned idx ) -{ - unsigned i; - - /* Ensure the in-memory copy of this reg is up-to-date - */ - for (i = 0; i < 8; i++) { - if (cp->xmm[i].file == file && - cp->xmm[i].idx == idx && - cp->xmm[i].dirty) { - spill(cp, i); - } - } - - return get_reg_ptr( cp, file, idx ); -} - - -/* As above, but return a pointer. Note - this pointer may alias - * those returned by get_arg_ptr(). - */ -static struct x86_reg get_dst_ptr( struct aos_compilation *cp, - const struct tgsi_full_dst_register *dst ) -{ - unsigned file = dst->Register.File; - unsigned idx = dst->Register.Index; - unsigned i; - - - /* Ensure in-memory copy of this reg is up-to-date and invalidate - * any xmm copies. - */ - for (i = 0; i < 8; i++) { - if (cp->xmm[i].file == file && - cp->xmm[i].idx == idx) - { - if (cp->xmm[i].dirty) - spill(cp, i); - - aos_release_xmm_reg(cp, i); - } - } - - return get_reg_ptr( cp, file, idx ); -} - - - - - -/* Return an XMM reg if the argument is resident, otherwise return a - * base+offset pointer to the saved value. - */ -struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, - unsigned file, - unsigned idx ) -{ - unsigned i; - - for (i = 0; i < 8; i++) { - if (cp->xmm[i].file == file && - cp->xmm[i].idx == idx) - { - cp->xmm[i].last_used = cp->insn_counter; - return x86_make_reg(file_XMM, i); - } - } - - /* If not found in the XMM register file, return an indirect - * reference to the in-memory copy: - */ - return get_reg_ptr( cp, file, idx ); -} - - - -static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp, - unsigned file, - unsigned idx ) -{ - struct x86_reg reg = get_xmm( cp, - aos_get_shader_reg( cp, file, idx ) ); - - aos_adopt_xmm_reg( cp, - reg, - file, - idx, - FALSE ); - - return reg; -} - - - -struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp, - unsigned imm ) -{ - return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm ); -} - - -struct x86_reg aos_get_internal( struct aos_compilation *cp, - unsigned imm ) -{ - return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm ); -} - - - - - -/* Emulate pshufd insn in regular SSE, if necessary: - */ -static void emit_pshufd( struct aos_compilation *cp, - struct x86_reg dst, - struct x86_reg arg0, - ubyte shuf ) -{ - if (cp->have_sse2) { - sse2_pshufd(cp->func, dst, arg0, shuf); - } - else { - if (!eq(dst, arg0)) - sse_movaps(cp->func, dst, arg0); - - sse_shufps(cp->func, dst, dst, shuf); - } -} - -/* load masks (pack into negs??) - * pshufd - shuffle according to writemask - * and - result, mask - * nand - dest, mask - * or - dest, result - */ -static boolean mask_write( struct aos_compilation *cp, - struct x86_reg dst, - struct x86_reg result, - unsigned mask ) -{ - struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ); - struct x86_reg tmp = aos_get_xmm_reg(cp); - - emit_pshufd(cp, tmp, imm_swz, - SHUF((mask & 1) ? 2 : 3, - (mask & 2) ? 2 : 3, - (mask & 4) ? 2 : 3, - (mask & 8) ? 2 : 3)); - - sse_andps(cp->func, dst, tmp); - sse_andnps(cp->func, tmp, result); - sse_orps(cp->func, dst, tmp); - - aos_release_xmm_reg(cp, tmp.idx); - return TRUE; -} - - - - -/* Helper for writemask: - */ -static boolean emit_shuf_copy2( struct aos_compilation *cp, - struct x86_reg dst, - struct x86_reg arg0, - struct x86_reg arg1, - ubyte shuf ) -{ - struct x86_reg tmp = aos_get_xmm_reg(cp); - - emit_pshufd(cp, dst, arg1, shuf); - emit_pshufd(cp, tmp, arg0, shuf); - sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W)); - emit_pshufd(cp, dst, dst, shuf); - - aos_release_xmm_reg(cp, tmp.idx); - return TRUE; -} - - - -#define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6)) - - -/* Locate a source register and perform any required (simple) swizzle. - * - * Just fail on complex swizzles at this point. - */ -static struct x86_reg fetch_src( struct aos_compilation *cp, - const struct tgsi_full_src_register *src ) -{ - struct x86_reg arg0 = aos_get_shader_reg(cp, - src->Register.File, - src->Register.Index); - unsigned i; - ubyte swz = 0; - unsigned negs = 0; - unsigned abs = 0; - - for (i = 0; i < 4; i++) { - unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, i ); - unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i ); - - swz |= (swizzle & 0x3) << (i * 2); - - switch (neg) { - case TGSI_UTIL_SIGN_TOGGLE: - negs |= (1<func, dst, arg0); - - if (negs && negs != 0xf) { - struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ); - struct x86_reg tmp = aos_get_xmm_reg(cp); - - /* Load 1,-1,0,0 - * Use neg as arg to pshufd - * Multiply - */ - emit_pshufd(cp, tmp, imm_swz, - SHUF((negs & 1) ? 1 : 0, - (negs & 2) ? 1 : 0, - (negs & 4) ? 1 : 0, - (negs & 8) ? 1 : 0)); - sse_mulps(cp->func, dst, tmp); - - aos_release_xmm_reg(cp, tmp.idx); - aos_soft_release_xmm(cp, imm_swz); - } - else if (negs) { - struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS); - sse_mulps(cp->func, dst, imm_negs); - aos_soft_release_xmm(cp, imm_negs); - } - - - if (abs && abs != 0xf) { - AOS_ERROR(cp, "unsupported partial abs"); - } - else if (abs) { - struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); - struct x86_reg tmp = aos_get_xmm_reg(cp); - - sse_movaps(cp->func, tmp, dst); - sse_mulps(cp->func, tmp, neg); - sse_maxps(cp->func, dst, tmp); - - aos_release_xmm_reg(cp, tmp.idx); - aos_soft_release_xmm(cp, neg); - } - - aos_soft_release_xmm(cp, arg0); - return dst; - } - - return arg0; -} - -static void x87_fld_src( struct aos_compilation *cp, - const struct tgsi_full_src_register *src, - unsigned channel ) -{ - struct x86_reg arg0 = aos_get_shader_reg_ptr(cp, - src->Register.File, - src->Register.Index); - - unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, channel ); - unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel ); - - x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) ); - - switch (neg) { - case TGSI_UTIL_SIGN_TOGGLE: - /* Flip the sign: - */ - x87_fchs( cp->func ); - break; - - case TGSI_UTIL_SIGN_KEEP: - break; - - case TGSI_UTIL_SIGN_CLEAR: - x87_fabs( cp->func ); - break; - - case TGSI_UTIL_SIGN_SET: - x87_fabs( cp->func ); - x87_fchs( cp->func ); - break; - - default: - AOS_ERROR(cp, "unsupported sign-mode"); - break; - } -} - - - - - - -/* Used to implement write masking. This and most of the other instructions - * here would be easier to implement if there had been a translation - * to a 2 argument format (dst/arg0, arg1) at the shader level before - * attempting to translate to x86/sse code. - */ -static void store_dest( struct aos_compilation *cp, - const struct tgsi_full_dst_register *reg, - struct x86_reg result ) -{ - struct x86_reg dst; - - switch (reg->Register.WriteMask) { - case 0: - return; - - case TGSI_WRITEMASK_XYZW: - aos_adopt_xmm_reg(cp, - get_xmm_writable(cp, result), - reg->Register.File, - reg->Register.Index, - TRUE); - return; - default: - break; - } - - dst = aos_get_shader_reg_xmm(cp, - reg->Register.File, - reg->Register.Index); - - switch (reg->Register.WriteMask) { - case TGSI_WRITEMASK_X: - sse_movss(cp->func, dst, get_xmm(cp, result)); - break; - - case TGSI_WRITEMASK_ZW: - sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W)); - break; - - case TGSI_WRITEMASK_XY: - result = get_xmm_writable(cp, result); - sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W)); - dst = result; - break; - - case TGSI_WRITEMASK_YZW: - result = get_xmm_writable(cp, result); - sse_movss(cp->func, result, dst); - dst = result; - break; - - default: - mask_write(cp, dst, result, reg->Register.WriteMask); - break; - } - - aos_adopt_xmm_reg(cp, - dst, - reg->Register.File, - reg->Register.Index, - TRUE); - -} - -static void inject_scalar( struct aos_compilation *cp, - struct x86_reg dst, - struct x86_reg result, - ubyte swizzle ) -{ - sse_shufps(cp->func, dst, dst, swizzle); - sse_movss(cp->func, dst, result); - sse_shufps(cp->func, dst, dst, swizzle); -} - - -static void store_scalar_dest( struct aos_compilation *cp, - const struct tgsi_full_dst_register *reg, - struct x86_reg result ) -{ - unsigned writemask = reg->Register.WriteMask; - struct x86_reg dst; - - if (writemask != TGSI_WRITEMASK_X && - writemask != TGSI_WRITEMASK_Y && - writemask != TGSI_WRITEMASK_Z && - writemask != TGSI_WRITEMASK_W && - writemask != 0) - { - result = get_xmm_writable(cp, result); /* already true, right? */ - sse_shufps(cp->func, result, result, SHUF(X,X,X,X)); - store_dest(cp, reg, result); - return; - } - - result = get_xmm(cp, result); - dst = aos_get_shader_reg_xmm(cp, - reg->Register.File, - reg->Register.Index); - - - - switch (reg->Register.WriteMask) { - case TGSI_WRITEMASK_X: - sse_movss(cp->func, dst, result); - break; - - case TGSI_WRITEMASK_Y: - inject_scalar(cp, dst, result, SHUF(Y, X, Z, W)); - break; - - case TGSI_WRITEMASK_Z: - inject_scalar(cp, dst, result, SHUF(Z, Y, X, W)); - break; - - case TGSI_WRITEMASK_W: - inject_scalar(cp, dst, result, SHUF(W, Y, Z, X)); - break; - - default: - break; - } - - aos_adopt_xmm_reg(cp, - dst, - reg->Register.File, - reg->Register.Index, - TRUE); -} - - - -static void x87_fst_or_nop( struct x86_function *func, - unsigned writemask, - unsigned channel, - struct x86_reg ptr ) -{ - assert(ptr.file == file_REG32); - if (writemask & (1<Register.WriteMask; - - x87_fst_or_nop(cp->func, writemask, 0, ptr); - x87_fst_or_nop(cp->func, writemask, 1, ptr); - x87_fst_or_nop(cp->func, writemask, 2, ptr); - x87_fstp_or_pop(cp->func, writemask, 3, ptr); -} - -/* Save current x87 state and put it into single precision mode. - */ -static void save_fpu_state( struct aos_compilation *cp ) -{ - x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX, - Offset(struct aos_machine, fpu_restore))); -} - -static void restore_fpu_state( struct aos_compilation *cp ) -{ - x87_fnclex(cp->func); - x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, - Offset(struct aos_machine, fpu_restore))); -} - -static void set_fpu_round_neg_inf( struct aos_compilation *cp ) -{ - if (cp->fpucntl != FPU_RND_NEG) { - cp->fpucntl = FPU_RND_NEG; - x87_fnclex(cp->func); - x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, - Offset(struct aos_machine, fpu_rnd_neg_inf))); - } -} - -static void set_fpu_round_nearest( struct aos_compilation *cp ) -{ - if (cp->fpucntl != FPU_RND_NEAREST) { - cp->fpucntl = FPU_RND_NEAREST; - x87_fnclex(cp->func); - x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, - Offset(struct aos_machine, fpu_rnd_nearest))); - } -} - -#if 0 -static void x87_emit_ex2( struct aos_compilation *cp ) -{ - struct x86_reg st0 = x86_make_reg(file_x87, 0); - struct x86_reg st1 = x86_make_reg(file_x87, 1); - int stack = cp->func->x87_stack; - - /* set_fpu_round_neg_inf( cp ); */ - - x87_fld(cp->func, st0); /* a a */ - x87_fprndint( cp->func ); /* int(a) a*/ - x87_fsubr(cp->func, st1, st0); /* int(a) frc(a) */ - x87_fxch(cp->func, st1); /* frc(a) int(a) */ - x87_f2xm1(cp->func); /* (2^frc(a))-1 int(a) */ - x87_fld1(cp->func); /* 1 (2^frc(a))-1 int(a) */ - x87_faddp(cp->func, st1); /* 2^frac(a) int(a) */ - x87_fscale(cp->func); /* (2^frac(a)*2^int(int(a))) int(a) */ - /* 2^a int(a) */ - x87_fstp(cp->func, st1); /* 2^a */ - - assert( stack == cp->func->x87_stack); - -} -#endif - -#if 0 -static void PIPE_CDECL print_reg( const char *msg, - const float *reg ) -{ - debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]); -} -#endif - -#if 0 -static void emit_print( struct aos_compilation *cp, - const char *message, /* must point to a static string! */ - unsigned file, - unsigned idx ) -{ - struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); - struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx ); - unsigned i; - - /* There shouldn't be anything on the x87 stack. Can add this - * capacity later if need be. - */ - assert(cp->func->x87_stack == 0); - - /* For absolute correctness, need to spill/invalidate all XMM regs - * too. We're obviously not concerned about performance on this - * debug path, so here goes: - */ - for (i = 0; i < 8; i++) { - if (cp->xmm[i].dirty) - spill(cp, i); - - aos_release_xmm_reg(cp, i); - } - - /* Push caller-save (ie scratch) regs. - */ - x86_cdecl_caller_push_regs( cp->func ); - - - /* Push the arguments: - */ - x86_lea( cp->func, ecx, arg ); - x86_push( cp->func, ecx ); - x86_push_imm32( cp->func, (int)message ); - - /* Call the helper. Could call debug_printf directly, but - * print_reg is a nice place to put a breakpoint if need be. - */ - x86_mov_reg_imm( cp->func, ecx, (int)print_reg ); - x86_call( cp->func, ecx ); - x86_pop( cp->func, ecx ); - x86_pop( cp->func, ecx ); - - /* Pop caller-save regs - */ - x86_cdecl_caller_pop_regs( cp->func ); - - /* Done... - */ -} -#endif - -/** - * The traditional instructions. All operate on internal registers - * and ignore write masks and swizzling issues. - */ - -static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); - struct x86_reg tmp = aos_get_xmm_reg(cp); - - sse_movaps(cp->func, tmp, arg0); - sse_mulps(cp->func, tmp, neg); - sse_maxps(cp->func, tmp, arg0); - - store_dest(cp, &op->Dst[0], tmp); - return TRUE; -} - -static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - sse_addps(cp->func, dst, arg1); - - store_dest(cp, &op->Dst[0], dst); - return TRUE; -} - -static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - x87_fld_src(cp, &op->Src[0], 0); - x87_fcos(cp->func); - x87_fstp_dest4(cp, &op->Dst[0]); - return TRUE; -} - -/* The dotproduct instructions don't really do that well in sse: - * XXX: produces wrong results -- disabled. - */ -static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg tmp = aos_get_xmm_reg(cp); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - sse_mulps(cp->func, dst, arg1); - /* Now the hard bit: sum the first 3 values: - */ - sse_movhlps(cp->func, tmp, dst); - sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */ - emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); - sse_addss(cp->func, dst, tmp); - - aos_release_xmm_reg(cp, tmp.idx); - store_scalar_dest(cp, &op->Dst[0], dst); - return TRUE; -} - -static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg tmp = aos_get_xmm_reg(cp); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - sse_mulps(cp->func, dst, arg1); - - /* Now the hard bit: sum the values: - */ - sse_movhlps(cp->func, tmp, dst); - sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */ - emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); - sse_addss(cp->func, dst, tmp); - - aos_release_xmm_reg(cp, tmp.idx); - store_scalar_dest(cp, &op->Dst[0], dst); - return TRUE; -} - -static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg tmp = aos_get_xmm_reg(cp); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - sse_mulps(cp->func, dst, arg1); - - /* Now the hard bit: sum the values (from DP3): - */ - sse_movhlps(cp->func, tmp, dst); - sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */ - emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); - sse_addss(cp->func, dst, tmp); - emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W)); - sse_addss(cp->func, dst, tmp); - - aos_release_xmm_reg(cp, tmp.idx); - store_scalar_dest(cp, &op->Dst[0], dst); - return TRUE; -} - -static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg dst = aos_get_xmm_reg(cp); - struct x86_reg tmp = aos_get_xmm_reg(cp); - struct x86_reg ones = aos_get_internal(cp, IMM_ONES); - -/* dst[0] = 1.0 * 1.0F; */ -/* dst[1] = arg0[1] * arg1[1]; */ -/* dst[2] = arg0[2] * 1.0; */ -/* dst[3] = 1.0 * arg1[3]; */ - - emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y)); - emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W)); - sse_mulps(cp->func, dst, tmp); - - aos_release_xmm_reg(cp, tmp.idx); - store_dest(cp, &op->Dst[0], dst); - return TRUE; -} - -static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - x87_fld1(cp->func); /* 1 */ - x87_fld_src(cp, &op->Src[0], 0); /* a0 1 */ - x87_fyl2x(cp->func); /* log2(a0) */ - x87_fstp_dest4(cp, &op->Dst[0]); - return TRUE; -} - -#if 0 -static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - x87_fld_src(cp, &op->Src[0], 0); - x87_emit_ex2(cp); - x87_fstp_dest4(cp, &op->Dst[0]); - return TRUE; -} -#endif - - -static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]); - unsigned writemask = op->Dst[0].Register.WriteMask; - int i; - - set_fpu_round_neg_inf( cp ); - - /* Load all sources first to avoid aliasing - */ - for (i = 3; i >= 0; i--) { - if (writemask & (1<Src[0], i); - } - } - - for (i = 0; i < 4; i++) { - if (writemask & (1<func ); - x87_fstp(cp->func, x86_make_disp(dst, i*4)); - } - } - - return TRUE; -} - - -static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]); - unsigned writemask = op->Dst[0].Register.WriteMask; - int i; - - set_fpu_round_nearest( cp ); - - /* Load all sources first to avoid aliasing - */ - for (i = 3; i >= 0; i--) { - if (writemask & (1<Src[0], i); - } - } - - for (i = 0; i < 4; i++) { - if (writemask & (1<func ); - x87_fstp(cp->func, x86_make_disp(dst, i*4)); - } - } - - return TRUE; -} - - -static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]); - struct x86_reg st0 = x86_make_reg(file_x87, 0); - struct x86_reg st1 = x86_make_reg(file_x87, 1); - unsigned writemask = op->Dst[0].Register.WriteMask; - int i; - - set_fpu_round_neg_inf( cp ); - - /* suck all the source values onto the stack before writing out any - * dst, which may alias... - */ - for (i = 3; i >= 0; i--) { - if (writemask & (1<Src[0], i); - } - } - - for (i = 0; i < 4; i++) { - if (writemask & (1<func, st0); /* a a */ - x87_fprndint( cp->func ); /* flr(a) a */ - x87_fsubp(cp->func, st1); /* frc(a) */ - x87_fstp(cp->func, x86_make_disp(dst, i*4)); - } - } - - return TRUE; -} - - - - - - -static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); - unsigned writemask = op->Dst[0].Register.WriteMask; - unsigned lit_count = cp->lit_count++; - struct x86_reg result, arg0; - unsigned i; - -#if 1 - /* For absolute correctness, need to spill/invalidate all XMM regs - * too. - */ - for (i = 0; i < 8; i++) { - if (cp->xmm[i].dirty) - spill(cp, i); - aos_release_xmm_reg(cp, i); - } -#endif - - if (writemask != TGSI_WRITEMASK_XYZW) - result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0])); - else - result = get_dst_ptr(cp, &op->Dst[0]); - - - arg0 = fetch_src( cp, &op->Src[0] ); - if (arg0.file == file_XMM) { - struct x86_reg tmp = x86_make_disp(cp->machine_EDX, - Offset(struct aos_machine, tmp[1])); - sse_movaps( cp->func, tmp, arg0 ); - arg0 = tmp; - } - - - - /* Push caller-save (ie scratch) regs. - */ - x86_cdecl_caller_push_regs( cp->func ); - - /* Push the arguments: - */ - x86_push_imm32( cp->func, lit_count ); - - x86_lea( cp->func, ecx, arg0 ); - x86_push( cp->func, ecx ); - - x86_lea( cp->func, ecx, result ); - x86_push( cp->func, ecx ); - - x86_push( cp->func, cp->machine_EDX ); - - if (lit_count < MAX_LIT_INFO) { - x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX, - Offset(struct aos_machine, lit_info) + - lit_count * sizeof(struct lit_info) + - Offset(struct lit_info, func))); - } - else { - x86_mov_reg_imm( cp->func, ecx, (int)aos_do_lit ); - } - - x86_call( cp->func, ecx ); - - x86_pop( cp->func, ecx ); /* fixme... */ - x86_pop( cp->func, ecx ); - x86_pop( cp->func, ecx ); - x86_pop( cp->func, ecx ); - - x86_cdecl_caller_pop_regs( cp->func ); - - if (writemask != TGSI_WRITEMASK_XYZW) { - store_dest( cp, - &op->Dst[0], - get_xmm_writable( cp, result ) ); - } - - return TRUE; -} - -#if 0 -static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]); - unsigned writemask = op->Dst[0].Register.WriteMask; - - if (writemask & TGSI_WRITEMASK_YZ) { - struct x86_reg st1 = x86_make_reg(file_x87, 1); - struct x86_reg st2 = x86_make_reg(file_x87, 2); - - /* a1' = a1 <= 0 ? 1 : a1; - */ - x87_fldz(cp->func); /* 1 0 */ -#if 1 - x87_fld1(cp->func); /* 1 0 */ -#else - /* Correct but slow due to fp exceptions generated in fyl2x - fix me. - */ - x87_fldz(cp->func); /* 1 0 */ -#endif - x87_fld_src(cp, &op->Src[0], 1); /* a1 1 0 */ - x87_fcomi(cp->func, st2); /* a1 1 0 */ - x87_fcmovb(cp->func, st1); /* a1' 1 0 */ - x87_fstp(cp->func, st1); /* a1' 0 */ - x87_fstp(cp->func, st1); /* a1' */ - - x87_fld_src(cp, &op->Src[0], 3); /* a3 a1' */ - x87_fxch(cp->func, st1); /* a1' a3 */ - - - /* Compute pow(a1, a3) - */ - x87_fyl2x(cp->func); /* a3*log2(a1) */ - x87_emit_ex2( cp ); /* 2^(a3*log2(a1)) */ - - - /* a0' = max2(a0, 0): - */ - x87_fldz(cp->func); /* 0 r2 */ - x87_fld_src(cp, &op->Src[0], 0); /* a0 0 r2 */ - x87_fcomi(cp->func, st1); - x87_fcmovb(cp->func, st1); /* a0' 0 r2 */ - - x87_fst_or_nop(cp->func, writemask, 1, dst); /* result[1] = a0' */ - - x87_fcomi(cp->func, st1); /* a0' 0 r2 */ - x87_fcmovnbe(cp->func, st2); /* r2' 0' r2 */ - - x87_fstp_or_pop(cp->func, writemask, 2, dst); /* 0 r2 */ - x87_fpop(cp->func); /* r2 */ - x87_fpop(cp->func); - } - - if (writemask & TGSI_WRITEMASK_XW) { - x87_fld1(cp->func); - x87_fst_or_nop(cp->func, writemask, 0, dst); - x87_fstp_or_pop(cp->func, writemask, 3, dst); - } - - return TRUE; -} -#endif - - - -static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - sse_maxps(cp->func, dst, arg1); - - store_dest(cp, &op->Dst[0], dst); - return TRUE; -} - - -static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - sse_minps(cp->func, dst, arg1); - - store_dest(cp, &op->Dst[0], dst); - return TRUE; -} - -static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - /* potentially nothing to do */ - - store_dest(cp, &op->Dst[0], dst); - return TRUE; -} - -static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - sse_mulps(cp->func, dst, arg1); - - store_dest(cp, &op->Dst[0], dst); - return TRUE; -} - - -static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg arg2 = fetch_src(cp, &op->Src[2]); - - /* If we can't clobber old contents of arg0, get a temporary & copy - * it there, then clobber it... - */ - arg0 = get_xmm_writable(cp, arg0); - - sse_mulps(cp->func, arg0, arg1); - sse_addps(cp->func, arg0, arg2); - store_dest(cp, &op->Dst[0], arg0); - return TRUE; -} - - - -/* A wrapper for powf(). - * Makes sure it is cdecl and operates on floats. - */ -static float PIPE_CDECL _powerf( float x, float y ) -{ -#if FAST_MATH - return util_fast_pow(x, y); -#else - return powf( x, y ); -#endif -} - -#if FAST_MATH -static float PIPE_CDECL _exp2(float x) -{ - return util_fast_exp2(x); -} -#endif - - -/* Really not sufficient -- need to check for conditions that could - * generate inf/nan values, which will slow things down hugely. - */ -static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ -#if 0 - x87_fld_src(cp, &op->Src[1], 0); /* a1.x */ - x87_fld_src(cp, &op->Src[0], 0); /* a0.x a1.x */ - x87_fyl2x(cp->func); /* a1*log2(a0) */ - - x87_emit_ex2( cp ); /* 2^(a1*log2(a0)) */ - - x87_fstp_dest4(cp, &op->Dst[0]); -#else - uint i; - - /* For absolute correctness, need to spill/invalidate all XMM regs - * too. - */ - for (i = 0; i < 8; i++) { - if (cp->xmm[i].dirty) - spill(cp, i); - aos_release_xmm_reg(cp, i); - } - - /* Push caller-save (ie scratch) regs. - */ - x86_cdecl_caller_push_regs( cp->func ); - - x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) ); - - x87_fld_src( cp, &op->Src[1], 0 ); - x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) ); - x87_fld_src( cp, &op->Src[0], 0 ); - x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) ); - - /* tmp_EAX has been pushed & will be restored below */ - x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf ); - x86_call( cp->func, cp->tmp_EAX ); - - x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) ); - - x86_cdecl_caller_pop_regs( cp->func ); - - /* Note retval on x87 stack: - */ - cp->func->x87_stack++; - - x87_fstp_dest4( cp, &op->Dst[0] ); -#endif - return TRUE; -} - - -#if FAST_MATH -static boolean emit_EXPBASE2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - uint i; - - /* For absolute correctness, need to spill/invalidate all XMM regs - * too. - */ - for (i = 0; i < 8; i++) { - if (cp->xmm[i].dirty) - spill(cp, i); - aos_release_xmm_reg(cp, i); - } - - /* Push caller-save (ie scratch) regs. - */ - x86_cdecl_caller_push_regs( cp->func ); - - x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -4) ); - - x87_fld_src( cp, &op->Src[0], 0 ); - x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) ); - - /* tmp_EAX has been pushed & will be restored below */ - x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _exp2 ); - x86_call( cp->func, cp->tmp_EAX ); - - x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 4) ); - - x86_cdecl_caller_pop_regs( cp->func ); - - /* Note retval on x87 stack: - */ - cp->func->x87_stack++; - - x87_fstp_dest4( cp, &op->Dst[0] ); - - return TRUE; -} -#endif - - -static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg dst = aos_get_xmm_reg(cp); - - if (cp->have_sse2) { - sse2_rcpss(cp->func, dst, arg0); - /* extend precision here... - */ - } - else { - struct x86_reg ones = aos_get_internal(cp, IMM_ONES); - sse_movss(cp->func, dst, ones); - sse_divss(cp->func, dst, arg0); - } - - store_scalar_dest(cp, &op->Dst[0], dst); - return TRUE; -} - - -/* Although rsqrtps() and rcpps() are low precision on some/all SSE - * implementations, it is possible to improve its precision at - * fairly low cost, using a newton/raphson step, as below: - * - * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a) - * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)] - * or: - * x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)] - * - * - * See: http://softwarecommunity.intel.com/articles/eng/1818.htm - */ -static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - if (0) { - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg r = aos_get_xmm_reg(cp); - sse_rsqrtss(cp->func, r, arg0); - store_scalar_dest(cp, &op->Dst[0], r); - return TRUE; - } - else { - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg r = aos_get_xmm_reg(cp); - - struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ ); - struct x86_reg one_point_five = x86_make_disp( neg_half, 4 ); - struct x86_reg src = get_xmm_writable( cp, arg0 ); - struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); - struct x86_reg tmp = aos_get_xmm_reg(cp); - - sse_movaps(cp->func, tmp, src); - sse_mulps(cp->func, tmp, neg); - sse_maxps(cp->func, tmp, src); - - sse_rsqrtss( cp->func, r, tmp ); /* rsqrtss(a) */ - sse_mulss( cp->func, tmp, neg_half ); /* -.5 * a */ - sse_mulss( cp->func, tmp, r ); /* -.5 * a * r */ - sse_mulss( cp->func, tmp, r ); /* -.5 * a * r * r */ - sse_addss( cp->func, tmp, one_point_five ); /* 1.5 - .5 * a * r * r */ - sse_mulss( cp->func, r, tmp ); /* r * (1.5 - .5 * a * r * r) */ - - store_scalar_dest(cp, &op->Dst[0], r); - - aos_release_xmm_reg(cp, tmp.idx); - - return TRUE; - } -} - - -static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg ones = aos_get_internal(cp, IMM_ONES); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - sse_cmpps(cp->func, dst, arg1, cc_NotLessThan); - sse_andps(cp->func, dst, ones); - - store_dest(cp, &op->Dst[0], dst); - return TRUE; -} - -static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - x87_fld_src(cp, &op->Src[0], 0); - x87_fsin(cp->func); - x87_fstp_dest4(cp, &op->Dst[0]); - return TRUE; -} - - - -static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg ones = aos_get_internal(cp, IMM_ONES); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - sse_cmpps(cp->func, dst, arg1, cc_LessThan); - sse_andps(cp->func, dst, ones); - - store_dest(cp, &op->Dst[0], dst); - return TRUE; -} - -static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - sse_subps(cp->func, dst, arg1); - - store_dest(cp, &op->Dst[0], dst); - return TRUE; -} - -static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg tmp0 = aos_get_xmm_reg(cp); - - sse2_cvttps2dq(cp->func, tmp0, arg0); - sse2_cvtdq2ps(cp->func, tmp0, tmp0); - - store_dest(cp, &op->Dst[0], tmp0); - return TRUE; -} - -static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg tmp0 = aos_get_xmm_reg(cp); - struct x86_reg tmp1 = aos_get_xmm_reg(cp); - - emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W)); - sse_mulps(cp->func, tmp1, arg0); - emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W)); - sse_mulps(cp->func, tmp0, arg1); - sse_subps(cp->func, tmp1, tmp0); - sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W)); - -/* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */ -/* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */ -/* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */ -/* dst[3] is undef */ - - - aos_release_xmm_reg(cp, tmp0.idx); - store_dest(cp, &op->Dst[0], tmp1); - return TRUE; -} - - - -static boolean -emit_instruction( struct aos_compilation *cp, - struct tgsi_full_instruction *inst ) -{ - x87_assert_stack_empty(cp->func); - - switch( inst->Instruction.Opcode ) { - case TGSI_OPCODE_MOV: - return emit_MOV( cp, inst ); - - case TGSI_OPCODE_LIT: - return emit_LIT(cp, inst); - - case TGSI_OPCODE_RCP: - return emit_RCP(cp, inst); - - case TGSI_OPCODE_RSQ: - return emit_RSQ(cp, inst); - - case TGSI_OPCODE_EXP: - /*return emit_EXP(cp, inst);*/ - return FALSE; - - case TGSI_OPCODE_LOG: - /*return emit_LOG(cp, inst);*/ - return FALSE; - - case TGSI_OPCODE_MUL: - return emit_MUL(cp, inst); - - case TGSI_OPCODE_ADD: - return emit_ADD(cp, inst); - - case TGSI_OPCODE_DP3: - return emit_DP3(cp, inst); - - case TGSI_OPCODE_DP4: - return emit_DP4(cp, inst); - - case TGSI_OPCODE_DST: - return emit_DST(cp, inst); - - case TGSI_OPCODE_MIN: - return emit_MIN(cp, inst); - - case TGSI_OPCODE_MAX: - return emit_MAX(cp, inst); - - case TGSI_OPCODE_SLT: - return emit_SLT(cp, inst); - - case TGSI_OPCODE_SGE: - return emit_SGE(cp, inst); - - case TGSI_OPCODE_MAD: - return emit_MAD(cp, inst); - - case TGSI_OPCODE_SUB: - return emit_SUB(cp, inst); - - case TGSI_OPCODE_LRP: - /*return emit_LERP(cp, inst);*/ - return FALSE; - - case TGSI_OPCODE_FRC: - return emit_FRC(cp, inst); - - case TGSI_OPCODE_CLAMP: - /*return emit_CLAMP(cp, inst);*/ - return FALSE; - - case TGSI_OPCODE_FLR: - return emit_FLR(cp, inst); - - case TGSI_OPCODE_ROUND: - return emit_RND(cp, inst); - - case TGSI_OPCODE_EX2: -#if FAST_MATH - return emit_EXPBASE2(cp, inst); -#elif 0 - /* this seems to fail for "larger" exponents. - * See glean tvertProg1's EX2 test. - */ - return emit_EX2(cp, inst); -#else - return FALSE; -#endif - - case TGSI_OPCODE_LG2: - return emit_LG2(cp, inst); - - case TGSI_OPCODE_POW: - return emit_POW(cp, inst); - - case TGSI_OPCODE_XPD: - return emit_XPD(cp, inst); - - case TGSI_OPCODE_ABS: - return emit_ABS(cp, inst); - - case TGSI_OPCODE_DPH: - return emit_DPH(cp, inst); - - case TGSI_OPCODE_COS: - return emit_COS(cp, inst); - - case TGSI_OPCODE_SIN: - return emit_SIN(cp, inst); - - case TGSI_OPCODE_TRUNC: - return emit_TRUNC(cp, inst); - - case TGSI_OPCODE_END: - return TRUE; - - default: - return FALSE; - } -} - - -static boolean emit_viewport( struct aos_compilation *cp ) -{ - struct x86_reg pos = aos_get_shader_reg_xmm(cp, - TGSI_FILE_OUTPUT, - cp->vaos->draw->vs.position_output ); - - struct x86_reg scale = x86_make_disp(cp->machine_EDX, - Offset(struct aos_machine, scale)); - - struct x86_reg translate = x86_make_disp(cp->machine_EDX, - Offset(struct aos_machine, translate)); - - sse_mulps(cp->func, pos, scale); - sse_addps(cp->func, pos, translate); - - aos_adopt_xmm_reg( cp, - pos, - TGSI_FILE_OUTPUT, - cp->vaos->draw->vs.position_output, - TRUE ); - return TRUE; -} - - -/* This is useful to be able to see the results on softpipe. Doesn't - * do proper clipping, just assumes the backend can do it during - * rasterization -- for debug only... - */ -static boolean emit_rhw_viewport( struct aos_compilation *cp ) -{ - struct x86_reg tmp = aos_get_xmm_reg(cp); - struct x86_reg pos = aos_get_shader_reg_xmm(cp, - TGSI_FILE_OUTPUT, - cp->vaos->draw->vs.position_output); - - struct x86_reg scale = x86_make_disp(cp->machine_EDX, - Offset(struct aos_machine, scale)); - - struct x86_reg translate = x86_make_disp(cp->machine_EDX, - Offset(struct aos_machine, translate)); - - - - emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W)); - sse2_rcpss(cp->func, tmp, tmp); - sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X)); - - sse_mulps(cp->func, pos, scale); - sse_mulps(cp->func, pos, tmp); - sse_addps(cp->func, pos, translate); - - /* Set pos[3] = w - */ - mask_write(cp, pos, tmp, TGSI_WRITEMASK_W); - - aos_adopt_xmm_reg( cp, - pos, - TGSI_FILE_OUTPUT, - cp->vaos->draw->vs.position_output, - TRUE ); - return TRUE; -} - - -#if 0 -static boolean note_immediate( struct aos_compilation *cp, - struct tgsi_full_immediate *imm ) -{ - unsigned pos = cp->num_immediates++; - unsigned j; - - assert( imm->Immediate.NrTokens <= 4 + 1 ); - for (j = 0; j < imm->Immediate.NrTokens - 1; j++) { - cp->vaos->machine->immediate[pos][j] = imm->u[j].Float; - } - - return TRUE; -} -#endif - - - - -static void find_last_write_outputs( struct aos_compilation *cp ) -{ - struct tgsi_parse_context parse; - unsigned this_instruction = 0; - unsigned i; - - tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens ); - - while (!tgsi_parse_end_of_tokens( &parse )) { - - tgsi_parse_token( &parse ); - - if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION) - continue; - - for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) { - if (parse.FullToken.FullInstruction.Dst[i].Register.File == - TGSI_FILE_OUTPUT) - { - unsigned idx = parse.FullToken.FullInstruction.Dst[i].Register.Index; - cp->output_last_write[idx] = this_instruction; - } - } - - this_instruction++; - } - - tgsi_parse_free( &parse ); -} - - -#define ARG_MACHINE 1 -#define ARG_START_ELTS 2 -#define ARG_COUNT 3 -#define ARG_OUTBUF 4 - - -static boolean build_vertex_program( struct draw_vs_variant_aos_sse *variant, - boolean linear ) -{ - struct tgsi_parse_context parse; - struct aos_compilation cp; - unsigned fixup, label; - - util_init_math(); - - tgsi_parse_init( &parse, variant->base.vs->state.tokens ); - - memset(&cp, 0, sizeof(cp)); - - cp.insn_counter = 1; - cp.vaos = variant; - cp.have_sse2 = 1; - cp.func = &variant->func[ linear ? 0 : 1 ]; - - cp.tmp_EAX = x86_make_reg(file_REG32, reg_AX); - cp.idx_EBX = x86_make_reg(file_REG32, reg_BX); - cp.outbuf_ECX = x86_make_reg(file_REG32, reg_CX); - cp.machine_EDX = x86_make_reg(file_REG32, reg_DX); - cp.count_ESI = x86_make_reg(file_REG32, reg_SI); - cp.temp_EBP = x86_make_reg(file_REG32, reg_BP); - cp.stack_ESP = x86_make_reg( file_REG32, reg_SP ); - - x86_init_func(cp.func); - - find_last_write_outputs(&cp); - - x86_push(cp.func, cp.idx_EBX); - x86_push(cp.func, cp.count_ESI); - x86_push(cp.func, cp.temp_EBP); - - - /* Load arguments into regs: - */ - x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE)); - x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS)); - x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT)); - x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF)); - - - /* Compare count to zero and possibly bail. - */ - x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX); - x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX); - fixup = x86_jcc_forward(cp.func, cc_E); - - - save_fpu_state( &cp ); - set_fpu_round_nearest( &cp ); - - aos_init_inputs( &cp, linear ); - - cp.x86_reg[0] = 0; - cp.x86_reg[1] = 0; - - /* Note address for loop jump - */ - label = x86_get_label(cp.func); - { - /* Fetch inputs... TODO: fetch lazily... - */ - if (!aos_fetch_inputs( &cp, linear )) - goto fail; - - /* Emit the shader: - */ - while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error ) - { - tgsi_parse_token( &parse ); - - switch (parse.FullToken.Token.Type) { - case TGSI_TOKEN_TYPE_IMMEDIATE: -#if 0 - if (!note_immediate( &cp, &parse.FullToken.FullImmediate )) - goto fail; -#endif - break; - - case TGSI_TOKEN_TYPE_INSTRUCTION: - if (DISASSEM) - tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter ); - - if (!emit_instruction( &cp, &parse.FullToken.FullInstruction )) - goto fail; - break; - } - - x87_assert_stack_empty(cp.func); - cp.insn_counter++; - - if (DISASSEM) - debug_printf("\n"); - } - - - { - unsigned i; - for (i = 0; i < 8; i++) { - if (cp.xmm[i].file != TGSI_FILE_OUTPUT) { - cp.xmm[i].file = TGSI_FILE_NULL; - cp.xmm[i].dirty = 0; - } - } - } - - if (cp.error) - goto fail; - - if (cp.vaos->base.key.clip) { - /* not really handling clipping, just do the rhw so we can - * see the results... - */ - emit_rhw_viewport(&cp); - } - else if (cp.vaos->base.key.viewport) { - emit_viewport(&cp); - } - - /* Emit output... TODO: do this eagerly after the last write to a - * given output. - */ - if (!aos_emit_outputs( &cp )) - goto fail; - - - /* Next vertex: - */ - x86_lea(cp.func, - cp.outbuf_ECX, - x86_make_disp(cp.outbuf_ECX, - cp.vaos->base.key.output_stride)); - - /* Incr index - */ - aos_incr_inputs( &cp, linear ); - } - /* decr count, loop if not zero - */ - x86_dec(cp.func, cp.count_ESI); - x86_jcc(cp.func, cc_NZ, label); - - restore_fpu_state(&cp); - - /* Land forward jump here: - */ - x86_fixup_fwd_jump(cp.func, fixup); - - /* Exit mmx state? - */ - if (cp.func->need_emms) - mmx_emms(cp.func); - - x86_pop(cp.func, cp.temp_EBP); - x86_pop(cp.func, cp.count_ESI); - x86_pop(cp.func, cp.idx_EBX); - - x87_assert_stack_empty(cp.func); - x86_ret(cp.func); - - tgsi_parse_free( &parse ); - return !cp.error; - - fail: - tgsi_parse_free( &parse ); - return FALSE; -} - - -/** cast wrapper */ -static INLINE struct draw_vs_variant_aos_sse * -draw_vs_variant_aos_sse(struct draw_vs_variant *variant) -{ - return (struct draw_vs_variant_aos_sse *) variant; -} - - -static void vaos_set_buffer( struct draw_vs_variant *variant, - unsigned buf, - const void *ptr, - unsigned stride, - unsigned max_stride) -{ - struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant); - - if (buf < vaos->nr_vb) { - vaos->buffer[buf].base_ptr = (char *)ptr; - vaos->buffer[buf].stride = stride; - } - - if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride); -} - - - -static void PIPE_CDECL vaos_run_elts( struct draw_vs_variant *variant, - const unsigned *elts, - unsigned count, - void *output_buffer ) -{ - struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant); - struct aos_machine *machine = vaos->draw->vs.aos_machine; - unsigned i; - - if (0) debug_printf("%s %d\n", __FUNCTION__, count); - - machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; - for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { - machine->constants[i] = vaos->draw->vs.aligned_constants[i]; - } - machine->immediates = vaos->base.vs->immediates; - machine->buffer = vaos->buffer; - - vaos->gen_run_elts( machine, - elts, - count, - output_buffer ); -} - -static void PIPE_CDECL vaos_run_linear( struct draw_vs_variant *variant, - unsigned start, - unsigned count, - void *output_buffer ) -{ - struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant); - struct aos_machine *machine = vaos->draw->vs.aos_machine; - unsigned i; - - if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count, - vaos->base.key.const_vbuffers); - - machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; - for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { - machine->constants[i] = vaos->draw->vs.aligned_constants[i]; - } - machine->immediates = vaos->base.vs->immediates; - machine->buffer = vaos->buffer; - - vaos->gen_run_linear( machine, - start, - count, - output_buffer ); - - /* Sanity spot checks to make sure we didn't trash our constants */ - assert(machine->internal[IMM_ONES][0] == 1.0f); - assert(machine->internal[IMM_IDENTITY][0] == 0.0f); - assert(machine->internal[IMM_NEGS][0] == -1.0f); -} - - - -static void vaos_destroy( struct draw_vs_variant *variant ) -{ - struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant); - - FREE( vaos->buffer ); - - x86_release_func( &vaos->func[0] ); - x86_release_func( &vaos->func[1] ); - - FREE(vaos); -} - - - -static struct draw_vs_variant *variant_aos_sse( struct draw_vertex_shader *vs, - const struct draw_vs_variant_key *key ) -{ - unsigned i; - struct draw_vs_variant_aos_sse *vaos = CALLOC_STRUCT(draw_vs_variant_aos_sse); - - if (!vaos) - goto fail; - - vaos->base.key = *key; - vaos->base.vs = vs; - vaos->base.set_buffer = vaos_set_buffer; - vaos->base.destroy = vaos_destroy; - vaos->base.run_linear = vaos_run_linear; - vaos->base.run_elts = vaos_run_elts; - - vaos->draw = vs->draw; - - for (i = 0; i < key->nr_inputs; i++) - vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 ); - - vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) ); - if (!vaos->buffer) - goto fail; - - if (0) - debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers); - -#if 0 - tgsi_dump(vs->state.tokens, 0); -#endif - - if (!build_vertex_program( vaos, TRUE )) - goto fail; - - if (!build_vertex_program( vaos, FALSE )) - goto fail; - - vaos->gen_run_linear = (vaos_run_linear_func)x86_get_func(&vaos->func[0]); - if (!vaos->gen_run_linear) - goto fail; - - vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]); - if (!vaos->gen_run_elts) - goto fail; - - return &vaos->base; - - fail: - if (vaos && vaos->buffer) - FREE(vaos->buffer); - - if (vaos) - x86_release_func( &vaos->func[0] ); - - if (vaos) - x86_release_func( &vaos->func[1] ); - - FREE(vaos); - - return NULL; -} - - -struct draw_vs_variant * -draw_vs_create_variant_aos_sse( struct draw_vertex_shader *vs, - const struct draw_vs_variant_key *key ) -{ - struct draw_vs_variant *variant = variant_aos_sse( vs, key ); - - if (variant == NULL) { - variant = draw_vs_create_variant_generic( vs, key ); - } - - return variant; -} - - - -#endif /* PIPE_ARCH_X86 */ diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h deleted file mode 100644 index 55e63d8..0000000 --- a/src/gallium/auxiliary/draw/draw_vs_aos.h +++ /dev/null @@ -1,255 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -/* Authors: Keith Whitwell - */ - -#ifndef DRAW_VS_AOS_H -#define DRAW_VS_AOS_H - -#include "pipe/p_config.h" -#include "tgsi/tgsi_exec.h" -#include "draw_vs.h" - -#ifdef PIPE_ARCH_X86 - -struct tgsi_token; -struct x86_function; - -#include "pipe/p_state.h" -#include "rtasm/rtasm_x86sse.h" - - - - - -#define X 0 -#define Y 1 -#define Z 2 -#define W 3 - -#define MAX_INPUTS PIPE_MAX_ATTRIBS -#define MAX_OUTPUTS PIPE_MAX_SHADER_OUTPUTS -#define MAX_TEMPS TGSI_EXEC_NUM_TEMPS -#define MAX_CONSTANTS 1024 /** only used for sanity checking */ -#define MAX_IMMEDIATES 1024 /** only used for sanity checking */ -#define MAX_INTERNALS 8 /** see IMM_x values below */ - -#define AOS_FILE_INTERNAL TGSI_FILE_COUNT - -#define FPU_RND_NEG 1 -#define FPU_RND_NEAREST 2 - -struct aos_machine; -typedef void (PIPE_CDECL *lit_func)( struct aos_machine *, - float *result, - const float *in, - unsigned count ); - -void PIPE_CDECL aos_do_lit( struct aos_machine *machine, - float *result, - const float *in, - unsigned count ); - -struct shine_tab { - float exponent; - float values[258]; - unsigned last_used; -}; - -struct lit_info { - lit_func func; - struct shine_tab *shine_tab; -}; - -#define MAX_SHINE_TAB 4 -#define MAX_LIT_INFO 16 - -struct aos_buffer { - const void *base_ptr; - unsigned stride; - void *ptr; /* updated per vertex */ -}; - - - - -/* This is the temporary storage used by all the aos_sse vs variants. - * Create one per context and reuse by passing a pointer in at - * vs_variant creation?? - */ -struct aos_machine { - float input [MAX_INPUTS ][4]; - float output [MAX_OUTPUTS ][4]; - float temp [MAX_TEMPS ][4]; - float internal [MAX_INTERNALS ][4]; - - float scale[4]; /* viewport */ - float translate[4]; /* viewport */ - - float tmp[2][4]; /* scratch space for LIT */ - - struct shine_tab shine_tab[MAX_SHINE_TAB]; - struct lit_info lit_info[MAX_LIT_INFO]; - unsigned now; - - - ushort fpu_rnd_nearest; - ushort fpu_rnd_neg_inf; - ushort fpu_restore; - ushort fpucntl; /* one of FPU_* above */ - - const float (*immediates)[4]; /* points to shader data */ - const void *constants[PIPE_MAX_CONSTANT_BUFFERS]; /* points to draw data */ - - const struct aos_buffer *buffer; /* points to ? */ -}; - - - - -struct aos_compilation { - struct x86_function *func; - struct draw_vs_variant_aos_sse *vaos; - - unsigned insn_counter; - unsigned num_immediates; - unsigned count; - unsigned lit_count; - - struct { - unsigned idx:16; - unsigned file:8; - unsigned dirty:8; - unsigned last_used; - } xmm[8]; - - unsigned x86_reg[2]; /* one of X86_* */ - - boolean input_fetched[PIPE_MAX_ATTRIBS]; - unsigned output_last_write[PIPE_MAX_ATTRIBS]; - - boolean have_sse2; - boolean error; - short fpucntl; - - /* these are actually known values, but putting them in a struct - * like this is helpful to keep them in sync across the file. - */ - struct x86_reg tmp_EAX; - struct x86_reg idx_EBX; /* either start+i or &elt[i] */ - struct x86_reg outbuf_ECX; - struct x86_reg machine_EDX; - struct x86_reg count_ESI; /* decrements to zero */ - struct x86_reg temp_EBP; - struct x86_reg stack_ESP; -}; - -struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp ); -void aos_release_xmm_reg( struct aos_compilation *cp, unsigned idx ); - -void aos_adopt_xmm_reg( struct aos_compilation *cp, - struct x86_reg reg, - unsigned file, - unsigned idx, - unsigned dirty ); - -void aos_spill_all( struct aos_compilation *cp ); - -struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, - unsigned file, - unsigned idx ); - -boolean aos_init_inputs( struct aos_compilation *cp, boolean linear ); -boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear ); -boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear ); - -boolean aos_emit_outputs( struct aos_compilation *cp ); - - -#define IMM_ONES 0 /* 1, 1,1,1 */ -#define IMM_SWZ 1 /* 1,-1,0, 0xffffffff */ -#define IMM_IDENTITY 2 /* 0, 0,0,1 */ -#define IMM_INV_255 3 /* 1/255, 1/255, 1/255, 1/255 */ -#define IMM_255 4 /* 255, 255, 255, 255 */ -#define IMM_NEGS 5 /* -1,-1,-1,-1 */ -#define IMM_RSQ 6 /* -.5,1.5,_,_ */ -#define IMM_PSIZE 7 /* not really an immediate - updated each run */ - -struct x86_reg aos_get_internal( struct aos_compilation *cp, - unsigned imm ); -struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp, - unsigned imm ); - - -#define AOS_ERROR(cp, msg) \ -do { \ - if (0) debug_printf("%s: x86 translation failed: %s\n", __FUNCTION__, msg); \ - cp->error = 1; \ -} while (0) - - -#define X86_NULL 0 -#define X86_IMMEDIATES 1 -#define X86_CONSTANTS 2 -#define X86_BUFFERS 3 - -struct x86_reg aos_get_x86( struct aos_compilation *cp, - unsigned which_reg, - unsigned value ); - - -typedef void (PIPE_CDECL *vaos_run_elts_func)( struct aos_machine *, - const unsigned *elts, - unsigned count, - void *output_buffer); - -typedef void (PIPE_CDECL *vaos_run_linear_func)( struct aos_machine *, - unsigned start, - unsigned count, - void *output_buffer); - - -struct draw_vs_variant_aos_sse { - struct draw_vs_variant base; - struct draw_context *draw; - - struct aos_buffer *buffer; - unsigned nr_vb; - - vaos_run_linear_func gen_run_linear; - vaos_run_elts_func gen_run_elts; - - - struct x86_function func[2]; -}; - - -#endif - -#endif - diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c deleted file mode 100644 index f1dd448..0000000 --- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c +++ /dev/null @@ -1,460 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - -#include "util/u_memory.h" -#include "pipe/p_shader_tokens.h" -#include "tgsi/tgsi_parse.h" -#include "tgsi/tgsi_util.h" -#include "tgsi/tgsi_exec.h" -#include "draw_vs.h" -#include "draw_vs_aos.h" -#include "draw_vertex.h" - -#include "rtasm/rtasm_x86sse.h" - -#ifdef PIPE_ARCH_X86 - -/* Note - don't yet have to worry about interacting with the code in - * draw_vs_aos.c as there is no intermingling of generated code... - * That may have to change, we'll see. - */ -static void emit_load_R32G32B32A32( struct aos_compilation *cp, - struct x86_reg data, - struct x86_reg src_ptr ) -{ - sse_movups(cp->func, data, src_ptr); -} - -static void emit_load_R32G32B32( struct aos_compilation *cp, - struct x86_reg data, - struct x86_reg src_ptr ) -{ -#if 1 - sse_movss(cp->func, data, x86_make_disp(src_ptr, 8)); - /* data = z ? ? ? */ - sse_shufps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) ); - /* data = z ? 0 1 */ - sse_shufps(cp->func, data, data, SHUF(Y,Z,X,W) ); - /* data = ? 0 z 1 */ - sse_movlps(cp->func, data, src_ptr); - /* data = x y z 1 */ -#else - sse_movups(cp->func, data, src_ptr); - /* data = x y z ? */ - sse2_pshufd(cp->func, data, data, SHUF(W,X,Y,Z) ); - /* data = ? x y z */ - sse_movss(cp->func, data, aos_get_internal_xmm( cp, IMM_ONES ) ); - /* data = 1 x y z */ - sse2_pshufd(cp->func, data, data, SHUF(Y,Z,W,X) ); - /* data = x y z 1 */ -#endif -} - -static void emit_load_R32G32( struct aos_compilation *cp, - struct x86_reg data, - struct x86_reg src_ptr ) -{ - sse_movups(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ) ); - sse_movlps(cp->func, data, src_ptr); -} - - -static void emit_load_R32( struct aos_compilation *cp, - struct x86_reg data, - struct x86_reg src_ptr ) -{ - sse_movss(cp->func, data, src_ptr); - sse_orps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ) ); -} - - -static void emit_load_R8G8B8A8_UNORM( struct aos_compilation *cp, - struct x86_reg data, - struct x86_reg src_ptr ) -{ - sse_movss(cp->func, data, src_ptr); - sse2_punpcklbw(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY )); - sse2_punpcklbw(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY )); - sse2_cvtdq2ps(cp->func, data, data); - sse_mulps(cp->func, data, aos_get_internal(cp, IMM_INV_255)); -} - - - -/* Extended swizzles? Maybe later. - */ -static void emit_swizzle( struct aos_compilation *cp, - struct x86_reg dest, - struct x86_reg src, - ubyte shuffle ) -{ - sse_shufps(cp->func, dest, src, shuffle); -} - - - -static boolean get_buffer_ptr( struct aos_compilation *cp, - boolean linear, - unsigned buf_idx, - struct x86_reg elt, - struct x86_reg ptr) -{ - struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), - buf_idx * sizeof(struct aos_buffer)); - - struct x86_reg buf_stride = x86_make_disp(buf, - Offset(struct aos_buffer, stride)); - if (linear) { - struct x86_reg buf_ptr = x86_make_disp(buf, - Offset(struct aos_buffer, ptr)); - - - /* Calculate pointer to current attrib: - */ - x86_mov(cp->func, ptr, buf_ptr); - x86_mov(cp->func, elt, buf_stride); - x86_add(cp->func, elt, ptr); - if (buf_idx == 0) sse_prefetchnta(cp->func, x86_make_disp(elt, 192)); - x86_mov(cp->func, buf_ptr, elt); - } - else { - struct x86_reg buf_base_ptr = x86_make_disp(buf, - Offset(struct aos_buffer, base_ptr)); - - - /* Calculate pointer to current attrib: - */ - x86_mov(cp->func, ptr, buf_stride); - x86_imul(cp->func, ptr, elt); - x86_add(cp->func, ptr, buf_base_ptr); - } - - cp->insn_counter++; - - return TRUE; -} - - -static boolean load_input( struct aos_compilation *cp, - unsigned idx, - struct x86_reg bufptr ) -{ - unsigned format = cp->vaos->base.key.element[idx].in.format; - unsigned offset = cp->vaos->base.key.element[idx].in.offset; - struct x86_reg dataXMM = aos_get_xmm_reg(cp); - - /* Figure out source pointer address: - */ - struct x86_reg src = x86_make_disp(bufptr, offset); - - aos_adopt_xmm_reg( cp, - dataXMM, - TGSI_FILE_INPUT, - idx, - TRUE ); - - switch (format) { - case PIPE_FORMAT_R32_FLOAT: - emit_load_R32(cp, dataXMM, src); - break; - case PIPE_FORMAT_R32G32_FLOAT: - emit_load_R32G32(cp, dataXMM, src); - break; - case PIPE_FORMAT_R32G32B32_FLOAT: - emit_load_R32G32B32(cp, dataXMM, src); - break; - case PIPE_FORMAT_R32G32B32A32_FLOAT: - emit_load_R32G32B32A32(cp, dataXMM, src); - break; - case PIPE_FORMAT_A8R8G8B8_UNORM: - emit_load_R8G8B8A8_UNORM(cp, dataXMM, src); - emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W)); - break; - case PIPE_FORMAT_R8G8B8A8_UNORM: - emit_load_R8G8B8A8_UNORM(cp, dataXMM, src); - break; - default: - AOS_ERROR(cp, "unhandled input format"); - return FALSE; - } - - return TRUE; -} - -static boolean load_inputs( struct aos_compilation *cp, - unsigned buffer, - struct x86_reg ptr ) -{ - unsigned i; - - for (i = 0; i < cp->vaos->base.key.nr_inputs; i++) { - if (cp->vaos->base.key.element[i].in.buffer == buffer) { - - if (!load_input( cp, i, ptr )) - return FALSE; - - cp->insn_counter++; - } - } - - return TRUE; -} - -boolean aos_init_inputs( struct aos_compilation *cp, boolean linear ) -{ - unsigned i; - for (i = 0; i < cp->vaos->nr_vb; i++) { - struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), - i * sizeof(struct aos_buffer)); - - struct x86_reg buf_base_ptr = x86_make_disp(buf, - Offset(struct aos_buffer, base_ptr)); - - if (cp->vaos->base.key.const_vbuffers & (1<tmp_EAX; - - x86_mov(cp->func, ptr, buf_base_ptr); - - /* Load all inputs for this constant vertex buffer - */ - load_inputs( cp, i, x86_deref(ptr) ); - - /* Then just force them out to aos_machine.input[] - */ - aos_spill_all( cp ); - - } - else if (linear) { - - struct x86_reg elt = cp->idx_EBX; - struct x86_reg ptr = cp->tmp_EAX; - - struct x86_reg buf_stride = x86_make_disp(buf, - Offset(struct aos_buffer, stride)); - - struct x86_reg buf_ptr = x86_make_disp(buf, - Offset(struct aos_buffer, ptr)); - - - /* Calculate pointer to current attrib: - */ - x86_mov(cp->func, ptr, buf_stride); - x86_imul(cp->func, ptr, elt); - x86_add(cp->func, ptr, buf_base_ptr); - - - /* In the linear case, keep the buffer pointer instead of the - * index number. - */ - if (cp->vaos->nr_vb == 1) - x86_mov( cp->func, elt, ptr ); - else - x86_mov( cp->func, buf_ptr, ptr ); - - cp->insn_counter++; - } - } - - return TRUE; -} - -boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear ) -{ - unsigned j; - - for (j = 0; j < cp->vaos->nr_vb; j++) { - if (cp->vaos->base.key.const_vbuffers & (1<vaos->nr_vb == 1) { - load_inputs( cp, 0, cp->idx_EBX ); - } - else { - struct x86_reg elt = linear ? cp->idx_EBX : x86_deref(cp->idx_EBX); - struct x86_reg ptr = cp->tmp_EAX; - - if (!get_buffer_ptr( cp, linear, j, elt, ptr )) - return FALSE; - - if (!load_inputs( cp, j, ptr )) - return FALSE; - } - } - - return TRUE; -} - -boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear ) -{ - if (linear && cp->vaos->nr_vb == 1) { - struct x86_reg stride = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), - (0 * sizeof(struct aos_buffer) + - Offset(struct aos_buffer, stride))); - - x86_add(cp->func, cp->idx_EBX, stride); - sse_prefetchnta(cp->func, x86_make_disp(cp->idx_EBX, 192)); - } - else if (linear) { - /* Nothing to do */ - } - else { - x86_lea(cp->func, cp->idx_EBX, x86_make_disp(cp->idx_EBX, 4)); - } - - return TRUE; -} - - - - - - -static void emit_store_R32G32B32A32( struct aos_compilation *cp, - struct x86_reg dst_ptr, - struct x86_reg dataXMM ) -{ - sse_movups(cp->func, dst_ptr, dataXMM); -} - -static void emit_store_R32G32B32( struct aos_compilation *cp, - struct x86_reg dst_ptr, - struct x86_reg dataXMM ) -{ - sse_movlps(cp->func, dst_ptr, dataXMM); - sse_shufps(cp->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */ - sse_movss(cp->func, x86_make_disp(dst_ptr,8), dataXMM); -} - -static void emit_store_R32G32( struct aos_compilation *cp, - struct x86_reg dst_ptr, - struct x86_reg dataXMM ) -{ - sse_movlps(cp->func, dst_ptr, dataXMM); -} - -static void emit_store_R32( struct aos_compilation *cp, - struct x86_reg dst_ptr, - struct x86_reg dataXMM ) -{ - sse_movss(cp->func, dst_ptr, dataXMM); -} - - - -static void emit_store_R8G8B8A8_UNORM( struct aos_compilation *cp, - struct x86_reg dst_ptr, - struct x86_reg dataXMM ) -{ - sse_mulps(cp->func, dataXMM, aos_get_internal(cp, IMM_255)); - sse2_cvtps2dq(cp->func, dataXMM, dataXMM); - sse2_packssdw(cp->func, dataXMM, dataXMM); - sse2_packuswb(cp->func, dataXMM, dataXMM); - sse_movss(cp->func, dst_ptr, dataXMM); -} - - - - - -static boolean emit_output( struct aos_compilation *cp, - struct x86_reg ptr, - struct x86_reg dataXMM, - enum attrib_emit format ) -{ - switch (format) { - case EMIT_1F: - case EMIT_1F_PSIZE: - emit_store_R32(cp, ptr, dataXMM); - break; - case EMIT_2F: - emit_store_R32G32(cp, ptr, dataXMM); - break; - case EMIT_3F: - emit_store_R32G32B32(cp, ptr, dataXMM); - break; - case EMIT_4F: - emit_store_R32G32B32A32(cp, ptr, dataXMM); - break; - case EMIT_4UB: - emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM); - break; - case EMIT_4UB_BGRA: - emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W)); - emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM); - break; - default: - AOS_ERROR(cp, "unhandled output format"); - return FALSE; - } - - return TRUE; -} - - - -boolean aos_emit_outputs( struct aos_compilation *cp ) -{ - unsigned i; - - for (i = 0; i < cp->vaos->base.key.nr_outputs; i++) { - enum attrib_emit format = cp->vaos->base.key.element[i].out.format; - unsigned offset = cp->vaos->base.key.element[i].out.offset; - unsigned vs_output = cp->vaos->base.key.element[i].out.vs_output; - - struct x86_reg data; - - if (format == EMIT_1F_PSIZE) { - data = aos_get_internal_xmm( cp, IMM_PSIZE ); - } - else { - data = aos_get_shader_reg( cp, - TGSI_FILE_OUTPUT, - vs_output ); - } - - if (data.file != file_XMM) { - struct x86_reg tmp = aos_get_xmm_reg( cp ); - sse_movaps(cp->func, tmp, data); - data = tmp; - } - - if (!emit_output( cp, - x86_make_disp( cp->outbuf_ECX, offset ), - data, - format )) - return FALSE; - - aos_release_xmm_reg( cp, data.idx ); - - cp->insn_counter++; - } - - return TRUE; -} - -#endif diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c b/src/gallium/auxiliary/draw/draw_vs_aos_machine.c deleted file mode 100644 index 0eda414..0000000 --- a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c +++ /dev/null @@ -1,328 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - -#include "pipe/p_config.h" - - -#include "pipe/p_shader_tokens.h" -#include "util/u_math.h" -#include "util/u_memory.h" -#include "tgsi/tgsi_parse.h" -#include "tgsi/tgsi_util.h" -#include "tgsi/tgsi_exec.h" -#include "draw_vs.h" -#include "draw_vs_aos.h" -#include "draw_vertex.h" - -#ifdef PIPE_ARCH_X86 - -#include "rtasm/rtasm_x86sse.h" - - -#define X87_CW_EXCEPTION_INV_OP (1<<0) -#define X87_CW_EXCEPTION_DENORM_OP (1<<1) -#define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2) -#define X87_CW_EXCEPTION_OVERFLOW (1<<3) -#define X87_CW_EXCEPTION_UNDERFLOW (1<<4) -#define X87_CW_EXCEPTION_PRECISION (1<<5) -#define X87_CW_PRECISION_SINGLE (0<<8) -#define X87_CW_PRECISION_RESERVED (1<<8) -#define X87_CW_PRECISION_DOUBLE (2<<8) -#define X87_CW_PRECISION_DOUBLE_EXT (3<<8) -#define X87_CW_PRECISION_MASK (3<<8) -#define X87_CW_ROUND_NEAREST (0<<10) -#define X87_CW_ROUND_DOWN (1<<10) -#define X87_CW_ROUND_UP (2<<10) -#define X87_CW_ROUND_ZERO (3<<10) -#define X87_CW_ROUND_MASK (3<<10) -#define X87_CW_INFINITY (1<<12) - - -void PIPE_CDECL aos_do_lit( struct aos_machine *machine, - float *result, - const float *in, - unsigned count ) -{ - if (in[0] > 0) - { - if (in[1] <= 0.0) - { - result[0] = 1.0F; - result[1] = in[0]; - result[2] = 0.0F; - result[3] = 1.0F; - } - else - { - const float epsilon = 1.0F / 256.0F; - float exponent = CLAMP(in[3], -(128.0F - epsilon), (128.0F - epsilon)); - result[0] = 1.0F; - result[1] = in[0]; - result[2] = powf(in[1], exponent); - result[3] = 1.0; - } - } - else - { - result[0] = 1.0F; - result[1] = 0.0; - result[2] = 0.0; - result[3] = 1.0F; - } -} - - -static void PIPE_CDECL do_lit_lut( struct aos_machine *machine, - float *result, - const float *in, - unsigned count ) -{ - if (in[0] > 0) - { - if (in[1] <= 0.0) - { - result[0] = 1.0F; - result[1] = in[0]; - result[2] = 0.0F; - result[3] = 1.0F; - return; - } - - if (machine->lit_info[count].shine_tab->exponent != in[3]) { - machine->lit_info[count].func = aos_do_lit; - goto no_luck; - } - - if (in[1] <= 1.0) - { - const float *tab = machine->lit_info[count].shine_tab->values; - float f = in[1] * 256; - int k = (int)f; - float frac = f - (float)k; - - result[0] = 1.0F; - result[1] = in[0]; - result[2] = tab[k] + frac*(tab[k+1]-tab[k]); - result[3] = 1.0; - return; - } - - no_luck: - { - const float epsilon = 1.0F / 256.0F; - float exponent = CLAMP(in[3], -(128.0F - epsilon), (128.0F - epsilon)); - result[0] = 1.0F; - result[1] = in[0]; - result[2] = powf(in[1], exponent); - result[3] = 1.0; - } - } - else - { - result[0] = 1.0F; - result[1] = 0.0; - result[2] = 0.0; - result[3] = 1.0F; - } -} - - -static void do_populate_lut( struct shine_tab *tab, - float unclamped_exponent ) -{ - const float epsilon = 1.0F / 256.0F; - float exponent = CLAMP(unclamped_exponent, -(128.0F - epsilon), (128.0F - epsilon)); - unsigned i; - - tab->exponent = unclamped_exponent; /* for later comparison */ - - tab->values[0] = 0; - if (exponent == 0) { - for (i = 1; i < 258; i++) { - tab->values[i] = 1.0; - } - } - else { - for (i = 1; i < 258; i++) { - tab->values[i] = powf((float)i * epsilon, exponent); - } - } -} - - - - -static void PIPE_CDECL populate_lut( struct aos_machine *machine, - float *result, - const float *in, - unsigned count ) -{ - unsigned i, tab; - - /* Search for an existing table for this value. Note that without - * static analysis we don't really know if in[3] will be constant, - * but it usually is... - */ - for (tab = 0; tab < 4; tab++) { - if (machine->shine_tab[tab].exponent == in[3]) { - goto found; - } - } - - for (tab = 0, i = 1; i < 4; i++) { - if (machine->shine_tab[i].last_used < machine->shine_tab[tab].last_used) - tab = i; - } - - if (machine->shine_tab[tab].last_used == machine->now) { - /* No unused tables (this is not a ffvertex program...). Just - * call pow each time: - */ - machine->lit_info[count].func = aos_do_lit; - machine->lit_info[count].func( machine, result, in, count ); - return; - } - else { - do_populate_lut( &machine->shine_tab[tab], in[3] ); - } - - found: - machine->shine_tab[tab].last_used = machine->now; - machine->lit_info[count].shine_tab = &machine->shine_tab[tab]; - machine->lit_info[count].func = do_lit_lut; - machine->lit_info[count].func( machine, result, in, count ); -} - - -void -draw_vs_aos_machine_constants(struct aos_machine *machine, - unsigned slot, - const void *constants) -{ - machine->constants[slot] = constants; - - { - unsigned i; - for (i = 0; i < MAX_LIT_INFO; i++) { - machine->lit_info[i].func = populate_lut; - machine->now++; - } - } -} - - -void draw_vs_aos_machine_viewport( struct aos_machine *machine, - const struct pipe_viewport_state *viewport ) -{ - memcpy(machine->scale, viewport->scale, 4 * sizeof(float)); - memcpy(machine->translate, viewport->translate, 4 * sizeof(float)); -} - - - -void draw_vs_aos_machine_destroy( struct aos_machine *machine ) -{ - align_free(machine); -} - -struct aos_machine *draw_vs_aos_machine( void ) -{ - struct aos_machine *machine; - unsigned i; - float inv = 1.0f/255.0f; - float f255 = 255.0f; - - machine = align_malloc(sizeof(struct aos_machine), 16); - if (!machine) - return NULL; - - memset(machine, 0, sizeof(*machine)); - - ASSIGN_4V(machine->internal[IMM_SWZ], 1.0f, -1.0f, 0.0f, 1.0f); - *(unsigned *)&machine->internal[IMM_SWZ][3] = 0xffffffff; - - ASSIGN_4V(machine->internal[IMM_ONES], 1.0f, 1.0f, 1.0f, 1.0f); - ASSIGN_4V(machine->internal[IMM_NEGS], -1.0f, -1.0f, -1.0f, -1.0f); - ASSIGN_4V(machine->internal[IMM_IDENTITY], 0.0f, 0.0f, 0.0f, 1.0f); - ASSIGN_4V(machine->internal[IMM_INV_255], inv, inv, inv, inv); - ASSIGN_4V(machine->internal[IMM_255], f255, f255, f255, f255); - ASSIGN_4V(machine->internal[IMM_RSQ], -.5f, 1.5f, 0.0f, 0.0f); - - - machine->fpu_rnd_nearest = (X87_CW_EXCEPTION_INV_OP | - X87_CW_EXCEPTION_DENORM_OP | - X87_CW_EXCEPTION_ZERO_DIVIDE | - X87_CW_EXCEPTION_OVERFLOW | - X87_CW_EXCEPTION_UNDERFLOW | - X87_CW_EXCEPTION_PRECISION | - (1<<6) | - X87_CW_ROUND_NEAREST | - X87_CW_PRECISION_DOUBLE_EXT); - - assert(machine->fpu_rnd_nearest == 0x37f); - - machine->fpu_rnd_neg_inf = (X87_CW_EXCEPTION_INV_OP | - X87_CW_EXCEPTION_DENORM_OP | - X87_CW_EXCEPTION_ZERO_DIVIDE | - X87_CW_EXCEPTION_OVERFLOW | - X87_CW_EXCEPTION_UNDERFLOW | - X87_CW_EXCEPTION_PRECISION | - (1<<6) | - X87_CW_ROUND_DOWN | - X87_CW_PRECISION_DOUBLE_EXT); - - for (i = 0; i < MAX_SHINE_TAB; i++) - do_populate_lut( &machine->shine_tab[i], 1.0f ); - - return machine; -} - -#else - -void draw_vs_aos_machine_viewport( struct aos_machine *machine, - const struct pipe_viewport_state *viewport ) -{ -} - -void -draw_vs_aos_machine_constants(struct aos_machine *machine, - unsigned slot, - const void *constants) -{ -} - -void draw_vs_aos_machine_destroy( struct aos_machine *machine ) -{ -} - -struct aos_machine *draw_vs_aos_machine( void ) -{ - return NULL; -} -#endif - diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c index cf894bb..7fb0e09 100644 --- a/src/gallium/auxiliary/draw/draw_vs_ppc.c +++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c @@ -185,12 +185,7 @@ draw_create_vs_ppc(struct draw_context *draw, tgsi_scan_shader(templ->tokens, &vs->base.info); vs->base.draw = draw; -#if 0 - if (1) - vs->base.create_variant = draw_vs_variant_aos_ppc; - else -#endif - vs->base.create_variant = draw_vs_create_variant_generic; + vs->base.create_variant = draw_vs_create_variant_generic; vs->base.prepare = vs_ppc_prepare; vs->base.run_linear = vs_ppc_run_linear; vs->base.delete = vs_ppc_delete; diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c deleted file mode 100644 index d918579..0000000 --- a/src/gallium/auxiliary/draw/draw_vs_sse.c +++ /dev/null @@ -1,225 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - /* - * Authors: - * Keith Whitwell - * Brian Paul - */ - -#include "util/u_math.h" -#include "util/u_memory.h" -#include "pipe/p_config.h" - -#include "draw_vs.h" - -#if defined(PIPE_ARCH_X86) - -#include "pipe/p_shader_tokens.h" - -#include "draw_private.h" -#include "draw_context.h" - -#include "rtasm/rtasm_cpu.h" -#include "rtasm/rtasm_x86sse.h" -#include "tgsi/tgsi_sse2.h" -#include "tgsi/tgsi_parse.h" -#include "tgsi/tgsi_exec.h" - -#define SSE_MAX_VERTICES 4 - - -struct draw_sse_vertex_shader { - struct draw_vertex_shader base; - struct x86_function sse2_program; - - tgsi_sse2_vs_func func; - - struct tgsi_exec_machine *machine; -}; - - -static void -vs_sse_prepare( struct draw_vertex_shader *base, - struct draw_context *draw ) -{ - struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base; - struct tgsi_exec_machine *machine = shader->machine; - - machine->Samplers = draw->vs.samplers; - - if (base->info.uses_instanceid) { - unsigned i = machine->SysSemanticToIndex[TGSI_SEMANTIC_INSTANCEID]; - assert(i < Elements(machine->SystemValue)); - machine->SystemValue[i][0] = base->draw->instance_id; - } -} - - - -/* Simplified vertex shader interface for the pt paths. Given the - * complexity of code-generating all the above operations together, - * it's time to try doing all the other stuff separately. - */ -static void -vs_sse_run_linear( struct draw_vertex_shader *base, - const float (*input)[4], - float (*output)[4], - const void *constants[PIPE_MAX_CONSTANT_BUFFERS], - const unsigned const_size[PIPE_MAX_CONSTANT_BUFFERS], - unsigned count, - unsigned input_stride, - unsigned output_stride ) -{ - struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base; - struct tgsi_exec_machine *machine = shader->machine; - unsigned int i; - - /* By default, execute all channels. XXX move this inside the loop - * below when we support shader conditionals/loops. - */ - tgsi_set_exec_mask(machine, 1, 1, 1, 1); - - for (i = 0; i < count; i += MAX_TGSI_VERTICES) { - unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i); - - if (max_vertices < 4) { - /* disable the unused execution channels */ - tgsi_set_exec_mask(machine, - 1, - max_vertices > 1, - max_vertices > 2, - 0); - } - - /* run compiled shader - */ - shader->func(machine, - (const float (*)[4])constants[0], - shader->base.immediates, - input, - base->info.num_inputs, - input_stride, - output, - base->info.num_outputs, - output_stride ); - - input = (const float (*)[4])((const char *)input + input_stride * max_vertices); - output = (float (*)[4])((char *)output + output_stride * max_vertices); - } -} - - - - -static void -vs_sse_delete( struct draw_vertex_shader *base ) -{ - struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base; - - x86_release_func( &shader->sse2_program ); - - align_free( (void *) shader->base.immediates ); - - FREE( (void*) shader->base.state.tokens ); - FREE( shader ); -} - - -struct draw_vertex_shader * -draw_create_vs_sse(struct draw_context *draw, - const struct pipe_shader_state *templ) -{ - struct draw_sse_vertex_shader *vs; - - if (!rtasm_cpu_has_sse2()) - return NULL; - - vs = CALLOC_STRUCT( draw_sse_vertex_shader ); - if (vs == NULL) - return NULL; - - /* we make a private copy of the tokens */ - vs->base.state.tokens = tgsi_dup_tokens(templ->tokens); - if (!vs->base.state.tokens) - goto fail; - - tgsi_scan_shader(templ->tokens, &vs->base.info); - - vs->base.draw = draw; - if (1) - vs->base.create_variant = draw_vs_create_variant_aos_sse; - else - vs->base.create_variant = draw_vs_create_variant_generic; - vs->base.prepare = vs_sse_prepare; - vs->base.run_linear = vs_sse_run_linear; - vs->base.delete = vs_sse_delete; - - vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 * - sizeof(float), 16); - - vs->machine = draw->vs.machine; - - x86_init_func( &vs->sse2_program ); - - if (!tgsi_emit_sse2( (struct tgsi_token *) vs->base.state.tokens, - &vs->sse2_program, - (float (*)[4])vs->base.immediates, - TRUE )) - goto fail; - - vs->func = (tgsi_sse2_vs_func) x86_get_func( &vs->sse2_program ); - if (!vs->func) { - goto fail; - } - - return &vs->base; - -fail: - if (0) - debug_warning("tgsi_emit_sse2() failed, falling back to interpreter\n"); - - x86_release_func( &vs->sse2_program ); - - FREE(vs); - return NULL; -} - - - -#else - -struct draw_vertex_shader * -draw_create_vs_sse( struct draw_context *draw, - const struct pipe_shader_state *templ ) -{ - return (void *) 0; -} - - -#endif - diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c deleted file mode 100644 index 5614caf..0000000 --- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c +++ /dev/null @@ -1,3106 +0,0 @@ -/************************************************************************** - * - * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * Copyright 2009-2010 VMware, Inc. All rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -#include "pipe/p_config.h" - -#include "tgsi/tgsi_sse2.h" - -#if defined(PIPE_ARCH_X86) && 0 /* See FIXME notes below */ - -#include "util/u_debug.h" -#include "pipe/p_shader_tokens.h" -#include "util/u_math.h" -#include "util/u_memory.h" -#if defined(PIPE_ARCH_SSE) -#include "util/u_sse.h" -#endif -#include "tgsi/tgsi_info.h" -#include "tgsi/tgsi_parse.h" -#include "tgsi/tgsi_util.h" -#include "tgsi/tgsi_dump.h" -#include "tgsi/tgsi_exec.h" - -#include "rtasm/rtasm_x86sse.h" - -/* for 1/sqrt() - * - * This costs about 100fps (close to 10%) in gears: - */ -#define HIGH_PRECISION 1 - -#define FAST_MATH 1 - - -#define FOR_EACH_CHANNEL( CHAN )\ - for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++) - -#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\ - ((INST).Dst[0].Register.WriteMask & (1 << (CHAN))) - -#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\ - if (IS_DST0_CHANNEL_ENABLED( INST, CHAN )) - -#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\ - FOR_EACH_CHANNEL( CHAN )\ - IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN ) - -#define CHAN_X 0 -#define CHAN_Y 1 -#define CHAN_Z 2 -#define CHAN_W 3 - -#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I -#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C - -#define TEMP_R0 TGSI_EXEC_TEMP_R0 -#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR -#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I -#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C - - -/** - * X86 utility functions. - */ - -static struct x86_reg -make_xmm( - unsigned xmm ) -{ - return x86_make_reg( - file_XMM, - (enum x86_reg_name) xmm ); -} - -/** - * X86 register mapping helpers. - */ - -static struct x86_reg -get_const_base( void ) -{ - return x86_make_reg( - file_REG32, - reg_AX ); -} - -static struct x86_reg -get_machine_base( void ) -{ - return x86_make_reg( - file_REG32, - reg_CX ); -} - -static struct x86_reg -get_input_base( void ) -{ - /* FIXME: tgsi_exec_machine::Inputs is a pointer now! */ - return x86_make_disp( - get_machine_base(), - Offset(struct tgsi_exec_machine, Inputs) ); -} - -static struct x86_reg -get_output_base( void ) -{ - /* FIXME: tgsi_exec_machine::Ouputs is a pointer now! */ - return x86_make_disp( - get_machine_base(), - Offset(struct tgsi_exec_machine, Outputs) ); -} - -static struct x86_reg -get_temp_base( void ) -{ - return x86_make_disp( - get_machine_base(), - Offset(struct tgsi_exec_machine, Temps) ); -} - -static struct x86_reg -get_coef_base( void ) -{ - return x86_make_reg( - file_REG32, - reg_BX ); -} - -static struct x86_reg -get_sampler_base( void ) -{ - return x86_make_reg( - file_REG32, - reg_DI ); -} - -static struct x86_reg -get_immediate_base( void ) -{ - return x86_make_reg( - file_REG32, - reg_DX ); -} - -static struct x86_reg -get_system_value_base( void ) -{ - return x86_make_disp( - get_machine_base(), - Offset(struct tgsi_exec_machine, SystemValue) ); -} - - -/** - * Data access helpers. - */ - - -static struct x86_reg -get_immediate( - unsigned vec, - unsigned chan ) -{ - return x86_make_disp( - get_immediate_base(), - (vec * 4 + chan) * 4 ); -} - -static struct x86_reg -get_const( - unsigned vec, - unsigned chan ) -{ - return x86_make_disp( - get_const_base(), - (vec * 4 + chan) * 4 ); -} - -static struct x86_reg -get_sampler_ptr( - unsigned unit ) -{ - return x86_make_disp( - get_sampler_base(), - unit * sizeof( struct tgsi_sampler * ) ); -} - -static struct x86_reg -get_input( - unsigned vec, - unsigned chan ) -{ - return x86_make_disp( - get_input_base(), - (vec * 4 + chan) * 16 ); -} - -static struct x86_reg -get_output( - unsigned vec, - unsigned chan ) -{ - return x86_make_disp( - get_output_base(), - (vec * 4 + chan) * 16 ); -} - -static struct x86_reg -get_temp( - unsigned vec, - unsigned chan ) -{ - return x86_make_disp( - get_temp_base(), - (vec * 4 + chan) * 16 ); -} - -static struct x86_reg -get_system_value( - unsigned vec, - unsigned chan ) -{ - return x86_make_disp( - get_system_value_base(), /* base */ - (vec * 4 + chan) * 4 ); /* byte offset from base */ -} - -static struct x86_reg -get_coef( - unsigned vec, - unsigned chan, - unsigned member ) -{ - return x86_make_disp( - get_coef_base(), - ((vec * 3 + member) * 4 + chan) * 4 ); -} - - -static void -emit_ret( - struct x86_function *func ) -{ - x86_ret( func ); -} - - -/** - * Data fetch helpers. - */ - -/** - * Copy a shader constant to xmm register - * \param xmm the destination xmm register - * \param vec the src const buffer index - * \param chan src channel to fetch (X, Y, Z or W) - */ -static void -emit_const( - struct x86_function *func, - uint xmm, - int vec, - uint chan, - uint indirect, - uint indirectFile, - int indirectIndex ) -{ - if (indirect) { - /* 'vec' is the offset from the address register's value. - * We're loading CONST[ADDR+vec] into an xmm register. - */ - struct x86_reg r0 = get_immediate_base(); - struct x86_reg r1 = get_coef_base(); - uint i; - - assert( indirectFile == TGSI_FILE_ADDRESS ); - assert( indirectIndex == 0 ); - assert( r0.mod == mod_REG ); - assert( r1.mod == mod_REG ); - - x86_push( func, r0 ); - x86_push( func, r1 ); - - /* - * Loop over the four pixels or vertices in the quad. - * Get the value of the address (offset) register for pixel/vertex[i], - * add it to the src offset and index into the constant buffer. - * Note that we're working on SOA data. - * If any of the pixel/vertex execution channels are unused their - * values will be garbage. It's very important that we don't use - * those garbage values as indexes into the constant buffer since - * that'll cause segfaults. - * The solution is to bitwise-AND the offset with the execution mask - * register whose values are either 0 or ~0. - * The caller must setup the execution mask register to indicate - * which channels are valid/alive before running the shader. - * The execution mask will also figure into loops and conditionals - * someday. - */ - for (i = 0; i < QUAD_SIZE; i++) { - /* r1 = address register[i] */ - x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) ); - /* r0 = execution mask[i] */ - x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) ); - /* r1 = r1 & r0 */ - x86_and( func, r1, r0 ); - /* r0 = 'vec', the offset */ - x86_lea( func, r0, get_const( vec, chan ) ); - - /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm. - */ - x86_add( func, r1, r1 ); - x86_add( func, r1, r1 ); - x86_add( func, r1, r1 ); - x86_add( func, r1, r1 ); - - x86_add( func, r0, r1 ); /* r0 = r0 + r1 */ - x86_mov( func, r1, x86_deref( r0 ) ); - x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 ); - } - - x86_pop( func, r1 ); - x86_pop( func, r0 ); - - sse_movaps( - func, - make_xmm( xmm ), - get_temp( TEMP_R0, CHAN_X ) ); - } - else { - /* 'vec' is the index into the src register file, such as TEMP[vec] */ - assert( vec >= 0 ); - - sse_movss( - func, - make_xmm( xmm ), - get_const( vec, chan ) ); - sse_shufps( - func, - make_xmm( xmm ), - make_xmm( xmm ), - SHUF( 0, 0, 0, 0 ) ); - } -} - -static void -emit_immediate( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - sse_movss( - func, - make_xmm( xmm ), - get_immediate( vec, chan ) ); - sse_shufps( - func, - make_xmm( xmm ), - make_xmm( xmm ), - SHUF( 0, 0, 0, 0 ) ); -} - - -/** - * Copy a shader input to xmm register - * \param xmm the destination xmm register - * \param vec the src input attrib - * \param chan src channel to fetch (X, Y, Z or W) - */ -static void -emit_inputf( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - sse_movups( - func, - make_xmm( xmm ), - get_input( vec, chan ) ); -} - -/** - * Store an xmm register to a shader output - * \param xmm the source xmm register - * \param vec the dest output attrib - * \param chan src dest channel to store (X, Y, Z or W) - */ -static void -emit_output( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - sse_movups( - func, - get_output( vec, chan ), - make_xmm( xmm ) ); -} - -/** - * Copy a shader temporary to xmm register - * \param xmm the destination xmm register - * \param vec the src temp register - * \param chan src channel to fetch (X, Y, Z or W) - */ -static void -emit_tempf( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - sse_movaps( - func, - make_xmm( xmm ), - get_temp( vec, chan ) ); -} - -/** - * Copy a system value to xmm register - * \param xmm the destination xmm register - * \param vec the source system value register - * \param chan src channel to fetch (X, Y, Z or W) - */ -static void -emit_system_value( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - sse_movss( - func, - make_xmm( xmm ), - get_system_value( vec, chan ) ); - sse_shufps( - func, - make_xmm( xmm ), - make_xmm( xmm ), - SHUF( 0, 0, 0, 0 ) ); -} - -/** - * Load an xmm register with an input attrib coefficient (a0, dadx or dady) - * \param xmm the destination xmm register - * \param vec the src input/attribute coefficient index - * \param chan src channel to fetch (X, Y, Z or W) - * \param member 0=a0, 1=dadx, 2=dady - */ -static void -emit_coef( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan, - unsigned member ) -{ - sse_movss( - func, - make_xmm( xmm ), - get_coef( vec, chan, member ) ); - sse_shufps( - func, - make_xmm( xmm ), - make_xmm( xmm ), - SHUF( 0, 0, 0, 0 ) ); -} - -/** - * Data store helpers. - */ - -static void -emit_inputs( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - sse_movups( - func, - get_input( vec, chan ), - make_xmm( xmm ) ); -} - -static void -emit_temps( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - sse_movaps( - func, - get_temp( vec, chan ), - make_xmm( xmm ) ); -} - -static void -emit_addrs( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - assert( vec == 0 ); - - emit_temps( - func, - xmm, - vec + TGSI_EXEC_TEMP_ADDR, - chan ); -} - -/** - * Coefficent fetch helpers. - */ - -static void -emit_coef_a0( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - emit_coef( - func, - xmm, - vec, - chan, - 0 ); -} - -static void -emit_coef_dadx( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - emit_coef( - func, - xmm, - vec, - chan, - 1 ); -} - -static void -emit_coef_dady( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - emit_coef( - func, - xmm, - vec, - chan, - 2 ); -} - -/** - * Function call helpers. - */ - -/** - * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be - * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee - * that the stack pointer is 16 byte aligned, as expected. - */ -static void -emit_func_call( - struct x86_function *func, - unsigned xmm_save_mask, - const struct x86_reg *arg, - unsigned nr_args, - void (PIPE_CDECL *code)() ) -{ - struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); - unsigned i, n; - - x86_push( - func, - x86_make_reg( file_REG32, reg_AX) ); - x86_push( - func, - x86_make_reg( file_REG32, reg_CX) ); - x86_push( - func, - x86_make_reg( file_REG32, reg_DX) ); - - /* Store XMM regs to the stack - */ - for(i = 0, n = 0; i < 8; ++i) - if(xmm_save_mask & (1 << i)) - ++n; - - x86_sub_imm( - func, - x86_make_reg( file_REG32, reg_SP ), - n*16); - - for(i = 0, n = 0; i < 8; ++i) - if(xmm_save_mask & (1 << i)) { - sse_movups( - func, - x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ), - make_xmm( i ) ); - ++n; - } - - for (i = 0; i < nr_args; i++) { - /* Load the address of the buffer we use for passing arguments and - * receiving results: - */ - x86_lea( - func, - ecx, - arg[i] ); - - /* Push actual function arguments (currently just the pointer to - * the buffer above), and call the function: - */ - x86_push( func, ecx ); - } - - x86_mov_reg_imm( func, ecx, (unsigned long) code ); - x86_call( func, ecx ); - - /* Pop the arguments (or just add an immediate to esp) - */ - for (i = 0; i < nr_args; i++) { - x86_pop(func, ecx ); - } - - /* Pop the saved XMM regs: - */ - for(i = 0, n = 0; i < 8; ++i) - if(xmm_save_mask & (1 << i)) { - sse_movups( - func, - make_xmm( i ), - x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) ); - ++n; - } - - x86_add_imm( - func, - x86_make_reg( file_REG32, reg_SP ), - n*16); - - /* Restore GP registers in a reverse order. - */ - x86_pop( - func, - x86_make_reg( file_REG32, reg_DX) ); - x86_pop( - func, - x86_make_reg( file_REG32, reg_CX) ); - x86_pop( - func, - x86_make_reg( file_REG32, reg_AX) ); -} - -static void -emit_func_call_dst_src1( - struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst, - unsigned xmm_src0, - void (PIPE_CDECL *code)() ) -{ - struct x86_reg store = get_temp( TEMP_R0, 0 ); - unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst); - - /* Store our input parameters (in xmm regs) to the buffer we use - * for passing arguments. We will pass a pointer to this buffer as - * the actual function argument. - */ - sse_movaps( - func, - store, - make_xmm( xmm_src0 ) ); - - emit_func_call( func, - xmm_mask, - &store, - 1, - code ); - - sse_movaps( - func, - make_xmm( xmm_dst ), - store ); -} - - -static void -emit_func_call_dst_src2( - struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst, - unsigned xmm_src0, - unsigned xmm_src1, - void (PIPE_CDECL *code)() ) -{ - struct x86_reg store = get_temp( TEMP_R0, 0 ); - unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst); - - /* Store two inputs to parameter buffer. - */ - sse_movaps( - func, - store, - make_xmm( xmm_src0 ) ); - - sse_movaps( - func, - x86_make_disp( store, 4 * sizeof(float) ), - make_xmm( xmm_src1 ) ); - - - /* Emit the call - */ - emit_func_call( func, - xmm_mask, - &store, - 1, - code ); - - /* Retrieve the results: - */ - sse_movaps( - func, - make_xmm( xmm_dst ), - store ); -} - - - - - -#if defined(PIPE_ARCH_SSE) - -/* - * Fast SSE2 implementation of special math functions. - */ - -#define POLY0(x, c0) _mm_set1_ps(c0) -#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) -#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) -#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) -#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) -#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) - -#define EXP_POLY_DEGREE 3 -#define LOG_POLY_DEGREE 5 - -/** - * See http://www.devmaster.net/forums/showthread.php?p=43580 - */ -static INLINE __m128 -exp2f4(__m128 x) -{ - __m128i ipart; - __m128 fpart, expipart, expfpart; - - x = _mm_min_ps(x, _mm_set1_ps( 129.00000f)); - x = _mm_max_ps(x, _mm_set1_ps(-126.99999f)); - - /* ipart = int(x - 0.5) */ - ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f))); - - /* fpart = x - ipart */ - fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart)); - - /* expipart = (float) (1 << ipart) */ - expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23)); - - /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */ -#if EXP_POLY_DEGREE == 5 - expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); -#elif EXP_POLY_DEGREE == 4 - expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f); -#elif EXP_POLY_DEGREE == 3 - expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f); -#elif EXP_POLY_DEGREE == 2 - expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f); -#else -#error -#endif - - return _mm_mul_ps(expipart, expfpart); -} - - -/** - * See http://www.devmaster.net/forums/showthread.php?p=43580 - */ -static INLINE __m128 -log2f4(__m128 x) -{ - __m128i expmask = _mm_set1_epi32(0x7f800000); - __m128i mantmask = _mm_set1_epi32(0x007fffff); - __m128 one = _mm_set1_ps(1.0f); - - __m128i i = _mm_castps_si128(x); - - /* exp = (float) exponent(x) */ - __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127))); - - /* mant = (float) mantissa(x) */ - __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one); - - __m128 logmant; - - /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ - * These coefficients can be generate with - * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html - */ -#if LOG_POLY_DEGREE == 6 - logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f); -#elif LOG_POLY_DEGREE == 5 - logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); -#elif LOG_POLY_DEGREE == 4 - logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); -#elif LOG_POLY_DEGREE == 3 - logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); -#else -#error -#endif - - /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/ - logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one)); - - return _mm_add_ps(logmant, exp); -} - - -static INLINE __m128 -powf4(__m128 x, __m128 y) -{ - return exp2f4(_mm_mul_ps(log2f4(x), y)); -} - -#endif /* PIPE_ARCH_SSE */ - - - -/** - * Low-level instruction translators. - */ - -static void -emit_abs( - struct x86_function *func, - unsigned xmm ) -{ - sse_andps( - func, - make_xmm( xmm ), - get_temp( - TGSI_EXEC_TEMP_7FFFFFFF_I, - TGSI_EXEC_TEMP_7FFFFFFF_C ) ); -} - -static void -emit_add( - struct x86_function *func, - unsigned xmm_dst, - unsigned xmm_src ) -{ - sse_addps( - func, - make_xmm( xmm_dst ), - make_xmm( xmm_src ) ); -} - -static void PIPE_CDECL -cos4f( - float *store ) -{ - store[0] = cosf( store[0] ); - store[1] = cosf( store[1] ); - store[2] = cosf( store[2] ); - store[3] = cosf( store[3] ); -} - -static void -emit_cos( - struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst ) -{ - emit_func_call_dst_src1( - func, - xmm_save, - xmm_dst, - xmm_dst, - cos4f ); -} - -static void PIPE_CDECL -#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE) -__attribute__((force_align_arg_pointer)) -#endif -ex24f( - float *store ) -{ -#if defined(PIPE_ARCH_SSE) - _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) )); -#else - store[0] = util_fast_exp2( store[0] ); - store[1] = util_fast_exp2( store[1] ); - store[2] = util_fast_exp2( store[2] ); - store[3] = util_fast_exp2( store[3] ); -#endif -} - -static void -emit_ex2( - struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst ) -{ - emit_func_call_dst_src1( - func, - xmm_save, - xmm_dst, - xmm_dst, - ex24f ); -} - -static void -emit_f2it( - struct x86_function *func, - unsigned xmm ) -{ - sse2_cvttps2dq( - func, - make_xmm( xmm ), - make_xmm( xmm ) ); -} - -static void -emit_i2f( - struct x86_function *func, - unsigned xmm ) -{ - sse2_cvtdq2ps( - func, - make_xmm( xmm ), - make_xmm( xmm ) ); -} - -static void PIPE_CDECL -flr4f( - float *store ) -{ - store[0] = floorf( store[0] ); - store[1] = floorf( store[1] ); - store[2] = floorf( store[2] ); - store[3] = floorf( store[3] ); -} - -static void -emit_flr( - struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst ) -{ - emit_func_call_dst_src1( - func, - xmm_save, - xmm_dst, - xmm_dst, - flr4f ); -} - -static void PIPE_CDECL -frc4f( - float *store ) -{ - store[0] -= floorf( store[0] ); - store[1] -= floorf( store[1] ); - store[2] -= floorf( store[2] ); - store[3] -= floorf( store[3] ); -} - -static void -emit_frc( - struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst ) -{ - emit_func_call_dst_src1( - func, - xmm_save, - xmm_dst, - xmm_dst, - frc4f ); -} - -static void PIPE_CDECL -#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE) -__attribute__((force_align_arg_pointer)) -#endif -lg24f( - float *store ) -{ -#if defined(PIPE_ARCH_SSE) - _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) )); -#else - store[0] = util_fast_log2( store[0] ); - store[1] = util_fast_log2( store[1] ); - store[2] = util_fast_log2( store[2] ); - store[3] = util_fast_log2( store[3] ); -#endif -} - -static void -emit_lg2( - struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst ) -{ - emit_func_call_dst_src1( - func, - xmm_save, - xmm_dst, - xmm_dst, - lg24f ); -} - -static void -emit_MOV( - struct x86_function *func, - unsigned xmm_dst, - unsigned xmm_src ) -{ - sse_movups( - func, - make_xmm( xmm_dst ), - make_xmm( xmm_src ) ); -} - -static void -emit_mul (struct x86_function *func, - unsigned xmm_dst, - unsigned xmm_src) -{ - sse_mulps( - func, - make_xmm( xmm_dst ), - make_xmm( xmm_src ) ); -} - -static void -emit_neg( - struct x86_function *func, - unsigned xmm ) -{ - sse_xorps( - func, - make_xmm( xmm ), - get_temp( - TGSI_EXEC_TEMP_80000000_I, - TGSI_EXEC_TEMP_80000000_C ) ); -} - -static void PIPE_CDECL -#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE) -__attribute__((force_align_arg_pointer)) -#endif -pow4f( - float *store ) -{ -#if defined(PIPE_ARCH_SSE) - _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) )); -#else - store[0] = util_fast_pow( store[0], store[4] ); - store[1] = util_fast_pow( store[1], store[5] ); - store[2] = util_fast_pow( store[2], store[6] ); - store[3] = util_fast_pow( store[3], store[7] ); -#endif -} - -static void -emit_pow( - struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst, - unsigned xmm_src0, - unsigned xmm_src1 ) -{ - emit_func_call_dst_src2( - func, - xmm_save, - xmm_dst, - xmm_src0, - xmm_src1, - pow4f ); -} - -static void -emit_rcp ( - struct x86_function *func, - unsigned xmm_dst, - unsigned xmm_src ) -{ - /* On Intel CPUs at least, this is only accurate to 12 bits -- not - * good enough. Need to either emit a proper divide or use the - * iterative technique described below in emit_rsqrt(). - */ - sse2_rcpps( - func, - make_xmm( xmm_dst ), - make_xmm( xmm_src ) ); -} - -static void PIPE_CDECL -rnd4f( - float *store ) -{ - store[0] = floorf( store[0] + 0.5f ); - store[1] = floorf( store[1] + 0.5f ); - store[2] = floorf( store[2] + 0.5f ); - store[3] = floorf( store[3] + 0.5f ); -} - -static void -emit_rnd( - struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst ) -{ - emit_func_call_dst_src1( - func, - xmm_save, - xmm_dst, - xmm_dst, - rnd4f ); -} - -static void -emit_rsqrt( - struct x86_function *func, - unsigned xmm_dst, - unsigned xmm_src ) -{ -#if HIGH_PRECISION - /* Although rsqrtps() and rcpps() are low precision on some/all SSE - * implementations, it is possible to improve its precision at - * fairly low cost, using a newton/raphson step, as below: - * - * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a) - * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)] - * - * See: http://softwarecommunity.intel.com/articles/eng/1818.htm - */ - { - struct x86_reg dst = make_xmm( xmm_dst ); - struct x86_reg src = make_xmm( xmm_src ); - struct x86_reg tmp0 = make_xmm( 2 ); - struct x86_reg tmp1 = make_xmm( 3 ); - - assert( xmm_dst != xmm_src ); - assert( xmm_dst != 2 && xmm_dst != 3 ); - assert( xmm_src != 2 && xmm_src != 3 ); - - sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) ); - sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); - sse_rsqrtps( func, tmp1, src ); - sse_mulps( func, src, tmp1 ); - sse_mulps( func, dst, tmp1 ); - sse_mulps( func, src, tmp1 ); - sse_subps( func, tmp0, src ); - sse_mulps( func, dst, tmp0 ); - } -#else - /* On Intel CPUs at least, this is only accurate to 12 bits -- not - * good enough. - */ - sse_rsqrtps( - func, - make_xmm( xmm_dst ), - make_xmm( xmm_src ) ); -#endif -} - -static void -emit_setsign( - struct x86_function *func, - unsigned xmm ) -{ - sse_orps( - func, - make_xmm( xmm ), - get_temp( - TGSI_EXEC_TEMP_80000000_I, - TGSI_EXEC_TEMP_80000000_C ) ); -} - -static void PIPE_CDECL -sgn4f( - float *store ) -{ - store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f; - store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f; - store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f; - store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f; -} - -static void -emit_sgn( - struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst ) -{ - emit_func_call_dst_src1( - func, - xmm_save, - xmm_dst, - xmm_dst, - sgn4f ); -} - -static void PIPE_CDECL -sin4f( - float *store ) -{ - store[0] = sinf( store[0] ); - store[1] = sinf( store[1] ); - store[2] = sinf( store[2] ); - store[3] = sinf( store[3] ); -} - -static void -emit_sin (struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst) -{ - emit_func_call_dst_src1( - func, - xmm_save, - xmm_dst, - xmm_dst, - sin4f ); -} - -static void -emit_sub( - struct x86_function *func, - unsigned xmm_dst, - unsigned xmm_src ) -{ - sse_subps( - func, - make_xmm( xmm_dst ), - make_xmm( xmm_src ) ); -} - -/** - * Register fetch. - */ -static void -emit_fetch( - struct x86_function *func, - unsigned xmm, - const struct tgsi_full_src_register *reg, - const unsigned chan_index ) -{ - unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index ); - - switch (swizzle) { - case TGSI_SWIZZLE_X: - case TGSI_SWIZZLE_Y: - case TGSI_SWIZZLE_Z: - case TGSI_SWIZZLE_W: - switch (reg->Register.File) { - case TGSI_FILE_CONSTANT: - emit_const( - func, - xmm, - reg->Register.Index, - swizzle, - reg->Register.Indirect, - reg->Indirect.File, - reg->Indirect.Index ); - break; - - case TGSI_FILE_IMMEDIATE: - emit_immediate( - func, - xmm, - reg->Register.Index, - swizzle ); - break; - - case TGSI_FILE_SYSTEM_VALUE: - emit_system_value( - func, - xmm, - reg->Register.Index, - swizzle ); - break; - - case TGSI_FILE_INPUT: - emit_inputf( - func, - xmm, - reg->Register.Index, - swizzle ); - break; - - case TGSI_FILE_TEMPORARY: - emit_tempf( - func, - xmm, - reg->Register.Index, - swizzle ); - break; - - default: - assert( 0 ); - } - break; - - default: - assert( 0 ); - } - - switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) { - case TGSI_UTIL_SIGN_CLEAR: - emit_abs( func, xmm ); - break; - - case TGSI_UTIL_SIGN_SET: - emit_setsign( func, xmm ); - break; - - case TGSI_UTIL_SIGN_TOGGLE: - emit_neg( func, xmm ); - break; - - case TGSI_UTIL_SIGN_KEEP: - break; - } -} - -#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\ - emit_fetch( FUNC, XMM, &(INST).Src[INDEX], CHAN ) - -/** - * Register store. - */ -static void -emit_store( - struct x86_function *func, - unsigned xmm, - const struct tgsi_full_dst_register *reg, - const struct tgsi_full_instruction *inst, - unsigned chan_index ) -{ - switch( inst->Instruction.Saturate ) { - case TGSI_SAT_NONE: - break; - - case TGSI_SAT_ZERO_ONE: - sse_maxps( - func, - make_xmm( xmm ), - get_temp( - TGSI_EXEC_TEMP_00000000_I, - TGSI_EXEC_TEMP_00000000_C ) ); - - sse_minps( - func, - make_xmm( xmm ), - get_temp( - TGSI_EXEC_TEMP_ONE_I, - TGSI_EXEC_TEMP_ONE_C ) ); - break; - - case TGSI_SAT_MINUS_PLUS_ONE: - assert( 0 ); - break; - } - - - switch( reg->Register.File ) { - case TGSI_FILE_OUTPUT: - emit_output( - func, - xmm, - reg->Register.Index, - chan_index ); - break; - - case TGSI_FILE_TEMPORARY: - emit_temps( - func, - xmm, - reg->Register.Index, - chan_index ); - break; - - case TGSI_FILE_ADDRESS: - emit_addrs( - func, - xmm, - reg->Register.Index, - chan_index ); - break; - - default: - assert( 0 ); - } -} - -#define STORE( FUNC, INST, XMM, INDEX, CHAN )\ - emit_store( FUNC, XMM, &(INST).Dst[INDEX], &(INST), CHAN ) - - -static void PIPE_CDECL -fetch_texel( struct tgsi_sampler **sampler, - float *store ) -{ -#if 0 - uint j; - - debug_printf("%s sampler: %p (%p) store: %p\n", - __FUNCTION__, - sampler, *sampler, - store ); - - for (j = 0; j < 4; j++) - debug_printf("sample %d texcoord %f %f %f lodbias %f\n", - j, - store[0+j], - store[4+j], - store[8 + j], - store[12 + j]); -#endif - - { - float rgba[NUM_CHANNELS][QUAD_SIZE]; - (*sampler)->get_samples(*sampler, - &store[0], /* s */ - &store[4], /* t */ - &store[8], /* r */ - &store[12], /* lodbias */ - tgsi_sampler_lod_bias, - rgba); /* results */ - - memcpy( store, rgba, 16 * sizeof(float)); - } - -#if 0 - for (j = 0; j < 4; j++) - debug_printf("sample %d result %f %f %f %f\n", - j, - store[0+j], - store[4+j], - store[8+j], - store[12+j]); -#endif -} - -/** - * High-level instruction translators. - */ -static void -emit_tex( struct x86_function *func, - const struct tgsi_full_instruction *inst, - boolean lodbias, - boolean projected) -{ - const uint unit = inst->Src[1].Register.Index; - struct x86_reg args[2]; - unsigned count; - unsigned i; - - assert(inst->Instruction.Texture); - switch (inst->Texture.Texture) { - case TGSI_TEXTURE_1D: - count = 1; - break; - case TGSI_TEXTURE_2D: - case TGSI_TEXTURE_RECT: - case TGSI_TEXTURE_1D_ARRAY: - count = 2; - break; - case TGSI_TEXTURE_SHADOW1D: - case TGSI_TEXTURE_SHADOW2D: - case TGSI_TEXTURE_SHADOWRECT: - case TGSI_TEXTURE_3D: - case TGSI_TEXTURE_CUBE: - case TGSI_TEXTURE_2D_ARRAY: - case TGSI_TEXTURE_SHADOW1D_ARRAY: - count = 3; - break; - case TGSI_TEXTURE_SHADOW2D_ARRAY: - count = 4; - break; - default: - assert(0); - return; - } - - if (lodbias) { - FETCH( func, *inst, 3, 0, 3 ); - } - else { - emit_tempf( - func, - 3, - TGSI_EXEC_TEMP_00000000_I, - TGSI_EXEC_TEMP_00000000_C ); - - } - - /* store lodbias whether enabled or not -- fetch_texel currently - * respects it always. - */ - sse_movaps( func, - get_temp( TEMP_R0, 3 ), - make_xmm( 3 ) ); - - if (projected) { - FETCH( func, *inst, 3, 0, 3 ); - - emit_rcp( func, 3, 3 ); - } - - for (i = 0; i < count; i++) { - FETCH( func, *inst, i, 0, i ); - - if (projected) { - sse_mulps( - func, - make_xmm( i ), - make_xmm( 3 ) ); - } - - /* Store in the argument buffer: - */ - sse_movaps( - func, - get_temp( TEMP_R0, i ), - make_xmm( i ) ); - } - - args[0] = get_temp( TEMP_R0, 0 ); - args[1] = get_sampler_ptr( unit ); - - emit_func_call( func, - 0, - args, - Elements(args), - fetch_texel ); - - /* If all four channels are enabled, could use a pointer to - * dst[0].x instead of TEMP_R0 for store? - */ - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) { - - sse_movaps( - func, - make_xmm( 0 ), - get_temp( TEMP_R0, i ) ); - - STORE( func, *inst, 0, 0, i ); - } -} - - -static void -emit_kil( - struct x86_function *func, - const struct tgsi_full_src_register *reg ) -{ - unsigned uniquemask; - unsigned unique_count = 0; - unsigned chan_index; - unsigned i; - - /* This mask stores component bits that were already tested. Note that - * we test if the value is less than zero, so 1.0 and 0.0 need not to be - * tested. - */ - uniquemask = 0; - - FOR_EACH_CHANNEL( chan_index ) { - unsigned swizzle; - - /* unswizzle channel */ - swizzle = tgsi_util_get_full_src_register_swizzle( - reg, - chan_index ); - - /* check if the component has not been already tested */ - if( !(uniquemask & (1 << swizzle)) ) { - uniquemask |= 1 << swizzle; - - /* allocate register */ - emit_fetch( - func, - unique_count++, - reg, - chan_index ); - } - } - - x86_push( - func, - x86_make_reg( file_REG32, reg_AX ) ); - x86_push( - func, - x86_make_reg( file_REG32, reg_DX ) ); - - for (i = 0 ; i < unique_count; i++ ) { - struct x86_reg dataXMM = make_xmm(i); - - sse_cmpps( - func, - dataXMM, - get_temp( - TGSI_EXEC_TEMP_00000000_I, - TGSI_EXEC_TEMP_00000000_C ), - cc_LessThan ); - - if( i == 0 ) { - sse_movmskps( - func, - x86_make_reg( file_REG32, reg_AX ), - dataXMM ); - } - else { - sse_movmskps( - func, - x86_make_reg( file_REG32, reg_DX ), - dataXMM ); - x86_or( - func, - x86_make_reg( file_REG32, reg_AX ), - x86_make_reg( file_REG32, reg_DX ) ); - } - } - - x86_or( - func, - get_temp( - TGSI_EXEC_TEMP_KILMASK_I, - TGSI_EXEC_TEMP_KILMASK_C ), - x86_make_reg( file_REG32, reg_AX ) ); - - x86_pop( - func, - x86_make_reg( file_REG32, reg_DX ) ); - x86_pop( - func, - x86_make_reg( file_REG32, reg_AX ) ); -} - - -static void -emit_kilp( - struct x86_function *func ) -{ - /* XXX todo / fix me */ -} - - -static void -emit_setcc( - struct x86_function *func, - struct tgsi_full_instruction *inst, - enum sse_cc cc ) -{ - unsigned chan_index; - - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - FETCH( func, *inst, 1, 1, chan_index ); - sse_cmpps( - func, - make_xmm( 0 ), - make_xmm( 1 ), - cc ); - sse_andps( - func, - make_xmm( 0 ), - get_temp( - TEMP_ONE_I, - TEMP_ONE_C ) ); - STORE( func, *inst, 0, 0, chan_index ); - } -} - -static void -emit_cmp( - struct x86_function *func, - struct tgsi_full_instruction *inst ) -{ - unsigned chan_index; - - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - FETCH( func, *inst, 1, 1, chan_index ); - FETCH( func, *inst, 2, 2, chan_index ); - sse_cmpps( - func, - make_xmm( 0 ), - get_temp( - TGSI_EXEC_TEMP_00000000_I, - TGSI_EXEC_TEMP_00000000_C ), - cc_LessThan ); - sse_andps( - func, - make_xmm( 1 ), - make_xmm( 0 ) ); - sse_andnps( - func, - make_xmm( 0 ), - make_xmm( 2 ) ); - sse_orps( - func, - make_xmm( 0 ), - make_xmm( 1 ) ); - STORE( func, *inst, 0, 0, chan_index ); - } -} - - -/** - * Check if inst src/dest regs use indirect addressing into temporary, - * input or output register files. - */ -static boolean -indirect_reg_reference(const struct tgsi_full_instruction *inst) -{ - uint i; - for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { - const struct tgsi_full_src_register *reg = &inst->Src[i]; - if ((reg->Register.File == TGSI_FILE_TEMPORARY || - reg->Register.File == TGSI_FILE_INPUT || - reg->Register.File == TGSI_FILE_OUTPUT) && - reg->Register.Indirect) - return TRUE; - } - for (i = 0; i < inst->Instruction.NumDstRegs; i++) { - const struct tgsi_full_dst_register *reg = &inst->Dst[i]; - if ((reg->Register.File == TGSI_FILE_TEMPORARY || - reg->Register.File == TGSI_FILE_INPUT || - reg->Register.File == TGSI_FILE_OUTPUT) && - reg->Register.Indirect) - return TRUE; - } - return FALSE; -} - - -static int -emit_instruction( - struct x86_function *func, - struct tgsi_full_instruction *inst ) -{ - unsigned chan_index; - - /* we can't handle indirect addressing into temp register file yet */ - if (indirect_reg_reference(inst)) - return FALSE; - - switch (inst->Instruction.Opcode) { - case TGSI_OPCODE_ARL: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - emit_flr(func, 0, 0); - emit_f2it( func, 0 ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_MOV: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 4 + chan_index, 0, chan_index ); - } - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 4 + chan_index, 0, chan_index ); - } - break; - - case TGSI_OPCODE_LIT: - if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) { - emit_tempf( - func, - 0, - TEMP_ONE_I, - TEMP_ONE_C); - if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) { - STORE( func, *inst, 0, 0, CHAN_X ); - } - if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) { - STORE( func, *inst, 0, 0, CHAN_W ); - } - } - if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { - if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { - FETCH( func, *inst, 0, 0, CHAN_X ); - sse_maxps( - func, - make_xmm( 0 ), - get_temp( - TGSI_EXEC_TEMP_00000000_I, - TGSI_EXEC_TEMP_00000000_C ) ); - STORE( func, *inst, 0, 0, CHAN_Y ); - } - if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { - /* XMM[1] = SrcReg[0].yyyy */ - FETCH( func, *inst, 1, 0, CHAN_Y ); - /* XMM[1] = max(XMM[1], 0) */ - sse_maxps( - func, - make_xmm( 1 ), - get_temp( - TGSI_EXEC_TEMP_00000000_I, - TGSI_EXEC_TEMP_00000000_C ) ); - /* XMM[2] = SrcReg[0].wwww */ - FETCH( func, *inst, 2, 0, CHAN_W ); - /* XMM[2] = min(XMM[2], 128.0) */ - sse_minps( - func, - make_xmm( 2 ), - get_temp( - TGSI_EXEC_TEMP_128_I, - TGSI_EXEC_TEMP_128_C ) ); - /* XMM[2] = max(XMM[2], -128.0) */ - sse_maxps( - func, - make_xmm( 2 ), - get_temp( - TGSI_EXEC_TEMP_MINUS_128_I, - TGSI_EXEC_TEMP_MINUS_128_C ) ); - emit_pow( func, 3, 1, 1, 2 ); - FETCH( func, *inst, 0, 0, CHAN_X ); - sse_xorps( - func, - make_xmm( 2 ), - make_xmm( 2 ) ); - sse_cmpps( - func, - make_xmm( 2 ), - make_xmm( 0 ), - cc_LessThan ); - sse_andps( - func, - make_xmm( 2 ), - make_xmm( 1 ) ); - STORE( func, *inst, 2, 0, CHAN_Z ); - } - } - break; - - case TGSI_OPCODE_RCP: - FETCH( func, *inst, 0, 0, CHAN_X ); - emit_rcp( func, 0, 0 ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_RSQ: - FETCH( func, *inst, 0, 0, CHAN_X ); - emit_abs( func, 0 ); - emit_rsqrt( func, 1, 0 ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 1, 0, chan_index ); - } - break; - - case TGSI_OPCODE_EXP: - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) { - FETCH( func, *inst, 0, 0, CHAN_X ); - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) { - emit_MOV( func, 1, 0 ); - emit_flr( func, 2, 1 ); - /* dst.x = ex2(floor(src.x)) */ - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) { - emit_MOV( func, 2, 1 ); - emit_ex2( func, 3, 2 ); - STORE( func, *inst, 2, 0, CHAN_X ); - } - /* dst.y = src.x - floor(src.x) */ - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) { - emit_MOV( func, 2, 0 ); - emit_sub( func, 2, 1 ); - STORE( func, *inst, 2, 0, CHAN_Y ); - } - } - /* dst.z = ex2(src.x) */ - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) { - emit_ex2( func, 3, 0 ); - STORE( func, *inst, 0, 0, CHAN_Z ); - } - } - /* dst.w = 1.0 */ - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) { - emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C ); - STORE( func, *inst, 0, 0, CHAN_W ); - } - break; - - case TGSI_OPCODE_LOG: - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) { - FETCH( func, *inst, 0, 0, CHAN_X ); - emit_abs( func, 0 ); - emit_MOV( func, 1, 0 ); - emit_lg2( func, 2, 1 ); - /* dst.z = lg2(abs(src.x)) */ - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) { - STORE( func, *inst, 1, 0, CHAN_Z ); - } - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) { - emit_flr( func, 2, 1 ); - /* dst.x = floor(lg2(abs(src.x))) */ - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) { - STORE( func, *inst, 1, 0, CHAN_X ); - } - /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */ - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) { - emit_ex2( func, 2, 1 ); - emit_rcp( func, 1, 1 ); - emit_mul( func, 0, 1 ); - STORE( func, *inst, 0, 0, CHAN_Y ); - } - } - } - /* dst.w = 1.0 */ - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) { - emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C ); - STORE( func, *inst, 0, 0, CHAN_W ); - } - break; - - case TGSI_OPCODE_MUL: - /* do all fetches and adds, storing results in temp regs */ - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - int r = chan_index + 1; - FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */ - FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */ - emit_mul( func, r, 0 ); /* xmm[r] = xmm[r] * xmm[0] */ - } - /* do all stores of the temp regs */ - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - int r = chan_index + 1; - STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */ - } - break; - - case TGSI_OPCODE_ADD: - /* do all fetches and adds, storing results in temp regs */ - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - int r = chan_index + 1; - FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */ - FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */ - emit_add( func, r, 0 ); /* xmm[r] = xmm[r] + xmm[0] */ - } - /* do all stores of the temp regs */ - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - int r = chan_index + 1; - STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */ - } - break; - - case TGSI_OPCODE_DP3: - FETCH( func, *inst, 0, 0, CHAN_X ); - FETCH( func, *inst, 1, 1, CHAN_X ); - emit_mul( func, 0, 1 ); - FETCH( func, *inst, 1, 0, CHAN_Y ); - FETCH( func, *inst, 2, 1, CHAN_Y ); - emit_mul( func, 1, 2 ); - emit_add( func, 0, 1 ); - FETCH( func, *inst, 1, 0, CHAN_Z ); - FETCH( func, *inst, 2, 1, CHAN_Z ); - emit_mul( func, 1, 2 ); - emit_add( func, 0, 1 ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_DP4: - FETCH( func, *inst, 0, 0, CHAN_X ); - FETCH( func, *inst, 1, 1, CHAN_X ); - emit_mul( func, 0, 1 ); - FETCH( func, *inst, 1, 0, CHAN_Y ); - FETCH( func, *inst, 2, 1, CHAN_Y ); - emit_mul( func, 1, 2 ); - emit_add( func, 0, 1 ); - FETCH( func, *inst, 1, 0, CHAN_Z ); - FETCH( func, *inst, 2, 1, CHAN_Z ); - emit_mul(func, 1, 2 ); - emit_add(func, 0, 1 ); - FETCH( func, *inst, 1, 0, CHAN_W ); - FETCH( func, *inst, 2, 1, CHAN_W ); - emit_mul( func, 1, 2 ); - emit_add( func, 0, 1 ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_DST: - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) { - emit_tempf( - func, - 0, - TEMP_ONE_I, - TEMP_ONE_C ); - STORE( func, *inst, 0, 0, CHAN_X ); - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) { - FETCH( func, *inst, 0, 0, CHAN_Y ); - FETCH( func, *inst, 1, 1, CHAN_Y ); - emit_mul( func, 0, 1 ); - STORE( func, *inst, 0, 0, CHAN_Y ); - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) { - FETCH( func, *inst, 0, 0, CHAN_Z ); - STORE( func, *inst, 0, 0, CHAN_Z ); - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) { - FETCH( func, *inst, 0, 1, CHAN_W ); - STORE( func, *inst, 0, 0, CHAN_W ); - } - break; - - case TGSI_OPCODE_MIN: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - FETCH( func, *inst, 1, 1, chan_index ); - sse_minps( - func, - make_xmm( 0 ), - make_xmm( 1 ) ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_MAX: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - FETCH( func, *inst, 1, 1, chan_index ); - sse_maxps( - func, - make_xmm( 0 ), - make_xmm( 1 ) ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_SLT: - emit_setcc( func, inst, cc_LessThan ); - break; - - case TGSI_OPCODE_SGE: - emit_setcc( func, inst, cc_NotLessThan ); - break; - - case TGSI_OPCODE_MAD: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - FETCH( func, *inst, 1, 1, chan_index ); - FETCH( func, *inst, 2, 2, chan_index ); - emit_mul( func, 0, 1 ); - emit_add( func, 0, 2 ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_SUB: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - FETCH( func, *inst, 1, 1, chan_index ); - emit_sub( func, 0, 1 ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_LRP: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - FETCH( func, *inst, 1, 1, chan_index ); - FETCH( func, *inst, 2, 2, chan_index ); - emit_sub( func, 1, 2 ); - emit_mul( func, 0, 1 ); - emit_add( func, 0, 2 ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_CND: - return 0; - break; - - case TGSI_OPCODE_DP2A: - FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */ - FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */ - emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */ - FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */ - FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */ - emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */ - emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */ - FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */ - emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */ - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */ - } - break; - - case TGSI_OPCODE_FRC: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - emit_frc( func, 0, 0 ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_CLAMP: - return 0; - break; - - case TGSI_OPCODE_FLR: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - emit_flr( func, 0, 0 ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_ROUND: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - emit_rnd( func, 0, 0 ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_EX2: - FETCH( func, *inst, 0, 0, CHAN_X ); - emit_ex2( func, 0, 0 ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_LG2: - FETCH( func, *inst, 0, 0, CHAN_X ); - emit_lg2( func, 0, 0 ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_POW: - FETCH( func, *inst, 0, 0, CHAN_X ); - FETCH( func, *inst, 1, 1, CHAN_X ); - emit_pow( func, 0, 0, 0, 1 ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_XPD: - /* Note: we do all stores after all operands have been fetched - * to avoid src/dst register aliasing issues for an instruction - * such as: XPD TEMP[2].xyz, TEMP[0], TEMP[2]; - */ - if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { - FETCH( func, *inst, 1, 1, CHAN_Z ); /* xmm[1] = src[1].z */ - FETCH( func, *inst, 3, 0, CHAN_Z ); /* xmm[3] = src[0].z */ - } - if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { - FETCH( func, *inst, 0, 0, CHAN_Y ); /* xmm[0] = src[0].y */ - FETCH( func, *inst, 4, 1, CHAN_Y ); /* xmm[4] = src[1].y */ - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) { - emit_MOV( func, 7, 0 ); /* xmm[7] = xmm[0] */ - emit_mul( func, 7, 1 ); /* xmm[7] = xmm[2] * xmm[1] */ - emit_MOV( func, 5, 3 ); /* xmm[5] = xmm[3] */ - emit_mul( func, 5, 4 ); /* xmm[5] = xmm[5] * xmm[4] */ - emit_sub( func, 7, 5 ); /* xmm[7] = xmm[2] - xmm[5] */ - /* store xmm[7] in dst.x below */ - } - if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { - FETCH( func, *inst, 2, 1, CHAN_X ); /* xmm[2] = src[1].x */ - FETCH( func, *inst, 5, 0, CHAN_X ); /* xmm[5] = src[0].x */ - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) { - emit_mul( func, 3, 2 ); /* xmm[3] = xmm[3] * xmm[2] */ - emit_mul( func, 1, 5 ); /* xmm[1] = xmm[1] * xmm[5] */ - emit_sub( func, 3, 1 ); /* xmm[3] = xmm[3] - xmm[1] */ - /* store xmm[3] in dst.y below */ - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) { - emit_mul( func, 5, 4 ); /* xmm[5] = xmm[5] * xmm[4] */ - emit_mul( func, 0, 2 ); /* xmm[0] = xmm[0] * xmm[2] */ - emit_sub( func, 5, 0 ); /* xmm[5] = xmm[5] - xmm[0] */ - STORE( func, *inst, 5, 0, CHAN_Z ); /* dst.z = xmm[5] */ - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) { - STORE( func, *inst, 7, 0, CHAN_X ); /* dst.x = xmm[7] */ - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) { - STORE( func, *inst, 3, 0, CHAN_Y ); /* dst.y = xmm[3] */ - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) { - emit_tempf( - func, - 0, - TEMP_ONE_I, - TEMP_ONE_C ); - STORE( func, *inst, 0, 0, CHAN_W ); - } - break; - - case TGSI_OPCODE_ABS: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - emit_abs( func, 0) ; - - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_RCC: - return 0; - break; - - case TGSI_OPCODE_DPH: - FETCH( func, *inst, 0, 0, CHAN_X ); - FETCH( func, *inst, 1, 1, CHAN_X ); - emit_mul( func, 0, 1 ); - FETCH( func, *inst, 1, 0, CHAN_Y ); - FETCH( func, *inst, 2, 1, CHAN_Y ); - emit_mul( func, 1, 2 ); - emit_add( func, 0, 1 ); - FETCH( func, *inst, 1, 0, CHAN_Z ); - FETCH( func, *inst, 2, 1, CHAN_Z ); - emit_mul( func, 1, 2 ); - emit_add( func, 0, 1 ); - FETCH( func, *inst, 1, 1, CHAN_W ); - emit_add( func, 0, 1 ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_COS: - FETCH( func, *inst, 0, 0, CHAN_X ); - emit_cos( func, 0, 0 ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_DDX: - return 0; - break; - - case TGSI_OPCODE_DDY: - return 0; - break; - - case TGSI_OPCODE_KILP: - /* predicated kill */ - emit_kilp( func ); - return 0; /* XXX fix me */ - break; - - case TGSI_OPCODE_KIL: - /* conditional kill */ - emit_kil( func, &inst->Src[0] ); - break; - - case TGSI_OPCODE_PK2H: - return 0; - break; - - case TGSI_OPCODE_PK2US: - return 0; - break; - - case TGSI_OPCODE_PK4B: - return 0; - break; - - case TGSI_OPCODE_PK4UB: - return 0; - break; - - case TGSI_OPCODE_RFL: - return 0; - break; - - case TGSI_OPCODE_SEQ: - emit_setcc( func, inst, cc_Equal ); - break; - - case TGSI_OPCODE_SFL: - return 0; - break; - - case TGSI_OPCODE_SGT: - emit_setcc( func, inst, cc_NotLessThanEqual ); - break; - - case TGSI_OPCODE_SIN: - FETCH( func, *inst, 0, 0, CHAN_X ); - emit_sin( func, 0, 0 ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_SLE: - emit_setcc( func, inst, cc_LessThanEqual ); - break; - - case TGSI_OPCODE_SNE: - emit_setcc( func, inst, cc_NotEqual ); - break; - - case TGSI_OPCODE_STR: - return 0; - break; - - case TGSI_OPCODE_TEX: - emit_tex( func, inst, FALSE, FALSE ); - break; - - case TGSI_OPCODE_TXD: - return 0; - break; - - case TGSI_OPCODE_UP2H: - return 0; - break; - - case TGSI_OPCODE_UP2US: - return 0; - break; - - case TGSI_OPCODE_UP4B: - return 0; - break; - - case TGSI_OPCODE_UP4UB: - return 0; - break; - - case TGSI_OPCODE_X2D: - return 0; - break; - - case TGSI_OPCODE_ARA: - return 0; - break; - - case TGSI_OPCODE_ARR: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - emit_rnd( func, 0, 0 ); - emit_f2it( func, 0 ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_BRA: - return 0; - break; - - case TGSI_OPCODE_CAL: - return 0; - break; - - case TGSI_OPCODE_RET: - emit_ret( func ); - break; - - case TGSI_OPCODE_END: - break; - - case TGSI_OPCODE_SSG: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - emit_sgn( func, 0, 0 ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_CMP: - emit_cmp (func, inst); - break; - - case TGSI_OPCODE_SCS: - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) { - FETCH( func, *inst, 0, 0, CHAN_X ); - emit_cos( func, 0, 0 ); - STORE( func, *inst, 0, 0, CHAN_X ); - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) { - FETCH( func, *inst, 0, 0, CHAN_X ); - emit_sin( func, 0, 0 ); - STORE( func, *inst, 0, 0, CHAN_Y ); - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) { - emit_tempf( - func, - 0, - TGSI_EXEC_TEMP_00000000_I, - TGSI_EXEC_TEMP_00000000_C ); - STORE( func, *inst, 0, 0, CHAN_Z ); - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) { - emit_tempf( - func, - 0, - TEMP_ONE_I, - TEMP_ONE_C ); - STORE( func, *inst, 0, 0, CHAN_W ); - } - break; - - case TGSI_OPCODE_TXB: - emit_tex( func, inst, TRUE, FALSE ); - break; - - case TGSI_OPCODE_NRM: - /* fall-through */ - case TGSI_OPCODE_NRM4: - /* 3 or 4-component normalization */ - { - uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4; - - if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) || - IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) || - IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) || - (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) { - - /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */ - - /* xmm4 = src.x */ - /* xmm0 = src.x * src.x */ - FETCH(func, *inst, 0, 0, CHAN_X); - if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) { - emit_MOV(func, 4, 0); - } - emit_mul(func, 0, 0); - - /* xmm5 = src.y */ - /* xmm0 = xmm0 + src.y * src.y */ - FETCH(func, *inst, 1, 0, CHAN_Y); - if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) { - emit_MOV(func, 5, 1); - } - emit_mul(func, 1, 1); - emit_add(func, 0, 1); - - /* xmm6 = src.z */ - /* xmm0 = xmm0 + src.z * src.z */ - FETCH(func, *inst, 1, 0, CHAN_Z); - if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { - emit_MOV(func, 6, 1); - } - emit_mul(func, 1, 1); - emit_add(func, 0, 1); - - if (dims == 4) { - /* xmm7 = src.w */ - /* xmm0 = xmm0 + src.w * src.w */ - FETCH(func, *inst, 1, 0, CHAN_W); - if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) { - emit_MOV(func, 7, 1); - } - emit_mul(func, 1, 1); - emit_add(func, 0, 1); - } - - /* xmm1 = 1 / sqrt(xmm0) */ - emit_rsqrt(func, 1, 0); - - /* dst.x = xmm1 * src.x */ - if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) { - emit_mul(func, 4, 1); - STORE(func, *inst, 4, 0, CHAN_X); - } - - /* dst.y = xmm1 * src.y */ - if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) { - emit_mul(func, 5, 1); - STORE(func, *inst, 5, 0, CHAN_Y); - } - - /* dst.z = xmm1 * src.z */ - if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { - emit_mul(func, 6, 1); - STORE(func, *inst, 6, 0, CHAN_Z); - } - - /* dst.w = xmm1 * src.w */ - if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) { - emit_mul(func, 7, 1); - STORE(func, *inst, 7, 0, CHAN_W); - } - } - - /* dst0.w = 1.0 */ - if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) { - emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C); - STORE(func, *inst, 0, 0, CHAN_W); - } - } - break; - - case TGSI_OPCODE_DIV: - return 0; - break; - - case TGSI_OPCODE_DP2: - FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */ - FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */ - emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */ - FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */ - FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */ - emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */ - emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */ - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */ - } - break; - - case TGSI_OPCODE_TXL: - return 0; - break; - - case TGSI_OPCODE_TXP: - emit_tex( func, inst, FALSE, TRUE ); - break; - - case TGSI_OPCODE_BRK: - return 0; - break; - - case TGSI_OPCODE_IF: - return 0; - break; - - case TGSI_OPCODE_ELSE: - return 0; - break; - - case TGSI_OPCODE_ENDIF: - return 0; - break; - - case TGSI_OPCODE_PUSHA: - return 0; - break; - - case TGSI_OPCODE_POPA: - return 0; - break; - - case TGSI_OPCODE_CEIL: - return 0; - break; - - case TGSI_OPCODE_I2F: - return 0; - break; - - case TGSI_OPCODE_NOT: - return 0; - break; - - case TGSI_OPCODE_TRUNC: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - emit_f2it( func, 0 ); - emit_i2f( func, 0 ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_SHL: - return 0; - break; - - case TGSI_OPCODE_ISHR: - return 0; - break; - - case TGSI_OPCODE_AND: - return 0; - break; - - case TGSI_OPCODE_OR: - return 0; - break; - - case TGSI_OPCODE_MOD: - return 0; - break; - - case TGSI_OPCODE_XOR: - return 0; - break; - - case TGSI_OPCODE_SAD: - return 0; - break; - - case TGSI_OPCODE_TXF: - return 0; - break; - - case TGSI_OPCODE_TXQ: - return 0; - break; - - case TGSI_OPCODE_CONT: - return 0; - break; - - case TGSI_OPCODE_EMIT: - return 0; - break; - - case TGSI_OPCODE_ENDPRIM: - return 0; - break; - - default: - return 0; - } - - return 1; -} - -static void -emit_declaration( - struct x86_function *func, - struct tgsi_full_declaration *decl ) -{ - if( decl->Declaration.File == TGSI_FILE_INPUT ) { - unsigned first, last, mask; - unsigned i, j; - - first = decl->Range.First; - last = decl->Range.Last; - mask = decl->Declaration.UsageMask; - - for( i = first; i <= last; i++ ) { - for( j = 0; j < NUM_CHANNELS; j++ ) { - if( mask & (1 << j) ) { - switch( decl->Declaration.Interpolate ) { - case TGSI_INTERPOLATE_CONSTANT: - emit_coef_a0( func, 0, i, j ); - emit_inputs( func, 0, i, j ); - break; - - case TGSI_INTERPOLATE_LINEAR: - emit_tempf( func, 0, 0, TGSI_SWIZZLE_X ); - emit_coef_dadx( func, 1, i, j ); - emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y ); - emit_coef_dady( func, 3, i, j ); - emit_mul( func, 0, 1 ); /* x * dadx */ - emit_coef_a0( func, 4, i, j ); - emit_mul( func, 2, 3 ); /* y * dady */ - emit_add( func, 0, 4 ); /* x * dadx + a0 */ - emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */ - emit_inputs( func, 0, i, j ); - break; - - case TGSI_INTERPOLATE_PERSPECTIVE: - emit_tempf( func, 0, 0, TGSI_SWIZZLE_X ); - emit_coef_dadx( func, 1, i, j ); - emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y ); - emit_coef_dady( func, 3, i, j ); - emit_mul( func, 0, 1 ); /* x * dadx */ - emit_tempf( func, 4, 0, TGSI_SWIZZLE_W ); - emit_coef_a0( func, 5, i, j ); - emit_rcp( func, 4, 4 ); /* 1.0 / w */ - emit_mul( func, 2, 3 ); /* y * dady */ - emit_add( func, 0, 5 ); /* x * dadx + a0 */ - emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */ - emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */ - emit_inputs( func, 0, i, j ); - break; - - default: - assert( 0 ); - break; - } - } - } - } - } -} - -static void aos_to_soa( struct x86_function *func, - uint arg_aos, - uint arg_machine, - uint arg_num, - uint arg_stride ) -{ - struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX ); - struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX ); - struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX ); - struct x86_reg stride = x86_make_reg( file_REG32, reg_DX ); - int loop_top, loop_exit_fixup; - - /* Save EBX */ - x86_push( func, x86_make_reg( file_REG32, reg_BX ) ); - - x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) ); - x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) ); - /* FIXME: tgsi_exec_machine::Inputs is a pointer now! */ - x86_lea( func, soa_input, - x86_make_disp( soa_input, - Offset(struct tgsi_exec_machine, Inputs) ) ); - x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) ); - x86_mov( func, stride, x86_fn_arg( func, arg_stride ) ); - - /* while (num_inputs != 0) */ - loop_top = x86_get_label( func ); - x86_cmp_imm( func, num_inputs, 0 ); - loop_exit_fixup = x86_jcc_forward( func, cc_E ); - - { - x86_push( func, aos_input ); - sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) ); - sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) ); - x86_add( func, aos_input, stride ); - sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) ); - sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) ); - x86_add( func, aos_input, stride ); - sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) ); - sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) ); - x86_add( func, aos_input, stride ); - sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) ); - sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) ); - x86_pop( func, aos_input ); - - sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) ); - sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) ); - sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 ); - sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd ); - sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 ); - sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd ); - - sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) ); - sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) ); - sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) ); - sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) ); - - /* Advance to next input */ - x86_lea( func, aos_input, x86_make_disp(aos_input, 16) ); - x86_lea( func, soa_input, x86_make_disp(soa_input, 64) ); - } - /* --num_inputs */ - x86_dec( func, num_inputs ); - x86_jmp( func, loop_top ); - x86_fixup_fwd_jump( func, loop_exit_fixup ); - - /* Restore EBX */ - x86_pop( func, x86_make_reg( file_REG32, reg_BX ) ); -} - -static void soa_to_aos( struct x86_function *func, - uint arg_aos, - uint arg_machine, - uint arg_num, - uint arg_stride ) -{ - struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX ); - struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX ); - struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX ); - struct x86_reg temp = x86_make_reg( file_REG32, reg_DX ); - int inner_loop; - - /* Save EBX */ - x86_push( func, x86_make_reg( file_REG32, reg_BX ) ); - - x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) ); - x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) ); - /* FIXME: tgsi_exec_machine::Ouputs is a pointer now! */ - x86_lea( func, soa_output, - x86_make_disp( soa_output, - Offset(struct tgsi_exec_machine, Outputs) ) ); - x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) ); - - /* do */ - inner_loop = x86_get_label( func ); - { - sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) ); - sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) ); - sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) ); - sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) ); - - sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) ); - sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) ); - sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) ); - sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) ); - sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) ); - sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) ); - - x86_mov( func, temp, x86_fn_arg( func, arg_stride ) ); - x86_push( func, aos_output ); - sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) ); - sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) ); - x86_add( func, aos_output, temp ); - sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) ); - sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) ); - x86_add( func, aos_output, temp ); - sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) ); - sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) ); - x86_add( func, aos_output, temp ); - sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) ); - sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) ); - x86_pop( func, aos_output ); - - /* Advance to next output */ - x86_lea( func, aos_output, x86_make_disp(aos_output, 16) ); - x86_lea( func, soa_output, x86_make_disp(soa_output, 64) ); - } - /* while --num_outputs */ - x86_dec( func, num_outputs ); - x86_jcc( func, cc_NE, inner_loop ); - - /* Restore EBX */ - x86_pop( func, x86_make_reg( file_REG32, reg_BX ) ); -} - - -/** - * Check if the instructions dst register is the same as any src - * register and warn if there's a posible SOA dependency. - */ -static boolean -check_soa_dependencies(const struct tgsi_full_instruction *inst) -{ - uint opcode = inst->Instruction.Opcode; - - /* XXX: we only handle src/dst aliasing in a few opcodes currently. - * Need to use an additional temporay to hold the result in the - * cases where the code is too opaque to fix. - */ - - switch (opcode) { - case TGSI_OPCODE_ADD: - case TGSI_OPCODE_MOV: - case TGSI_OPCODE_MUL: - case TGSI_OPCODE_RCP: - case TGSI_OPCODE_RSQ: - case TGSI_OPCODE_EXP: - case TGSI_OPCODE_LOG: - case TGSI_OPCODE_DP3: - case TGSI_OPCODE_DP4: - case TGSI_OPCODE_DP2A: - case TGSI_OPCODE_EX2: - case TGSI_OPCODE_LG2: - case TGSI_OPCODE_POW: - case TGSI_OPCODE_XPD: - case TGSI_OPCODE_DPH: - case TGSI_OPCODE_COS: - case TGSI_OPCODE_SIN: - case TGSI_OPCODE_TEX: - case TGSI_OPCODE_TXB: - case TGSI_OPCODE_TXP: - case TGSI_OPCODE_NRM: - case TGSI_OPCODE_NRM4: - case TGSI_OPCODE_DP2: - /* OK - these opcodes correctly handle SOA dependencies */ - return TRUE; - default: - if (!tgsi_check_soa_dependencies(inst)) - return TRUE; - - debug_printf("Warning: src/dst aliasing in instruction" - " is not handled:\n"); - debug_printf("Warning: "); - tgsi_dump_instruction(inst, 1); - - return FALSE; - } -} - - -/** - * Translate a TGSI vertex/fragment shader to SSE2 code. - * Slightly different things are done for vertex vs. fragment shaders. - * - * \param tokens the TGSI input shader - * \param func the output SSE code/function - * \param immediates buffer to place immediates, later passed to SSE func - * \param return 1 for success, 0 if translation failed - */ -unsigned -tgsi_emit_sse2( - const struct tgsi_token *tokens, - struct x86_function *func, - float (*immediates)[4], - boolean do_swizzles ) -{ - struct tgsi_parse_context parse; - unsigned ok = 1; - uint num_immediates = 0; - - util_init_math(); - - func->csr = func->store; - - tgsi_parse_init( &parse, tokens ); - - /* Can't just use EDI, EBX without save/restoring them: - */ - x86_push( func, x86_make_reg( file_REG32, reg_BX ) ); - x86_push( func, x86_make_reg( file_REG32, reg_DI ) ); - - /* - * Different function args for vertex/fragment shaders: - */ - if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) { - if (do_swizzles) - aos_to_soa( func, - 4, /* aos_input */ - 1, /* machine */ - 5, /* num_inputs */ - 6 ); /* input_stride */ - } - - x86_mov( - func, - get_machine_base(), - x86_fn_arg( func, 1 ) ); - x86_mov( - func, - get_const_base(), - x86_fn_arg( func, 2 ) ); - x86_mov( - func, - get_immediate_base(), - x86_fn_arg( func, 3 ) ); - - if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) { - x86_mov( - func, - get_coef_base(), - x86_fn_arg( func, 4 ) ); - } - - x86_mov( - func, - get_sampler_base(), - x86_make_disp( get_machine_base(), - Offset( struct tgsi_exec_machine, Samplers ) ) ); - - while( !tgsi_parse_end_of_tokens( &parse ) && ok ) { - tgsi_parse_token( &parse ); - - switch( parse.FullToken.Token.Type ) { - case TGSI_TOKEN_TYPE_DECLARATION: - if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) { - emit_declaration( - func, - &parse.FullToken.FullDeclaration ); - } - break; - - case TGSI_TOKEN_TYPE_INSTRUCTION: - ok = emit_instruction( - func, - &parse.FullToken.FullInstruction ); - - if (!ok) { - uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode; - uint proc = parse.FullHeader.Processor.Processor; - debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n", - opcode, - tgsi_get_opcode_name(opcode), - tgsi_get_processor_name(proc)); - } - - if (ok) - ok = check_soa_dependencies(&parse.FullToken.FullInstruction); - break; - - case TGSI_TOKEN_TYPE_IMMEDIATE: - /* simply copy the immediate values into the next immediates[] slot */ - { - const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1; - uint i; - assert(size <= 4); - assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES); - for( i = 0; i < size; i++ ) { - immediates[num_immediates][i] = - parse.FullToken.FullImmediate.u[i].Float; - } -#if 0 - debug_printf("SSE FS immediate[%d] = %f %f %f %f\n", - num_immediates, - immediates[num_immediates][0], - immediates[num_immediates][1], - immediates[num_immediates][2], - immediates[num_immediates][3]); -#endif - num_immediates++; - } - break; - case TGSI_TOKEN_TYPE_PROPERTY: - /* we just ignore them for now */ - break; - - default: - ok = 0; - assert( 0 ); - } - } - - if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) { - if (do_swizzles) - soa_to_aos( func, - 7, /* aos_output */ - 1, /* machine */ - 8, /* num_outputs */ - 9 ); /* output_stride */ - } - - /* Can't just use EBX, EDI without save/restoring them: - */ - x86_pop( func, x86_make_reg( file_REG32, reg_DI ) ); - x86_pop( func, x86_make_reg( file_REG32, reg_BX ) ); - - emit_ret( func ); - - tgsi_parse_free( &parse ); - - return ok; -} - -#else /* !PIPE_ARCH_X86 */ - -unsigned -tgsi_emit_sse2( - const struct tgsi_token *tokens, - struct x86_function *func, - float (*immediates)[4], - boolean do_swizzles ) -{ - return 0; -} - -#endif /* !PIPE_ARCH_X86 */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/tgsi_sse2.h deleted file mode 100644 index 00aa8b8..0000000 --- a/src/gallium/auxiliary/tgsi/tgsi_sse2.h +++ /dev/null @@ -1,80 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -#ifndef TGSI_SSE2_H -#define TGSI_SSE2_H - -#if defined __cplusplus -extern "C" { -#endif - -#include "pipe/p_compiler.h" - -struct tgsi_exec_machine; -struct tgsi_interp_coef; -struct tgsi_token; -struct x86_function; - -unsigned -tgsi_emit_sse2( - const struct tgsi_token *tokens, - struct x86_function *function, - float (*immediates)[4], - boolean do_swizzles ); - - -/* This is the function prototype generated when do_swizzles is false - * -- effectively for fragment shaders. - */ -typedef void (PIPE_CDECL *tgsi_sse2_fs_function) ( - struct tgsi_exec_machine *machine, /* 1 */ - const float (*constant)[4], /* 2 */ - const float (*immediate)[4], /* 3 */ - const struct tgsi_interp_coef *coef /* 4 */ - ); - - -/* This is the function prototype generated when do_swizzles is true - * -- effectively for vertex shaders. - */ -typedef void (PIPE_CDECL *tgsi_sse2_vs_func) ( - struct tgsi_exec_machine *machine, /* 1 */ - const float (*constant)[4], /* 2 */ - const float (*immediate)[4], /* 3 */ - const float (*aos_input)[4], /* 4 */ - uint num_inputs, /* 5 */ - uint input_stride, /* 6 */ - float (*aos_output)[4], /* 7 */ - uint num_outputs, /* 8 */ - uint output_stride ); /* 9 */ - - -#if defined __cplusplus -} -#endif - -#endif /* TGSI_SSE2_H */ diff --git a/src/gallium/drivers/softpipe/Android.mk b/src/gallium/drivers/softpipe/Android.mk index d198fa5..6a125a5 100644 --- a/src/gallium/drivers/softpipe/Android.mk +++ b/src/gallium/drivers/softpipe/Android.mk @@ -26,7 +26,6 @@ LOCAL_PATH := $(call my-dir) # from Makefile C_SOURCES = \ sp_fs_exec.c \ - sp_fs_sse.c \ sp_clear.c \ sp_fence.c \ sp_flush.c \ diff --git a/src/gallium/drivers/softpipe/Makefile b/src/gallium/drivers/softpipe/Makefile index 9403e6c..27b5d991 100644 --- a/src/gallium/drivers/softpipe/Makefile +++ b/src/gallium/drivers/softpipe/Makefile @@ -5,7 +5,6 @@ LIBNAME = softpipe C_SOURCES = \ sp_fs_exec.c \ - sp_fs_sse.c \ sp_clear.c \ sp_fence.c \ sp_flush.c \ diff --git a/src/gallium/drivers/softpipe/SConscript b/src/gallium/drivers/softpipe/SConscript index ea10e8a..da2c93e 100644 --- a/src/gallium/drivers/softpipe/SConscript +++ b/src/gallium/drivers/softpipe/SConscript @@ -6,7 +6,6 @@ softpipe = env.ConvenienceLibrary( target = 'softpipe', source = [ 'sp_fs_exec.c', - 'sp_fs_sse.c', 'sp_clear.c', 'sp_context.c', 'sp_draw_arrays.c', diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c index c97b033..3a83e58 100644 --- a/src/gallium/drivers/softpipe/sp_context.c +++ b/src/gallium/drivers/softpipe/sp_context.c @@ -235,12 +235,6 @@ softpipe_create_context( struct pipe_screen *screen, util_init_math(); -#ifdef PIPE_ARCH_X86 - softpipe->use_sse = !debug_get_bool_option( "GALLIUM_NOSSE", FALSE ); -#else - softpipe->use_sse = FALSE; -#endif - softpipe->dump_fs = debug_get_bool_option( "SOFTPIPE_DUMP_FS", FALSE ); softpipe->dump_gs = debug_get_bool_option( "SOFTPIPE_DUMP_GS", FALSE ); diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h index d51ce9f..5442aba 100644 --- a/src/gallium/drivers/softpipe/sp_context.h +++ b/src/gallium/drivers/softpipe/sp_context.h @@ -190,7 +190,6 @@ struct softpipe_context { struct softpipe_tex_tile_cache *vertex_tex_cache[PIPE_MAX_VERTEX_SAMPLERS]; struct softpipe_tex_tile_cache *geometry_tex_cache[PIPE_MAX_GEOMETRY_SAMPLERS]; - unsigned use_sse : 1; unsigned dump_fs : 1; unsigned dump_gs : 1; unsigned no_rast : 1; diff --git a/src/gallium/drivers/softpipe/sp_fs.h b/src/gallium/drivers/softpipe/sp_fs.h index d46d7d5..db689b8 100644 --- a/src/gallium/drivers/softpipe/sp_fs.h +++ b/src/gallium/drivers/softpipe/sp_fs.h @@ -36,10 +36,6 @@ struct sp_fragment_shader_variant * softpipe_create_fs_variant_exec(struct softpipe_context *softpipe, const struct pipe_shader_state *templ); -struct sp_fragment_shader_variant * -softpipe_create_fs_variant_sse(struct softpipe_context *softpipe, - const struct pipe_shader_state *templ); - struct tgsi_interp_coef; struct tgsi_exec_vector; diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c deleted file mode 100644 index c873af1..0000000 --- a/src/gallium/drivers/softpipe/sp_fs_sse.c +++ /dev/null @@ -1,248 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -/** - * Execute fragment shader using runtime SSE code generation. - */ - -#include "sp_context.h" -#include "sp_state.h" -#include "sp_fs.h" -#include "sp_quad.h" - -#include "pipe/p_state.h" -#include "pipe/p_defines.h" -#include "util/u_memory.h" -#include "tgsi/tgsi_exec.h" -#include "tgsi/tgsi_sse2.h" - - -#if defined(PIPE_ARCH_X86) - -#include "rtasm/rtasm_x86sse.h" - - - -/** - * Subclass of sp_fragment_shader_variant - */ -struct sp_sse_fragment_shader -{ - struct sp_fragment_shader_variant base; - struct x86_function sse2_program; - tgsi_sse2_fs_function func; - float immediates[TGSI_EXEC_NUM_IMMEDIATES][4]; -}; - - -/** cast wrapper */ -static INLINE struct sp_sse_fragment_shader * -sp_sse_fragment_shader(const struct sp_fragment_shader_variant *base) -{ - return (struct sp_sse_fragment_shader *) base; -} - - -static void -fs_sse_prepare( const struct sp_fragment_shader_variant *base, - struct tgsi_exec_machine *machine, - struct tgsi_sampler **samplers ) -{ - machine->Samplers = samplers; -} - - - -/** - * Compute quad X,Y,Z,W for the four fragments in a quad. - * - * This should really be part of the compiled shader. - */ -static void -setup_pos_vector(const struct tgsi_interp_coef *coef, - float x, float y, - struct tgsi_exec_vector *quadpos) -{ - uint chan; - /* do X */ - quadpos->xyzw[0].f[0] = x; - quadpos->xyzw[0].f[1] = x + 1; - quadpos->xyzw[0].f[2] = x; - quadpos->xyzw[0].f[3] = x + 1; - - /* do Y */ - quadpos->xyzw[1].f[0] = y; - quadpos->xyzw[1].f[1] = y; - quadpos->xyzw[1].f[2] = y + 1; - quadpos->xyzw[1].f[3] = y + 1; - - /* do Z and W for all fragments in the quad */ - for (chan = 2; chan < 4; chan++) { - const float dadx = coef->dadx[chan]; - const float dady = coef->dady[chan]; - const float a0 = coef->a0[chan] + dadx * x + dady * y; - quadpos->xyzw[chan].f[0] = a0; - quadpos->xyzw[chan].f[1] = a0 + dadx; - quadpos->xyzw[chan].f[2] = a0 + dady; - quadpos->xyzw[chan].f[3] = a0 + dadx + dady; - } -} - - -/* TODO: codegenerate the whole run function, skip this wrapper. - * TODO: break dependency on tgsi_exec_machine struct - * TODO: push Position calculation into the generated shader - * TODO: process >1 quad at a time - */ -static unsigned -fs_sse_run( const struct sp_fragment_shader_variant *base, - struct tgsi_exec_machine *machine, - struct quad_header *quad ) -{ - struct sp_sse_fragment_shader *shader = sp_sse_fragment_shader(base); - - /* Compute X, Y, Z, W vals for this quad -- place in temp[0] for now */ - setup_pos_vector(quad->posCoef, - (float)quad->input.x0, (float)quad->input.y0, - machine->Temps); - - /* init kill mask */ - tgsi_set_kill_mask(machine, 0x0); - tgsi_set_exec_mask(machine, 1, 1, 1, 1); - - shader->func( machine, - (const float (*)[4])machine->Consts[0], - (const float (*)[4])shader->immediates, - machine->InterpCoefs - /*, &machine->QuadPos*/ - ); - - quad->inout.mask &= ~(machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0]); - if (quad->inout.mask == 0) - return FALSE; - - /* store outputs */ - { - const ubyte *sem_name = base->info.output_semantic_name; - const ubyte *sem_index = base->info.output_semantic_index; - const uint n = base->info.num_outputs; - uint i; - for (i = 0; i < n; i++) { - switch (sem_name[i]) { - case TGSI_SEMANTIC_COLOR: - { - uint cbuf = sem_index[i]; - - assert(sizeof(quad->output.color[cbuf]) == - sizeof(machine->Outputs[i])); - - /* copy float[4][4] result */ - memcpy(quad->output.color[cbuf], - &machine->Outputs[i], - sizeof(quad->output.color[0]) ); - } - break; - case TGSI_SEMANTIC_POSITION: - { - uint j; - for (j = 0; j < 4; j++) - quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j]; - } - break; - case TGSI_SEMANTIC_STENCIL: - { - uint j; - for (j = 0; j < 4; j++) - quad->output.stencil[j] = machine->Outputs[i].xyzw[1].f[j]; - } - break; - } - } - } - - return TRUE; -} - - -static void -fs_sse_delete( struct sp_fragment_shader_variant *base ) -{ - struct sp_sse_fragment_shader *shader = sp_sse_fragment_shader(base); - - x86_release_func( &shader->sse2_program ); - FREE(shader); -} - - -struct sp_fragment_shader_variant * -softpipe_create_fs_variant_sse(struct softpipe_context *softpipe, - const struct pipe_shader_state *templ) -{ - struct sp_sse_fragment_shader *shader; - - if (!softpipe->use_sse) - return NULL; - - shader = CALLOC_STRUCT(sp_sse_fragment_shader); - if (!shader) - return NULL; - - x86_init_func( &shader->sse2_program ); - - if (!tgsi_emit_sse2( templ->tokens, &shader->sse2_program, - shader->immediates, FALSE )) { - FREE(shader); - return NULL; - } - - shader->func = (tgsi_sse2_fs_function) x86_get_func( &shader->sse2_program ); - if (!shader->func) { - x86_release_func( &shader->sse2_program ); - FREE(shader); - return NULL; - } - - shader->base.prepare = fs_sse_prepare; - shader->base.run = fs_sse_run; - shader->base.delete = fs_sse_delete; - - return &shader->base; -} - - -#else - -/* Maybe put this variant in the header file. - */ -struct sp_fragment_shader_variant * -softpipe_create_fs_variant_sse(struct softpipe_context *softpipe, - const struct pipe_shader_state *templ) -{ - return NULL; -} - -#endif diff --git a/src/gallium/drivers/softpipe/sp_state_shader.c b/src/gallium/drivers/softpipe/sp_state_shader.c index 612dcb3..6acb57b 100644 --- a/src/gallium/drivers/softpipe/sp_state_shader.c +++ b/src/gallium/drivers/softpipe/sp_state_shader.c @@ -65,10 +65,7 @@ create_fs_variant(struct softpipe_context *softpipe, #endif /* codegen, create variant object */ - var = softpipe_create_fs_variant_sse(softpipe, curfs); - if (!var) { - var = softpipe_create_fs_variant_exec(softpipe, curfs); - } + var = softpipe_create_fs_variant_exec(softpipe, curfs); if (var) { var->key = *key; -- 2.7.4