From 615cdd3a535bb71754baa8b37e79b85af01854dd Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Mon, 21 Apr 2008 12:39:59 +0100 Subject: [PATCH] tgsi: use new float math funcs, drop local disassembly code --- src/gallium/auxiliary/tgsi/exec/tgsi_exec.c | 88 ++--- src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c | 591 +++++++--------------------- 2 files changed, 178 insertions(+), 501 deletions(-) diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c index 78e7dec..29e104b 100644 --- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c +++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c @@ -287,10 +287,10 @@ micro_abs( union tgsi_exec_channel *dst, const union tgsi_exec_channel *src ) { - dst->f[0] = (float) fabs( (double) src->f[0] ); - dst->f[1] = (float) fabs( (double) src->f[1] ); - dst->f[2] = (float) fabs( (double) src->f[2] ); - dst->f[3] = (float) fabs( (double) src->f[3] ); + dst->f[0] = fabsf( src->f[0] ); + dst->f[1] = fabsf( src->f[1] ); + dst->f[2] = fabsf( src->f[2] ); + dst->f[3] = fabsf( src->f[3] ); } static void @@ -334,10 +334,10 @@ micro_ceil( union tgsi_exec_channel *dst, const union tgsi_exec_channel *src ) { - dst->f[0] = (float) ceil( (double) src->f[0] ); - dst->f[1] = (float) ceil( (double) src->f[1] ); - dst->f[2] = (float) ceil( (double) src->f[2] ); - dst->f[3] = (float) ceil( (double) src->f[3] ); + dst->f[0] = ceilf( src->f[0] ); + dst->f[1] = ceilf( src->f[1] ); + dst->f[2] = ceilf( src->f[2] ); + dst->f[3] = ceilf( src->f[3] ); } static void @@ -345,10 +345,10 @@ micro_cos( union tgsi_exec_channel *dst, const union tgsi_exec_channel *src ) { - dst->f[0] = (float) cos( (double) src->f[0] ); - dst->f[1] = (float) cos( (double) src->f[1] ); - dst->f[2] = (float) cos( (double) src->f[2] ); - dst->f[3] = (float) cos( (double) src->f[3] ); + dst->f[0] = cosf( src->f[0] ); + dst->f[1] = cosf( src->f[1] ); + dst->f[2] = cosf( src->f[2] ); + dst->f[3] = cosf( src->f[3] ); } static void @@ -430,10 +430,10 
@@ micro_exp2( union tgsi_exec_channel *dst, const union tgsi_exec_channel *src) { - dst->f[0] = (float) pow( 2.0, (double) src->f[0] ); - dst->f[1] = (float) pow( 2.0, (double) src->f[1] ); - dst->f[2] = (float) pow( 2.0, (double) src->f[2] ); - dst->f[3] = (float) pow( 2.0, (double) src->f[3] ); + dst->f[0] = powf( 2.0f, src->f[0] ); + dst->f[1] = powf( 2.0f, src->f[1] ); + dst->f[2] = powf( 2.0f, src->f[2] ); + dst->f[3] = powf( 2.0f, src->f[3] ); } static void @@ -463,10 +463,10 @@ micro_flr( union tgsi_exec_channel *dst, const union tgsi_exec_channel *src ) { - dst->f[0] = (float) floor( (double) src->f[0] ); - dst->f[1] = (float) floor( (double) src->f[1] ); - dst->f[2] = (float) floor( (double) src->f[2] ); - dst->f[3] = (float) floor( (double) src->f[3] ); + dst->f[0] = floorf( src->f[0] ); + dst->f[1] = floorf( src->f[1] ); + dst->f[2] = floorf( src->f[2] ); + dst->f[3] = floorf( src->f[3] ); } static void @@ -474,10 +474,10 @@ micro_frc( union tgsi_exec_channel *dst, const union tgsi_exec_channel *src ) { - dst->f[0] = src->f[0] - (float) floor( (double) src->f[0] ); - dst->f[1] = src->f[1] - (float) floor( (double) src->f[1] ); - dst->f[2] = src->f[2] - (float) floor( (double) src->f[2] ); - dst->f[3] = src->f[3] - (float) floor( (double) src->f[3] ); + dst->f[0] = src->f[0] - floorf( src->f[0] ); + dst->f[1] = src->f[1] - floorf( src->f[1] ); + dst->f[2] = src->f[2] - floorf( src->f[2] ); + dst->f[3] = src->f[3] - floorf( src->f[3] ); } static void @@ -510,10 +510,10 @@ micro_lg2( union tgsi_exec_channel *dst, const union tgsi_exec_channel *src ) { - dst->f[0] = (float) log( (double) src->f[0] ) * 1.442695f; - dst->f[1] = (float) log( (double) src->f[1] ) * 1.442695f; - dst->f[2] = (float) log( (double) src->f[2] ) * 1.442695f; - dst->f[3] = (float) log( (double) src->f[3] ) * 1.442695f; + dst->f[0] = logf( src->f[0] ) * 1.442695f; + dst->f[1] = logf( src->f[1] ) * 1.442695f; + dst->f[2] = logf( src->f[2] ) * 1.442695f; + dst->f[3] = logf( src->f[3] ) * 
1.442695f; } static void @@ -764,10 +764,10 @@ micro_pow( const union tgsi_exec_channel *src0, const union tgsi_exec_channel *src1 ) { - dst->f[0] = (float) pow( (double) src0->f[0], (double) src1->f[0] ); - dst->f[1] = (float) pow( (double) src0->f[1], (double) src1->f[1] ); - dst->f[2] = (float) pow( (double) src0->f[2], (double) src1->f[2] ); - dst->f[3] = (float) pow( (double) src0->f[3], (double) src1->f[3] ); + dst->f[0] = powf( src0->f[0], src1->f[0] ); + dst->f[1] = powf( src0->f[1], src1->f[1] ); + dst->f[2] = powf( src0->f[2], src1->f[2] ); + dst->f[3] = powf( src0->f[3], src1->f[3] ); } static void @@ -775,10 +775,10 @@ micro_rnd( union tgsi_exec_channel *dst, const union tgsi_exec_channel *src ) { - dst->f[0] = (float) floor( (double) (src->f[0] + 0.5f) ); - dst->f[1] = (float) floor( (double) (src->f[1] + 0.5f) ); - dst->f[2] = (float) floor( (double) (src->f[2] + 0.5f) ); - dst->f[3] = (float) floor( (double) (src->f[3] + 0.5f) ); + dst->f[0] = floorf( src->f[0] + 0.5f ); + dst->f[1] = floorf( src->f[1] + 0.5f ); + dst->f[2] = floorf( src->f[2] + 0.5f ); + dst->f[3] = floorf( src->f[3] + 0.5f ); } static void @@ -833,20 +833,20 @@ micro_sin( union tgsi_exec_channel *dst, const union tgsi_exec_channel *src ) { - dst->f[0] = (float) sin( (double) src->f[0] ); - dst->f[1] = (float) sin( (double) src->f[1] ); - dst->f[2] = (float) sin( (double) src->f[2] ); - dst->f[3] = (float) sin( (double) src->f[3] ); + dst->f[0] = sinf( src->f[0] ); + dst->f[1] = sinf( src->f[1] ); + dst->f[2] = sinf( src->f[2] ); + dst->f[3] = sinf( src->f[3] ); } static void micro_sqrt( union tgsi_exec_channel *dst, const union tgsi_exec_channel *src ) { - dst->f[0] = (float) sqrt( (double) src->f[0] ); - dst->f[1] = (float) sqrt( (double) src->f[1] ); - dst->f[2] = (float) sqrt( (double) src->f[2] ); - dst->f[3] = (float) sqrt( (double) src->f[3] ); + dst->f[0] = sqrtf( src->f[0] ); + dst->f[1] = sqrtf( src->f[1] ); + dst->f[2] = sqrtf( src->f[2] ); + dst->f[3] = sqrtf( src->f[3] 
); } static void diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c index d47935e..c3295a2 100755 --- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c +++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c @@ -38,113 +38,6 @@ #define HIGH_PRECISION 1 /* for 1/sqrt() */ -#define DUMP_SSE 0 - -#if DUMP_SSE - -static void -_print_reg( - struct x86_reg reg ) -{ - if (reg.mod != mod_REG) - debug_printf( "[" ); - - switch( reg.file ) { - case file_REG32: - switch( reg.idx ) { - case reg_AX: - debug_printf( "EAX" ); - break; - case reg_CX: - debug_printf( "ECX" ); - break; - case reg_DX: - debug_printf( "EDX" ); - break; - case reg_BX: - debug_printf( "EBX" ); - break; - case reg_SP: - debug_printf( "ESP" ); - break; - case reg_BP: - debug_printf( "EBP" ); - break; - case reg_SI: - debug_printf( "ESI" ); - break; - case reg_DI: - debug_printf( "EDI" ); - break; - } - break; - case file_MMX: - assert( 0 ); - break; - case file_XMM: - debug_printf( "XMM%u", reg.idx ); - break; - case file_x87: - assert( 0 ); - break; - } - - if (reg.mod == mod_DISP8 || - reg.mod == mod_DISP32) - debug_printf("+%d", reg.disp); - - if (reg.mod != mod_REG) - debug_printf( "]" ); -} - -static void -_fill( - const char *op ) -{ - unsigned count = 10 - strlen( op ); - - while( count-- ) { - debug_printf( " " ); - } -} - -#define DUMP_START() debug_printf( "\nsse-dump start ----------------" ) -#define DUMP_END() debug_printf( "\nsse-dump end ----------------\n" ) -#define DUMP( OP ) debug_printf( "\n%s", OP ) -#define DUMP_I( OP, I ) do {\ - debug_printf( "\n%s", OP );\ - _fill( OP );\ - debug_printf( "%u", I ); } while( 0 ) -#define DUMP_R( OP, R0 ) do {\ - debug_printf( "\n%s", OP );\ - _fill( OP );\ - _print_reg( R0 ); } while( 0 ) -#define DUMP_RR( OP, R0, R1 ) do {\ - debug_printf( "\n%s", OP );\ - _fill( OP );\ - _print_reg( R0 );\ - debug_printf( ", " );\ - _print_reg( R1 ); } while( 0 ) -#define DUMP_RRI( OP, R0, R1, I ) do {\ - debug_printf( 
"\n%s", OP );\ - _fill( OP );\ - _print_reg( R0 );\ - debug_printf( ", " );\ - _print_reg( R1 );\ - debug_printf( ", " );\ - debug_printf( "%u", I ); } while( 0 ) - -#else - -#define DUMP_START() -#define DUMP_END() -#define DUMP( OP ) -#define DUMP_I( OP, I ) -#define DUMP_R( OP, R0 ) -#define DUMP_RR( OP, R0, R1 ) -#define DUMP_RRI( OP, R0, R1, I ) - -#endif #define FOR_EACH_CHANNEL( CHAN )\ for( CHAN = 0; CHAN < 4; CHAN++ ) @@ -310,200 +203,6 @@ get_coef( ((vec * 3 + member) * 4 + chan) * 4 ); } -/** - * X86 rtasm wrappers. - */ - -static void -emit_addps( - struct x86_function *func, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( "ADDPS", dst, src ); - sse_addps( func, dst, src ); -} - -static void -emit_andnps( - struct x86_function *func, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( "ANDNPS", dst, src ); - sse_andnps( func, dst, src ); -} - -static void -emit_andps( - struct x86_function *func, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( "ANDPS", dst, src ); - sse_andps( func, dst, src ); -} - -static void -emit_call( - struct x86_function *func, - void (* addr)() ) -{ - struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); - - DUMP_I( "CALL", addr ); - x86_mov_reg_imm( func, ecx, (unsigned long) addr ); - x86_call( func, ecx ); -} - -static void -emit_cmpps( - struct x86_function *func, - struct x86_reg dst, - struct x86_reg src, - enum sse_cc cc ) -{ - DUMP_RRI( "CMPPS", dst, src, cc ); - sse_cmpps( func, dst, src, cc ); -} - -static void -emit_cvttps2dq( - struct x86_function *func, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( "CVTTPS2DQ", dst, src ); - sse2_cvttps2dq( func, dst, src ); -} - -static void -emit_maxps( - struct x86_function *func, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( "MAXPS", dst, src ); - sse_maxps( func, dst, src ); -} - -static void -emit_minps( - struct x86_function *func, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( "MINPS", dst, src ); - 
sse_minps( func, dst, src ); -} - -static void -emit_mov( - struct x86_function *func, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( "MOV", dst, src ); - x86_mov( func, dst, src ); -} - -static void -emit_movaps( - struct x86_function *func, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( "MOVAPS", dst, src ); - sse_movaps( func, dst, src ); -} - -static void -emit_movss( - struct x86_function *func, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( "MOVSS", dst, src ); - sse_movss( func, dst, src ); -} - -static void -emit_movups( - struct x86_function *func, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( "MOVUPS", dst, src ); - sse_movups( func, dst, src ); -} - -static void -emit_mulps( - struct x86_function *func, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( "MULPS", dst, src ); - sse_mulps( func, dst, src ); -} - -static void -emit_or( - struct x86_function *func, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( "OR", dst, src ); - x86_or( func, dst, src ); -} - -static void -emit_orps( - struct x86_function *func, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( "ORPS", dst, src ); - sse_orps( func, dst, src ); -} - -static void -emit_pmovmskb( - struct x86_function *func, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( "PMOVMSKB", dst, src ); - sse_pmovmskb( func, dst, src ); -} - -static void -emit_pop( - struct x86_function *func, - struct x86_reg dst ) -{ - DUMP_R( "POP", dst ); - x86_pop( func, dst ); -} - -static void -emit_push( - struct x86_function *func, - struct x86_reg dst ) -{ - DUMP_R( "PUSH", dst ); - x86_push( func, dst ); -} - -static void -emit_rcpps( - struct x86_function *func, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( "RCPPS", dst, src ); - sse2_rcpps( func, dst, src ); -} #ifdef WIN32 static void @@ -511,7 +210,6 @@ emit_retw( struct x86_function *func, unsigned size ) { - DUMP_I( "RET", size ); x86_retw( func, size ); } #else @@ 
-519,51 +217,10 @@ static void emit_ret( struct x86_function *func ) { - DUMP( "RET" ); x86_ret( func ); } #endif -static void -emit_rsqrtps( - struct x86_function *func, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( "RSQRTPS", dst, src ); - sse_rsqrtps( func, dst, src ); -} - -static void -emit_shufps( - struct x86_function *func, - struct x86_reg dst, - struct x86_reg src, - unsigned char shuf ) -{ - DUMP_RRI( "SHUFPS", dst, src, shuf ); - sse_shufps( func, dst, src, shuf ); -} - -static void -emit_subps( - struct x86_function *func, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( "SUBPS", dst, src ); - sse_subps( func, dst, src ); -} - -static void -emit_xorps( - struct x86_function *func, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( "XORPS", dst, src ); - sse_xorps( func, dst, src ); -} /** * Data fetch helpers. @@ -582,11 +239,11 @@ emit_const( unsigned vec, unsigned chan ) { - emit_movss( + sse_movss( func, make_xmm( xmm ), get_const( vec, chan ) ); - emit_shufps( + sse_shufps( func, make_xmm( xmm ), make_xmm( xmm ), @@ -600,11 +257,11 @@ emit_immediate( unsigned vec, unsigned chan ) { - emit_movss( + sse_movss( func, make_xmm( xmm ), get_immediate( vec, chan ) ); - emit_shufps( + sse_shufps( func, make_xmm( xmm ), make_xmm( xmm ), @@ -625,7 +282,7 @@ emit_inputf( unsigned vec, unsigned chan ) { - emit_movups( + sse_movups( func, make_xmm( xmm ), get_input( vec, chan ) ); @@ -644,7 +301,7 @@ emit_output( unsigned vec, unsigned chan ) { - emit_movups( + sse_movups( func, get_output( vec, chan ), make_xmm( xmm ) ); @@ -663,7 +320,7 @@ emit_tempf( unsigned vec, unsigned chan ) { - emit_movaps( + sse_movaps( func, make_xmm( xmm ), get_temp( vec, chan ) ); @@ -684,11 +341,11 @@ emit_coef( unsigned chan, unsigned member ) { - emit_movss( + sse_movss( func, make_xmm( xmm ), get_coef( vec, chan, member ) ); - emit_shufps( + sse_shufps( func, make_xmm( xmm ), make_xmm( xmm ), @@ -706,7 +363,7 @@ emit_inputs( unsigned vec, unsigned 
chan ) { - emit_movups( + sse_movups( func, get_input( vec, chan ), make_xmm( xmm ) ); @@ -719,7 +376,7 @@ emit_temps( unsigned vec, unsigned chan ) { - emit_movaps( + sse_movaps( func, get_temp( vec, chan ), make_xmm( xmm ) ); @@ -796,39 +453,39 @@ static void emit_push_gp( struct x86_function *func ) { - emit_push( + x86_push( func, get_const_base() ); - emit_push( + x86_push( func, get_input_base() ); - emit_push( + x86_push( func, get_output_base() ); /* It is important on non-win32 platforms that temp base is pushed last. */ - emit_push( + x86_push( func, get_temp_base() ); } static void -emit_pop_gp( +x86_pop_gp( struct x86_function *func ) { /* Restore GP registers in a reverse order. */ - emit_pop( + x86_pop( func, get_temp_base() ); - emit_pop( + x86_pop( func, get_output_base() ); - emit_pop( + x86_pop( func, get_input_base() ); - emit_pop( + x86_pop( func, get_const_base() ); } @@ -839,7 +496,7 @@ emit_func_call_dst( unsigned xmm_dst, void (*code)() ) { - emit_movaps( + sse_movaps( func, get_temp( TEMP_R0, 0 ), make_xmm( xmm_dst ) ); @@ -848,19 +505,22 @@ emit_func_call_dst( func ); #ifdef WIN32 - emit_push( + x86_push( func, get_temp( TEMP_R0, 0 ) ); #endif - emit_call( - func, - code ); + { + struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); + + x86_mov_reg_imm( func, ecx, (unsigned long) code ); + x86_call( func, ecx ); + } - emit_pop_gp( + x86_pop_gp( func ); - emit_movaps( + sse_movaps( func, make_xmm( xmm_dst ), get_temp( TEMP_R0, 0 ) ); @@ -873,7 +533,7 @@ emit_func_call_dst_src( unsigned xmm_src, void (*code)() ) { - emit_movaps( + sse_movaps( func, get_temp( TEMP_R0, 1 ), make_xmm( xmm_src ) ); @@ -893,7 +553,7 @@ emit_abs( struct x86_function *func, unsigned xmm ) { - emit_andps( + sse_andps( func, make_xmm( xmm ), get_temp( @@ -907,7 +567,7 @@ emit_add( unsigned xmm_dst, unsigned xmm_src ) { - emit_addps( + sse_addps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) ); @@ -918,17 +578,15 @@ cos4f( float *store ) { #ifdef WIN32 - store[0] 
= (float) cos( (double) store[0] ); - store[1] = (float) cos( (double) store[1] ); - store[2] = (float) cos( (double) store[2] ); - store[3] = (float) cos( (double) store[3] ); + const unsigned X = 0; #else const unsigned X = TEMP_R0 * 16; +#endif + store[X + 0] = cosf( store[X + 0] ); store[X + 1] = cosf( store[X + 1] ); store[X + 2] = cosf( store[X + 2] ); store[X + 3] = cosf( store[X + 3] ); -#endif } static void @@ -947,17 +605,14 @@ ex24f( float *store ) { #ifdef WIN32 - store[0] = (float) pow( 2.0, (double) store[0] ); - store[1] = (float) pow( 2.0, (double) store[1] ); - store[2] = (float) pow( 2.0, (double) store[2] ); - store[3] = (float) pow( 2.0, (double) store[3] ); + const unsigned X = 0; #else const unsigned X = TEMP_R0 * 16; +#endif store[X + 0] = powf( 2.0f, store[X + 0] ); store[X + 1] = powf( 2.0f, store[X + 1] ); store[X + 2] = powf( 2.0f, store[X + 2] ); store[X + 3] = powf( 2.0f, store[X + 3] ); -#endif } static void @@ -976,7 +631,7 @@ emit_f2it( struct x86_function *func, unsigned xmm ) { - emit_cvttps2dq( + sse2_cvttps2dq( func, make_xmm( xmm ), make_xmm( xmm ) ); @@ -991,10 +646,10 @@ flr4f( #else const unsigned X = TEMP_R0 * 16; #endif - store[X + 0] = (float) floor( (double) store[X + 0] ); - store[X + 1] = (float) floor( (double) store[X + 1] ); - store[X + 2] = (float) floor( (double) store[X + 2] ); - store[X + 3] = (float) floor( (double) store[X + 3] ); + store[X + 0] = floorf( store[X + 0] ); + store[X + 1] = floorf( store[X + 1] ); + store[X + 2] = floorf( store[X + 2] ); + store[X + 3] = floorf( store[X + 3] ); } static void @@ -1017,10 +672,10 @@ frc4f( #else const unsigned X = TEMP_R0 * 16; #endif - store[X + 0] -= (float) floor( (double) store[X + 0] ); - store[X + 1] -= (float) floor( (double) store[X + 1] ); - store[X + 2] -= (float) floor( (double) store[X + 2] ); - store[X + 3] -= (float) floor( (double) store[X + 3] ); + store[X + 0] -= floorf( store[X + 0] ); + store[X + 1] -= floorf( store[X + 1] ); + store[X + 2] -= 
floorf( store[X + 2] ); + store[X + 3] -= floorf( store[X + 3] ); } static void @@ -1066,7 +721,7 @@ emit_MOV( unsigned xmm_dst, unsigned xmm_src ) { - emit_movups( + sse_movups( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) ); @@ -1077,7 +732,7 @@ emit_mul (struct x86_function *func, unsigned xmm_dst, unsigned xmm_src) { - emit_mulps( + sse_mulps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) ); @@ -1088,7 +743,7 @@ emit_neg( struct x86_function *func, unsigned xmm ) { - emit_xorps( + sse_xorps( func, make_xmm( xmm ), get_temp( @@ -1101,17 +756,14 @@ pow4f( float *store ) { #ifdef WIN32 - store[0] = (float) pow( (double) store[0], (double) store[4] ); - store[1] = (float) pow( (double) store[1], (double) store[5] ); - store[2] = (float) pow( (double) store[2], (double) store[6] ); - store[3] = (float) pow( (double) store[3], (double) store[7] ); + const unsigned X = 0; #else const unsigned X = TEMP_R0 * 16; +#endif store[X + 0] = powf( store[X + 0], store[X + 4] ); store[X + 1] = powf( store[X + 1], store[X + 5] ); store[X + 2] = powf( store[X + 2], store[X + 6] ); store[X + 3] = powf( store[X + 3], store[X + 7] ); -#endif } static void @@ -1133,7 +785,11 @@ emit_rcp ( unsigned xmm_dst, unsigned xmm_src ) { - emit_rcpps( + /* On Intel CPUs at least, this is only accurate to 12 bits -- not + * good enough. Need to either emit a proper divide or use the + * iterative technique described below in emit_rsqrt(). 
+ */ + sse2_rcpps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) ); @@ -1145,17 +801,14 @@ rsqrt4f( float *store ) { #ifdef WIN32 - store[0] = 1.0F / (float) sqrt( (double) store[0] ); - store[1] = 1.0F / (float) sqrt( (double) store[1] ); - store[2] = 1.0F / (float) sqrt( (double) store[2] ); - store[3] = 1.0F / (float) sqrt( (double) store[3] ); + const unsigned X = 0; #else const unsigned X = TEMP_R0 * 16; - store[X + 0] = 1.0F / sqrt( store[X + 0] ); - store[X + 1] = 1.0F / sqrt( store[X + 1] ); - store[X + 2] = 1.0F / sqrt( store[X + 2] ); - store[X + 3] = 1.0F / sqrt( store[X + 3] ); #endif + store[X + 0] = 1.0F / sqrtf( store[X + 0] ); + store[X + 1] = 1.0F / sqrtf( store[X + 1] ); + store[X + 2] = 1.0F / sqrtf( store[X + 2] ); + store[X + 3] = 1.0F / sqrtf( store[X + 3] ); } #endif @@ -1166,12 +819,41 @@ emit_rsqrt( unsigned xmm_src ) { #if HIGH_PRECISION +#if 1 emit_func_call_dst_src( func, xmm_dst, xmm_src, rsqrt4f ); #else + /* Although rsqrtps() and rcpps() are low precision on some/all SSE + * implementations, it is possible to improve their precision at + * fairly low cost, using a Newton-Raphson step, as below: + * + * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a) + * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)] + * + * See: http://softwarecommunity.intel.com/articles/eng/1818.htm + */ + /* This is some code that would do the above for a scalar 'a'. We + * obviously are interested in a vector version: + * + * movss xmm3, a; + * movss xmm1, half; + * movss xmm2, three; + * rsqrtss xmm0, xmm3; + * mulss xmm3, xmm0; + * mulss xmm1, xmm0; + * mulss xmm3, xmm0; + * subss xmm2, xmm3; + * mulss xmm1, xmm2; + * movss x, xmm1; + */ +#endif +#else + /* On Intel CPUs at least, this is only accurate to 12 bits -- not + * good enough. 
+ */ emit_rsqrtps( func, make_xmm( xmm_dst ), @@ -1184,7 +866,7 @@ emit_setsign( struct x86_function *func, unsigned xmm ) { - emit_orps( + sse_orps( func, make_xmm( xmm ), get_temp( @@ -1197,17 +879,14 @@ sin4f( float *store ) { #ifdef WIN32 - store[0] = (float) sin( (double) store[0] ); - store[1] = (float) sin( (double) store[1] ); - store[2] = (float) sin( (double) store[2] ); - store[3] = (float) sin( (double) store[3] ); + const unsigned X = 0; #else const unsigned X = TEMP_R0 * 16; +#endif store[X + 0] = sinf( store[X + 0] ); store[X + 1] = sinf( store[X + 1] ); store[X + 2] = sinf( store[X + 2] ); store[X + 3] = sinf( store[X + 3] ); -#endif } static void @@ -1226,7 +905,7 @@ emit_sub( unsigned xmm_dst, unsigned xmm_src ) { - emit_subps( + sse_subps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) ); @@ -1435,16 +1114,16 @@ emit_kil( } } - emit_push( + x86_push( func, x86_make_reg( file_REG32, reg_AX ) ); - emit_push( + x86_push( func, x86_make_reg( file_REG32, reg_DX ) ); FOR_EACH_CHANNEL( chan_index ) { if( uniquemask & (1 << chan_index) ) { - emit_cmpps( + sse_cmpps( func, make_xmm( registers[chan_index] ), get_temp( @@ -1453,17 +1132,17 @@ emit_kil( cc_LessThan ); if( chan_index == firstchan ) { - emit_pmovmskb( + sse_pmovmskb( func, x86_make_reg( file_REG32, reg_AX ), make_xmm( registers[chan_index] ) ); } else { - emit_pmovmskb( + sse_pmovmskb( func, x86_make_reg( file_REG32, reg_DX ), make_xmm( registers[chan_index] ) ); - emit_or( + x86_or( func, x86_make_reg( file_REG32, reg_AX ), x86_make_reg( file_REG32, reg_DX ) ); @@ -1471,17 +1150,17 @@ emit_kil( } } - emit_or( + x86_or( func, get_temp( TGSI_EXEC_TEMP_KILMASK_I, TGSI_EXEC_TEMP_KILMASK_C ), x86_make_reg( file_REG32, reg_AX ) ); - emit_pop( + x86_pop( func, x86_make_reg( file_REG32, reg_DX ) ); - emit_pop( + x86_pop( func, x86_make_reg( file_REG32, reg_AX ) ); } @@ -1497,12 +1176,12 @@ emit_setcc( FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( func, *inst, 0, 0, chan_index ); FETCH( 
func, *inst, 1, 1, chan_index ); - emit_cmpps( + sse_cmpps( func, make_xmm( 0 ), make_xmm( 1 ), cc ); - emit_andps( + sse_andps( func, make_xmm( 0 ), get_temp( @@ -1523,22 +1202,22 @@ emit_cmp( FETCH( func, *inst, 0, 0, chan_index ); FETCH( func, *inst, 1, 1, chan_index ); FETCH( func, *inst, 2, 2, chan_index ); - emit_cmpps( + sse_cmpps( func, make_xmm( 0 ), get_temp( TGSI_EXEC_TEMP_00000000_I, TGSI_EXEC_TEMP_00000000_C ), cc_LessThan ); - emit_andps( + sse_andps( func, make_xmm( 1 ), make_xmm( 0 ) ); - emit_andnps( + sse_andnps( func, make_xmm( 0 ), make_xmm( 2 ) ); - emit_orps( + sse_orps( func, make_xmm( 0 ), make_xmm( 1 ) ); @@ -1589,7 +1268,7 @@ emit_instruction( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { FETCH( func, *inst, 0, 0, CHAN_X ); - emit_maxps( + sse_maxps( func, make_xmm( 0 ), get_temp( @@ -1601,7 +1280,7 @@ emit_instruction( /* XMM[1] = SrcReg[0].yyyy */ FETCH( func, *inst, 1, 0, CHAN_Y ); /* XMM[1] = max(XMM[1], 0) */ - emit_maxps( + sse_maxps( func, make_xmm( 1 ), get_temp( @@ -1610,14 +1289,14 @@ emit_instruction( /* XMM[2] = SrcReg[0].wwww */ FETCH( func, *inst, 2, 0, CHAN_W ); /* XMM[2] = min(XMM[2], 128.0) */ - emit_minps( + sse_minps( func, make_xmm( 2 ), get_temp( TGSI_EXEC_TEMP_128_I, TGSI_EXEC_TEMP_128_C ) ); /* XMM[2] = max(XMM[2], -128.0) */ - emit_maxps( + sse_maxps( func, make_xmm( 2 ), get_temp( @@ -1625,16 +1304,16 @@ emit_instruction( TGSI_EXEC_TEMP_MINUS_128_C ) ); emit_pow( func, 1, 2 ); FETCH( func, *inst, 0, 0, CHAN_X ); - emit_xorps( + sse_xorps( func, make_xmm( 2 ), make_xmm( 2 ) ); - emit_cmpps( + sse_cmpps( func, make_xmm( 2 ), make_xmm( 0 ), cc_LessThanEqual ); - emit_andps( + sse_andps( func, make_xmm( 2 ), make_xmm( 1 ) ); @@ -1756,7 +1435,7 @@ emit_instruction( FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( func, *inst, 0, 0, chan_index ); FETCH( func, *inst, 1, 1, chan_index ); - emit_minps( + sse_minps( func, make_xmm( 0 ), make_xmm( 1 ) ); @@ -1768,7 
+1447,7 @@ emit_instruction( FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( func, *inst, 0, 0, chan_index ); FETCH( func, *inst, 1, 1, chan_index ); - emit_maxps( + sse_maxps( func, make_xmm( 0 ), make_xmm( 1 ) ); @@ -2376,8 +2055,6 @@ tgsi_emit_sse2( unsigned ok = 1; uint num_immediates = 0; - DUMP_START(); - func->csr = func->store; tgsi_parse_init( &parse, tokens ); @@ -2387,24 +2064,24 @@ tgsi_emit_sse2( */ if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) { /* DECLARATION phase, do not load output argument. */ - emit_mov( + x86_mov( func, get_input_base(), get_argument( 0 ) ); /* skipping outputs argument here */ - emit_mov( + x86_mov( func, get_const_base(), get_argument( 2 ) ); - emit_mov( + x86_mov( func, get_temp_base(), get_argument( 3 ) ); - emit_mov( + x86_mov( func, get_coef_base(), get_argument( 4 ) ); - emit_mov( + x86_mov( func, get_immediate_base(), get_argument( 5 ) ); @@ -2412,23 +2089,23 @@ tgsi_emit_sse2( else { assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX); - emit_mov( + x86_mov( func, get_input_base(), get_argument( 0 ) ); - emit_mov( + x86_mov( func, get_output_base(), get_argument( 1 ) ); - emit_mov( + x86_mov( func, get_const_base(), get_argument( 2 ) ); - emit_mov( + x86_mov( func, get_temp_base(), get_argument( 3 ) ); - emit_mov( + x86_mov( func, get_immediate_base(), get_argument( 4 ) ); @@ -2451,7 +2128,7 @@ tgsi_emit_sse2( if( !instruction_phase ) { /* INSTRUCTION phase, overwrite coeff with output. */ instruction_phase = TRUE; - emit_mov( + x86_mov( func, get_output_base(), get_argument( 1 ) ); @@ -2463,8 +2140,10 @@ tgsi_emit_sse2( &parse.FullToken.FullInstruction ); if (!ok) { - debug_printf("failed to translate tgsi opcode %d to SSE\n", - parse.FullToken.FullInstruction.Instruction.Opcode ); + debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n", + parse.FullToken.FullInstruction.Instruction.Opcode, + parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ? 
+ "vertex shader" : "fragment shader"); } break; @@ -2499,8 +2178,6 @@ tgsi_emit_sse2( tgsi_parse_free( &parse ); - DUMP_END(); - return ok; } -- 2.7.4