tgsi: use new float math funcs, drop local disassembly code

author Keith Whitwell <keith@tungstengraphics.com>

Mon, 21 Apr 2008 11:39:59 +0000 (12:39 +0100)

committer Keith Whitwell <keith@tungstengraphics.com>

Mon, 21 Apr 2008 12:14:30 +0000 (13:14 +0100)
author Keith Whitwell <keith@tungstengraphics.com>
Mon, 21 Apr 2008 11:39:59 +0000 (12:39 +0100)
committer Keith Whitwell <keith@tungstengraphics.com>
Mon, 21 Apr 2008 12:14:30 +0000 (13:14 +0100)
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c

index 78e7dec..29e104b 100644 (file)
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
@@ -287,10 +287,10 @@ micro_abs(
     union tgsi_exec_channel *dst,
     const union tgsi_exec_channel *src )
  {
-   dst->f[0] = (float) fabs( (double) src->f[0] );
-   dst->f[1] = (float) fabs( (double) src->f[1] );
-   dst->f[2] = (float) fabs( (double) src->f[2] );
-   dst->f[3] = (float) fabs( (double) src->f[3] );
+   dst->f[0] = fabsf( src->f[0] );
+   dst->f[1] = fabsf( src->f[1] );
+   dst->f[2] = fabsf( src->f[2] );
+   dst->f[3] = fabsf( src->f[3] );
  }
  
  static void
@@ -334,10 +334,10 @@ micro_ceil(
     union tgsi_exec_channel *dst,
     const union tgsi_exec_channel *src )
  {
-   dst->f[0] = (float) ceil( (double) src->f[0] );
-   dst->f[1] = (float) ceil( (double) src->f[1] );
-   dst->f[2] = (float) ceil( (double) src->f[2] );
-   dst->f[3] = (float) ceil( (double) src->f[3] );
+   dst->f[0] = ceilf( src->f[0] );
+   dst->f[1] = ceilf( src->f[1] );
+   dst->f[2] = ceilf( src->f[2] );
+   dst->f[3] = ceilf( src->f[3] );
  }
  
  static void
@@ -345,10 +345,10 @@ micro_cos(
     union tgsi_exec_channel *dst,
     const union tgsi_exec_channel *src )
  {
-   dst->f[0] = (float) cos( (double) src->f[0] );
-   dst->f[1] = (float) cos( (double) src->f[1] );
-   dst->f[2] = (float) cos( (double) src->f[2] );
-   dst->f[3] = (float) cos( (double) src->f[3] );
+   dst->f[0] = cosf( src->f[0] );
+   dst->f[1] = cosf( src->f[1] );
+   dst->f[2] = cosf( src->f[2] );
+   dst->f[3] = cosf( src->f[3] );
  }
  
  static void
@@ -430,10 +430,10 @@ micro_exp2(
     union tgsi_exec_channel *dst,
     const union tgsi_exec_channel *src)
  {
-   dst->f[0] = (float) pow( 2.0, (double) src->f[0] );
-   dst->f[1] = (float) pow( 2.0, (double) src->f[1] );
-   dst->f[2] = (float) pow( 2.0, (double) src->f[2] );
-   dst->f[3] = (float) pow( 2.0, (double) src->f[3] );
+   dst->f[0] = powf( 2.0f, src->f[0] );
+   dst->f[1] = powf( 2.0f, src->f[1] );
+   dst->f[2] = powf( 2.0f, src->f[2] );
+   dst->f[3] = powf( 2.0f, src->f[3] );
  }
  
  static void
@@ -463,10 +463,10 @@ micro_flr(
     union tgsi_exec_channel *dst,
     const union tgsi_exec_channel *src )
  {
-   dst->f[0] = (float) floor( (double) src->f[0] );
-   dst->f[1] = (float) floor( (double) src->f[1] );
-   dst->f[2] = (float) floor( (double) src->f[2] );
-   dst->f[3] = (float) floor( (double) src->f[3] );
+   dst->f[0] = floorf( src->f[0] );
+   dst->f[1] = floorf( src->f[1] );
+   dst->f[2] = floorf( src->f[2] );
+   dst->f[3] = floorf( src->f[3] );
  }
  
  static void
@@ -474,10 +474,10 @@ micro_frc(
     union tgsi_exec_channel *dst,
     const union tgsi_exec_channel *src )
  {
-   dst->f[0] = src->f[0] - (float) floor( (double) src->f[0] );
-   dst->f[1] = src->f[1] - (float) floor( (double) src->f[1] );
-   dst->f[2] = src->f[2] - (float) floor( (double) src->f[2] );
-   dst->f[3] = src->f[3] - (float) floor( (double) src->f[3] );
+   dst->f[0] = src->f[0] - floorf( src->f[0] );
+   dst->f[1] = src->f[1] - floorf( src->f[1] );
+   dst->f[2] = src->f[2] - floorf( src->f[2] );
+   dst->f[3] = src->f[3] - floorf( src->f[3] );
  }
  
  static void
@@ -510,10 +510,10 @@ micro_lg2(
     union tgsi_exec_channel *dst,
     const union tgsi_exec_channel *src )
  {
-   dst->f[0] = (float) log( (double) src->f[0] ) * 1.442695f;
-   dst->f[1] = (float) log( (double) src->f[1] ) * 1.442695f;
-   dst->f[2] = (float) log( (double) src->f[2] ) * 1.442695f;
-   dst->f[3] = (float) log( (double) src->f[3] ) * 1.442695f;
+   dst->f[0] = logf( src->f[0] ) * 1.442695f;
+   dst->f[1] = logf( src->f[1] ) * 1.442695f;
+   dst->f[2] = logf( src->f[2] ) * 1.442695f;
+   dst->f[3] = logf( src->f[3] ) * 1.442695f;
  }
  
  static void
@@ -764,10 +764,10 @@ micro_pow(
     const union tgsi_exec_channel *src0,
     const union tgsi_exec_channel *src1 )
  {
-   dst->f[0] = (float) pow( (double) src0->f[0], (double) src1->f[0] );
-   dst->f[1] = (float) pow( (double) src0->f[1], (double) src1->f[1] );
-   dst->f[2] = (float) pow( (double) src0->f[2], (double) src1->f[2] );
-   dst->f[3] = (float) pow( (double) src0->f[3], (double) src1->f[3] );
+   dst->f[0] = powf( src0->f[0], src1->f[0] );
+   dst->f[1] = powf( src0->f[1], src1->f[1] );
+   dst->f[2] = powf( src0->f[2], src1->f[2] );
+   dst->f[3] = powf( src0->f[3], src1->f[3] );
  }
  
  static void
@@ -775,10 +775,10 @@ micro_rnd(
     union tgsi_exec_channel *dst,
     const union tgsi_exec_channel *src )
  {
-   dst->f[0] = (float) floor( (double) (src->f[0] + 0.5f) );
-   dst->f[1] = (float) floor( (double) (src->f[1] + 0.5f) );
-   dst->f[2] = (float) floor( (double) (src->f[2] + 0.5f) );
-   dst->f[3] = (float) floor( (double) (src->f[3] + 0.5f) );
+   dst->f[0] = floorf( src->f[0] + 0.5f );
+   dst->f[1] = floorf( src->f[1] + 0.5f );
+   dst->f[2] = floorf( src->f[2] + 0.5f );
+   dst->f[3] = floorf( src->f[3] + 0.5f );
  }
  
  static void
@@ -833,20 +833,20 @@ micro_sin(
     union tgsi_exec_channel *dst,
     const union tgsi_exec_channel *src )
  {
-   dst->f[0] = (float) sin( (double) src->f[0] );
-   dst->f[1] = (float) sin( (double) src->f[1] );
-   dst->f[2] = (float) sin( (double) src->f[2] );
-   dst->f[3] = (float) sin( (double) src->f[3] );
+   dst->f[0] = sinf( src->f[0] );
+   dst->f[1] = sinf( src->f[1] );
+   dst->f[2] = sinf( src->f[2] );
+   dst->f[3] = sinf( src->f[3] );
  }
  
  static void
  micro_sqrt( union tgsi_exec_channel *dst,
              const union tgsi_exec_channel *src )
  {
-   dst->f[0] = (float) sqrt( (double) src->f[0] );
-   dst->f[1] = (float) sqrt( (double) src->f[1] );
-   dst->f[2] = (float) sqrt( (double) src->f[2] );
-   dst->f[3] = (float) sqrt( (double) src->f[3] );
+   dst->f[0] = sqrtf( src->f[0] );
+   dst->f[1] = sqrtf( src->f[1] );
+   dst->f[2] = sqrtf( src->f[2] );
+   dst->f[3] = sqrtf( src->f[3] );
  }
  
  static void
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c

index d47935e..c3295a2 100755 (executable)
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
@@ -38,113 +38,6 @@
  
  #define HIGH_PRECISION 1  /* for 1/sqrt() */
  
-#define DUMP_SSE  0
-
-#if DUMP_SSE
-
-static void
-_print_reg(
-   struct x86_reg reg )
-{
-   if (reg.mod != mod_REG) 
-      debug_printf( "[" );
-      
-   switch( reg.file ) {
-   case file_REG32:
-      switch( reg.idx ) {
-      case reg_AX:
-         debug_printf( "EAX" );
-         break;
-      case reg_CX:
-         debug_printf( "ECX" );
-         break;
-      case reg_DX:
-         debug_printf( "EDX" );
-         break;
-      case reg_BX:
-         debug_printf( "EBX" );
-         break;
-      case reg_SP:
-         debug_printf( "ESP" );
-         break;
-      case reg_BP:
-         debug_printf( "EBP" );
-         break;
-      case reg_SI:
-         debug_printf( "ESI" );
-         break;
-      case reg_DI:
-         debug_printf( "EDI" );
-         break;
-      }
-      break;
-   case file_MMX:
-      assert( 0 );
-      break;
-   case file_XMM:
-      debug_printf( "XMM%u", reg.idx );
-      break;
-   case file_x87:
-      assert( 0 );
-      break;
-   }
-
-   if (reg.mod == mod_DISP8 ||
-       reg.mod == mod_DISP32)
-      debug_printf("+%d", reg.disp);
-
-   if (reg.mod != mod_REG) 
-      debug_printf( "]" );
-}
-
-static void
-_fill(
-   const char  *op )
-{
-   unsigned count = 10 - strlen( op );
-
-   while( count-- ) {
-      debug_printf( " " );
-   }
-}
-
-#define DUMP_START() debug_printf( "\nsse-dump start ----------------" )
-#define DUMP_END() debug_printf( "\nsse-dump end ----------------\n" )
-#define DUMP( OP ) debug_printf( "\n%s", OP )
-#define DUMP_I( OP, I ) do {\
-   debug_printf( "\n%s", OP );\
-   _fill( OP );\
-   debug_printf( "%u", I ); } while( 0 )
-#define DUMP_R( OP, R0 ) do {\
-   debug_printf( "\n%s", OP );\
-   _fill( OP );\
-   _print_reg( R0 ); } while( 0 )
-#define DUMP_RR( OP, R0, R1 ) do {\
-   debug_printf( "\n%s", OP );\
-   _fill( OP );\
-   _print_reg( R0 );\
-   debug_printf( ", " );\
-   _print_reg( R1 ); } while( 0 )
-#define DUMP_RRI( OP, R0, R1, I ) do {\
-   debug_printf( "\n%s", OP );\
-   _fill( OP );\
-   _print_reg( R0 );\
-   debug_printf( ", " );\
-   _print_reg( R1 );\
-   debug_printf( ", " );\
-   debug_printf( "%u", I ); } while( 0 )
-
-#else
-
-#define DUMP_START()
-#define DUMP_END()
-#define DUMP( OP )
-#define DUMP_I( OP, I )
-#define DUMP_R( OP, R0 )
-#define DUMP_RR( OP, R0, R1 )
-#define DUMP_RRI( OP, R0, R1, I )
-
-#endif
  
  #define FOR_EACH_CHANNEL( CHAN )\
     for( CHAN = 0; CHAN < 4; CHAN++ )
@@ -310,200 +203,6 @@ get_coef(
        ((vec * 3 + member) * 4 + chan) * 4 );
  }
  
-/**
- * X86 rtasm wrappers.
- */
-
-static void
-emit_addps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "ADDPS", dst, src );
-   sse_addps( func, dst, src );
-}
-
-static void
-emit_andnps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "ANDNPS", dst, src );
-   sse_andnps( func, dst, src );
-}
-
-static void
-emit_andps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "ANDPS", dst, src );
-   sse_andps( func, dst, src );
-}
-
-static void
-emit_call(
-   struct x86_function  *func,
-   void                 (* addr)() )
-{
-   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-
-   DUMP_I( "CALL", addr );
-   x86_mov_reg_imm( func, ecx, (unsigned long) addr );
-   x86_call( func, ecx );
-}
-
-static void
-emit_cmpps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src,
-   enum sse_cc          cc )
-{
-   DUMP_RRI( "CMPPS", dst, src, cc );
-   sse_cmpps( func, dst, src, cc );
-}
-
-static void
-emit_cvttps2dq(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "CVTTPS2DQ", dst, src );
-   sse2_cvttps2dq( func, dst, src );
-}
-
-static void
-emit_maxps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MAXPS", dst, src );
-   sse_maxps( func, dst, src );
-}
-
-static void
-emit_minps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MINPS", dst, src );
-   sse_minps( func, dst, src );
-}
-
-static void
-emit_mov(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MOV", dst, src );
-   x86_mov( func, dst, src );
-}
-
-static void
-emit_movaps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MOVAPS", dst, src );
-   sse_movaps( func, dst, src );
-}
-
-static void
-emit_movss(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MOVSS", dst, src );
-   sse_movss( func, dst, src );
-}
-
-static void
-emit_movups(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MOVUPS", dst, src );
-   sse_movups( func, dst, src );
-}
-
-static void
-emit_mulps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MULPS", dst, src );
-   sse_mulps( func, dst, src );
-}
-
-static void
-emit_or(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "OR", dst, src );
-   x86_or( func, dst, src );
-}
-
-static void
-emit_orps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "ORPS", dst, src );
-   sse_orps( func, dst, src );
-}
-
-static void
-emit_pmovmskb(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "PMOVMSKB", dst, src );
-   sse_pmovmskb( func, dst, src );
-}
-
-static void
-emit_pop(
-   struct x86_function  *func,
-   struct x86_reg       dst )
-{
-   DUMP_R( "POP", dst );
-   x86_pop( func, dst );
-}
-
-static void
-emit_push(
-   struct x86_function  *func,
-   struct x86_reg       dst )
-{
-   DUMP_R( "PUSH", dst );
-   x86_push( func, dst );
-}
-
-static void
-emit_rcpps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "RCPPS", dst, src );
-   sse2_rcpps( func, dst, src );
-}
  
  #ifdef WIN32
  static void
@@ -511,7 +210,6 @@ emit_retw(
     struct x86_function  *func,
     unsigned             size )
  {
-   DUMP_I( "RET", size );
     x86_retw( func, size );
  }
  #else
@@ -519,51 +217,10 @@ static void
  emit_ret(
     struct x86_function  *func )
  {
-   DUMP( "RET" );
     x86_ret( func );
  }
  #endif
  
-static void
-emit_rsqrtps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "RSQRTPS", dst, src );
-   sse_rsqrtps( func, dst, src );
-}
-
-static void
-emit_shufps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src,
-   unsigned char        shuf )
-{
-   DUMP_RRI( "SHUFPS", dst, src, shuf );
-   sse_shufps( func, dst, src, shuf );
-}
-
-static void
-emit_subps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "SUBPS", dst, src );
-   sse_subps( func, dst, src );
-}
-
-static void
-emit_xorps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "XORPS", dst, src );
-   sse_xorps( func, dst, src );
-}
  
  /**
   * Data fetch helpers.
@@ -582,11 +239,11 @@ emit_const(
     unsigned vec,
     unsigned chan )
  {
-   emit_movss(
+   sse_movss(
        func,
        make_xmm( xmm ),
        get_const( vec, chan ) );
-   emit_shufps(
+   sse_shufps(
        func,
        make_xmm( xmm ),
        make_xmm( xmm ),
@@ -600,11 +257,11 @@ emit_immediate(
     unsigned vec,
     unsigned chan )
  {
-   emit_movss(
+   sse_movss(
        func,
        make_xmm( xmm ),
        get_immediate( vec, chan ) );
-   emit_shufps(
+   sse_shufps(
        func,
        make_xmm( xmm ),
        make_xmm( xmm ),
@@ -625,7 +282,7 @@ emit_inputf(
     unsigned vec,
     unsigned chan )
  {
-   emit_movups(
+   sse_movups(
        func,
        make_xmm( xmm ),
        get_input( vec, chan ) );
@@ -644,7 +301,7 @@ emit_output(
     unsigned vec,
     unsigned chan )
  {
-   emit_movups(
+   sse_movups(
        func,
        get_output( vec, chan ),
        make_xmm( xmm ) );
@@ -663,7 +320,7 @@ emit_tempf(
     unsigned vec,
     unsigned chan )
  {
-   emit_movaps(
+   sse_movaps(
        func,
        make_xmm( xmm ),
        get_temp( vec, chan ) );
@@ -684,11 +341,11 @@ emit_coef(
     unsigned chan,
     unsigned member )
  {
-   emit_movss(
+   sse_movss(
        func,
        make_xmm( xmm ),
        get_coef( vec, chan, member ) );
-   emit_shufps(
+   sse_shufps(
        func,
        make_xmm( xmm ),
        make_xmm( xmm ),
@@ -706,7 +363,7 @@ emit_inputs(
     unsigned vec,
     unsigned chan )
  {
-   emit_movups(
+   sse_movups(
        func,
        get_input( vec, chan ),
        make_xmm( xmm ) );
@@ -719,7 +376,7 @@ emit_temps(
     unsigned vec,
     unsigned chan )
  {
-   emit_movaps(
+   sse_movaps(
        func,
        get_temp( vec, chan ),
        make_xmm( xmm ) );
@@ -796,39 +453,39 @@ static void
  emit_push_gp(
     struct x86_function *func )
  {
-   emit_push(
+   x86_push(
        func,
        get_const_base() );
-   emit_push(
+   x86_push(
        func,
        get_input_base() );
-   emit_push(
+   x86_push(
        func,
        get_output_base() );
  
     /* It is important on non-win32 platforms that temp base is pushed last.
      */
-   emit_push(
+   x86_push(
        func,
        get_temp_base() );
  }
  
  static void
-emit_pop_gp(
+x86_pop_gp(
     struct x86_function *func )
  {
     /* Restore GP registers in a reverse order.
      */
-   emit_pop(
+   x86_pop(
        func,
        get_temp_base() );
-   emit_pop(
+   x86_pop(
        func,
        get_output_base() );
-   emit_pop(
+   x86_pop(
        func,
        get_input_base() );
-   emit_pop(
+   x86_pop(
        func,
        get_const_base() );
  }
@@ -839,7 +496,7 @@ emit_func_call_dst(
     unsigned xmm_dst,
     void (*code)() )
  {
-   emit_movaps(
+   sse_movaps(
        func,
        get_temp( TEMP_R0, 0 ),
        make_xmm( xmm_dst ) );
@@ -848,19 +505,22 @@ emit_func_call_dst(
        func );
  
  #ifdef WIN32
-   emit_push(
+   x86_push(
        func,
        get_temp( TEMP_R0, 0 ) );
  #endif
  
-   emit_call(
-      func,
-      code );
+   {
+      struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+
+      x86_mov_reg_imm( func, ecx, (unsigned long) code );
+      x86_call( func, ecx );
+   }
  
-   emit_pop_gp(
+   x86_pop_gp(
        func );
  
-   emit_movaps(
+   sse_movaps(
        func,
        make_xmm( xmm_dst ),
        get_temp( TEMP_R0, 0 ) );
@@ -873,7 +533,7 @@ emit_func_call_dst_src(
     unsigned xmm_src,
     void (*code)() )
  {
-   emit_movaps(
+   sse_movaps(
        func,
        get_temp( TEMP_R0, 1 ),
        make_xmm( xmm_src ) );
@@ -893,7 +553,7 @@ emit_abs(
     struct x86_function *func,
     unsigned xmm )
  {
-   emit_andps(
+   sse_andps(
        func,
        make_xmm( xmm ),
        get_temp(
@@ -907,7 +567,7 @@ emit_add(
     unsigned xmm_dst,
     unsigned xmm_src )
  {
-   emit_addps(
+   sse_addps(
        func,
        make_xmm( xmm_dst ),
        make_xmm( xmm_src ) );
@@ -918,17 +578,15 @@ cos4f(
     float *store )
  {
  #ifdef WIN32
-   store[0] = (float) cos( (double) store[0] );
-   store[1] = (float) cos( (double) store[1] );
-   store[2] = (float) cos( (double) store[2] );
-   store[3] = (float) cos( (double) store[3] );
+   const unsigned X = 0;
  #else
     const unsigned X = TEMP_R0 * 16;
+#endif
+
     store[X + 0] = cosf( store[X + 0] );
     store[X + 1] = cosf( store[X + 1] );
     store[X + 2] = cosf( store[X + 2] );
     store[X + 3] = cosf( store[X + 3] );
-#endif
  }
  
  static void
@@ -947,17 +605,14 @@ ex24f(
     float *store )
  {
  #ifdef WIN32
-   store[0] = (float) pow( 2.0, (double) store[0] );
-   store[1] = (float) pow( 2.0, (double) store[1] );
-   store[2] = (float) pow( 2.0, (double) store[2] );
-   store[3] = (float) pow( 2.0, (double) store[3] );
+   const unsigned X = 0;
  #else
     const unsigned X = TEMP_R0 * 16;
+#endif
     store[X + 0] = powf( 2.0f, store[X + 0] );
     store[X + 1] = powf( 2.0f, store[X + 1] );
     store[X + 2] = powf( 2.0f, store[X + 2] );
     store[X + 3] = powf( 2.0f, store[X + 3] );
-#endif
  }
  
  static void
@@ -976,7 +631,7 @@ emit_f2it(
     struct x86_function *func,
     unsigned xmm )
  {
-   emit_cvttps2dq(
+   sse2_cvttps2dq(
        func,
        make_xmm( xmm ),
        make_xmm( xmm ) );
@@ -991,10 +646,10 @@ flr4f(
  #else
     const unsigned X = TEMP_R0 * 16;
  #endif
-   store[X + 0] = (float) floor( (double) store[X + 0] );
-   store[X + 1] = (float) floor( (double) store[X + 1] );
-   store[X + 2] = (float) floor( (double) store[X + 2] );
-   store[X + 3] = (float) floor( (double) store[X + 3] );
+   store[X + 0] = floorf( store[X + 0] );
+   store[X + 1] = floorf( store[X + 1] );
+   store[X + 2] = floorf( store[X + 2] );
+   store[X + 3] = floorf( store[X + 3] );
  }
  
  static void
@@ -1017,10 +672,10 @@ frc4f(
  #else
     const unsigned X = TEMP_R0 * 16;
  #endif
-   store[X + 0] -= (float) floor( (double) store[X + 0] );
-   store[X + 1] -= (float) floor( (double) store[X + 1] );
-   store[X + 2] -= (float) floor( (double) store[X + 2] );
-   store[X + 3] -= (float) floor( (double) store[X + 3] );
+   store[X + 0] -= floorf( store[X + 0] );
+   store[X + 1] -= floorf( store[X + 1] );
+   store[X + 2] -= floorf( store[X + 2] );
+   store[X + 3] -= floorf( store[X + 3] );
  }
  
  static void
@@ -1066,7 +721,7 @@ emit_MOV(
     unsigned xmm_dst,
     unsigned xmm_src )
  {
-   emit_movups(
+   sse_movups(
        func,
        make_xmm( xmm_dst ),
        make_xmm( xmm_src ) );
@@ -1077,7 +732,7 @@ emit_mul (struct x86_function *func,
            unsigned xmm_dst,
            unsigned xmm_src)
  {
-   emit_mulps(
+   sse_mulps(
        func,
        make_xmm( xmm_dst ),
        make_xmm( xmm_src ) );
@@ -1088,7 +743,7 @@ emit_neg(
     struct x86_function *func,
     unsigned xmm )
  {
-   emit_xorps(
+   sse_xorps(
        func,
        make_xmm( xmm ),
        get_temp(
@@ -1101,17 +756,14 @@ pow4f(
     float *store )
  {
  #ifdef WIN32
-   store[0] = (float) pow( (double) store[0], (double) store[4] );
-   store[1] = (float) pow( (double) store[1], (double) store[5] );
-   store[2] = (float) pow( (double) store[2], (double) store[6] );
-   store[3] = (float) pow( (double) store[3], (double) store[7] );
+   const unsigned X = 0;
  #else
     const unsigned X = TEMP_R0 * 16;
+#endif
     store[X + 0] = powf( store[X + 0], store[X + 4] );
     store[X + 1] = powf( store[X + 1], store[X + 5] );
     store[X + 2] = powf( store[X + 2], store[X + 6] );
     store[X + 3] = powf( store[X + 3], store[X + 7] );
-#endif
  }
  
  static void
@@ -1133,7 +785,11 @@ emit_rcp (
     unsigned xmm_dst,
     unsigned xmm_src )
  {
-   emit_rcpps(
+   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
+    * good enough.  Need to either emit a proper divide or use the
+    * iterative technique described below in emit_rsqrt().
+    */
+   sse2_rcpps(
        func,
        make_xmm( xmm_dst ),
        make_xmm( xmm_src ) );
@@ -1145,17 +801,14 @@ rsqrt4f(
     float *store )
  {
  #ifdef WIN32
-   store[0] = 1.0F / (float) sqrt( (double) store[0] );
-   store[1] = 1.0F / (float) sqrt( (double) store[1] );
-   store[2] = 1.0F / (float) sqrt( (double) store[2] );
-   store[3] = 1.0F / (float) sqrt( (double) store[3] );
+   const unsigned X = 0;
  #else
     const unsigned X = TEMP_R0 * 16;
-   store[X + 0] = 1.0F / sqrt( store[X + 0] );
-   store[X + 1] = 1.0F / sqrt( store[X + 1] );
-   store[X + 2] = 1.0F / sqrt( store[X + 2] );
-   store[X + 3] = 1.0F / sqrt( store[X + 3] );
  #endif
+   store[X + 0] = 1.0F / sqrtf( store[X + 0] );
+   store[X + 1] = 1.0F / sqrtf( store[X + 1] );
+   store[X + 2] = 1.0F / sqrtf( store[X + 2] );
+   store[X + 3] = 1.0F / sqrtf( store[X + 3] );
  }
  #endif
  
@@ -1166,12 +819,41 @@ emit_rsqrt(
     unsigned xmm_src )
  {
  #if HIGH_PRECISION
+#if 1
     emit_func_call_dst_src(
        func,
        xmm_dst,
        xmm_src,
        rsqrt4f );
  #else
+   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
+    * implementations, it is possible to improve their precision at
+    * fairly low cost, using a Newton-Raphson step, as below:
+    * 
+    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
+    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
+    *
+    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
+    */
+   /* This is some code that would do the above for a scalar 'a'.  We
+    * obviously are interested in a vector version:
+    *
+    * movss   xmm3, a;
+    * movss   xmm1, half;
+    * movss   xmm2, three;
+    * rsqrtss xmm0, xmm3;
+    * mulss   xmm3, xmm0;
+    * mulss   xmm1, xmm0;
+    * mulss   xmm3, xmm0;
+    * subss   xmm2, xmm3;
+    * mulss   xmm1, xmm2;
+    * movss   x,    xmm1;
+    */
+#endif
+#else
+   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
+    * good enough.
+    */
     emit_rsqrtps(
        func,
        make_xmm( xmm_dst ),
@@ -1184,7 +866,7 @@ emit_setsign(
     struct x86_function *func,
     unsigned xmm )
  {
-   emit_orps(
+   sse_orps(
        func,
        make_xmm( xmm ),
        get_temp(
@@ -1197,17 +879,14 @@ sin4f(
     float *store )
  {
  #ifdef WIN32
-   store[0] = (float) sin( (double) store[0] );
-   store[1] = (float) sin( (double) store[1] );
-   store[2] = (float) sin( (double) store[2] );
-   store[3] = (float) sin( (double) store[3] );
+   const unsigned X = 0;
  #else
     const unsigned X = TEMP_R0 * 16;
+#endif
     store[X + 0] = sinf( store[X + 0] );
     store[X + 1] = sinf( store[X + 1] );
     store[X + 2] = sinf( store[X + 2] );
     store[X + 3] = sinf( store[X + 3] );
-#endif
  }
  
  static void
@@ -1226,7 +905,7 @@ emit_sub(
     unsigned xmm_dst,
     unsigned xmm_src )
  {
-   emit_subps(
+   sse_subps(
        func,
        make_xmm( xmm_dst ),
        make_xmm( xmm_src ) );
@@ -1435,16 +1114,16 @@ emit_kil(
        }
     }
  
-   emit_push(
+   x86_push(
        func,
        x86_make_reg( file_REG32, reg_AX ) );
-   emit_push(
+   x86_push(
        func,
        x86_make_reg( file_REG32, reg_DX ) );
  
     FOR_EACH_CHANNEL( chan_index ) {
        if( uniquemask & (1 << chan_index) ) {
-         emit_cmpps(
+         sse_cmpps(
              func,
              make_xmm( registers[chan_index] ),
              get_temp(
@@ -1453,17 +1132,17 @@ emit_kil(
              cc_LessThan );
  
           if( chan_index == firstchan ) {
-            emit_pmovmskb(
+            sse_pmovmskb(
                 func,
                 x86_make_reg( file_REG32, reg_AX ),
                 make_xmm( registers[chan_index] ) );
           }
           else {
-            emit_pmovmskb(
+            sse_pmovmskb(
                 func,
                 x86_make_reg( file_REG32, reg_DX ),
                 make_xmm( registers[chan_index] ) );
-            emit_or(
+            x86_or(
                 func,
                 x86_make_reg( file_REG32, reg_AX ),
                 x86_make_reg( file_REG32, reg_DX ) );
@@ -1471,17 +1150,17 @@ emit_kil(
        }
     }
  
-   emit_or(
+   x86_or(
        func,
        get_temp(
           TGSI_EXEC_TEMP_KILMASK_I,
           TGSI_EXEC_TEMP_KILMASK_C ),
        x86_make_reg( file_REG32, reg_AX ) );
  
-   emit_pop(
+   x86_pop(
        func,
        x86_make_reg( file_REG32, reg_DX ) );
-   emit_pop(
+   x86_pop(
        func,
        x86_make_reg( file_REG32, reg_AX ) );
  }
@@ -1497,12 +1176,12 @@ emit_setcc(
     FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
        FETCH( func, *inst, 0, 0, chan_index );
        FETCH( func, *inst, 1, 1, chan_index );
-      emit_cmpps(
+      sse_cmpps(
           func,
           make_xmm( 0 ),
           make_xmm( 1 ),
           cc );
-      emit_andps(
+      sse_andps(
           func,
           make_xmm( 0 ),
           get_temp(
@@ -1523,22 +1202,22 @@ emit_cmp(
        FETCH( func, *inst, 0, 0, chan_index );
        FETCH( func, *inst, 1, 1, chan_index );
        FETCH( func, *inst, 2, 2, chan_index );
-      emit_cmpps(
+      sse_cmpps(
           func,
           make_xmm( 0 ),
           get_temp(
              TGSI_EXEC_TEMP_00000000_I,
              TGSI_EXEC_TEMP_00000000_C ),
           cc_LessThan );
-      emit_andps(
+      sse_andps(
           func,
           make_xmm( 1 ),
           make_xmm( 0 ) );
-      emit_andnps(
+      sse_andnps(
           func,
           make_xmm( 0 ),
           make_xmm( 2 ) );
-      emit_orps(
+      sse_orps(
           func,
           make_xmm( 0 ),
           make_xmm( 1 ) );
@@ -1589,7 +1268,7 @@ emit_instruction(
            IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
           if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
              FETCH( func, *inst, 0, 0, CHAN_X );
-            emit_maxps(
+            sse_maxps(
                 func,
                 make_xmm( 0 ),
                 get_temp(
@@ -1601,7 +1280,7 @@ emit_instruction(
              /* XMM[1] = SrcReg[0].yyyy */
              FETCH( func, *inst, 1, 0, CHAN_Y );
              /* XMM[1] = max(XMM[1], 0) */
-            emit_maxps(
+            sse_maxps(
                 func,
                 make_xmm( 1 ),
                 get_temp(
@@ -1610,14 +1289,14 @@ emit_instruction(
              /* XMM[2] = SrcReg[0].wwww */
              FETCH( func, *inst, 2, 0, CHAN_W );
              /* XMM[2] = min(XMM[2], 128.0) */
-            emit_minps(
+            sse_minps(
                 func,
                 make_xmm( 2 ),
                 get_temp(
                    TGSI_EXEC_TEMP_128_I,
                    TGSI_EXEC_TEMP_128_C ) );
              /* XMM[2] = max(XMM[2], -128.0) */
-            emit_maxps(
+            sse_maxps(
                 func,
                 make_xmm( 2 ),
                 get_temp(
@@ -1625,16 +1304,16 @@ emit_instruction(
                    TGSI_EXEC_TEMP_MINUS_128_C ) );
              emit_pow( func, 1, 2 );
              FETCH( func, *inst, 0, 0, CHAN_X );
-            emit_xorps(
+            sse_xorps(
                 func,
                 make_xmm( 2 ),
                 make_xmm( 2 ) );
-            emit_cmpps(
+            sse_cmpps(
                 func,
                 make_xmm( 2 ),
                 make_xmm( 0 ),
                 cc_LessThanEqual );
-            emit_andps(
+            sse_andps(
                 func,
                 make_xmm( 2 ),
                 make_xmm( 1 ) );
@@ -1756,7 +1435,7 @@ emit_instruction(
        FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
           FETCH( func, *inst, 0, 0, chan_index );
           FETCH( func, *inst, 1, 1, chan_index );
-         emit_minps(
+         sse_minps(
              func,
              make_xmm( 0 ),
              make_xmm( 1 ) );
@@ -1768,7 +1447,7 @@ emit_instruction(
        FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
           FETCH( func, *inst, 0, 0, chan_index );
           FETCH( func, *inst, 1, 1, chan_index );
-         emit_maxps(
+         sse_maxps(
              func,
              make_xmm( 0 ),
              make_xmm( 1 ) );
@@ -2376,8 +2055,6 @@ tgsi_emit_sse2(
     unsigned ok = 1;
     uint num_immediates = 0;
  
-   DUMP_START();
-
     func->csr = func->store;
  
     tgsi_parse_init( &parse, tokens );
@@ -2387,24 +2064,24 @@ tgsi_emit_sse2(
      */
     if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
        /* DECLARATION phase, do not load output argument. */
-      emit_mov(
+      x86_mov(
           func,
           get_input_base(),
           get_argument( 0 ) );
        /* skipping outputs argument here */
-      emit_mov(
+      x86_mov(
           func,
           get_const_base(),
           get_argument( 2 ) );
-      emit_mov(
+      x86_mov(
           func,
           get_temp_base(),
           get_argument( 3 ) );
-      emit_mov(
+      x86_mov(
           func,
           get_coef_base(),
           get_argument( 4 ) );
-      emit_mov(
+      x86_mov(
           func,
           get_immediate_base(),
           get_argument( 5 ) );
@@ -2412,23 +2089,23 @@ tgsi_emit_sse2(
     else {
        assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
  
-      emit_mov(
+      x86_mov(
           func,
           get_input_base(),
           get_argument( 0 ) );
-      emit_mov(
+      x86_mov(
           func,
           get_output_base(),
           get_argument( 1 ) );
-      emit_mov(
+      x86_mov(
           func,
           get_const_base(),
           get_argument( 2 ) );
-      emit_mov(
+      x86_mov(
           func,
           get_temp_base(),
           get_argument( 3 ) );
-      emit_mov(
+      x86_mov(
           func,
           get_immediate_base(),
           get_argument( 4 ) );
@@ -2451,7 +2128,7 @@ tgsi_emit_sse2(
              if( !instruction_phase ) {
                 /* INSTRUCTION phase, overwrite coeff with output. */
                 instruction_phase = TRUE;
-               emit_mov(
+               x86_mov(
                    func,
                    get_output_base(),
                    get_argument( 1 ) );
@@ -2463,8 +2140,10 @@ tgsi_emit_sse2(
              &parse.FullToken.FullInstruction );
  
          if (!ok) {
-           debug_printf("failed to translate tgsi opcode %d to SSE\n", 
-                        parse.FullToken.FullInstruction.Instruction.Opcode );
+           debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n", 
+                        parse.FullToken.FullInstruction.Instruction.Opcode,
+                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
+                         "vertex shader" : "fragment shader");
          }
           break;
  
@@ -2499,8 +2178,6 @@ tgsi_emit_sse2(
  
     tgsi_parse_free( &parse );
  
-   DUMP_END();
-
     return ok;
  }
author	Keith Whitwell <keith@tungstengraphics.com>
	Mon, 21 Apr 2008 11:39:59 +0000 (12:39 +0100)
committer	Keith Whitwell <keith@tungstengraphics.com>
	Mon, 21 Apr 2008 12:14:30 +0000 (13:14 +0100)
src/gallium/auxiliary/tgsi/exec/tgsi_exec.c		patch \| blob \| history
src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c		patch \| blob \| history