1 /**************************************************************************
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
28 #include "pipe/p_config.h"
30 #if defined(PIPE_ARCH_X86)
32 #include "util/u_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_memory.h"
36 #if defined(PIPE_ARCH_SSE)
37 #include "util/u_sse.h"
39 #include "tgsi/tgsi_info.h"
40 #include "tgsi/tgsi_parse.h"
41 #include "tgsi/tgsi_util.h"
42 #include "tgsi_exec.h"
43 #include "tgsi_sse2.h"
45 #include "rtasm/rtasm_x86sse.h"
49 * This costs about 100fps (close to 10%) in gears:
51 #define HIGH_PRECISION 1
/* Loop header that steps CHAN through every channel index, from 0 up to
 * (but not including) NUM_CHANNELS. CHAN must be an assignable integer
 * variable; the statement or block written after the macro is the loop body.
 */
56 #define FOR_EACH_CHANNEL( CHAN )\
57 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
/* Non-zero when bit CHAN is set in the WriteMask of INST's first destination
 * register (FullDstRegisters[0]), zero otherwise. INST is a struct value,
 * not a pointer.
 */
59 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
60 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
/* `if` header that guards the following statement or block on
 * IS_DST0_CHANNEL_ENABLED( INST, CHAN ). Because the expansion is a bare
 * `if`, an `else` written after the guarded statement attaches to it.
 */
62 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
63 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
/* Expands to the FOR_EACH_CHANNEL loop header followed by the
 * IF_IS_DST0_CHANNEL_ENABLED guard, so the statement or block that follows
 * runs once for each channel whose bit is set in INST's dst[0] write mask.
 */
65 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
66 FOR_EACH_CHANNEL( CHAN )\
67 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
/* Local shorthands for TGSI_EXEC_* names. Within this file the _I/_C pairs
 * are passed together, either as the two arguments of get_temp() or as the
 * last two arguments of emit_tempf(); TEMP_R0 and TEMP_ADDR are passed as
 * get_temp()'s first argument.
 */
74 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
75 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
77 #define TEMP_R0 TGSI_EXEC_TEMP_R0
78 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
79 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
80 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
84 * X86 utility functions.
93 (enum x86_reg_name) xmm );
97 * X86 register mapping helpers.
100 static struct x86_reg
101 get_const_base( void )
108 static struct x86_reg
109 get_machine_base( void )
116 static struct x86_reg
117 get_input_base( void )
119 return x86_make_disp(
121 Offset(struct tgsi_exec_machine, Inputs) );
124 static struct x86_reg
125 get_output_base( void )
127 return x86_make_disp(
129 Offset(struct tgsi_exec_machine, Outputs) );
132 static struct x86_reg
133 get_temp_base( void )
135 return x86_make_disp(
137 Offset(struct tgsi_exec_machine, Temps) );
140 static struct x86_reg
141 get_coef_base( void )
148 static struct x86_reg
149 get_sampler_base( void )
156 static struct x86_reg
157 get_immediate_base( void )
166 * Data access helpers.
170 static struct x86_reg
175 return x86_make_disp(
176 get_immediate_base(),
177 (vec * 4 + chan) * 4 );
180 static struct x86_reg
185 return x86_make_disp(
187 (vec * 4 + chan) * 4 );
190 static struct x86_reg
194 return x86_make_disp(
196 unit * sizeof( struct tgsi_sampler * ) );
199 static struct x86_reg
204 return x86_make_disp(
206 (vec * 4 + chan) * 16 );
209 static struct x86_reg
214 return x86_make_disp(
216 (vec * 4 + chan) * 16 );
219 static struct x86_reg
224 return x86_make_disp(
226 (vec * 4 + chan) * 16 );
229 static struct x86_reg
235 return x86_make_disp(
237 ((vec * 3 + member) * 4 + chan) * 4 );
243 struct x86_function *func )
250 * Data fetch helpers.
254 * Copy a shader constant to xmm register
255 * \param xmm the destination xmm register
256 * \param vec the src const buffer index
257 * \param chan src channel to fetch (X, Y, Z or W)
261 struct x86_function *func,
270 /* 'vec' is the offset from the address register's value.
271 * We're loading CONST[ADDR+vec] into an xmm register.
273 struct x86_reg r0 = get_immediate_base();
274 struct x86_reg r1 = get_coef_base();
277 assert( indirectFile == TGSI_FILE_ADDRESS );
278 assert( indirectIndex == 0 );
279 assert( r0.mod == mod_REG );
280 assert( r1.mod == mod_REG );
282 x86_push( func, r0 );
283 x86_push( func, r1 );
286 * Loop over the four pixels or vertices in the quad.
287 * Get the value of the address (offset) register for pixel/vertex[i],
288 * add it to the src offset and index into the constant buffer.
289 * Note that we're working on SOA data.
290 * If any of the pixel/vertex execution channels are unused their
291 * values will be garbage. It's very important that we don't use
292 * those garbage values as indexes into the constant buffer since
293 * that'll cause segfaults.
294 * The solution is to bitwise-AND the offset with the execution mask
295 * register whose values are either 0 or ~0.
296 * The caller must setup the execution mask register to indicate
297 * which channels are valid/alive before running the shader.
298 * The execution mask will also figure into loops and conditionals
301 for (i = 0; i < QUAD_SIZE; i++) {
302 /* r1 = address register[i] */
303 x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
304 /* r0 = execution mask[i] */
305 x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
307 x86_and( func, r1, r0 );
308 /* r0 = 'vec', the offset */
309 x86_lea( func, r0, get_const( vec, chan ) );
311 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
313 x86_add( func, r1, r1 );
314 x86_add( func, r1, r1 );
315 x86_add( func, r1, r1 );
316 x86_add( func, r1, r1 );
318 x86_add( func, r0, r1 ); /* r0 = r0 + r1 */
319 x86_mov( func, r1, x86_deref( r0 ) );
320 x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
329 get_temp( TEMP_R0, CHAN_X ) );
332 /* 'vec' is the index into the src register file, such as TEMP[vec] */
338 get_const( vec, chan ) );
343 SHUF( 0, 0, 0, 0 ) );
349 struct x86_function *func,
357 get_immediate( vec, chan ) );
362 SHUF( 0, 0, 0, 0 ) );
367 * Copy a shader input to xmm register
368 * \param xmm the destination xmm register
369 * \param vec the src input attrib
370 * \param chan src channel to fetch (X, Y, Z or W)
374 struct x86_function *func,
382 get_input( vec, chan ) );
386 * Store an xmm register to a shader output
387 * \param xmm the source xmm register
388 * \param vec the dest output attrib
389 * \param chan dest channel to store (X, Y, Z or W)
393 struct x86_function *func,
400 get_output( vec, chan ),
405 * Copy a shader temporary to xmm register
406 * \param xmm the destination xmm register
407 * \param vec the src temp register
408 * \param chan src channel to fetch (X, Y, Z or W)
412 struct x86_function *func,
420 get_temp( vec, chan ) );
424 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
425 * \param xmm the destination xmm register
426 * \param vec the src input/attribute coefficient index
427 * \param chan src channel to fetch (X, Y, Z or W)
428 * \param member 0=a0, 1=dadx, 2=dady
432 struct x86_function *func,
441 get_coef( vec, chan, member ) );
446 SHUF( 0, 0, 0, 0 ) );
450 * Data store helpers.
455 struct x86_function *func,
462 get_input( vec, chan ),
468 struct x86_function *func,
475 get_temp( vec, chan ),
481 struct x86_function *func,
491 vec + TGSI_EXEC_TEMP_ADDR,
496 * Coefficient fetch helpers.
501 struct x86_function *func,
516 struct x86_function *func,
531 struct x86_function *func,
545 * Function call helpers.
549 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
550 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
551 * that the stack pointer is 16 byte aligned, as expected.
555 struct x86_function *func,
556 unsigned xmm_save_mask,
557 const struct x86_reg *arg,
559 void (PIPE_CDECL *code)() )
561 struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
566 x86_make_reg( file_REG32, reg_AX) );
569 x86_make_reg( file_REG32, reg_CX) );
572 x86_make_reg( file_REG32, reg_DX) );
574 /* Store XMM regs to the stack
576 for(i = 0, n = 0; i < 8; ++i)
577 if(xmm_save_mask & (1 << i))
582 x86_make_reg( file_REG32, reg_SP ),
585 for(i = 0, n = 0; i < 8; ++i)
586 if(xmm_save_mask & (1 << i)) {
589 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
594 for (i = 0; i < nr_args; i++) {
595 /* Load the address of the buffer we use for passing arguments and
603 /* Push actual function arguments (currently just the pointer to
604 * the buffer above), and call the function:
606 x86_push( func, ecx );
609 x86_mov_reg_imm( func, ecx, (unsigned long) code );
610 x86_call( func, ecx );
612 /* Pop the arguments (or just add an immediate to esp)
614 for (i = 0; i < nr_args; i++) {
618 /* Pop the saved XMM regs:
620 for(i = 0, n = 0; i < 8; ++i)
621 if(xmm_save_mask & (1 << i)) {
625 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
631 x86_make_reg( file_REG32, reg_SP ),
634 /* Restore GP registers in a reverse order.
638 x86_make_reg( file_REG32, reg_DX) );
641 x86_make_reg( file_REG32, reg_CX) );
644 x86_make_reg( file_REG32, reg_AX) );
648 emit_func_call_dst_src1(
649 struct x86_function *func,
653 void (PIPE_CDECL *code)() )
655 struct x86_reg store = get_temp( TEMP_R0, 0 );
656 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
658 /* Store our input parameters (in xmm regs) to the buffer we use
659 * for passing arguments. We will pass a pointer to this buffer as
660 * the actual function argument.
665 make_xmm( xmm_src0 ) );
667 emit_func_call( func,
681 emit_func_call_dst_src2(
682 struct x86_function *func,
687 void (PIPE_CDECL *code)() )
689 struct x86_reg store = get_temp( TEMP_R0, 0 );
690 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
692 /* Store two inputs to parameter buffer.
697 make_xmm( xmm_src0 ) );
701 x86_make_disp( store, 4 * sizeof(float) ),
702 make_xmm( xmm_src1 ) );
707 emit_func_call( func,
713 /* Retrieve the results:
725 #if defined(PIPE_ARCH_SSE)
728 * Fast SSE2 implementation of special math functions.
/* Horner-form polynomial evaluation built from SSE intrinsics:
 *   POLYn(x, c0, ..., cn) = c0 + x*(c1 + x*(... + x*cn))
 * Every coefficient is broadcast with _mm_set1_ps. POLY0 ignores x.
 * POLYn expands x n times, so x should be an expression without side
 * effects.
 */
731 #define POLY0(x, c0) _mm_set1_ps(c0)
732 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
733 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
734 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
735 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
736 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
/* Compile-time selectors for the polynomial fits used by the exp2/log2
 * code that follows. EXP_POLY_DEGREE is tested against 5, 4, 3 and 2 and
 * picks the POLY macro of the same number. LOG_POLY_DEGREE is tested
 * against 6, 5, 4 and 3 and picks the POLY macro one lower (5 -> POLY4);
 * the log2 code then multiplies the result by (mant - 1).
 */
738 #define EXP_POLY_DEGREE 3
739 #define LOG_POLY_DEGREE 5
742 * See http://www.devmaster.net/forums/showthread.php?p=43580
748 __m128 fpart, expipart, expfpart;
750 x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
751 x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
753 /* ipart = int(x - 0.5) */
754 ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
756 /* fpart = x - ipart */
757 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
759 /* expipart = (float) (1 << ipart) */
760 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
762 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
763 #if EXP_POLY_DEGREE == 5
764 expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
765 #elif EXP_POLY_DEGREE == 4
766 expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
767 #elif EXP_POLY_DEGREE == 3
768 expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
769 #elif EXP_POLY_DEGREE == 2
770 expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
775 return _mm_mul_ps(expipart, expfpart);
780 * See http://www.devmaster.net/forums/showthread.php?p=43580
785 __m128i expmask = _mm_set1_epi32(0x7f800000);
786 __m128i mantmask = _mm_set1_epi32(0x007fffff);
787 __m128 one = _mm_set1_ps(1.0f);
789 __m128i i = _mm_castps_si128(x);
791 /* exp = (float) exponent(x) */
792 __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
794 /* mant = (float) mantissa(x) */
795 __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
799 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
800 * These coefficients can be generated with
801 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
803 #if LOG_POLY_DEGREE == 6
804 logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
805 #elif LOG_POLY_DEGREE == 5
806 logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
807 #elif LOG_POLY_DEGREE == 4
808 logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
809 #elif LOG_POLY_DEGREE == 3
810 logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
815 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
816 logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
818 return _mm_add_ps(logmant, exp);
823 powf4(__m128 x, __m128 y)
825 return exp2f4(_mm_mul_ps(log2f4(x), y));
828 #endif /* PIPE_ARCH_SSE */
833 * Low-level instruction translators.
838 struct x86_function *func,
845 TGSI_EXEC_TEMP_7FFFFFFF_I,
846 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
851 struct x86_function *func,
858 make_xmm( xmm_src ) );
861 static void PIPE_CDECL
865 store[0] = cosf( store[0] );
866 store[1] = cosf( store[1] );
867 store[2] = cosf( store[2] );
868 store[3] = cosf( store[3] );
873 struct x86_function *func,
877 emit_func_call_dst_src1(
885 static void PIPE_CDECL
886 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
887 __attribute__((force_align_arg_pointer))
892 #if defined(PIPE_ARCH_SSE)
893 _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
895 store[0] = util_fast_exp2( store[0] );
896 store[1] = util_fast_exp2( store[1] );
897 store[2] = util_fast_exp2( store[2] );
898 store[3] = util_fast_exp2( store[3] );
904 struct x86_function *func,
908 emit_func_call_dst_src1(
918 struct x86_function *func,
929 struct x86_function *func,
938 static void PIPE_CDECL
942 store[0] = floorf( store[0] );
943 store[1] = floorf( store[1] );
944 store[2] = floorf( store[2] );
945 store[3] = floorf( store[3] );
950 struct x86_function *func,
954 emit_func_call_dst_src1(
962 static void PIPE_CDECL
966 store[0] -= floorf( store[0] );
967 store[1] -= floorf( store[1] );
968 store[2] -= floorf( store[2] );
969 store[3] -= floorf( store[3] );
974 struct x86_function *func,
978 emit_func_call_dst_src1(
986 static void PIPE_CDECL
987 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
988 __attribute__((force_align_arg_pointer))
993 #if defined(PIPE_ARCH_SSE)
994 _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
996 store[0] = util_fast_log2( store[0] );
997 store[1] = util_fast_log2( store[1] );
998 store[2] = util_fast_log2( store[2] );
999 store[3] = util_fast_log2( store[3] );
1005 struct x86_function *func,
1009 emit_func_call_dst_src1(
1019 struct x86_function *func,
1025 make_xmm( xmm_dst ),
1026 make_xmm( xmm_src ) );
1030 emit_mul (struct x86_function *func,
1036 make_xmm( xmm_dst ),
1037 make_xmm( xmm_src ) );
1042 struct x86_function *func,
1049 TGSI_EXEC_TEMP_80000000_I,
1050 TGSI_EXEC_TEMP_80000000_C ) );
1053 static void PIPE_CDECL
1054 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1055 __attribute__((force_align_arg_pointer))
1060 #if defined(PIPE_ARCH_SSE)
1061 _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
1063 store[0] = util_fast_pow( store[0], store[4] );
1064 store[1] = util_fast_pow( store[1], store[5] );
1065 store[2] = util_fast_pow( store[2], store[6] );
1066 store[3] = util_fast_pow( store[3], store[7] );
1072 struct x86_function *func,
1078 emit_func_call_dst_src2(
1089 struct x86_function *func,
1093 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1094 * good enough. Need to either emit a proper divide or use the
1095 * iterative technique described below in emit_rsqrt().
1099 make_xmm( xmm_dst ),
1100 make_xmm( xmm_src ) );
1103 static void PIPE_CDECL
1107 store[0] = floorf( store[0] + 0.5f );
1108 store[1] = floorf( store[1] + 0.5f );
1109 store[2] = floorf( store[2] + 0.5f );
1110 store[3] = floorf( store[3] + 0.5f );
1115 struct x86_function *func,
1119 emit_func_call_dst_src1(
1129 struct x86_function *func,
1134 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
1135 * implementations, it is possible to improve its precision at
1136 * fairly low cost, using a newton/raphson step, as below:
1138 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
1139 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
1141 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
1144 struct x86_reg dst = make_xmm( xmm_dst );
1145 struct x86_reg src = make_xmm( xmm_src );
1146 struct x86_reg tmp0 = make_xmm( 2 );
1147 struct x86_reg tmp1 = make_xmm( 3 );
1149 assert( xmm_dst != xmm_src );
1150 assert( xmm_dst != 2 && xmm_dst != 3 );
1151 assert( xmm_src != 2 && xmm_src != 3 );
1153 sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
1154 sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
1155 sse_rsqrtps( func, tmp1, src );
1156 sse_mulps( func, src, tmp1 );
1157 sse_mulps( func, dst, tmp1 );
1158 sse_mulps( func, src, tmp1 );
1159 sse_subps( func, tmp0, src );
1160 sse_mulps( func, dst, tmp0 );
1163 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1168 make_xmm( xmm_dst ),
1169 make_xmm( xmm_src ) );
1175 struct x86_function *func,
1182 TGSI_EXEC_TEMP_80000000_I,
1183 TGSI_EXEC_TEMP_80000000_C ) );
1186 static void PIPE_CDECL
1190 store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
1191 store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
1192 store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
1193 store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
1198 struct x86_function *func,
1202 emit_func_call_dst_src1(
1210 static void PIPE_CDECL
1214 store[0] = sinf( store[0] );
1215 store[1] = sinf( store[1] );
1216 store[2] = sinf( store[2] );
1217 store[3] = sinf( store[3] );
1221 emit_sin (struct x86_function *func,
1225 emit_func_call_dst_src1(
1235 struct x86_function *func,
1241 make_xmm( xmm_dst ),
1242 make_xmm( xmm_src ) );
1257 struct x86_function *func,
1259 const struct tgsi_full_src_register *reg,
1260 const unsigned chan_index )
1262 unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1265 case TGSI_EXTSWIZZLE_X:
1266 case TGSI_EXTSWIZZLE_Y:
1267 case TGSI_EXTSWIZZLE_Z:
1268 case TGSI_EXTSWIZZLE_W:
1269 switch (reg->SrcRegister.File) {
1270 case TGSI_FILE_CONSTANT:
1274 reg->SrcRegister.Index,
1276 reg->SrcRegister.Indirect,
1277 reg->SrcRegisterInd.File,
1278 reg->SrcRegisterInd.Index );
1281 case TGSI_FILE_IMMEDIATE:
1285 reg->SrcRegister.Index,
1289 case TGSI_FILE_INPUT:
1293 reg->SrcRegister.Index,
1297 case TGSI_FILE_TEMPORARY:
1301 reg->SrcRegister.Index,
1310 case TGSI_EXTSWIZZLE_ZERO:
1314 TGSI_EXEC_TEMP_00000000_I,
1315 TGSI_EXEC_TEMP_00000000_C );
1318 case TGSI_EXTSWIZZLE_ONE:
1330 switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
1331 case TGSI_UTIL_SIGN_CLEAR:
1332 emit_abs( func, xmm );
1335 case TGSI_UTIL_SIGN_SET:
1336 emit_setsign( func, xmm );
1339 case TGSI_UTIL_SIGN_TOGGLE:
1340 emit_neg( func, xmm );
1343 case TGSI_UTIL_SIGN_KEEP:
/* Shorthand for emit_fetch(): forwards FUNC, XMM and CHAN unchanged and
 * passes the address of source operand INDEX of INST
 * (FullSrcRegisters[INDEX]). INST is a struct value, not a pointer;
 * callers in this file pass *inst.
 */
1348 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1349 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1357 struct x86_function *func,
1359 const struct tgsi_full_dst_register *reg,
1360 const struct tgsi_full_instruction *inst,
1361 unsigned chan_index )
1363 switch( reg->DstRegister.File ) {
1364 case TGSI_FILE_OUTPUT:
1368 reg->DstRegister.Index,
1372 case TGSI_FILE_TEMPORARY:
1376 reg->DstRegister.Index,
1380 case TGSI_FILE_ADDRESS:
1384 reg->DstRegister.Index,
1392 switch( inst->Instruction.Saturate ) {
1396 case TGSI_SAT_ZERO_ONE:
1400 case TGSI_SAT_MINUS_PLUS_ONE:
/* Shorthand for emit_store(): forwards FUNC, XMM and CHAN unchanged and
 * passes the address of destination operand INDEX of INST
 * (FullDstRegisters[INDEX]) together with the address of INST itself.
 * INST is a struct value, not a pointer; callers in this file pass *inst.
 */
1406 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1407 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1410 static void PIPE_CDECL
1411 fetch_texel( struct tgsi_sampler **sampler,
1417 debug_printf("%s sampler: %p (%p) store: %p\n",
1422 debug_printf("lodbias %f\n", store[12]);
1424 for (j = 0; j < 4; j++)
1425 debug_printf("sample %d texcoord %f %f\n",
1432 float rgba[NUM_CHANNELS][QUAD_SIZE];
1433 (*sampler)->get_samples(*sampler,
1437 0.0f, /*store[12], lodbias */
1440 memcpy( store, rgba, 16 * sizeof(float));
1444 for (j = 0; j < 4; j++)
1445 debug_printf("sample %d result %f %f %f %f\n",
1455 * High-level instruction translators.
1459 emit_tex( struct x86_function *func,
1460 const struct tgsi_full_instruction *inst,
1464 const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
1465 struct x86_reg args[2];
1469 switch (inst->InstructionExtTexture.Texture) {
1470 case TGSI_TEXTURE_1D:
1473 case TGSI_TEXTURE_2D:
1474 case TGSI_TEXTURE_RECT:
1477 case TGSI_TEXTURE_SHADOW1D:
1478 case TGSI_TEXTURE_SHADOW2D:
1479 case TGSI_TEXTURE_SHADOWRECT:
1480 case TGSI_TEXTURE_3D:
1481 case TGSI_TEXTURE_CUBE:
1490 FETCH( func, *inst, 3, 0, 3 );
1496 TGSI_EXEC_TEMP_00000000_I,
1497 TGSI_EXEC_TEMP_00000000_C );
1501 /* store lodbias whether enabled or not -- fetch_texel currently
1502 * respects it always.
1505 get_temp( TEMP_R0, 3 ),
1510 FETCH( func, *inst, 3, 0, 3 );
1512 emit_rcp( func, 3, 3 );
1515 for (i = 0; i < count; i++) {
1516 FETCH( func, *inst, i, 0, i );
1525 /* Store in the argument buffer:
1529 get_temp( TEMP_R0, i ),
1533 args[0] = get_temp( TEMP_R0, 0 );
1534 args[1] = get_sampler_ptr( unit );
1537 emit_func_call( func,
1543 /* If all four channels are enabled, could use a pointer to
1544 * dst[0].x instead of TEMP_R0 for store?
1546 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {
1551 get_temp( TEMP_R0, i ) );
1553 STORE( func, *inst, 0, 0, i );
1560 struct x86_function *func,
1561 const struct tgsi_full_src_register *reg )
1563 unsigned uniquemask;
1564 unsigned unique_count = 0;
1565 unsigned chan_index;
1568 /* This mask stores component bits that were already tested. Note that
1569 * we test if the value is less than zero, so 1.0 and 0.0 need not be
1571 uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1573 FOR_EACH_CHANNEL( chan_index ) {
1576 /* unswizzle channel */
1577 swizzle = tgsi_util_get_full_src_register_extswizzle(
1581 /* check if the component has not been already tested */
1582 if( !(uniquemask & (1 << swizzle)) ) {
1583 uniquemask |= 1 << swizzle;
1585 /* allocate register */
1596 x86_make_reg( file_REG32, reg_AX ) );
1599 x86_make_reg( file_REG32, reg_DX ) );
1601 for (i = 0 ; i < unique_count; i++ ) {
1602 struct x86_reg dataXMM = make_xmm(i);
1608 TGSI_EXEC_TEMP_00000000_I,
1609 TGSI_EXEC_TEMP_00000000_C ),
1615 x86_make_reg( file_REG32, reg_AX ),
1621 x86_make_reg( file_REG32, reg_DX ),
1625 x86_make_reg( file_REG32, reg_AX ),
1626 x86_make_reg( file_REG32, reg_DX ) );
1633 TGSI_EXEC_TEMP_KILMASK_I,
1634 TGSI_EXEC_TEMP_KILMASK_C ),
1635 x86_make_reg( file_REG32, reg_AX ) );
1639 x86_make_reg( file_REG32, reg_DX ) );
1642 x86_make_reg( file_REG32, reg_AX ) );
1648 struct x86_function *func )
1650 /* XXX todo / fix me */
1656 struct x86_function *func,
1657 struct tgsi_full_instruction *inst,
1660 unsigned chan_index;
1662 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1663 FETCH( func, *inst, 0, 0, chan_index );
1664 FETCH( func, *inst, 1, 1, chan_index );
1676 STORE( func, *inst, 0, 0, chan_index );
1682 struct x86_function *func,
1683 struct tgsi_full_instruction *inst )
1685 unsigned chan_index;
1687 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1688 FETCH( func, *inst, 0, 0, chan_index );
1689 FETCH( func, *inst, 1, 1, chan_index );
1690 FETCH( func, *inst, 2, 2, chan_index );
1695 TGSI_EXEC_TEMP_00000000_I,
1696 TGSI_EXEC_TEMP_00000000_C ),
1710 STORE( func, *inst, 0, 0, chan_index );
1716 * Check if inst src/dest regs use indirect addressing into temporary
1720 indirect_temp_reference(const struct tgsi_full_instruction *inst)
1723 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1724 const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[i];
1725 if (reg->SrcRegister.File == TGSI_FILE_TEMPORARY &&
1726 reg->SrcRegister.Indirect)
1729 for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
1730 const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[i];
1731 if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
1732 reg->DstRegister.Indirect)
1741 struct x86_function *func,
1742 struct tgsi_full_instruction *inst )
1744 unsigned chan_index;
1746 /* we can't handle indirect addressing into temp register file yet */
1747 if (indirect_temp_reference(inst))
1750 switch (inst->Instruction.Opcode) {
1751 case TGSI_OPCODE_ARL:
1752 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1753 FETCH( func, *inst, 0, 0, chan_index );
1754 emit_flr(func, 0, 0);
1755 emit_f2it( func, 0 );
1756 STORE( func, *inst, 0, 0, chan_index );
1760 case TGSI_OPCODE_MOV:
1761 case TGSI_OPCODE_SWZ:
1762 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1763 FETCH( func, *inst, 0, 0, chan_index );
1764 STORE( func, *inst, 0, 0, chan_index );
1768 case TGSI_OPCODE_LIT:
1769 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1770 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1776 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1777 STORE( func, *inst, 0, 0, CHAN_X );
1779 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1780 STORE( func, *inst, 0, 0, CHAN_W );
1783 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1784 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1785 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1786 FETCH( func, *inst, 0, 0, CHAN_X );
1791 TGSI_EXEC_TEMP_00000000_I,
1792 TGSI_EXEC_TEMP_00000000_C ) );
1793 STORE( func, *inst, 0, 0, CHAN_Y );
1795 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1796 /* XMM[1] = SrcReg[0].yyyy */
1797 FETCH( func, *inst, 1, 0, CHAN_Y );
1798 /* XMM[1] = max(XMM[1], 0) */
1803 TGSI_EXEC_TEMP_00000000_I,
1804 TGSI_EXEC_TEMP_00000000_C ) );
1805 /* XMM[2] = SrcReg[0].wwww */
1806 FETCH( func, *inst, 2, 0, CHAN_W );
1807 /* XMM[2] = min(XMM[2], 128.0) */
1812 TGSI_EXEC_TEMP_128_I,
1813 TGSI_EXEC_TEMP_128_C ) );
1814 /* XMM[2] = max(XMM[2], -128.0) */
1819 TGSI_EXEC_TEMP_MINUS_128_I,
1820 TGSI_EXEC_TEMP_MINUS_128_C ) );
1821 emit_pow( func, 3, 1, 1, 2 );
1822 FETCH( func, *inst, 0, 0, CHAN_X );
1836 STORE( func, *inst, 2, 0, CHAN_Z );
1841 case TGSI_OPCODE_RCP:
1842 /* TGSI_OPCODE_RECIP */
1843 FETCH( func, *inst, 0, 0, CHAN_X );
1844 emit_rcp( func, 0, 0 );
1845 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1846 STORE( func, *inst, 0, 0, chan_index );
1850 case TGSI_OPCODE_RSQ:
1851 /* TGSI_OPCODE_RECIPSQRT */
1852 FETCH( func, *inst, 0, 0, CHAN_X );
1853 emit_abs( func, 0 );
1854 emit_rsqrt( func, 1, 0 );
1855 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1856 STORE( func, *inst, 1, 0, chan_index );
1860 case TGSI_OPCODE_EXP:
1861 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1862 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1863 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1864 FETCH( func, *inst, 0, 0, CHAN_X );
1865 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1866 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1867 emit_MOV( func, 1, 0 );
1868 emit_flr( func, 2, 1 );
1869 /* dst.x = ex2(floor(src.x)) */
1870 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1871 emit_MOV( func, 2, 1 );
1872 emit_ex2( func, 3, 2 );
1873 STORE( func, *inst, 2, 0, CHAN_X );
1875 /* dst.y = src.x - floor(src.x) */
1876 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1877 emit_MOV( func, 2, 0 );
1878 emit_sub( func, 2, 1 );
1879 STORE( func, *inst, 2, 0, CHAN_Y );
1882 /* dst.z = ex2(src.x) */
1883 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1884 emit_ex2( func, 3, 0 );
1885 STORE( func, *inst, 0, 0, CHAN_Z );
1889 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1890 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1891 STORE( func, *inst, 0, 0, CHAN_W );
1895 case TGSI_OPCODE_LOG:
1896 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1897 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1898 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1899 FETCH( func, *inst, 0, 0, CHAN_X );
1900 emit_abs( func, 0 );
1901 emit_MOV( func, 1, 0 );
1902 emit_lg2( func, 2, 1 );
1903 /* dst.z = lg2(abs(src.x)) */
1904 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1905 STORE( func, *inst, 1, 0, CHAN_Z );
1907 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1908 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1909 emit_flr( func, 2, 1 );
1910 /* dst.x = floor(lg2(abs(src.x))) */
1911 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1912 STORE( func, *inst, 1, 0, CHAN_X );
1914 /* dst.y = abs(src.x)/ex2(floor(lg2(abs(src.x)))) */
1915 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1916 emit_ex2( func, 2, 1 );
1917 emit_rcp( func, 1, 1 );
1918 emit_mul( func, 0, 1 );
1919 STORE( func, *inst, 0, 0, CHAN_Y );
1924 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1925 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1926 STORE( func, *inst, 0, 0, CHAN_W );
1930 case TGSI_OPCODE_MUL:
1931 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1932 FETCH( func, *inst, 0, 0, chan_index );
1933 FETCH( func, *inst, 1, 1, chan_index );
1934 emit_mul( func, 0, 1 );
1935 STORE( func, *inst, 0, 0, chan_index );
1939 case TGSI_OPCODE_ADD:
1940 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1941 FETCH( func, *inst, 0, 0, chan_index );
1942 FETCH( func, *inst, 1, 1, chan_index );
1943 emit_add( func, 0, 1 );
1944 STORE( func, *inst, 0, 0, chan_index );
1948 case TGSI_OPCODE_DP3:
1949 /* TGSI_OPCODE_DOT3 */
1950 FETCH( func, *inst, 0, 0, CHAN_X );
1951 FETCH( func, *inst, 1, 1, CHAN_X );
1952 emit_mul( func, 0, 1 );
1953 FETCH( func, *inst, 1, 0, CHAN_Y );
1954 FETCH( func, *inst, 2, 1, CHAN_Y );
1955 emit_mul( func, 1, 2 );
1956 emit_add( func, 0, 1 );
1957 FETCH( func, *inst, 1, 0, CHAN_Z );
1958 FETCH( func, *inst, 2, 1, CHAN_Z );
1959 emit_mul( func, 1, 2 );
1960 emit_add( func, 0, 1 );
1961 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1962 STORE( func, *inst, 0, 0, chan_index );
1966 case TGSI_OPCODE_DP4:
1967 /* TGSI_OPCODE_DOT4 */
1968 FETCH( func, *inst, 0, 0, CHAN_X );
1969 FETCH( func, *inst, 1, 1, CHAN_X );
1970 emit_mul( func, 0, 1 );
1971 FETCH( func, *inst, 1, 0, CHAN_Y );
1972 FETCH( func, *inst, 2, 1, CHAN_Y );
1973 emit_mul( func, 1, 2 );
1974 emit_add( func, 0, 1 );
1975 FETCH( func, *inst, 1, 0, CHAN_Z );
1976 FETCH( func, *inst, 2, 1, CHAN_Z );
1977 emit_mul(func, 1, 2 );
1978 emit_add(func, 0, 1 );
1979 FETCH( func, *inst, 1, 0, CHAN_W );
1980 FETCH( func, *inst, 2, 1, CHAN_W );
1981 emit_mul( func, 1, 2 );
1982 emit_add( func, 0, 1 );
1983 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1984 STORE( func, *inst, 0, 0, chan_index );
1988 case TGSI_OPCODE_DST:
1989 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1995 STORE( func, *inst, 0, 0, CHAN_X );
1997 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1998 FETCH( func, *inst, 0, 0, CHAN_Y );
1999 FETCH( func, *inst, 1, 1, CHAN_Y );
2000 emit_mul( func, 0, 1 );
2001 STORE( func, *inst, 0, 0, CHAN_Y );
2003 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2004 FETCH( func, *inst, 0, 0, CHAN_Z );
2005 STORE( func, *inst, 0, 0, CHAN_Z );
2007 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2008 FETCH( func, *inst, 0, 1, CHAN_W );
2009 STORE( func, *inst, 0, 0, CHAN_W );
2013 case TGSI_OPCODE_MIN:
2014 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2015 FETCH( func, *inst, 0, 0, chan_index );
2016 FETCH( func, *inst, 1, 1, chan_index );
2021 STORE( func, *inst, 0, 0, chan_index );
2025 case TGSI_OPCODE_MAX:
2026 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2027 FETCH( func, *inst, 0, 0, chan_index );
2028 FETCH( func, *inst, 1, 1, chan_index );
2033 STORE( func, *inst, 0, 0, chan_index );
2037 case TGSI_OPCODE_SLT:
2038 /* TGSI_OPCODE_SETLT */
2039 emit_setcc( func, inst, cc_LessThan );
2042 case TGSI_OPCODE_SGE:
2043 /* TGSI_OPCODE_SETGE */
2044 emit_setcc( func, inst, cc_NotLessThan );
2047 case TGSI_OPCODE_MAD:
2048 /* TGSI_OPCODE_MADD */
2049 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2050 FETCH( func, *inst, 0, 0, chan_index );
2051 FETCH( func, *inst, 1, 1, chan_index );
2052 FETCH( func, *inst, 2, 2, chan_index );
2053 emit_mul( func, 0, 1 );
2054 emit_add( func, 0, 2 );
2055 STORE( func, *inst, 0, 0, chan_index );
2059 case TGSI_OPCODE_SUB:
2060 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2061 FETCH( func, *inst, 0, 0, chan_index );
2062 FETCH( func, *inst, 1, 1, chan_index );
2063 emit_sub( func, 0, 1 );
2064 STORE( func, *inst, 0, 0, chan_index );
2068 case TGSI_OPCODE_LRP:
2069 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2070 FETCH( func, *inst, 0, 0, chan_index );
2071 FETCH( func, *inst, 1, 1, chan_index );
2072 FETCH( func, *inst, 2, 2, chan_index );
2073 emit_sub( func, 1, 2 );
2074 emit_mul( func, 0, 1 );
2075 emit_add( func, 0, 2 );
2076 STORE( func, *inst, 0, 0, chan_index );
2080 case TGSI_OPCODE_CND:
2084 case TGSI_OPCODE_CND0:
2088 case TGSI_OPCODE_DP2A:
2089 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2090 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2091 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2092 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2093 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2094 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2095 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2096 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
2097 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2098 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2099 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2103 case TGSI_OPCODE_FRC:
2104 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2105 FETCH( func, *inst, 0, 0, chan_index );
2106 emit_frc( func, 0, 0 );
2107 STORE( func, *inst, 0, 0, chan_index );
2111 case TGSI_OPCODE_CLAMP:
2115 case TGSI_OPCODE_FLR:
2116 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2117 FETCH( func, *inst, 0, 0, chan_index );
2118 emit_flr( func, 0, 0 );
2119 STORE( func, *inst, 0, 0, chan_index );
2123 case TGSI_OPCODE_ROUND:
2124 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2125 FETCH( func, *inst, 0, 0, chan_index );
2126 emit_rnd( func, 0, 0 );
2127 STORE( func, *inst, 0, 0, chan_index );
2131 case TGSI_OPCODE_EX2:
2132 FETCH( func, *inst, 0, 0, CHAN_X );
2133 emit_ex2( func, 0, 0 );
2134 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2135 STORE( func, *inst, 0, 0, chan_index );
2139 case TGSI_OPCODE_LG2:
2140 FETCH( func, *inst, 0, 0, CHAN_X );
2141 emit_lg2( func, 0, 0 );
2142 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2143 STORE( func, *inst, 0, 0, chan_index );
2147 case TGSI_OPCODE_POW:
2148 FETCH( func, *inst, 0, 0, CHAN_X );
2149 FETCH( func, *inst, 1, 1, CHAN_X );
2150 emit_pow( func, 0, 0, 0, 1 );
2151 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2152 STORE( func, *inst, 0, 0, chan_index );
2156 case TGSI_OPCODE_XPD:
2157 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2158 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2159 FETCH( func, *inst, 1, 1, CHAN_Z );
2160 FETCH( func, *inst, 3, 0, CHAN_Z );
2162 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2163 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2164 FETCH( func, *inst, 0, 0, CHAN_Y );
2165 FETCH( func, *inst, 4, 1, CHAN_Y );
2167 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2168 emit_MOV( func, 2, 0 );
2169 emit_mul( func, 2, 1 );
2170 emit_MOV( func, 5, 3 );
2171 emit_mul( func, 5, 4 );
2172 emit_sub( func, 2, 5 );
2173 STORE( func, *inst, 2, 0, CHAN_X );
2175 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2176 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2177 FETCH( func, *inst, 2, 1, CHAN_X );
2178 FETCH( func, *inst, 5, 0, CHAN_X );
2180 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2181 emit_mul( func, 3, 2 );
2182 emit_mul( func, 1, 5 );
2183 emit_sub( func, 3, 1 );
2184 STORE( func, *inst, 3, 0, CHAN_Y );
2186 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2187 emit_mul( func, 5, 4 );
2188 emit_mul( func, 0, 2 );
2189 emit_sub( func, 5, 0 );
2190 STORE( func, *inst, 5, 0, CHAN_Z );
2192 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2198 STORE( func, *inst, 0, 0, CHAN_W );
2202 case TGSI_OPCODE_ABS:
2203 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2204 FETCH( func, *inst, 0, 0, chan_index );
2205 emit_abs( func, 0) ;
2207 STORE( func, *inst, 0, 0, chan_index );
2211 case TGSI_OPCODE_RCC:
2215 case TGSI_OPCODE_DPH:
2216 FETCH( func, *inst, 0, 0, CHAN_X );
2217 FETCH( func, *inst, 1, 1, CHAN_X );
2218 emit_mul( func, 0, 1 );
2219 FETCH( func, *inst, 1, 0, CHAN_Y );
2220 FETCH( func, *inst, 2, 1, CHAN_Y );
2221 emit_mul( func, 1, 2 );
2222 emit_add( func, 0, 1 );
2223 FETCH( func, *inst, 1, 0, CHAN_Z );
2224 FETCH( func, *inst, 2, 1, CHAN_Z );
2225 emit_mul( func, 1, 2 );
2226 emit_add( func, 0, 1 );
2227 FETCH( func, *inst, 1, 1, CHAN_W );
2228 emit_add( func, 0, 1 );
2229 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2230 STORE( func, *inst, 0, 0, chan_index );
2234 case TGSI_OPCODE_COS:
2235 FETCH( func, *inst, 0, 0, CHAN_X );
2236 emit_cos( func, 0, 0 );
2237 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2238 STORE( func, *inst, 0, 0, chan_index );
2242 case TGSI_OPCODE_DDX:
2246 case TGSI_OPCODE_DDY:
2250 case TGSI_OPCODE_KILP:
2251 /* predicated kill */
2253 return 0; /* XXX fix me */
2256 case TGSI_OPCODE_KIL:
2257 /* conditional kill */
2258 emit_kil( func, &inst->FullSrcRegisters[0] );
2261 case TGSI_OPCODE_PK2H:
2265 case TGSI_OPCODE_PK2US:
2269 case TGSI_OPCODE_PK4B:
2273 case TGSI_OPCODE_PK4UB:
2277 case TGSI_OPCODE_RFL:
2281 case TGSI_OPCODE_SEQ:
2285 case TGSI_OPCODE_SFL:
2289 case TGSI_OPCODE_SGT:
2293 case TGSI_OPCODE_SIN:
2294 FETCH( func, *inst, 0, 0, CHAN_X );
2295 emit_sin( func, 0, 0 );
2296 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2297 STORE( func, *inst, 0, 0, chan_index );
2301 case TGSI_OPCODE_SLE:
2305 case TGSI_OPCODE_SNE:
2309 case TGSI_OPCODE_STR:
2313 case TGSI_OPCODE_TEX:
2314 emit_tex( func, inst, FALSE, FALSE );
2317 case TGSI_OPCODE_TXD:
2321 case TGSI_OPCODE_UP2H:
2325 case TGSI_OPCODE_UP2US:
2329 case TGSI_OPCODE_UP4B:
2333 case TGSI_OPCODE_UP4UB:
2337 case TGSI_OPCODE_X2D:
2341 case TGSI_OPCODE_ARA:
2345 case TGSI_OPCODE_ARR:
2346 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2347 FETCH( func, *inst, 0, 0, chan_index );
2348 emit_rnd( func, 0, 0 );
2349 emit_f2it( func, 0 );
2350 STORE( func, *inst, 0, 0, chan_index );
2354 case TGSI_OPCODE_BRA:
2358 case TGSI_OPCODE_CAL:
2362 case TGSI_OPCODE_RET:
2366 case TGSI_OPCODE_END:
2369 case TGSI_OPCODE_SSG:
2370 /* TGSI_OPCODE_SGN */
2371 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2372 FETCH( func, *inst, 0, 0, chan_index );
2373 emit_sgn( func, 0, 0 );
2374 STORE( func, *inst, 0, 0, chan_index );
2378 case TGSI_OPCODE_CMP:
2379 emit_cmp (func, inst);
2382 case TGSI_OPCODE_SCS:
2383 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2384 FETCH( func, *inst, 0, 0, CHAN_X );
2385 emit_cos( func, 0, 0 );
2386 STORE( func, *inst, 0, 0, CHAN_X );
2388 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2389 FETCH( func, *inst, 0, 0, CHAN_X );
2390 emit_sin( func, 0, 0 );
2391 STORE( func, *inst, 0, 0, CHAN_Y );
2393 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2397 TGSI_EXEC_TEMP_00000000_I,
2398 TGSI_EXEC_TEMP_00000000_C );
2399 STORE( func, *inst, 0, 0, CHAN_Z );
2401 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2407 STORE( func, *inst, 0, 0, CHAN_W );
2411 case TGSI_OPCODE_TXB:
2412 emit_tex( func, inst, TRUE, FALSE );
2415 case TGSI_OPCODE_NRM:
2417 case TGSI_OPCODE_NRM4:
2418 /* 3 or 4-component normalization */
2420 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2422 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
2423 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2424 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
2425 (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
2427 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2430 /* xmm0 = src.x * src.x */
2431 FETCH(func, *inst, 0, 0, CHAN_X);
2432 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2433 emit_MOV(func, 4, 0);
2435 emit_mul(func, 0, 0);
2438 /* xmm0 = xmm0 + src.y * src.y */
2439 FETCH(func, *inst, 1, 0, CHAN_Y);
2440 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2441 emit_MOV(func, 5, 1);
2443 emit_mul(func, 1, 1);
2444 emit_add(func, 0, 1);
2447 /* xmm0 = xmm0 + src.z * src.z */
2448 FETCH(func, *inst, 1, 0, CHAN_Z);
2449 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2450 emit_MOV(func, 6, 1);
2452 emit_mul(func, 1, 1);
2453 emit_add(func, 0, 1);
2457 /* xmm0 = xmm0 + src.w * src.w */
2458 FETCH(func, *inst, 1, 0, CHAN_W);
2459 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
2460 emit_MOV(func, 7, 1);
2462 emit_mul(func, 1, 1);
2463 emit_add(func, 0, 1);
2466 /* xmm1 = 1 / sqrt(xmm0) */
2467 emit_rsqrt(func, 1, 0);
2469 /* dst.x = xmm1 * src.x */
2470 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2471 emit_mul(func, 4, 1);
2472 STORE(func, *inst, 4, 0, CHAN_X);
2475 /* dst.y = xmm1 * src.y */
2476 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2477 emit_mul(func, 5, 1);
2478 STORE(func, *inst, 5, 0, CHAN_Y);
2481 /* dst.z = xmm1 * src.z */
2482 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2483 emit_mul(func, 6, 1);
2484 STORE(func, *inst, 6, 0, CHAN_Z);
2487 /* dst.w = xmm1 * src.w */
2488 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
2489 emit_mul(func, 7, 1);
2490 STORE(func, *inst, 7, 0, CHAN_W);
2495 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
2496 emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
2497 STORE(func, *inst, 0, 0, CHAN_W);
2502 case TGSI_OPCODE_DIV:
2506 case TGSI_OPCODE_DP2:
2507 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2508 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2509 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2510 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2511 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2512 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2513 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2514 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2515 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2519 case TGSI_OPCODE_TXL:
2520 emit_tex( func, inst, TRUE, FALSE );
2523 case TGSI_OPCODE_TXP:
2524 emit_tex( func, inst, FALSE, TRUE );
2527 case TGSI_OPCODE_BRK:
2531 case TGSI_OPCODE_IF:
2535 case TGSI_OPCODE_BGNFOR:
2539 case TGSI_OPCODE_REP:
2543 case TGSI_OPCODE_ELSE:
2547 case TGSI_OPCODE_ENDIF:
2551 case TGSI_OPCODE_ENDFOR:
2555 case TGSI_OPCODE_ENDREP:
2559 case TGSI_OPCODE_PUSHA:
2563 case TGSI_OPCODE_POPA:
2567 case TGSI_OPCODE_CEIL:
2571 case TGSI_OPCODE_I2F:
2575 case TGSI_OPCODE_NOT:
2579 case TGSI_OPCODE_TRUNC:
2580 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2581 FETCH( func, *inst, 0, 0, chan_index );
2582 emit_f2it( func, 0 );
2583 emit_i2f( func, 0 );
2584 STORE( func, *inst, 0, 0, chan_index );
2588 case TGSI_OPCODE_SHL:
2592 case TGSI_OPCODE_SHR:
2596 case TGSI_OPCODE_AND:
2600 case TGSI_OPCODE_OR:
2604 case TGSI_OPCODE_MOD:
2608 case TGSI_OPCODE_XOR:
2612 case TGSI_OPCODE_SAD:
2616 case TGSI_OPCODE_TXF:
2620 case TGSI_OPCODE_TXQ:
2624 case TGSI_OPCODE_CONT:
2628 case TGSI_OPCODE_EMIT:
2632 case TGSI_OPCODE_ENDPRIM:
/*
 * Emit interpolation code for one shader declaration.
 *
 * Only declarations whose file is TGSI_FILE_INPUT generate code.  For every
 * register index in [DeclarationRange.First, DeclarationRange.Last] and
 * every channel whose bit is set in Declaration.UsageMask, the attribute
 * value is computed according to Declaration.Interpolate and then passed to
 * emit_inputs() for register i, channel j.
 *
 * func: x86 function the code is emitted into.
 * decl: the parsed declaration to translate.
 */
2645 struct x86_function *func,
2646 struct tgsi_full_declaration *decl )
2648 if( decl->Declaration.File == TGSI_FILE_INPUT ) {
/* Register range and per-channel usage mask are taken from the declaration. */
2649 unsigned first, last, mask;
2652 first = decl->DeclarationRange.First;
2653 last = decl->DeclarationRange.Last;
2654 mask = decl->Declaration.UsageMask;
2656 for( i = first; i <= last; i++ ) {
2657 for( j = 0; j < NUM_CHANNELS; j++ ) {
/* Channels whose usage-mask bit is clear are skipped. */
2658 if( mask & (1 << j) ) {
2659 switch( decl->Declaration.Interpolate ) {
2660 case TGSI_INTERPOLATE_CONSTANT:
/* Constant: the a0 coefficient is passed to emit_inputs() unchanged. */
2661 emit_coef_a0( func, 0, i, j );
2662 emit_inputs( func, 0, i, j );
2665 case TGSI_INTERPOLATE_LINEAR:
/*
 * Linear: a0 + x * dadx + y * dady.  x and y are fetched with
 * emit_tempf( ..., 0, TGSI_SWIZZLE_X / TGSI_SWIZZLE_Y ); the sum is
 * accumulated in register index 0 (presumably xmm0).
 */
2666 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2667 emit_coef_dadx( func, 1, i, j );
2668 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2669 emit_coef_dady( func, 3, i, j );
2670 emit_mul( func, 0, 1 ); /* x * dadx */
2671 emit_coef_a0( func, 4, i, j );
2672 emit_mul( func, 2, 3 ); /* y * dady */
2673 emit_add( func, 0, 4 ); /* x * dadx + a0 */
2674 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2675 emit_inputs( func, 0, i, j );
2678 case TGSI_INTERPOLATE_PERSPECTIVE:
/*
 * Perspective: the same linear expression, then multiplied by 1/w.
 * w is fetched with emit_tempf( ..., 0, TGSI_SWIZZLE_W ) and inverted
 * with emit_rcp().
 */
2679 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2680 emit_coef_dadx( func, 1, i, j );
2681 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2682 emit_coef_dady( func, 3, i, j );
2683 emit_mul( func, 0, 1 ); /* x * dadx */
2684 emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
2685 emit_coef_a0( func, 5, i, j );
2686 emit_rcp( func, 4, 4 ); /* 1.0 / w */
2687 emit_mul( func, 2, 3 ); /* y * dady */
2688 emit_add( func, 0, 5 ); /* x * dadx + a0 */
2689 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2690 emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2691 emit_inputs( func, 0, i, j );
/**
 * Emit x86/SSE code that converts attribute data from array-of-structures
 * (AoS) layout to structure-of-arrays (SoA) layout.
 *
 * Each iteration of the generated loop reads 16 bytes from each of four
 * source records lying `stride` bytes apart, transposes the resulting 4x4
 * block of 32-bit values and writes four 16-byte rows (64 bytes) into the
 * Inputs member of the tgsi_exec_machine.  It then advances the source by
 * 16 bytes and the destination by 64 bytes.
 *
 * The arg_* parameters are argument indices handed to x86_fn_arg() to find
 * the AoS pointer, the machine pointer, the attribute count and the record
 * stride among the generated function's arguments.
 *
 * NOTE(review): the emitted loop decrements the count before testing it, so
 * its body always runs at least once; presumably the count is never zero --
 * confirm against callers.
 */
2704 static void aos_to_soa( struct x86_function *func,
/*
 * Register assignment used by the generated code:
 * EAX = SoA destination, EBX = AoS source, ECX = remaining count,
 * EDX = stride.
 */
2710 struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2711 struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2712 struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2713 struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2718 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
/*
 * Load the function arguments.  The SoA pointer is the machine pointer
 * plus the offset of its Inputs member.
 */
2720 x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
2721 x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
2722 x86_lea( func, soa_input,
2723 x86_make_disp( soa_input,
2724 Offset(struct tgsi_exec_machine, Inputs) ) );
2725 x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2726 x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
2729 inner_loop = x86_get_label( func );
/*
 * The source pointer is saved while it steps through the four records.
 * xmm0/xmm1 collect bytes 0..7 and xmm3/xmm4 bytes 8..15 of records
 * 0..3: the low half of a register is filled first, then the high half.
 */
2731 x86_push( func, aos_input );
2732 sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2733 sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2734 x86_add( func, aos_input, stride );
2735 sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2736 sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2737 x86_add( func, aos_input, stride );
2738 sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2739 sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2740 x86_add( func, aos_input, stride );
2741 sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2742 sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2743 x86_pop( func, aos_input );
/*
 * Transpose.  Shuffle mask 0x88 picks elements 0 and 2 of each operand and
 * 0xdd picks elements 1 and 3, so xmm0, xmm2, xmm3 and xmm5 end up holding
 * the first, second, third and fourth component of the four records.
 */
2745 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2746 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2747 sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2748 sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2749 sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2750 sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
/* Store the four component rows at 16-byte intervals (unaligned stores). */
2752 sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2753 sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2754 sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2755 sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2757 /* Advance to next input */
2758 x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2759 x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2761 /* while --num_inputs */
2762 x86_dec( func, num_inputs );
2763 x86_jcc( func, cc_NE, inner_loop );
2766 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
/**
 * Emit x86/SSE code that converts attribute data from structure-of-arrays
 * (SoA) layout back to array-of-structures (AoS) layout.
 *
 * Each iteration of the generated loop reads four 16-byte rows (64 bytes)
 * from the Outputs member of the tgsi_exec_machine, interleaves them and
 * writes 16 bytes to each of four destination records lying `stride` bytes
 * apart.  It then advances the destination by 16 bytes and the source by
 * 64 bytes.
 *
 * The arg_* parameters are argument indices handed to x86_fn_arg() to find
 * the AoS pointer, the machine pointer, the attribute count and the record
 * stride among the generated function's arguments.
 *
 * NOTE(review): the emitted loop decrements the count before testing it, so
 * its body always runs at least once; presumably the count is never zero --
 * confirm against callers.
 */
2769 static void soa_to_aos( struct x86_function *func,
/*
 * Register assignment used by the generated code:
 * EAX = SoA source, EBX = AoS destination, ECX = remaining count,
 * EDX = scratch (holds the stride).
 */
2775 struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
2776 struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
2777 struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
2778 struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
2782 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
/*
 * Load the function arguments.  The SoA pointer is the machine pointer
 * plus the offset of its Outputs member.
 */
2784 x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
2785 x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
2786 x86_lea( func, soa_output,
2787 x86_make_disp( soa_output,
2788 Offset(struct tgsi_exec_machine, Outputs) ) );
2789 x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
2792 inner_loop = x86_get_label( func );
/* Load the four component rows of the current attribute. */
2794 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2795 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2796 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2797 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
/*
 * Interleave rows pairwise.  Afterwards xmm0/xmm2 hold the first two
 * components of records 0-1 / 2-3, and xmm3/xmm5 hold the last two
 * components of records 0-1 / 2-3.
 */
2799 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2800 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2801 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2802 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2803 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2804 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
/*
 * The stride is reloaded from the argument on every iteration.  The
 * destination pointer is saved while it steps through the four records;
 * each record receives 8 bytes at offset 0 and 8 bytes at offset 8.
 */
2806 x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
2807 x86_push( func, aos_output );
2808 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2809 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2810 x86_add( func, aos_output, temp );
2811 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2812 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2813 x86_add( func, aos_output, temp );
2814 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2815 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2816 x86_add( func, aos_output, temp );
2817 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2818 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2819 x86_pop( func, aos_output );
2821 /* Advance to next output */
2822 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2823 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2825 /* while --num_outputs */
2826 x86_dec( func, num_outputs );
2827 x86_jcc( func, cc_NE, inner_loop );
2830 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2834 * Translate a TGSI vertex/fragment shader to SSE2 code.
2835 * Slightly different things are done for vertex vs. fragment shaders.
2837 * \param tokens the TGSI input shader
2838 * \param func the output SSE code/function
2839 * \param immediates buffer to place immediates, later passed to SSE func
2840 * \return 1 for success, 0 if translation failed
2844 const struct tgsi_token *tokens,
2845 struct x86_function *func,
2846 float (*immediates)[4],
2847 boolean do_swizzles )
/*
 * The token stream is walked with the tgsi_parse_* iterator.  Declarations
 * are only processed when the processor is TGSI_PROCESSOR_FRAGMENT,
 * instructions go through emit_instruction(), and immediates are copied
 * into immediates[].  The walk ends at the end of the tokens or as soon as
 * `ok` becomes zero.
 */
2849 struct tgsi_parse_context parse;
2851 uint num_immediates = 0;
/* presumably rewinds the emit cursor to the start of the code buffer -- TODO confirm */
2855 func->csr = func->store;
2857 tgsi_parse_init( &parse, tokens );
2859 /* Can't just use EDI, EBX without save/restoring them:
2861 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2862 x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
2865 * Different function args for vertex/fragment shaders:
2867 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2873 6 ); /* input_stride */
2879 x86_fn_arg( func, 1 ) );
2883 x86_fn_arg( func, 2 ) );
2886 get_immediate_base(),
2887 x86_fn_arg( func, 3 ) );
/*
 * Fragment shaders additionally reference function argument 4 and the
 * Samplers member of the tgsi_exec_machine.
 */
2889 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2893 x86_fn_arg( func, 4 ) );
2899 x86_make_disp( get_machine_base(),
2900 Offset( struct tgsi_exec_machine, Samplers ) ) );
/* Translate token by token; a zero `ok` terminates the loop early. */
2903 while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
2904 tgsi_parse_token( &parse );
2906 switch( parse.FullToken.Token.Type ) {
2907 case TGSI_TOKEN_TYPE_DECLARATION:
/* Declarations are only handled for fragment shaders. */
2908 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2911 &parse.FullToken.FullDeclaration );
2915 case TGSI_TOKEN_TYPE_INSTRUCTION:
/* `ok` takes the result of emit_instruction(); see the loop condition. */
2916 ok = emit_instruction(
2918 &parse.FullToken.FullInstruction );
/* Diagnostic naming the opcode and the shader type. */
2921 uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
2922 debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n",
2924 tgsi_get_opcode_name(opcode),
2925 parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
2926 "vertex shader" : "fragment shader");
2930 case TGSI_TOKEN_TYPE_IMMEDIATE:
2931 /* simply copy the immediate values into the next immediates[] slot */
/* presumably NrTokens also counts the immediate's header token, hence the - 1 -- TODO confirm */
2933 const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
/*
 * NOTE(review): the slot index is only checked by assert(); confirm that
 * callers size immediates[] for TGSI_EXEC_NUM_IMMEDIATES entries.
 */
2936 assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
2937 for( i = 0; i < size; i++ ) {
2938 immediates[num_immediates][i] =
2939 parse.FullToken.FullImmediate.u[i].Float;
/* NOTE(review): prints all four components although only `size` of them were written by the loop above. */
2942 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2944 immediates[num_immediates][0],
2945 immediates[num_immediates][1],
2946 immediates[num_immediates][2],
2947 immediates[num_immediates][3]);
/* Vertex shaders: arguments 8 and 9 are the output count and the output stride. */
2959 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2964 8, /* num_outputs */
2965 9 ); /* output_stride */
2968 /* Can't just use EBX, EDI without save/restoring them:
2970 x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
2971 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2975 tgsi_parse_free( &parse );
2980 #endif /* PIPE_ARCH_X86 */