2 * Mesa 3-D graphics library
5 * Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code
27 * using the rtasm runtime assembler. Based on the old
28 * t_vb_arb_program_sse.c
32 #include "pipe/p_util.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "tgsi/util/tgsi_parse.h"
35 #include "tgsi/util/tgsi_util.h"
36 #include "tgsi/exec/tgsi_exec.h"
37 #include "tgsi/util/tgsi_dump.h"
40 #include "draw_vs_aos.h"
42 #include "rtasm/rtasm_x86sse.h"
/* Debug names for the TGSI register files, indexed by file enum.
 * NOTE(review): the array initializer lines were lost in extraction —
 * entries not visible here; confirm against the original file.
 */
47 static const char *files[] =
/* Compare two x86 register/operand descriptors for equality.
 * NOTE(review): only the first comparison term is visible in this
 * extract; the remaining field comparisons were dropped — restore from
 * the original source.
 */
60 static INLINE boolean eq( struct x86_reg a,
63 return (a.file == b.file &&
/* Return an x86 register (cp->temp_EBP) holding the base pointer for
 * one of the indirect register files (immediates / constants / attribs).
 * The pointer is lazily loaded from the aos_machine struct and cached;
 * cp->ebp records which value is currently resident so repeated
 * requests for the same file skip the reload.
 * NOTE(review): the switch skeleton and the cache-update/return lines
 * were dropped by extraction — only the offset assignments survive.
 */
69 struct x86_reg aos_get_x86( struct aos_compilation *cp,
72 if (cp->ebp != value) {
77 offset = Offset(struct aos_machine, immediates);
80 offset = Offset(struct aos_machine, constants);
83 offset = Offset(struct aos_machine, attrib);
90 x86_mov(cp->func, cp->temp_EBP,
91 x86_make_disp(cp->machine_EDX, offset));
92 /* x86_deref(x86_make_disp(cp->machine_EDX, offset))); */
101 static struct x86_reg get_reg_ptr(struct aos_compilation *cp,
105 struct x86_reg ptr = cp->machine_EDX;
108 case TGSI_FILE_INPUT:
109 return x86_make_disp(ptr, Offset(struct aos_machine, input[idx]));
111 case TGSI_FILE_OUTPUT:
112 return x86_make_disp(ptr, Offset(struct aos_machine, output[idx]));
114 case TGSI_FILE_TEMPORARY:
115 return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx]));
117 case AOS_FILE_INTERNAL:
118 return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx]));
120 case TGSI_FILE_IMMEDIATE:
121 return x86_make_disp(aos_get_x86(cp, X86_IMMEDIATES), idx * 4 * sizeof(float));
123 case TGSI_FILE_CONSTANT:
124 return x86_make_disp(aos_get_x86(cp, X86_CONSTANTS), idx * 4 * sizeof(float));
127 ERROR(cp, "unknown reg file");
128 return x86_make_reg(0,0);
/* x87 FPU control-word layout: exception mask bits, precision-control
 * field (bits 8-9) and rounding-control field (bits 10-11).  Used when
 * switching the FPU between round-to-nearest and round-toward-neg-inf
 * for FLR/FRC/RND emulation below.
 */
134 #define X87_CW_EXCEPTION_INV_OP (1<<0)
135 #define X87_CW_EXCEPTION_DENORM_OP (1<<1)
136 #define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2)
137 #define X87_CW_EXCEPTION_OVERFLOW (1<<3)
138 #define X87_CW_EXCEPTION_UNDERFLOW (1<<4)
139 #define X87_CW_EXCEPTION_PRECISION (1<<5)
140 #define X87_CW_PRECISION_SINGLE (0<<8)
141 #define X87_CW_PRECISION_RESERVED (1<<8)
142 #define X87_CW_PRECISION_DOUBLE (2<<8)
143 #define X87_CW_PRECISION_DOUBLE_EXT (3<<8)
144 #define X87_CW_PRECISION_MASK (3<<8)
145 #define X87_CW_ROUND_NEAREST (0<<10)
146 #define X87_CW_ROUND_DOWN (1<<10)
147 #define X87_CW_ROUND_UP (2<<10)
148 #define X87_CW_ROUND_ZERO (3<<10)
149 #define X87_CW_ROUND_MASK (3<<10)
150 #define X87_CW_INFINITY (1<<12)
155 static void spill( struct aos_compilation *cp, unsigned idx )
157 if (!cp->xmm[idx].dirty ||
158 (cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */
159 cp->xmm[idx].file != TGSI_FILE_OUTPUT &&
160 cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) {
161 ERROR(cp, "invalid spill");
165 struct x86_reg oldval = get_reg_ptr(cp,
169 if (0) debug_printf("\nspill %s[%d]",
170 files[cp->xmm[idx].file],
173 assert(cp->xmm[idx].dirty);
174 sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx));
175 cp->xmm[idx].dirty = 0;
180 static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
183 if (reg.file != file_XMM ||
184 cp->xmm[reg.idx].file != TGSI_FILE_NULL)
186 struct x86_reg tmp = aos_get_xmm_reg(cp);
187 sse_movaps(cp->func, tmp, reg);
191 cp->xmm[reg.idx].last_used = cp->insn_counter;
195 static struct x86_reg get_xmm( struct aos_compilation *cp,
198 if (reg.file != file_XMM)
200 struct x86_reg tmp = aos_get_xmm_reg(cp);
201 sse_movaps(cp->func, tmp, reg);
205 cp->xmm[reg.idx].last_used = cp->insn_counter;
210 /* Allocate an empty xmm register, either as a temporary or later to
211 * "adopt" as a shader reg.
213 struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp )
217 boolean found = FALSE;
219 for (i = 0; i < 8; i++)
220 if (cp->xmm[i].last_used != cp->insn_counter &&
221 cp->xmm[i].file == TGSI_FILE_NULL) {
227 for (i = 0; i < 8; i++)
228 if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
232 /* Need to write out the old value?
234 if (cp->xmm[oldest].dirty)
237 assert(cp->xmm[oldest].last_used != cp->insn_counter);
239 cp->xmm[oldest].file = TGSI_FILE_NULL;
240 cp->xmm[oldest].idx = 0;
241 cp->xmm[oldest].dirty = 0;
242 cp->xmm[oldest].last_used = cp->insn_counter;
243 return x86_make_reg(file_XMM, oldest);
246 void aos_release_xmm_reg( struct aos_compilation *cp,
249 cp->xmm[idx].file = TGSI_FILE_NULL;
250 cp->xmm[idx].idx = 0;
251 cp->xmm[idx].dirty = 0;
252 cp->xmm[idx].last_used = 0;
258 /* Mark an xmm reg as holding the current copy of a shader reg.
260 void aos_adopt_xmm_reg( struct aos_compilation *cp,
268 if (reg.file != file_XMM) {
274 /* If any xmm reg thinks it holds this shader reg, break the
277 for (i = 0; i < 8; i++) {
278 if (cp->xmm[i].file == file &&
279 cp->xmm[i].idx == idx)
281 /* If an xmm reg is already holding this shader reg, take into account its
284 dirty |= cp->xmm[i].dirty;
285 aos_release_xmm_reg(cp, i);
289 cp->xmm[reg.idx].file = file;
290 cp->xmm[reg.idx].idx = idx;
291 cp->xmm[reg.idx].dirty = dirty;
292 cp->xmm[reg.idx].last_used = cp->insn_counter;
296 /* Return a pointer to the in-memory copy of the reg, making sure it is uptodate.
298 static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp,
304 /* Ensure the in-memory copy of this reg is up-to-date
306 for (i = 0; i < 8; i++) {
307 if (cp->xmm[i].file == file &&
308 cp->xmm[i].idx == idx &&
314 return get_reg_ptr( cp, file, idx );
318 /* As above, but return a pointer. Note - this pointer may alias
319 * those returned by get_arg_ptr().
321 static struct x86_reg get_dst_ptr( struct aos_compilation *cp,
322 const struct tgsi_full_dst_register *dst )
324 unsigned file = dst->DstRegister.File;
325 unsigned idx = dst->DstRegister.Index;
329 /* Ensure in-memory copy of this reg is up-to-date and invalidate
332 for (i = 0; i < 8; i++) {
333 if (cp->xmm[i].file == file &&
334 cp->xmm[i].idx == idx)
336 if (cp->xmm[i].dirty)
339 aos_release_xmm_reg(cp, i);
343 return get_reg_ptr( cp, file, idx );
350 /* Return an XMM reg if the argument is resident, otherwise return a
351 * base+offset pointer to the saved value.
353 struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,
359 for (i = 0; i < 8; i++) {
360 if (cp->xmm[i].file == file &&
361 cp->xmm[i].idx == idx)
363 cp->xmm[i].last_used = cp->insn_counter;
364 return x86_make_reg(file_XMM, i);
368 /* If not found in the XMM register file, return an indirect
369 * reference to the in-memory copy:
371 return get_reg_ptr( cp, file, idx );
376 static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp,
380 struct x86_reg reg = get_xmm( cp,
381 aos_get_shader_reg( cp, file, idx ) );
383 aos_adopt_xmm_reg( cp,
394 struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp,
397 return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm );
401 struct x86_reg aos_get_internal( struct aos_compilation *cp,
404 return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm );
411 /* Emulate pshufd insn in regular SSE, if necessary:
413 static void emit_pshufd( struct aos_compilation *cp,
419 sse2_pshufd(cp->func, dst, arg0, shuf);
423 sse_movaps(cp->func, dst, arg0);
425 sse_shufps(cp->func, dst, dst, shuf);
429 /* load masks (pack into negs??)
430 * pshufd - shuffle according to writemask
435 static boolean mask_write( struct aos_compilation *cp,
437 struct x86_reg result,
440 struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
441 struct x86_reg tmp = aos_get_xmm_reg(cp);
443 emit_pshufd(cp, tmp, imm_swz,
444 SHUF((mask & 1) ? 2 : 3,
447 (mask & 8) ? 2 : 3));
449 sse_andps(cp->func, dst, tmp);
450 sse_andnps(cp->func, tmp, result);
451 sse_orps(cp->func, dst, tmp);
453 aos_release_xmm_reg(cp, tmp.idx);
460 /* Helper for writemask:
462 static boolean emit_shuf_copy2( struct aos_compilation *cp,
468 struct x86_reg tmp = aos_get_xmm_reg(cp);
470 emit_pshufd(cp, dst, arg1, shuf);
471 emit_pshufd(cp, tmp, arg0, shuf);
472 sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W));
473 emit_pshufd(cp, dst, dst, shuf);
475 aos_release_xmm_reg(cp, tmp.idx);
481 #define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))
484 /* Locate a source register and perform any required (simple) swizzle.
486 * Just fail on complex swizzles at this point.
488 static struct x86_reg fetch_src( struct aos_compilation *cp,
489 const struct tgsi_full_src_register *src )
491 struct x86_reg arg0 = aos_get_shader_reg(cp,
492 src->SrcRegister.File,
493 src->SrcRegister.Index);
499 for (i = 0; i < 4; i++) {
500 unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, i );
501 unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i );
504 case TGSI_EXTSWIZZLE_ZERO:
505 case TGSI_EXTSWIZZLE_ONE:
506 ERROR(cp, "not supporting full swizzles yet in tgsi_aos_sse2");
510 swz |= (swizzle & 0x3) << (i * 2);
515 case TGSI_UTIL_SIGN_TOGGLE:
519 case TGSI_UTIL_SIGN_KEEP:
522 case TGSI_UTIL_SIGN_CLEAR:
527 ERROR(cp, "unsupported sign-mode");
532 if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) {
533 struct x86_reg dst = aos_get_xmm_reg(cp);
535 if (swz != SSE_SWIZZLE_NOOP)
536 emit_pshufd(cp, dst, arg0, swz);
538 sse_movaps(cp->func, dst, arg0);
540 if (negs && negs != 0xf) {
541 struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
542 struct x86_reg tmp = aos_get_xmm_reg(cp);
545 * Use neg as arg to pshufd
548 emit_pshufd(cp, tmp, imm_swz,
549 SHUF((negs & 1) ? 1 : 0,
552 (negs & 8) ? 1 : 0));
553 sse_mulps(cp->func, dst, tmp);
555 aos_release_xmm_reg(cp, tmp.idx);
558 struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS);
559 sse_mulps(cp->func, dst, imm_negs);
563 if (abs && abs != 0xf) {
564 ERROR(cp, "unsupported partial abs");
567 struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
568 struct x86_reg tmp = aos_get_xmm_reg(cp);
570 sse_movaps(cp->func, tmp, dst);
571 sse_mulps(cp->func, tmp, neg);
572 sse_maxps(cp->func, dst, tmp);
574 aos_release_xmm_reg(cp, tmp.idx);
583 static void x87_fld_src( struct aos_compilation *cp,
584 const struct tgsi_full_src_register *src,
587 struct x86_reg arg0 = aos_get_shader_reg_ptr(cp,
588 src->SrcRegister.File,
589 src->SrcRegister.Index);
591 unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, channel );
592 unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel );
595 case TGSI_EXTSWIZZLE_ZERO:
596 x87_fldz( cp->func );
599 case TGSI_EXTSWIZZLE_ONE:
600 x87_fld1( cp->func );
604 x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );
610 case TGSI_UTIL_SIGN_TOGGLE:
613 x87_fchs( cp->func );
616 case TGSI_UTIL_SIGN_KEEP:
619 case TGSI_UTIL_SIGN_CLEAR:
620 x87_fabs( cp->func );
623 case TGSI_UTIL_SIGN_SET:
624 x87_fabs( cp->func );
625 x87_fchs( cp->func );
629 ERROR(cp, "unsupported sign-mode");
639 /* Used to implement write masking. This and most of the other instructions
640 * here would be easier to implement if there had been a translation
641 * to a 2 argument format (dst/arg0, arg1) at the shader level before
642 * attempting to translate to x86/sse code.
644 static void store_dest( struct aos_compilation *cp,
645 const struct tgsi_full_dst_register *reg,
646 struct x86_reg result )
650 switch (reg->DstRegister.WriteMask) {
654 case TGSI_WRITEMASK_XYZW:
655 aos_adopt_xmm_reg(cp,
656 get_xmm_writable(cp, result),
657 reg->DstRegister.File,
658 reg->DstRegister.Index,
665 dst = aos_get_shader_reg_xmm(cp,
666 reg->DstRegister.File,
667 reg->DstRegister.Index);
669 switch (reg->DstRegister.WriteMask) {
670 case TGSI_WRITEMASK_X:
671 sse_movss(cp->func, dst, get_xmm(cp, result));
674 case TGSI_WRITEMASK_ZW:
675 sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W));
678 case TGSI_WRITEMASK_XY:
679 result = get_xmm_writable(cp, result);
680 sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W));
684 case TGSI_WRITEMASK_YZW:
685 result = get_xmm_writable(cp, result);
686 sse_movss(cp->func, result, dst);
691 mask_write(cp, dst, result, reg->DstRegister.WriteMask);
695 aos_adopt_xmm_reg(cp,
697 reg->DstRegister.File,
698 reg->DstRegister.Index,
703 static void inject_scalar( struct aos_compilation *cp,
705 struct x86_reg result,
708 sse_shufps(cp->func, dst, dst, swizzle);
709 sse_movss(cp->func, dst, result);
710 sse_shufps(cp->func, dst, dst, swizzle);
714 static void store_scalar_dest( struct aos_compilation *cp,
715 const struct tgsi_full_dst_register *reg,
716 struct x86_reg result )
718 unsigned writemask = reg->DstRegister.WriteMask;
721 if (writemask != TGSI_WRITEMASK_X &&
722 writemask != TGSI_WRITEMASK_Y &&
723 writemask != TGSI_WRITEMASK_Z &&
724 writemask != TGSI_WRITEMASK_W &&
727 result = get_xmm_writable(cp, result); /* already true, right? */
728 sse_shufps(cp->func, result, result, SHUF(X,X,X,X));
729 store_dest(cp, reg, result);
733 result = get_xmm(cp, result);
734 dst = aos_get_shader_reg_xmm(cp,
735 reg->DstRegister.File,
736 reg->DstRegister.Index);
740 switch (reg->DstRegister.WriteMask) {
741 case TGSI_WRITEMASK_X:
742 sse_movss(cp->func, dst, result);
745 case TGSI_WRITEMASK_Y:
746 inject_scalar(cp, dst, result, SHUF(Y, X, Z, W));
749 case TGSI_WRITEMASK_Z:
750 inject_scalar(cp, dst, result, SHUF(Z, Y, X, W));
753 case TGSI_WRITEMASK_W:
754 inject_scalar(cp, dst, result, SHUF(W, Y, Z, X));
761 aos_adopt_xmm_reg(cp,
763 reg->DstRegister.File,
764 reg->DstRegister.Index,
770 static void x87_fst_or_nop( struct x86_function *func,
775 assert(ptr.file == file_REG32);
776 if (writemask & (1<<channel))
777 x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) );
780 static void x87_fstp_or_pop( struct x86_function *func,
785 assert(ptr.file == file_REG32);
786 if (writemask & (1<<channel))
787 x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) );
789 x87_fstp( func, x86_make_reg( file_x87, 0 ));
796 static void x87_fstp_dest4( struct aos_compilation *cp,
797 const struct tgsi_full_dst_register *dst )
799 struct x86_reg ptr = get_dst_ptr(cp, dst);
800 unsigned writemask = dst->DstRegister.WriteMask;
802 x87_fst_or_nop(cp->func, writemask, 0, ptr);
803 x87_fst_or_nop(cp->func, writemask, 1, ptr);
804 x87_fst_or_nop(cp->func, writemask, 2, ptr);
805 x87_fstp_or_pop(cp->func, writemask, 3, ptr);
808 /* Save current x87 state and put it into single precision mode.
810 static void save_fpu_state( struct aos_compilation *cp )
812 x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX,
813 Offset(struct aos_machine, fpu_restore)));
816 static void restore_fpu_state( struct aos_compilation *cp )
818 x87_fnclex(cp->func);
819 x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
820 Offset(struct aos_machine, fpu_restore)));
823 static void set_fpu_round_neg_inf( struct aos_compilation *cp )
825 if (cp->fpucntl != FPU_RND_NEG) {
826 cp->fpucntl = FPU_RND_NEG;
827 x87_fnclex(cp->func);
828 x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
829 Offset(struct aos_machine, fpu_rnd_neg_inf)));
833 static void set_fpu_round_nearest( struct aos_compilation *cp )
835 if (cp->fpucntl != FPU_RND_NEAREST) {
836 cp->fpucntl = FPU_RND_NEAREST;
837 x87_fnclex(cp->func);
838 x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
839 Offset(struct aos_machine, fpu_rnd_nearest)));
844 static void x87_emit_ex2( struct aos_compilation *cp )
846 struct x86_reg st0 = x86_make_reg(file_x87, 0);
847 struct x86_reg st1 = x86_make_reg(file_x87, 1);
848 int stack = cp->func->x87_stack;
850 // set_fpu_round_neg_inf( cp );
852 x87_fld(cp->func, st0); /* a a */
853 x87_fprndint( cp->func ); /* int(a) a*/
854 x87_fsubr(cp->func, st1, st0); /* int(a) frc(a) */
855 x87_fxch(cp->func, st1); /* frc(a) int(a) */
856 x87_f2xm1(cp->func); /* (2^frc(a))-1 int(a) */
857 x87_fld1(cp->func); /* 1 (2^frc(a))-1 int(a) */
858 x87_faddp(cp->func, st1); /* 2^frac(a) int(a) */
859 x87_fscale(cp->func); /* (2^frac(a)*2^int(int(a))) int(a) */
861 x87_fstp(cp->func, st1); /* 2^a */
863 assert( stack == cp->func->x87_stack);
867 static void PIPE_CDECL print_reg( const char *msg,
870 debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]);
873 static void emit_print( struct aos_compilation *cp,
874 const char *message, /* must point to a static string! */
878 struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
879 struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx );
882 /* There shouldn't be anything on the x87 stack. Can add this
883 * capacity later if need be.
885 assert(cp->func->x87_stack == 0);
887 /* For absolute correctness, need to spill/invalidate all XMM regs
888 * too. We're obviously not concerned about performance on this
889 * debug path, so here goes:
891 for (i = 0; i < 8; i++) {
892 if (cp->xmm[i].dirty)
895 aos_release_xmm_reg(cp, i);
898 /* Push caller-save (ie scratch) regs.
900 x86_cdecl_caller_push_regs( cp->func );
903 /* Push the arguments:
905 x86_lea( cp->func, ecx, arg );
906 x86_push( cp->func, ecx );
907 x86_push_imm32( cp->func, (int)message );
909 /* Call the helper. Could call debug_printf directly, but
910 * print_reg is a nice place to put a breakpoint if need be.
912 x86_mov_reg_imm( cp->func, ecx, (int)print_reg );
913 x86_call( cp->func, ecx );
914 x86_pop( cp->func, ecx );
915 x86_pop( cp->func, ecx );
917 /* Pop caller-save regs
919 x86_cdecl_caller_pop_regs( cp->func );
926 * The traditional instructions. All operate on internal registers
927 * and ignore write masks and swizzling issues.
930 static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
932 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
933 struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
934 struct x86_reg tmp = aos_get_xmm_reg(cp);
936 sse_movaps(cp->func, tmp, arg0);
937 sse_mulps(cp->func, tmp, neg);
938 sse_maxps(cp->func, tmp, arg0);
940 store_dest(cp, &op->FullDstRegisters[0], tmp);
944 static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
946 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
947 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
948 struct x86_reg dst = get_xmm_writable(cp, arg0);
950 sse_addps(cp->func, dst, arg1);
952 store_dest(cp, &op->FullDstRegisters[0], dst);
956 static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
958 x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
960 x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
964 /* The dotproduct instructions don't really do that well in sse:
965 * XXX: produces wrong results -- disabled.
967 static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
969 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
970 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
971 struct x86_reg tmp = aos_get_xmm_reg(cp);
972 struct x86_reg dst = get_xmm_writable(cp, arg0);
974 sse_mulps(cp->func, dst, arg1);
975 /* Now the hard bit: sum the first 3 values:
977 sse_movhlps(cp->func, tmp, dst);
978 sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
979 emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
980 sse_addss(cp->func, dst, tmp);
982 aos_release_xmm_reg(cp, tmp.idx);
983 store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
987 static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
989 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
990 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
991 struct x86_reg tmp = aos_get_xmm_reg(cp);
992 struct x86_reg dst = get_xmm_writable(cp, arg0);
994 sse_mulps(cp->func, dst, arg1);
996 /* Now the hard bit: sum the values:
998 sse_movhlps(cp->func, tmp, dst);
999 sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
1000 emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
1001 sse_addss(cp->func, dst, tmp);
1003 aos_release_xmm_reg(cp, tmp.idx);
1004 store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
1008 static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1010 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1011 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1012 struct x86_reg tmp = aos_get_xmm_reg(cp);
1013 struct x86_reg dst = get_xmm_writable(cp, arg0);
1015 sse_mulps(cp->func, dst, arg1);
1017 /* Now the hard bit: sum the values (from DP3):
1019 sse_movhlps(cp->func, tmp, dst);
1020 sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
1021 emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
1022 sse_addss(cp->func, dst, tmp);
1023 emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
1024 sse_addss(cp->func, dst, tmp);
1026 aos_release_xmm_reg(cp, tmp.idx);
1027 store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
1031 static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1033 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1034 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1035 struct x86_reg dst = aos_get_xmm_reg(cp);
1036 struct x86_reg tmp = aos_get_xmm_reg(cp);
1037 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1039 /* dst[0] = 1.0 * 1.0F; */
1040 /* dst[1] = arg0[1] * arg1[1]; */
1041 /* dst[2] = arg0[2] * 1.0; */
1042 /* dst[3] = 1.0 * arg1[3]; */
1044 emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));
1045 emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));
1046 sse_mulps(cp->func, dst, tmp);
1048 aos_release_xmm_reg(cp, tmp.idx);
1049 store_dest(cp, &op->FullDstRegisters[0], dst);
1053 static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1055 x87_fld1(cp->func); /* 1 */
1056 x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0 1 */
1057 x87_fyl2x(cp->func); /* log2(a0) */
1058 x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
1063 static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1065 x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
1067 x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
1072 static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1074 struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
1075 unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
1078 set_fpu_round_neg_inf( cp );
1080 /* Load all sources first to avoid aliasing
1082 for (i = 3; i >= 0; i--) {
1083 if (writemask & (1<<i)) {
1084 x87_fld_src(cp, &op->FullSrcRegisters[0], i);
1088 for (i = 0; i < 4; i++) {
1089 if (writemask & (1<<i)) {
1090 x87_fprndint( cp->func );
1091 x87_fstp(cp->func, x86_make_disp(dst, i*4));
1099 static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1101 struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
1102 unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
1105 set_fpu_round_nearest( cp );
1107 /* Load all sources first to avoid aliasing
1109 for (i = 3; i >= 0; i--) {
1110 if (writemask & (1<<i)) {
1111 x87_fld_src(cp, &op->FullSrcRegisters[0], i);
1115 for (i = 0; i < 4; i++) {
1116 if (writemask & (1<<i)) {
1117 x87_fprndint( cp->func );
1118 x87_fstp(cp->func, x86_make_disp(dst, i*4));
1126 static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1128 struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
1129 struct x86_reg st0 = x86_make_reg(file_x87, 0);
1130 struct x86_reg st1 = x86_make_reg(file_x87, 1);
1131 unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
1134 set_fpu_round_neg_inf( cp );
1136 /* suck all the source values onto the stack before writing out any
1137 * dst, which may alias...
1139 for (i = 3; i >= 0; i--) {
1140 if (writemask & (1<<i)) {
1141 x87_fld_src(cp, &op->FullSrcRegisters[0], i);
1145 for (i = 0; i < 4; i++) {
1146 if (writemask & (1<<i)) {
1147 x87_fld(cp->func, st0); /* a a */
1148 x87_fprndint( cp->func ); /* flr(a) a */
1149 x87_fsubp(cp->func, st1); /* frc(a) */
1150 x87_fstp(cp->func, x86_make_disp(dst, i*4));
/* LIT: implemented by calling out to a C helper at runtime.  Spills
 * all XMM state, pushes (machine, result-ptr, arg-ptr, lit_count) and
 * calls either a per-call cached function pointer from lit_info or
 * aos_do_lit directly.  Partial writemasks go via a scratch tmp and
 * store_dest afterwards.
 * NOTE(review): structural lines (braces, else-branches, returns) were
 * dropped in extraction — statement order below is as recovered.
 */
1162 static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1164 struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
1165 unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
1166 unsigned lit_count = cp->lit_count++;
1167 struct x86_reg result, arg0;
/* Flush all XMM regs so the in-memory copies the helper reads/writes
 * are current.
 */
1171 /* For absolute correctness, need to spill/invalidate all XMM regs
1174 for (i = 0; i < 8; i++) {
1175 if (cp->xmm[i].dirty)
1177 aos_release_xmm_reg(cp, i);
/* Partial writes go to a scratch area, full writes straight to dst. */
1181 if (writemask != TGSI_WRITEMASK_XYZW)
1182 result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0]));
1184 result = get_dst_ptr(cp, &op->FullDstRegisters[0]);
/* The helper needs a memory operand; copy an XMM-resident source out. */
1187 arg0 = fetch_src( cp, &op->FullSrcRegisters[0] );
1188 if (arg0.file == file_XMM) {
1189 struct x86_reg tmp = x86_make_disp(cp->machine_EDX,
1190 Offset(struct aos_machine, tmp[1]));
1191 sse_movaps( cp->func, tmp, arg0 );
1197 /* Push caller-save (ie scratch) regs.
1199 x86_cdecl_caller_push_regs( cp->func );
1201 /* Push the arguments:
1203 x86_push_imm32( cp->func, lit_count );
1205 x86_lea( cp->func, ecx, arg0 );
1206 x86_push( cp->func, ecx );
1208 x86_lea( cp->func, ecx, result );
1209 x86_push( cp->func, ecx );
1211 x86_push( cp->func, cp->machine_EDX );
/* Use the cached per-call function pointer when a slot is available,
 * otherwise fall back to the generic aos_do_lit entrypoint.
 */
1213 if (lit_count < MAX_LIT_INFO) {
1214 x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX,
1215 Offset(struct aos_machine, lit_info) +
1216 lit_count * sizeof(struct lit_info) +
1217 Offset(struct lit_info, func)));
1220 x86_mov_reg_imm( cp->func, ecx, (int)aos_do_lit );
1223 x86_call( cp->func, ecx );
1225 x86_pop( cp->func, ecx ); /* fixme... */
1226 x86_pop( cp->func, ecx );
1227 x86_pop( cp->func, ecx );
1228 x86_pop( cp->func, ecx );
1230 x86_cdecl_caller_pop_regs( cp->func );
/* Masked write: merge the scratch result into the real destination. */
1232 if (writemask != TGSI_WRITEMASK_XYZW) {
1234 &op->FullDstRegisters[0],
1235 get_xmm_writable( cp, result ) );
/* Inline x87 implementation of LIT:
 *   dst.x = 1, dst.w = 1,
 *   dst.y = max(src.x, 0),
 *   dst.z = src.x > 0 ? pow(max(src.y,0-ish), src.w) : 0.
 * Uses fcomi/fcmov for the clamps and fyl2x + x87_emit_ex2 for pow.
 * NOTE(review): stack choreography is intricate and several structural
 * lines were dropped in extraction — comments annotate the visible
 * statements only; confirm against the original before editing logic.
 */
1242 static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1244 struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);
1245 unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
1247 if (writemask & TGSI_WRITEMASK_YZ) {
1248 struct x86_reg st1 = x86_make_reg(file_x87, 1);
1249 struct x86_reg st2 = x86_make_reg(file_x87, 2);
1251 /* a1' = a1 <= 0 ? 1 : a1;
1253 x87_fldz(cp->func); /* 1 0 */
1255 x87_fld1(cp->func); /* 1 0 */
1257 /* Correct but slow due to fp exceptions generated in fyl2x - fix me.
1259 x87_fldz(cp->func); /* 1 0 */
1261 x87_fld_src(cp, &op->FullSrcRegisters[0], 1); /* a1 1 0 */
1262 x87_fcomi(cp->func, st2); /* a1 1 0 */
1263 x87_fcmovb(cp->func, st1); /* a1' 1 0 */
1264 x87_fstp(cp->func, st1); /* a1' 0 */
1265 x87_fstp(cp->func, st1); /* a1' */
1267 x87_fld_src(cp, &op->FullSrcRegisters[0], 3); /* a3 a1' */
1268 x87_fxch(cp->func, st1); /* a1' a3 */
1271 /* Compute pow(a1, a3)
1273 x87_fyl2x(cp->func); /* a3*log2(a1) */
1274 x87_emit_ex2( cp ); /* 2^(a3*log2(a1)) */
1277 /* a0' = max2(a0, 0):
1279 x87_fldz(cp->func); /* 0 r2 */
1280 x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0 0 r2 */
1281 x87_fcomi(cp->func, st1);
1282 x87_fcmovb(cp->func, st1); /* a0' 0 r2 */
1284 x87_fst_or_nop(cp->func, writemask, 1, dst); /* result[1] = a0' */
1286 x87_fcomi(cp->func, st1); /* a0' 0 r2 */
1287 x87_fcmovnbe(cp->func, st2); /* r2' 0' r2 */
1289 x87_fstp_or_pop(cp->func, writemask, 2, dst); /* 0 r2 */
1290 x87_fpop(cp->func); /* r2 */
/* X and W channels are constant 1.0. */
1294 if (writemask & TGSI_WRITEMASK_XW) {
1296 x87_fst_or_nop(cp->func, writemask, 0, dst);
1297 x87_fstp_or_pop(cp->func, writemask, 3, dst);
1306 static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1308 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1309 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1310 struct x86_reg dst = get_xmm_writable(cp, arg0);
1312 sse_maxps(cp->func, dst, arg1);
1314 store_dest(cp, &op->FullDstRegisters[0], dst);
1319 static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1321 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1322 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1323 struct x86_reg dst = get_xmm_writable(cp, arg0);
1325 sse_minps(cp->func, dst, arg1);
1327 store_dest(cp, &op->FullDstRegisters[0], dst);
1331 static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1333 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1334 struct x86_reg dst = get_xmm_writable(cp, arg0);
1336 /* potentially nothing to do */
1338 store_dest(cp, &op->FullDstRegisters[0], dst);
1342 static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1344 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1345 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1346 struct x86_reg dst = get_xmm_writable(cp, arg0);
1348 sse_mulps(cp->func, dst, arg1);
1350 store_dest(cp, &op->FullDstRegisters[0], dst);
1355 static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1357 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1358 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1359 struct x86_reg arg2 = fetch_src(cp, &op->FullSrcRegisters[2]);
1361 /* If we can't clobber old contents of arg0, get a temporary & copy
1362 * it there, then clobber it...
1364 arg0 = get_xmm_writable(cp, arg0);
1366 sse_mulps(cp->func, arg0, arg1);
1367 sse_addps(cp->func, arg0, arg2);
1368 store_dest(cp, &op->FullDstRegisters[0], arg0);
1372 /* A wrapper for powf().
1373 * Makes sure it is cdecl and operates on floats.
1375 static float PIPE_CDECL _powerf( float x, float y )
1377 return powf( x, y );
/* Really not sufficient -- need to check for conditions that could
 * generate inf/nan values, which will slow things down hugely.
 */
static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   /* x87 path: pow(a0,a1) == 2^(a1 * log2(a0)).  Only the .x channels
    * are consumed; the result is replicated by x87_fstp_dest4().
    */
   x87_fld_src(cp, &op->FullSrcRegisters[1], 0);    /* a1.x */
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);    /* a0.x a1.x */
   x87_fyl2x(cp->func);                             /* a1*log2(a0) */

   x87_emit_ex2( cp );                              /* 2^(a1*log2(a0)) */

   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);

   /* NOTE(review): an alternative path follows that calls the C helper
    * _powerf() via generated code; the construct selecting between the
    * two paths is elided in this excerpt.
    */

   /* For absolute correctness, need to spill/invalidate all XMM regs
    * before calling out to C code.
    * NOTE(review): the spill statement inside the dirty-check appears
    * to be elided here -- confirm against the full source.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].dirty)
      aos_release_xmm_reg(cp, i);
   }

   /* Push caller-save (ie scratch) regs.
    */
   x86_cdecl_caller_push_regs( cp->func );

   /* Make room for two float arguments; store src1 at esp+4 and src0
    * at esp+0 (cdecl: first argument at the lowest address). */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) );

   x87_fld_src( cp, &op->FullSrcRegisters[1], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) );
   x87_fld_src( cp, &op->FullSrcRegisters[0], 0 );
   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );

   /* Call the helper through a register; its absolute address is
    * loaded as an immediate. */
   x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf );
   x86_call( cp->func, cp->tmp_EAX );

   /* Pop the argument area and restore the scratch registers. */
   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) );

   x86_cdecl_caller_pop_regs( cp->func );

   /* Note retval on x87 stack:  (cdecl returns float in st(0), so the
    * tracked x87 depth must be bumped by hand)
    */
   cp->func->x87_stack++;

   x87_fstp_dest4( cp, &op->FullDstRegisters[0] );
}
/* Scalar reciprocal: dst = 1 / src.x, broadcast to the destination's
 * written channels by store_scalar_dest().
 */
static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg dst = aos_get_xmm_reg(cp);

   /* NOTE(review): rcpss is an SSE1 instruction; the have_sse2 gate
    * presumably reflects the rtasm helper's naming -- confirm. */
   if (cp->have_sse2) {
      sse2_rcpss(cp->func, dst, arg0);
      /* extend precision here...
       */
   }
   else {
      /* Fallback: full-precision divide of 1.0 by the source. */
      struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
      sse_movss(cp->func, dst, ones);
      sse_divss(cp->func, dst, arg0);
   }

   store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
}
/* Although rsqrtps() and rcpps() are low precision on some/all SSE
 * implementations, it is possible to improve their precision at
 * fairly low cost, using a newton/raphson step, as below:
 *
 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
 *
 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a)) * rsqrtps(a)]
 *    = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)]
 *
 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
/* Scalar reciprocal square root: dst = 1/sqrt(src.x). */
static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   /* NOTE(review): two alternative implementations appear below; the
    * construct selecting between them (e.g. "if (0) ... else") is
    * elided in this excerpt.
    */

   /* Low-precision path: raw rsqrtss. */
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg r = aos_get_xmm_reg(cp);
   sse_rsqrtss(cp->func, r, arg0);
   store_scalar_dest(cp, &op->FullDstRegisters[0], r);

   /* Refined path: one Newton/Raphson step,
    *    r' = r * (1.5 - .5 * a * r * r)
    * using the precomputed constants -.5 and 1.5 stored back-to-back
    * in the internal IMM_RSQ slot.
    */
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg r = aos_get_xmm_reg(cp);

   struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ );
   struct x86_reg one_point_five = x86_make_disp( neg_half, 4 );
   struct x86_reg src = get_xmm_writable( cp, arg0 );

   sse_rsqrtss( cp->func, r, src  );             /* rsqrtss(a) */
   sse_mulss(   cp->func, src, neg_half  );      /* -.5 * a */
   sse_mulss(   cp->func, src, r  );             /* -.5 * a * r */
   sse_mulss(   cp->func, src, r  );             /* -.5 * a * r * r */
   sse_addss(   cp->func, src, one_point_five ); /* 1.5 - .5 * a * r * r */
   sse_mulss(   cp->func, r, src );              /* r * (1.5 - .5 * a * r * r) */

   store_scalar_dest(cp, &op->FullDstRegisters[0], r);
}
1497 static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1499 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1500 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1501 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1502 struct x86_reg dst = get_xmm_writable(cp, arg0);
1504 sse_cmpps(cp->func, dst, arg1, cc_NotLessThan);
1505 sse_andps(cp->func, dst, ones);
1507 store_dest(cp, &op->FullDstRegisters[0], dst);
/* Sine of src.x via the x87 unit, replicated to the destination by
 * x87_fstp_dest4().
 */
static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
   /* NOTE(review): the actual sine computation (presumably an fsin
    * emit) is elided in this excerpt between the load and the store. */
   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
}
1521 static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1523 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1524 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1525 struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1526 struct x86_reg dst = get_xmm_writable(cp, arg0);
1528 sse_cmpps(cp->func, dst, arg1, cc_LessThan);
1529 sse_andps(cp->func, dst, ones);
1531 store_dest(cp, &op->FullDstRegisters[0], dst);
1535 static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1537 struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
1538 struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
1539 struct x86_reg dst = get_xmm_writable(cp, arg0);
1541 sse_subps(cp->func, dst, arg1);
1543 store_dest(cp, &op->FullDstRegisters[0], dst);
/* Cross product: dst.xyz = arg0 x arg1 (dst.w is undefined).
 *
 * Computed as (arg0 * arg1.yzxw - arg0.yzxw * arg1).yzxw: component i
 * of the difference is a[i]*b[i+1] - a[i+1]*b[i], and the final
 * rotation puts each term in its proper slot.
 */
static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
   struct x86_reg tmp0 = aos_get_xmm_reg(cp);
   struct x86_reg tmp1 = aos_get_xmm_reg(cp);

   emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));      /* tmp1 = b.yzxw */
   sse_mulps(cp->func, tmp1, arg0);                    /* tmp1 = a * b.yzxw */
   emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W));      /* tmp0 = a.yzxw */
   sse_mulps(cp->func, tmp0, arg1);                    /* tmp0 = a.yzxw * b */
   sse_subps(cp->func, tmp1, tmp0);
   sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W)); /* rotate into place */

   /* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
   /* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
   /* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
   /* dst[3] is undef */

   aos_release_xmm_reg(cp, tmp0.idx);
   store_dest(cp, &op->FullDstRegisters[0], tmp1);
}
/* Translate one TGSI instruction into SSE/x87 machine code.
 * Returns FALSE for opcodes that are unsupported (or disabled),
 * causing the caller to abandon this codegen path.
 */
static boolean
emit_instruction( struct aos_compilation *cp,
                  struct tgsi_full_instruction *inst )
{
   /* Every emit_* helper must leave the x87 stack empty. */
   x87_assert_stack_empty(cp->func);

   switch( inst->Instruction.Opcode ) {
   case TGSI_OPCODE_MOV:
      return emit_MOV( cp, inst );

   case TGSI_OPCODE_LIT:
      return emit_LIT(cp, inst);

   case TGSI_OPCODE_RCP:
      return emit_RCP(cp, inst);

   case TGSI_OPCODE_RSQ:
      return emit_RSQ(cp, inst);

   case TGSI_OPCODE_EXP:
      /*return emit_EXP(cp, inst);*/
      /* NOTE(review): disabled -- the FALSE return for this and the
       * other commented-out cases is elided in this excerpt. */

   case TGSI_OPCODE_LOG:
      /*return emit_LOG(cp, inst);*/

   case TGSI_OPCODE_MUL:
      return emit_MUL(cp, inst);

   case TGSI_OPCODE_ADD:
      return emit_ADD(cp, inst);

   case TGSI_OPCODE_DP3:
      return emit_DP3(cp, inst);

   case TGSI_OPCODE_DP4:
      return emit_DP4(cp, inst);

   case TGSI_OPCODE_DST:
      return emit_DST(cp, inst);

   case TGSI_OPCODE_MIN:
      return emit_MIN(cp, inst);

   case TGSI_OPCODE_MAX:
      return emit_MAX(cp, inst);

   case TGSI_OPCODE_SLT:
      return emit_SLT(cp, inst);

   case TGSI_OPCODE_SGE:
      return emit_SGE(cp, inst);

   case TGSI_OPCODE_MAD:
      return emit_MAD(cp, inst);

   case TGSI_OPCODE_SUB:
      return emit_SUB(cp, inst);

   case TGSI_OPCODE_LERP:
      // return emit_LERP(cp, inst);

   case TGSI_OPCODE_FRAC:
      return emit_FRC(cp, inst);

   case TGSI_OPCODE_CLAMP:
      // return emit_CLAMP(cp, inst);

   case TGSI_OPCODE_FLOOR:
      return emit_FLR(cp, inst);

   case TGSI_OPCODE_ROUND:
      return emit_RND(cp, inst);

   case TGSI_OPCODE_EXPBASE2:
      return emit_EX2(cp, inst);

   case TGSI_OPCODE_LOGBASE2:
      return emit_LG2(cp, inst);

   case TGSI_OPCODE_POWER:
      return emit_POW(cp, inst);

   case TGSI_OPCODE_CROSSPRODUCT:
      return emit_XPD(cp, inst);

   case TGSI_OPCODE_ABS:
      return emit_ABS(cp, inst);

   case TGSI_OPCODE_DPH:
      return emit_DPH(cp, inst);

   case TGSI_OPCODE_COS:
      return emit_COS(cp, inst);

   case TGSI_OPCODE_SIN:
      return emit_SIN(cp, inst);

   case TGSI_OPCODE_END:
      /* NOTE(review): the END handling and the default case are
       * elided in this excerpt. */
   }
}
/* Viewport transform on the position output:
 *    pos = pos * scale + translate
 * with scale/translate taken from the aos_machine struct (addressed
 * off machine_EDX).
 */
static boolean emit_viewport( struct aos_compilation *cp )
{
   /* NOTE(review): the trailing arguments of this call (presumably the
    * output file and position index) are elided in this excerpt. */
   struct x86_reg pos = aos_get_shader_reg_xmm(cp,

   struct x86_reg scale = x86_make_disp(cp->machine_EDX,
                                        Offset(struct aos_machine, scale));

   struct x86_reg translate = x86_make_disp(cp->machine_EDX,
                                            Offset(struct aos_machine, translate));

   sse_mulps(cp->func, pos, scale);
   sse_addps(cp->func, pos, translate);

   /* Re-register the transformed value as the cached contents of the
    * position output.  NOTE(review): trailing arguments elided. */
   aos_adopt_xmm_reg( cp,
}
/* This is useful to be able to see the results on softpipe.  Doesn't
 * do proper clipping, just assumes the backend can do it during
 * rasterization -- for debug only...
 *
 * Like emit_viewport(), but also divides xyz by w (perspective
 * divide) and leaves the reciprocal-w in the position's W channel.
 */
static boolean emit_rhw_viewport( struct aos_compilation *cp )
{
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   /* NOTE(review): trailing arguments of this call are elided in this
    * excerpt. */
   struct x86_reg pos = aos_get_shader_reg_xmm(cp,

   struct x86_reg scale = x86_make_disp(cp->machine_EDX,
                                        Offset(struct aos_machine, scale));

   struct x86_reg translate = x86_make_disp(cp->machine_EDX,
                                            Offset(struct aos_machine, translate));

   /* tmp = broadcast(1 / pos.w): */
   emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W));
   sse2_rcpss(cp->func, tmp, tmp);
   sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X));

   /* pos = pos * scale * (1/w) + translate: */
   sse_mulps(cp->func, pos, scale);
   sse_mulps(cp->func, pos, tmp);
   sse_addps(cp->func, pos, translate);

   /* Overwrite only the W channel of pos with tmp (the rhw value). */
   mask_write(cp, pos, tmp, TGSI_WRITEMASK_W);

   /* NOTE(review): trailing arguments elided. */
   aos_adopt_xmm_reg( cp,
}
/* Record an immediate's float values into the machine struct so the
 * generated code can address them by slot index at run time.
 */
static boolean note_immediate( struct aos_compilation *cp,
                               struct tgsi_full_immediate *imm )
{
   /* Immediates are assigned consecutive slots in parse order. */
   unsigned pos = cp->num_immediates++;

   for (j = 0; j < imm->Immediate.Size; j++) {
      cp->vaos->machine->immediate[pos][j] = imm->u.ImmediateFloat32[j].Float;
   }
}
/* Pre-pass over the token stream: for every OUTPUT register, record
 * the index of the last instruction writing it (so outputs could be
 * emitted eagerly after their final write).
 */
static void find_last_write_outputs( struct aos_compilation *cp )
{
   struct tgsi_parse_context parse;
   unsigned this_instruction = 0;

   tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens );

   while (!tgsi_parse_end_of_tokens( &parse )) {

      tgsi_parse_token( &parse );

      /* Only instructions are of interest here.
       * NOTE(review): the loop-control statement after this test and
       * the this_instruction increment are elided in this excerpt. */
      if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION)

      for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) {
         /* NOTE(review): comparison RHS (presumably TGSI_FILE_OUTPUT)
          * is elided. */
         if (parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.File ==

            unsigned idx = parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.Index;
            cp->output_last_write[idx] = this_instruction;
      }
   }

   tgsi_parse_free( &parse );
}
/* Argument positions of the generated function, as consumed by
 * x86_fn_arg() in build_vertex_program().
 * NOTE(review): ARG_COUNT (used below, presumably 3) is elided in
 * this excerpt. */
#define ARG_MACHINE    1
#define ARG_START_ELTS 2
#define ARG_OUTBUF     4
/* Compile the shader into an x86 function.  Two are built per
 * varient: func[0] for linear (consecutive) vertex fetch, func[1]
 * for indexed (elts) fetch.
 *
 * Generated layout: prologue / argument loading, early-out when
 * count == 0, then a per-vertex loop of fetch -> shader body ->
 * viewport -> output store, then the epilogue.
 */
static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
                                     boolean linear )
{
   struct tgsi_parse_context parse;
   struct aos_compilation cp;
   unsigned fixup, label;

   tgsi_parse_init( &parse, varient->base.vs->state.tokens );

   memset(&cp, 0, sizeof(cp));

   cp.insn_counter = 1;
   /* NOTE(review): further cp setup (e.g. cp.vaos = varient, which
    * later code relies on) is elided in this excerpt. */
   cp.func = &varient->func[ linear ? 0 : 1 ];

   /* Fixed role assignment for each hard register in the generated
    * code: */
   cp.tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   cp.idx_EBX = x86_make_reg(file_REG32, reg_BX);
   cp.outbuf_ECX = x86_make_reg(file_REG32, reg_CX);
   cp.machine_EDX = x86_make_reg(file_REG32, reg_DX);
   cp.count_ESI = x86_make_reg(file_REG32, reg_SI);
   cp.temp_EBP = x86_make_reg(file_REG32, reg_BP);
   cp.stack_ESP = x86_make_reg( file_REG32, reg_SP );

   x86_init_func(cp.func);

   find_last_write_outputs(&cp);

   /* Prologue: preserve the callee-save registers we use. */
   x86_push(cp.func, cp.idx_EBX);
   x86_push(cp.func, cp.count_ESI);
   x86_push(cp.func, cp.temp_EBP);

   /* Load arguments into regs:
    */
   x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE));
   x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS));
   x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT));
   x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF));

   /* Compare count to zero and possibly bail.
    */
   x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX);
   x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX);
   fixup = x86_jcc_forward(cp.func, cc_E);

   /* Save caller's FPU state and set round-to-nearest for the shader
    * body; restored before returning. */
   save_fpu_state( &cp );
   set_fpu_round_nearest( &cp );

   /* Note address for loop jump
    */
   label = x86_get_label(cp.func);

   /* Fetch inputs...  TODO:  fetch lazily...
    * NOTE(review): the error-path statement after this test is elided
    * in this excerpt (as are those after the other '!' tests below). */
   if (!aos_fetch_inputs( &cp, linear ))

   /* Translate each TGSI token into machine code: */
   while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error )
   {
      tgsi_parse_token( &parse );

      switch (parse.FullToken.Token.Type) {
      case TGSI_TOKEN_TYPE_IMMEDIATE:
         if (!note_immediate( &cp, &parse.FullToken.FullImmediate ))

      case TGSI_TOKEN_TYPE_INSTRUCTION:
         /* Debug dump; presumably guarded by a flag elided here. */
         tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter );

         if (!emit_instruction( &cp, &parse.FullToken.FullInstruction ))

      x87_assert_stack_empty(cp.func);

      /* Invalidate xmm values cached from non-OUTPUT register files;
       * only OUTPUT contents survive to the store stage. */
      for (i = 0; i < 8; i++) {
         if (cp.xmm[i].file != TGSI_FILE_OUTPUT) {
            cp.xmm[i].file = TGSI_FILE_NULL;
            cp.xmm[i].dirty = 0;
         }
      }
   }

   if (cp.vaos->base.key.clip) {
      /* not really handling clipping, just do the rhw so we can
       * see the results...
       */
      emit_rhw_viewport(&cp);
   }
   else if (cp.vaos->base.key.viewport) {
      /* NOTE(review): presumably an emit_viewport(&cp) call, elided
       * in this excerpt. */
   }

   /* Emit output...  TODO: do this eagerly after the last write to a
    * given output.
    */
   if (!aos_emit_outputs( &cp ))

   /* Advance the output pointer by one vertex stride.
    * NOTE(review): the leading part of this statement (an lea on
    * outbuf_ECX) is elided in this excerpt. */
         x86_make_disp(cp.outbuf_ECX,
                       cp.vaos->base.key.output_stride));

   /* Advance the index register: one element for linear fetch, four
    * bytes (one index) for elts fetch.
    * NOTE(review): the linear/elts selection around these two
    * statements is elided. */
   x86_inc(cp.func, cp.idx_EBX);
   x86_lea(cp.func, cp.idx_EBX, x86_make_disp(cp.idx_EBX, 4));

   /* decr count, loop if not zero
    */
   x86_dec(cp.func, cp.count_ESI);
   x86_jcc(cp.func, cc_NZ, label);

   restore_fpu_state(&cp);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(cp.func, fixup);

   /* NOTE(review): the emms emit guarded by this test is elided. */
   if (cp.func->need_emms)

   /* Epilogue: restore callee-save registers. */
   x86_pop(cp.func, cp.temp_EBP);
   x86_pop(cp.func, cp.count_ESI);
   x86_pop(cp.func, cp.idx_EBX);

   x87_assert_stack_empty(cp.func);

   tgsi_parse_free( &parse );
   /* NOTE(review): success return elided; the second parse_free below
    * is presumably the failure path. */

   tgsi_parse_free( &parse );
}
/* draw_vs_varient::set_input: re-point every attribute that sources
 * from vertex buffer 'buf' at the new base pointer/stride.
 * NOTE(review): the remaining parameters (presumably buf, ptr,
 * stride) are elided in this excerpt.
 */
static void vaos_set_buffer( struct draw_vs_varient *varient,
{
   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;

   for (i = 0; i < vaos->base.key.nr_inputs; i++) {
      if (vaos->base.key.element[i].in.buffer == buf) {
         /* Fold the element's fixed offset into the stored pointer. */
         vaos->attrib[i].input_ptr = ((char *)ptr +
                                      vaos->base.key.element[i].in.offset);
         vaos->attrib[i].input_stride = stride;
      }
   }
}
/* draw_vs_varient::run_elts: refresh the per-run machine state and
 * invoke the generated indexed-fetch function.
 * NOTE(review): the count parameter line is elided in this excerpt.
 */
static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient,
                                      const unsigned *elts,
                                      void *output_buffer )
{
   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
   struct aos_machine *machine = vaos->draw->vs.aos_machine;

   /* Point the shared machine struct at this varient's current data
    * before running the generated code: */
   machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
   machine->constants = vaos->draw->vs.aligned_constants;
   machine->immediates = vaos->base.vs->immediates;
   machine->attrib = vaos->attrib;

   /* NOTE(review): trailing call arguments (elts, count,
    * output_buffer) are elided in this excerpt. */
   vaos->gen_run_elts( machine,
}
/* draw_vs_varient::run_linear: refresh the per-run machine state and
 * invoke the generated linear-fetch function.
 * NOTE(review): the start/count parameter lines are elided in this
 * excerpt.
 */
static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
                                        void *output_buffer )
{
   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
   struct aos_machine *machine = vaos->draw->vs.aos_machine;

   /* Same machine setup as vaos_run_elts(): */
   machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
   machine->constants = vaos->draw->vs.aligned_constants;
   machine->immediates = vaos->base.vs->immediates;
   machine->attrib = vaos->attrib;

   /* NOTE(review): trailing call arguments elided in this excerpt. */
   vaos->gen_run_linear( machine,
}
/* draw_vs_varient::destroy: release per-varient storage and both
 * generated functions.
 */
static void vaos_destroy( struct draw_vs_varient *varient )
{
   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;

   FREE( vaos->attrib );

   x86_release_func( &vaos->func[0] );
   x86_release_func( &vaos->func[1] );

   /* NOTE(review): the FREE of the varient struct itself is elided in
    * this excerpt -- verify it is freed in the full source. */
}
/* Try to build an SSE varient for this shader/key pair.  On any
 * failure (allocation or codegen) the partial state is torn down and
 * NULL is returned, letting the caller fall back to another path.
 * NOTE(review): the NULL checks, 'fail' gotos/labels and returns are
 * elided in this excerpt; comments describe the visible statements.
 */
static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
                                                const struct draw_vs_varient_key *key )
{
   struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse);

   /* Fill in the base vtable and key: */
   vaos->base.key = *key;
   vaos->base.set_input = vaos_set_buffer;
   vaos->base.destroy = vaos_destroy;
   vaos->base.run_linear = vaos_run_linear;
   vaos->base.run_elts = vaos_run_elts;

   vaos->draw = vs->draw;

   /* One attrib slot (pointer + stride) per shader input: */
   vaos->attrib = MALLOC( key->nr_inputs * sizeof(vaos->attrib[0]) );

   /* Debug dump of the shader; presumably guarded by a flag elided
    * here: */
   tgsi_dump(vs->state.tokens, 0);

   /* func[0] = linear fetch, func[1] = indexed fetch: */
   if (!build_vertex_program( vaos, TRUE ))

   if (!build_vertex_program( vaos, FALSE ))

   vaos->gen_run_linear = (vaos_run_linear_func)x86_get_func(&vaos->func[0]);
   if (!vaos->gen_run_linear)

   vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]);
   if (!vaos->gen_run_elts)

   /* Failure path: free whatever was built. */
   if (vaos && vaos->attrib)

   x86_release_func( &vaos->func[0] );

   x86_release_func( &vaos->func[1] );
}
2103 struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs,
2104 const struct draw_vs_varient_key *key )
2106 struct draw_vs_varient *varient = varient_aos_sse( vs, key );
2108 if (varient == NULL) {
2110 varient = draw_vs_varient_generic( vs, key );