src/gallium/auxiliary/tgsi/tgsi_sse2.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 #include "pipe/p_config.h"
  29
  30 #if defined(PIPE_ARCH_X86)
  31
  32 #include "util/u_debug.h"
  33 #include "pipe/p_shader_tokens.h"
  34 #include "util/u_math.h"
  35 #include "util/u_memory.h"
  36 #if defined(PIPE_ARCH_SSE)
  37 #include "util/u_sse.h"
  38 #endif
  39 #include "tgsi/tgsi_info.h"
  40 #include "tgsi/tgsi_parse.h"
  41 #include "tgsi/tgsi_util.h"
  42 #include "tgsi_exec.h"
  43 #include "tgsi_sse2.h"
  44
  45 #include "rtasm/rtasm_x86sse.h"
  46
/* Precision switch for 1/sqrt(), consulted by emit_rsqrt().
 *
 * When non-zero, emit_rsqrt() refines the rsqrtps estimate with one
 * Newton-Raphson step instead of emitting the bare rsqrtps instruction.
 *
 * This costs about 100fps (close to 10%) in gears:
 */
#define HIGH_PRECISION 1

/* NOTE(review): FAST_MATH is not referenced in the part of the file visible
 * here; presumably it selects approximate math code paths elsewhere --
 * confirm against its users.
 */
#define FAST_MATH 1
  54
  55
/* Loop header: steps CHAN (an lvalue supplied by the caller) over
 * 0 .. NUM_CHANNELS-1.  The statement following the macro is the loop body.
 */
#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

/* Non-zero when bit CHAN is set in the write mask of the first destination
 * register of instruction INST.
 */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/* 'if' header guarding the following statement on IS_DST0_CHANNEL_ENABLED().
 * Expands to a bare 'if', so an 'else' written after the guarded statement
 * binds to this 'if'.
 */
#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Loop over all channels, executing the following statement only for the
 * channels enabled in INST's first destination write mask.
 */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
  68
/* Channel indices, used as the 'chan' argument of the accessors below. */
#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

/* Short aliases for tgsi_exec temp slots.  A slot is named by an _I / _C
 * pair, which this file passes as the (vec, chan) arguments of get_temp().
 */
#define TEMP_ONE_I   TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C   TGSI_EXEC_TEMP_ONE_C

/* TEMP_R0 is used as a scratch/argument buffer, TEMP_ADDR as the address
 * register, and TEMP_EXEC_MASK_* as the per-element execution mask.
 */
#define TEMP_R0   TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
  81
  82
  83 /**
  84  * X86 utility functions.
  85  */
  86
  87 static struct x86_reg
  88 make_xmm(
  89    unsigned xmm )
  90 {
  91    return x86_make_reg(
  92       file_XMM,
  93       (enum x86_reg_name) xmm );
  94 }
  95
  96 /**
  97  * X86 register mapping helpers.
  98  */
  99
 100 static struct x86_reg
 101 get_const_base( void )
 102 {
 103    return x86_make_reg(
 104       file_REG32,
 105       reg_AX );
 106 }
 107
 108 static struct x86_reg
 109 get_machine_base( void )
 110 {
 111    return x86_make_reg(
 112       file_REG32,
 113       reg_CX );
 114 }
 115
 116 static struct x86_reg
 117 get_input_base( void )
 118 {
 119    return x86_make_disp(
 120       get_machine_base(),
 121       Offset(struct tgsi_exec_machine, Inputs) );
 122 }
 123
 124 static struct x86_reg
 125 get_output_base( void )
 126 {
 127    return x86_make_disp(
 128       get_machine_base(),
 129       Offset(struct tgsi_exec_machine, Outputs) );
 130 }
 131
 132 static struct x86_reg
 133 get_temp_base( void )
 134 {
 135    return x86_make_disp(
 136       get_machine_base(),
 137       Offset(struct tgsi_exec_machine, Temps) );
 138 }
 139
 140 static struct x86_reg
 141 get_coef_base( void )
 142 {
 143    return x86_make_reg(
 144       file_REG32,
 145       reg_BX );
 146 }
 147
 148 static struct x86_reg
 149 get_sampler_base( void )
 150 {
 151    return x86_make_reg(
 152       file_REG32,
 153       reg_DI );
 154 }
 155
 156 static struct x86_reg
 157 get_immediate_base( void )
 158 {
 159    return x86_make_reg(
 160       file_REG32,
 161       reg_DX );
 162 }
 163
 164
 165 /**
 166  * Data access helpers.
 167  */
 168
 169
 170 static struct x86_reg
 171 get_immediate(
 172    unsigned vec,
 173    unsigned chan )
 174 {
 175    return x86_make_disp(
 176       get_immediate_base(),
 177       (vec * 4 + chan) * 4 );
 178 }
 179
 180 static struct x86_reg
 181 get_const(
 182    unsigned vec,
 183    unsigned chan )
 184 {
 185    return x86_make_disp(
 186       get_const_base(),
 187       (vec * 4 + chan) * 4 );
 188 }
 189
 190 static struct x86_reg
 191 get_sampler_ptr(
 192    unsigned unit )
 193 {
 194    return x86_make_disp(
 195       get_sampler_base(),
 196       unit * sizeof( struct tgsi_sampler * ) );
 197 }
 198
 199 static struct x86_reg
 200 get_input(
 201    unsigned vec,
 202    unsigned chan )
 203 {
 204    return x86_make_disp(
 205       get_input_base(),
 206       (vec * 4 + chan) * 16 );
 207 }
 208
 209 static struct x86_reg
 210 get_output(
 211    unsigned vec,
 212    unsigned chan )
 213 {
 214    return x86_make_disp(
 215       get_output_base(),
 216       (vec * 4 + chan) * 16 );
 217 }
 218
 219 static struct x86_reg
 220 get_temp(
 221    unsigned vec,
 222    unsigned chan )
 223 {
 224    return x86_make_disp(
 225       get_temp_base(),
 226       (vec * 4 + chan) * 16 );
 227 }
 228
 229 static struct x86_reg
 230 get_coef(
 231    unsigned vec,
 232    unsigned chan,
 233    unsigned member )
 234 {
 235    return x86_make_disp(
 236       get_coef_base(),
 237       ((vec * 3 + member) * 4 + chan) * 4 );
 238 }
 239
 240
 241 static void
 242 emit_ret(
 243    struct x86_function  *func )
 244 {
 245    x86_ret( func );
 246 }
 247
 248
 249 /**
 250  * Data fetch helpers.
 251  */
 252
/**
 * Copy a shader constant to xmm register
 * \param xmm  the destination xmm register
 * \param vec  the src const buffer index (with indirect addressing: the
 *             offset added to the address register's value)
 * \param chan  src channel to fetch (X, Y, Z or W)
 * \param indirect  non-zero to fetch CONST[ADDR+vec] instead of CONST[vec]
 * \param indirectFile  register file of the index register; must be
 *                      TGSI_FILE_ADDRESS (asserted)
 * \param indirectIndex  index of the index register; must be 0 (asserted)
 *
 * The indirect path overwrites the TEMP_R0.x temp slot, which it uses to
 * assemble the four fetched values before loading them into 'xmm'.
 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,
   uint indirectFile,
   int indirectIndex )
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       *
       * The immediate-base and coef-base registers are borrowed as
       * scratch; they are saved on the stack and restored afterwards.
       */
      struct x86_reg r0 = get_immediate_base();
      struct x86_reg r1 = get_coef_base();
      uint i;

      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );
      assert( r0.mod == mod_REG );
      assert( r1.mod == mod_REG );

      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage.  It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0 */
         x86_and( func, r1, r0 );
         /* r0 = address of CONST[vec].chan, i.e. the fetch address before
          * the per-element offset is applied
          */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          * (Four doublings; 16 bytes is the stride between consecutive
          * constant vectors in get_const().)
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         x86_add( func, r0, r1 );  /* r0 = r0 + r1 */
         /* r1 = the constant value; store it as element i of TEMP_R0.x */
         x86_mov( func, r1, x86_deref( r0 ) );
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      x86_pop( func, r1 );
      x86_pop( func, r0 );

      /* Load the four gathered values in one go. */
      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the constant buffer, i.e. CONST[vec] */
      assert( vec >= 0 );

      /* Load the scalar into element 0, then broadcast it to all four. */
      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}
 346
 347 static void
 348 emit_immediate(
 349    struct x86_function *func,
 350    unsigned xmm,
 351    unsigned vec,
 352    unsigned chan )
 353 {
 354    sse_movss(
 355       func,
 356       make_xmm( xmm ),
 357       get_immediate( vec, chan ) );
 358    sse_shufps(
 359       func,
 360       make_xmm( xmm ),
 361       make_xmm( xmm ),
 362       SHUF( 0, 0, 0, 0 ) );
 363 }
 364
 365
 366 /**
 367  * Copy a shader input to xmm register
 368  * \param xmm  the destination xmm register
 369  * \param vec  the src input attrib
 370  * \param chan  src channel to fetch (X, Y, Z or W)
 371  */
 372 static void
 373 emit_inputf(
 374    struct x86_function *func,
 375    unsigned xmm,
 376    unsigned vec,
 377    unsigned chan )
 378 {
 379    sse_movups(
 380       func,
 381       make_xmm( xmm ),
 382       get_input( vec, chan ) );
 383 }
 384
 385 /**
 386  * Store an xmm register to a shader output
 387  * \param xmm  the source xmm register
 388  * \param vec  the dest output attrib
 389  * \param chan  src dest channel to store (X, Y, Z or W)
 390  */
 391 static void
 392 emit_output(
 393    struct x86_function *func,
 394    unsigned xmm,
 395    unsigned vec,
 396    unsigned chan )
 397 {
 398    sse_movups(
 399       func,
 400       get_output( vec, chan ),
 401       make_xmm( xmm ) );
 402 }
 403
 404 /**
 405  * Copy a shader temporary to xmm register
 406  * \param xmm  the destination xmm register
 407  * \param vec  the src temp register
 408  * \param chan  src channel to fetch (X, Y, Z or W)
 409  */
 410 static void
 411 emit_tempf(
 412    struct x86_function *func,
 413    unsigned xmm,
 414    unsigned vec,
 415    unsigned chan )
 416 {
 417    sse_movaps(
 418       func,
 419       make_xmm( xmm ),
 420       get_temp( vec, chan ) );
 421 }
 422
 423 /**
 424  * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
 425  * \param xmm  the destination xmm register
 426  * \param vec  the src input/attribute coefficient index
 427  * \param chan  src channel to fetch (X, Y, Z or W)
 428  * \param member  0=a0, 1=dadx, 2=dady
 429  */
 430 static void
 431 emit_coef(
 432    struct x86_function *func,
 433    unsigned xmm,
 434    unsigned vec,
 435    unsigned chan,
 436    unsigned member )
 437 {
 438    sse_movss(
 439       func,
 440       make_xmm( xmm ),
 441       get_coef( vec, chan, member ) );
 442    sse_shufps(
 443       func,
 444       make_xmm( xmm ),
 445       make_xmm( xmm ),
 446       SHUF( 0, 0, 0, 0 ) );
 447 }
 448
 449 /**
 450  * Data store helpers.
 451  */
 452
 453 static void
 454 emit_inputs(
 455    struct x86_function *func,
 456    unsigned xmm,
 457    unsigned vec,
 458    unsigned chan )
 459 {
 460    sse_movups(
 461       func,
 462       get_input( vec, chan ),
 463       make_xmm( xmm ) );
 464 }
 465
 466 static void
 467 emit_temps(
 468    struct x86_function *func,
 469    unsigned xmm,
 470    unsigned vec,
 471    unsigned chan )
 472 {
 473    sse_movaps(
 474       func,
 475       get_temp( vec, chan ),
 476       make_xmm( xmm ) );
 477 }
 478
 479 static void
 480 emit_addrs(
 481    struct x86_function *func,
 482    unsigned xmm,
 483    unsigned vec,
 484    unsigned chan )
 485 {
 486    assert( vec == 0 );
 487
 488    emit_temps(
 489       func,
 490       xmm,
 491       vec + TGSI_EXEC_TEMP_ADDR,
 492       chan );
 493 }
 494
/**
 * Coefficient fetch helpers.
 */
 498
 499 static void
 500 emit_coef_a0(
 501    struct x86_function *func,
 502    unsigned xmm,
 503    unsigned vec,
 504    unsigned chan )
 505 {
 506    emit_coef(
 507       func,
 508       xmm,
 509       vec,
 510       chan,
 511       0 );
 512 }
 513
 514 static void
 515 emit_coef_dadx(
 516    struct x86_function *func,
 517    unsigned xmm,
 518    unsigned vec,
 519    unsigned chan )
 520 {
 521    emit_coef(
 522       func,
 523       xmm,
 524       vec,
 525       chan,
 526       1 );
 527 }
 528
 529 static void
 530 emit_coef_dady(
 531    struct x86_function *func,
 532    unsigned xmm,
 533    unsigned vec,
 534    unsigned chan )
 535 {
 536    emit_coef(
 537       func,
 538       xmm,
 539       vec,
 540       chan,
 541       2 );
 542 }
 543
 544 /**
 545  * Function call helpers.
 546  */
 547
 548 /**
 549  * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
 550  * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
 551  * that the stack pointer is 16 byte aligned, as expected.
 552  */
 553 static void
 554 emit_func_call(
 555    struct x86_function *func,
 556    unsigned xmm_save_mask,
 557    const struct x86_reg *arg,
 558    unsigned nr_args,
 559    void (PIPE_CDECL *code)() )
 560 {
 561    struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
 562    unsigned i, n;
 563
 564    x86_push(
 565       func,
 566       x86_make_reg( file_REG32, reg_AX) );
 567    x86_push(
 568       func,
 569       x86_make_reg( file_REG32, reg_CX) );
 570    x86_push(
 571       func,
 572       x86_make_reg( file_REG32, reg_DX) );
 573
 574    /* Store XMM regs to the stack
 575     */
 576    for(i = 0, n = 0; i < 8; ++i)
 577       if(xmm_save_mask & (1 << i))
 578          ++n;
 579
 580    x86_sub_imm(
 581       func,
 582       x86_make_reg( file_REG32, reg_SP ),
 583       n*16);
 584
 585    for(i = 0, n = 0; i < 8; ++i)
 586       if(xmm_save_mask & (1 << i)) {
 587          sse_movups(
 588             func,
 589             x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
 590             make_xmm( i ) );
 591          ++n;
 592       }
 593
 594    for (i = 0; i < nr_args; i++) {
 595       /* Load the address of the buffer we use for passing arguments and
 596        * receiving results:
 597        */
 598       x86_lea(
 599          func,
 600          ecx,
 601          arg[i] );
 602
 603       /* Push actual function arguments (currently just the pointer to
 604        * the buffer above), and call the function:
 605        */
 606       x86_push( func, ecx );
 607    }
 608
 609    x86_mov_reg_imm( func, ecx, (unsigned long) code );
 610    x86_call( func, ecx );
 611
 612    /* Pop the arguments (or just add an immediate to esp)
 613     */
 614    for (i = 0; i < nr_args; i++) {
 615       x86_pop(func, ecx );
 616    }
 617
 618    /* Pop the saved XMM regs:
 619     */
 620    for(i = 0, n = 0; i < 8; ++i)
 621       if(xmm_save_mask & (1 << i)) {
 622          sse_movups(
 623             func,
 624             make_xmm( i ),
 625             x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
 626          ++n;
 627       }
 628
 629    x86_add_imm(
 630       func,
 631       x86_make_reg( file_REG32, reg_SP ),
 632       n*16);
 633
 634    /* Restore GP registers in a reverse order.
 635     */
 636    x86_pop(
 637       func,
 638       x86_make_reg( file_REG32, reg_DX) );
 639    x86_pop(
 640       func,
 641       x86_make_reg( file_REG32, reg_CX) );
 642    x86_pop(
 643       func,
 644       x86_make_reg( file_REG32, reg_AX) );
 645 }
 646
 647 static void
 648 emit_func_call_dst_src1(
 649    struct x86_function *func,
 650    unsigned xmm_save,
 651    unsigned xmm_dst,
 652    unsigned xmm_src0,
 653    void (PIPE_CDECL *code)() )
 654 {
 655    struct x86_reg store = get_temp( TEMP_R0, 0 );
 656    unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
 657
 658    /* Store our input parameters (in xmm regs) to the buffer we use
 659     * for passing arguments.  We will pass a pointer to this buffer as
 660     * the actual function argument.
 661     */
 662    sse_movaps(
 663       func,
 664       store,
 665       make_xmm( xmm_src0 ) );
 666
 667    emit_func_call( func,
 668                    xmm_mask,
 669                    &store,
 670                    1,
 671                    code );
 672
 673    sse_movaps(
 674       func,
 675       make_xmm( xmm_dst ),
 676       store );
 677 }
 678
 679
 680 static void
 681 emit_func_call_dst_src2(
 682    struct x86_function *func,
 683    unsigned xmm_save,
 684    unsigned xmm_dst,
 685    unsigned xmm_src0,
 686    unsigned xmm_src1,
 687    void (PIPE_CDECL *code)() )
 688 {
 689    struct x86_reg store = get_temp( TEMP_R0, 0 );
 690    unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
 691
 692    /* Store two inputs to parameter buffer.
 693     */
 694    sse_movaps(
 695       func,
 696       store,
 697       make_xmm( xmm_src0 ) );
 698
 699    sse_movaps(
 700       func,
 701       x86_make_disp( store, 4 * sizeof(float) ),
 702       make_xmm( xmm_src1 ) );
 703
 704
 705    /* Emit the call
 706     */
 707    emit_func_call( func,
 708                    xmm_mask,
 709                    &store,
 710                    1,
 711                    code );
 712
 713    /* Retrieve the results:
 714     */
 715    sse_movaps(
 716       func,
 717       make_xmm( xmm_dst ),
 718       store );
 719 }
 720
 721
 722
 723
 724
 725 #if defined(PIPE_ARCH_SSE)
 726
/*
 * Fast SSE2 implementation of special math functions.
 */

/* POLYn(x, c0, ..., cn) evaluates c0 + c1*x + ... + cn*x^n on a __m128 by
 * nesting multiply-adds (Horner's scheme).  The argument x appears n times
 * in the expansion, so pass a variable rather than an expression with side
 * effects.
 */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/* Select which polynomial fit exp2f4() and log2f4() compile in. */
#define EXP_POLY_DEGREE 3
#define LOG_POLY_DEGREE 5
 740
 741 /**
 742  * See http://www.devmaster.net/forums/showthread.php?p=43580
 743  */
 744 static INLINE __m128
 745 exp2f4(__m128 x)
 746 {
 747    __m128i ipart;
 748    __m128 fpart, expipart, expfpart;
 749
 750    x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
 751    x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
 752
 753    /* ipart = int(x - 0.5) */
 754    ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
 755
 756    /* fpart = x - ipart */
 757    fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
 758
 759    /* expipart = (float) (1 << ipart) */
 760    expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
 761
 762    /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
 763 #if EXP_POLY_DEGREE == 5
 764    expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
 765 #elif EXP_POLY_DEGREE == 4
 766    expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
 767 #elif EXP_POLY_DEGREE == 3
 768    expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
 769 #elif EXP_POLY_DEGREE == 2
 770    expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
 771 #else
 772 #error
 773 #endif
 774
 775    return _mm_mul_ps(expipart, expfpart);
 776 }
 777
 778
 779 /**
 780  * See http://www.devmaster.net/forums/showthread.php?p=43580
 781  */
 782 static INLINE __m128
 783 log2f4(__m128 x)
 784 {
 785    __m128i expmask = _mm_set1_epi32(0x7f800000);
 786    __m128i mantmask = _mm_set1_epi32(0x007fffff);
 787    __m128 one = _mm_set1_ps(1.0f);
 788
 789    __m128i i = _mm_castps_si128(x);
 790
 791    /* exp = (float) exponent(x) */
 792    __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
 793
 794    /* mant = (float) mantissa(x) */
 795    __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
 796
 797    __m128 logmant;
 798
 799    /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
 800     * These coefficients can be generate with
 801     * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 802     */
 803 #if LOG_POLY_DEGREE == 6
 804    logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
 805 #elif LOG_POLY_DEGREE == 5
 806    logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
 807 #elif LOG_POLY_DEGREE == 4
 808    logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
 809 #elif LOG_POLY_DEGREE == 3
 810    logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
 811 #else
 812 #error
 813 #endif
 814
 815    /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
 816    logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
 817
 818    return _mm_add_ps(logmant, exp);
 819 }
 820
 821
 822 static INLINE __m128
 823 powf4(__m128 x, __m128 y)
 824 {
 825    return exp2f4(_mm_mul_ps(log2f4(x), y));
 826 }
 827
 828 #endif /* PIPE_ARCH_SSE */
 829
 830
 831
 832 /**
 833  * Low-level instruction translators.
 834  */
 835
 836 static void
 837 emit_abs(
 838    struct x86_function *func,
 839    unsigned xmm )
 840 {
 841    sse_andps(
 842       func,
 843       make_xmm( xmm ),
 844       get_temp(
 845          TGSI_EXEC_TEMP_7FFFFFFF_I,
 846          TGSI_EXEC_TEMP_7FFFFFFF_C ) );
 847 }
 848
 849 static void
 850 emit_add(
 851    struct x86_function *func,
 852    unsigned xmm_dst,
 853    unsigned xmm_src )
 854 {
 855    sse_addps(
 856       func,
 857       make_xmm( xmm_dst ),
 858       make_xmm( xmm_src ) );
 859 }
 860
 861 static void PIPE_CDECL
 862 cos4f(
 863    float *store )
 864 {
 865    store[0] = cosf( store[0] );
 866    store[1] = cosf( store[1] );
 867    store[2] = cosf( store[2] );
 868    store[3] = cosf( store[3] );
 869 }
 870
 871 static void
 872 emit_cos(
 873    struct x86_function *func,
 874    unsigned xmm_save,
 875    unsigned xmm_dst )
 876 {
 877    emit_func_call_dst_src1(
 878       func,
 879       xmm_save,
 880       xmm_dst,
 881       xmm_dst,
 882       cos4f );
 883 }
 884
 885 static void PIPE_CDECL
 886 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
 887 __attribute__((force_align_arg_pointer))
 888 #endif
 889 ex24f(
 890    float *store )
 891 {
 892 #if defined(PIPE_ARCH_SSE)
 893    _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
 894 #else
 895    store[0] = util_fast_exp2( store[0] );
 896    store[1] = util_fast_exp2( store[1] );
 897    store[2] = util_fast_exp2( store[2] );
 898    store[3] = util_fast_exp2( store[3] );
 899 #endif
 900 }
 901
 902 static void
 903 emit_ex2(
 904    struct x86_function *func,
 905    unsigned xmm_save,
 906    unsigned xmm_dst )
 907 {
 908    emit_func_call_dst_src1(
 909       func,
 910       xmm_save,
 911       xmm_dst,
 912       xmm_dst,
 913       ex24f );
 914 }
 915
 916 static void
 917 emit_f2it(
 918    struct x86_function *func,
 919    unsigned xmm )
 920 {
 921    sse2_cvttps2dq(
 922       func,
 923       make_xmm( xmm ),
 924       make_xmm( xmm ) );
 925 }
 926
 927 static void
 928 emit_i2f(
 929    struct x86_function *func,
 930    unsigned xmm )
 931 {
 932    sse2_cvtdq2ps(
 933       func,
 934       make_xmm( xmm ),
 935       make_xmm( xmm ) );
 936 }
 937
 938 static void PIPE_CDECL
 939 flr4f(
 940    float *store )
 941 {
 942    store[0] = floorf( store[0] );
 943    store[1] = floorf( store[1] );
 944    store[2] = floorf( store[2] );
 945    store[3] = floorf( store[3] );
 946 }
 947
 948 static void
 949 emit_flr(
 950    struct x86_function *func,
 951    unsigned xmm_save,
 952    unsigned xmm_dst )
 953 {
 954    emit_func_call_dst_src1(
 955       func,
 956       xmm_save,
 957       xmm_dst,
 958       xmm_dst,
 959       flr4f );
 960 }
 961
 962 static void PIPE_CDECL
 963 frc4f(
 964    float *store )
 965 {
 966    store[0] -= floorf( store[0] );
 967    store[1] -= floorf( store[1] );
 968    store[2] -= floorf( store[2] );
 969    store[3] -= floorf( store[3] );
 970 }
 971
 972 static void
 973 emit_frc(
 974    struct x86_function *func,
 975    unsigned xmm_save,
 976    unsigned xmm_dst )
 977 {
 978    emit_func_call_dst_src1(
 979       func,
 980       xmm_save,
 981       xmm_dst,
 982       xmm_dst,
 983       frc4f );
 984 }
 985
 986 static void PIPE_CDECL
 987 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
 988 __attribute__((force_align_arg_pointer))
 989 #endif
 990 lg24f(
 991    float *store )
 992 {
 993 #if defined(PIPE_ARCH_SSE)
 994    _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
 995 #else
 996    store[0] = util_fast_log2( store[0] );
 997    store[1] = util_fast_log2( store[1] );
 998    store[2] = util_fast_log2( store[2] );
 999    store[3] = util_fast_log2( store[3] );
1000 #endif
1001 }
1002
1003 static void
1004 emit_lg2(
1005    struct x86_function *func,
1006    unsigned xmm_save,
1007    unsigned xmm_dst )
1008 {
1009    emit_func_call_dst_src1(
1010       func,
1011       xmm_save,
1012       xmm_dst,
1013       xmm_dst,
1014       lg24f );
1015 }
1016
1017 static void
1018 emit_MOV(
1019    struct x86_function *func,
1020    unsigned xmm_dst,
1021    unsigned xmm_src )
1022 {
1023    sse_movups(
1024       func,
1025       make_xmm( xmm_dst ),
1026       make_xmm( xmm_src ) );
1027 }
1028
1029 static void
1030 emit_mul (struct x86_function *func,
1031           unsigned xmm_dst,
1032           unsigned xmm_src)
1033 {
1034    sse_mulps(
1035       func,
1036       make_xmm( xmm_dst ),
1037       make_xmm( xmm_src ) );
1038 }
1039
1040 static void
1041 emit_neg(
1042    struct x86_function *func,
1043    unsigned xmm )
1044 {
1045    sse_xorps(
1046       func,
1047       make_xmm( xmm ),
1048       get_temp(
1049          TGSI_EXEC_TEMP_80000000_I,
1050          TGSI_EXEC_TEMP_80000000_C ) );
1051 }
1052
1053 static void PIPE_CDECL
1054 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1055 __attribute__((force_align_arg_pointer))
1056 #endif
1057 pow4f(
1058    float *store )
1059 {
1060 #if defined(PIPE_ARCH_SSE)
1061    _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
1062 #else
1063    store[0] = util_fast_pow( store[0], store[4] );
1064    store[1] = util_fast_pow( store[1], store[5] );
1065    store[2] = util_fast_pow( store[2], store[6] );
1066    store[3] = util_fast_pow( store[3], store[7] );
1067 #endif
1068 }
1069
1070 static void
1071 emit_pow(
1072    struct x86_function *func,
1073    unsigned xmm_save,
1074    unsigned xmm_dst,
1075    unsigned xmm_src0,
1076    unsigned xmm_src1 )
1077 {
1078    emit_func_call_dst_src2(
1079       func,
1080       xmm_save,
1081       xmm_dst,
1082       xmm_src0,
1083       xmm_src1,
1084       pow4f );
1085 }
1086
1087 static void
1088 emit_rcp (
1089    struct x86_function *func,
1090    unsigned xmm_dst,
1091    unsigned xmm_src )
1092 {
1093    /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1094     * good enough.  Need to either emit a proper divide or use the
1095     * iterative technique described below in emit_rsqrt().
1096     */
1097    sse2_rcpps(
1098       func,
1099       make_xmm( xmm_dst ),
1100       make_xmm( xmm_src ) );
1101 }
1102
1103 static void PIPE_CDECL
1104 rnd4f(
1105    float *store )
1106 {
1107    store[0] = floorf( store[0] + 0.5f );
1108    store[1] = floorf( store[1] + 0.5f );
1109    store[2] = floorf( store[2] + 0.5f );
1110    store[3] = floorf( store[3] + 0.5f );
1111 }
1112
1113 static void
1114 emit_rnd(
1115    struct x86_function *func,
1116    unsigned xmm_save,
1117    unsigned xmm_dst )
1118 {
1119    emit_func_call_dst_src1(
1120       func,
1121       xmm_save,
1122       xmm_dst,
1123       xmm_dst,
1124       rnd4f );
1125 }
1126
/**
 * Emit xmm_dst = 1/sqrt(xmm_src).
 *
 * With HIGH_PRECISION set, the emitted sequence overwrites xmm2 and xmm3
 * as scratch and also overwrites xmm_src; neither xmm_dst nor xmm_src may
 * be register 2 or 3, and they must differ from each other (all asserted).
 * Without HIGH_PRECISION a single rsqrtps is emitted.
 *
 * \param xmm_dst  the destination xmm register
 * \param xmm_src  the source xmm register
 */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * The first formula is the step for the reciprocal; the second one,
    * for the reciprocal square root, is what is emitted here.
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      /* NOTE(review): the HALF and THREE temp slots presumably hold 0.5
       * and 3.0 in every element -- confirm where they are initialised.
       */
      sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );    /* dst  = HALF */
      sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );  /* tmp0 = THREE */
      sse_rsqrtps( func, tmp1, src  );   /* tmp1 = r = rsqrtps(a) */
      sse_mulps(   func, src,  tmp1 );   /* src  = a * r */
      sse_mulps(   func, dst,  tmp1 );   /* dst  = HALF * r */
      sse_mulps(   func, src,  tmp1 );   /* src  = a * r * r */
      sse_subps(   func, tmp0, src  );   /* tmp0 = THREE - a * r * r */
      sse_mulps(   func, dst,  tmp0 );   /* dst  = HALF * r * (THREE - a * r * r) */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
1172
1173 static void
1174 emit_setsign(
1175    struct x86_function *func,
1176    unsigned xmm )
1177 {
1178    sse_orps(
1179       func,
1180       make_xmm( xmm ),
1181       get_temp(
1182          TGSI_EXEC_TEMP_80000000_I,
1183          TGSI_EXEC_TEMP_80000000_C ) );
1184 }
1185
/**
 * Replace each of the four floats in 'store' with its sign: -1.0 for
 * negative values, 1.0 for positive values and 0.0 otherwise.
 * Called from generated code (see emit_sgn()).
 */
static void PIPE_CDECL
sgn4f(
   float *store )
{
   unsigned i;

   for (i = 0; i < 4; i++) {
      const float value = store[i];
      float sign = 0.0f;

      if (value < 0.0f) {
         sign = -1.0f;
      }
      else if (value > 0.0f) {
         sign = 1.0f;
      }

      store[i] = sign;
   }
}
1195
1196 static void
1197 emit_sgn(
1198    struct x86_function *func,
1199    unsigned xmm_save,
1200    unsigned xmm_dst )
1201 {
1202    emit_func_call_dst_src1(
1203       func,
1204       xmm_save,
1205       xmm_dst,
1206       xmm_dst,
1207       sgn4f );
1208 }
1209
1210 static void PIPE_CDECL
1211 sin4f(
1212    float *store )
1213 {
1214    store[0] = sinf( store[0] );
1215    store[1] = sinf( store[1] );
1216    store[2] = sinf( store[2] );
1217    store[3] = sinf( store[3] );
1218 }
1219
1220 static void
1221 emit_sin (struct x86_function *func,
1222           unsigned xmm_save,
1223           unsigned xmm_dst)
1224 {
1225    emit_func_call_dst_src1(
1226       func,
1227       xmm_save,
1228       xmm_dst,
1229       xmm_dst,
1230       sin4f );
1231 }
1232
1233 static void
1234 emit_sub(
1235    struct x86_function *func,
1236    unsigned xmm_dst,
1237    unsigned xmm_src )
1238 {
1239    sse_subps(
1240       func,
1241       make_xmm( xmm_dst ),
1242       make_xmm( xmm_src ) );
1243 }
1244
1245
1246
1247
1248
1249
1250
1251 /**
1252  * Register fetch.
1253  */
1254
1255 static void
1256 emit_fetch(
1257    struct x86_function *func,
1258    unsigned xmm,
1259    const struct tgsi_full_src_register *reg,
1260    const unsigned chan_index )
1261 {
1262    unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1263
1264    switch (swizzle) {
1265    case TGSI_EXTSWIZZLE_X:
1266    case TGSI_EXTSWIZZLE_Y:
1267    case TGSI_EXTSWIZZLE_Z:
1268    case TGSI_EXTSWIZZLE_W:
1269       switch (reg->SrcRegister.File) {
1270       case TGSI_FILE_CONSTANT:
1271          emit_const(
1272             func,
1273             xmm,
1274             reg->SrcRegister.Index,
1275             swizzle,
1276             reg->SrcRegister.Indirect,
1277             reg->SrcRegisterInd.File,
1278             reg->SrcRegisterInd.Index );
1279          break;
1280
1281       case TGSI_FILE_IMMEDIATE:
1282          emit_immediate(
1283             func,
1284             xmm,
1285             reg->SrcRegister.Index,
1286             swizzle );
1287          break;
1288
1289       case TGSI_FILE_INPUT:
1290          emit_inputf(
1291             func,
1292             xmm,
1293             reg->SrcRegister.Index,
1294             swizzle );
1295          break;
1296
1297       case TGSI_FILE_TEMPORARY:
1298          emit_tempf(
1299             func,
1300             xmm,
1301             reg->SrcRegister.Index,
1302             swizzle );
1303          break;
1304
1305       default:
1306          assert( 0 );
1307       }
1308       break;
1309
1310    case TGSI_EXTSWIZZLE_ZERO:
1311       emit_tempf(
1312          func,
1313          xmm,
1314          TGSI_EXEC_TEMP_00000000_I,
1315          TGSI_EXEC_TEMP_00000000_C );
1316       break;
1317
1318    case TGSI_EXTSWIZZLE_ONE:
1319       emit_tempf(
1320          func,
1321          xmm,
1322          TEMP_ONE_I,
1323          TEMP_ONE_C );
1324       break;
1325
1326    default:
1327       assert( 0 );
1328    }
1329
1330    switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
1331    case TGSI_UTIL_SIGN_CLEAR:
1332       emit_abs( func, xmm );
1333       break;
1334
1335    case TGSI_UTIL_SIGN_SET:
1336       emit_setsign( func, xmm );
1337       break;
1338
1339    case TGSI_UTIL_SIGN_TOGGLE:
1340       emit_neg( func, xmm );
1341       break;
1342
1343    case TGSI_UTIL_SIGN_KEEP:
1344       break;
1345    }
1346 }
1347
/* Convenience wrapper around emit_fetch(): fetch channel CHAN of source
 * operand number INDEX of instruction INST into XMM register number XMM.
 */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1350
1351 /**
1352  * Register store.
1353  */
1354
1355 static void
1356 emit_store(
1357    struct x86_function *func,
1358    unsigned xmm,
1359    const struct tgsi_full_dst_register *reg,
1360    const struct tgsi_full_instruction *inst,
1361    unsigned chan_index )
1362 {
1363    switch( reg->DstRegister.File ) {
1364    case TGSI_FILE_OUTPUT:
1365       emit_output(
1366          func,
1367          xmm,
1368          reg->DstRegister.Index,
1369          chan_index );
1370       break;
1371
1372    case TGSI_FILE_TEMPORARY:
1373       emit_temps(
1374          func,
1375          xmm,
1376          reg->DstRegister.Index,
1377          chan_index );
1378       break;
1379
1380    case TGSI_FILE_ADDRESS:
1381       emit_addrs(
1382          func,
1383          xmm,
1384          reg->DstRegister.Index,
1385          chan_index );
1386       break;
1387
1388    default:
1389       assert( 0 );
1390    }
1391
1392    switch( inst->Instruction.Saturate ) {
1393    case TGSI_SAT_NONE:
1394       break;
1395
1396    case TGSI_SAT_ZERO_ONE:
1397       /* assert( 0 ); */
1398       break;
1399
1400    case TGSI_SAT_MINUS_PLUS_ONE:
1401       assert( 0 );
1402       break;
1403    }
1404 }
1405
/* Convenience wrapper around emit_store(): store XMM register number XMM
 * to channel CHAN of destination operand number INDEX of instruction INST.
 * INST is also passed along as the instruction argument of emit_store().
 */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1408
1409
1410 static void PIPE_CDECL
1411 fetch_texel( struct tgsi_sampler **sampler,
1412              float *store )
1413 {
1414 #if 0
1415    uint j;
1416
1417    debug_printf("%s sampler: %p (%p) store: %p\n",
1418                 __FUNCTION__,
1419                 sampler, *sampler,
1420                 store );
1421
1422    debug_printf("lodbias %f\n", store[12]);
1423
1424    for (j = 0; j < 4; j++)
1425       debug_printf("sample %d texcoord %f %f\n",
1426                    j,
1427                    store[0+j],
1428                    store[4+j]);
1429 #endif
1430
1431    {
1432       float rgba[NUM_CHANNELS][QUAD_SIZE];
1433       (*sampler)->get_samples(*sampler,
1434                               &store[0],
1435                               &store[4],
1436                               &store[8],
1437                               0.0f, /*store[12],  lodbias */
1438                               rgba);
1439
1440       memcpy( store, rgba, 16 * sizeof(float));
1441    }
1442
1443 #if 0
1444    for (j = 0; j < 4; j++)
1445       debug_printf("sample %d result %f %f %f %f\n",
1446                    j,
1447                    store[0+j],
1448                    store[4+j],
1449                    store[8+j],
1450                    store[12+j]);
1451 #endif
1452 }
1453
1454 /**
1455  * High-level instruction translators.
1456  */
1457
1458 static void
1459 emit_tex( struct x86_function *func,
1460           const struct tgsi_full_instruction *inst,
1461           boolean lodbias,
1462           boolean projected)
1463 {
1464    const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
1465    struct x86_reg args[2];
1466    unsigned count;
1467    unsigned i;
1468
1469    switch (inst->InstructionExtTexture.Texture) {
1470    case TGSI_TEXTURE_1D:
1471       count = 1;
1472       break;
1473    case TGSI_TEXTURE_2D:
1474    case TGSI_TEXTURE_RECT:
1475       count = 2;
1476       break;
1477    case TGSI_TEXTURE_SHADOW1D:
1478    case TGSI_TEXTURE_SHADOW2D:
1479    case TGSI_TEXTURE_SHADOWRECT:
1480    case TGSI_TEXTURE_3D:
1481    case TGSI_TEXTURE_CUBE:
1482       count = 3;
1483       break;
1484    default:
1485       assert(0);
1486       return;
1487    }
1488
1489    if (lodbias) {
1490       FETCH( func, *inst, 3, 0, 3 );
1491    }
1492    else {
1493       emit_tempf(
1494          func,
1495          3,
1496          TGSI_EXEC_TEMP_00000000_I,
1497          TGSI_EXEC_TEMP_00000000_C );
1498
1499    }
1500
1501    /* store lodbias whether enabled or not -- fetch_texel currently
1502     * respects it always.
1503     */
1504    sse_movaps( func,
1505                get_temp( TEMP_R0, 3 ),
1506                make_xmm( 3 ) );
1507
1508
1509    if (projected) {
1510       FETCH( func, *inst, 3, 0, 3 );
1511
1512       emit_rcp( func, 3, 3 );
1513    }
1514
1515    for (i = 0; i < count; i++) {
1516       FETCH( func, *inst, i, 0, i );
1517
1518       if (projected) {
1519          sse_mulps(
1520             func,
1521             make_xmm( i ),
1522             make_xmm( 3 ) );
1523       }
1524
1525       /* Store in the argument buffer:
1526        */
1527       sse_movaps(
1528          func,
1529          get_temp( TEMP_R0, i ),
1530          make_xmm( i ) );
1531    }
1532
1533    args[0] = get_temp( TEMP_R0, 0 );
1534    args[1] = get_sampler_ptr( unit );
1535
1536
1537    emit_func_call( func,
1538                    0,
1539                    args,
1540                    Elements(args),
1541                    fetch_texel );
1542
1543    /* If all four channels are enabled, could use a pointer to
1544     * dst[0].x instead of TEMP_R0 for store?
1545     */
1546    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {
1547
1548       sse_movaps(
1549          func,
1550          make_xmm( 0 ),
1551          get_temp( TEMP_R0, i ) );
1552
1553       STORE( func, *inst, 0, 0, i );
1554    }
1555 }
1556
1557
1558 static void
1559 emit_kil(
1560    struct x86_function *func,
1561    const struct tgsi_full_src_register *reg )
1562 {
1563    unsigned uniquemask;
1564    unsigned unique_count = 0;
1565    unsigned chan_index;
1566    unsigned i;
1567
1568    /* This mask stores component bits that were already tested. Note that
1569     * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1570     * tested. */
1571    uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1572
1573    FOR_EACH_CHANNEL( chan_index ) {
1574       unsigned swizzle;
1575
1576       /* unswizzle channel */
1577       swizzle = tgsi_util_get_full_src_register_extswizzle(
1578          reg,
1579          chan_index );
1580
1581       /* check if the component has not been already tested */
1582       if( !(uniquemask & (1 << swizzle)) ) {
1583          uniquemask |= 1 << swizzle;
1584
1585          /* allocate register */
1586          emit_fetch(
1587             func,
1588             unique_count++,
1589             reg,
1590             chan_index );
1591       }
1592    }
1593
1594    x86_push(
1595       func,
1596       x86_make_reg( file_REG32, reg_AX ) );
1597    x86_push(
1598       func,
1599       x86_make_reg( file_REG32, reg_DX ) );
1600
1601    for (i = 0 ; i < unique_count; i++ ) {
1602       struct x86_reg dataXMM = make_xmm(i);
1603
1604       sse_cmpps(
1605          func,
1606          dataXMM,
1607          get_temp(
1608             TGSI_EXEC_TEMP_00000000_I,
1609             TGSI_EXEC_TEMP_00000000_C ),
1610          cc_LessThan );
1611
1612       if( i == 0 ) {
1613          sse_movmskps(
1614             func,
1615             x86_make_reg( file_REG32, reg_AX ),
1616             dataXMM );
1617       }
1618       else {
1619          sse_movmskps(
1620             func,
1621             x86_make_reg( file_REG32, reg_DX ),
1622             dataXMM );
1623          x86_or(
1624             func,
1625             x86_make_reg( file_REG32, reg_AX ),
1626             x86_make_reg( file_REG32, reg_DX ) );
1627       }
1628    }
1629
1630    x86_or(
1631       func,
1632       get_temp(
1633          TGSI_EXEC_TEMP_KILMASK_I,
1634          TGSI_EXEC_TEMP_KILMASK_C ),
1635       x86_make_reg( file_REG32, reg_AX ) );
1636
1637    x86_pop(
1638       func,
1639       x86_make_reg( file_REG32, reg_DX ) );
1640    x86_pop(
1641       func,
1642       x86_make_reg( file_REG32, reg_AX ) );
1643 }
1644
1645
/**
 * Predicated kill (KILP).  Not implemented: no code is emitted.
 */
static void
emit_kilp( struct x86_function *func )
{
   /* XXX todo / fix me */
   (void) func;
}
1652
1653
1654 static void
1655 emit_setcc(
1656    struct x86_function *func,
1657    struct tgsi_full_instruction *inst,
1658    enum sse_cc cc )
1659 {
1660    unsigned chan_index;
1661
1662    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1663       FETCH( func, *inst, 0, 0, chan_index );
1664       FETCH( func, *inst, 1, 1, chan_index );
1665       sse_cmpps(
1666          func,
1667          make_xmm( 0 ),
1668          make_xmm( 1 ),
1669          cc );
1670       sse_andps(
1671          func,
1672          make_xmm( 0 ),
1673          get_temp(
1674             TEMP_ONE_I,
1675             TEMP_ONE_C ) );
1676       STORE( func, *inst, 0, 0, chan_index );
1677    }
1678 }
1679
1680 static void
1681 emit_cmp(
1682    struct x86_function *func,
1683    struct tgsi_full_instruction *inst )
1684 {
1685    unsigned chan_index;
1686
1687    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1688       FETCH( func, *inst, 0, 0, chan_index );
1689       FETCH( func, *inst, 1, 1, chan_index );
1690       FETCH( func, *inst, 2, 2, chan_index );
1691       sse_cmpps(
1692          func,
1693          make_xmm( 0 ),
1694          get_temp(
1695             TGSI_EXEC_TEMP_00000000_I,
1696             TGSI_EXEC_TEMP_00000000_C ),
1697          cc_LessThan );
1698       sse_andps(
1699          func,
1700          make_xmm( 1 ),
1701          make_xmm( 0 ) );
1702       sse_andnps(
1703          func,
1704          make_xmm( 0 ),
1705          make_xmm( 2 ) );
1706       sse_orps(
1707          func,
1708          make_xmm( 0 ),
1709          make_xmm( 1 ) );
1710       STORE( func, *inst, 0, 0, chan_index );
1711    }
1712 }
1713
1714
1715 /**
1716  * Check if inst src/dest regs use indirect addressing into temporary
1717  * register file.
1718  */
1719 static boolean
1720 indirect_temp_reference(const struct tgsi_full_instruction *inst)
1721 {
1722    uint i;
1723    for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1724       const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[i];
1725       if (reg->SrcRegister.File == TGSI_FILE_TEMPORARY &&
1726           reg->SrcRegister.Indirect)
1727          return TRUE;
1728    }
1729    for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
1730       const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[i];
1731       if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
1732           reg->DstRegister.Indirect)
1733          return TRUE;
1734    }
1735    return FALSE;
1736 }
1737
1738
1739 static int
1740 emit_instruction(
1741    struct x86_function *func,
1742    struct tgsi_full_instruction *inst )
1743 {
1744    unsigned chan_index;
1745
1746    /* we can't handle indirect addressing into temp register file yet */
1747    if (indirect_temp_reference(inst))
1748       return FALSE;
1749
1750    switch (inst->Instruction.Opcode) {
1751    case TGSI_OPCODE_ARL:
1752       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1753          FETCH( func, *inst, 0, 0, chan_index );
1754          emit_flr(func, 0, 0);
1755          emit_f2it( func, 0 );
1756          STORE( func, *inst, 0, 0, chan_index );
1757       }
1758       break;
1759
1760    case TGSI_OPCODE_MOV:
1761    case TGSI_OPCODE_SWZ:
1762       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1763          FETCH( func, *inst, 0, 0, chan_index );
1764          STORE( func, *inst, 0, 0, chan_index );
1765       }
1766       break;
1767
1768    case TGSI_OPCODE_LIT:
1769       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1770           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1771          emit_tempf(
1772             func,
1773             0,
1774             TEMP_ONE_I,
1775             TEMP_ONE_C);
1776          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1777             STORE( func, *inst, 0, 0, CHAN_X );
1778          }
1779          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1780             STORE( func, *inst, 0, 0, CHAN_W );
1781          }
1782       }
1783       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1784           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1785          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1786             FETCH( func, *inst, 0, 0, CHAN_X );
1787             sse_maxps(
1788                func,
1789                make_xmm( 0 ),
1790                get_temp(
1791                   TGSI_EXEC_TEMP_00000000_I,
1792                   TGSI_EXEC_TEMP_00000000_C ) );
1793             STORE( func, *inst, 0, 0, CHAN_Y );
1794          }
1795          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1796             /* XMM[1] = SrcReg[0].yyyy */
1797             FETCH( func, *inst, 1, 0, CHAN_Y );
1798             /* XMM[1] = max(XMM[1], 0) */
1799             sse_maxps(
1800                func,
1801                make_xmm( 1 ),
1802                get_temp(
1803                   TGSI_EXEC_TEMP_00000000_I,
1804                   TGSI_EXEC_TEMP_00000000_C ) );
1805             /* XMM[2] = SrcReg[0].wwww */
1806             FETCH( func, *inst, 2, 0, CHAN_W );
1807             /* XMM[2] = min(XMM[2], 128.0) */
1808             sse_minps(
1809                func,
1810                make_xmm( 2 ),
1811                get_temp(
1812                   TGSI_EXEC_TEMP_128_I,
1813                   TGSI_EXEC_TEMP_128_C ) );
1814             /* XMM[2] = max(XMM[2], -128.0) */
1815             sse_maxps(
1816                func,
1817                make_xmm( 2 ),
1818                get_temp(
1819                   TGSI_EXEC_TEMP_MINUS_128_I,
1820                   TGSI_EXEC_TEMP_MINUS_128_C ) );
1821             emit_pow( func, 3, 1, 1, 2 );
1822             FETCH( func, *inst, 0, 0, CHAN_X );
1823             sse_xorps(
1824                func,
1825                make_xmm( 2 ),
1826                make_xmm( 2 ) );
1827             sse_cmpps(
1828                func,
1829                make_xmm( 2 ),
1830                make_xmm( 0 ),
1831                cc_LessThan );
1832             sse_andps(
1833                func,
1834                make_xmm( 2 ),
1835                make_xmm( 1 ) );
1836             STORE( func, *inst, 2, 0, CHAN_Z );
1837          }
1838       }
1839       break;
1840
1841    case TGSI_OPCODE_RCP:
1842    /* TGSI_OPCODE_RECIP */
1843       FETCH( func, *inst, 0, 0, CHAN_X );
1844       emit_rcp( func, 0, 0 );
1845       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1846          STORE( func, *inst, 0, 0, chan_index );
1847       }
1848       break;
1849
1850    case TGSI_OPCODE_RSQ:
1851    /* TGSI_OPCODE_RECIPSQRT */
1852       FETCH( func, *inst, 0, 0, CHAN_X );
1853       emit_abs( func, 0 );
1854       emit_rsqrt( func, 1, 0 );
1855       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1856          STORE( func, *inst, 1, 0, chan_index );
1857       }
1858       break;
1859
1860    case TGSI_OPCODE_EXP:
1861       if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1862           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1863           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1864          FETCH( func, *inst, 0, 0, CHAN_X );
1865          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1866              IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1867             emit_MOV( func, 1, 0 );
1868             emit_flr( func, 2, 1 );
1869             /* dst.x = ex2(floor(src.x)) */
1870             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1871                emit_MOV( func, 2, 1 );
1872                emit_ex2( func, 3, 2 );
1873                STORE( func, *inst, 2, 0, CHAN_X );
1874             }
1875             /* dst.y = src.x - floor(src.x) */
1876             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1877                emit_MOV( func, 2, 0 );
1878                emit_sub( func, 2, 1 );
1879                STORE( func, *inst, 2, 0, CHAN_Y );
1880             }
1881          }
1882          /* dst.z = ex2(src.x) */
1883          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1884             emit_ex2( func, 3, 0 );
1885             STORE( func, *inst, 0, 0, CHAN_Z );
1886          }
1887       }
1888       /* dst.w = 1.0 */
1889       if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1890          emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1891          STORE( func, *inst, 0, 0, CHAN_W );
1892       }
1893       break;
1894
1895    case TGSI_OPCODE_LOG:
1896       if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1897           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1898           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1899          FETCH( func, *inst, 0, 0, CHAN_X );
1900          emit_abs( func, 0 );
1901          emit_MOV( func, 1, 0 );
1902          emit_lg2( func, 2, 1 );
1903          /* dst.z = lg2(abs(src.x)) */
1904          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1905             STORE( func, *inst, 1, 0, CHAN_Z );
1906          }
1907          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1908              IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1909             emit_flr( func, 2, 1 );
1910             /* dst.x = floor(lg2(abs(src.x))) */
1911             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1912                STORE( func, *inst, 1, 0, CHAN_X );
1913             }
1914             /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1915             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1916                emit_ex2( func, 2, 1 );
1917                emit_rcp( func, 1, 1 );
1918                emit_mul( func, 0, 1 );
1919                STORE( func, *inst, 0, 0, CHAN_Y );
1920             }
1921          }
1922       }
1923       /* dst.w = 1.0 */
1924       if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1925          emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1926          STORE( func, *inst, 0, 0, CHAN_W );
1927       }
1928       break;
1929
1930    case TGSI_OPCODE_MUL:
1931       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1932          FETCH( func, *inst, 0, 0, chan_index );
1933          FETCH( func, *inst, 1, 1, chan_index );
1934          emit_mul( func, 0, 1 );
1935          STORE( func, *inst, 0, 0, chan_index );
1936       }
1937       break;
1938
1939    case TGSI_OPCODE_ADD:
1940       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1941          FETCH( func, *inst, 0, 0, chan_index );
1942          FETCH( func, *inst, 1, 1, chan_index );
1943          emit_add( func, 0, 1 );
1944          STORE( func, *inst, 0, 0, chan_index );
1945       }
1946       break;
1947
1948    case TGSI_OPCODE_DP3:
1949    /* TGSI_OPCODE_DOT3 */
1950       FETCH( func, *inst, 0, 0, CHAN_X );
1951       FETCH( func, *inst, 1, 1, CHAN_X );
1952       emit_mul( func, 0, 1 );
1953       FETCH( func, *inst, 1, 0, CHAN_Y );
1954       FETCH( func, *inst, 2, 1, CHAN_Y );
1955       emit_mul( func, 1, 2 );
1956       emit_add( func, 0, 1 );
1957       FETCH( func, *inst, 1, 0, CHAN_Z );
1958       FETCH( func, *inst, 2, 1, CHAN_Z );
1959       emit_mul( func, 1, 2 );
1960       emit_add( func, 0, 1 );
1961       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1962          STORE( func, *inst, 0, 0, chan_index );
1963       }
1964       break;
1965
1966    case TGSI_OPCODE_DP4:
1967    /* TGSI_OPCODE_DOT4 */
1968       FETCH( func, *inst, 0, 0, CHAN_X );
1969       FETCH( func, *inst, 1, 1, CHAN_X );
1970       emit_mul( func, 0, 1 );
1971       FETCH( func, *inst, 1, 0, CHAN_Y );
1972       FETCH( func, *inst, 2, 1, CHAN_Y );
1973       emit_mul( func, 1, 2 );
1974       emit_add( func, 0, 1 );
1975       FETCH( func, *inst, 1, 0, CHAN_Z );
1976       FETCH( func, *inst, 2, 1, CHAN_Z );
1977       emit_mul(func, 1, 2 );
1978       emit_add(func, 0, 1 );
1979       FETCH( func, *inst, 1, 0, CHAN_W );
1980       FETCH( func, *inst, 2, 1, CHAN_W );
1981       emit_mul( func, 1, 2 );
1982       emit_add( func, 0, 1 );
1983       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1984          STORE( func, *inst, 0, 0, chan_index );
1985       }
1986       break;
1987
1988    case TGSI_OPCODE_DST:
1989       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1990          emit_tempf(
1991             func,
1992             0,
1993             TEMP_ONE_I,
1994             TEMP_ONE_C );
1995          STORE( func, *inst, 0, 0, CHAN_X );
1996       }
1997       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1998          FETCH( func, *inst, 0, 0, CHAN_Y );
1999          FETCH( func, *inst, 1, 1, CHAN_Y );
2000          emit_mul( func, 0, 1 );
2001          STORE( func, *inst, 0, 0, CHAN_Y );
2002       }
2003       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2004          FETCH( func, *inst, 0, 0, CHAN_Z );
2005          STORE( func, *inst, 0, 0, CHAN_Z );
2006       }
2007       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2008          FETCH( func, *inst, 0, 1, CHAN_W );
2009          STORE( func, *inst, 0, 0, CHAN_W );
2010       }
2011       break;
2012
2013    case TGSI_OPCODE_MIN:
2014       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2015          FETCH( func, *inst, 0, 0, chan_index );
2016          FETCH( func, *inst, 1, 1, chan_index );
2017          sse_minps(
2018             func,
2019             make_xmm( 0 ),
2020             make_xmm( 1 ) );
2021          STORE( func, *inst, 0, 0, chan_index );
2022       }
2023       break;
2024
2025    case TGSI_OPCODE_MAX:
2026       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2027          FETCH( func, *inst, 0, 0, chan_index );
2028          FETCH( func, *inst, 1, 1, chan_index );
2029          sse_maxps(
2030             func,
2031             make_xmm( 0 ),
2032             make_xmm( 1 ) );
2033          STORE( func, *inst, 0, 0, chan_index );
2034       }
2035       break;
2036
2037    case TGSI_OPCODE_SLT:
2038    /* TGSI_OPCODE_SETLT */
2039       emit_setcc( func, inst, cc_LessThan );
2040       break;
2041
2042    case TGSI_OPCODE_SGE:
2043    /* TGSI_OPCODE_SETGE */
2044       emit_setcc( func, inst, cc_NotLessThan );
2045       break;
2046
2047    case TGSI_OPCODE_MAD:
2048    /* TGSI_OPCODE_MADD */
2049       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2050          FETCH( func, *inst, 0, 0, chan_index );
2051          FETCH( func, *inst, 1, 1, chan_index );
2052          FETCH( func, *inst, 2, 2, chan_index );
2053          emit_mul( func, 0, 1 );
2054          emit_add( func, 0, 2 );
2055          STORE( func, *inst, 0, 0, chan_index );
2056       }
2057       break;
2058
2059    case TGSI_OPCODE_SUB:
2060       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2061          FETCH( func, *inst, 0, 0, chan_index );
2062          FETCH( func, *inst, 1, 1, chan_index );
2063          emit_sub( func, 0, 1 );
2064          STORE( func, *inst, 0, 0, chan_index );
2065       }
2066       break;
2067
2068    case TGSI_OPCODE_LRP:
2069       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2070          FETCH( func, *inst, 0, 0, chan_index );
2071          FETCH( func, *inst, 1, 1, chan_index );
2072          FETCH( func, *inst, 2, 2, chan_index );
2073          emit_sub( func, 1, 2 );
2074          emit_mul( func, 0, 1 );
2075          emit_add( func, 0, 2 );
2076          STORE( func, *inst, 0, 0, chan_index );
2077       }
2078       break;
2079
2080    case TGSI_OPCODE_CND:
2081       return 0;
2082       break;
2083
2084    case TGSI_OPCODE_CND0:
2085       return 0;
2086       break;
2087
2088    case TGSI_OPCODE_DP2A:
2089       FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
2090       FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
2091       emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
2092       FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
2093       FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
2094       emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
2095       emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
2096       FETCH( func, *inst, 1, 2, CHAN_X );  /* xmm1 = src[2].x */
2097       emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
2098       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2099          STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
2100       }
2101       break;
2102
2103    case TGSI_OPCODE_FRC:
2104       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2105          FETCH( func, *inst, 0, 0, chan_index );
2106          emit_frc( func, 0, 0 );
2107          STORE( func, *inst, 0, 0, chan_index );
2108       }
2109       break;
2110
2111    case TGSI_OPCODE_CLAMP:
2112       return 0;
2113       break;
2114
2115    case TGSI_OPCODE_FLR:
2116       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2117          FETCH( func, *inst, 0, 0, chan_index );
2118          emit_flr( func, 0, 0 );
2119          STORE( func, *inst, 0, 0, chan_index );
2120       }
2121       break;
2122
2123    case TGSI_OPCODE_ROUND:
2124       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2125          FETCH( func, *inst, 0, 0, chan_index );
2126          emit_rnd( func, 0, 0 );
2127          STORE( func, *inst, 0, 0, chan_index );
2128       }
2129       break;
2130
2131    case TGSI_OPCODE_EX2:
2132       FETCH( func, *inst, 0, 0, CHAN_X );
2133       emit_ex2( func, 0, 0 );
2134       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2135          STORE( func, *inst, 0, 0, chan_index );
2136       }
2137       break;
2138
2139    case TGSI_OPCODE_LG2:
2140       FETCH( func, *inst, 0, 0, CHAN_X );
2141       emit_lg2( func, 0, 0 );
2142       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2143          STORE( func, *inst, 0, 0, chan_index );
2144       }
2145       break;
2146
2147    case TGSI_OPCODE_POW:
2148       FETCH( func, *inst, 0, 0, CHAN_X );
2149       FETCH( func, *inst, 1, 1, CHAN_X );
2150       emit_pow( func, 0, 0, 0, 1 );
2151       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2152          STORE( func, *inst, 0, 0, chan_index );
2153       }
2154       break;
2155
2156    case TGSI_OPCODE_XPD:
2157       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2158           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2159          FETCH( func, *inst, 1, 1, CHAN_Z );
2160          FETCH( func, *inst, 3, 0, CHAN_Z );
2161       }
2162       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2163           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2164          FETCH( func, *inst, 0, 0, CHAN_Y );
2165          FETCH( func, *inst, 4, 1, CHAN_Y );
2166       }
2167       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2168          emit_MOV( func, 2, 0 );
2169          emit_mul( func, 2, 1 );
2170          emit_MOV( func, 5, 3 );
2171          emit_mul( func, 5, 4 );
2172          emit_sub( func, 2, 5 );
2173          STORE( func, *inst, 2, 0, CHAN_X );
2174       }
2175       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2176           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2177          FETCH( func, *inst, 2, 1, CHAN_X );
2178          FETCH( func, *inst, 5, 0, CHAN_X );
2179       }
2180       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2181          emit_mul( func, 3, 2 );
2182          emit_mul( func, 1, 5 );
2183          emit_sub( func, 3, 1 );
2184          STORE( func, *inst, 3, 0, CHAN_Y );
2185       }
2186       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2187          emit_mul( func, 5, 4 );
2188          emit_mul( func, 0, 2 );
2189          emit_sub( func, 5, 0 );
2190          STORE( func, *inst, 5, 0, CHAN_Z );
2191       }
2192       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2193          emit_tempf(
2194             func,
2195             0,
2196             TEMP_ONE_I,
2197             TEMP_ONE_C );
2198          STORE( func, *inst, 0, 0, CHAN_W );
2199       }
2200       break;
2201
2202    case TGSI_OPCODE_ABS:
2203       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2204          FETCH( func, *inst, 0, 0, chan_index );
2205          emit_abs( func, 0) ;
2206
2207          STORE( func, *inst, 0, 0, chan_index );
2208       }
2209       break;
2210
2211    case TGSI_OPCODE_RCC:
2212       return 0;
2213       break;
2214
2215    case TGSI_OPCODE_DPH:
2216       FETCH( func, *inst, 0, 0, CHAN_X );
2217       FETCH( func, *inst, 1, 1, CHAN_X );
2218       emit_mul( func, 0, 1 );
2219       FETCH( func, *inst, 1, 0, CHAN_Y );
2220       FETCH( func, *inst, 2, 1, CHAN_Y );
2221       emit_mul( func, 1, 2 );
2222       emit_add( func, 0, 1 );
2223       FETCH( func, *inst, 1, 0, CHAN_Z );
2224       FETCH( func, *inst, 2, 1, CHAN_Z );
2225       emit_mul( func, 1, 2 );
2226       emit_add( func, 0, 1 );
2227       FETCH( func, *inst, 1, 1, CHAN_W );
2228       emit_add( func, 0, 1 );
2229       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2230          STORE( func, *inst, 0, 0, chan_index );
2231       }
2232       break;
2233
2234    case TGSI_OPCODE_COS:
2235       FETCH( func, *inst, 0, 0, CHAN_X );
2236       emit_cos( func, 0, 0 );
2237       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2238          STORE( func, *inst, 0, 0, chan_index );
2239       }
2240       break;
2241
2242    case TGSI_OPCODE_DDX:
2243       return 0;
2244       break;
2245
2246    case TGSI_OPCODE_DDY:
2247       return 0;
2248       break;
2249
2250    case TGSI_OPCODE_KILP:
2251       /* predicated kill */
2252       emit_kilp( func );
2253       return 0; /* XXX fix me */
2254       break;
2255
2256    case TGSI_OPCODE_KIL:
2257       /* conditional kill */
2258       emit_kil( func, &inst->FullSrcRegisters[0] );
2259       break;
2260
2261    case TGSI_OPCODE_PK2H:
2262       return 0;
2263       break;
2264
2265    case TGSI_OPCODE_PK2US:
2266       return 0;
2267       break;
2268
2269    case TGSI_OPCODE_PK4B:
2270       return 0;
2271       break;
2272
2273    case TGSI_OPCODE_PK4UB:
2274       return 0;
2275       break;
2276
2277    case TGSI_OPCODE_RFL:
2278       return 0;
2279       break;
2280
2281    case TGSI_OPCODE_SEQ:
2282       return 0;
2283       break;
2284
2285    case TGSI_OPCODE_SFL:
2286       return 0;
2287       break;
2288
2289    case TGSI_OPCODE_SGT:
2290       return 0;
2291       break;
2292
2293    case TGSI_OPCODE_SIN:
2294       FETCH( func, *inst, 0, 0, CHAN_X );
2295       emit_sin( func, 0, 0 );
2296       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2297          STORE( func, *inst, 0, 0, chan_index );
2298       }
2299       break;
2300
2301    case TGSI_OPCODE_SLE:
2302       return 0;
2303       break;
2304
2305    case TGSI_OPCODE_SNE:
2306       return 0;
2307       break;
2308
2309    case TGSI_OPCODE_STR:
2310       return 0;
2311       break;
2312
2313    case TGSI_OPCODE_TEX:
2314       emit_tex( func, inst, FALSE, FALSE );
2315       break;
2316
2317    case TGSI_OPCODE_TXD:
2318       return 0;
2319       break;
2320
2321    case TGSI_OPCODE_UP2H:
2322       return 0;
2323       break;
2324
2325    case TGSI_OPCODE_UP2US:
2326       return 0;
2327       break;
2328
2329    case TGSI_OPCODE_UP4B:
2330       return 0;
2331       break;
2332
2333    case TGSI_OPCODE_UP4UB:
2334       return 0;
2335       break;
2336
2337    case TGSI_OPCODE_X2D:
2338       return 0;
2339       break;
2340
2341    case TGSI_OPCODE_ARA:
2342       return 0;
2343       break;
2344
2345    case TGSI_OPCODE_ARR:
2346       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2347          FETCH( func, *inst, 0, 0, chan_index );
2348          emit_rnd( func, 0, 0 );
2349          emit_f2it( func, 0 );
2350          STORE( func, *inst, 0, 0, chan_index );
2351       }
2352       break;
2353
2354    case TGSI_OPCODE_BRA:
2355       return 0;
2356       break;
2357
2358    case TGSI_OPCODE_CAL:
2359       return 0;
2360       break;
2361
2362    case TGSI_OPCODE_RET:
2363       emit_ret( func );
2364       break;
2365
2366    case TGSI_OPCODE_END:
2367       break;
2368
2369    case TGSI_OPCODE_SSG:
2370    /* TGSI_OPCODE_SGN */
2371       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2372          FETCH( func, *inst, 0, 0, chan_index );
2373          emit_sgn( func, 0, 0 );
2374          STORE( func, *inst, 0, 0, chan_index );
2375       }
2376       break;
2377
2378    case TGSI_OPCODE_CMP:
2379       emit_cmp (func, inst);
2380       break;
2381
2382    case TGSI_OPCODE_SCS:
2383       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2384          FETCH( func, *inst, 0, 0, CHAN_X );
2385          emit_cos( func, 0, 0 );
2386          STORE( func, *inst, 0, 0, CHAN_X );
2387       }
2388       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2389          FETCH( func, *inst, 0, 0, CHAN_X );
2390          emit_sin( func, 0, 0 );
2391          STORE( func, *inst, 0, 0, CHAN_Y );
2392       }
2393       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2394          emit_tempf(
2395             func,
2396             0,
2397             TGSI_EXEC_TEMP_00000000_I,
2398             TGSI_EXEC_TEMP_00000000_C );
2399          STORE( func, *inst, 0, 0, CHAN_Z );
2400       }
2401       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2402          emit_tempf(
2403             func,
2404             0,
2405             TEMP_ONE_I,
2406             TEMP_ONE_C );
2407          STORE( func, *inst, 0, 0, CHAN_W );
2408       }
2409       break;
2410
2411    case TGSI_OPCODE_TXB:
2412       emit_tex( func, inst, TRUE, FALSE );
2413       break;
2414
2415    case TGSI_OPCODE_NRM:
2416       /* fall-through */
2417    case TGSI_OPCODE_NRM4:
2418       /* 3 or 4-component normalization */
2419       {
2420          uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2421
2422          if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
2423              IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2424              IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
2425              (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
2426
2427             /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2428
2429             /* xmm4 = src.x */
2430             /* xmm0 = src.x * src.x */
2431             FETCH(func, *inst, 0, 0, CHAN_X);
2432             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2433                emit_MOV(func, 4, 0);
2434             }
2435             emit_mul(func, 0, 0);
2436
2437             /* xmm5 = src.y */
2438             /* xmm0 = xmm0 + src.y * src.y */
2439             FETCH(func, *inst, 1, 0, CHAN_Y);
2440             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2441                emit_MOV(func, 5, 1);
2442             }
2443             emit_mul(func, 1, 1);
2444             emit_add(func, 0, 1);
2445
2446             /* xmm6 = src.z */
2447             /* xmm0 = xmm0 + src.z * src.z */
2448             FETCH(func, *inst, 1, 0, CHAN_Z);
2449             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2450                emit_MOV(func, 6, 1);
2451             }
2452             emit_mul(func, 1, 1);
2453             emit_add(func, 0, 1);
2454
2455             if (dims == 4) {
2456                /* xmm7 = src.w */
2457                /* xmm0 = xmm0 + src.w * src.w */
2458                FETCH(func, *inst, 1, 0, CHAN_W);
2459                if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
2460                   emit_MOV(func, 7, 1);
2461                }
2462                emit_mul(func, 1, 1);
2463                emit_add(func, 0, 1);
2464             }
2465
2466             /* xmm1 = 1 / sqrt(xmm0) */
2467             emit_rsqrt(func, 1, 0);
2468
2469             /* dst.x = xmm1 * src.x */
2470             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2471                emit_mul(func, 4, 1);
2472                STORE(func, *inst, 4, 0, CHAN_X);
2473             }
2474
2475             /* dst.y = xmm1 * src.y */
2476             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2477                emit_mul(func, 5, 1);
2478                STORE(func, *inst, 5, 0, CHAN_Y);
2479             }
2480
2481             /* dst.z = xmm1 * src.z */
2482             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2483                emit_mul(func, 6, 1);
2484                STORE(func, *inst, 6, 0, CHAN_Z);
2485             }
2486
2487             /* dst.w = xmm1 * src.w */
2488             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
2489                emit_mul(func, 7, 1);
2490                STORE(func, *inst, 7, 0, CHAN_W);
2491             }
2492          }
2493
2494          /* dst0.w = 1.0 */
2495          if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
2496             emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
2497             STORE(func, *inst, 0, 0, CHAN_W);
2498          }
2499       }
2500       break;
2501
2502    case TGSI_OPCODE_DIV:
2503       return 0;
2504       break;
2505
2506    case TGSI_OPCODE_DP2:
2507       FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
2508       FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
2509       emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
2510       FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
2511       FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
2512       emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
2513       emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
2514       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2515          STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
2516       }
2517       break;
2518
2519    case TGSI_OPCODE_TXL:
2520       emit_tex( func, inst, TRUE, FALSE );
2521       break;
2522
2523    case TGSI_OPCODE_TXP:
2524       emit_tex( func, inst, FALSE, TRUE );
2525       break;
2526
2527    case TGSI_OPCODE_BRK:
2528       return 0;
2529       break;
2530
2531    case TGSI_OPCODE_IF:
2532       return 0;
2533       break;
2534
2535    case TGSI_OPCODE_BGNFOR:
2536       return 0;
2537       break;
2538
2539    case TGSI_OPCODE_REP:
2540       return 0;
2541       break;
2542
2543    case TGSI_OPCODE_ELSE:
2544       return 0;
2545       break;
2546
2547    case TGSI_OPCODE_ENDIF:
2548       return 0;
2549       break;
2550
2551    case TGSI_OPCODE_ENDFOR:
2552       return 0;
2553       break;
2554
2555    case TGSI_OPCODE_ENDREP:
2556       return 0;
2557       break;
2558
2559    case TGSI_OPCODE_PUSHA:
2560       return 0;
2561       break;
2562
2563    case TGSI_OPCODE_POPA:
2564       return 0;
2565       break;
2566
2567    case TGSI_OPCODE_CEIL:
2568       return 0;
2569       break;
2570
2571    case TGSI_OPCODE_I2F:
2572       return 0;
2573       break;
2574
2575    case TGSI_OPCODE_NOT:
2576       return 0;
2577       break;
2578
2579    case TGSI_OPCODE_TRUNC:
2580       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2581          FETCH( func, *inst, 0, 0, chan_index );
2582          emit_f2it( func, 0 );
2583          emit_i2f( func, 0 );
2584          STORE( func, *inst, 0, 0, chan_index );
2585       }
2586       break;
2587
2588    case TGSI_OPCODE_SHL:
2589       return 0;
2590       break;
2591
2592    case TGSI_OPCODE_SHR:
2593       return 0;
2594       break;
2595
2596    case TGSI_OPCODE_AND:
2597       return 0;
2598       break;
2599
2600    case TGSI_OPCODE_OR:
2601       return 0;
2602       break;
2603
2604    case TGSI_OPCODE_MOD:
2605       return 0;
2606       break;
2607
2608    case TGSI_OPCODE_XOR:
2609       return 0;
2610       break;
2611
2612    case TGSI_OPCODE_SAD:
2613       return 0;
2614       break;
2615
2616    case TGSI_OPCODE_TXF:
2617       return 0;
2618       break;
2619
2620    case TGSI_OPCODE_TXQ:
2621       return 0;
2622       break;
2623
2624    case TGSI_OPCODE_CONT:
2625       return 0;
2626       break;
2627
2628    case TGSI_OPCODE_EMIT:
2629       return 0;
2630       break;
2631
2632    case TGSI_OPCODE_ENDPRIM:
2633       return 0;
2634       break;
2635
2636    default:
2637       return 0;
2638    }
2639
2640    return 1;
2641 }
2642
2643 static void
2644 emit_declaration(
2645    struct x86_function *func,
2646    struct tgsi_full_declaration *decl )
2647 {
2648    if( decl->Declaration.File == TGSI_FILE_INPUT ) {
2649       unsigned first, last, mask;
2650       unsigned i, j;
2651
2652       first = decl->DeclarationRange.First;
2653       last = decl->DeclarationRange.Last;
2654       mask = decl->Declaration.UsageMask;
2655
2656       for( i = first; i <= last; i++ ) {
2657          for( j = 0; j < NUM_CHANNELS; j++ ) {
2658             if( mask & (1 << j) ) {
2659                switch( decl->Declaration.Interpolate ) {
2660                case TGSI_INTERPOLATE_CONSTANT:
2661                   emit_coef_a0( func, 0, i, j );
2662                   emit_inputs( func, 0, i, j );
2663                   break;
2664
2665                case TGSI_INTERPOLATE_LINEAR:
2666                   emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2667                   emit_coef_dadx( func, 1, i, j );
2668                   emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2669                   emit_coef_dady( func, 3, i, j );
2670                   emit_mul( func, 0, 1 );    /* x * dadx */
2671                   emit_coef_a0( func, 4, i, j );
2672                   emit_mul( func, 2, 3 );    /* y * dady */
2673                   emit_add( func, 0, 4 );    /* x * dadx + a0 */
2674                   emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
2675                   emit_inputs( func, 0, i, j );
2676                   break;
2677
2678                case TGSI_INTERPOLATE_PERSPECTIVE:
2679                   emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2680                   emit_coef_dadx( func, 1, i, j );
2681                   emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2682                   emit_coef_dady( func, 3, i, j );
2683                   emit_mul( func, 0, 1 );    /* x * dadx */
2684                   emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
2685                   emit_coef_a0( func, 5, i, j );
2686                   emit_rcp( func, 4, 4 );    /* 1.0 / w */
2687                   emit_mul( func, 2, 3 );    /* y * dady */
2688                   emit_add( func, 0, 5 );    /* x * dadx + a0 */
2689                   emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
2690                   emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
2691                   emit_inputs( func, 0, i, j );
2692                   break;
2693
2694                default:
2695                   assert( 0 );
2696                   break;
2697                }
2698             }
2699          }
2700       }
2701    }
2702 }
2703
2704 static void aos_to_soa( struct x86_function *func,
2705                         uint arg_aos,
2706                         uint arg_machine,
2707                         uint arg_num,
2708                         uint arg_stride )
2709 {
2710    struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2711    struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2712    struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2713    struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2714    int inner_loop;
2715
2716
2717    /* Save EBX */
2718    x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2719
2720    x86_mov( func, aos_input,  x86_fn_arg( func, arg_aos ) );
2721    x86_mov( func, soa_input,  x86_fn_arg( func, arg_machine ) );
2722    x86_lea( func, soa_input,
2723             x86_make_disp( soa_input,
2724                            Offset(struct tgsi_exec_machine, Inputs) ) );
2725    x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2726    x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) );
2727
2728    /* do */
2729    inner_loop = x86_get_label( func );
2730    {
2731       x86_push( func, aos_input );
2732       sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2733       sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2734       x86_add( func, aos_input, stride );
2735       sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2736       sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2737       x86_add( func, aos_input, stride );
2738       sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2739       sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2740       x86_add( func, aos_input, stride );
2741       sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2742       sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2743       x86_pop( func, aos_input );
2744
2745       sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2746       sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2747       sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2748       sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2749       sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2750       sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2751
2752       sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2753       sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2754       sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2755       sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2756
2757       /* Advance to next input */
2758       x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2759       x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2760    }
2761    /* while --num_inputs */
2762    x86_dec( func, num_inputs );
2763    x86_jcc( func, cc_NE, inner_loop );
2764
2765    /* Restore EBX */
2766    x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2767 }
2768
2769 static void soa_to_aos( struct x86_function *func,
2770                         uint arg_aos,
2771                         uint arg_machine,
2772                         uint arg_num,
2773                         uint arg_stride )
2774 {
2775    struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
2776    struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
2777    struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
2778    struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
2779    int inner_loop;
2780
2781    /* Save EBX */
2782    x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2783
2784    x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
2785    x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
2786    x86_lea( func, soa_output,
2787             x86_make_disp( soa_output,
2788                            Offset(struct tgsi_exec_machine, Outputs) ) );
2789    x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
2790
2791    /* do */
2792    inner_loop = x86_get_label( func );
2793    {
2794       sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2795       sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2796       sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2797       sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2798
2799       sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2800       sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2801       sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2802       sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2803       sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2804       sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2805
2806       x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
2807       x86_push( func, aos_output );
2808       sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2809       sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2810       x86_add( func, aos_output, temp );
2811       sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2812       sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2813       x86_add( func, aos_output, temp );
2814       sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2815       sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2816       x86_add( func, aos_output, temp );
2817       sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2818       sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2819       x86_pop( func, aos_output );
2820
2821       /* Advance to next output */
2822       x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2823       x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2824    }
2825    /* while --num_outputs */
2826    x86_dec( func, num_outputs );
2827    x86_jcc( func, cc_NE, inner_loop );
2828
2829    /* Restore EBX */
2830    x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2831 }
2832
2833 /**
2834  * Translate a TGSI vertex/fragment shader to SSE2 code.
2835  * Slightly different things are done for vertex vs. fragment shaders.
2836  *
2837  * \param tokens  the TGSI input shader
2838  * \param func  the output SSE code/function
2839  * \param immediates  buffer to place immediates, later passed to SSE func
2840  * \param return  1 for success, 0 if translation failed
2841  */
2842 unsigned
2843 tgsi_emit_sse2(
2844    const struct tgsi_token *tokens,
2845    struct x86_function *func,
2846    float (*immediates)[4],
2847    boolean do_swizzles )
2848 {
2849    struct tgsi_parse_context parse;
2850    unsigned ok = 1;
2851    uint num_immediates = 0;
2852
2853    util_init_math();
2854
2855    func->csr = func->store;
2856
2857    tgsi_parse_init( &parse, tokens );
2858
2859    /* Can't just use EDI, EBX without save/restoring them:
2860     */
2861    x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2862    x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
2863
2864    /*
2865     * Different function args for vertex/fragment shaders:
2866     */
2867    if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2868       if (do_swizzles)
2869          aos_to_soa( func,
2870                      4,         /* aos_input */
2871                      1,         /* machine */
2872                      5,         /* num_inputs */
2873                      6 );       /* input_stride */
2874    }
2875
2876    x86_mov(
2877       func,
2878       get_machine_base(),
2879       x86_fn_arg( func, 1 ) );
2880    x86_mov(
2881       func,
2882       get_const_base(),
2883       x86_fn_arg( func, 2 ) );
2884    x86_mov(
2885       func,
2886       get_immediate_base(),
2887       x86_fn_arg( func, 3 ) );
2888
2889    if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2890       x86_mov(
2891          func,
2892          get_coef_base(),
2893          x86_fn_arg( func, 4 ) );
2894    }
2895
2896    x86_mov(
2897       func,
2898       get_sampler_base(),
2899       x86_make_disp( get_machine_base(),
2900                      Offset( struct tgsi_exec_machine, Samplers ) ) );
2901
2902
2903    while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
2904       tgsi_parse_token( &parse );
2905
2906       switch( parse.FullToken.Token.Type ) {
2907       case TGSI_TOKEN_TYPE_DECLARATION:
2908          if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2909             emit_declaration(
2910                func,
2911                &parse.FullToken.FullDeclaration );
2912          }
2913          break;
2914
2915       case TGSI_TOKEN_TYPE_INSTRUCTION:
2916          ok = emit_instruction(
2917             func,
2918             &parse.FullToken.FullInstruction );
2919
2920          if (!ok) {
2921             uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
2922             debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n",
2923                          opcode,
2924                          tgsi_get_opcode_name(opcode),
2925                          parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
2926                          "vertex shader" : "fragment shader");
2927          }
2928          break;
2929
2930       case TGSI_TOKEN_TYPE_IMMEDIATE:
2931          /* simply copy the immediate values into the next immediates[] slot */
2932          {
2933             const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
2934             uint i;
2935             assert(size <= 4);
2936             assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
2937             for( i = 0; i < size; i++ ) {
2938                immediates[num_immediates][i] =
2939                   parse.FullToken.FullImmediate.u[i].Float;
2940             }
2941 #if 0
2942             debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2943                    num_immediates,
2944                    immediates[num_immediates][0],
2945                    immediates[num_immediates][1],
2946                    immediates[num_immediates][2],
2947                    immediates[num_immediates][3]);
2948 #endif
2949             num_immediates++;
2950          }
2951          break;
2952
2953       default:
2954          ok = 0;
2955          assert( 0 );
2956       }
2957    }
2958
2959    if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2960       if (do_swizzles)
2961          soa_to_aos( func,
2962                      7,         /* aos_output */
2963                      1,         /* machine */
2964                      8,         /* num_outputs */
2965                      9 );       /* output_stride */
2966    }
2967
2968    /* Can't just use EBX, EDI without save/restoring them:
2969     */
2970    x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
2971    x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2972
2973    emit_ret( func );
2974
2975    tgsi_parse_free( &parse );
2976
2977    return ok;
2978 }
2979
2980 #endif /* PIPE_ARCH_X86 */
2981