src/gallium/auxiliary/translate/translate_sse.c

   1 /*
   2  * Copyright 2003 Tungsten Graphics, inc.
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * on the rights to use, copy, modify, merge, publish, distribute, sub
   9  * license, and/or sell copies of the Software, and to permit persons to whom
  10  * the Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
  19  * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  *
  24  * Authors:
  25  *    Keith Whitwell <keithw@tungstengraphics.com>
  26  */
  27
  28
  29 #include "pipe/p_config.h"
  30 #include "pipe/p_compiler.h"
  31 #include "util/u_memory.h"
  32 #include "util/u_math.h"
  33 #include "util/u_format.h"
  34
  35 #include "translate.h"
  36
  37
  38 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
  39
  40 #include "rtasm/rtasm_cpu.h"
  41 #include "rtasm/rtasm_x86sse.h"
  42
  43
/* Vector component indices (x, y, z, w).  In this file they are passed to
 * SHUF() to build shuffle selectors, e.g. SHUF(X, Y, Z, W).
 */
#define X    0
#define Y    1
#define Z    2
#define W    3
  48
  49
/* Description of one source vertex buffer.
 *
 * NOTE(review): the code that fills in and consumes these fields is not
 * part of this portion of the file; the per-field notes are inferred from
 * the field names and should be confirmed against the users.
 */
struct translate_buffer {
   const void *base_ptr;   /* presumably the start of the buffer data -- confirm */
   uintptr_t stride;       /* presumably the byte distance between consecutive vertices -- confirm */
   unsigned max_index;     /* presumably the highest valid vertex index -- confirm */
};
  55
/* One (buffer, instance divisor) combination.  Multiple buffer variants
 * can map to a single buffer (see struct translate_sse).
 *
 * NOTE(review): buffer_index presumably indexes translate_sse::buffer[];
 * confirm against the code that sets it up.
 */
struct translate_buffer_variant {
   unsigned buffer_index;
   unsigned instance_divisor;
   void *ptr;                    /* updated either per vertex or per instance */
};
  61
  62
/* NOTE(review): not referenced in this portion of the file.  Presumably a
 * sentinel buffer index marking an element that is sourced from the
 * instance ID rather than from a vertex buffer -- confirm against users.
 */
#define ELEMENT_BUFFER_INSTANCE_ID  1001
  64
/* Number of entries in the CONST_* enum below.  It sizes the constant
 * tables (the file-scope consts[] and translate_sse::consts) and
 * translate_sse::const_to_reg, so it must be kept in sync with the enum.
 */
#define NUM_CONSTS 7

/* Identifiers of the 4-float constant vectors; each value is a row index
 * into the constant tables and the id accepted by get_const().
 */
enum
{
   CONST_IDENTITY,          /* {0, 0, 0, 1} */
   CONST_INV_127,           /* 1/127 in all four lanes */
   CONST_INV_255,           /* 1/255 in all four lanes */
   CONST_INV_32767,         /* 1/32767 in all four lanes */
   CONST_INV_65535,         /* 1/65535 in all four lanes */
   CONST_INV_2147483647,    /* 1/2147483647 in all four lanes */
   CONST_255                /* 255 in all four lanes */
};
  77
/* Values of the constant vectors, one row per CONST_* id (row order must
 * match the enum).  C(v) expands to a row with v, converted to float,
 * replicated into all four lanes.
 *
 * NOTE(review): how this table reaches translate_sse::consts is not
 * visible in this portion of the file -- presumably it is copied when a
 * translate_sse is created; confirm.
 */
#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
static float consts[NUM_CONSTS][4] = {
      {0, 0, 0, 1},            /* CONST_IDENTITY */
      C(1.0 / 127.0),          /* CONST_INV_127 */
      C(1.0 / 255.0),          /* CONST_INV_255 */
      C(1.0 / 32767.0),        /* CONST_INV_32767 */
      C(1.0 / 65535.0),        /* CONST_INV_65535 */
      C(1.0 / 2147483647.0),   /* CONST_INV_2147483647 */
      C(255.0)                 /* CONST_255 */
};
#undef C
  89
/* State of one SSE translate object: the generated functions, the
 * constant vectors and their register cache, the buffer bookkeeping and
 * the register assignments shared by the emit_* helpers in this file.
 */
struct translate_sse {
   struct translate translate;

   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function elt16_func;
   struct x86_function elt8_func;
   struct x86_function *func;   /* function the emit_* helpers append instructions to */

   /* Constant vectors, indexed by CONST_*.  get_const() loads them with
    * movaps through a displacement relative to machine_EDI, hence the
    * 16-byte alignment.
    */
   PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
   /* Register cache maintained by get_const():
    *  reg_to_const[r]  - CONST_* id currently held in XMM register r,
    *                     or negative if r holds no constant.
    *  const_to_reg[id] - XMM register currently holding constant id,
    *                     or negative if it is not loaded.
    */
   int8_t reg_to_const[16];
   int8_t const_to_reg[NUM_CONSTS];

   struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer variants can map to a single buffer. */
   struct translate_buffer_variant buffer_variant[PIPE_MAX_ATTRIBS];
   unsigned nr_buffer_variants;

   /* Multiple elements can map to a single buffer variant. */
   unsigned element_to_buffer_variant[PIPE_MAX_ATTRIBS];

   boolean use_instancing;
   unsigned instance_id;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;      /* scratch GPR used by the load/copy helpers */
   struct x86_reg tmp2_EDX;     /* second scratch GPR (see emit_memcpy) */
   struct x86_reg src_ECX;
   struct x86_reg idx_ESI;     /* either start+i or &elt[i] */
   struct x86_reg machine_EDI;  /* get_const() addresses consts[] relative to this register */
   struct x86_reg outbuf_EBX;
   struct x86_reg count_EBP;    /* decrements to zero */
};
 127
 128 static int get_offset( const void *a, const void *b )
 129 {
 130    return (const char *)b - (const char *)a;
 131 }
 132
 133 static struct x86_reg get_const( struct translate_sse *p, unsigned id)
 134 {
 135    struct x86_reg reg;
 136    unsigned i;
 137
 138    if(p->const_to_reg[id] >= 0)
 139       return x86_make_reg(file_XMM, p->const_to_reg[id]);
 140
 141    for(i = 2; i < 8; ++i)
 142    {
 143       if(p->reg_to_const[i] < 0)
 144          break;
 145    }
 146
 147    /* TODO: be smarter here */
 148    if(i == 8)
 149       --i;
 150
 151    reg = x86_make_reg(file_XMM, i);
 152
 153    if(p->reg_to_const[i] >= 0)
 154       p->const_to_reg[p->reg_to_const[i]] = -1;
 155
 156    p->reg_to_const[i] = id;
 157    p->const_to_reg[id] = i;
 158
 159    /* TODO: this should happen outside the loop, if possible */
 160    sse_movaps(p->func, reg,
 161          x86_make_disp(p->machine_EDI,
 162                get_offset(p, &p->consts[id][0])));
 163
 164    return reg;
 165 }
 166
 167 /* load the data in a SSE2 register, padding with zeros */
 168 static boolean emit_load_sse2( struct translate_sse *p,
 169                                        struct x86_reg data,
 170                                        struct x86_reg src,
 171                                        unsigned size)
 172 {
 173    struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
 174    struct x86_reg tmp = p->tmp_EAX;
 175    switch(size)
 176    {
 177    case 1:
 178       x86_movzx8(p->func, tmp, src);
 179       sse2_movd(p->func, data, tmp);
 180       break;
 181    case 2:
 182       x86_movzx16(p->func, tmp, src);
 183       sse2_movd(p->func, data, tmp);
 184       break;
 185    case 3:
 186       x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
 187       x86_shl_imm(p->func, tmp, 16);
 188       x86_mov16(p->func, tmp, src);
 189       sse2_movd(p->func, data, tmp);
 190       break;
 191    case 4:
 192       sse2_movd(p->func, data, src);
 193       break;
 194    case 6:
 195       sse2_movd(p->func, data, src);
 196       x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
 197       sse2_movd(p->func, tmpXMM, tmp);
 198       sse2_punpckldq(p->func, data, tmpXMM);
 199       break;
 200    case 8:
 201       sse2_movq(p->func, data, src);
 202       break;
 203    case 12:
 204       sse2_movq(p->func, data, src);
 205       sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
 206       sse2_punpcklqdq(p->func, data, tmpXMM);
 207       break;
 208    case 16:
 209       sse2_movdqu(p->func, data, src);
 210       break;
 211    default:
 212       return FALSE;
 213    }
 214    return TRUE;
 215 }
 216
/* This value can be passed for the out_chans argument of the
 * emit_load_float* helpers to request padding with (0, 0, 0, 1).
 *
 * It is deliberately larger than 4: the helpers also compare out_chans
 * with ">" against channel counts, and those comparisons must treat
 * CHANNELS_0001 as "all four channels are needed".
 */
#define CHANNELS_0001 5
 219
 220 /* this function will load #chans float values, and will
 221  * pad the register with zeroes at least up to out_chans.
 222  *
 223  * If out_chans is set to CHANNELS_0001, then the fourth
 224  * value will be padded with 1. Only pass this value if
 225  * chans < 4 or results are undefined.
 226  */
 227 static void emit_load_float32( struct translate_sse *p,
 228                                        struct x86_reg data,
 229                                        struct x86_reg arg0,
 230                                        unsigned out_chans,
 231                                        unsigned chans)
 232 {
 233    switch(chans)
 234    {
 235    case 1:
 236       /* a 0 0 0
 237        * a 0 0 1
 238        */
 239       sse_movss(p->func, data, arg0);
 240       if(out_chans == CHANNELS_0001)
 241          sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
 242       break;
 243    case 2:
 244       /* 0 0 0 1
 245        * a b 0 1
 246        */
 247       if(out_chans == CHANNELS_0001)
 248          sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
 249       else if(out_chans > 2)
 250          sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
 251       sse_movlps(p->func, data, arg0);
 252       break;
 253    case 3:
 254       /* Have to jump through some hoops:
 255        *
 256        * c 0 0 0
 257        * c 0 0 1 if out_chans == CHANNELS_0001
 258        * 0 0 c 0/1
 259        * a b c 0/1
 260        */
 261       sse_movss(p->func, data, x86_make_disp(arg0, 8));
 262       if(out_chans == CHANNELS_0001)
 263          sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X,Y,Z,W) );
 264       sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
 265       sse_movlps(p->func, data, arg0);
 266       break;
 267    case 4:
 268       sse_movups(p->func, data, arg0);
 269       break;
 270    }
 271 }
 272
 273 /* this function behaves like emit_load_float32, but loads
 274    64-bit floating point numbers, converting them to 32-bit
 275   ones */
 276 static void emit_load_float64to32( struct translate_sse *p,
 277                                        struct x86_reg data,
 278                                        struct x86_reg arg0,
 279                                        unsigned out_chans,
 280                                        unsigned chans)
 281 {
 282    struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
 283    switch(chans)
 284    {
 285    case 1:
 286       sse2_movsd(p->func, data, arg0);
 287       if(out_chans > 1)
 288          sse2_cvtpd2ps(p->func, data, data);
 289       else
 290          sse2_cvtsd2ss(p->func, data, data);
 291       if(out_chans == CHANNELS_0001)
 292          sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W)  );
 293       break;
 294    case 2:
 295       sse2_movupd(p->func, data, arg0);
 296       sse2_cvtpd2ps(p->func, data, data);
 297       if(out_chans == CHANNELS_0001)
 298          sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
 299       else if(out_chans > 2)
 300          sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
 301        break;
 302    case 3:
 303       sse2_movupd(p->func, data, arg0);
 304       sse2_cvtpd2ps(p->func, data, data);
 305       sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
 306       if(out_chans > 3)
 307          sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
 308       else
 309          sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
 310       sse_movlhps(p->func, data, tmpXMM);
 311       if(out_chans == CHANNELS_0001)
 312          sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
 313       break;
 314    case 4:
 315       sse2_movupd(p->func, data, arg0);
 316       sse2_cvtpd2ps(p->func, data, data);
 317       sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
 318       sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
 319       sse_movlhps(p->func, data, tmpXMM);
 320       break;
 321    }
 322 }
 323
 324 static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr,  struct x86_reg src_xmm)
 325 {
 326    if(x86_target(p->func) != X86_32)
 327       x64_mov64(p->func, dst_gpr, src_gpr);
 328    else
 329    {
 330       /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
 331       if(x86_target_caps(p->func) & X86_SSE2)
 332          sse2_movq(p->func, dst_xmm, src_xmm);
 333       else
 334          sse_movlps(p->func, dst_xmm, src_xmm);
 335    }
 336 }
 337
 338 static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src)
 339 {
 340    emit_mov64(p, dst_gpr, dst_xmm, src, src);
 341 }
 342
 343 static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm)
 344 {
 345    emit_mov64(p, dst, dst, src_gpr, src_xmm);
 346 }
 347
 348 static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
 349 {
 350    if(x86_target_caps(p->func) & X86_SSE2)
 351       sse2_movdqu(p->func, dst, src);
 352    else
 353       sse_movups(p->func, dst, src);
 354 }
 355
 356 /* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
 357  * but may or may not be good on older processors
 358  * TODO: may perhaps want to use non-temporal stores here if possible
 359  */
 360 static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size)
 361 {
 362    struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
 363    struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
 364    struct x86_reg dataGPR = p->tmp_EAX;
 365    struct x86_reg dataGPR2 = p->tmp2_EDX;
 366
 367    if(size < 8)
 368    {
 369       switch (size)
 370       {
 371       case 1:
 372          x86_mov8(p->func, dataGPR, src);
 373          x86_mov8(p->func, dst, dataGPR);
 374          break;
 375       case 2:
 376          x86_mov16(p->func, dataGPR, src);
 377          x86_mov16(p->func, dst, dataGPR);
 378          break;
 379       case 3:
 380          x86_mov16(p->func, dataGPR, src);
 381          x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
 382          x86_mov16(p->func, dst, dataGPR);
 383          x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
 384          break;
 385       case 4:
 386          x86_mov(p->func, dataGPR, src);
 387          x86_mov(p->func, dst, dataGPR);
 388          break;
 389       case 6:
 390          x86_mov(p->func, dataGPR, src);
 391          x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
 392          x86_mov(p->func, dst, dataGPR);
 393          x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
 394          break;
 395       }
 396    }
 397    else if(!(x86_target_caps(p->func) & X86_SSE))
 398    {
 399       unsigned i = 0;
 400       assert((size & 3) == 0);
 401       for(i = 0; i < size; i += 4)
 402       {
 403          x86_mov(p->func, dataGPR, x86_make_disp(src, i));
 404          x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
 405       }
 406    }
 407    else
 408    {
 409       switch(size)
 410       {
 411       case 8:
 412          emit_load64(p, dataGPR, dataXMM, src);
 413          emit_store64(p, dst, dataGPR, dataXMM);
 414          break;
 415       case 12:
 416          emit_load64(p, dataGPR2, dataXMM, src);
 417          x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
 418          emit_store64(p, dst, dataGPR2, dataXMM);
 419          x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
 420          break;
 421       case 16:
 422          emit_mov128(p, dataXMM, src);
 423          emit_mov128(p, dst, dataXMM);
 424          break;
 425       case 24:
 426          emit_mov128(p, dataXMM, src);
 427          emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
 428          emit_mov128(p, dst, dataXMM);
 429          emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
 430          break;
 431       case 32:
 432          emit_mov128(p, dataXMM, src);
 433          emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
 434          emit_mov128(p, dst, dataXMM);
 435          emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
 436          break;
 437       default:
 438          assert(0);
 439       }
 440    }
 441 }
 442
 443 static boolean translate_attr_convert( struct translate_sse *p,
 444                                const struct translate_element *a,
 445                                struct x86_reg src,
 446                                struct x86_reg dst)
 447
 448 {
 449    const struct util_format_description* input_desc = util_format_description(a->input_format);
 450    const struct util_format_description* output_desc = util_format_description(a->output_format);
 451    unsigned i;
 452    boolean id_swizzle = TRUE;
 453    unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE};
 454    unsigned needed_chans = 0;
 455    unsigned imms[2] = {0, 0x3f800000};
 456
 457    if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE)
 458       return FALSE;
 459
 460    if(input_desc->channel[0].size & 7)
 461       return FALSE;
 462
 463    if(input_desc->colorspace != output_desc->colorspace)
 464       return FALSE;
 465
 466    for(i = 1; i < input_desc->nr_channels; ++i)
 467    {
 468       if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0])))
 469          return FALSE;
 470    }
 471
 472    for(i = 1; i < output_desc->nr_channels; ++i)
 473    {
 474       if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0])))
 475          return FALSE;
 476    }
 477
 478    for(i = 0; i < output_desc->nr_channels; ++i)
 479    {
 480       if(output_desc->swizzle[i] < 4)
 481          swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
 482    }
 483
 484    if((x86_target_caps(p->func) & X86_SSE) && (0
 485          || a->output_format == PIPE_FORMAT_R32_FLOAT
 486          || a->output_format == PIPE_FORMAT_R32G32_FLOAT
 487          || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
 488          || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT))
 489    {
 490       struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
 491
 492       for(i = 0; i < output_desc->nr_channels; ++i)
 493       {
 494          if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
 495             swizzle[i] = i;
 496       }
 497
 498       for(i = 0; i < output_desc->nr_channels; ++i)
 499       {
 500          if(swizzle[i] < 4)
 501             needed_chans = MAX2(needed_chans, swizzle[i] + 1);
 502          if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
 503             id_swizzle = FALSE;
 504       }
 505
 506       if(needed_chans > 0)
 507       {
 508          switch(input_desc->channel[0].type)
 509          {
 510          case UTIL_FORMAT_TYPE_UNSIGNED:
 511             if(!(x86_target_caps(p->func) & X86_SSE2))
 512                return FALSE;
 513             emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
 514
 515             /* TODO: add support for SSE4.1 pmovzx */
 516             switch(input_desc->channel[0].size)
 517             {
 518             case 8:
 519                /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */
 520                sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
 521                sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
 522                break;
 523             case 16:
 524                sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
 525                break;
 526             case 32: /* we lose precision here */
 527                sse2_psrld_imm(p->func, dataXMM, 1);
 528                break;
 529             default:
 530                return FALSE;
 531             }
 532             sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
 533             if(input_desc->channel[0].normalized)
 534             {
 535                struct x86_reg factor;
 536                switch(input_desc->channel[0].size)
 537                {
 538                case 8:
 539                   factor = get_const(p, CONST_INV_255);
 540                   break;
 541                case 16:
 542                   factor = get_const(p, CONST_INV_65535);
 543                   break;
 544                case 32:
 545                   factor = get_const(p, CONST_INV_2147483647);
 546                   break;
 547                default:
 548                   assert(0);
 549                   factor.disp = 0;
 550                   factor.file = 0;
 551                   factor.idx = 0;
 552                   factor.mod = 0;
 553                   break;
 554                }
 555                sse_mulps(p->func, dataXMM, factor);
 556             }
 557             else if(input_desc->channel[0].size == 32)
 558                sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */
 559             break;
 560          case UTIL_FORMAT_TYPE_SIGNED:
 561             if(!(x86_target_caps(p->func) & X86_SSE2))
 562                return FALSE;
 563             emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
 564
 565             /* TODO: add support for SSE4.1 pmovsx */
 566             switch(input_desc->channel[0].size)
 567             {
 568             case 8:
 569                sse2_punpcklbw(p->func, dataXMM, dataXMM);
 570                sse2_punpcklbw(p->func, dataXMM, dataXMM);
 571                sse2_psrad_imm(p->func, dataXMM, 24);
 572                break;
 573             case 16:
 574                sse2_punpcklwd(p->func, dataXMM, dataXMM);
 575                sse2_psrad_imm(p->func, dataXMM, 16);
 576                break;
 577             case 32: /* we lose precision here */
 578                break;
 579             default:
 580                return FALSE;
 581             }
 582             sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
 583             if(input_desc->channel[0].normalized)
 584             {
 585                struct x86_reg factor;
 586                switch(input_desc->channel[0].size)
 587                {
 588                case 8:
 589                   factor = get_const(p, CONST_INV_127);
 590                   break;
 591                case 16:
 592                   factor = get_const(p, CONST_INV_32767);
 593                   break;
 594                case 32:
 595                   factor = get_const(p, CONST_INV_2147483647);
 596                   break;
 597                default:
 598                   assert(0);
 599                   factor.disp = 0;
 600                   factor.file = 0;
 601                   factor.idx = 0;
 602                   factor.mod = 0;
 603                   break;
 604                }
 605                sse_mulps(p->func, dataXMM, factor);
 606             }
 607             break;
 608
 609             break;
 610          case UTIL_FORMAT_TYPE_FLOAT:
 611             if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64)
 612                return FALSE;
 613             if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3)
 614             {
 615                swizzle[3] = UTIL_FORMAT_SWIZZLE_W;
 616                needed_chans = CHANNELS_0001;
 617             }
 618             switch(input_desc->channel[0].size)
 619             {
 620             case 32:
 621                emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
 622                break;
 623             case 64: /* we lose precision here */
 624                if(!(x86_target_caps(p->func) & X86_SSE2))
 625                   return FALSE;
 626                emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
 627                break;
 628             default:
 629                return FALSE;
 630             }
 631             break;
 632          default:
 633             return FALSE;
 634          }
 635
 636          if(!id_swizzle)
 637             sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) );
 638       }
 639
 640       if(output_desc->nr_channels >= 4
 641             && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
 642             && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
 643             && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
 644             && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
 645             )
 646          sse_movups(p->func, dst, dataXMM);
 647       else
 648       {
 649          if(output_desc->nr_channels >= 2
 650                && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
 651                && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
 652             sse_movlps(p->func, dst, dataXMM);
 653          else
 654          {
 655             if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
 656                sse_movss(p->func, dst, dataXMM);
 657             else
 658                x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
 659
 660             if(output_desc->nr_channels >= 2)
 661             {
 662                if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
 663                {
 664                   sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
 665                   sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
 666                }
 667                else
 668                   x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
 669             }
 670          }
 671
 672          if(output_desc->nr_channels >= 3)
 673          {
 674             if(output_desc->nr_channels >= 4
 675                   && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
 676                   && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
 677                sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
 678             else
 679             {
 680                if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
 681                {
 682                   sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
 683                   sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
 684                }
 685                else
 686                   x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
 687
 688                if(output_desc->nr_channels >= 4)
 689                {
 690                   if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
 691                   {
 692                      sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
 693                      sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
 694                   }
 695                   else
 696                      x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
 697                }
 698             }
 699          }
 700       }
 701       return TRUE;
 702    }
 703    else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16
 704          && output_desc->channel[0].normalized == input_desc->channel[0].normalized
 705          && (0
 706                || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
 707                || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
 708                || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
 709                ))
 710    {
 711       struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
 712       struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
 713       struct x86_reg tmp = p->tmp_EAX;
 714       unsigned imms[2] = {0, 1};
 715
 716       for(i = 0; i < output_desc->nr_channels; ++i)
 717       {
 718          if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
 719             swizzle[i] = i;
 720       }
 721
 722       for(i = 0; i < output_desc->nr_channels; ++i)
 723       {
 724          if(swizzle[i] < 4)
 725             needed_chans = MAX2(needed_chans, swizzle[i] + 1);
 726          if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
 727             id_swizzle = FALSE;
 728       }
 729
 730       if(needed_chans > 0)
 731       {
 732          emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
 733
 734          switch(input_desc->channel[0].type)
 735          {
 736          case UTIL_FORMAT_TYPE_UNSIGNED:
 737             if(input_desc->channel[0].normalized)
 738             {
 739                sse2_punpcklbw(p->func, dataXMM, dataXMM);
 740                if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
 741                        sse2_psrlw_imm(p->func, dataXMM, 1);
 742             }
 743             else
 744                sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
 745             break;
 746          case UTIL_FORMAT_TYPE_SIGNED:
 747             if(input_desc->channel[0].normalized)
 748             {
 749                sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
 750                sse2_punpcklbw(p->func, tmpXMM, dataXMM);
 751                sse2_psllw_imm(p->func, dataXMM, 9);
 752                sse2_psrlw_imm(p->func, dataXMM, 8);
 753                sse2_por(p->func, tmpXMM, dataXMM);
 754                sse2_psrlw_imm(p->func, dataXMM, 7);
 755                sse2_por(p->func, tmpXMM, dataXMM);
 756                {
 757                   struct x86_reg t = dataXMM;
 758                   dataXMM = tmpXMM;
 759                   tmpXMM = t;
 760                }
 761             }
 762             else
 763             {
 764                sse2_punpcklbw(p->func, dataXMM, dataXMM);
 765                sse2_psraw_imm(p->func, dataXMM, 8);
 766             }
 767             break;
 768          default:
 769             assert(0);
 770          }
 771
 772          if(output_desc->channel[0].normalized)
 773             imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff;
 774
 775          if(!id_swizzle)
 776             sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
 777       }
 778
 779       if(output_desc->nr_channels >= 4
 780             && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
 781             && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
 782             && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
 783             && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
 784             )
 785          sse2_movq(p->func, dst, dataXMM);
 786       else
 787       {
 788          if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
 789          {
 790             if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
 791                sse2_movd(p->func, dst, dataXMM);
 792             else
 793             {
 794                sse2_movd(p->func, tmp, dataXMM);
 795                x86_mov16(p->func, dst, tmp);
 796                if(output_desc->nr_channels >= 2)
 797                   x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
 798             }
 799          }
 800          else
 801          {
 802             if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0)
 803                x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
 804             else
 805             {
 806                x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
 807                if(output_desc->nr_channels >= 2)
 808                {
 809                   sse2_movd(p->func, tmp, dataXMM);
 810                   x86_shr_imm(p->func, tmp, 16);
 811                   x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
 812                }
 813             }
 814          }
 815
 816          if(output_desc->nr_channels >= 3)
 817          {
 818             if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
 819             {
 820                if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
 821                {
 822                   sse2_psrlq_imm(p->func, dataXMM, 32);
 823                   sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
 824                }
 825                else
 826                {
 827                   sse2_psrlq_imm(p->func, dataXMM, 32);
 828                   sse2_movd(p->func, tmp, dataXMM);
 829                   x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
 830                   if(output_desc->nr_channels >= 4)
 831                   {
 832                      x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
 833                   }
 834                }
 835             }
 836             else
 837             {
 838                if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0)
 839                   x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
 840                else
 841                {
 842                   x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
 843
 844                   if(output_desc->nr_channels >= 4)
 845                   {
 846                      sse2_psrlq_imm(p->func, dataXMM, 48);
 847                      sse2_movd(p->func, tmp, dataXMM);
 848                      x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
 849                   }
 850                }
 851             }
 852          }
 853       }
 854       return TRUE;
 855    }
 856    else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0])))
 857    {
 858       struct x86_reg tmp = p->tmp_EAX;
 859       unsigned i;
 860       if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4
 861                      && swizzle[0] == UTIL_FORMAT_SWIZZLE_W
 862                      && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z
 863                      && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y
 864                      && swizzle[3] == UTIL_FORMAT_SWIZZLE_X)
 865       {
 866          /* TODO: support movbe */
 867          x86_mov(p->func, tmp, src);
 868          x86_bswap(p->func, tmp);
 869          x86_mov(p->func, dst, tmp);
 870          return TRUE;
 871       }
 872
 873       for(i = 0; i < output_desc->nr_channels; ++i)
 874       {
 875          switch(output_desc->channel[0].size)
 876          {
 877          case 8:
 878             if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
 879             {
 880                unsigned v = 0;
 881                if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
 882                {
 883                   switch(output_desc->channel[0].type)
 884                   {
 885                   case UTIL_FORMAT_TYPE_UNSIGNED:
 886                      v = output_desc->channel[0].normalized ? 0xff : 1;
 887                      break;
 888                   case UTIL_FORMAT_TYPE_SIGNED:
 889                      v = output_desc->channel[0].normalized ? 0x7f : 1;
 890                      break;
 891                   default:
 892                      return FALSE;
 893                   }
 894                }
 895                x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
 896             }
 897             else
 898             {
 899                x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
 900                x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
 901             }
 902             break;
 903          case 16:
 904             if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
 905             {
 906                unsigned v = 0;
 907                if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
 908                {
 909                   switch(output_desc->channel[1].type)
 910                   {
 911                   case UTIL_FORMAT_TYPE_UNSIGNED:
 912                      v = output_desc->channel[1].normalized ? 0xffff : 1;
 913                      break;
 914                   case UTIL_FORMAT_TYPE_SIGNED:
 915                      v = output_desc->channel[1].normalized ? 0x7fff : 1;
 916                      break;
 917                   case UTIL_FORMAT_TYPE_FLOAT:
 918                      v = 0x3c00;
 919                      break;
 920                   default:
 921                      return FALSE;
 922                   }
 923                }
 924                x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
 925             }
 926             else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0)
 927                x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
 928             else
 929             {
 930                x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
 931                x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
 932             }
 933             break;
 934          case 32:
 935             if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
 936             {
 937                unsigned v = 0;
 938                if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
 939                {
 940                   switch(output_desc->channel[1].type)
 941                   {
 942                   case UTIL_FORMAT_TYPE_UNSIGNED:
 943                      v = output_desc->channel[1].normalized ? 0xffffffff : 1;
 944                      break;
 945                   case UTIL_FORMAT_TYPE_SIGNED:
 946                      v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
 947                      break;
 948                   case UTIL_FORMAT_TYPE_FLOAT:
 949                      v = 0x3f800000;
 950                      break;
 951                   default:
 952                      return FALSE;
 953                   }
 954                }
 955                x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
 956             }
 957             else
 958             {
 959                x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
 960                x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
 961             }
 962             break;
 963          case 64:
 964             if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
 965             {
 966                unsigned l = 0;
 967                unsigned h = 0;
 968                if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
 969                {
 970                   switch(output_desc->channel[1].type)
 971                   {
 972                   case UTIL_FORMAT_TYPE_UNSIGNED:
 973                      h = output_desc->channel[1].normalized ? 0xffffffff : 0;
 974                      l = output_desc->channel[1].normalized ? 0xffffffff : 1;
 975                      break;
 976                   case UTIL_FORMAT_TYPE_SIGNED:
 977                      h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
 978                      l = output_desc->channel[1].normalized ? 0xffffffff : 1;
 979                      break;
 980                   case UTIL_FORMAT_TYPE_FLOAT:
 981                      h = 0x3ff00000;
 982                      l = 0;
 983                      break;
 984                   default:
 985                      return FALSE;
 986                   }
 987                }
 988                x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
 989                x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
 990             }
 991             else
 992             {
 993                if(x86_target_caps(p->func) & X86_SSE)
 994                {
 995                   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
 996                   emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8));
 997                   emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
 998                }
 999                else
1000                {
1001                   x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
1002                   x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
1003                   x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4));
1004                   x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
1005                }
1006             }
1007             break;
1008          default:
1009             return FALSE;
1010          }
1011       }
1012       return TRUE;
1013    }
1014    /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
1015    else if((x86_target_caps(p->func) & X86_SSE2) &&
1016          a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && (0
1017                || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
1018                || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM
1019          ))
1020    {
1021       struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
1022
1023       /* load */
1024       sse_movups(p->func, dataXMM, src);
1025
1026       if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM)
1027          sse_shufps(p->func, dataXMM, dataXMM, SHUF(2,1,0,3));
1028
1029       /* scale by 255.0 */
1030       sse_mulps(p->func, dataXMM, get_const(p, CONST_255));
1031
1032       /* pack and emit */
1033       sse2_cvtps2dq(p->func, dataXMM, dataXMM);
1034       sse2_packssdw(p->func, dataXMM, dataXMM);
1035       sse2_packuswb(p->func, dataXMM, dataXMM);
1036       sse2_movd(p->func, dst, dataXMM);
1037
1038       return TRUE;
1039    }
1040
1041    return FALSE;
1042 }
1043
1044 static boolean translate_attr( struct translate_sse *p,
1045                                const struct translate_element *a,
1046                                struct x86_reg src,
1047                                struct x86_reg dst)
1048 {
1049    if(a->input_format == a->output_format)
1050    {
1051       emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
1052       return TRUE;
1053    }
1054
1055    return translate_attr_convert(p, a, src, dst);
1056 }
1057
1058 static boolean init_inputs( struct translate_sse *p,
1059                             unsigned index_size )
1060 {
1061    unsigned i;
1062    struct x86_reg instance_id = x86_make_disp(p->machine_EDI,
1063                                               get_offset(p, &p->instance_id));
1064
1065    for (i = 0; i < p->nr_buffer_variants; i++) {
1066       struct translate_buffer_variant *variant = &p->buffer_variant[i];
1067       struct translate_buffer *buffer = &p->buffer[variant->buffer_index];
1068
1069       if (!index_size || variant->instance_divisor) {
1070          struct x86_reg buf_max_index = x86_make_disp(p->machine_EDI,
1071                                                      get_offset(p, &buffer->max_index));
1072          struct x86_reg buf_stride   = x86_make_disp(p->machine_EDI,
1073                                                      get_offset(p, &buffer->stride));
1074          struct x86_reg buf_ptr      = x86_make_disp(p->machine_EDI,
1075                                                      get_offset(p, &variant->ptr));
1076          struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI,
1077                                                      get_offset(p, &buffer->base_ptr));
1078          struct x86_reg elt = p->idx_ESI;
1079          struct x86_reg tmp_EAX = p->tmp_EAX;
1080
1081          /* Calculate pointer to first attrib:
1082           *   base_ptr + stride * index, where index depends on instance divisor
1083           */
1084          if (variant->instance_divisor) {
1085             /* Our index is instance ID divided by instance divisor.
1086              */
1087             x86_mov(p->func, tmp_EAX, instance_id);
1088
1089             if (variant->instance_divisor != 1) {
1090                struct x86_reg tmp_EDX = p->tmp2_EDX;
1091                struct x86_reg tmp_ECX = p->src_ECX;
1092
1093                /* TODO: Add x86_shr() to rtasm and use it whenever
1094                 *       instance divisor is power of two.
1095                 */
1096
1097                x86_xor(p->func, tmp_EDX, tmp_EDX);
1098                x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
1099                x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
1100             }
1101
1102             /* XXX we need to clamp the index here too, but to a
1103              * per-array max value, not the draw->pt.max_index value
1104              * that's being given to us via translate->set_buffer().
1105              */
1106          } else {
1107             x86_mov(p->func, tmp_EAX, elt);
1108
1109             /* Clamp to max_index
1110              */
1111             x86_cmp(p->func, tmp_EAX, buf_max_index);
1112             x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);
1113          }
1114
1115          x86_imul(p->func, tmp_EAX, buf_stride);
1116          x64_rexw(p->func);
1117          x86_add(p->func, tmp_EAX, buf_base_ptr);
1118
1119          x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
1120
1121          /* In the linear case, keep the buffer pointer instead of the
1122           * index number.
1123           */
1124          if (!index_size && p->nr_buffer_variants == 1)
1125          {
1126             x64_rexw(p->func);
1127             x86_mov(p->func, elt, tmp_EAX);
1128          }
1129          else
1130          {
1131             x64_rexw(p->func);
1132             x86_mov(p->func, buf_ptr, tmp_EAX);
1133          }
1134       }
1135    }
1136
1137    return TRUE;
1138 }
1139
1140
1141 static struct x86_reg get_buffer_ptr( struct translate_sse *p,
1142                                       unsigned index_size,
1143                                       unsigned var_idx,
1144                                       struct x86_reg elt )
1145 {
1146    if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
1147       return x86_make_disp(p->machine_EDI,
1148                            get_offset(p, &p->instance_id));
1149    }
1150    if (!index_size && p->nr_buffer_variants == 1) {
1151       return p->idx_ESI;
1152    }
1153    else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
1154       struct x86_reg ptr = p->src_ECX;
1155       struct x86_reg buf_ptr =
1156          x86_make_disp(p->machine_EDI,
1157                        get_offset(p, &p->buffer_variant[var_idx].ptr));
1158
1159       x64_rexw(p->func);
1160       x86_mov(p->func, ptr, buf_ptr);
1161       return ptr;
1162    }
1163    else {
1164       struct x86_reg ptr = p->src_ECX;
1165       const struct translate_buffer_variant *variant = &p->buffer_variant[var_idx];
1166
1167       struct x86_reg buf_stride =
1168          x86_make_disp(p->machine_EDI,
1169                        get_offset(p, &p->buffer[variant->buffer_index].stride));
1170
1171       struct x86_reg buf_base_ptr =
1172          x86_make_disp(p->machine_EDI,
1173                        get_offset(p, &p->buffer[variant->buffer_index].base_ptr));
1174
1175       struct x86_reg buf_max_index =
1176          x86_make_disp(p->machine_EDI,
1177                        get_offset(p, &p->buffer[variant->buffer_index].max_index));
1178
1179
1180
1181       /* Calculate pointer to current attrib:
1182        */
1183       switch(index_size)
1184       {
1185       case 1:
1186          x86_movzx8(p->func, ptr, elt);
1187          break;
1188       case 2:
1189          x86_movzx16(p->func, ptr, elt);
1190          break;
1191       case 4:
1192          x86_mov(p->func, ptr, elt);
1193          break;
1194       }
1195
1196       /* Clamp to max_index
1197        */
1198       x86_cmp(p->func, ptr, buf_max_index);
1199       x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);
1200
1201       x86_imul(p->func, ptr, buf_stride);
1202       x64_rexw(p->func);
1203       x86_add(p->func, ptr, buf_base_ptr);
1204       return ptr;
1205    }
1206 }
1207
1208
1209
1210 static boolean incr_inputs( struct translate_sse *p,
1211                             unsigned index_size )
1212 {
1213    if (!index_size && p->nr_buffer_variants == 1) {
1214       struct x86_reg stride = x86_make_disp(p->machine_EDI,
1215                                             get_offset(p, &p->buffer[0].stride));
1216
1217       if (p->buffer_variant[0].instance_divisor == 0) {
1218          x64_rexw(p->func);
1219          x86_add(p->func, p->idx_ESI, stride);
1220          sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
1221       }
1222    }
1223    else if (!index_size) {
1224       unsigned i;
1225
1226       /* Is this worthwhile??
1227        */
1228       for (i = 0; i < p->nr_buffer_variants; i++) {
1229          struct translate_buffer_variant *variant = &p->buffer_variant[i];
1230          struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
1231                                                 get_offset(p, &variant->ptr));
1232          struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
1233                                                    get_offset(p, &p->buffer[variant->buffer_index].stride));
1234
1235          if (variant->instance_divisor == 0) {
1236             x86_mov(p->func, p->tmp_EAX, buf_stride);
1237             x64_rexw(p->func);
1238             x86_add(p->func, p->tmp_EAX, buf_ptr);
1239             if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
1240             x64_rexw(p->func);
1241             x86_mov(p->func, buf_ptr, p->tmp_EAX);
1242          }
1243       }
1244    }
1245    else {
1246       x64_rexw(p->func);
1247       x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
1248    }
1249
1250    return TRUE;
1251 }
1252
1253
1254 /* Build run( struct translate *machine,
1255  *            unsigned start,
1256  *            unsigned count,
1257  *            void *output_buffer )
1258  * or
1259  *  run_elts( struct translate *machine,
1260  *            unsigned *elts,
1261  *            unsigned count,
1262  *            void *output_buffer )
1263  *
1264  *  Lots of hardcoding
1265  *
1266  * EAX -- pointer to current output vertex
1267  * ECX -- pointer to current attribute
1268  *
1269  */
1270 static boolean build_vertex_emit( struct translate_sse *p,
1271                                   struct x86_function *func,
1272                                   unsigned index_size )
1273 {
1274    int fixup, label;
1275    unsigned j;
1276
1277    memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
1278    memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));
1279
1280    p->tmp_EAX       = x86_make_reg(file_REG32, reg_AX);
1281    p->idx_ESI       = x86_make_reg(file_REG32, reg_SI);
1282    p->outbuf_EBX    = x86_make_reg(file_REG32, reg_BX);
1283    p->machine_EDI   = x86_make_reg(file_REG32, reg_DI);
1284    p->count_EBP     = x86_make_reg(file_REG32, reg_BP);
1285    p->tmp2_EDX     = x86_make_reg(file_REG32, reg_DX);
1286    p->src_ECX     = x86_make_reg(file_REG32, reg_CX);
1287
1288    p->func = func;
1289
1290    x86_init_func(p->func);
1291
1292    if(x86_target(p->func) == X86_64_WIN64_ABI)
1293    {
1294            /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */
1295            sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6));
1296            sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7));
1297    }
1298
1299    x86_push(p->func, p->outbuf_EBX);
1300    x86_push(p->func, p->count_EBP);
1301
1302 /* on non-Win64 x86-64, these are already in the right registers */
1303    if(x86_target(p->func) != X86_64_STD_ABI)
1304    {
1305       x86_push(p->func, p->machine_EDI);
1306       x86_push(p->func, p->idx_ESI);
1307
1308       x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
1309       x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
1310    }
1311
1312    x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));
1313
1314    if(x86_target(p->func) != X86_32)
1315       x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
1316    else
1317       x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
1318
1319    /* Load instance ID.
1320     */
1321    if (p->use_instancing) {
1322       x86_mov(p->func,
1323               p->tmp_EAX,
1324               x86_fn_arg(p->func, 4));
1325       x86_mov(p->func,
1326               x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
1327               p->tmp_EAX);
1328    }
1329
1330    /* Get vertex count, compare to zero
1331     */
1332    x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
1333    x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
1334    fixup = x86_jcc_forward(p->func, cc_E);
1335
1336    /* always load, needed or not:
1337     */
1338    init_inputs(p, index_size);
1339
1340    /* Note address for loop jump
1341     */
1342    label = x86_get_label(p->func);
1343    {
1344       struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
1345       int last_variant = -1;
1346       struct x86_reg vb;
1347
1348       for (j = 0; j < p->translate.key.nr_elements; j++) {
1349          const struct translate_element *a = &p->translate.key.element[j];
1350          unsigned variant = p->element_to_buffer_variant[j];
1351
1352          /* Figure out source pointer address:
1353           */
1354          if (variant != last_variant) {
1355             last_variant = variant;
1356             vb = get_buffer_ptr(p, index_size, variant, elt);
1357          }
1358
1359          if (!translate_attr( p, a,
1360                               x86_make_disp(vb, a->input_offset),
1361                               x86_make_disp(p->outbuf_EBX, a->output_offset)))
1362             return FALSE;
1363       }
1364
1365       /* Next output vertex:
1366        */
1367       x64_rexw(p->func);
1368       x86_lea(p->func,
1369               p->outbuf_EBX,
1370               x86_make_disp(p->outbuf_EBX,
1371                             p->translate.key.output_stride));
1372
1373       /* Incr index
1374        */
1375       incr_inputs( p, index_size );
1376    }
1377
1378    /* decr count, loop if not zero
1379     */
1380    x86_dec(p->func, p->count_EBP);
1381    x86_jcc(p->func, cc_NZ, label);
1382
1383    /* Exit mmx state?
1384     */
1385    if (p->func->need_emms)
1386       mmx_emms(p->func);
1387
1388    /* Land forward jump here:
1389     */
1390    x86_fixup_fwd_jump(p->func, fixup);
1391
1392    /* Pop regs and return
1393     */
1394
1395    if(x86_target(p->func) != X86_64_STD_ABI)
1396    {
1397       x86_pop(p->func, p->idx_ESI);
1398       x86_pop(p->func, p->machine_EDI);
1399    }
1400
1401    x86_pop(p->func, p->count_EBP);
1402    x86_pop(p->func, p->outbuf_EBX);
1403
1404    if(x86_target(p->func) == X86_64_WIN64_ABI)
1405    {
1406            sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
1407            sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
1408    }
1409    x86_ret(p->func);
1410
1411    return TRUE;
1412 }
1413
1414
1415
1416
1417
1418
1419
1420 static void translate_sse_set_buffer( struct translate *translate,
1421                                 unsigned buf,
1422                                 const void *ptr,
1423                                 unsigned stride,
1424                                 unsigned max_index )
1425 {
1426    struct translate_sse *p = (struct translate_sse *)translate;
1427
1428    if (buf < p->nr_buffers) {
1429       p->buffer[buf].base_ptr = (char *)ptr;
1430       p->buffer[buf].stride = stride;
1431       p->buffer[buf].max_index = max_index;
1432    }
1433
1434    if (0) debug_printf("%s %d/%d: %p %d\n",
1435                        __FUNCTION__, buf,
1436                        p->nr_buffers,
1437                        ptr, stride);
1438 }
1439
1440
1441 static void translate_sse_release( struct translate *translate )
1442 {
1443    struct translate_sse *p = (struct translate_sse *)translate;
1444
1445    x86_release_func( &p->linear_func );
1446    x86_release_func( &p->elt_func );
1447
1448    os_free_aligned(p);
1449 }
1450
1451
1452 struct translate *translate_sse2_create( const struct translate_key *key )
1453 {
1454    struct translate_sse *p = NULL;
1455    unsigned i;
1456
1457    /* this is misnamed, it actually refers to whether rtasm is enabled or not */
1458    if (!rtasm_cpu_has_sse())
1459       goto fail;
1460
1461    p = os_malloc_aligned(sizeof(struct translate_sse), 16);
1462    if (p == NULL)
1463       goto fail;
1464    memset(p, 0, sizeof(*p));
1465    memcpy(p->consts, consts, sizeof(consts));
1466
1467    p->translate.key = *key;
1468    p->translate.release = translate_sse_release;
1469    p->translate.set_buffer = translate_sse_set_buffer;
1470
1471    for (i = 0; i < key->nr_elements; i++) {
1472       if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
1473          unsigned j;
1474
1475          p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);
1476
1477          if (key->element[i].instance_divisor) {
1478             p->use_instancing = TRUE;
1479          }
1480
1481          /*
1482           * Map vertex element to vertex buffer variant.
1483           */
1484          for (j = 0; j < p->nr_buffer_variants; j++) {
1485             if (p->buffer_variant[j].buffer_index == key->element[i].input_buffer &&
1486                 p->buffer_variant[j].instance_divisor == key->element[i].instance_divisor) {
1487                break;
1488             }
1489          }
1490          if (j == p->nr_buffer_variants) {
1491             p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
1492             p->buffer_variant[j].instance_divisor = key->element[i].instance_divisor;
1493             p->nr_buffer_variants++;
1494          }
1495          p->element_to_buffer_variant[i] = j;
1496       } else {
1497          assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);
1498
1499          p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
1500       }
1501    }
1502
1503    if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);
1504
1505    if (!build_vertex_emit(p, &p->linear_func, 0))
1506       goto fail;
1507
1508    if (!build_vertex_emit(p, &p->elt_func, 4))
1509       goto fail;
1510
1511    if (!build_vertex_emit(p, &p->elt16_func, 2))
1512       goto fail;
1513
1514    if (!build_vertex_emit(p, &p->elt8_func, 1))
1515       goto fail;
1516
1517    p->translate.run = (run_func) x86_get_func(&p->linear_func);
1518    if (p->translate.run == NULL)
1519       goto fail;
1520
1521    p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
1522    if (p->translate.run_elts == NULL)
1523       goto fail;
1524
1525    p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
1526    if (p->translate.run_elts16 == NULL)
1527       goto fail;
1528
1529    p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
1530    if (p->translate.run_elts8 == NULL)
1531       goto fail;
1532
1533    return &p->translate;
1534
1535  fail:
1536    if (p)
1537       translate_sse_release( &p->translate );
1538
1539    return NULL;
1540 }
1541
1542
1543
1544 #else
1545
/* Fallback used when the build targets neither x86 nor x86-64: no code
 * generator is compiled in, so creation always fails with NULL.
 */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   (void) key;

   return NULL;
}
1550
1551 #endif