src/gallium/drivers/cell/spu/spu_texture.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28
  29 #include <math.h>
  30
  31 #include "pipe/p_compiler.h"
  32 #include "spu_main.h"
  33 #include "spu_texture.h"
  34 #include "spu_tile.h"
  35 #include "spu_colorpack.h"
  36 #include "spu_dcache.h"
  37
  38
  39 /**
  40  * Mark all tex cache entries as invalid.
  41  */
  42 void
  43 invalidate_tex_cache(void)
  44 {
  45    uint lvl;
  46    for (lvl = 0; lvl < CELL_MAX_TEXTURE_LEVELS; lvl++) {
  47       uint unit = 0;
  48       uint bytes = 4 * spu.texture[unit].level[lvl].width
  49          * spu.texture[unit].level[lvl].height;
  50
  51       if (spu.texture[unit].target == PIPE_TEXTURE_CUBE)
  52          bytes *= 6;
  53       else if (spu.texture[unit].target == PIPE_TEXTURE_3D)
  54          bytes *= spu.texture[unit].level[lvl].depth;
  55
  56       spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
  57    }
  58 }
  59
  60
  61 /**
  62  * Get four texels from locations (x[0], y[0]), (x[1], y[1]) ...
  63  *
  64  * NOTE: in the typical case of bilinear filtering, the four texels
  65  * are in a 2x2 group so we could get by with just two dcache fetches
  66  * (two side-by-side texels per fetch).  But when bilinear filtering
  67  * wraps around a texture edge, we'll probably need code like we have
  68  * now.
  69  * FURTHERMORE: since we're rasterizing a quad of 2x2 pixels at a time,
  70  * it's quite likely that the four pixels in a quad will need some of the
  71  * same texels.  So look into doing texture fetches for four pixels at
  72  * a time.
  73  */
  74 static void
  75 get_four_texels(const struct spu_texture_level *tlevel, uint face,
  76                 vec_int4 x, vec_int4 y,
  77                 vec_uint4 *texels)
  78 {
  79    unsigned texture_ea = (uintptr_t) tlevel->start;
  80    const vec_int4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
  81    const vec_int4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
  82    const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
  83    const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
  84
  85    const qword tiles_per_row = (qword) spu_splats(tlevel->tiles_per_row);
  86    const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
  87
  88    qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
  89    tile_offset = si_mpy((qword) tile_offset, tile_size);
  90
  91    qword texel_offset = si_a(si_mpyui(offset_y, 32), offset_x);
  92    texel_offset = si_mpyui(texel_offset, 4);
  93
  94    vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset);
  95
  96    texture_ea = texture_ea + face * tlevel->bytes_per_image;
  97
  98    spu_dcache_fetch_unaligned((qword *) & texels[0],
  99                               texture_ea + spu_extract(offset, 0), 4);
 100    spu_dcache_fetch_unaligned((qword *) & texels[1],
 101                               texture_ea + spu_extract(offset, 1), 4);
 102    spu_dcache_fetch_unaligned((qword *) & texels[2],
 103                               texture_ea + spu_extract(offset, 2), 4);
 104    spu_dcache_fetch_unaligned((qword *) & texels[3],
 105                               texture_ea + spu_extract(offset, 3), 4);
 106 }
 107
 108
 109 /** clamp vec to [0, max] */
 110 static INLINE vector signed int
 111 spu_clamp(vector signed int vec, vector signed int max)
 112 {
 113    static const vector signed int zero = {0,0,0,0};
 114    vector unsigned int c;
 115    c = spu_cmpgt(vec, zero);    /* c = vec > zero ? ~0 : 0 */
 116    vec = spu_sel(zero, vec, c);
 117    c = spu_cmpgt(vec, max);    /* c = vec > max ? ~0 : 0 */
 118    vec = spu_sel(vec, max, c);
 119    return vec;
 120 }
 121
 122
 123
 124 /**
 125  * Do nearest texture sampling for four pixels.
 126  * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
 127  */
 128 void
 129 sample_texture_2d_nearest(vector float s, vector float t,
 130                           uint unit, uint level, uint face,
 131                           vector float colors[4])
 132 {
 133    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
 134    vector float ss = spu_mul(s, tlevel->scale_s);
 135    vector float tt = spu_mul(t, tlevel->scale_t);
 136    vector signed int is = spu_convts(ss, 0);
 137    vector signed int it = spu_convts(tt, 0);
 138    vec_uint4 texels[4];
 139
 140    /* PIPE_TEX_WRAP_REPEAT */
 141    is = spu_and(is, tlevel->mask_s);
 142    it = spu_and(it, tlevel->mask_t);
 143
 144    /* PIPE_TEX_WRAP_CLAMP */
 145    is = spu_clamp(is, tlevel->max_s);
 146    it = spu_clamp(it, tlevel->max_t);
 147
 148    get_four_texels(tlevel, face, is, it, texels);
 149
 150    /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
 151    spu_unpack_A8R8G8B8_transpose4(texels, colors);
 152 }
 153
 154
 155 /**
 156  * Do bilinear texture sampling for four pixels.
 157  * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
 158  */
 159 void
 160 sample_texture_2d_bilinear(vector float s, vector float t,
 161                            uint unit, uint level, uint face,
 162                            vector float colors[4])
 163 {
 164    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
 165    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
 166
 167    vector float ss = spu_madd(s, tlevel->scale_s, half);
 168    vector float tt = spu_madd(t, tlevel->scale_t, half);
 169
 170    vector signed int is0 = spu_convts(ss, 0);
 171    vector signed int it0 = spu_convts(tt, 0);
 172
 173    /* is + 1, it + 1 */
 174    vector signed int is1 = spu_add(is0, 1);
 175    vector signed int it1 = spu_add(it0, 1);
 176
 177    /* PIPE_TEX_WRAP_REPEAT */
 178    is0 = spu_and(is0, tlevel->mask_s);
 179    it0 = spu_and(it0, tlevel->mask_t);
 180    is1 = spu_and(is1, tlevel->mask_s);
 181    it1 = spu_and(it1, tlevel->mask_t);
 182
 183    /* PIPE_TEX_WRAP_CLAMP */
 184    is0 = spu_clamp(is0, tlevel->max_s);
 185    it0 = spu_clamp(it0, tlevel->max_t);
 186    is1 = spu_clamp(is1, tlevel->max_s);
 187    it1 = spu_clamp(it1, tlevel->max_t);
 188
 189    /* get packed int texels */
 190    vector unsigned int texels[16];
 191    get_four_texels(tlevel, face, is0, it0, texels + 0);  /* upper-left */
 192    get_four_texels(tlevel, face, is1, it0, texels + 4);  /* upper-right */
 193    get_four_texels(tlevel, face, is0, it1, texels + 8);  /* lower-left */
 194    get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */
 195
 196    /* convert packed int texels to float colors */
 197    vector float ftexels[16];
 198    spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0);
 199    spu_unpack_A8R8G8B8_transpose4(texels + 4, ftexels + 4);
 200    spu_unpack_A8R8G8B8_transpose4(texels + 8, ftexels + 8);
 201    spu_unpack_A8R8G8B8_transpose4(texels + 12, ftexels + 12);
 202
 203    /* Compute weighting factors in [0,1]
 204     * Multiply texcoord by 1024, AND with 1023, convert back to float.
 205     */
 206    vector float ss1024 = spu_mul(ss, spu_splats(1024.0f));
 207    vector signed int iss1024 = spu_convts(ss1024, 0);
 208    iss1024 = spu_and(iss1024, 1023);
 209    vector float sWeights0 = spu_convtf(iss1024, 10);
 210
 211    vector float tt1024 = spu_mul(tt, spu_splats(1024.0f));
 212    vector signed int itt1024 = spu_convts(tt1024, 0);
 213    itt1024 = spu_and(itt1024, 1023);
 214    vector float tWeights0 = spu_convtf(itt1024, 10);
 215
 216    /* 1 - sWeight and 1 - tWeight */
 217    vector float sWeights1 = spu_sub(spu_splats(1.0f), sWeights0);
 218    vector float tWeights1 = spu_sub(spu_splats(1.0f), tWeights0);
 219
 220    /* reds, for four pixels */
 221    ftexels[ 0] = spu_mul(ftexels[ 0], spu_mul(sWeights1, tWeights1)); /*ul*/
 222    ftexels[ 4] = spu_mul(ftexels[ 4], spu_mul(sWeights0, tWeights1)); /*ur*/
 223    ftexels[ 8] = spu_mul(ftexels[ 8], spu_mul(sWeights1, tWeights0)); /*ll*/
 224    ftexels[12] = spu_mul(ftexels[12], spu_mul(sWeights0, tWeights0)); /*lr*/
 225    colors[0] = spu_add(spu_add(ftexels[0], ftexels[4]),
 226                        spu_add(ftexels[8], ftexels[12]));
 227
 228    /* greens, for four pixels */
 229    ftexels[ 1] = spu_mul(ftexels[ 1], spu_mul(sWeights1, tWeights1)); /*ul*/
 230    ftexels[ 5] = spu_mul(ftexels[ 5], spu_mul(sWeights0, tWeights1)); /*ur*/
 231    ftexels[ 9] = spu_mul(ftexels[ 9], spu_mul(sWeights1, tWeights0)); /*ll*/
 232    ftexels[13] = spu_mul(ftexels[13], spu_mul(sWeights0, tWeights0)); /*lr*/
 233    colors[1] = spu_add(spu_add(ftexels[1], ftexels[5]),
 234                        spu_add(ftexels[9], ftexels[13]));
 235
 236    /* blues, for four pixels */
 237    ftexels[ 2] = spu_mul(ftexels[ 2], spu_mul(sWeights1, tWeights1)); /*ul*/
 238    ftexels[ 6] = spu_mul(ftexels[ 6], spu_mul(sWeights0, tWeights1)); /*ur*/
 239    ftexels[10] = spu_mul(ftexels[10], spu_mul(sWeights1, tWeights0)); /*ll*/
 240    ftexels[14] = spu_mul(ftexels[14], spu_mul(sWeights0, tWeights0)); /*lr*/
 241    colors[2] = spu_add(spu_add(ftexels[2], ftexels[6]),
 242                        spu_add(ftexels[10], ftexels[14]));
 243
 244    /* alphas, for four pixels */
 245    ftexels[ 3] = spu_mul(ftexels[ 3], spu_mul(sWeights1, tWeights1)); /*ul*/
 246    ftexels[ 7] = spu_mul(ftexels[ 7], spu_mul(sWeights0, tWeights1)); /*ur*/
 247    ftexels[11] = spu_mul(ftexels[11], spu_mul(sWeights1, tWeights0)); /*ll*/
 248    ftexels[15] = spu_mul(ftexels[15], spu_mul(sWeights0, tWeights0)); /*lr*/
 249    colors[3] = spu_add(spu_add(ftexels[3], ftexels[7]),
 250                        spu_add(ftexels[11], ftexels[15]));
 251 }
 252
 253
 254
 255 /**
 256  * Adapted from /opt/cell/sdk/usr/spu/include/transpose_matrix4x4.h
 257  */
 258 static INLINE void
 259 transpose(vector unsigned int *mOut0,
 260           vector unsigned int *mOut1,
 261           vector unsigned int *mOut2,
 262           vector unsigned int *mOut3,
 263           vector unsigned int *mIn)
 264 {
 265   vector unsigned int abcd, efgh, ijkl, mnop;   /* input vectors */
 266   vector unsigned int aeim, bfjn, cgko, dhlp;   /* output vectors */
 267   vector unsigned int aibj, ckdl, emfn, gohp;   /* intermediate vectors */
 268
 269   vector unsigned char shufflehi = ((vector unsigned char) {
 270                                                0x00, 0x01, 0x02, 0x03,
 271                                                0x10, 0x11, 0x12, 0x13,
 272                                                0x04, 0x05, 0x06, 0x07,
 273                                                0x14, 0x15, 0x16, 0x17});
 274   vector unsigned char shufflelo = ((vector unsigned char) {
 275                                                0x08, 0x09, 0x0A, 0x0B,
 276                                                0x18, 0x19, 0x1A, 0x1B,
 277                                                0x0C, 0x0D, 0x0E, 0x0F,
 278                                                0x1C, 0x1D, 0x1E, 0x1F});
 279   abcd = *(mIn+0);
 280   efgh = *(mIn+1);
 281   ijkl = *(mIn+2);
 282   mnop = *(mIn+3);
 283
 284   aibj = spu_shuffle(abcd, ijkl, shufflehi);
 285   ckdl = spu_shuffle(abcd, ijkl, shufflelo);
 286   emfn = spu_shuffle(efgh, mnop, shufflehi);
 287   gohp = spu_shuffle(efgh, mnop, shufflelo);
 288
 289   aeim = spu_shuffle(aibj, emfn, shufflehi);
 290   bfjn = spu_shuffle(aibj, emfn, shufflelo);
 291   cgko = spu_shuffle(ckdl, gohp, shufflehi);
 292   dhlp = spu_shuffle(ckdl, gohp, shufflelo);
 293
 294   *mOut0 = aeim;
 295   *mOut1 = bfjn;
 296   *mOut2 = cgko;
 297   *mOut3 = dhlp;
 298 }
 299
 300
 301 /**
 302  * Bilinear filtering, using int instead of float arithmetic for computing
 303  * sample weights.
 304  */
 305 void
 306 sample_texture_2d_bilinear_int(vector float s, vector float t,
 307                                uint unit, uint level, uint face,
 308                                vector float colors[4])
 309 {
 310    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
 311    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
 312
 313    /* Scale texcoords by size of texture, and add half pixel bias */
 314    vector float ss = spu_madd(s, tlevel->scale_s, half);
 315    vector float tt = spu_madd(t, tlevel->scale_t, half);
 316
 317    /* convert float coords to fixed-pt coords with 7 fraction bits */
 318    vector signed int is = spu_convts(ss, 7);  /* XXX really need floor() here */
 319    vector signed int it = spu_convts(tt, 7);  /* XXX really need floor() here */
 320
 321    /* compute integer texel weights in [0, 127] */
 322    vector signed int sWeights0 = spu_and(is, 127);
 323    vector signed int tWeights0 = spu_and(it, 127);
 324    vector signed int sWeights1 = spu_sub(127, sWeights0);
 325    vector signed int tWeights1 = spu_sub(127, tWeights0);
 326
 327    /* texel coords: is0 = is / 128, it0 = is / 128 */
 328    vector signed int is0 = spu_rlmask(is, -7);
 329    vector signed int it0 = spu_rlmask(it, -7);
 330
 331    /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */
 332    vector signed int is1 = spu_add(is0, 1);
 333    vector signed int it1 = spu_add(it0, 1);
 334
 335    /* PIPE_TEX_WRAP_REPEAT */
 336    is0 = spu_and(is0, tlevel->mask_s);
 337    it0 = spu_and(it0, tlevel->mask_t);
 338    is1 = spu_and(is1, tlevel->mask_s);
 339    it1 = spu_and(it1, tlevel->mask_t);
 340
 341    /* PIPE_TEX_WRAP_CLAMP */
 342    is0 = spu_clamp(is0, tlevel->max_s);
 343    it0 = spu_clamp(it0, tlevel->max_t);
 344    is1 = spu_clamp(is1, tlevel->max_s);
 345    it1 = spu_clamp(it1, tlevel->max_t);
 346
 347    /* get packed int texels */
 348    vector unsigned int texels[16];
 349    get_four_texels(tlevel, face, is0, it0, texels + 0);  /* upper-left */
 350    get_four_texels(tlevel, face, is1, it0, texels + 4);  /* upper-right */
 351    get_four_texels(tlevel, face, is0, it1, texels + 8);  /* lower-left */
 352    get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */
 353
 354    /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
 355    {
 356       static const unsigned char ZERO = 0x80;
 357       int i;
 358       for (i = 0; i < 16; i++) {
 359          texels[i] = spu_shuffle(texels[i], texels[i],
 360                                  ((vector unsigned char) {
 361                                     ZERO, ZERO, ZERO, 1,
 362                                     ZERO, ZERO, ZERO, 2,
 363                                     ZERO, ZERO, ZERO, 3,
 364                                     ZERO, ZERO, ZERO, 0}));
 365       }
 366    }
 367
 368    /* convert RGBA,RGBA,RGBA,RGBA to RRRR,GGGG,BBBB,AAAA */
 369    vector unsigned int texel0, texel1, texel2, texel3, texel4, texel5, texel6, texel7,
 370       texel8, texel9, texel10, texel11, texel12, texel13, texel14, texel15;
 371    transpose(&texel0, &texel1, &texel2, &texel3, texels + 0);
 372    transpose(&texel4, &texel5, &texel6, &texel7, texels + 4);
 373    transpose(&texel8, &texel9, &texel10, &texel11, texels + 8);
 374    transpose(&texel12, &texel13, &texel14, &texel15, texels + 12);
 375
 376    /* computed weighted colors */
 377    vector unsigned int c0, c1, c2, c3, cSum;
 378
 379    /* red */
 380    c0 = (vector unsigned int) si_mpy((qword) texel0, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
 381    c1 = (vector unsigned int) si_mpy((qword) texel4, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
 382    c2 = (vector unsigned int) si_mpy((qword) texel8, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
 383    c3 = (vector unsigned int) si_mpy((qword) texel12, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
 384    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
 385    colors[0] = spu_convtf(cSum, 22);
 386
 387    /* green */
 388    c0 = (vector unsigned int) si_mpy((qword) texel1, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
 389    c1 = (vector unsigned int) si_mpy((qword) texel5, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
 390    c2 = (vector unsigned int) si_mpy((qword) texel9, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
 391    c3 = (vector unsigned int) si_mpy((qword) texel13, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
 392    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
 393    colors[1] = spu_convtf(cSum, 22);
 394
 395    /* blue */
 396    c0 = (vector unsigned int) si_mpy((qword) texel2, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
 397    c1 = (vector unsigned int) si_mpy((qword) texel6, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
 398    c2 = (vector unsigned int) si_mpy((qword) texel10, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
 399    c3 = (vector unsigned int) si_mpy((qword) texel14, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
 400    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
 401    colors[2] = spu_convtf(cSum, 22);
 402
 403    /* alpha */
 404    c0 = (vector unsigned int) si_mpy((qword) texel3, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/
 405    c1 = (vector unsigned int) si_mpy((qword) texel7, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/
 406    c2 = (vector unsigned int) si_mpy((qword) texel11, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/
 407    c3 = (vector unsigned int) si_mpy((qword) texel15, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/
 408    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
 409    colors[3] = spu_convtf(cSum, 22);
 410 }
 411
 412
 413
 414 /**
 415  * Compute level of detail factor from texcoords.
 416  */
 417 static INLINE float
 418 compute_lambda_2d(uint unit, vector float s, vector float t)
 419 {
 420    uint baseLevel = 0;
 421    float width = spu.texture[unit].level[baseLevel].width;
 422    float height = spu.texture[unit].level[baseLevel].width;
 423    float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0));
 424    float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0));
 425    float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
 426    float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0));
 427 #if 0
 428    /* ideal value */
 429    float x = dsdx * dsdx + dtdx * dtdx;
 430    float y = dsdy * dsdy + dtdy * dtdy;
 431    float rho = x > y ? x : y;
 432    rho = sqrtf(rho);
 433 #else
 434    /* approximation */
 435    dsdx = fabsf(dsdx);
 436    dsdy = fabsf(dsdy);
 437    dtdx = fabsf(dtdx);
 438    dtdy = fabsf(dtdy);
 439    float rho = (dsdx + dsdy + dtdx + dtdy) * 0.5;
 440 #endif
 441    float lambda = logf(rho) * 1.442695f; /* compute logbase2(rho) */
 442    return lambda;
 443 }
 444
 445
 446 /**
 447  * Blend two sets of colors according to weight.
 448  */
 449 static void
 450 blend_colors(vector float c0[4], const vector float c1[4], float weight)
 451 {
 452    vector float t = spu_splats(weight);
 453    vector float dc0 = spu_sub(c1[0], c0[0]);
 454    vector float dc1 = spu_sub(c1[1], c0[1]);
 455    vector float dc2 = spu_sub(c1[2], c0[2]);
 456    vector float dc3 = spu_sub(c1[3], c0[3]);
 457    c0[0] = spu_madd(dc0, t, c0[0]);
 458    c0[1] = spu_madd(dc1, t, c0[1]);
 459    c0[2] = spu_madd(dc2, t, c0[2]);
 460    c0[3] = spu_madd(dc3, t, c0[3]);
 461 }
 462
 463
 464 /**
 465  * Texture sampling with level of detail selection and possibly mipmap
 466  * interpolation.
 467  */
 468 void
 469 sample_texture_2d_lod(vector float s, vector float t,
 470                       uint unit, uint level_ignored, uint face,
 471                       vector float colors[4])
 472 {
 473    /*
 474     * Note that we're computing a lambda/lod here that's used for all
 475     * four pixels in the quad.
 476     */
 477    float lambda = compute_lambda_2d(unit, s, t);
 478
 479    (void) face;
 480    (void) level_ignored;
 481
 482    /* apply lod bias */
 483    lambda += spu.sampler[unit].lod_bias;
 484
 485    /* clamp */
 486    if (lambda < spu.sampler[unit].min_lod)
 487       lambda = spu.sampler[unit].min_lod;
 488    else if (lambda > spu.sampler[unit].max_lod)
 489       lambda = spu.sampler[unit].max_lod;
 490
 491    if (lambda <= 0.0f) {
 492       /* magnify */
 493       spu.mag_sample_texture_2d[unit](s, t, unit, 0, face, colors);
 494    }
 495    else {
 496       /* minify */
 497       if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
 498          /* sample two mipmap levels and interpolate */
 499          int level = (int) lambda;
 500          if (level > (int) spu.texture[unit].max_level)
 501             level = spu.texture[unit].max_level;
 502          spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors);
 503          if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
 504             /* sample second mipmap level */
 505             float weight = lambda - (float) level;
 506             level++;
 507             if (level <= (int) spu.texture[unit].max_level) {
 508                vector float colors2[4];
 509                spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors2);
 510                blend_colors(colors, colors2, weight);
 511             }
 512          }
 513       }
 514       else {
 515          /* sample one mipmap level */
 516          int level = (int) (lambda + 0.5f);
 517          if (level > (int) spu.texture[unit].max_level)
 518             level = spu.texture[unit].max_level;
 519          spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors);
 520       }
 521    }
 522 }
 523
 524
 525 /** XXX need a SIMD version of this */
 526 static unsigned
 527 choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
 528 {
 529    /*
 530       major axis
 531       direction     target                             sc     tc    ma
 532       ----------    -------------------------------    ---    ---   ---
 533        +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
 534        -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
 535        +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
 536        -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
 537        +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
 538        -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
 539    */
 540    const float arx = fabsf(rx);
 541    const float ary = fabsf(ry);
 542    const float arz = fabsf(rz);
 543    unsigned face;
 544    float sc, tc, ma;
 545
 546    if (arx > ary && arx > arz) {
 547       if (rx >= 0.0F) {
 548          face = PIPE_TEX_FACE_POS_X;
 549          sc = -rz;
 550          tc = -ry;
 551          ma = arx;
 552       }
 553       else {
 554          face = PIPE_TEX_FACE_NEG_X;
 555          sc = rz;
 556          tc = -ry;
 557          ma = arx;
 558       }
 559    }
 560    else if (ary > arx && ary > arz) {
 561       if (ry >= 0.0F) {
 562          face = PIPE_TEX_FACE_POS_Y;
 563          sc = rx;
 564          tc = rz;
 565          ma = ary;
 566       }
 567       else {
 568          face = PIPE_TEX_FACE_NEG_Y;
 569          sc = rx;
 570          tc = -rz;
 571          ma = ary;
 572       }
 573    }
 574    else {
 575       if (rz > 0.0F) {
 576          face = PIPE_TEX_FACE_POS_Z;
 577          sc = rx;
 578          tc = -ry;
 579          ma = arz;
 580       }
 581       else {
 582          face = PIPE_TEX_FACE_NEG_Z;
 583          sc = -rx;
 584          tc = -ry;
 585          ma = arz;
 586       }
 587    }
 588
 589    *newS = (sc / ma + 1.0F) * 0.5F;
 590    *newT = (tc / ma + 1.0F) * 0.5F;
 591
 592    return face;
 593 }
 594
 595
 596
 597 void
 598 sample_texture_cube(vector float s, vector float t, vector float r,
 599                     uint unit, vector float colors[4])
 600 {
 601    uint p, faces[4], level = 0;
 602    float newS[4], newT[4];
 603
 604    /* Compute cube faces referenced by the four sets of texcoords.
 605     * XXX we should SIMD-ize this.
 606     */
 607    for (p = 0; p < 4; p++) {
 608       float rx = spu_extract(s, p);
 609       float ry = spu_extract(t, p);
 610       float rz = spu_extract(r, p);
 611       faces[p] = choose_cube_face(rx, ry, rz, &newS[p], &newT[p]);
 612    }
 613
 614    if (faces[0] == faces[1] &&
 615        faces[0] == faces[2] &&
 616        faces[0] == faces[3]) {
 617       /* GOOD!  All four texcoords refer to the same cube face */
 618       s = (vector float) {newS[0], newS[1], newS[2], newS[3]};
 619       t = (vector float) {newT[0], newT[1], newT[2], newT[3]};
 620       spu.sample_texture_2d[unit](s, t, unit, level, faces[0], colors);
 621    }
 622    else {
 623       /* BAD!  The four texcoords refer to different faces */
 624       for (p = 0; p < 4; p++) {
 625          vector float c[4];
 626
 627          spu.sample_texture_2d[unit](spu_splats(newS[p]), spu_splats(newT[p]),
 628                                      unit, level, faces[p], c);
 629
 630          float red = spu_extract(c[0], p);
 631          float green = spu_extract(c[1], p);
 632          float blue = spu_extract(c[2], p);
 633          float alpha = spu_extract(c[3], p);
 634
 635          colors[0] = spu_insert(red,   colors[0], p);
 636          colors[1] = spu_insert(green, colors[1], p);
 637          colors[2] = spu_insert(blue,  colors[2], p);
 638          colors[3] = spu_insert(alpha, colors[3], p);
 639       }
 640    }
 641 }