Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Adlfloat4SSE.inl

   1 /*
   2 Copyright (c) 2012 Advanced Micro Devices, Inc.
   3
   4 This software is provided 'as-is', without any express or implied warranty.
   5 In no event will the authors be held liable for any damages arising from the use of this software.
   6 Permission is granted to anyone to use this software for any purpose,
   7 including commercial applications, and to alter it and redistribute it freely,
   8 subject to the following restrictions:
   9
  10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
  11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
  12 3. This notice may not be removed or altered from any source distribution.
  13 */
  14 //Originally written by Takahiro Harada
  15
  16
  17 //#define CHECK_ALIGNMENT(a) CLASSERT((u32(&(a)) & 0xf) == 0);
  18 #define CHECK_ALIGNMENT(a) a;
  19
  20
  21 __inline
  22 float4 make_float4(float x, float y, float z, float w = 0.f)
  23 {
  24         float4 v;
  25         v.m_quad = _mm_set_ps(w,z,y,x);
  26
  27         return v;
  28 }
  29
  30 __inline
  31 float4 make_float4(float x)
  32 {
  33         return make_float4(x,x,x,x);
  34 }
  35
  36 __inline
  37 float4 make_float4(const int4& x)
  38 {
  39         return make_float4((float)x.s[0], (float)x.s[1], (float)x.s[2], (float)x.s[3]);
  40 }
  41
  42 __inline
  43 float2 make_float2(float x, float y)
  44 {
  45         float2 v;
  46         v.s[0] = x; v.s[1] = y;
  47         return v;
  48 }
  49
  50 __inline
  51 float2 make_float2(float x)
  52 {
  53         return make_float2(x,x);
  54 }
  55
  56 __inline
  57 float2 make_float2(const int2& x)
  58 {
  59         return make_float2((float)x.s[0], (float)x.s[1]);
  60 }
  61
  62 __inline
  63 int4 make_int4(int x, int y, int z, int w = 0)
  64 {
  65         int4 v;
  66         v.s[0] = x; v.s[1] = y; v.s[2] = z; v.s[3] = w;
  67         return v;
  68 }
  69
  70 __inline
  71 int4 make_int4(int x)
  72 {
  73         return make_int4(x,x,x,x);
  74 }
  75
  76 __inline
  77 int4 make_int4(const float4& x)
  78 {
  79         return make_int4((int)x.x, (int)x.y, (int)x.z, (int)x.w);
  80 }
  81
  82 __inline
  83 int2 make_int2(int a, int b)
  84 {
  85         int2 ans; ans.x = a; ans.y = b;
  86         return ans;
  87 }
  88
  89 __inline
  90 float4 operator-(const float4& a)
  91 {
  92         float4 zero; zero.m_quad = _mm_setzero_ps();
  93         float4 ans; ans.m_quad = _mm_sub_ps( zero.m_quad, a.m_quad );
  94         return ans;
  95 }
  96
  97 __inline
  98 float4 operator*(const float4& a, const float4& b)
  99 {
 100         CHECK_ALIGNMENT(a);
 101
 102         float4 out;
 103         out.m_quad = _mm_mul_ps( a.m_quad, b.m_quad );
 104         return out;
 105 }
 106
 107 __inline
 108 float4 operator*(float a, const float4& b)
 109 {
 110         float4 av; av.m_quad = _mm_set1_ps( a );
 111         return av*b;
 112 }
 113
 114 __inline
 115 float4 operator*(const float4& b, float a)
 116 {
 117         CHECK_ALIGNMENT(b);
 118
 119         float4 av; av.m_quad = _mm_set1_ps( a );
 120         return av*b;
 121 }
 122
 123 __inline
 124 void operator*=(float4& a, const float4& b)
 125 {
 126         CHECK_ALIGNMENT(a);
 127
 128         a = a*b;
 129 }
 130
 131 __inline
 132 void operator*=(float4& a, float b)
 133 {
 134         CHECK_ALIGNMENT(a);
 135
 136         float4 bv; bv.m_quad = _mm_set1_ps( b );
 137         a = a*bv;
 138 }
 139
 140 //
 141 __inline
 142 float4 operator/(const float4& a, const float4& b)
 143 {
 144         CHECK_ALIGNMENT(a);
 145
 146         float4 out;
 147         out.m_quad = _mm_div_ps( a.m_quad, b.m_quad );
 148         return out;
 149 }
 150
 151 __inline
 152 float4 operator/(const float4& b, float a)
 153 {
 154         CHECK_ALIGNMENT(b);
 155
 156         float4 av; av.m_quad = _mm_set1_ps( a );
 157         float4 out;
 158         out = b/av;
 159         return out;
 160 }
 161
 162 __inline
 163 void operator/=(float4& a, const float4& b)
 164 {
 165         a = a/b;
 166 }
 167
 168 __inline
 169 void operator/=(float4& a, float b)
 170 {
 171         CLASSERT((u32(&a) & 0xf) == 0);
 172
 173         float4 bv; bv.m_quad = _mm_set1_ps( b );
 174         a = a/bv;
 175 }
 176 //
 177
 178 __inline
 179 float4 operator+(const float4& a, const float4& b)
 180 {
 181         CHECK_ALIGNMENT(a);
 182
 183         float4 out;
 184         out.m_quad = _mm_add_ps( a.m_quad, b.m_quad );
 185         return out;
 186 }
 187
 188 __inline
 189 float4 operator+(const float4& a, float b)
 190 {
 191         CHECK_ALIGNMENT(a);
 192
 193         float4 bv; bv.m_quad = _mm_set1_ps( b );
 194         return a+bv;
 195 }
 196
 197 __inline
 198 float4 operator-(const float4& a, const float4& b)
 199 {
 200         CHECK_ALIGNMENT(a);
 201
 202         float4 out;
 203         out.m_quad = _mm_sub_ps( a.m_quad, b.m_quad );
 204         return out;
 205 }
 206
 207 __inline
 208 float4 operator-(const float4& a, float b)
 209 {
 210         CHECK_ALIGNMENT(a);
 211
 212         float4 bv; bv.m_quad = _mm_set1_ps( b );
 213         return a-bv;
 214 }
 215
 216 __inline
 217 void operator+=(float4& a, const float4& b)
 218 {
 219         CHECK_ALIGNMENT(a);
 220
 221         a = a + b;
 222 }
 223
 224 __inline
 225 void operator+=(float4& a, float b)
 226 {
 227         CHECK_ALIGNMENT(a);
 228
 229         float4 bv; bv.m_quad = _mm_set1_ps( b );
 230
 231         a = a + bv;
 232 }
 233
 234 __inline
 235 void operator-=(float4& a, const float4& b)
 236 {
 237         CHECK_ALIGNMENT(a);
 238
 239         a = a - b;
 240 }
 241
 242 __inline
 243 void operator-=(float4& a, float b)
 244 {
 245         CHECK_ALIGNMENT(a);
 246
 247         float4 bv; bv.m_quad = _mm_set1_ps( b );
 248
 249         a = a - bv;
 250 }
 251
 252
 253
 254
 255
 256 __inline
 257 float4 cross3(const float4& a, const float4& b)
 258 {       //      xnamathvector.inl
 259         union IntVec
 260         {
 261                 unsigned int m_i[4];
 262                 __m128 m_v;
 263         };
 264
 265         IntVec mask3 = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000};
 266         __m128 V1 = a.m_quad;
 267         __m128 V2 = b.m_quad;
 268
 269     __m128 vTemp1 = _mm_shuffle_ps(V1,V1,_MM_SHUFFLE(3,0,2,1));
 270     // z2,x2,y2,w2
 271     __m128 vTemp2 = _mm_shuffle_ps(V2,V2,_MM_SHUFFLE(3,1,0,2));
 272     // Perform the left operation
 273     __m128 vResult = _mm_mul_ps(vTemp1,vTemp2);
 274     // z1,x1,y1,w1
 275     vTemp1 = _mm_shuffle_ps(vTemp1,vTemp1,_MM_SHUFFLE(3,0,2,1));
 276     // y2,z2,x2,w2
 277     vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(3,1,0,2));
 278     // Perform the right operation
 279     vTemp1 = _mm_mul_ps(vTemp1,vTemp2);
 280     // Subract the right from left, and return answer
 281     vResult = _mm_sub_ps(vResult,vTemp1);
 282     // Set w to zero
 283         float4 ans; ans.m_quad = _mm_and_ps(vResult,mask3.m_v);
 284         return ans;
 285 }
 286
 287 __inline
 288 float dot3F4(const float4& a, const float4& b)
 289 {
 290 //      return a.x*b.x+a.y*b.y+a.z*b.z;
 291     // Perform the dot product
 292         __m128 V1 = a.m_quad;
 293         __m128 V2 = b.m_quad;
 294
 295         __m128 vDot = _mm_mul_ps(V1,V2);
 296     // x=Dot.vector4_f32[1], y=Dot.vector4_f32[2]
 297     __m128 vTemp = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(2,1,2,1));
 298     // Result.vector4_f32[0] = x+y
 299     vDot = _mm_add_ss(vDot,vTemp);
 300     // x=Dot.vector4_f32[2]
 301     vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1));
 302     // Result.vector4_f32[0] = (x+y)+z
 303     vDot = _mm_add_ss(vDot,vTemp);
 304     // Splat x
 305         float4 ans; ans.m_quad = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(0,0,0,0));
 306         return ans.x;
 307 }
 308
 309 __inline
 310 float length3(const float4& a)
 311 {
 312         return sqrtf(dot3F4(a,a));
 313 }
 314
 315 __inline
 316 float dot4(const float4& a, const float4& b)
 317 {
 318         return a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;
 319 }
 320
 321 //      for height
 322 __inline
 323 float dot3w1(const float4& point, const float4& eqn)
 324 {
 325         return point.x*eqn.x+point.y*eqn.y+point.z*eqn.z+eqn.w;
 326 }
 327
 328 __inline
 329 float4 normalize3(const float4& a)
 330 {
 331         float length = sqrtf(dot3F4(a, a));
 332         return 1.f/length * a;
 333 }
 334
 335 __inline
 336 float4 normalize4(const float4& a)
 337 {
 338         float length = sqrtf(dot4(a, a));
 339         return 1.f/length * a;
 340 }
 341
 342 __inline
 343 float4 createEquation(const float4& a, const float4& b, const float4& c)
 344 {
 345         float4 eqn;
 346         float4 ab = b-a;
 347         float4 ac = c-a;
 348         eqn = normalize3( cross3(ab, ac) );
 349         eqn.w = -dot3F4(eqn,a);
 350         return eqn;
 351 }
 352
 353
 354 template<typename T>
 355 __inline
 356 T max2(const T& a, const T& b)
 357 {
 358         return (a>b)? a:b;
 359 }
 360
 361 template<typename T>
 362 __inline
 363 T min2(const T& a, const T& b)
 364 {
 365         return (a<b)? a:b;
 366 }
 367
 368 template<>
 369 __inline
 370 float4 max2(const float4& a, const float4& b)
 371 {
 372         return make_float4( max2(a.x,b.x), max2(a.y,b.y), max2(a.z,b.z), max2(a.w,b.w) );
 373 }
 374
 375 template<>
 376 __inline
 377 float4 min2(const float4& a, const float4& b)
 378 {
 379         return make_float4( min2(a.x,b.x), min2(a.y,b.y), min2(a.z,b.z), min2(a.w,b.w) );
 380 }
 381