2 Copyright (c) 2012 Advanced Micro Devices, Inc.
4 This software is provided 'as-is', without any express or implied warranty.
5 In no event will the authors be held liable for any damages arising from the use of this software.
6 Permission is granted to anyone to use this software for any purpose,
7 including commercial applications, and to alter it and redistribute it freely,
8 subject to the following restrictions:
10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
12 3. This notice may not be removed or altered from any source distribution.
14 //Originally written by Takahiro Harada
// CHECK_ALIGNMENT(a): the asserting variant (low four address bits must be zero)
// is commented out; the active definition expands to the bare expression
// statement `a;`, so it performs no check.
17 //#define CHECK_ALIGNMENT(a) CLASSERT((u32(&(a)) & 0xf) == 0);
18 #define CHECK_ALIGNMENT(a) a;
// Builds a float4 from four scalars; w defaults to 0 when omitted.
22 float4 make_float4(float x, float y, float z, float w = 0.f)
// _mm_set_ps takes its arguments highest lane first, so x ends up in lane 0
// and w in lane 3.
25 v.m_quad = _mm_set_ps(w,z,y,x);
// Splat overload: all four components, w included, are set to x.
31 float4 make_float4(float x)
33 return make_float4(x,x,x,x);
// Converts an int4 to a float4 by casting each of s[0]..s[3] to float.
37 float4 make_float4(const int4& x)
39 return make_float4((float)x.s[0], (float)x.s[1], (float)x.s[2], (float)x.s[3]);
// Builds a float2: x is stored in s[0], y in s[1].
43 float2 make_float2(float x, float y)
46 v.s[0] = x; v.s[1] = y;
// Splat overload: both components are set to x.
51 float2 make_float2(float x)
53 return make_float2(x,x);
// Converts an int2 to a float2 by casting s[0] and s[1] to float.
57 float2 make_float2(const int2& x)
59 return make_float2((float)x.s[0], (float)x.s[1]);
// Builds an int4 from four ints, stored in s[0]..s[3]; w defaults to 0.
63 int4 make_int4(int x, int y, int z, int w = 0)
66 v.s[0] = x; v.s[1] = y; v.s[2] = z; v.s[3] = w;
// Splat: forwards to the four-argument make_int4 with x in every slot.
73 return make_int4(x,x,x,x);
// Converts a float4 to an int4 component by component. The (int) casts
// discard the fractional part (truncation toward zero, not rounding).
77 int4 make_int4(const float4& x)
79 return make_int4((int)x.x, (int)x.y, (int)x.z, (int)x.w);
// Builds an int2 with x = a and y = b.
83 int2 make_int2(int a, int b)
85 int2 ans; ans.x = a; ans.y = b;
// Unary minus: computes (0 - a) in all four lanes, w included.
90 float4 operator-(const float4& a)
92 float4 zero; zero.m_quad = _mm_setzero_ps();
// Implemented as a subtraction from zero rather than a sign-bit flip, so a
// lane holding +0 stays +0.
93 float4 ans; ans.m_quad = _mm_sub_ps( zero.m_quad, a.m_quad );
// Lane-wise product of a and b (all four lanes; this is not a dot product).
98 float4 operator*(const float4& a, const float4& b)
103 out.m_quad = _mm_mul_ps( a.m_quad, b.m_quad );
// scalar * vector. The scalar a is broadcast into all four lanes of a
// temporary float4; presumably that temporary is then multiplied with b
// lane-wise (NOTE(review): confirm against the rest of the body).
108 float4 operator*(float a, const float4& b)
110 float4 av; av.m_quad = _mm_set1_ps( a );
// vector * scalar. The scalar a is broadcast into all four lanes of a
// temporary float4; presumably that temporary is then multiplied with b
// lane-wise (NOTE(review): confirm against the rest of the body).
115 float4 operator*(const float4& b, float a)
119 float4 av; av.m_quad = _mm_set1_ps( a );
124 void operator*=(float4& a, const float4& b)
// In-place scale by a scalar. b is broadcast into all four lanes of a
// temporary; presumably a is then multiplied by it lane-wise
// (NOTE(review): confirm against the rest of the body).
132 void operator*=(float4& a, float b)
136 float4 bv; bv.m_quad = _mm_set1_ps( b );
// Lane-wise quotient a / b over all four lanes. There is no zero check here,
// so a zero lane in b (for example w) produces inf or NaN in that lane.
142 float4 operator/(const float4& a, const float4& b)
147 out.m_quad = _mm_div_ps( a.m_quad, b.m_quad );
// vector / scalar. Note the parameter naming: b is the vector and a is the
// scalar divisor, which is broadcast into all four lanes of a temporary.
// Presumably b is then divided by it lane-wise (NOTE(review): confirm).
152 float4 operator/(const float4& b, float a)
156 float4 av; av.m_quad = _mm_set1_ps( a );
163 void operator/=(float4& a, const float4& b)
// In-place divide by a scalar.
169 void operator/=(float4& a, float b)
// Asserts that the low four bits of a's address are zero.
// NOTE(review): this calls CLASSERT directly rather than going through
// CHECK_ALIGNMENT (whose asserting form is disabled in this file), and
// u32(&a) narrows the address if pointers are wider than u32 — assuming u32
// is a 32-bit integer type; confirm and consider switching to the macro.
171 CLASSERT((u32(&a) & 0xf) == 0);
// Broadcast the divisor into all four lanes; presumably a is then divided by
// it lane-wise (NOTE(review): confirm against the rest of the body).
173 float4 bv; bv.m_quad = _mm_set1_ps( b );
// Lane-wise sum of a and b over all four lanes.
179 float4 operator+(const float4& a, const float4& b)
184 out.m_quad = _mm_add_ps( a.m_quad, b.m_quad );
// vector + scalar. b is broadcast into all four lanes of a temporary;
// presumably it is then added to a lane-wise, w included
// (NOTE(review): confirm against the rest of the body).
189 float4 operator+(const float4& a, float b)
193 float4 bv; bv.m_quad = _mm_set1_ps( b );
// Lane-wise difference a - b over all four lanes.
198 float4 operator-(const float4& a, const float4& b)
203 out.m_quad = _mm_sub_ps( a.m_quad, b.m_quad );
// vector - scalar. b is broadcast into all four lanes of a temporary;
// presumably it is then subtracted from a lane-wise, w included
// (NOTE(review): confirm against the rest of the body).
208 float4 operator-(const float4& a, float b)
212 float4 bv; bv.m_quad = _mm_set1_ps( b );
217 void operator+=(float4& a, const float4& b)
// In-place add of a scalar. b is broadcast into all four lanes of a
// temporary; presumably it is then added to a lane-wise
// (NOTE(review): confirm against the rest of the body).
225 void operator+=(float4& a, float b)
229 float4 bv; bv.m_quad = _mm_set1_ps( b );
235 void operator-=(float4& a, const float4& b)
// In-place subtract of a scalar. b is broadcast into all four lanes of a
// temporary; presumably it is then subtracted from a lane-wise
// (NOTE(review): confirm against the rest of the body).
243 void operator-=(float4& a, float b)
247 float4 bv; bv.m_quad = _mm_set1_ps( b );
// Cross product of lanes 0-2 of a and b, computed with SSE shuffles.
// Lane 3 of the result is forced to zero by the final mask.
257 float4 cross3(const float4& a, const float4& b)
258 { // xnamathvector.inl
// Bit mask that keeps lanes 0-2 and clears lane 3.
265 IntVec mask3 = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000};
266 __m128 V1 = a.m_quad;
267 __m128 V2 = b.m_quad;
// vTemp1 = V1 lanes in order (1, 2, 0, 3)
269 __m128 vTemp1 = _mm_shuffle_ps(V1,V1,_MM_SHUFFLE(3,0,2,1));
// vTemp2 = V2 lanes in order (2, 0, 1, 3)
271 __m128 vTemp2 = _mm_shuffle_ps(V2,V2,_MM_SHUFFLE(3,1,0,2));
272 // Perform the left operation
273 __m128 vResult = _mm_mul_ps(vTemp1,vTemp2);
// Rotate again: vTemp1 = V1 lanes in order (2, 0, 1, 3)
275 vTemp1 = _mm_shuffle_ps(vTemp1,vTemp1,_MM_SHUFFLE(3,0,2,1));
// Rotate again: vTemp2 = V2 lanes in order (1, 2, 0, 3)
277 vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(3,1,0,2));
278 // Perform the right operation
279 vTemp1 = _mm_mul_ps(vTemp1,vTemp2);
280 // Subtract the right from the left, and return the answer
281 vResult = _mm_sub_ps(vResult,vTemp1);
// Lane 3 holds a3*b3 - a3*b3 at this point; the mask makes it exactly zero.
283 float4 ans; ans.m_quad = _mm_and_ps(vResult,mask3.m_v);
// Dot product of lanes 0-2 of a and b; lane 3 (w) does not contribute.
288 float dot3F4(const float4& a, const float4& b)
290 // Scalar equivalent: return a.x*b.x+a.y*b.y+a.z*b.z;
291 // Perform the dot product
292 __m128 V1 = a.m_quad;
293 __m128 V2 = b.m_quad;
// Lane-wise products; only lanes 0-2 are summed below.
295 __m128 vDot = _mm_mul_ps(V1,V2);
296 // x=Dot.vector4_f32[1], y=Dot.vector4_f32[2]
297 __m128 vTemp = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(2,1,2,1));
298 // Result.vector4_f32[0] = x+y
299 vDot = _mm_add_ss(vDot,vTemp);
300 // x=Dot.vector4_f32[2]
301 vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1));
302 // Result.vector4_f32[0] = (x+y)+z
303 vDot = _mm_add_ss(vDot,vTemp);
// Broadcast the sum from lane 0 into every lane of ans; presumably one lane
// of ans is what gets returned (NOTE(review): confirm against the return).
305 float4 ans; ans.m_quad = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(0,0,0,0));
// Euclidean length of a, computed as sqrtf(dot3F4(a, a)).
310 float length3(const float4& a)
312 return sqrtf(dot3F4(a,a));
// Full four-component dot product (w included), computed in scalar code.
316 float dot4(const float4& a, const float4& b)
318 return a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;
// Evaluates eqn at point as if point.w were 1: the xyz dot product plus
// eqn.w. point.w itself is never read.
323 float dot3w1(const float4& point, const float4& eqn)
325 return point.x*eqn.x+point.y*eqn.y+point.z*eqn.z+eqn.w;
// Scales a by the reciprocal of its 3-component length. The scaling goes
// through the float * float4 operator, so it is not restricted to xyz.
329 float4 normalize3(const float4& a)
331 float length = sqrtf(dot3F4(a, a));
// No guard for length == 0: 1.f/length is then infinite and the result is
// inf/NaN rather than a unit vector.
332 return 1.f/length * a;
// Scales a by the reciprocal of its 4-component length (w contributes to
// the length).
336 float4 normalize4(const float4& a)
338 float length = sqrtf(dot4(a, a));
// No guard for length == 0: 1.f/length is then infinite and the result is
// inf/NaN rather than a unit vector.
339 return 1.f/length * a;
// Builds a plane equation from three points: xyz is a normalized normal and
// w is the offset term used by dot3w1.
// NOTE(review): ab and ac are defined on lines not shown here — presumably
// the edge vectors b - a and c - a; confirm.
343 float4 createEquation(const float4& a, const float4& b, const float4& c)
// cross3 clears lane 3 and normalize3 only rescales, so eqn.xyz is the unit
// normal here. Collinear input gives a zero cross product, which normalize3
// does not guard against.
348 eqn = normalize3( cross3(ab, ac) );
// Choose w so that dot3w1(a, eqn) evaluates to zero, i.e. a lies on the plane.
349 eqn.w = -dot3F4(eqn,a);
356 T max2(const T& a, const T& b)
363 T min2(const T& a, const T& b)
// Component-wise maximum: applies the scalar max2 to each of x, y, z and w.
370 float4 max2(const float4& a, const float4& b)
372 return make_float4( max2(a.x,b.x), max2(a.y,b.y), max2(a.z,b.z), max2(a.w,b.w) );
// Component-wise minimum: applies the scalar min2 to each of x, y, z and w.
377 float4 min2(const float4& a, const float4& b)
379 return make_float4( min2(a.x,b.x), min2(a.y,b.y), min2(a.z,b.z), min2(a.w,b.w) );