Use only lower floats for Vector3 dot and equality
authorCarol Eidt <carol.eidt@microsoft.com>
Thu, 1 Dec 2016 03:31:58 +0000 (19:31 -0800)
committerCarol Eidt <carol.eidt@microsoft.com>
Wed, 7 Dec 2016 16:43:06 +0000 (08:43 -0800)
For both dot product and comparisons that produce a boolean result, we need to use only the lower 3 floats. The bug was exposed by a case where the result of a call was being used in one of these operations without being stored to a local (which would have caused the upper bits to be cleared).

Fix dotnet/coreclr#8220

Commit migrated from https://github.com/dotnet/coreclr/commit/0403e4d81f67a9abe61bf8637deff85b971381b7

src/coreclr/src/jit/gentree.h
src/coreclr/src/jit/simd.cpp
src/coreclr/src/jit/simd.h
src/coreclr/src/jit/simdcodegenxarch.cpp
src/coreclr/tests/src/JIT/Regression/JitBlue/GitHub_8220/GitHub_8220.cs [new file with mode: 0644]
src/coreclr/tests/src/JIT/Regression/JitBlue/GitHub_8220/GitHub_8220.csproj [new file with mode: 0644]

index 9be18b8..05a3a4a 100644 (file)
@@ -955,6 +955,9 @@ public:
                                        //                  struct fields constituting a single call argument.
 
 //----------------------------------------------------------------
+#define GTF_SIMD12_OP 0x80000000 // GT_SIMD -- Indicates that the operands need to be handled as SIMD12
+                                 //            even if they have been retyped as SIMD16.
+//----------------------------------------------------------------
 
 #define GTF_STMT_CMPADD 0x80000000  // GT_STMT    -- added by compiler
 #define GTF_STMT_HAS_CSE 0x40000000 // GT_STMT    -- CSE def or use was subsituted
index 7b38960..39664c4 100644 (file)
@@ -754,7 +754,7 @@ SIMDIntrinsicID Compiler::impSIMDLongRelOpEqual(CORINFO_CLASS_HANDLE typeHnd,
     //
     // Equality(v1, v2):
     // tmp = (v1 == v2) i.e. compare for equality as if v1 and v2 are vector<int>
-    // result = BitwiseAnd(t, shuffle(t, (2, 3, 1 0)))
+    // result = BitwiseAnd(t, shuffle(t, (2, 3, 0, 1)))
     // Shuffle is meant to swap the comparison results of low-32-bits and high 32-bits of respective long elements.
 
     // Compare vector<long> as if they were vector<int> and assign the result to a temp
@@ -768,7 +768,7 @@ SIMDIntrinsicID Compiler::impSIMDLongRelOpEqual(CORINFO_CLASS_HANDLE typeHnd,
     // op2 = Shuffle(tmp, 0xB1)
     // IntrinsicId = BitwiseAnd
     *pOp1 = gtNewOperNode(GT_COMMA, simdType, asg, tmp);
-    *pOp2 = gtNewSIMDNode(simdType, gtNewLclvNode(lclNum, simdType), gtNewIconNode(SHUFFLE_ZWYX, TYP_INT),
+    *pOp2 = gtNewSIMDNode(simdType, gtNewLclvNode(lclNum, simdType), gtNewIconNode(SHUFFLE_ZWXY, TYP_INT),
                           SIMDIntrinsicShuffleSSE2, TYP_INT, size);
     return SIMDIntrinsicBitwiseAnd;
 }
@@ -2248,7 +2248,11 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE                opcode,
             assert(op2->TypeGet() == simdType);
 
             simdTree = gtNewSIMDNode(genActualType(callType), op1, op2, SIMDIntrinsicOpEquality, baseType, size);
-            retVal   = simdTree;
+            if (simdType == TYP_SIMD12)
+            {
+                simdTree->gtFlags |= GTF_SIMD12_OP;
+            }
+            retVal = simdTree;
         }
         break;
 
@@ -2259,7 +2263,11 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE                opcode,
             op2      = impSIMDPopStack(simdType);
             op1      = impSIMDPopStack(simdType, instMethod);
             simdTree = gtNewSIMDNode(genActualType(callType), op1, op2, SIMDIntrinsicOpInEquality, baseType, size);
-            retVal   = simdTree;
+            if (simdType == TYP_SIMD12)
+            {
+                simdTree->gtFlags |= GTF_SIMD12_OP;
+            }
+            retVal = simdTree;
         }
         break;
 
@@ -2407,7 +2415,11 @@ GenTreePtr Compiler::impSIMDIntrinsic(OPCODE                opcode,
             op1 = impSIMDPopStack(simdType, instMethod);
 
             simdTree = gtNewSIMDNode(baseType, op1, op2, simdIntrinsicID, baseType, size);
-            retVal   = simdTree;
+            if (simdType == TYP_SIMD12)
+            {
+                simdTree->gtFlags |= GTF_SIMD12_OP;
+            }
+            retVal = simdTree;
         }
         break;
 
index 807ccf9..c4a8866 100644 (file)
@@ -32,11 +32,16 @@ struct SIMDIntrinsicInfo
 #ifdef _TARGET_XARCH_
 // SSE2 Shuffle control byte to shuffle vector <W, Z, Y, X>
 // These correspond to shuffle immediate byte in shufps SSE2 instruction.
-#define SHUFFLE_XXXX 0x00
-#define SHUFFLE_ZWYX 0xB1
-#define SHUFFLE_WWYY 0xF5
-#define SHUFFLE_ZZXX 0xA0
-#endif // _TARGET_XARCH_
+#define SHUFFLE_XXXX 0x00 // 00 00 00 00
+#define SHUFFLE_XXWW 0x0F // 00 00 11 11
+#define SHUFFLE_XYZW 0x1B // 00 01 10 11
+#define SHUFFLE_YXYX 0x44 // 01 00 01 00
+#define SHUFFLE_YYZZ 0x5A // 01 01 10 10
+#define SHUFFLE_ZXXY 0x81 // 10 00 00 01
+#define SHUFFLE_ZWXY 0xB1 // 10 11 00 01
+#define SHUFFLE_WWYY 0xF5 // 11 11 01 01
+#define SHUFFLE_ZZXX 0xA0 // 10 10 00 00
+#endif
 
 #endif // FEATURE_SIMD
 
index effb68b..5641ad1 100644 (file)
@@ -1199,11 +1199,26 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
                 // "all ones" where the number of ones is given by the vector byte size, not by the
                 // vector component count. So, for AVX registers we need to compare to 0xFFFFFFFF and
                 // for SSE registers we need to compare to 0x0000FFFF.
+                // The SIMD12 case is handled specially, because we can't rely on the upper bytes being
+                // zero, so we must compare only the lower 3 floats (hence the byte mask of 0xFFF).
                 // Note that -1 is used instead of 0xFFFFFFFF, on x64 emit doesn't correctly recognize
                 // that 0xFFFFFFFF can be encoded in a single byte and emits the longer 3DFFFFFFFF
                 // encoding instead of 83F8FF.
-                getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg,
-                                          emitActualTypeSize(simdType) == 32 ? -1 : 0x0000FFFF);
+                ssize_t mask;
+                if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0)
+                {
+                    mask = 0x00000FFF;
+                    getEmitter()->emitIns_R_I(INS_and, EA_4BYTE, intReg, mask);
+                }
+                else if (emitActualTypeSize(simdType) == 32)
+                {
+                    mask = -1;
+                }
+                else
+                {
+                    mask = 0x0000FFFF;
+                }
+                getEmitter()->emitIns_R_I(INS_cmp, EA_4BYTE, intReg, mask);
             }
 
             if (targetReg != REG_NA)
@@ -1343,15 +1358,34 @@ void CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode)
 
         // DotProduct(v1, v2)
         // Here v0 = targetReg, v1 = op1Reg, v2 = op2Reg and tmp = tmpReg1
-        if (baseType == TYP_FLOAT)
+        if ((simdNode->gtFlags & GTF_SIMD12_OP) != 0)
         {
+            assert(baseType == TYP_FLOAT);
             // v0 = v1 * v2
             // tmp = v0                                       // v0  = (3, 2, 1, 0) - each element is given by its
             //                                                // position
-            // tmp = shuffle(tmp, tmp, Shuffle(2,3,0,1))      // tmp = (2, 3, 0, 1)
+            // tmp = shuffle(tmp, tmp, SHUFFLE_ZXXY)          // tmp = (2, 0, 0, 1) - don't really care what's in upper
+            //                                                // bits
+            // v0 = v0 + tmp                                  // v0  = (3+2, 0+2, 1+0, 0+1)
+            // tmp = shuffle(tmp, tmp, SHUFFLE_XXWW)          // tmp = (  1,   1,   2,   2)
+            // v0 = v0 + tmp                                  // v0  = (1+2+3,  0+1+2, 0+1+2, 0+1+2)
+            //
+            inst_RV_RV(INS_mulps, targetReg, op2Reg);
+            inst_RV_RV(INS_movaps, tmpReg1, targetReg);
+            inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_ZXXY);
+            inst_RV_RV(INS_addps, targetReg, tmpReg1);
+            inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_XXWW);
+            inst_RV_RV(INS_addps, targetReg, tmpReg1);
+        }
+        else if (baseType == TYP_FLOAT)
+        {
+            // v0 = v1 * v2
+            // tmp = v0                                       // v0  = (3, 2, 1, 0) - each element is given by its
+            //                                                // position
+            // tmp = shuffle(tmp, tmp, SHUFFLE_ZWXY)          // tmp = (2, 3, 0, 1)
             // v0 = v0 + tmp                                  // v0  = (3+2, 2+3, 1+0, 0+1)
             // tmp = v0
-            // tmp = shuffle(tmp, tmp, Shuffle(0,1,2,3))      // tmp = (0+1, 1+0, 2+3, 3+2)
+            // tmp = shuffle(tmp, tmp, SHUFFLE_XYZW)          // tmp = (0+1, 1+0, 2+3, 3+2)
             // v0 = v0 + tmp                                  // v0  = (0+1+2+3, 0+1+2+3, 0+1+2+3, 0+1+2+3)
             //                                                // Essentially horizontal addtion of all elements.
             //                                                // We could achieve the same using SSEv3 instruction
@@ -1359,10 +1393,10 @@ void CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode)
             //
             inst_RV_RV(INS_mulps, targetReg, op2Reg);
             inst_RV_RV(INS_movaps, tmpReg1, targetReg);
-            inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, 0xb1);
+            inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_ZWXY);
             inst_RV_RV(INS_addps, targetReg, tmpReg1);
             inst_RV_RV(INS_movaps, tmpReg1, targetReg);
-            inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, 0x1b);
+            inst_RV_RV_IV(INS_shufps, EA_16BYTE, tmpReg1, tmpReg1, SHUFFLE_XYZW);
             inst_RV_RV(INS_addps, targetReg, tmpReg1);
         }
         else
@@ -1406,8 +1440,10 @@ void CodeGen::genSIMDIntrinsicDotProduct(GenTreeSIMD* simdNode)
             emitAttr emitSize = emitActualTypeSize(simdEvalType);
             if (baseType == TYP_FLOAT)
             {
-                inst_RV_RV_IV(INS_dpps, emitSize, targetReg, op2Reg, 0xf1);
-
+                // dpps computes the dot product of the upper & lower halves of the 32-byte register.
+                // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
+                unsigned mask = ((simdNode->gtFlags & GTF_SIMD12_OP) != 0) ? 0x71 : 0xf1;
+                inst_RV_RV_IV(INS_dpps, emitSize, targetReg, op2Reg, mask);
                 // dpps computes the dot product of the upper & lower halves of the 32-byte register.
                 // Notice that if this is a TYP_SIMD16 or smaller on AVX, then we don't need a tmpReg.
                 // If this is TYP_SIMD32, we need to combine the lower & upper results.
@@ -1993,7 +2029,7 @@ void CodeGen::genLoadIndTypeSIMD12(GenTree* treeNode)
     getEmitter()->emitIns_R_AR(ins_Load(TYP_DOUBLE), EA_8BYTE, targetReg, operandReg, 0);
 
     // combine upper 4 bytes and lower 8 bytes in targetReg
-    getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, 0x44);
+    getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, SHUFFLE_YXYX);
 
     genProduceReg(treeNode);
 }
@@ -2076,7 +2112,7 @@ void CodeGen::genLoadLclTypeSIMD12(GenTree* treeNode)
     getEmitter()->emitIns_R_S(ins_Move_Extend(TYP_DOUBLE, false), EA_8BYTE, targetReg, varNum, offs);
 
     // combine upper 4 bytes and lower 8 bytes in targetReg
-    getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, 0x44);
+    getEmitter()->emitIns_R_R_I(INS_shufps, emitActualTypeSize(TYP_SIMD16), targetReg, tmpReg, SHUFFLE_YXYX);
 
     genProduceReg(treeNode);
 }
diff --git a/src/coreclr/tests/src/JIT/Regression/JitBlue/GitHub_8220/GitHub_8220.cs b/src/coreclr/tests/src/JIT/Regression/JitBlue/GitHub_8220/GitHub_8220.cs
new file mode 100644 (file)
index 0000000..f38d5ba
--- /dev/null
@@ -0,0 +1,170 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+// Regression test for Vector3 intrinsics using upper non-zero'd bits from
+// a byref return.
+
+using System;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Numerics;
+
+namespace Test
+{
+
+    public class Program
+    {
+        static Random random;
+
+        static Program()
+        {
+            random = new Random(1);
+        }
+
+        [MethodImpl( MethodImplOptions.NoInlining )]
+        public static double StackScribble()
+        {
+            double d1 = random.NextDouble();
+            double d2 = random.NextDouble();
+            double d3 = random.NextDouble();
+            double d4 = random.NextDouble();
+            double d5 = random.NextDouble();
+            double d6 = random.NextDouble();
+            double d7 = random.NextDouble();
+            double d8 = random.NextDouble();
+            double d9 = random.NextDouble();
+            double d10 = random.NextDouble();
+            double d11 = random.NextDouble();
+            double d12 = random.NextDouble();
+            double d13 = random.NextDouble();
+            double d14 = random.NextDouble();
+            double d15 = random.NextDouble();
+            double d16 = random.NextDouble();
+            double d17 = random.NextDouble();
+            double d18 = random.NextDouble();
+            double d19 = random.NextDouble();
+            double d20 = random.NextDouble();
+            double d21 = random.NextDouble();
+            double d22 = random.NextDouble();
+            double d23 = random.NextDouble();
+            double d24 = random.NextDouble();
+            double d25 = random.NextDouble();
+            double d26 = random.NextDouble();
+            double d27 = random.NextDouble();
+            double d28 = random.NextDouble();
+            double d29 = random.NextDouble();
+            double d30 = random.NextDouble();
+            double d31 = random.NextDouble();
+            double d32 = random.NextDouble();
+            double d33 = random.NextDouble();
+            double d34 = random.NextDouble();
+            double d35 = random.NextDouble();
+            double d36 = random.NextDouble();
+            double d37 = random.NextDouble();
+            double d38 = random.NextDouble();
+            double d39 = random.NextDouble();
+            double d40 = random.NextDouble();
+            return d1 + d2 + d3 + d4 + d5 + d6 + d7 + d8 + d9 + d10 +
+                   d11 + d12 + d13 + d14 + d15 + d16 + d17 + d18 + d19 + d20 +
+                   d21 + d22 + d23 + d24 + d25 + d26 + d27 + d28 + d29 + d20 +
+                   d31 + d32 + d33 + d34 + d35 + d36 + d37 + d38 + d39 + d40;
+        }
+
+        [MethodImpl( MethodImplOptions.NoInlining )]
+        public static Vector3 getTestValue(float f1, float f2, float f3)
+        {
+            return new Vector3(f1, f2, f3);
+        }
+
+        public static bool Check(float value, float expectedValue)
+        {
+            // These may differ in the last place.
+            float expectedValueLow;
+            float expectedValueHigh;
+
+            unsafe
+            {
+                UInt32 expectedValueUInt = *(UInt32*)&expectedValue;
+                UInt32 expectedValueUIntLow = (expectedValueUInt == 0) ? 0 : expectedValueUInt - 1;
+                UInt32 expectedValueUIntHigh = (expectedValueUInt == 0xffffffff) ? 0xffffffff : expectedValueUInt + 1;
+                expectedValueLow = *(float*)&expectedValueUIntLow;
+                expectedValueHigh = *(float*)&expectedValueUIntHigh;
+            }
+            float errorMargin = Math.Abs(expectedValueHigh - expectedValueLow);
+            if (Math.Abs(value - expectedValue) > errorMargin)
+            {
+                return false;
+            }
+            return true;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        public static int testDotProduct(Vector3 v0)
+        {
+            float f1 = (float)random.NextDouble();
+            float f2 = (float)random.NextDouble();
+            float f3 = (float)random.NextDouble();
+
+            Vector3 v1 = Vector3.Normalize(getTestValue(f1, f2, f3) - v0);
+            Vector3 v2 = new Vector3(f1, f2, f3) - v0;
+            v2 = v2 / v2.Length();
+
+            if (!Check(v1.X, v2.X) || !Check(v1.Y, v2.Y) || !Check(v1.Z, v2.Z))
+            {
+                Console.WriteLine("Vectors do not match " + v1 + v2);
+                return -1;
+            }
+
+            return 100;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        public static int testEquals(Vector3 v0)
+        {
+            float f1 = (float)random.NextDouble();
+            float f2 = (float)random.NextDouble();
+            float f3 = (float)random.NextDouble();
+
+            Vector3 v1 = new Vector3(f1, f2, f3) - v0;
+            bool result = v1.Equals(getTestValue(f1, f2, f3) - v0);
+
+            if ((result == false) || !v1.Equals(getTestValue(f1, f2, f3) - v0))
+            {
+                Console.WriteLine("Equals returns wrong value " + v1);
+                return -1;
+            }
+
+            return 100;
+        }
+
+        public static int Main()
+        {
+            int returnValue = 100;
+            Console.WriteLine("Testing Dot Product");
+            for (int i = 0; i < 10; i++)
+            {
+                StackScribble();
+                if (testDotProduct(new Vector3(1.0F, 2.0F, 3.0F)) != 100)
+                {
+                    Console.WriteLine("Failed on iteration " + i);
+                    returnValue = -1;
+                    break;
+                }
+            }
+            Console.WriteLine("Testing Equals");
+            for (int i = 0; i < 10; i++)
+            {
+                StackScribble();
+                if (testEquals(new Vector3(1.0F, 2.0F, 3.0F)) != 100)
+                {
+                    Console.WriteLine("Failed on iteration " + i);
+                    returnValue = -1;
+                    break;
+                }
+            }
+            return returnValue;
+        }
+    }
+}
+
diff --git a/src/coreclr/tests/src/JIT/Regression/JitBlue/GitHub_8220/GitHub_8220.csproj b/src/coreclr/tests/src/JIT/Regression/JitBlue/GitHub_8220/GitHub_8220.csproj
new file mode 100644 (file)
index 0000000..939d010
--- /dev/null
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.props))\dir.props" />
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <AssemblyName>$(MSBuildProjectName)</AssemblyName>
+    <SchemaVersion>2.0</SchemaVersion>
+    <ProjectGuid>{76E69AA0-8C5A-4F76-8561-B8089FFA8D79}</ProjectGuid>
+    <OutputType>Exe</OutputType>
+    <ProjectTypeGuids>{786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}</ProjectTypeGuids>
+    <ReferencePath>$(ProgramFiles)\Common Files\microsoft shared\VSTT\11.0\UITestExtensionPackages</ReferencePath>
+    <SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\..\</SolutionDir>
+
+    <NuGetPackageImportStamp>7a9bfb7d</NuGetPackageImportStamp>
+  </PropertyGroup>
+  <!-- Default configurations to help VS understand the configurations -->
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
+  </PropertyGroup>
+  <ItemGroup>
+    <CodeAnalysisDependentAssemblyPaths Condition=" '$(VS100COMNTOOLS)' != '' " Include="$(VS100COMNTOOLS)..\IDE\PrivateAssemblies">
+      <Visible>False</Visible>
+    </CodeAnalysisDependentAssemblyPaths>
+  </ItemGroup>
+  <PropertyGroup>
+    <DebugType></DebugType>
+    <Optimize>True</Optimize>
+    <AllowUnsafeBlocks>True</AllowUnsafeBlocks>
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="$(MSBuildProjectName).cs" />
+  </ItemGroup>
+  <ItemGroup>
+    <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
+  </ItemGroup>
+  <PropertyGroup>
+    <ProjectJson>$(JitPackagesConfigFileDirectory)benchmark\project.json</ProjectJson>
+    <ProjectLockJson>$(JitPackagesConfigFileDirectory)benchmark\project.lock.json</ProjectLockJson>
+  </PropertyGroup>
+  <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.targets))\dir.targets" />
+  <PropertyGroup Condition=" '$(MsBuildProjectDirOverride)' != '' ">
+  </PropertyGroup>
+</Project>