From c4603ee6a1e29bcf9e4aff2ea095ec529024692c Mon Sep 17 00:00:00 2001
From: Eugene Rozenfeld
Date: Thu, 18 Feb 2016 17:46:38 -0800
Subject: [PATCH] Fix for SIMD intrinsic Initialize expansion.

Initialize expansion was missing register zero initialization in some cases.
This caused silent bad codegen in #3208.

Closes #3208.
---
 src/jit/codegenlinear.h      |  9 ++++-
 src/jit/simdcodegenxarch.cpp | 95 +++++++++++++++++++++++++++++++-------------
 2 files changed, 75 insertions(+), 29 deletions(-)

diff --git a/src/jit/codegenlinear.h b/src/jit/codegenlinear.h
index 851ee7a..08a7da2 100644
--- a/src/jit/codegenlinear.h
+++ b/src/jit/codegenlinear.h
@@ -58,8 +58,15 @@
 #endif
 
 #ifdef FEATURE_SIMD
+    enum SIMDScalarMoveType
+    {
+        SMT_ZeroInitUpper,                  // zero initialize target upper bits
+        SMT_ZeroInitUpper_SrcHasUpperZeros, // zero initialize target upper bits; source upper bits are known to be zero
+        SMT_PreserveUpper                   // preserve target upper bits
+    };
+
     instruction getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_types baseType, unsigned *ival = nullptr);
-    void genSIMDScalarMove(var_types type, regNumber target, regNumber src, bool zeroInit);
+    void genSIMDScalarMove(var_types type, regNumber target, regNumber src, SIMDScalarMoveType moveType);
     void genSIMDIntrinsicInit(GenTreeSIMD* simdNode);
     void genSIMDIntrinsicInitN(GenTreeSIMD* simdNode);
     void genSIMDIntrinsicInitArray(GenTreeSIMD* simdNode);
diff --git a/src/jit/simdcodegenxarch.cpp b/src/jit/simdcodegenxarch.cpp
index bbed0f2..f419055 100644
--- a/src/jit/simdcodegenxarch.cpp
+++ b/src/jit/simdcodegenxarch.cpp
@@ -452,7 +452,7 @@ CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId,
 //    type        the type of value to be moved
 //    targetReg   the target reg
 //    srcReg      the src reg
-//    zeroInit    true if the upper bits of targetReg should be zero'd
+//    moveType    action to be performed on target upper bits
 //
 // Return Value:
 //    None
@@ -461,43 +461,71 @@ CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId,
 // This is currently only supported for floating point types.
 //
 void
-CodeGen::genSIMDScalarMove(var_types type, regNumber targetReg, regNumber srcReg, bool zeroInit)
+CodeGen::genSIMDScalarMove(var_types type, regNumber targetReg, regNumber srcReg, SIMDScalarMoveType moveType)
 {
     var_types targetType = compiler->getSIMDVectorType();
     assert(varTypeIsFloating(type));
 #ifdef FEATURE_AVX_SUPPORT
     if (compiler->getSIMDInstructionSet() == InstructionSet_AVX)
     {
-        if (zeroInit)
+        switch (moveType)
         {
-            // insertps is a 128-bit only instruction, and clears the upper 128 bits, which is what we want.
-            // The insertpsImm selects which fields are copied and zero'd of the lower 128 bits, so we choose
-            // to zero all but the lower bits.
-            unsigned int insertpsImm = (INSERTPS_TARGET_SELECT(0)|INSERTPS_ZERO(1)|INSERTPS_ZERO(2)|INSERTPS_ZERO(3));
-            inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, srcReg, insertpsImm);
-        }
-        else if (srcReg != targetReg)
-        {
-            instruction ins = ins_Store(type);
-            if (getEmitter()->IsThreeOperandMoveAVXInstruction(ins))
+        case SMT_PreserveUpper:
+            if (srcReg != targetReg)
             {
-                // In general, when we use a three-operands move instruction, we want to merge the src with itself.
-                // This is an exception in that we actually want the "merge" behavior, so we must specify it with
-                // all 3 operands.
-                inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(targetType));
+                instruction ins = ins_Store(type);
+                if (getEmitter()->IsThreeOperandMoveAVXInstruction(ins))
+                {
+                    // In general, when we use a three-operands move instruction, we want to merge the src with itself.
+                    // This is an exception in that we actually want the "merge" behavior, so we must specify it with
+                    // all 3 operands.
+                    inst_RV_RV_RV(ins, targetReg, targetReg, srcReg, emitTypeSize(targetType));
+                }
+                else
+                {
+                    inst_RV_RV(ins, targetReg, srcReg, targetType, emitTypeSize(targetType));
+                }
             }
-            else
+            break;
+
+        case SMT_ZeroInitUpper:
+        {
+            // insertps is a 128-bit only instruction, and clears the upper 128 bits, which is what we want.
+            // The insertpsImm selects which fields are copied and zero'd of the lower 128 bits, so we choose
+            // to zero all but the lower bits.
+            unsigned int insertpsImm = (INSERTPS_TARGET_SELECT(0) | INSERTPS_ZERO(1) | INSERTPS_ZERO(2) | INSERTPS_ZERO(3));
+            inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, srcReg, insertpsImm);
+            break;
+        }
+
+        case SMT_ZeroInitUpper_SrcHasUpperZeros:
+            if (srcReg != targetReg)
             {
+                instruction ins = ins_Copy(type);
+                assert(!getEmitter()->IsThreeOperandMoveAVXInstruction(ins));
                 inst_RV_RV(ins, targetReg, srcReg, targetType, emitTypeSize(targetType));
             }
+            break;
+
+        default:
+            unreached();
         }
     }
     else
 #endif // FEATURE_AVX_SUPPORT
     {
         // SSE
-        if (zeroInit)
+
+        switch (moveType)
         {
+        case SMT_PreserveUpper:
+            if (srcReg != targetReg)
+            {
+                inst_RV_RV(ins_Store(type), targetReg, srcReg, targetType, emitTypeSize(targetType));
+            }
+            break;
+
+        case SMT_ZeroInitUpper:
             if (srcReg == targetReg)
             {
                 // There is no guarantee that upper bits of op1Reg are zero.
@@ -513,10 +541,17 @@ CodeGen::genSIMDScalarMove(var_types type, regNumber targetReg, regNumber srcReg
                 inst_RV_RV(ins, targetReg, targetReg, targetType, emitTypeSize(targetType));
                 inst_RV_RV(ins_Store(type), targetReg, srcReg);
             }
-        }
-        else if (srcReg != targetReg)
-        {
-            inst_RV_RV(ins_Store(type), targetReg, srcReg, targetType, emitTypeSize(targetType));
+            break;
+
+        case SMT_ZeroInitUpper_SrcHasUpperZeros:
+            if (srcReg != targetReg)
+            {
+                inst_RV_RV(ins_Copy(type), targetReg, srcReg, targetType, emitTypeSize(targetType));
+            }
+            break;
+
+        default:
+            unreached();
         }
     }
 }
@@ -619,9 +654,13 @@ CodeGen::genSIMDIntrinsicInit(GenTreeSIMD* simdNode)
         // data section using movss/sd. Similarly if op1 is a memory op we
         // would have loaded it using movss/sd. Movss/sd when loading a xmm reg
         // from memory would zero-out upper bits. In these cases we can
-        // avoid explicitly zero'ing out targetReg.
-        bool zeroInitRequired = !(op1->IsCnsFltOrDbl() || op1->isMemoryOp());
-        genSIMDScalarMove(TYP_FLOAT, targetReg, op1Reg, zeroInitRequired);
+        // avoid explicitly zero'ing out targetReg if targetReg and op1Reg are the same or do it more efficiently
+        // if they are not the same.
+        SIMDScalarMoveType moveType = op1->IsCnsFltOrDbl() || op1->isMemoryOp()
+                                          ? SMT_ZeroInitUpper_SrcHasUpperZeros
+                                          : SMT_ZeroInitUpper;
+
+        genSIMDScalarMove(TYP_FLOAT, targetReg, op1Reg, moveType);
 
         if (size == 8)
         {
@@ -726,7 +765,7 @@ CodeGen::genSIMDIntrinsicInitN(GenTreeSIMD* simdNode)
         // This allows us to efficiently stitch together a vector as follows:
         //     vectorReg = (vectorReg << offset)
         //     VectorReg[0] = listItemReg
-        // Use genSIMDScalarMove with zeroInit of false in order to ensure that the upper
+        // Use genSIMDScalarMove with SMT_PreserveUpper in order to ensure that the upper
         // bits of vectorReg are not modified.
 
         regNumber operandReg = operandRegs[initCount - i - 1];
@@ -734,7 +773,7 @@ CodeGen::genSIMDIntrinsicInitN(GenTreeSIMD* simdNode)
         {
             getEmitter()->emitIns_R_I(insLeftShift, EA_16BYTE, vectorReg, baseTypeSize);
         }
-        genSIMDScalarMove(baseType, vectorReg, operandReg, false /* do not zeroInit */);
+        genSIMDScalarMove(baseType, vectorReg, operandReg, SMT_PreserveUpper);
 
         offset += baseTypeSize;
     }
-- 
2.7.4
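Note for reviewers: the lane-level behavior that the three SIMDScalarMoveType values are intended to model can be summarized with a small standalone C++ sketch. This is only an illustrative model of the intended semantics, not JIT code; the Vec4 type, the scalarMove helper, and the values in main are made up for the example.

// Illustrative model only: "Vec4" stands in for an xmm register, and
// "scalarMove" mimics the lane behavior requested from genSIMDScalarMove.
#include <array>
#include <cstdio>

using Vec4 = std::array<float, 4>;

enum SIMDScalarMoveType
{
    SMT_ZeroInitUpper,                  // move lane 0 and zero the target's upper lanes
    SMT_ZeroInitUpper_SrcHasUpperZeros, // source's upper lanes are already zero, so a full copy suffices
    SMT_PreserveUpper                   // move lane 0 and leave the target's upper lanes untouched
};

Vec4 scalarMove(Vec4 target, const Vec4& src, SIMDScalarMoveType moveType)
{
    switch (moveType)
    {
    case SMT_ZeroInitUpper:
        // What the insertps (AVX) or pxor + movss (SSE) sequences achieve.
        return { src[0], 0.0f, 0.0f, 0.0f };

    case SMT_ZeroInitUpper_SrcHasUpperZeros:
        // src came from a constant or a memory load, so src[1..3] are already zero;
        // copying the whole "register" yields a zero-initialized result.
        return src;

    case SMT_PreserveUpper:
        // Used when stitching a vector together element by element (InitN).
        target[0] = src[0];
        return target;
    }
    return target;
}

int main()
{
    Vec4 target = { 9, 9, 9, 9 }; // stale data in the upper lanes
    Vec4 src    = { 1, 7, 7, 7 }; // upper lanes not known to be zero

    Vec4 a = scalarMove(target, src, SMT_ZeroInitUpper);
    Vec4 b = scalarMove(target, src, SMT_PreserveUpper);

    std::printf("%g %g %g %g\n", a[0], a[1], a[2], a[3]); // 1 0 0 0
    std::printf("%g %g %g %g\n", b[0], b[1], b[2], b[3]); // 1 9 9 9
    return 0;
}

In this model SMT_ZeroInitUpper_SrcHasUpperZeros is simply a whole-register copy; correspondingly, the patch emits ins_Copy(type) for that case where the old zeroInit == false path emitted ins_Store(type), a scalar move that leaves the target's upper bits untouched.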