JIT: Optimize logical right shifts for byte values on XArch (#86841)

author Miha Zupan <mihazupan.zupan1@gmail.com>

Mon, 17 Jul 2023 13:22:53 +0000 (15:22 +0200)

committer GitHub <noreply@github.com>

Mon, 17 Jul 2023 13:22:53 +0000 (06:22 -0700)
author Miha Zupan <mihazupan.zupan1@gmail.com>
Mon, 17 Jul 2023 13:22:53 +0000 (15:22 +0200)
committer GitHub <noreply@github.com>
Mon, 17 Jul 2023 13:22:53 +0000 (06:22 -0700)
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp

index 66943d2..3d3c23f 100644 (file)
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -19799,7 +19799,7 @@ GenTree* Compiler::gtNewSimdBinOpNode(
                  simdBaseType    = TYP_LONG;
              }
  
-            assert(!varTypeIsByte(simdBaseType));
+            assert(!varTypeIsByte(simdBaseType) || op == GT_RSZ);
  
              // "over shifting" is platform specific behavior. We will match the C# behavior
              // this requires we mask with (sizeof(T) * 8) - 1 which ensures the shift cannot
@@ -19809,6 +19809,8 @@ GenTree* Compiler::gtNewSimdBinOpNode(
  
              unsigned shiftCountMask = (genTypeSize(simdBaseType) * 8) - 1;
  
+            GenTree* nonConstantByteShiftCountOp = NULL;
+
              if (op2->IsCnsIntOrI())
              {
                  op2->AsIntCon()->gtIconVal &= shiftCountMask;
@@ -19816,6 +19818,14 @@ GenTree* Compiler::gtNewSimdBinOpNode(
              else
              {
                  op2 = gtNewOperNode(GT_AND, TYP_INT, op2, gtNewIconNode(shiftCountMask));
+
+                if (varTypeIsByte(simdBaseType))
+                {
+                    // Save the intermediate "shiftCount & shiftCountMask" value as we will need it to
+                    // calculate which bits we should mask off after the 32-bit shift we use for 8-bit values.
+                    nonConstantByteShiftCountOp = fgMakeMultiUse(&op2);
+                }
+
                  op2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_SSE2_ConvertScalarToVector128Int32, CORINFO_TYPE_INT,
                                                 16);
              }
@@ -19908,6 +19918,34 @@ GenTree* Compiler::gtNewSimdBinOpNode(
                  assert(op == GT_RSZ);
                  intrinsic = NI_SSE2_ShiftRightLogical;
              }
+
+            if (varTypeIsByte(simdBaseType))
+            {
+                assert(op == GT_RSZ);
+
+                // We don't have actual instructions for shifting bytes, so we'll emulate them
+                // by shifting 32-bit values and masking off the bits that should be zeroed.
+                GenTree* maskAmountOp;
+
+                if (op2->IsCnsIntOrI())
+                {
+                    ssize_t shiftCount = op2->AsIntCon()->gtIconVal;
+                    ssize_t mask       = 255 >> shiftCount;
+
+                    maskAmountOp = gtNewIconNode(mask, type);
+                }
+                else
+                {
+                    assert(nonConstantByteShiftCountOp != NULL);
+
+                    maskAmountOp = gtNewOperNode(GT_RSZ, TYP_INT, gtNewIconNode(255), nonConstantByteShiftCountOp);
+                }
+
+                GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, CORINFO_TYPE_INT, simdSize);
+                GenTree* maskOp  = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize);
+
+                return gtNewSimdBinOpNode(GT_AND, type, shiftOp, maskOp, simdBaseJitType, simdSize);
+            }
              break;
          }
  
diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp

index fc81d05..92dbab0 100644 (file)
--- a/src/coreclr/jit/hwintrinsicxarch.cpp
+++ b/src/coreclr/jit/hwintrinsicxarch.cpp
@@ -2637,12 +2637,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic        intrinsic,
          {
              assert(sig->numArgs == 2);
  
-            if (varTypeIsByte(simdBaseType))
-            {
-                // byte and sbyte would require more work to support
-                break;
-            }
-
              if ((simdSize != 32) || compOpportunisticallyDependsOn(InstructionSet_AVX2))
              {
                  op2 = impPopStack().val;
diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs

index 21e3742..4ce70d7 100644 (file)
--- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs
@@ -879,10 +879,10 @@ namespace System.Buffers
  
              // On ARM, we have an instruction for an arithmetic right shift of 1-byte signed values.
              // The shift will map values above 127 to values above 16, which the shuffle will then map to 0.
-            // On X86 and WASM, use a 4-byte value shift with AND 15 to emulate a 1-byte value logical shift.
+            // On X86 and WASM, use a logical right shift instead.
              Vector128<byte> highNibbles = AdvSimd.IsSupported
                  ? AdvSimd.ShiftRightArithmetic(source.AsSByte(), 4).AsByte()
-                : (source.AsInt32() >>> 4).AsByte() & Vector128.Create((byte)0xF);
+                : source >>> 4;
  
              // The bitmapLookup represents a 8x16 table of bits, indicating whether a character is present in the needle.
              // Lookup the rows via the lower nibble and the column via the higher nibble.
@@ -913,7 +913,7 @@ namespace System.Buffers
          private static Vector256<byte> IndexOfAnyLookupCore(Vector256<byte> source, Vector256<byte> bitmapLookup)
          {
              // See comments in IndexOfAnyLookupCore(Vector128<byte>) above for more details.
-            Vector256<byte> highNibbles = Vector256.ShiftRightLogical(source.AsInt32(), 4).AsByte() & Vector256.Create((byte)0xF);
+            Vector256<byte> highNibbles = source >>> 4;
              Vector256<byte> bitMask = Avx2.Shuffle(bitmapLookup, source);
              Vector256<byte> bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), highNibbles);
              Vector256<byte> result = bitMask & bitPositions;
diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs

index 8a0d653..f42e130 100644 (file)
--- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs
@@ -131,10 +131,9 @@ namespace System.Buffers
          [CompExactlyDependsOn(typeof(Avx2))]
          private static Vector256<byte> IsCharBitSetAvx2(Vector256<byte> charMapLower, Vector256<byte> charMapUpper, Vector256<byte> values)
          {
-            // X86 doesn't have a logical right shift intrinsic for bytes: https://github.com/dotnet/runtime/issues/82564
-            Vector256<byte> highNibble = (values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector256.Create((byte)15);
+            Vector256<byte> shifted = values >>> VectorizedIndexShift;
  
-            Vector256<byte> bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), highNibble);
+            Vector256<byte> bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), shifted);
  
              Vector256<byte> index = values & Vector256.Create((byte)VectorizedIndexMask);
              Vector256<byte> bitMaskLower = Avx2.Shuffle(charMapLower, index);
@@ -175,12 +174,9 @@ namespace System.Buffers
          [CompExactlyDependsOn(typeof(PackedSimd))]
          private static Vector128<byte> IsCharBitSet(Vector128<byte> charMapLower, Vector128<byte> charMapUpper, Vector128<byte> values)
          {
-            // X86 doesn't have a logical right shift intrinsic for bytes: https://github.com/dotnet/runtime/issues/82564
-            Vector128<byte> highNibble = Sse2.IsSupported
-                ? (values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector128.Create((byte)15)
-                : values >>> VectorizedIndexShift;
+            Vector128<byte> shifted = values >>> VectorizedIndexShift;
  
-            Vector128<byte> bitPositions = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), highNibble);
+            Vector128<byte> bitPositions = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), shifted);
  
              Vector128<byte> index = values & Vector128.Create((byte)VectorizedIndexMask);
              Vector128<byte> bitMask;
author	Miha Zupan <mihazupan.zupan1@gmail.com>
	Mon, 17 Jul 2023 13:22:53 +0000 (15:22 +0200)
committer	GitHub <noreply@github.com>
	Mon, 17 Jul 2023 13:22:53 +0000 (06:22 -0700)
src/coreclr/jit/gentree.cpp		patch \| blob \| history
src/coreclr/jit/hwintrinsicxarch.cpp		patch \| blob \| history
src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs		patch \| blob \| history
src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs		patch \| blob \| history