[wasm] Improve SIMD vector equality operator (#79719)
author Radek Doulik <radek.doulik@gmail.com>
Fri, 16 Dec 2022 08:50:03 +0000 (09:50 +0100)
committer GitHub <noreply@github.com>
Fri, 16 Dec 2022 08:50:03 +0000 (09:50 +0100)
Improve the code we emit for vector equality. Instead of reducing the comparison result with multiple shuffle/and steps, use the WebAssembly SIMD `all_true` instructions:

    i8x16.all_true(a: v128) -> i32
    i16x8.all_true(a: v128) -> i32
    i32x4.all_true(a: v128) -> i32
    i64x2.all_true(a: v128) -> i32

That saves code size and greatly improves performance. For example, Span's SequenceEqual improves as follows on Chrome:

| measurement | old | new |
|-:|-:|-:|
|              Span, SequenceEqual bytes |     0.0087ms |     0.0021ms |
|              Span, SequenceEqual chars |     0.0174ms |     0.0042ms |

The dotnet.wasm size drops by approximately 20 KB for the bench sample.

The code diff:

```
> wa-diff -d -f corlib_System_SpanHelpers_SequenceEqual_byte__byte__uintptr dotnet.old.wasm dotnet.new.wasm
...
          v128.load    [SIMD]
          i8x16.eq    [SIMD]
-         local.tee $4
+         i8x16.all.true    [SIMD]
-         local.get $4
-         i8x16.shuffle 0x00000000000000000f0e0d0c0b0a0908    [SIMD]
-         local.get $4
-         v128.and    [SIMD]
-         local.tee $4
-         local.get $4
-         i8x16.shuffle 0x00000000000000000000000007060504    [SIMD]
-         local.get $4
-         v128.and    [SIMD]
-         local.tee $4
-         local.get $4
-         i8x16.shuffle 0x00000000000000000000000000000302    [SIMD]
-         local.get $4
-         v128.and    [SIMD]
-         local.tee $4
-         local.get $4
-         i8x16.shuffle 0x00000000000000000000000000000001    [SIMD]
-         local.get $4
-         v128.and    [SIMD]
-         i8x16.extract.lane.u 0    [SIMD]
          i32.eqz
          if
...
```

src/mono/mono/mini/llvm-intrinsics.h
src/mono/mono/mini/mini-llvm.c

index acc2f5dc38df31c245483f8f8b98ecc16a0ec1f1..3c79c7aa59427eeb2a6cfce5bcf869cb47f57841 100644 (file)
@@ -253,6 +253,10 @@ INTRINS_OVR(SSE_SSUB_SATI16, ssub_sat, Generic, v128_i2_t)
 INTRINS_OVR(SSE_USUB_SATI16, usub_sat, Generic, v128_i2_t)
 #endif
 #if defined(TARGET_WASM)
+INTRINS_OVR(WASM_ALLTRUE_V16, wasm_alltrue, Wasm, sse_i1_t)
+INTRINS_OVR(WASM_ALLTRUE_V8, wasm_alltrue, Wasm, sse_i2_t)
+INTRINS_OVR(WASM_ALLTRUE_V4, wasm_alltrue, Wasm, sse_i4_t)
+INTRINS_OVR(WASM_ALLTRUE_V2, wasm_alltrue, Wasm, sse_i8_t)
 INTRINS_OVR(WASM_ANYTRUE_V16, wasm_anytrue, Wasm, sse_i1_t)
 INTRINS_OVR(WASM_ANYTRUE_V8, wasm_anytrue, Wasm, sse_i2_t)
 INTRINS_OVR(WASM_ANYTRUE_V4, wasm_anytrue, Wasm, sse_i4_t)
index 5f90a4919e1112a54ae17f12e9924c4b24854ae4..67779805df504c7deb68a7ca3cd75e48e6f6c2fe 100644 (file)
@@ -9791,13 +9791,10 @@ MONO_RESTORE_WARNING
                        int nelems;
 
 #if defined(TARGET_WASM)
+                       IntrinsicId intrins = (IntrinsicId)0;
+                       nelems = LLVMGetVectorSize (LLVMTypeOf (lhs));
                        /* The wasm code generator doesn't understand the shuffle/and code sequence below */
-                       LLVMValueRef val;
                        if (LLVMIsNull (lhs) || LLVMIsNull (rhs)) {
-                               val = LLVMIsNull (lhs) ? rhs : lhs;
-                               nelems = LLVMGetVectorSize (LLVMTypeOf (lhs));
-
-                               IntrinsicId intrins = (IntrinsicId)0;
                                switch (nelems) {
                                case 16:
                                        intrins = INTRINS_WASM_ANYTRUE_V16;
@@ -9814,6 +9811,8 @@ MONO_RESTORE_WARNING
                                default:
                                        g_assert_not_reached ();
                                }
+                               LLVMValueRef val = LLVMIsNull (lhs) ? rhs : lhs;
+
                                /* res = !wasm.anytrue (val) */
                                values [ins->dreg] = call_intrins (ctx, intrins, &val, "");
                                values [ins->dreg] = LLVMBuildZExt (builder, LLVMBuildICmp (builder, LLVMIntEQ, values [ins->dreg], const_int32 (0), ""), LLVMInt32Type (), dname);
@@ -9839,6 +9838,26 @@ MONO_RESTORE_WARNING
 
                        t = LLVMVectorType (elemt, nelems);
                        cmp = LLVMBuildSExt (builder, cmp, t, "");
+#if defined(TARGET_WASM)
+                       switch (nelems) {
+                       case 16:
+                               intrins = INTRINS_WASM_ALLTRUE_V16;
+                               break;
+                       case 8:
+                               intrins = INTRINS_WASM_ALLTRUE_V8;
+                               break;
+                       case 4:
+                               intrins = INTRINS_WASM_ALLTRUE_V4;
+                               break;
+                       case 2:
+                               intrins = INTRINS_WASM_ALLTRUE_V2;
+                               break;
+                       default:
+                               g_assert_not_reached ();
+                       }
+                       values [ins->dreg] = call_intrins (ctx, intrins, &cmp, "");
+                       break;
+#endif
                        // cmp is a <nelems x elemt> vector, each element is either 0xff... or 0
                        int half = nelems / 2;
                        while (half >= 1) {