[mono] Implement all Ssse3 and Sse42 intrinsics (#33591)
author Egor Bogatov <egorbo@gmail.com>
Sun, 15 Mar 2020 03:20:36 +0000 (06:20 +0300)
committer GitHub <noreply@github.com>
Sun, 15 Mar 2020 03:20:36 +0000 (23:20 -0400)
* Implement Ssse3

* Implement Sse42

* Remove unrelated change

* Fix C compilation error

src/mono/mono/mini/llvm-intrinsics.h
src/mono/mono/mini/mini-llvm.c
src/mono/mono/mini/mini-ops.h
src/mono/mono/mini/mini.h
src/mono/mono/mini/simd-intrinsics-netcore.c
src/mono/mono/mini/simd-methods-netcore.h

index ffe9ee0..1eccab2 100644 (file)
@@ -187,6 +187,21 @@ INTRINS(SSE_SFENCE, x86_sse_sfence)
 INTRINS(SSE_MFENCE, x86_sse2_mfence)
 INTRINS(SSE_LFENCE, x86_sse2_lfence)
 INTRINS(SSE_LDU_DQ, x86_sse3_ldu_dq)
+INTRINS(SSE_PHADDW, x86_ssse3_phadd_w_128)
+INTRINS(SSE_PHADDD, x86_ssse3_phadd_d_128)
+INTRINS(SSE_PHADDSW, x86_ssse3_phadd_sw_128)
+INTRINS(SSE_PHSUBW, x86_ssse3_phsub_w_128)
+INTRINS(SSE_PHSUBD, x86_ssse3_phsub_d_128)
+INTRINS(SSE_PHSUBSW, x86_ssse3_phsub_sw_128)
+INTRINS(SSE_PMADDUBSW, x86_ssse3_pmadd_ub_sw_128)
+INTRINS(SSE_PMULHRSW, x86_ssse3_pmul_hr_sw_128)
+INTRINS(SSE_PSIGNB, x86_ssse3_psign_b_128)
+INTRINS(SSE_PSIGNW, x86_ssse3_psign_w_128)
+INTRINS(SSE_PSIGND, x86_ssse3_psign_d_128)
+INTRINS(SSE_CRC32_32_8, x86_sse42_crc32_32_8)
+INTRINS(SSE_CRC32_32_16, x86_sse42_crc32_32_16)
+INTRINS(SSE_CRC32_32_32, x86_sse42_crc32_32_32)
+INTRINS(SSE_CRC32_64_64, x86_sse42_crc32_64_64)
 #if LLVM_API_VERSION >= 800
        // these intrinsics were renamed in LLVM 8
 INTRINS_OVR(SSE_SADD_SATI8, sadd_sat)
index 270ef5c..ba11f8c 100644 (file)
@@ -7937,8 +7937,19 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
                        case SIMD_OP_SSE_ADDSUBPD: id = INTRINS_SSE_ADDSUBPD; break;
                        case SIMD_OP_SSE_HADDPS: id = INTRINS_SSE_HADDPS; break;
                        case SIMD_OP_SSE_HADDPD: id = INTRINS_SSE_HADDPD; break;
+                       case SIMD_OP_SSE_PHADDW: id = INTRINS_SSE_PHADDW; break;
+                       case SIMD_OP_SSE_PHADDD: id = INTRINS_SSE_PHADDD; break;
+                       case SIMD_OP_SSE_PHSUBW: id = INTRINS_SSE_PHSUBW; break;
+                       case SIMD_OP_SSE_PHSUBD: id = INTRINS_SSE_PHSUBD; break;
                        case SIMD_OP_SSE_HSUBPS: id = INTRINS_SSE_HSUBPS; break;
                        case SIMD_OP_SSE_HSUBPD: id = INTRINS_SSE_HSUBPD; break;
+                       case SIMD_OP_SSE_PHADDSW: id = INTRINS_SSE_PHADDSW; break;
+                       case SIMD_OP_SSE_PHSUBSW: id = INTRINS_SSE_PHSUBSW; break;
+                       case SIMD_OP_SSE_PSIGNB: id = INTRINS_SSE_PSIGNB; break;
+                       case SIMD_OP_SSE_PSIGNW: id = INTRINS_SSE_PSIGNW; break;
+                       case SIMD_OP_SSE_PSIGND: id = INTRINS_SSE_PSIGND; break;
+                       case SIMD_OP_SSE_PMADDUBSW: id = INTRINS_SSE_PMADDUBSW; break;
+                       case SIMD_OP_SSE_PMULHRSW: id = INTRINS_SSE_PMULHRSW; break;
                        default: g_assert_not_reached (); break;
                        }
                        values [ins->dreg] = call_intrins (ctx, id, args, "");
@@ -8202,6 +8213,30 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
                        break;
                }
 
+               case OP_SSSE3_ABS: {
+                       // %sub = sub <16 x i8> zeroinitializer, %arg
+                       // %cmp = icmp sgt <16 x i8> %arg, zeroinitializer
+                       // %abs = select <16 x i1> %cmp, <16 x i8> %arg, <16 x i8> %sub
+                       LLVMTypeRef typ = type_to_sse_type (ins->inst_c1);
+                       LLVMValueRef sub = LLVMBuildSub(builder, LLVMConstNull(typ), lhs, "");
+                       LLVMValueRef cmp = LLVMBuildICmp(builder, LLVMIntSGT, lhs, LLVMConstNull(typ), "");
+                       LLVMValueRef abs = LLVMBuildSelect (builder, cmp, lhs, sub, "");
+                       values [ins->dreg] = convert (ctx, abs, typ);
+                       break;
+               }
+               
+               case OP_SSSE3_ALIGNR: {
+                       LLVMValueRef mask_values [16];
+                       for (int i = 0; i < 16; i++)
+                               mask_values [i] = LLVMConstInt (LLVMInt8Type (), i + ins->inst_c0, FALSE);
+                       LLVMValueRef shuffled = LLVMBuildShuffleVector (builder, 
+                               convert (ctx, rhs, sse_i1_t),
+                               convert (ctx, lhs, sse_i1_t),
+                               LLVMConstVector (mask_values, 16), "");
+                       values [ins->dreg] = convert (ctx, shuffled, type_to_sse_type (ins->inst_c1));
+                       break;
+               }
+
                case OP_CREATE_SCALAR:
                case OP_CREATE_SCALAR_UNSAFE: {
                        LLVMTypeRef type = type_to_sse_type (ins->inst_c1);
@@ -8261,6 +8296,23 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
                        values [ins->dreg] = LLVMBuildZExt (builder, cmp_zero, LLVMInt8Type (), "");
                        break;
                }
+
+               case OP_SSE42_CRC32:
+               case OP_SSE42_CRC64: {
+                       LLVMValueRef args [2];
+                       args [0] = lhs;
+                       args [1] = convert (ctx, rhs, primitive_type_to_llvm_type (ins->inst_c0));
+                       IntrinsicId id;
+                       switch (ins->inst_c0) {
+                       case MONO_TYPE_U1: id = INTRINS_SSE_CRC32_32_8; break;
+                       case MONO_TYPE_U2: id = INTRINS_SSE_CRC32_32_16; break;
+                       case MONO_TYPE_U4: id = INTRINS_SSE_CRC32_32_32; break;
+                       case MONO_TYPE_U8: id = INTRINS_SSE_CRC32_64_64; break;
+                       default: g_assert_not_reached (); break;
+                       }
+                       values [ins->dreg] = call_intrins (ctx, id, args, "");
+                       break;
+               }
 #endif
 
 #ifdef ENABLE_NETCORE
index 5ab15bb..6bd050c 100644 (file)
@@ -1077,7 +1077,9 @@ MINI_OP(OP_SSE3_MOVSLDUP, "sse3_movsldup", XREG, XREG, NONE)
 MINI_OP(OP_SSE3_MOVDDUP_MEM, "sse3_movddup_mem", XREG, IREG, NONE)
 
 /* ssse 3 */
+MINI_OP(OP_SSSE3_ABS, "ssse3_abs", XREG, XREG, NONE)
 MINI_OP(OP_SSSE3_SHUFFLE, "ssse3_shuffle", XREG, XREG, XREG)
+MINI_OP3(OP_SSSE3_ALIGNR, "ssse3_alignr", XREG, XREG, XREG, IREG)
 
 /* sse 4.1 */
 /* inst_c0 is the rounding mode: 0 = round, 1 = floor, 2 = ceiling */
@@ -1086,6 +1088,10 @@ MINI_OP(OP_SSE41_ROUNDSS, "roundss", XREG, XREG, NONE)
 MINI_OP3(OP_SSE41_INSERT, "sse41_insert", XREG, XREG, XREG, IREG)
 MINI_OP(OP_SSE41_PTESTZ, "sse41_ptestz", IREG, XREG, XREG)
 
+/* sse 4.2 */
+MINI_OP(OP_SSE42_CRC32, "sse42_crc32", IREG, IREG, IREG)
+MINI_OP(OP_SSE42_CRC64, "sse42_crc64", LREG, LREG, LREG)
+
 /* Intel BMI1 */
 /* Count trailing zeroes, return 32/64 if the input is 0 */
 MINI_OP(OP_CTTZ32, "cttz32", IREG, IREG, NONE)
index 46263d3..a0bd720 100644 (file)
@@ -2951,8 +2951,19 @@ typedef enum {
        SIMD_OP_SSE_ADDSUBPD,
        SIMD_OP_SSE_HADDPS,
        SIMD_OP_SSE_HADDPD,
+       SIMD_OP_SSE_PHADDW,
+       SIMD_OP_SSE_PHADDD,
+       SIMD_OP_SSE_PHSUBW,
+       SIMD_OP_SSE_PHSUBD,
        SIMD_OP_SSE_HSUBPS,
        SIMD_OP_SSE_HSUBPD,
+       SIMD_OP_SSE_PHADDSW,
+       SIMD_OP_SSE_PHSUBSW,
+       SIMD_OP_SSE_PSIGNB,
+       SIMD_OP_SSE_PSIGNW,
+       SIMD_OP_SSE_PSIGND,
+       SIMD_OP_SSE_PMADDUBSW,
+       SIMD_OP_SSE_PMULHRSW,
        SIMD_OP_SSE_LDDQU
 } SimdOp;
 
index 9890068..a9c872b 100644 (file)
@@ -887,7 +887,16 @@ static SimdIntrinsic sse3_methods [] = {
 };
 
 static SimdIntrinsic ssse3_methods [] = {
+       {SN_Abs, OP_SSSE3_ABS},
+       {SN_AlignRight}, /* no op set: the SN_AlignRight case only emits OP_SSSE3_ALIGNR when the third argument is an OP_ICONST */
+       {SN_HorizontalAdd}, /* no op set: PHADDW or PHADDD is picked from arg0_type in the SN_HorizontalAdd case */
+       {SN_HorizontalAddSaturate, OP_XOP_X_X_X, SIMD_OP_SSE_PHADDSW},
+       {SN_HorizontalSubtract}, /* no op set: PHSUBW or PHSUBD is picked from arg0_type in the SN_HorizontalSubtract case */
+       {SN_HorizontalSubtractSaturate, OP_XOP_X_X_X, SIMD_OP_SSE_PHSUBSW},
+       {SN_MultiplyAddAdjacent, OP_XOP_X_X_X, SIMD_OP_SSE_PMADDUBSW},
+       {SN_MultiplyHighRoundScale, OP_XOP_X_X_X, SIMD_OP_SSE_PMULHRSW},
        {SN_Shuffle, OP_SSSE3_SHUFFLE},
+       {SN_Sign}, /* no op set: PSIGNB, PSIGNW or PSIGND is picked from arg0_type in the SN_Sign case */
        {SN_get_IsSupported}
 };
 
@@ -899,6 +908,12 @@ static SimdIntrinsic sse41_methods [] = {
        {SN_get_IsSupported}
 };
 
+static SimdIntrinsic sse42_methods [] = { /* Sse42 method table; searched with lookup_intrins_info () in emit_x86_intrinsics () */
+       {SN_CompareGreaterThan, OP_XCOMPARE, CMP_GT}, /* op is set: emitted through the common emit_simd_ins_for_sig () path */
+       {SN_Crc32}, /* no op set: OP_SSE42_CRC64 is used for a MONO_TYPE_U8 second parameter, OP_SSE42_CRC32 otherwise */
+       {SN_get_IsSupported} /* no op set: folded to an I4 constant in the SN_get_IsSupported case */
+};
+
 static SimdIntrinsic popcnt_methods [] = {
        {SN_PopCount},
        {SN_get_IsSupported}
@@ -1235,7 +1250,6 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
                case SN_ShuffleLow:
                        g_assert (fsig->param_count == 2);
                        return emit_simd_ins_for_sig (cfg, klass, OP_SSE2_PSHUFLW, 0, arg0_type, fsig, args);
-                       return NULL;
                case SN_SqrtScalar:
                        if (fsig->param_count == 1)
                                return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X, info->instc0, arg0_type, fsig, args);
@@ -1307,13 +1321,33 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
                if (info->op != 0)
                        return emit_simd_ins_for_sig (cfg, klass, info->op, info->instc0, arg0_type, fsig, args);
 
-               supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSSE3) != 0 && is_corlib; // We only support the subset used by corelib
+               supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSSE3) != 0;
 
                switch (id) {
                case SN_get_IsSupported:
                        EMIT_NEW_ICONST (cfg, ins, supported ? 1 : 0);
                        ins->type = STACK_I4;
                        return ins;
+               case SN_AlignRight:
+                       if (args [2]->opcode == OP_ICONST)
+                               return emit_simd_ins_for_sig (cfg, klass, OP_SSSE3_ALIGNR, args [2]->inst_c0, arg0_type, fsig, args);
+                       else
+                               // FIXME: non-constant mask (generate switch)
+                               return NULL;
+               case SN_HorizontalAdd:
+                       if (arg0_type == MONO_TYPE_I2)
+                               return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, SIMD_OP_SSE_PHADDW, arg0_type, fsig, args);
+                       return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, SIMD_OP_SSE_PHADDD, arg0_type, fsig, args);
+               case SN_HorizontalSubtract:
+                       if (arg0_type == MONO_TYPE_I2)
+                               return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, SIMD_OP_SSE_PHSUBW, arg0_type, fsig, args);
+                       return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, SIMD_OP_SSE_PHSUBD, arg0_type, fsig, args);
+               case SN_Sign:
+                       if (arg0_type == MONO_TYPE_I1)
+                               return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, SIMD_OP_SSE_PSIGNB, arg0_type, fsig, args);
+                       if (arg0_type == MONO_TYPE_I2)
+                               return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, SIMD_OP_SSE_PSIGNW, arg0_type, fsig, args);
+                       return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, SIMD_OP_SSE_PSIGND, arg0_type, fsig, args);
                default:
                        g_assert_not_reached ();
                        break;
@@ -1353,6 +1387,38 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
                }
        }
 
+       if (is_hw_intrinsics_class (klass, "Sse42", &is_64bit)) {
+               if (!COMPILE_LLVM (cfg))
+                       return NULL;
+               info = lookup_intrins_info (sse42_methods, sizeof (sse42_methods), cmethod);
+               if (!info)
+                       return NULL;
+               int id = info->id;
+
+               /* Common case */
+               if (info->op != 0)
+                       return emit_simd_ins_for_sig (cfg, klass, info->op, info->instc0, arg0_type, fsig, args);
+
+               // FIXME: remove is_corlib check once Sse41 is implemented
+               supported = COMPILE_LLVM (cfg) && (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE42) != 0 && is_corlib; 
+
+               switch (id) {
+               case SN_get_IsSupported:
+                       EMIT_NEW_ICONST (cfg, ins, supported ? 1 : 0);
+                       ins->type = STACK_I4;
+                       return ins;
+               case SN_Crc32: {
+                       MonoTypeEnum arg1_type = get_underlying_type (fsig->params [1]);
+                       return emit_simd_ins_for_sig (cfg, klass, 
+                               arg1_type == MONO_TYPE_U8 ? OP_SSE42_CRC64 : OP_SSE42_CRC32, 
+                               arg1_type, arg0_type, fsig, args);
+               }
+               default:
+                       g_assert_not_reached ();
+                       break;
+               }
+       }
+
        if (is_hw_intrinsics_class (klass, "Popcnt", &is_64bit)) {
                info = lookup_intrins_info (popcnt_methods, sizeof (popcnt_methods), cmethod);
                if (!info)
index db13a99..c63a9cc 100644 (file)
@@ -182,6 +182,15 @@ METHOD(LoadDquVector128)
 METHOD(MoveAndDuplicate)
 METHOD(MoveHighAndDuplicate)
 METHOD(MoveLowAndDuplicate)
+// Ssse3
+METHOD(Abs)
+METHOD(AlignRight)
+METHOD(HorizontalAddSaturate)
+METHOD(HorizontalSubtractSaturate)
+METHOD(MultiplyHighRoundScale)
+METHOD(Sign)
 // Sse41
 METHOD(Insert)
 METHOD(TestZ)
+// Sse42
+METHOD(Crc32)