From a899a7f87b2ff278f8a12b170f1478ae6f0d9008 Mon Sep 17 00:00:00 2001
From: Egor Bogatov
Date: Sun, 15 Mar 2020 06:20:36 +0300
Subject: [PATCH] [mono] Implement all Ssse3 and Sse42 intrinsics (#33591)

* implement Ssse3

* Implement Sse42

* remove unrelated change

* Fix C compilation error

* Fix AlignRight: use i32 shuffle-mask elements and only expand constant masks in [0, 16]
---
 src/mono/mono/mini/llvm-intrinsics.h         | 15 ++++++
 src/mono/mono/mini/mini-llvm.c               | 52 +++++++++++++++++++++
 src/mono/mono/mini/mini-ops.h                |  6 +++
 src/mono/mono/mini/mini.h                    | 11 +++++
 src/mono/mono/mini/simd-intrinsics-netcore.c | 70 +++++++++++++++++++++++++++-
 src/mono/mono/mini/simd-methods-netcore.h    |  9 ++++
 6 files changed, 161 insertions(+), 2 deletions(-)

diff --git a/src/mono/mono/mini/llvm-intrinsics.h b/src/mono/mono/mini/llvm-intrinsics.h
index ffe9ee0..1eccab2 100644
--- a/src/mono/mono/mini/llvm-intrinsics.h
+++ b/src/mono/mono/mini/llvm-intrinsics.h
@@ -187,6 +187,21 @@ INTRINS(SSE_SFENCE, x86_sse_sfence)
 INTRINS(SSE_MFENCE, x86_sse2_mfence)
 INTRINS(SSE_LFENCE, x86_sse2_lfence)
 INTRINS(SSE_LDU_DQ, x86_sse3_ldu_dq)
+INTRINS(SSE_PHADDW, x86_ssse3_phadd_w_128)
+INTRINS(SSE_PHADDD, x86_ssse3_phadd_d_128)
+INTRINS(SSE_PHADDSW, x86_ssse3_phadd_sw_128)
+INTRINS(SSE_PHSUBW, x86_ssse3_phsub_w_128)
+INTRINS(SSE_PHSUBD, x86_ssse3_phsub_d_128)
+INTRINS(SSE_PHSUBSW, x86_ssse3_phsub_sw_128)
+INTRINS(SSE_PMADDUBSW, x86_ssse3_pmadd_ub_sw_128)
+INTRINS(SSE_PMULHRSW, x86_ssse3_pmul_hr_sw_128)
+INTRINS(SSE_PSIGNB, x86_ssse3_psign_b_128)
+INTRINS(SSE_PSIGNW, x86_ssse3_psign_w_128)
+INTRINS(SSE_PSIGND, x86_ssse3_psign_d_128)
+INTRINS(SSE_CRC32_32_8, x86_sse42_crc32_32_8)
+INTRINS(SSE_CRC32_32_16, x86_sse42_crc32_32_16)
+INTRINS(SSE_CRC32_32_32, x86_sse42_crc32_32_32)
+INTRINS(SSE_CRC32_64_64, x86_sse42_crc32_64_64)
 #if LLVM_API_VERSION >= 800
 // these intrinsics were renamed in LLVM 8
 INTRINS_OVR(SSE_SADD_SATI8, sadd_sat)
diff --git a/src/mono/mono/mini/mini-llvm.c b/src/mono/mono/mini/mini-llvm.c
index 270ef5c..ba11f8c 100644
--- a/src/mono/mono/mini/mini-llvm.c
+++ b/src/mono/mono/mini/mini-llvm.c
@@ -7937,8 +7937,19 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
 			case SIMD_OP_SSE_ADDSUBPD: id = INTRINS_SSE_ADDSUBPD; break;
 			case SIMD_OP_SSE_HADDPS: id = INTRINS_SSE_HADDPS; break;
 			case SIMD_OP_SSE_HADDPD: id = INTRINS_SSE_HADDPD; break;
+			case SIMD_OP_SSE_PHADDW: id = INTRINS_SSE_PHADDW; break;
+			case SIMD_OP_SSE_PHADDD: id = INTRINS_SSE_PHADDD; break;
+			case SIMD_OP_SSE_PHSUBW: id = INTRINS_SSE_PHSUBW; break;
+			case SIMD_OP_SSE_PHSUBD: id = INTRINS_SSE_PHSUBD; break;
 			case SIMD_OP_SSE_HSUBPS: id = INTRINS_SSE_HSUBPS; break;
 			case SIMD_OP_SSE_HSUBPD: id = INTRINS_SSE_HSUBPD; break;
+			case SIMD_OP_SSE_PHADDSW: id = INTRINS_SSE_PHADDSW; break;
+			case SIMD_OP_SSE_PHSUBSW: id = INTRINS_SSE_PHSUBSW; break;
+			case SIMD_OP_SSE_PSIGNB: id = INTRINS_SSE_PSIGNB; break;
+			case SIMD_OP_SSE_PSIGNW: id = INTRINS_SSE_PSIGNW; break;
+			case SIMD_OP_SSE_PSIGND: id = INTRINS_SSE_PSIGND; break;
+			case SIMD_OP_SSE_PMADDUBSW: id = INTRINS_SSE_PMADDUBSW; break;
+			case SIMD_OP_SSE_PMULHRSW: id = INTRINS_SSE_PMULHRSW; break;
 			default: g_assert_not_reached (); break;
 			}
 			values [ins->dreg] = call_intrins (ctx, id, args, "");
@@ -8202,6 +8213,30 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
 			break;
 		}
 
+		case OP_SSSE3_ABS: {
+			// %sub = sub <16 x i8> zeroinitializer, %arg
+			// %cmp = icmp sgt <16 x i8> %arg, zeroinitializer
+			// %abs = select <16 x i1> %cmp, <16 x i8> %arg, <16 x i8> %sub
+			LLVMTypeRef typ = type_to_sse_type (ins->inst_c1);
+			LLVMValueRef sub = LLVMBuildSub(builder, LLVMConstNull(typ), lhs, "");
+			LLVMValueRef cmp = LLVMBuildICmp(builder, LLVMIntSGT, lhs, LLVMConstNull(typ), "");
+			LLVMValueRef abs = LLVMBuildSelect (builder, cmp, lhs, sub, "");
+			values [ins->dreg] = convert (ctx, abs, typ);
+			break;
+		}
+
+		case OP_SSSE3_ALIGNR: {
+			LLVMValueRef mask_values [16]; // result byte i = byte (i + imm8) of the 32-byte rhs:lhs concatenation
+			for (int i = 0; i < 16; i++)
+				mask_values [i] = LLVMConstInt (LLVMInt32Type (), i + ins->inst_c0, FALSE); // shufflevector masks are <N x i32>
+			LLVMValueRef shuffled = LLVMBuildShuffleVector (builder,
+				convert (ctx, rhs, sse_i1_t),
+				convert (ctx, lhs, sse_i1_t),
+				LLVMConstVector (mask_values, 16), "");
+			values [ins->dreg] = convert (ctx, shuffled, type_to_sse_type (ins->inst_c1));
+			break;
+		}
+
 		case OP_CREATE_SCALAR:
 		case OP_CREATE_SCALAR_UNSAFE: {
 			LLVMTypeRef type = type_to_sse_type (ins->inst_c1);
@@ -8261,6 +8296,23 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
 			values [ins->dreg] = LLVMBuildZExt (builder, cmp_zero, LLVMInt8Type (), "");
 			break;
 		}
+
+		case OP_SSE42_CRC32:
+		case OP_SSE42_CRC64: {
+			LLVMValueRef args [2];
+			args [0] = lhs;
+			args [1] = convert (ctx, rhs, primitive_type_to_llvm_type (ins->inst_c0));
+			IntrinsicId id;
+			switch (ins->inst_c0) {
+			case MONO_TYPE_U1: id = INTRINS_SSE_CRC32_32_8; break;
+			case MONO_TYPE_U2: id = INTRINS_SSE_CRC32_32_16; break;
+			case MONO_TYPE_U4: id = INTRINS_SSE_CRC32_32_32; break;
+			case MONO_TYPE_U8: id = INTRINS_SSE_CRC32_64_64; break;
+			default: g_assert_not_reached (); break;
+			}
+			values [ins->dreg] = call_intrins (ctx, id, args, "");
+			break;
+		}
 #endif
 
 #ifdef ENABLE_NETCORE
diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h
index 5ab15bb..6bd050c 100644
--- a/src/mono/mono/mini/mini-ops.h
+++ b/src/mono/mono/mini/mini-ops.h
@@ -1077,7 +1077,9 @@ MINI_OP(OP_SSE3_MOVSLDUP, "sse3_movsldup", XREG, XREG, NONE)
 MINI_OP(OP_SSE3_MOVDDUP_MEM, "sse3_movddup_mem", XREG, IREG, NONE)
 
 /* ssse 3 */
+MINI_OP(OP_SSSE3_ABS, "ssse3_abs", XREG, XREG, NONE)
 MINI_OP(OP_SSSE3_SHUFFLE, "ssse3_shuffle", XREG, XREG, XREG)
+MINI_OP3(OP_SSSE3_ALIGNR, "ssse3_alignr", XREG, XREG, XREG, IREG)
 
 /* sse 4.1 */
 /* inst_c0 is the rounding mode: 0 = round, 1 = floor, 2 = ceiling */
@@ -1086,6 +1088,10 @@ MINI_OP(OP_SSE41_ROUNDSS, "roundss", XREG, XREG, NONE)
 MINI_OP3(OP_SSE41_INSERT, "sse41_insert", XREG, XREG, XREG, IREG)
 MINI_OP(OP_SSE41_PTESTZ, "sse41_ptestz", IREG, XREG, XREG)
 
+/* sse 4.2 */
+MINI_OP(OP_SSE42_CRC32, "sse42_crc32", IREG, IREG, IREG)
+MINI_OP(OP_SSE42_CRC64, "sse42_crc64", LREG, LREG, LREG)
+
 /* Intel BMI1 */
 /* Count trailing zeroes, return 32/64 if the input is 0 */
 MINI_OP(OP_CTTZ32, "cttz32", IREG, IREG, NONE)
diff --git a/src/mono/mono/mini/mini.h b/src/mono/mono/mini/mini.h
index 46263d3..a0bd720 100644
--- a/src/mono/mono/mini/mini.h
+++ b/src/mono/mono/mini/mini.h
@@ -2951,8 +2951,19 @@ typedef enum {
 	SIMD_OP_SSE_ADDSUBPD,
 	SIMD_OP_SSE_HADDPS,
 	SIMD_OP_SSE_HADDPD,
+	SIMD_OP_SSE_PHADDW,
+	SIMD_OP_SSE_PHADDD,
+	SIMD_OP_SSE_PHSUBW,
+	SIMD_OP_SSE_PHSUBD,
 	SIMD_OP_SSE_HSUBPS,
 	SIMD_OP_SSE_HSUBPD,
+	SIMD_OP_SSE_PHADDSW,
+	SIMD_OP_SSE_PHSUBSW,
+	SIMD_OP_SSE_PSIGNB,
+	SIMD_OP_SSE_PSIGNW,
+	SIMD_OP_SSE_PSIGND,
+	SIMD_OP_SSE_PMADDUBSW,
+	SIMD_OP_SSE_PMULHRSW,
 	SIMD_OP_SSE_LDDQU
 } SimdOp;
 
diff --git a/src/mono/mono/mini/simd-intrinsics-netcore.c b/src/mono/mono/mini/simd-intrinsics-netcore.c
index 9890068..a9c872b 100644
--- a/src/mono/mono/mini/simd-intrinsics-netcore.c
+++ b/src/mono/mono/mini/simd-intrinsics-netcore.c
@@ -887,7 +887,16 @@ static SimdIntrinsic sse3_methods [] = {
 };
 
 static SimdIntrinsic ssse3_methods [] = {
+	{SN_Abs, OP_SSSE3_ABS},
+	{SN_AlignRight},
+	{SN_HorizontalAdd},
+	{SN_HorizontalAddSaturate, OP_XOP_X_X_X, SIMD_OP_SSE_PHADDSW},
+	{SN_HorizontalSubtract},
+	{SN_HorizontalSubtractSaturate, OP_XOP_X_X_X, SIMD_OP_SSE_PHSUBSW},
+	{SN_MultiplyAddAdjacent, OP_XOP_X_X_X, SIMD_OP_SSE_PMADDUBSW},
+	{SN_MultiplyHighRoundScale, OP_XOP_X_X_X, SIMD_OP_SSE_PMULHRSW},
 	{SN_Shuffle, OP_SSSE3_SHUFFLE},
+	{SN_Sign},
 	{SN_get_IsSupported}
 };
 
@@ -899,7 +908,13 @@ static SimdIntrinsic sse41_methods [] = {
 	{SN_get_IsSupported}
 };
 
+static SimdIntrinsic sse42_methods [] = {
+	{SN_CompareGreaterThan, OP_XCOMPARE, CMP_GT},
+	{SN_Crc32},
+	{SN_get_IsSupported}
+};
+
 static SimdIntrinsic popcnt_methods [] = {
 	{SN_PopCount},
 	{SN_get_IsSupported}
 };
@@ -1235,7 +1250,6 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
 		case SN_ShuffleLow:
 			g_assert (fsig->param_count == 2);
 			return emit_simd_ins_for_sig (cfg, klass, OP_SSE2_PSHUFLW, 0, arg0_type, fsig, args);
-			return NULL;
 		case SN_SqrtScalar:
 			if (fsig->param_count == 1)
 				return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X, info->instc0, arg0_type, fsig, args);
@@ -1307,13 +1321,33 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
 		if (info->op != 0)
 			return emit_simd_ins_for_sig (cfg, klass, info->op, info->instc0, arg0_type, fsig, args);
 
-		supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSSE3) != 0 && is_corlib; // We only support the subset used by corelib
+		supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSSE3) != 0;
 
 		switch (id) {
 		case SN_get_IsSupported:
 			EMIT_NEW_ICONST (cfg, ins, supported ? 1 : 0);
 			ins->type = STACK_I4;
 			return ins;
+		case SN_AlignRight:
+			if (args [2]->opcode == OP_ICONST && args [2]->inst_c0 >= 0 && args [2]->inst_c0 <= 16)
+				return emit_simd_ins_for_sig (cfg, klass, OP_SSSE3_ALIGNR, args [2]->inst_c0, arg0_type, fsig, args);
+			else
+				// FIXME: non-constant mask (generate switch) and masks > 16 (zero fill), which the shuffle lowering cannot express
+				return NULL;
+		case SN_HorizontalAdd:
+			if (arg0_type == MONO_TYPE_I2)
+				return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, SIMD_OP_SSE_PHADDW, arg0_type, fsig, args);
+			return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, SIMD_OP_SSE_PHADDD, arg0_type, fsig, args);
+		case SN_HorizontalSubtract:
+			if (arg0_type == MONO_TYPE_I2)
+				return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, SIMD_OP_SSE_PHSUBW, arg0_type, fsig, args);
+			return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, SIMD_OP_SSE_PHSUBD, arg0_type, fsig, args);
+		case SN_Sign:
+			if (arg0_type == MONO_TYPE_I1)
+				return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, SIMD_OP_SSE_PSIGNB, arg0_type, fsig, args);
+			if (arg0_type == MONO_TYPE_I2)
+				return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, SIMD_OP_SSE_PSIGNW, arg0_type, fsig, args);
+			return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, SIMD_OP_SSE_PSIGND, arg0_type, fsig, args);
 		default:
 			g_assert_not_reached ();
 			break;
@@ -1353,7 +1387,39 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
 		}
 	}
 
+	if (is_hw_intrinsics_class (klass, "Sse42", &is_64bit)) {
+		if (!COMPILE_LLVM (cfg))
+			return NULL;
+		info = lookup_intrins_info (sse42_methods, sizeof (sse42_methods), cmethod);
+		if (!info)
+			return NULL;
+		int id = info->id;
+
+		/* Common case */
+		if (info->op != 0)
+			return emit_simd_ins_for_sig (cfg, klass, info->op, info->instc0, arg0_type, fsig, args);
+
+		// FIXME: remove is_corlib check once Sse41 is implemented
+		supported = COMPILE_LLVM (cfg) && (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE42) != 0 && is_corlib;
+
+		switch (id) {
+		case SN_get_IsSupported:
+			EMIT_NEW_ICONST (cfg, ins, supported ? 1 : 0);
+			ins->type = STACK_I4;
+			return ins;
+		case SN_Crc32: {
+			MonoTypeEnum arg1_type = get_underlying_type (fsig->params [1]);
+			return emit_simd_ins_for_sig (cfg, klass,
+				arg1_type == MONO_TYPE_U8 ? OP_SSE42_CRC64 : OP_SSE42_CRC32,
+				arg1_type, arg0_type, fsig, args);
+		}
+		default:
+			g_assert_not_reached ();
+			break;
+		}
+	}
+
 	if (is_hw_intrinsics_class (klass, "Popcnt", &is_64bit)) {
 		info = lookup_intrins_info (popcnt_methods, sizeof (popcnt_methods), cmethod);
 		if (!info)
 			return NULL;
diff --git a/src/mono/mono/mini/simd-methods-netcore.h b/src/mono/mono/mini/simd-methods-netcore.h
index db13a99..c63a9cc 100644
--- a/src/mono/mono/mini/simd-methods-netcore.h
+++ b/src/mono/mono/mini/simd-methods-netcore.h
@@ -182,6 +182,15 @@ METHOD(LoadDquVector128)
 METHOD(MoveAndDuplicate)
 METHOD(MoveHighAndDuplicate)
 METHOD(MoveLowAndDuplicate)
+// Ssse3
+METHOD(Abs)
+METHOD(AlignRight)
+METHOD(HorizontalAddSaturate)
+METHOD(HorizontalSubtractSaturate)
+METHOD(MultiplyHighRoundScale)
+METHOD(Sign)
 // Sse41
 METHOD(Insert)
 METHOD(TestZ)
+// Sse42
+METHOD(Crc32)
-- 
2.7.4