From: Zoltan Varga
Date: Mon, 9 Mar 2020 22:37:37 +0000 (-0400)
Subject: [jit] Implement support for all Sse1 intrinsics for netcore. (#33356)
X-Git-Tag: submit/tizen/20210909.063632~9259
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=1e04f68b1b97bf36e4f1fc75ea23b9d156c19920;p=platform%2Fupstream%2Fdotnet%2Fruntime.git

[jit] Implement support for all Sse1 intrinsics for netcore. (#33356)

* [jit] Implement support for all Sse1 intrinsics for netcore.

* Add generic OP_XOP opcodes for operations which the rest of the JIT doesn't care about. Add a SimdOp enum to list the operations performed by these opcodes.

* Add a SimdIntrinsic struct so the mapping between the .NET methods and the JIT opcodes can be specified declaratively.

* Add all intrinsics from the Sse class.

* Fix UnpackHigh/UnpackLow.

* Implement missing load/store intrinsics.

* Implement missing opcodes.

* Fix nontemporal metadata.

* Fix MOVNTPS alignment.

* Fix OP_XOP_X_X.
---

diff --git a/src/mono/mono/mini/llvm-intrinsics.h b/src/mono/mono/mini/llvm-intrinsics.h
index 4520b90..19ffaee 100644
--- a/src/mono/mono/mini/llvm-intrinsics.h
+++ b/src/mono/mono/mini/llvm-intrinsics.h
@@ -59,6 +59,7 @@ INTRINS_OVR(CTLZ_I32, ctlz)
 INTRINS_OVR(CTLZ_I64, ctlz)
 INTRINS_OVR(CTTZ_I32, cttz)
 INTRINS_OVR(CTTZ_I64, cttz)
+INTRINS(PREFETCH, prefetch)
 INTRINS(BZHI_I32, x86_bmi_bzhi_32)
 INTRINS(BZHI_I64, x86_bmi_bzhi_64)
 INTRINS(BEXTR_I32, x86_bmi_bextr_32)
@@ -81,8 +82,11 @@ INTRINS(SSE_PSRLI_Q, x86_sse2_psrli_q)
 INTRINS(SSE_PSLLI_Q, x86_sse2_pslli_q)
 INTRINS(SSE_SQRT_PD, x86_sse2_sqrt_pd)
 INTRINS(SSE_SQRT_PS, x86_sse_sqrt_ps)
-INTRINS(SSE_RSQRT_PS, x86_sse_rsqrt_ps)
 INTRINS(SSE_RCP_PS, x86_sse_rcp_ps)
+INTRINS(SSE_RSQRT_PS, x86_sse_rsqrt_ps)
+INTRINS(SSE_SQRT_SS, x86_sse_sqrt_ss)
+INTRINS(SSE_RCP_SS, x86_sse_rcp_ss)
+INTRINS(SSE_RSQRT_SS, x86_sse_rsqrt_ss)
 INTRINS(SSE_CVTTPD2DQ, x86_sse2_cvttpd2dq)
 INTRINS(SSE_CVTTPS2DQ, x86_sse2_cvttps2dq)
 INTRINS(SSE_CVTDQ2PS, x86_sse2_cvtdq2ps)
@@ -149,6 +153,7 @@ INTRINS(SSE_ROUNDSS, x86_sse41_round_ss)
 INTRINS(SSE_ROUNDPD, x86_sse41_round_pd)
 INTRINS(SSE_PTESTZ, x86_sse41_ptestz)
 INTRINS(SSE_INSERTPS, x86_sse41_insertps)
+INTRINS(SSE_SFENCE, x86_sse_sfence)
 #if LLVM_API_VERSION >= 800
 // these intrinsics were renamed in LLVM 8
 INTRINS_OVR(SSE_SADD_SATI8, sadd_sat)
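Note: the new INTRINS(PREFETCH, prefetch) entry maps to llvm.prefetch, whose operands are (address, rw, locality, cache type); the OP_SSE_PREFETCH* cases added below pass rw = 0 (read), locality = 3/2/1/0 for T0/T1/T2/NTA and cache type = 1 (data). A minimal C sketch of the same operand encoding, using the Clang/GCC builtin that lowers to this intrinsic (the function name and stride are illustrative only):

    #include <stddef.h>

    /* __builtin_prefetch (addr, rw, locality) lowers to llvm.prefetch with
       cache type 1 (data), matching the argument tuple the JIT builds for
       Sse.Prefetch0: { addr, 0 (read), 3 (T0), 1 (data) }. */
    static void
    warm_cache (const float *p, size_t n)
    {
            for (size_t i = 0; i < n; i += 16)
                    __builtin_prefetch (p + i, 0 /* read */, 3 /* locality T0 */);
    }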
diff --git a/src/mono/mono/mini/mini-llvm.c b/src/mono/mono/mini/mini-llvm.c
index be17a2e3..137cf15 100644
--- a/src/mono/mono/mini/mini-llvm.c
+++ b/src/mono/mono/mini/mini-llvm.c
@@ -276,6 +276,8 @@ static LLVMRealPredicate fpcond_to_llvm_cond [] = {
 	LLVMRealUGE,
 	LLVMRealULT,
 	LLVMRealUGT,
+	LLVMRealORD,
+	LLVMRealUNO
 };
 
 static MonoLLVMModule aot_module;
@@ -306,7 +308,7 @@ set_failure (EmitContext *ctx, const char *message)
 }
 
 static LLVMValueRef
-ConstInt32 (int v)
+const_int32 (int v)
 {
 	return LLVMConstInt (LLVMInt32Type (), v, FALSE);
 }
@@ -2137,6 +2139,20 @@ set_nonnull_load_flag (LLVMValueRef v)
 }
 
 static void
+set_nontemporal_flag (LLVMValueRef v)
+{
+	LLVMValueRef md_arg;
+	int md_kind;
+	const char *flag_name;
+
+	// FIXME: Cache this
+	flag_name = "nontemporal";
+	md_kind = LLVMGetMDKindID (flag_name, strlen (flag_name));
+	md_arg = const_int32 (1);
+	LLVMSetMetadata (v, md_kind, LLVMMDNode (&md_arg, 1));
+}
+
+static void
 set_invariant_load_flag (LLVMValueRef v)
 {
 	LLVMValueRef md_arg;
@@ -5701,7 +5717,7 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
 			v = mono_llvm_build_alloca (builder, LLVMInt8Type (), LLVMConstInt (LLVMInt32Type (), size, FALSE), MONO_ARCH_FRAME_ALIGNMENT, "");
 
 			if (ins->flags & MONO_INST_INIT)
-				emit_memset (ctx, builder, v, ConstInt32 (size), MONO_ARCH_FRAME_ALIGNMENT);
+				emit_memset (ctx, builder, v, const_int32 (size), MONO_ARCH_FRAME_ALIGNMENT);
 
 			values [ins->dreg] = v;
 			break;
@@ -6559,7 +6575,7 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
 			if (!addresses [ins->dreg])
 				addresses [ins->dreg] = build_alloca (ctx, m_class_get_byval_arg (klass));
 			LLVMValueRef ptr = LLVMBuildBitCast (builder, addresses [ins->dreg], LLVMPointerType (LLVMInt8Type (), 0), "");
-			emit_memset (ctx, builder, ptr, ConstInt32 (mono_class_value_size (klass, NULL)), 0);
+			emit_memset (ctx, builder, ptr, const_int32 (mono_class_value_size (klass, NULL)), 0);
 			break;
 		}
 		case OP_DUMMY_VZERO:
@@ -7456,6 +7472,47 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
 			values [ins->dreg] = mono_llvm_build_aligned_load (builder, dst_vec, "", FALSE, ins->inst_c0); // inst_c0 is alignment
 			break;
 		}
+		case OP_SSE_MOVSS: {
+			LLVMValueRef addr = convert (ctx, lhs, LLVMPointerType (LLVMFloatType (), 0));
+			LLVMValueRef val = mono_llvm_build_load (builder, addr, "", FALSE);
+			values [ins->dreg] = LLVMBuildInsertElement (builder, LLVMConstNull (type_to_sse_type (ins->inst_c1)), val, LLVMConstInt (LLVMInt32Type (), 0, FALSE), "");
+			break;
+		}
+		case OP_SSE_MOVSS_STORE: {
+			LLVMValueRef addr = convert (ctx, lhs, LLVMPointerType (LLVMFloatType (), 0));
+			LLVMValueRef val = LLVMBuildExtractElement (builder, rhs, LLVMConstInt (LLVMInt32Type (), 0, FALSE), "");
+			mono_llvm_build_store (builder, val, addr, FALSE, LLVM_BARRIER_NONE);
+			break;
+		}
+
+		case OP_SSE_MOVLPS_LOAD:
+		case OP_SSE_MOVHPS_LOAD: {
+			/* Load two floats from rhs and store them in the low/high part of lhs */
+			LLVMValueRef addr = rhs;
+			LLVMValueRef addr1 = convert (ctx, addr, LLVMPointerType (LLVMFloatType (), 0));
+			LLVMValueRef addr2 = convert (ctx, LLVMBuildAdd (builder, convert (ctx, addr, IntPtrType ()), convert (ctx, LLVMConstInt (LLVMInt32Type (), 4, FALSE), IntPtrType ()), ""), LLVMPointerType (LLVMFloatType (), 0));
+			LLVMValueRef val1 = mono_llvm_build_load (builder, addr1, "", FALSE);
+			LLVMValueRef val2 = mono_llvm_build_load (builder, addr2, "", FALSE);
+			int index1 = ins->opcode == OP_SSE_MOVLPS_LOAD ? 0 : 2;
+			int index2 = ins->opcode == OP_SSE_MOVLPS_LOAD ? 1 : 3;
+			values [ins->dreg] = LLVMBuildInsertElement (builder, LLVMBuildInsertElement (builder, lhs, val1, LLVMConstInt (LLVMInt32Type (), index1, FALSE), ""), val2, LLVMConstInt (LLVMInt32Type (), index2, FALSE), "");
+			break;
+		}
+
+		case OP_SSE_MOVLPS_STORE:
+		case OP_SSE_MOVHPS_STORE: {
+			/* Store two floats from the low/high part of rhs into lhs */
+			LLVMValueRef addr = lhs;
+			LLVMValueRef addr1 = convert (ctx, addr, LLVMPointerType (LLVMFloatType (), 0));
+			LLVMValueRef addr2 = convert (ctx, LLVMBuildAdd (builder, convert (ctx, addr, IntPtrType ()), convert (ctx, LLVMConstInt (LLVMInt32Type (), 4, FALSE), IntPtrType ()), ""), LLVMPointerType (LLVMFloatType (), 0));
+			int index1 = ins->opcode == OP_SSE_MOVLPS_STORE ? 0 : 2;
+			int index2 = ins->opcode == OP_SSE_MOVLPS_STORE ? 
1 : 3; + LLVMValueRef val1 = LLVMBuildExtractElement (builder, rhs, LLVMConstInt (LLVMInt32Type (), index1, FALSE), ""); + LLVMValueRef val2 = LLVMBuildExtractElement (builder, rhs, LLVMConstInt (LLVMInt32Type (), index2, FALSE), ""); + mono_llvm_build_store (builder, val1, addr1, FALSE, LLVM_BARRIER_NONE); + mono_llvm_build_store (builder, val2, addr2, FALSE, LLVM_BARRIER_NONE); + break; + } case OP_SSE_STORE: { LLVMValueRef dst_vec = convert (ctx, lhs, LLVMPointerType (LLVMTypeOf (rhs), 0)); @@ -7469,6 +7526,35 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb) mono_llvm_build_aligned_store (builder, first_elem, dst, FALSE, 1); break; } + case OP_SSE_MOVNTPS: { + LLVMValueRef store = mono_llvm_build_aligned_store (builder, rhs, lhs, FALSE, 16); + set_nontemporal_flag (store); + break; + } + case OP_SSE_PREFETCHT0: { + LLVMValueRef addr = convert (ctx, lhs, LLVMPointerType (LLVMInt8Type (), 0)); + LLVMValueRef args [] = { addr, const_int32 (0), const_int32 (3), const_int32 (1) }; + call_intrins (ctx, INTRINS_PREFETCH, args, ""); + break; + } + case OP_SSE_PREFETCHT1: { + LLVMValueRef addr = convert (ctx, lhs, LLVMPointerType (LLVMInt8Type (), 0)); + LLVMValueRef args [] = { addr, const_int32 (0), const_int32 (2), const_int32 (1) }; + call_intrins (ctx, INTRINS_PREFETCH, args, ""); + break; + } + case OP_SSE_PREFETCHT2: { + LLVMValueRef addr = convert (ctx, lhs, LLVMPointerType (LLVMInt8Type (), 0)); + LLVMValueRef args [] = { addr, const_int32 (0), const_int32 (1), const_int32 (1) }; + call_intrins (ctx, INTRINS_PREFETCH, args, ""); + break; + } + case OP_SSE_PREFETCHNTA: { + LLVMValueRef addr = convert (ctx, lhs, LLVMPointerType (LLVMInt8Type (), 0)); + LLVMValueRef args [] = { addr, const_int32 (0), const_int32 (0), const_int32 (1) }; + call_intrins (ctx, INTRINS_PREFETCH, args, ""); + break; + } case OP_SSE_SHUFFLE: { LLVMValueRef shuffle_vec = create_const_vector_4_i32 ( @@ -7536,24 +7622,169 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb) break; } - case OP_SSE2_ADDS: { - gint32 intrinsic = 0; - if (ins->inst_c1 == MONO_TYPE_I1) - intrinsic = INTRINS_SSE_SADD_SATI8; - else if (ins->inst_c1 == MONO_TYPE_U1) - intrinsic = INTRINS_SSE_UADD_SATI8; - else if (ins->inst_c1 == MONO_TYPE_I2) - intrinsic = INTRINS_SSE_SADD_SATI16; - else if (ins->inst_c1 == MONO_TYPE_U2) - intrinsic = INTRINS_SSE_UADD_SATI16; - else + case OP_SSE_ADDSS: + case OP_SSE_SUBSS: + case OP_SSE_DIVSS: + case OP_SSE_MULSS: + case OP_SSE2_ADDSD: { + LLVMValueRef v1 = LLVMBuildExtractElement (builder, lhs, LLVMConstInt (LLVMInt32Type (), 0, FALSE), ""); + LLVMValueRef v2 = LLVMBuildExtractElement (builder, rhs, LLVMConstInt (LLVMInt32Type (), 0, FALSE), ""); + + LLVMValueRef v = NULL; + switch (ins->opcode) { + case OP_SSE_ADDSS: + case OP_SSE2_ADDSD: + v = LLVMBuildFAdd (builder, v1, v2, ""); + break; + case OP_SSE_SUBSS: + v = LLVMBuildFSub (builder, v1, v2, ""); + break; + case OP_SSE_DIVSS: + v = LLVMBuildFDiv (builder, v1, v2, ""); + break; + case OP_SSE_MULSS: + v = LLVMBuildFMul (builder, v1, v2, ""); + break; + default: + g_assert_not_reached (); + } + values [ins->dreg] = LLVMBuildInsertElement (builder, lhs, v, LLVMConstInt (LLVMInt32Type (), 0, FALSE), ""); + break; + } + + case OP_SSE_CMPSS: + case OP_SSE2_CMPSD: { + int imm = -1; + switch (ins->inst_c0) { + case CMP_EQ: imm = 0; break; + case CMP_GT: imm = 6; break; + case CMP_GE: imm = 5; break; + case CMP_LT: imm = 1; break; + case CMP_LE: imm = 2; break; + case CMP_NE: imm = 4; break; + case CMP_ORD: imm = 7; break; + case CMP_UNORD: imm = 3; 
break; + default: g_assert_not_reached (); break; + } + LLVMValueRef cmp = LLVMConstInt (LLVMInt8Type (), imm, FALSE); + LLVMValueRef args [] = { lhs, rhs, cmp }; + switch (ins->opcode) { + case OP_SSE_CMPSS: + values [ins->dreg] = call_intrins (ctx, INTRINS_SSE_CMPSS, args, ""); + break; + case OP_SSE2_CMPSD: + values [ins->dreg] = call_intrins (ctx, INTRINS_SSE_CMPSD, args, ""); + break; + default: g_assert_not_reached (); + break; + } + break; + } + case OP_SSE_COMISS: { + LLVMValueRef args [] = { lhs, rhs }; + IntrinsicId id = (IntrinsicId)0; + switch (ins->inst_c0) { + case CMP_EQ: id = INTRINS_SSE_COMIEQ_SS; break; + case CMP_GT: id = INTRINS_SSE_COMIGT_SS; break; + case CMP_GE: id = INTRINS_SSE_COMIGE_SS; break; + case CMP_LT: id = INTRINS_SSE_COMILT_SS; break; + case CMP_LE: id = INTRINS_SSE_COMILE_SS; break; + case CMP_NE: id = INTRINS_SSE_COMINEQ_SS; break; + default: g_assert_not_reached (); break; + } + values [ins->dreg] = call_intrins (ctx, id, args, ""); + break; + } + case OP_SSE_UCOMISS: { + LLVMValueRef args [] = { lhs, rhs }; + IntrinsicId id = (IntrinsicId)0; + switch (ins->inst_c0) { + case CMP_EQ: id = INTRINS_SSE_UCOMIEQ_SS; break; + case CMP_GT: id = INTRINS_SSE_UCOMIGT_SS; break; + case CMP_GE: id = INTRINS_SSE_UCOMIGE_SS; break; + case CMP_LT: id = INTRINS_SSE_UCOMILT_SS; break; + case CMP_LE: id = INTRINS_SSE_UCOMILE_SS; break; + case CMP_NE: id = INTRINS_SSE_UCOMINEQ_SS; break; + default: g_assert_not_reached (); break; + } + values [ins->dreg] = call_intrins (ctx, id, args, ""); + break; + } + case OP_XOP: { + IntrinsicId id = (IntrinsicId)0; + switch (ins->inst_c0) { + case SIMD_OP_SSE_SFENCE: id = INTRINS_SSE_SFENCE; break; + default: g_assert_not_reached (); break; + } + call_intrins (ctx, id, NULL, ""); + break; + } + case OP_XOP_X_X: { + IntrinsicId id = (IntrinsicId)0; + switch (ins->inst_c0) { + case SIMD_OP_SSE_SQRTPS: id = INTRINS_SSE_SQRT_PS; break; + case SIMD_OP_SSE_RCPPS: id = INTRINS_SSE_RCP_PS; break; + case SIMD_OP_SSE_RSQRTPS: id = INTRINS_SSE_RSQRT_PS; break; + case SIMD_OP_SSE_SQRTSS: id = INTRINS_SSE_SQRT_SS; break; + case SIMD_OP_SSE_RCPSS: id = INTRINS_SSE_RCP_SS; break; + case SIMD_OP_SSE_RSQRTSS: id = INTRINS_SSE_RSQRT_SS; break; + default: g_assert_not_reached (); break; + } + values [ins->dreg] = call_intrins (ctx, id, &lhs, ""); + break; + } + case OP_XOP_I4_X: + case OP_XOP_I8_X: { + IntrinsicId id = (IntrinsicId)0; + switch (ins->inst_c0) { + case SIMD_OP_SSE_CVTSS2SI: id = INTRINS_SSE_CVTSS2SI; break; + case SIMD_OP_SSE_CVTTSS2SI: id = INTRINS_SSE_CVTTSS2SI; break; + case SIMD_OP_SSE_CVTSS2SI64: id = INTRINS_SSE_CVTSS2SI64; break; + case SIMD_OP_SSE_CVTTSS2SI64: id = INTRINS_SSE_CVTTSS2SI64; break; + case SIMD_OP_SSE_CVTSD2SI: id = INTRINS_SSE_CVTSD2SI; break; + case SIMD_OP_SSE_CVTSD2SI64: id = INTRINS_SSE_CVTSD2SI64; break; + case SIMD_OP_SSE_CVTTSD2SI64: id = INTRINS_SSE_CVTTSD2SI64; break; + default: g_assert_not_reached (); break; + } + values [ins->dreg] = call_intrins (ctx, id, &lhs, ""); + break; + } + case OP_XOP_X_X_X: + case OP_XOP_X_X_I4: + case OP_XOP_X_X_I8: { + LLVMValueRef args [] = { lhs, rhs }; + IntrinsicId id = (IntrinsicId)0; + switch (ins->inst_c0) { + case SIMD_OP_SSE_CVTSI2SS: id = INTRINS_SSE_CVTSI2SS; break; + case SIMD_OP_SSE_CVTSI2SS64: id = INTRINS_SSE_CVTSI2SS64; break; + case SIMD_OP_SSE_CVTSI2SD: id = INTRINS_SSE_CVTSI2SD; break; + case SIMD_OP_SSE_CVTSI2SD64: id = INTRINS_SSE_CVTSI2SD64; break; + case SIMD_OP_SSE_MAXPS: id = INTRINS_SSE_MAXPS; break; + case SIMD_OP_SSE_MAXSS: id = INTRINS_SSE_MAXSS; 
break;
+			case SIMD_OP_SSE_MINPS: id = INTRINS_SSE_MINPS; break;
+			case SIMD_OP_SSE_MINSS: id = INTRINS_SSE_MINSS; break;
+			default: g_assert_not_reached (); break;
+			}
+			values [ins->dreg] = call_intrins (ctx, id, args, "");
+			break;
+		}
+
+		case OP_SSE2_ADDS: {
+			IntrinsicId id = (IntrinsicId)0;
+			switch (ins->inst_c1) {
+			case MONO_TYPE_I1: id = INTRINS_SSE_SADD_SATI8; break;
+			case MONO_TYPE_U1: id = INTRINS_SSE_UADD_SATI8; break;
+			case MONO_TYPE_I2: id = INTRINS_SSE_SADD_SATI16; break;
+			case MONO_TYPE_U2: id = INTRINS_SSE_UADD_SATI16; break;
+			default: g_assert_not_reached (); break;
+			}
 			LLVMValueRef args [2];
 			args [0] = convert (ctx, lhs, type_to_sse_type (ins->inst_c1));
 			args [1] = convert (ctx, rhs, type_to_sse_type (ins->inst_c1));
 			values [ins->dreg] = convert (ctx,
-				call_intrins (ctx, intrinsic, args, dname),
+				call_intrins (ctx, id, args, dname),
 				type_to_sse_type (ins->inst_c1));
 			break;
 		}
diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h
index 325e0de..e0cd478 100644
--- a/src/mono/mono/mini/mini-ops.h
+++ b/src/mono/mono/mini/mini-ops.h
@@ -1021,12 +1021,33 @@ MINI_OP(OP_SSE_AND, "sse_and", XREG, XREG, XREG)
 MINI_OP(OP_SSE_OR, "sse_or", XREG, XREG, XREG)
 MINI_OP(OP_SSE_XOR, "sse_xor", XREG, XREG, XREG)
 MINI_OP(OP_SSE_ANDN, "sse_andn", XREG, XREG, XREG)
+MINI_OP(OP_SSE_ADDSS, "sse_addss", XREG, XREG, XREG)
+MINI_OP(OP_SSE_SUBSS, "sse_subss", XREG, XREG, XREG)
+MINI_OP(OP_SSE_DIVSS, "sse_divss", XREG, XREG, XREG)
+MINI_OP(OP_SSE_MULSS, "sse_mulss", XREG, XREG, XREG)
+MINI_OP(OP_SSE_CMPSS, "sse_cmpss", XREG, XREG, XREG)
+MINI_OP(OP_SSE_COMISS, "sse_comiss", IREG, XREG, XREG)
+MINI_OP(OP_SSE_UCOMISS, "sse_ucomiss", IREG, XREG, XREG)
+MINI_OP(OP_SSE_MOVSS, "sse_movss", XREG, IREG, NONE)
+MINI_OP(OP_SSE_MOVSS_STORE, "sse_movss_store", NONE, IREG, XREG)
+MINI_OP(OP_SSE_MOVHPS_LOAD, "sse_movhps_load", XREG, XREG, IREG)
+MINI_OP(OP_SSE_MOVLPS_LOAD, "sse_movlps_load", XREG, XREG, IREG)
+MINI_OP(OP_SSE_MOVHPS_STORE, "sse_movhps_store", NONE, IREG, XREG)
+MINI_OP(OP_SSE_MOVLPS_STORE, "sse_movlps_store", NONE, IREG, XREG)
+MINI_OP(OP_SSE_MOVNTPS, "sse_movntps", NONE, IREG, XREG)
+MINI_OP(OP_SSE_PREFETCHT0, "sse_prefetcht0", NONE, IREG, NONE)
+MINI_OP(OP_SSE_PREFETCHT1, "sse_prefetcht1", NONE, IREG, NONE)
+MINI_OP(OP_SSE_PREFETCHT2, "sse_prefetcht2", NONE, IREG, NONE)
+MINI_OP(OP_SSE_PREFETCHNTA, "sse_prefetchnta", NONE, IREG, NONE)
 
 /* sse 2 */
 MINI_OP(OP_SSE2_PACKUS, "sse2_packus", XREG, XREG, XREG)
 MINI_OP(OP_SSE2_SRLI, "sse2_srli", XREG, XREG, XREG)
 MINI_OP(OP_SSE2_SHUFFLE, "sse2_shuffle", XREG, XREG, XREG)
 MINI_OP(OP_SSE2_ADDS, "sse2_adds", XREG, XREG, XREG)
+MINI_OP(OP_SSE2_ADDSD, "sse2_addsd", XREG, XREG, XREG)
+MINI_OP(OP_SSE2_CMPSD, "sse2_cmpsd", XREG, XREG, XREG)
+MINI_OP(OP_SSE2_COMIEQ_SD, "sse2_comieq_sd", XREG, XREG, XREG)
 
 /* sse 3 */
 MINI_OP(OP_SSE3_MOVDDUP, "sse3_movddup", XREG, XREG, NONE)
@@ -1440,8 +1461,20 @@ MINI_OP(OP_XEQUAL, "xequal", IREG, XREG, XREG)
 /* Per element compare, inst_c0 contains a CompRelation */
 MINI_OP(OP_XCOMPARE, "xcompare", XREG, XREG, XREG)
 MINI_OP(OP_XCOMPARE_FP, "xcompare_fp", XREG, XREG, XREG)
-/* Binary op, inst_c0 contains the operation */
+
+/*
+ * Generic SIMD operations; the rest of the JIT doesn't care about the exact operation.
+ */
 MINI_OP(OP_XBINOP, "xbinop", XREG, XREG, XREG)
+/* inst_c0 contains a SimdOp, inst_c1 might contain additional data */
+MINI_OP(OP_XOP, "xop", NONE, NONE, NONE)
+MINI_OP(OP_XOP_X_X, "xop_x_x", XREG, XREG, NONE)
+MINI_OP(OP_XOP_I4_X, "xop_i4_x", IREG, XREG, NONE)
+MINI_OP(OP_XOP_I8_X, "xop_i8_x", LREG, XREG, NONE)
+MINI_OP(OP_XOP_X_X_X, "xop_x_x_x", XREG, XREG, XREG)
+MINI_OP(OP_XOP_X_X_I4, "xop_x_x_i4", XREG, XREG, IREG)
+MINI_OP(OP_XOP_X_X_I8, "xop_x_x_i8", XREG, XREG, LREG)
+
 MINI_OP(OP_XCAST, "xcast", XREG, XREG, NONE)
 /* Extract element of vector */
 /* The index is assumed to be in range */
@@ -1456,5 +1489,3 @@ MINI_OP(OP_LZCNT32, "lzcnt32", IREG, IREG, NONE)
 MINI_OP(OP_LZCNT64, "lzcnt64", LREG, LREG, NONE)
 MINI_OP(OP_POPCNT32, "popcnt32", IREG, IREG, NONE)
 MINI_OP(OP_POPCNT64, "popcnt64", LREG, LREG, NONE)
-
-
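Note: OP_SSE_CMPSS/OP_SSE2_CMPSD defined above are lowered (in the mini-llvm.c hunk earlier) to CMPSS/CMPSD with a predicate immediate: 0=EQ, 1=LT, 2=LE, 3=UNORD, 4=NEQ, 5=NLT, 6=NLE, 7=ORD. GT and GE have no direct encoding, which is why CMP_GT maps to 6 (NLE) and CMP_GE to 5 (NLT); these agree with > and >= whenever neither operand is NaN. A hedged C illustration using the corresponding SSE intrinsic:

    #include <xmmintrin.h>

    /* Scalar greater-than via predicate 6 (NLE): equal to a > b for ordered
       (non-NaN) inputs; when an input is NaN, NLE is true while GT is not. */
    static __m128
    cmp_scalar_gt (__m128 a, __m128 b)
    {
            return _mm_cmpnle_ss (a, b);
    }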
diff --git a/src/mono/mono/mini/mini.h b/src/mono/mono/mini/mini.h
index 0b46f65..f0e9cdb 100644
--- a/src/mono/mono/mini/mini.h
+++ b/src/mono/mono/mini/mini.h
@@ -1904,7 +1904,9 @@ typedef enum {
 	CMP_LE_UN,
 	CMP_GE_UN,
 	CMP_LT_UN,
-	CMP_GT_UN
+	CMP_GT_UN,
+	CMP_ORD,
+	CMP_UNORD
 } CompRelation;
 
 typedef enum {
@@ -2881,6 +2883,32 @@ enum {
 	SIMD_PREFETCH_MODE_2,
 };
 
+/* SIMD operations */
+typedef enum {
+	SIMD_OP_SSE_CVTSS2SI,
+	SIMD_OP_SSE_CVTTSS2SI,
+	SIMD_OP_SSE_CVTSS2SI64,
+	SIMD_OP_SSE_CVTTSS2SI64,
+	SIMD_OP_SSE_CVTSD2SI,
+	SIMD_OP_SSE_CVTSD2SI64,
+	SIMD_OP_SSE_CVTTSD2SI64,
+	SIMD_OP_SSE_CVTSI2SS,
+	SIMD_OP_SSE_CVTSI2SS64,
+	SIMD_OP_SSE_CVTSI2SD,
+	SIMD_OP_SSE_CVTSI2SD64,
+	SIMD_OP_SSE_MAXPS,
+	SIMD_OP_SSE_MAXSS,
+	SIMD_OP_SSE_MINPS,
+	SIMD_OP_SSE_MINSS,
+	SIMD_OP_SSE_SFENCE,
+	SIMD_OP_SSE_SQRTPS,
+	SIMD_OP_SSE_RCPPS,
+	SIMD_OP_SSE_RSQRTPS,
+	SIMD_OP_SSE_SQRTSS,
+	SIMD_OP_SSE_RCPSS,
+	SIMD_OP_SSE_RSQRTSS
+} SimdOp;
+
 const char *mono_arch_xregname (int reg);
 guint32 mono_arch_cpu_enumerate_simd_versions (void);
 MonoCPUFeatures mono_arch_get_cpu_features (void);
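Note: the two new CompRelation values model IEEE ordered/unordered comparison: CMP_ORD holds exactly when neither operand is NaN and CMP_UNORD when at least one is, which is what the LLVMRealORD/LLVMRealUNO predicates added to fpcond_to_llvm_cond implement. A scalar C model of the predicate:

    #include <math.h>

    /* CMP_ORD: neither operand is NaN; CMP_UNORD is its negation. */
    static int
    ordered (float a, float b)
    {
            return !isnan (a) && !isnan (b);
    }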
diff --git a/src/mono/mono/mini/simd-intrinsics-netcore.c b/src/mono/mono/mini/simd-intrinsics-netcore.c
index fc5b621..ea8a8da 100644
--- a/src/mono/mono/mini/simd-intrinsics-netcore.c
+++ b/src/mono/mono/mini/simd-intrinsics-netcore.c
@@ -55,6 +55,15 @@ enum {
 
 static int register_size;
 
+typedef struct {
+	// One of the SN_ constants
+	guint16 id;
+	// ins->opcode
+	int op;
+	// ins->inst_c0
+	int instc0;
+} SimdIntrinsic;
+
 void
 mono_simd_intrinsics_init (void)
 {
@@ -80,18 +89,16 @@ simd_intrinsic_compare_by_name (const void *key, const void *value)
 }
 
 static int
+simd_intrinsic_info_compare_by_name (const void *key, const void *value)
+{
+	SimdIntrinsic *info = (SimdIntrinsic*)value;
+	return strcmp ((const char*)key, method_name (info->id));
+}
+
+static int
 lookup_intrins (guint16 *intrinsics, int size, MonoMethod *cmethod)
 {
 	const guint16 *result = (const guint16 *)mono_binary_search (cmethod->name, intrinsics, size / sizeof (guint16), sizeof (guint16), &simd_intrinsic_compare_by_name);
-
-#if FALSE
-	for (int i = 0; i < (size / sizeof (guint16)) - 1; ++i) {
-		if (method_name (intrinsics [i])[0] > method_name (intrinsics [i + 1])[0]) {
-			printf ("%s %s\n",method_name (intrinsics [i]), method_name (intrinsics [i + 1]));
-			g_assert_not_reached ();
-		}
-	}
-#endif
 	if (result == NULL)
 		return -1;
 	else
@@ -99,6 +106,29 @@ lookup_intrins (guint16 *intrinsics, int size, MonoMethod *cmethod)
 	return (int)*result;
 }
 
+static SimdIntrinsic*
+lookup_intrins_info (SimdIntrinsic *intrinsics, int size, MonoMethod *cmethod)
+{
+#if 0
+	for (int i = 0; i < (size / sizeof (SimdIntrinsic)) - 1; ++i) {
+		const char *n1 = method_name (intrinsics [i].id);
+		const char *n2 = method_name (intrinsics [i + 1].id);
+		int len1 = strlen (n1);
+		int len2 = strlen (n2);
+		for (int j = 0; j < len1 && j < len2; ++j) {
+			if (n1 [j] > n2 [j]) {
+				printf ("%s %s\n", n1, n2);
+				g_assert_not_reached ();
+			} else if (n1 [j] < n2 [j]) {
+				break;
+			}
+		}
+	}
+#endif
+
+	return (SimdIntrinsic *)mono_binary_search (cmethod->name, intrinsics, size / sizeof (SimdIntrinsic), sizeof (SimdIntrinsic), &simd_intrinsic_info_compare_by_name);
+}
+
 static int
 type_to_expand_op (MonoType *type)
 {
@@ -181,6 +211,9 @@ emit_simd_ins (MonoCompile *cfg, MonoClass *klass, int opcode, int sreg1, int sr
 	} else if (spec [MONO_INST_DEST] == 'i') {
 		ins->dreg = alloc_ireg (cfg);
 		ins->type = STACK_I4;
+	} else if (spec [MONO_INST_DEST] == 'l') {
+		ins->dreg = alloc_lreg (cfg);
+		ins->type = STACK_I8;
 	}
 	ins->sreg1 = sreg1;
 	ins->sreg2 = sreg2;
@@ -639,44 +672,115 @@ emit_sys_numerics_vector_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSig
 
 #ifdef TARGET_AMD64
 
-static guint16 sse_methods [] = {
-	SN_Add,
-	SN_And,
-	SN_AndNot,
-	SN_CompareEqual,
-	SN_CompareNotEqual,
-	SN_Divide,
-	SN_LoadAlignedVector128,
-	SN_LoadVector128,
-	SN_MoveHighToLow,
-	SN_MoveLowToHigh,
-	SN_MoveMask,
-	SN_MoveScalar,
-	SN_Multiply,
-	SN_Or,
-	SN_Shuffle,
-	SN_Store,
-	SN_StoreAligned,
-	SN_Subtract,
-	SN_UnpackHigh,
-	SN_UnpackLow,
-	SN_Xor,
-	SN_get_IsSupported
+static SimdIntrinsic sse_methods [] = {
+	{SN_Add, OP_XBINOP, OP_FADD},
+	{SN_AddScalar, OP_SSE_ADDSS},
+	{SN_And, OP_SSE_AND},
+	{SN_AndNot, OP_SSE_ANDN},
+	{SN_CompareEqual, OP_XCOMPARE_FP, CMP_EQ},
+	{SN_CompareGreaterThan, OP_XCOMPARE_FP, CMP_GT},
+	{SN_CompareGreaterThanOrEqual, OP_XCOMPARE_FP, CMP_GE},
+	{SN_CompareLessThan, OP_XCOMPARE_FP, CMP_LT},
+	{SN_CompareLessThanOrEqual, OP_XCOMPARE_FP, CMP_LE},
+	{SN_CompareNotEqual, OP_XCOMPARE_FP, CMP_NE},
+	{SN_CompareNotGreaterThan, OP_XCOMPARE_FP, CMP_LE},
+	{SN_CompareNotGreaterThanOrEqual, OP_XCOMPARE_FP, CMP_LT},
+	{SN_CompareNotLessThan, OP_XCOMPARE_FP, CMP_GE},
+	{SN_CompareNotLessThanOrEqual, OP_XCOMPARE_FP, CMP_GT},
+	{SN_CompareOrdered, OP_XCOMPARE_FP, CMP_ORD},
+	{SN_CompareScalarEqual, OP_SSE_CMPSS, CMP_EQ},
+	{SN_CompareScalarGreaterThan, OP_SSE_CMPSS, CMP_GT},
+	{SN_CompareScalarGreaterThanOrEqual, OP_SSE_CMPSS, CMP_GE},
+	{SN_CompareScalarLessThan, OP_SSE_CMPSS, CMP_LT},
+	{SN_CompareScalarLessThanOrEqual, OP_SSE_CMPSS, CMP_LE},
+	{SN_CompareScalarNotEqual, OP_SSE_CMPSS, CMP_NE},
+	{SN_CompareScalarNotGreaterThan, OP_SSE_CMPSS, CMP_LE},
+	{SN_CompareScalarNotGreaterThanOrEqual, OP_SSE_CMPSS, CMP_LT},
+	{SN_CompareScalarNotLessThan, OP_SSE_CMPSS, CMP_GE},
+	{SN_CompareScalarNotLessThanOrEqual, OP_SSE_CMPSS, CMP_GT},
+	{SN_CompareScalarOrdered, OP_SSE_CMPSS, CMP_ORD},
+	{SN_CompareScalarOrderedEqual, OP_SSE_COMISS, CMP_EQ},
+	{SN_CompareScalarOrderedGreaterThan, OP_SSE_COMISS, CMP_GT},
+	{SN_CompareScalarOrderedGreaterThanOrEqual, OP_SSE_COMISS, CMP_GE},
+	{SN_CompareScalarOrderedLessThan, OP_SSE_COMISS, CMP_LT},
+	{SN_CompareScalarOrderedLessThanOrEqual, OP_SSE_COMISS, CMP_LE},
+	{SN_CompareScalarOrderedNotEqual, OP_SSE_COMISS, CMP_NE},
+	{SN_CompareScalarUnordered, OP_SSE_CMPSS, CMP_UNORD},
+	{SN_CompareScalarUnorderedEqual, OP_SSE_UCOMISS, CMP_EQ},
+	{SN_CompareScalarUnorderedGreaterThan, OP_SSE_UCOMISS, CMP_GT},
+	{SN_CompareScalarUnorderedGreaterThanOrEqual, OP_SSE_UCOMISS, CMP_GE},
+	{SN_CompareScalarUnorderedLessThan, OP_SSE_UCOMISS, CMP_LT},
+	{SN_CompareScalarUnorderedLessThanOrEqual, OP_SSE_UCOMISS, CMP_LE},
+	{SN_CompareScalarUnorderedNotEqual, OP_SSE_UCOMISS, CMP_NE},
+	{SN_CompareUnordered, OP_XCOMPARE_FP, CMP_UNORD},
+	{SN_ConvertScalarToVector128Single},
+	{SN_ConvertToInt32, OP_XOP_I4_X, SIMD_OP_SSE_CVTSS2SI},
+	{SN_ConvertToInt32WithTruncation, OP_XOP_I4_X, SIMD_OP_SSE_CVTTSS2SI},
+	{SN_ConvertToInt64, OP_XOP_I8_X, SIMD_OP_SSE_CVTSS2SI64},
+	{SN_ConvertToInt64WithTruncation, OP_XOP_I8_X, SIMD_OP_SSE_CVTTSS2SI64},
+	{SN_Divide, OP_XBINOP, OP_FDIV},
+	{SN_DivideScalar, OP_SSE_DIVSS},
+	{SN_LoadAlignedVector128, OP_SSE_LOADU, 16 /* alignment */},
+	{SN_LoadHigh, OP_SSE_MOVHPS_LOAD},
+	{SN_LoadLow, OP_SSE_MOVLPS_LOAD},
+	{SN_LoadScalarVector128, OP_SSE_MOVSS},
+	{SN_LoadVector128, OP_SSE_LOADU, 1 /* alignment */},
+	{SN_Max, OP_XOP_X_X_X, SIMD_OP_SSE_MAXPS},
+	{SN_MaxScalar, OP_XOP_X_X_X, SIMD_OP_SSE_MAXSS},
+	{SN_Min, OP_XOP_X_X_X, SIMD_OP_SSE_MINPS},
+	{SN_MinScalar, OP_XOP_X_X_X, SIMD_OP_SSE_MINSS},
+	{SN_MoveHighToLow, OP_SSE_MOVEHL},
+	{SN_MoveLowToHigh, OP_SSE_MOVELH},
+	{SN_MoveMask, OP_SSE_MOVMSK},
+	{SN_MoveScalar, OP_SSE_MOVS2},
+	{SN_Multiply, OP_XBINOP, OP_FMUL},
+	{SN_MultiplyScalar, OP_SSE_MULSS},
+	{SN_Or, OP_SSE_OR},
+	{SN_Prefetch0, OP_SSE_PREFETCHT0},
+	{SN_Prefetch1, OP_SSE_PREFETCHT1},
+	{SN_Prefetch2, OP_SSE_PREFETCHT2},
+	{SN_PrefetchNonTemporal, OP_SSE_PREFETCHNTA},
+	{SN_Reciprocal, OP_XOP_X_X, SIMD_OP_SSE_RCPPS},
+	{SN_ReciprocalScalar, 0, SIMD_OP_SSE_RCPSS},
+	{SN_ReciprocalSqrt, OP_XOP_X_X, SIMD_OP_SSE_RSQRTPS},
+	{SN_ReciprocalSqrtScalar, 0, SIMD_OP_SSE_RSQRTSS},
+	{SN_Shuffle},
+	{SN_Sqrt, OP_XOP_X_X, SIMD_OP_SSE_SQRTPS},
+	{SN_SqrtScalar, 0, SIMD_OP_SSE_SQRTSS},
+	{SN_Store, OP_SSE_STORE, 1 /* alignment */},
+	{SN_StoreAligned, OP_SSE_STORE, 16 /* alignment */},
+	{SN_StoreAlignedNonTemporal, OP_SSE_MOVNTPS},
+	{SN_StoreFence, OP_XOP, SIMD_OP_SSE_SFENCE},
+	{SN_StoreHigh, OP_SSE_MOVHPS_STORE},
+	{SN_StoreLow, OP_SSE_MOVLPS_STORE},
+	{SN_StoreScalar, OP_SSE_MOVSS_STORE},
+	{SN_Subtract, OP_XBINOP, OP_FSUB},
+	{SN_SubtractScalar, OP_SSE_SUBSS},
+	{SN_UnpackHigh, OP_SSE_UNPACKHI},
+	{SN_UnpackLow, OP_SSE_UNPACKLO},
+	{SN_Xor, OP_SSE_XOR},
+	{SN_get_IsSupported}
 };
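Note: lookup_intrins_info above resolves a method against this table with mono_binary_search keyed on the method name, so the entries must stay in strcmp order (the #if 0 block is a debug check for exactly that invariant, and is why SN_Shuffle has to sort before SN_Sqrt). A standalone sketch of the same lookup built on libc bsearch; the Entry type and find_entry helper are hypothetical stand-ins for SimdIntrinsic and mono_binary_search:

    #include <stdlib.h>
    #include <string.h>

    typedef struct { const char *name; int op; int instc0; } Entry;

    /* bsearch comparator: key is the method name, value an Entry. */
    static int
    cmp_entry (const void *key, const void *value)
    {
            return strcmp ((const char *)key, ((const Entry *)value)->name);
    }

    /* Returns NULL when the method is not in the (sorted!) table. */
    static const Entry *
    find_entry (const Entry *table, size_t n, const char *name)
    {
            return (const Entry *)bsearch (name, table, n, sizeof (Entry), cmp_entry);
    }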
 
 static guint16 sse2_methods [] = {
 	SN_Add,
 	SN_AddSaturate,
+	SN_AddScalar,
 	SN_And,
 	SN_AndNot,
+	SN_Average,
 	SN_CompareEqual,
 	SN_CompareGreaterThan,
 	SN_CompareLessThan,
 	SN_CompareNotEqual,
+	SN_CompareScalarEqual,
+	SN_ConvertScalarToVector128Double,
 	SN_ConvertScalarToVector128Int32,
 	SN_ConvertScalarToVector128Int64,
 	SN_ConvertScalarToVector128UInt32,
 	SN_ConvertScalarToVector128UInt64,
+	SN_ConvertToInt64,
+	SN_ConvertToInt64WithTruncation,
 	SN_ConvertToUInt32,
 	SN_ConvertToUInt64,
 	SN_LoadAlignedVector128,
@@ -751,50 +855,28 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
 	gboolean supported, is_64bit;
 	MonoClass *klass = cmethod->klass;
 	MonoTypeEnum arg0_type = fsig->param_count > 0 ? 
get_underlying_type (fsig->params [0]) : MONO_TYPE_VOID; + SimdIntrinsic *info; + gboolean is_corlib = m_class_get_image (cfg->method->klass) == mono_get_corlib (); if (is_hw_intrinsics_class (klass, "Sse", &is_64bit)) { if (!COMPILE_LLVM (cfg)) return NULL; - id = lookup_intrins (sse_methods, sizeof (sse_methods), cmethod); - if (id == -1) + info = lookup_intrins_info (sse_methods, sizeof (sse_methods), cmethod); + if (!info) return NULL; + int id = info->id; + + supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE) != 0; + + /* Common case */ + if (info->op != 0) + return emit_simd_ins_for_sig (cfg, klass, info->op, info->instc0, arg0_type, fsig, args); - supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE) != 0 && - m_class_get_image (cfg->method->klass) == mono_get_corlib (); // We only support the subset used by corelib - switch (id) { case SN_get_IsSupported: EMIT_NEW_ICONST (cfg, ins, supported ? 1 : 0); ins->type = STACK_I4; return ins; - case SN_LoadAlignedVector128: - return emit_simd_ins_for_sig (cfg, klass, OP_SSE_LOADU, 16 /*alignment*/, arg0_type, fsig, args); - case SN_LoadVector128: - return emit_simd_ins_for_sig (cfg, klass, OP_SSE_LOADU, 1 /*alignment*/, arg0_type, fsig, args); - case SN_MoveMask: - return emit_simd_ins_for_sig (cfg, klass, OP_SSE_MOVMSK, -1, arg0_type, fsig, args); - case SN_MoveScalar: - return emit_simd_ins_for_sig (cfg, klass, OP_SSE_MOVS2, -1, arg0_type, fsig, args); - case SN_CompareNotEqual: - return emit_simd_ins_for_sig (cfg, klass, OP_XCOMPARE_FP, CMP_NE, arg0_type, fsig, args); - case SN_CompareEqual: - return emit_simd_ins_for_sig (cfg, klass, OP_XCOMPARE_FP, CMP_EQ, arg0_type, fsig, args); - case SN_And: - return emit_simd_ins_for_sig (cfg, klass, OP_SSE_AND, -1, arg0_type, fsig, args); - case SN_AndNot: - return emit_simd_ins_for_sig (cfg, klass, OP_SSE_ANDN, -1, arg0_type, fsig, args); - case SN_Or: - return emit_simd_ins_for_sig (cfg, klass, OP_SSE_OR, -1, arg0_type, fsig, args); - case SN_Xor: - return emit_simd_ins_for_sig (cfg, klass, OP_SSE_XOR, -1, arg0_type, fsig, args); - case SN_Multiply: - return emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, OP_FMUL, arg0_type, fsig, args); - case SN_Divide: - return emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, OP_FDIV, arg0_type, fsig, args); - case SN_Add: - return emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, OP_FADD, arg0_type, fsig, args); - case SN_Subtract: - return emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, OP_FSUB, arg0_type, fsig, args); case SN_Shuffle: { if (args [2]->opcode != OP_ICONST) { mono_cfg_set_exception (cfg, MONO_EXCEPTION_MONO_ERROR); @@ -804,18 +886,23 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature } return emit_simd_ins_for_sig (cfg, klass, OP_SSE_SHUFFLE, args [2]->inst_c0 /*mask*/, arg0_type, fsig, args); } - case SN_MoveHighToLow: - return emit_simd_ins_for_sig (cfg, klass, OP_SSE_MOVEHL, -1, arg0_type, fsig, args); - case SN_MoveLowToHigh: - return emit_simd_ins_for_sig (cfg, klass, OP_SSE_MOVELH, -1, arg0_type, fsig, args); - case SN_UnpackLow: - return emit_simd_ins_for_sig (cfg, klass, OP_SSE_UNPACKLO, -1, arg0_type, fsig, args); - case SN_UnpackHigh: - return emit_simd_ins_for_sig (cfg, klass, OP_SSE_UNPACKHI, -1, arg0_type, fsig, args); - case SN_Store: - return emit_simd_ins_for_sig (cfg, klass, OP_SSE_STORE, 1 /*alignment*/, arg0_type, fsig, args); - case SN_StoreAligned: - return emit_simd_ins_for_sig (cfg, klass, OP_SSE_STORE, 16 /*alignment*/, arg0_type, fsig, args); + case 
SN_ConvertScalarToVector128Single:
+			if (fsig->params [1]->type == MONO_TYPE_I4)
+				return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_I4, SIMD_OP_SSE_CVTSI2SS, 0, fsig, args);
+			else if (fsig->params [1]->type == MONO_TYPE_I8)
+				return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_I8, SIMD_OP_SSE_CVTSI2SS64, 0, fsig, args);
+			else
+				g_assert_not_reached ();
+			break;
+		case SN_ReciprocalScalar:
+		case SN_ReciprocalSqrtScalar:
+		case SN_SqrtScalar:
+			if (fsig->param_count == 1)
+				return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X, info->instc0, arg0_type, fsig, args);
+			else
+				return NULL;
+		case SN_LoadScalarVector128:
+			return NULL;
 		default:
 			return NULL;
 		}
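Note: ReciprocalScalar, ReciprocalSqrtScalar and SqrtScalar are only intrinsified here in their one-argument form; the two-argument overloads (which merge the result into the upper lanes of another vector) fall back to managed code via the NULL return. RCPSS/RSQRTSS are also approximations with roughly 12 bits of precision rather than IEEE-exact results. A hedged C sketch of what the one-argument Sse.ReciprocalScalar computes:

    #include <xmmintrin.h>

    /* RCPSS: fast reciprocal approximation, relative error <= 1.5 * 2^-12. */
    static float
    recip_scalar (float x)
    {
            return _mm_cvtss_f32 (_mm_rcp_ss (_mm_set_ss (x)));
    }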
@@ -828,8 +915,7 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
 	if (id == -1)
 		return NULL;
 
-	supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE2) != 0 &&
-		m_class_get_image (cfg->method->klass) == mono_get_corlib (); // We only support the subset used by corelib
+	supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE2) != 0 && is_corlib; // We only support the subset used by corelib
 
 	switch (id) {
 	case SN_get_IsSupported: {
@@ -843,10 +929,19 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
 		return emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, arg0_type == MONO_TYPE_R8 ? OP_FADD : OP_IADD, arg0_type, fsig, args);
 	case SN_AddSaturate:
 		return emit_simd_ins_for_sig (cfg, klass, OP_SSE2_ADDS, -1, arg0_type, fsig, args);
+	case SN_AddScalar:
+		return emit_simd_ins_for_sig (cfg, klass, OP_SSE2_ADDSD, -1, arg0_type, fsig, args);
 	case SN_And:
 		return emit_simd_ins_for_sig (cfg, klass, OP_SSE_AND, -1, arg0_type, fsig, args);
 	case SN_AndNot:
 		return emit_simd_ins_for_sig (cfg, klass, OP_SSE_ANDN, -1, arg0_type, fsig, args);
+	case SN_Average:
+		if (arg0_type == MONO_TYPE_U1)
+			return emit_simd_ins_for_sig (cfg, klass, OP_PAVGB_UN, -1, arg0_type, fsig, args);
+		else if (arg0_type == MONO_TYPE_U2)
+			return emit_simd_ins_for_sig (cfg, klass, OP_PAVGW_UN, -1, arg0_type, fsig, args);
+		else
+			return NULL;
 	case SN_CompareNotEqual:
 		return emit_simd_ins_for_sig (cfg, klass, arg0_type == MONO_TYPE_R8 ? OP_XCOMPARE_FP : OP_XCOMPARE, CMP_NE, arg0_type, fsig, args);
 	case SN_CompareEqual:
@@ -855,6 +950,8 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
 		return emit_simd_ins_for_sig (cfg, klass, arg0_type == MONO_TYPE_R8 ? OP_XCOMPARE_FP : OP_XCOMPARE, CMP_GT, arg0_type, fsig, args);
 	case SN_CompareLessThan:
 		return emit_simd_ins_for_sig (cfg, klass, arg0_type == MONO_TYPE_R8 ? OP_XCOMPARE_FP : OP_XCOMPARE, CMP_LT, arg0_type, fsig, args);
+	case SN_CompareScalarEqual:
+		return emit_simd_ins_for_sig (cfg, klass, OP_SSE2_CMPSD, CMP_EQ, arg0_type, fsig, args);
 	case SN_ConvertScalarToVector128Int32:
 	case SN_ConvertScalarToVector128Int64:
 	case SN_ConvertScalarToVector128UInt32:
@@ -924,8 +1021,7 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
 	if (id == -1)
 		return NULL;
 
-	supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE3) != 0 &&
-		m_class_get_image (cfg->method->klass) == mono_get_corlib (); // We only support the subset used by corelib
+	supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE3) != 0 && is_corlib; // We only support the subset used by corelib
 
 	switch (id) {
 	case SN_get_IsSupported:
@@ -946,8 +1042,7 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
 	if (id == -1)
 		return NULL;
 
-	supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSSE3) != 0 &&
-		m_class_get_image (cfg->method->klass) == mono_get_corlib (); // We only support the subset used by corelib
+	supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSSE3) != 0 && is_corlib; // We only support the subset used by corelib
 
 	switch (id) {
 	case SN_get_IsSupported:
@@ -968,8 +1063,7 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
 	if (id == -1)
 		return NULL;
 
-	supported = COMPILE_LLVM (cfg) && (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE41) != 0 &&
-		m_class_get_image (cfg->method->klass) == mono_get_corlib (); // We only support the subset used by corelib
+	supported = COMPILE_LLVM (cfg) && (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE41) != 0 && is_corlib; // We only support the subset used by corelib
 
 	switch (id) {
 	case SN_get_IsSupported:
@@ -1360,7 +1454,8 @@ mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign
 #ifdef TARGET_AMD64
 	// TODO: test and enable for x86 too
 	if (!strcmp (class_ns, "System.Runtime.Intrinsics.X86")) {
-		return emit_x86_intrinsics (cfg ,cmethod, fsig, args);
+		MonoInst *ins = emit_x86_intrinsics (cfg, cmethod, fsig, args);
+		return ins;
 	}
 #endif
 
diff --git a/src/mono/mono/mini/simd-methods-netcore.h b/src/mono/mono/mini/simd-methods-netcore.h
index 067be67..64eec13 100644
--- a/src/mono/mono/mini/simd-methods-netcore.h
+++ b/src/mono/mono/mini/simd-methods-netcore.h
@@ -7,6 +7,8 @@ METHOD(LessThan)
 METHOD(LessThanOrEqual)
 METHOD(Min)
 METHOD(Max)
+METHOD(MinScalar)
+METHOD(MaxScalar)
 METHOD(PopCount)
 METHOD(LeadingZeroCount)
 METHOD(get_Count)
@@ -28,8 +30,10 @@ METHOD(op_Multiply)
 METHOD(op_Subtraction)
 // Vector
 METHOD(ConvertToInt32)
+METHOD(ConvertToInt32WithTruncation)
 METHOD(ConvertToUInt32)
 METHOD(ConvertToInt64)
+METHOD(ConvertToInt64WithTruncation)
 METHOD(ConvertToUInt64)
 METHOD(ConvertToSingle)
 METHOD(ConvertToDouble)
@@ -62,27 +66,82 @@ METHOD(ParallelBitDeposit)
 METHOD(ParallelBitExtract)
 // Sse
 METHOD(Add)
+METHOD(CompareGreaterThanOrEqual)
+METHOD(CompareLessThanOrEqual)
 METHOD(CompareNotEqual)
+METHOD(CompareNotGreaterThan)
+METHOD(CompareNotGreaterThanOrEqual)
+METHOD(CompareNotLessThan)
+METHOD(CompareNotLessThanOrEqual)
+METHOD(CompareScalarGreaterThan)
+METHOD(CompareScalarGreaterThanOrEqual)
+METHOD(CompareScalarLessThan)
+METHOD(CompareScalarLessThanOrEqual)
+METHOD(CompareScalarNotEqual)
+METHOD(CompareScalarNotGreaterThan)
+METHOD(CompareScalarNotGreaterThanOrEqual)
+METHOD(CompareScalarNotLessThan)
+METHOD(CompareScalarNotLessThanOrEqual)
+METHOD(CompareScalarOrderedEqual)
+METHOD(CompareScalarOrderedGreaterThan) +METHOD(CompareScalarOrderedGreaterThanOrEqual) +METHOD(CompareScalarOrderedLessThan) +METHOD(CompareScalarOrderedLessThanOrEqual) +METHOD(CompareScalarOrderedNotEqual) +METHOD(CompareScalarUnorderedEqual) +METHOD(CompareScalarUnorderedGreaterThan) +METHOD(CompareScalarUnorderedGreaterThanOrEqual) +METHOD(CompareScalarUnorderedLessThan) +METHOD(CompareScalarUnorderedLessThanOrEqual) +METHOD(CompareScalarUnorderedNotEqual) +METHOD(CompareOrdered) +METHOD(CompareUnordered) +METHOD(CompareScalarOrdered) +METHOD(CompareScalarUnordered) +METHOD(ConvertScalarToVector128Single) METHOD(Divide) +METHOD(DivideScalar) METHOD(Store) +METHOD(StoreFence) +METHOD(StoreHigh) +METHOD(StoreLow) METHOD(Subtract) +METHOD(SubtractScalar) METHOD(CompareEqual) +METHOD(LoadHigh) +METHOD(LoadLow) METHOD(LoadVector128) +METHOD(LoadScalarVector128) METHOD(MoveHighToLow) METHOD(MoveLowToHigh) METHOD(MoveMask) METHOD(MoveScalar) METHOD(Multiply) +METHOD(MultiplyScalar) METHOD(Shuffle) METHOD(UnpackHigh) METHOD(UnpackLow) +METHOD(Prefetch0) +METHOD(Prefetch1) +METHOD(Prefetch2) +METHOD(PrefetchNonTemporal) +METHOD(Reciprocal) +METHOD(ReciprocalScalar) +METHOD(ReciprocalSqrt) +METHOD(ReciprocalSqrtScalar) +METHOD(Sqrt) +METHOD(SqrtScalar) // Sse2 METHOD(AddSaturate) +METHOD(AddScalar) METHOD(And) +METHOD(Average) METHOD(Or) METHOD(LoadAlignedVector128) METHOD(Xor) METHOD(CompareGreaterThan) +METHOD(CompareScalarEqual) +METHOD(ConvertScalarToVector128Double) METHOD(ConvertScalarToVector128Int32) METHOD(ConvertScalarToVector128Int64) METHOD(ConvertScalarToVector128UInt32) @@ -90,10 +149,11 @@ METHOD(ConvertScalarToVector128UInt64) METHOD(PackUnsignedSaturate) METHOD(StoreScalar) METHOD(StoreAligned) +METHOD(StoreAlignedNonTemporal) METHOD(ShiftRightLogical) METHOD(CompareLessThan) // Sse3 METHOD(MoveAndDuplicate) // Sse41 METHOD(Insert) -METHOD(TestZ) \ No newline at end of file +METHOD(TestZ)
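Note: Sse.StoreAlignedNonTemporal maps to OP_SSE_MOVNTPS above, which emits a 16-byte aligned store tagged with !nontemporal metadata so LLVM selects MOVNTPS; the instruction faults on unaligned addresses, hence the "Fix MOVNTPS alignment" change in the commit message. A C-level sketch of the same operation:

    #include <xmmintrin.h>

    /* Non-temporal store: bypasses the cache hierarchy; dst must be
       16-byte aligned or MOVNTPS raises #GP. */
    static void
    store_nontemporal (float *dst, __m128 v)
    {
            _mm_stream_ps (dst, v);
    }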