[jit] Implement support for all Sse1 intrinsics for netcore. (#33356)
authorZoltan Varga <vargaz@gmail.com>
Mon, 9 Mar 2020 22:37:37 +0000 (18:37 -0400)
committerGitHub <noreply@github.com>
Mon, 9 Mar 2020 22:37:37 +0000 (18:37 -0400)
* [jit] Implement support for all Sse1 intrinsics for netcore.

* Add generic OP_XOP opcodes for opcodes which the JIT doesn't care about.
  Add a SimdOp enum to list the operations performed by these opcodes.
* Add a SimdIntrinsic struct so the mapping between the .net methods
  and the JIT opcodes can be specified declaratively.
* Add all intrinsics from the Sse class.

* Fix UnpackHigh/UnpackLow.

* Implement missing load/store intrinsics.

* Implement missing opcodes.

* Fix nontemporal metadata.

* Fix MOVNTPS alignment.

* Fix OP_XOP_X_X.

src/mono/mono/mini/llvm-intrinsics.h
src/mono/mono/mini/mini-llvm.c
src/mono/mono/mini/mini-ops.h
src/mono/mono/mini/mini.h
src/mono/mono/mini/simd-intrinsics-netcore.c
src/mono/mono/mini/simd-methods-netcore.h

index 4520b90..19ffaee 100644 (file)
@@ -59,6 +59,7 @@ INTRINS_OVR(CTLZ_I32, ctlz)
 INTRINS_OVR(CTLZ_I64, ctlz)
 INTRINS_OVR(CTTZ_I32, cttz)
 INTRINS_OVR(CTTZ_I64, cttz)
+INTRINS(PREFETCH, prefetch)
 INTRINS(BZHI_I32, x86_bmi_bzhi_32)
 INTRINS(BZHI_I64, x86_bmi_bzhi_64)
 INTRINS(BEXTR_I32, x86_bmi_bextr_32)
@@ -81,8 +82,11 @@ INTRINS(SSE_PSRLI_Q, x86_sse2_psrli_q)
 INTRINS(SSE_PSLLI_Q, x86_sse2_pslli_q)
 INTRINS(SSE_SQRT_PD, x86_sse2_sqrt_pd)
 INTRINS(SSE_SQRT_PS, x86_sse_sqrt_ps)
-INTRINS(SSE_RSQRT_PS, x86_sse_rsqrt_ps)
 INTRINS(SSE_RCP_PS, x86_sse_rcp_ps)
+INTRINS(SSE_RSQRT_PS, x86_sse_rsqrt_ps)
+INTRINS(SSE_SQRT_SS, x86_sse_sqrt_ss)
+INTRINS(SSE_RCP_SS, x86_sse_rcp_ss)
+INTRINS(SSE_RSQRT_SS, x86_sse_rsqrt_ss)
 INTRINS(SSE_CVTTPD2DQ, x86_sse2_cvttpd2dq)
 INTRINS(SSE_CVTTPS2DQ, x86_sse2_cvttps2dq)
 INTRINS(SSE_CVTDQ2PS, x86_sse2_cvtdq2ps)
@@ -149,6 +153,7 @@ INTRINS(SSE_ROUNDSS, x86_sse41_round_ss)
 INTRINS(SSE_ROUNDPD, x86_sse41_round_pd)
 INTRINS(SSE_PTESTZ, x86_sse41_ptestz)
 INTRINS(SSE_INSERTPS, x86_sse41_insertps)
+INTRINS(SSE_SFENCE, x86_sse_sfence)
 #if LLVM_API_VERSION >= 800
        // these intrinsics were renamed in LLVM 8
 INTRINS_OVR(SSE_SADD_SATI8, sadd_sat)
index be17a2e..137cf15 100644 (file)
@@ -276,6 +276,8 @@ static LLVMRealPredicate fpcond_to_llvm_cond [] = {
        LLVMRealUGE,
        LLVMRealULT,
        LLVMRealUGT,
+       LLVMRealORD,
+       LLVMRealUNO
 };
 
 static MonoLLVMModule aot_module;
@@ -306,7 +308,7 @@ set_failure (EmitContext *ctx, const char *message)
 }
 
 static LLVMValueRef
-ConstInt32 (int v)
+const_int32 (int v)
 {
        return LLVMConstInt (LLVMInt32Type (), v, FALSE);
 }
@@ -2137,6 +2139,20 @@ set_nonnull_load_flag (LLVMValueRef v)
 }
 
 static void
+set_nontemporal_flag (LLVMValueRef v)
+{
+       LLVMValueRef md_arg;
+       int md_kind;
+       const char *flag_name;
+
+       // FIXME: Cache this
+       flag_name = "nontemporal";
+       md_kind = LLVMGetMDKindID (flag_name, strlen (flag_name));
+       md_arg = const_int32 (1);
+       LLVMSetMetadata (v, md_kind, LLVMMDNode (&md_arg, 1));
+}
+
+static void
 set_invariant_load_flag (LLVMValueRef v)
 {
        LLVMValueRef md_arg;
@@ -5701,7 +5717,7 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
                        v = mono_llvm_build_alloca (builder, LLVMInt8Type (), LLVMConstInt (LLVMInt32Type (), size, FALSE), MONO_ARCH_FRAME_ALIGNMENT, "");
 
                        if (ins->flags & MONO_INST_INIT)
-                               emit_memset (ctx, builder, v, ConstInt32 (size), MONO_ARCH_FRAME_ALIGNMENT);
+                               emit_memset (ctx, builder, v, const_int32 (size), MONO_ARCH_FRAME_ALIGNMENT);
 
                        values [ins->dreg] = v;
                        break;
@@ -6559,7 +6575,7 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
                        if (!addresses [ins->dreg])
                                addresses [ins->dreg] = build_alloca (ctx, m_class_get_byval_arg (klass));
                        LLVMValueRef ptr = LLVMBuildBitCast (builder, addresses [ins->dreg], LLVMPointerType (LLVMInt8Type (), 0), "");
-                       emit_memset (ctx, builder, ptr, ConstInt32 (mono_class_value_size (klass, NULL)), 0);
+                       emit_memset (ctx, builder, ptr, const_int32 (mono_class_value_size (klass, NULL)), 0);
                        break;
                }
                case OP_DUMMY_VZERO:
@@ -7456,6 +7472,47 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
                        values [ins->dreg] = mono_llvm_build_aligned_load (builder, dst_vec, "", FALSE, ins->inst_c0); // inst_c0 is alignment
                        break;
                }
+               case OP_SSE_MOVSS: {
+                       LLVMValueRef addr = convert (ctx, lhs, LLVMPointerType (LLVMFloatType (), 0));
+                       LLVMValueRef val = mono_llvm_build_load (builder, addr, "", FALSE);
+                       values [ins->dreg] = LLVMBuildInsertElement (builder, LLVMConstNull (type_to_sse_type (ins->inst_c1)), val, LLVMConstInt (LLVMInt32Type (), 0, FALSE), "");
+                       break;
+               }
+               case OP_SSE_MOVSS_STORE: {
+                       LLVMValueRef addr = convert (ctx, lhs, LLVMPointerType (LLVMFloatType (), 0));
+                       LLVMValueRef val = LLVMBuildExtractElement (builder, rhs, LLVMConstInt (LLVMInt32Type (), 0, FALSE), "");
+                       mono_llvm_build_store (builder, val, addr, FALSE, LLVM_BARRIER_NONE);
+                       break;
+               }
+
+               case OP_SSE_MOVLPS_LOAD:
+               case OP_SSE_MOVHPS_LOAD: {
+                       /* Load two floats from rhs and store them in the low/high part of lhs */
+                       LLVMValueRef addr = rhs;
+                       LLVMValueRef addr1 = convert (ctx, addr, LLVMPointerType (LLVMFloatType (), 0));
+                       LLVMValueRef addr2 = convert (ctx, LLVMBuildAdd (builder, convert (ctx, addr, IntPtrType ()), convert (ctx, LLVMConstInt (LLVMInt32Type (), 4, FALSE), IntPtrType ()), ""), LLVMPointerType (LLVMFloatType (), 0));
+                       LLVMValueRef val1 = mono_llvm_build_load (builder, addr1, "", FALSE);
+                       LLVMValueRef val2 = mono_llvm_build_load (builder, addr2, "", FALSE);
+                       int index1 = ins->opcode == OP_SSE_MOVLPS_LOAD ? 0 : 2;
+                       int index2 = ins->opcode == OP_SSE_MOVLPS_LOAD ? 1 : 3;
+                       values [ins->dreg] = LLVMBuildInsertElement (builder, LLVMBuildInsertElement (builder, lhs, val1, LLVMConstInt (LLVMInt32Type (), index1, FALSE), ""), val2, LLVMConstInt (LLVMInt32Type (), index2, FALSE), "");
+                       break;
+               }
+
+               case OP_SSE_MOVLPS_STORE:
+               case OP_SSE_MOVHPS_STORE: {
+                       /* Store two floats from the low/high part of rhs into lhs */
+                       LLVMValueRef addr = lhs;
+                       LLVMValueRef addr1 = convert (ctx, addr, LLVMPointerType (LLVMFloatType (), 0));
+                       LLVMValueRef addr2 = convert (ctx, LLVMBuildAdd (builder, convert (ctx, addr, IntPtrType ()), convert (ctx, LLVMConstInt (LLVMInt32Type (), 4, FALSE), IntPtrType ()), ""), LLVMPointerType (LLVMFloatType (), 0));
+                       int index1 = ins->opcode == OP_SSE_MOVLPS_STORE ? 0 : 2;
+                       int index2 = ins->opcode == OP_SSE_MOVLPS_STORE ? 1 : 3;
+                       LLVMValueRef val1 = LLVMBuildExtractElement (builder, rhs, LLVMConstInt (LLVMInt32Type (), index1, FALSE), "");
+                       LLVMValueRef val2 = LLVMBuildExtractElement (builder, rhs, LLVMConstInt (LLVMInt32Type (), index2, FALSE), "");
+                       mono_llvm_build_store (builder, val1, addr1, FALSE, LLVM_BARRIER_NONE);
+                       mono_llvm_build_store (builder, val2, addr2, FALSE, LLVM_BARRIER_NONE);
+                       break;
+               }
 
                case OP_SSE_STORE: {
                        LLVMValueRef dst_vec = convert (ctx, lhs, LLVMPointerType (LLVMTypeOf (rhs), 0));
@@ -7469,6 +7526,35 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
                        mono_llvm_build_aligned_store (builder, first_elem, dst, FALSE, 1);
                        break;
                }
+               case OP_SSE_MOVNTPS: {
+                       LLVMValueRef store = mono_llvm_build_aligned_store (builder, rhs, lhs, FALSE, 16);
+                       set_nontemporal_flag (store);
+                       break;
+               }
+               case OP_SSE_PREFETCHT0: {
+                       LLVMValueRef addr = convert (ctx, lhs, LLVMPointerType (LLVMInt8Type (), 0));
+                       LLVMValueRef args [] = { addr, const_int32 (0), const_int32 (3), const_int32 (1) };
+                       call_intrins (ctx, INTRINS_PREFETCH, args, "");
+                       break;
+               }
+               case OP_SSE_PREFETCHT1: {
+                       LLVMValueRef addr = convert (ctx, lhs, LLVMPointerType (LLVMInt8Type (), 0));
+                       LLVMValueRef args [] = { addr, const_int32 (0), const_int32 (2), const_int32 (1) };
+                       call_intrins (ctx, INTRINS_PREFETCH, args, "");
+                       break;
+               }
+               case OP_SSE_PREFETCHT2: {
+                       LLVMValueRef addr = convert (ctx, lhs, LLVMPointerType (LLVMInt8Type (), 0));
+                       LLVMValueRef args [] = { addr, const_int32 (0), const_int32 (1), const_int32 (1) };
+                       call_intrins (ctx, INTRINS_PREFETCH, args, "");
+                       break;
+               }
+               case OP_SSE_PREFETCHNTA: {
+                       LLVMValueRef addr = convert (ctx, lhs, LLVMPointerType (LLVMInt8Type (), 0));
+                       LLVMValueRef args [] = { addr, const_int32 (0), const_int32 (0), const_int32 (1) };
+                       call_intrins (ctx, INTRINS_PREFETCH, args, "");
+                       break;
+               }
 
                case OP_SSE_SHUFFLE: {
                        LLVMValueRef shuffle_vec = create_const_vector_4_i32 (
@@ -7536,24 +7622,169 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
                        break;
                }
 
-               case OP_SSE2_ADDS: {
-                       gint32 intrinsic = 0;
-                       if (ins->inst_c1 == MONO_TYPE_I1)
-                               intrinsic = INTRINS_SSE_SADD_SATI8;
-                       else if (ins->inst_c1 == MONO_TYPE_U1)
-                               intrinsic = INTRINS_SSE_UADD_SATI8;
-                       else if (ins->inst_c1 == MONO_TYPE_I2)
-                               intrinsic = INTRINS_SSE_SADD_SATI16;
-                       else if (ins->inst_c1 == MONO_TYPE_U2)
-                               intrinsic = INTRINS_SSE_UADD_SATI16;
-                       else
+               case OP_SSE_ADDSS:
+               case OP_SSE_SUBSS:
+               case OP_SSE_DIVSS:
+               case OP_SSE_MULSS:
+               case OP_SSE2_ADDSD: {
+                       LLVMValueRef v1 = LLVMBuildExtractElement (builder, lhs, LLVMConstInt (LLVMInt32Type (), 0, FALSE), "");
+                       LLVMValueRef v2 = LLVMBuildExtractElement (builder, rhs, LLVMConstInt (LLVMInt32Type (), 0, FALSE), "");
+
+                       LLVMValueRef v = NULL;
+                       switch (ins->opcode) {
+                       case OP_SSE_ADDSS:
+                       case OP_SSE2_ADDSD:
+                               v = LLVMBuildFAdd (builder, v1, v2, "");
+                               break;
+                       case OP_SSE_SUBSS:
+                               v = LLVMBuildFSub (builder, v1, v2, "");
+                               break;
+                       case OP_SSE_DIVSS:
+                               v = LLVMBuildFDiv (builder, v1, v2, "");
+                               break;
+                       case OP_SSE_MULSS:
+                               v = LLVMBuildFMul (builder, v1, v2, "");
+                               break;
+                       default:
+                               g_assert_not_reached ();
+                       }
+                       values [ins->dreg] = LLVMBuildInsertElement (builder, lhs, v, LLVMConstInt (LLVMInt32Type (), 0, FALSE), "");
+                       break;
+               }
+
+               case OP_SSE_CMPSS:
+               case OP_SSE2_CMPSD: {
+                       int imm = -1;
+                       switch (ins->inst_c0) {
+                       case CMP_EQ: imm = 0; break;
+                       case CMP_GT: imm = 6; break;
+                       case CMP_GE: imm = 5; break;
+                       case CMP_LT: imm = 1; break;
+                       case CMP_LE: imm = 2; break;
+                       case CMP_NE: imm = 4; break;
+                       case CMP_ORD: imm = 7; break;
+                       case CMP_UNORD: imm = 3; break;
+                       default: g_assert_not_reached (); break;
+                       }
+                       LLVMValueRef cmp = LLVMConstInt (LLVMInt8Type (), imm, FALSE);
+                       LLVMValueRef args [] = { lhs, rhs, cmp };
+                       switch (ins->opcode) {
+                       case OP_SSE_CMPSS:
+                               values [ins->dreg] = call_intrins (ctx, INTRINS_SSE_CMPSS, args, "");
+                               break;
+                       case OP_SSE2_CMPSD:
+                               values [ins->dreg] = call_intrins (ctx, INTRINS_SSE_CMPSD, args, "");
+                               break;
+                       default:
                                g_assert_not_reached ();
+                               break;
+                       }
+                       break;
+               }
+               case OP_SSE_COMISS: {
+                       LLVMValueRef args [] = { lhs, rhs };
+                       IntrinsicId id = (IntrinsicId)0;
+                       switch (ins->inst_c0) {
+                       case CMP_EQ: id = INTRINS_SSE_COMIEQ_SS; break;
+                       case CMP_GT: id = INTRINS_SSE_COMIGT_SS; break;
+                       case CMP_GE: id = INTRINS_SSE_COMIGE_SS; break;
+                       case CMP_LT: id = INTRINS_SSE_COMILT_SS; break;
+                       case CMP_LE: id = INTRINS_SSE_COMILE_SS; break;
+                       case CMP_NE: id = INTRINS_SSE_COMINEQ_SS; break;
+                       default: g_assert_not_reached (); break;
+                       }
+                       values [ins->dreg] = call_intrins (ctx, id, args, "");
+                       break;
+               }
+               case OP_SSE_UCOMISS: {
+                       LLVMValueRef args [] = { lhs, rhs };
+                       IntrinsicId id = (IntrinsicId)0;
+                       switch (ins->inst_c0) {
+                       case CMP_EQ: id = INTRINS_SSE_UCOMIEQ_SS; break;
+                       case CMP_GT: id = INTRINS_SSE_UCOMIGT_SS; break;
+                       case CMP_GE: id = INTRINS_SSE_UCOMIGE_SS; break;
+                       case CMP_LT: id = INTRINS_SSE_UCOMILT_SS; break;
+                       case CMP_LE: id = INTRINS_SSE_UCOMILE_SS; break;
+                       case CMP_NE: id = INTRINS_SSE_UCOMINEQ_SS; break;
+                       default: g_assert_not_reached (); break;
+                       }
+                       values [ins->dreg] = call_intrins (ctx, id, args, "");
+                       break;
+               }
+               case OP_XOP: {
+                       IntrinsicId id = (IntrinsicId)0;
+                       switch (ins->inst_c0) {
+                       case SIMD_OP_SSE_SFENCE: id = INTRINS_SSE_SFENCE; break;
+                       default: g_assert_not_reached (); break;
+                       }
+                       call_intrins (ctx, id, NULL, "");
+                       break;
+               }
+               case OP_XOP_X_X: {
+                       IntrinsicId id = (IntrinsicId)0;
+                       switch (ins->inst_c0) {
+                       case SIMD_OP_SSE_SQRTPS: id = INTRINS_SSE_SQRT_PS; break;
+                       case SIMD_OP_SSE_RCPPS: id = INTRINS_SSE_RCP_PS; break;
+                       case SIMD_OP_SSE_RSQRTPS: id = INTRINS_SSE_RSQRT_PS; break;
+                       case SIMD_OP_SSE_SQRTSS: id = INTRINS_SSE_SQRT_SS; break;
+                       case SIMD_OP_SSE_RCPSS: id = INTRINS_SSE_RCP_SS; break;
+                       case SIMD_OP_SSE_RSQRTSS: id = INTRINS_SSE_RSQRT_SS; break;
+                       default: g_assert_not_reached (); break;
+                       }
+                       values [ins->dreg] = call_intrins (ctx, id, &lhs, "");
+                       break;
+               }
+               case OP_XOP_I4_X:
+               case OP_XOP_I8_X: {
+                       IntrinsicId id = (IntrinsicId)0;
+                       switch (ins->inst_c0) {
+                       case SIMD_OP_SSE_CVTSS2SI: id = INTRINS_SSE_CVTSS2SI; break;
+                       case SIMD_OP_SSE_CVTTSS2SI: id = INTRINS_SSE_CVTTSS2SI; break;
+                       case SIMD_OP_SSE_CVTSS2SI64: id = INTRINS_SSE_CVTSS2SI64; break;
+                       case SIMD_OP_SSE_CVTTSS2SI64: id = INTRINS_SSE_CVTTSS2SI64; break;
+                       case SIMD_OP_SSE_CVTSD2SI: id = INTRINS_SSE_CVTSD2SI; break;
+                       case SIMD_OP_SSE_CVTSD2SI64: id = INTRINS_SSE_CVTSD2SI64; break;
+                       case SIMD_OP_SSE_CVTTSD2SI64: id = INTRINS_SSE_CVTTSD2SI64; break;
+                       default: g_assert_not_reached (); break;
+                       }
+                       values [ins->dreg] = call_intrins (ctx, id, &lhs, "");
+                       break;
+               }
+               case OP_XOP_X_X_X:
+               case OP_XOP_X_X_I4:
+               case OP_XOP_X_X_I8: {
+                       LLVMValueRef args [] = { lhs, rhs };
+                       IntrinsicId id = (IntrinsicId)0;
+                       switch (ins->inst_c0) {
+                       case SIMD_OP_SSE_CVTSI2SS: id = INTRINS_SSE_CVTSI2SS; break;
+                       case SIMD_OP_SSE_CVTSI2SS64: id = INTRINS_SSE_CVTSI2SS64; break;
+                       case SIMD_OP_SSE_CVTSI2SD: id = INTRINS_SSE_CVTSI2SD; break;
+                       case SIMD_OP_SSE_CVTSI2SD64: id = INTRINS_SSE_CVTSI2SD64; break;
+                       case SIMD_OP_SSE_MAXPS: id = INTRINS_SSE_MAXPS; break;
+                       case SIMD_OP_SSE_MAXSS: id = INTRINS_SSE_MAXSS; break;
+                       case SIMD_OP_SSE_MINPS: id = INTRINS_SSE_MINPS; break;
+                       case SIMD_OP_SSE_MINSS: id = INTRINS_SSE_MINSS; break;
+                       default: g_assert_not_reached (); break;
+                       }
+                       values [ins->dreg] = call_intrins (ctx, id, args, "");
+                       break;
+               }
+
+               case OP_SSE2_ADDS: {
+                       IntrinsicId id = (IntrinsicId)0;
+                       switch (ins->inst_c1) {
+                       case MONO_TYPE_I1: id = INTRINS_SSE_SADD_SATI8; break;
+                       case MONO_TYPE_U1: id = INTRINS_SSE_UADD_SATI8; break;
+                       case MONO_TYPE_I2: id = INTRINS_SSE_SADD_SATI16; break;
+                       case MONO_TYPE_U2: id = INTRINS_SSE_UADD_SATI16; break;
+                       default: g_assert_not_reached (); break;
+                       }
 
                        LLVMValueRef args [2];
                        args [0] = convert (ctx, lhs, type_to_sse_type (ins->inst_c1));
                        args [1] = convert (ctx, rhs, type_to_sse_type (ins->inst_c1));
                        values [ins->dreg] = convert (ctx, 
-                               call_intrins (ctx, intrinsic, args, dname),
+                               call_intrins (ctx, id, args, dname),
                                type_to_sse_type (ins->inst_c1));
                        break;
                }
index 325e0de..e0cd478 100644 (file)
@@ -1021,12 +1021,33 @@ MINI_OP(OP_SSE_AND, "sse_and", XREG, XREG, XREG)
 MINI_OP(OP_SSE_OR, "sse_or", XREG, XREG, XREG)
 MINI_OP(OP_SSE_XOR, "sse_xor", XREG, XREG, XREG)
 MINI_OP(OP_SSE_ANDN, "sse_andn", XREG, XREG, XREG)
+MINI_OP(OP_SSE_ADDSS, "sse_addss", XREG, XREG, XREG)
+MINI_OP(OP_SSE_SUBSS, "sse_subss", XREG, XREG, XREG)
+MINI_OP(OP_SSE_DIVSS, "sse_divss", XREG, XREG, XREG)
+MINI_OP(OP_SSE_MULSS, "sse_mulss", XREG, XREG, XREG)
+MINI_OP(OP_SSE_CMPSS, "sse_cmpss", XREG, XREG, XREG)
+MINI_OP(OP_SSE_COMISS, "sse_comiss", IREG, XREG, XREG)
+MINI_OP(OP_SSE_UCOMISS, "sse_ucomiss", IREG, XREG, XREG)
+MINI_OP(OP_SSE_MOVSS, "sse_movss", XREG, IREG, NONE)
+MINI_OP(OP_SSE_MOVSS_STORE, "sse_movss_store", NONE, IREG, XREG)
+MINI_OP(OP_SSE_MOVHPS_LOAD, "sse_movhps_load", XREG, XREG, IREG)
+MINI_OP(OP_SSE_MOVLPS_LOAD, "sse_movlps_load", XREG, XREG, IREG)
+MINI_OP(OP_SSE_MOVHPS_STORE, "sse_movhps_store", NONE, IREG, XREG)
+MINI_OP(OP_SSE_MOVLPS_STORE, "sse_movlps_store", NONE, IREG, XREG)
+MINI_OP(OP_SSE_MOVNTPS, "sse_movntps", NONE, IREG, XREG)
+MINI_OP(OP_SSE_PREFETCHT0, "sse_prefetcht0", NONE, IREG, NONE)
+MINI_OP(OP_SSE_PREFETCHT1, "sse_prefetcht1", NONE, IREG, NONE)
+MINI_OP(OP_SSE_PREFETCHT2, "sse_prefetcht2", NONE, IREG, NONE)
+MINI_OP(OP_SSE_PREFETCHNTA, "sse_prefetchnta", NONE, IREG, NONE)
 
 /* sse 2 */
 MINI_OP(OP_SSE2_PACKUS, "sse2_packus", XREG, XREG, XREG)
 MINI_OP(OP_SSE2_SRLI, "sse2_srli", XREG, XREG, XREG)
 MINI_OP(OP_SSE2_SHUFFLE, "sse2_shuffle", XREG, XREG, XREG)
 MINI_OP(OP_SSE2_ADDS, "sse2_adds", XREG, XREG, XREG)
+MINI_OP(OP_SSE2_ADDSD, "sse2_addsd", XREG, XREG, XREG)
+MINI_OP(OP_SSE2_CMPSD, "sse2_cmpsd", XREG, XREG, XREG)
+MINI_OP(OP_SSE2_COMIEQ_SD, "sse2_comieq_sd", XREG, XREG, XREG)
 
 /* sse 3 */
 MINI_OP(OP_SSE3_MOVDDUP, "sse3_movddup", XREG, XREG, NONE)
@@ -1440,8 +1461,20 @@ MINI_OP(OP_XEQUAL, "xequal", IREG, XREG, XREG)
 /* Per element compate, inst_c0 contains a CompRelation */
 MINI_OP(OP_XCOMPARE, "xcompare", XREG, XREG, XREG)
 MINI_OP(OP_XCOMPARE_FP, "xcompare_fp", XREG, XREG, XREG)
-/* Binary op, inst_c0 contains the operation */
+
+/*
+ * Generic SIMD operations, the rest of the JIT doesn't care about the exact operation.
+ */
 MINI_OP(OP_XBINOP, "xbinop", XREG, XREG, XREG)
+/* inst_c0 contains a SimdOp, inst_c1 might contain additional data */
+MINI_OP(OP_XOP, "xop", NONE, NONE, NONE)
+MINI_OP(OP_XOP_X_X, "xop_x_x", XREG, XREG, NONE)
+MINI_OP(OP_XOP_I4_X, "xop_i4_x", IREG, XREG, NONE)
+MINI_OP(OP_XOP_I8_X, "xop_i8_x", LREG, XREG, NONE)
+MINI_OP(OP_XOP_X_X_X, "xop_x_x_x", XREG, XREG, XREG)
+MINI_OP(OP_XOP_X_X_I4, "xop_x_x_i4", XREG, XREG, IREG)
+MINI_OP(OP_XOP_X_X_I8, "xop_x_x_i8", XREG, XREG, LREG)
+
 MINI_OP(OP_XCAST, "xcast", XREG, XREG, NONE)
 /* Extract element of vector */
 /* The index is assumed to be in range */
@@ -1456,5 +1489,3 @@ MINI_OP(OP_LZCNT32, "lzcnt32", IREG, IREG, NONE)
 MINI_OP(OP_LZCNT64, "lzcnt64", LREG, LREG, NONE)
 MINI_OP(OP_POPCNT32, "popcnt32", IREG, IREG, NONE)
 MINI_OP(OP_POPCNT64, "popcnt64", LREG, LREG, NONE)
-
-
index 0b46f65..f0e9cdb 100644 (file)
@@ -1904,7 +1904,9 @@ typedef enum {
        CMP_LE_UN,
        CMP_GE_UN,
        CMP_LT_UN,
-       CMP_GT_UN
+       CMP_GT_UN,
+       CMP_ORD,
+       CMP_UNORD
 } CompRelation;
 
 typedef enum {
@@ -2881,6 +2883,32 @@ enum {
        SIMD_PREFETCH_MODE_2,
 };
 
+/* SIMD operations */
+typedef enum {
+       SIMD_OP_SSE_CVTSS2SI,
+       SIMD_OP_SSE_CVTTSS2SI,
+       SIMD_OP_SSE_CVTSS2SI64,
+       SIMD_OP_SSE_CVTTSS2SI64,
+       SIMD_OP_SSE_CVTSD2SI,
+       SIMD_OP_SSE_CVTSD2SI64,
+       SIMD_OP_SSE_CVTTSD2SI64,
+       SIMD_OP_SSE_CVTSI2SS,
+       SIMD_OP_SSE_CVTSI2SS64,
+       SIMD_OP_SSE_CVTSI2SD,
+       SIMD_OP_SSE_CVTSI2SD64,
+       SIMD_OP_SSE_MAXPS,
+       SIMD_OP_SSE_MAXSS,
+       SIMD_OP_SSE_MINPS,
+       SIMD_OP_SSE_MINSS,
+       SIMD_OP_SSE_SFENCE,
+       SIMD_OP_SSE_SQRTPS,
+       SIMD_OP_SSE_RCPPS,
+       SIMD_OP_SSE_RSQRTPS,
+       SIMD_OP_SSE_SQRTSS,
+       SIMD_OP_SSE_RCPSS,
+       SIMD_OP_SSE_RSQRTSS
+} SimdOp;
+
 const char *mono_arch_xregname (int reg);
 guint32     mono_arch_cpu_enumerate_simd_versions (void);
 MonoCPUFeatures mono_arch_get_cpu_features (void);
index fc5b621..ea8a8da 100644 (file)
@@ -55,6 +55,15 @@ enum {
 
 static int register_size;
 
+typedef struct {
+       // One of the SN_ constants
+       guint16 id;
+       // ins->opcode
+       int op;
+       // ins->inst_c0
+       int instc0;
+} SimdIntrinsic;
+
 void
 mono_simd_intrinsics_init (void)
 {
@@ -80,18 +89,16 @@ simd_intrinsic_compare_by_name (const void *key, const void *value)
 }
 
 static int
+simd_intrinsic_info_compare_by_name (const void *key, const void *value)
+{
+       SimdIntrinsic *info = (SimdIntrinsic*)value;
+       return strcmp ((const char*)key, method_name (info->id));
+}
+
+static int
 lookup_intrins (guint16 *intrinsics, int size, MonoMethod *cmethod)
 {
        const guint16 *result = (const guint16 *)mono_binary_search (cmethod->name, intrinsics, size / sizeof (guint16), sizeof (guint16), &simd_intrinsic_compare_by_name);
-
-#if FALSE
-       for (int i = 0; i < (size / sizeof (guint16)) - 1; ++i) {
-               if (method_name (intrinsics [i])[0] > method_name (intrinsics [i + 1])[0]) {
-                       printf ("%s %s\n",method_name (intrinsics [i]), method_name (intrinsics [i + 1]));
-                       g_assert_not_reached ();
-               }
-       }
-#endif
        
        if (result == NULL)
                return -1;
@@ -99,6 +106,29 @@ lookup_intrins (guint16 *intrinsics, int size, MonoMethod *cmethod)
                return (int)*result;
 }
 
+static SimdIntrinsic*
+lookup_intrins_info (SimdIntrinsic *intrinsics, int size, MonoMethod *cmethod)
+{
+#if 0
+       for (int i = 0; i < (size / sizeof (SimdIntrinsic)) - 1; ++i) {
+               const char *n1 = method_name (intrinsics [i].id);
+               const char *n2 = method_name (intrinsics [i + 1].id);
+               int len1 = strlen (n1);
+               int len2 = strlen (n2);
+               for (int j = 0; j < len1 && j < len2; ++j) {
+                       if (n1 [j] > n2 [j]) {
+                               printf ("%s %s\n", n1, n2);
+                               g_assert_not_reached ();
+                       } else if (n1 [j] < n2 [j]) {
+                               break;
+                       }
+               }
+       }
+#endif
+
+       return (SimdIntrinsic *)mono_binary_search (cmethod->name, intrinsics, size / sizeof (SimdIntrinsic), sizeof (SimdIntrinsic), &simd_intrinsic_info_compare_by_name);
+}
+
 static int
 type_to_expand_op (MonoType *type)
 {
@@ -181,6 +211,9 @@ emit_simd_ins (MonoCompile *cfg, MonoClass *klass, int opcode, int sreg1, int sr
        } else if (spec [MONO_INST_DEST] == 'i') {
                ins->dreg = alloc_ireg (cfg);
                ins->type = STACK_I4;
+       } else if (spec [MONO_INST_DEST] == 'l') {
+               ins->dreg = alloc_lreg (cfg);
+               ins->type = STACK_I8;
        }
        ins->sreg1 = sreg1;
        ins->sreg2 = sreg2;
@@ -639,44 +672,115 @@ emit_sys_numerics_vector_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSig
 
 #ifdef TARGET_AMD64
 
-static guint16 sse_methods [] = {
-       SN_Add,
-       SN_And,
-       SN_AndNot,
-       SN_CompareEqual,
-       SN_CompareNotEqual,
-       SN_Divide,
-       SN_LoadAlignedVector128,
-       SN_LoadVector128,
-       SN_MoveHighToLow,
-       SN_MoveLowToHigh,
-       SN_MoveMask,
-       SN_MoveScalar,
-       SN_Multiply,
-       SN_Or,
-       SN_Shuffle,
-       SN_Store,
-       SN_StoreAligned,
-       SN_Subtract,
-       SN_UnpackHigh,
-       SN_UnpackLow,
-       SN_Xor,
-       SN_get_IsSupported
+static SimdIntrinsic sse_methods [] = {
+       {SN_Add, OP_XBINOP, OP_FADD},
+       {SN_AddScalar, OP_SSE_ADDSS},
+       {SN_And, OP_SSE_AND},
+       {SN_AndNot, OP_SSE_ANDN},
+       {SN_CompareEqual, OP_XCOMPARE_FP, CMP_EQ},
+       {SN_CompareGreaterThan, OP_XCOMPARE_FP,CMP_GT},
+       {SN_CompareGreaterThanOrEqual, OP_XCOMPARE_FP, CMP_GE},
+       {SN_CompareLessThan, OP_XCOMPARE_FP, CMP_LT},
+       {SN_CompareLessThanOrEqual, OP_XCOMPARE_FP, CMP_LE},
+       {SN_CompareNotEqual, OP_XCOMPARE_FP, CMP_NE},
+       {SN_CompareNotGreaterThan, OP_XCOMPARE_FP, CMP_LE},
+       {SN_CompareNotGreaterThanOrEqual, OP_XCOMPARE_FP, CMP_LT},
+       {SN_CompareNotLessThan, OP_XCOMPARE_FP, CMP_GE},
+       {SN_CompareNotLessThanOrEqual, OP_XCOMPARE_FP, CMP_GT},
+       {SN_CompareOrdered, OP_XCOMPARE_FP, CMP_ORD},
+       {SN_CompareScalarEqual, OP_SSE_CMPSS, CMP_EQ},
+       {SN_CompareScalarGreaterThan, OP_SSE_CMPSS, CMP_GT},
+       {SN_CompareScalarGreaterThanOrEqual, OP_SSE_CMPSS, CMP_GE},
+       {SN_CompareScalarLessThan, OP_SSE_CMPSS, CMP_LT},
+       {SN_CompareScalarLessThanOrEqual, OP_SSE_CMPSS, CMP_LE},
+       {SN_CompareScalarNotEqual, OP_SSE_CMPSS, CMP_NE},
+       {SN_CompareScalarNotGreaterThan, OP_SSE_CMPSS, CMP_LE},
+       {SN_CompareScalarNotGreaterThanOrEqual, OP_SSE_CMPSS, CMP_LT},
+       {SN_CompareScalarNotLessThan, OP_SSE_CMPSS, CMP_GE},
+       {SN_CompareScalarNotLessThanOrEqual, OP_SSE_CMPSS, CMP_GT},
+       {SN_CompareScalarOrdered, OP_SSE_CMPSS, CMP_ORD},
+       {SN_CompareScalarOrderedEqual, OP_SSE_COMISS, CMP_EQ},
+       {SN_CompareScalarOrderedGreaterThan, OP_SSE_COMISS, CMP_GT},
+       {SN_CompareScalarOrderedGreaterThanOrEqual, OP_SSE_COMISS, CMP_GE},
+       {SN_CompareScalarOrderedLessThan, OP_SSE_COMISS, CMP_LT},
+       {SN_CompareScalarOrderedLessThanOrEqual, OP_SSE_COMISS, CMP_LE},
+       {SN_CompareScalarOrderedNotEqual, OP_SSE_COMISS, CMP_NE},
+       {SN_CompareScalarUnordered, OP_SSE_CMPSS, CMP_UNORD},
+       {SN_CompareScalarUnorderedEqual, OP_SSE_UCOMISS, CMP_EQ},
+       {SN_CompareScalarUnorderedGreaterThan, OP_SSE_UCOMISS, CMP_GT},
+       {SN_CompareScalarUnorderedGreaterThanOrEqual, OP_SSE_UCOMISS, CMP_GE},
+       {SN_CompareScalarUnorderedLessThan, OP_SSE_UCOMISS, CMP_LT},
+       {SN_CompareScalarUnorderedLessThanOrEqual, OP_SSE_UCOMISS, CMP_LE},
+       {SN_CompareScalarUnorderedNotEqual, OP_SSE_UCOMISS, CMP_NE},
+       {SN_CompareUnordered, OP_XCOMPARE_FP, CMP_UNORD},
+       {SN_ConvertScalarToVector128Single},
+       {SN_ConvertToInt32, OP_XOP_I4_X, SIMD_OP_SSE_CVTSS2SI},
+       {SN_ConvertToInt32WithTruncation, OP_XOP_I4_X, SIMD_OP_SSE_CVTTSS2SI},
+       {SN_ConvertToInt64, OP_XOP_I8_X, SIMD_OP_SSE_CVTSS2SI64},
+       {SN_ConvertToInt64WithTruncation, OP_XOP_I8_X, SIMD_OP_SSE_CVTTSS2SI64},
+       {SN_Divide, OP_XBINOP, OP_FDIV},
+       {SN_DivideScalar, OP_SSE_DIVSS},
+       {SN_LoadAlignedVector128, OP_SSE_LOADU, 16 /* alignment */},
+       {SN_LoadHigh, OP_SSE_MOVHPS_LOAD},
+       {SN_LoadLow, OP_SSE_MOVLPS_LOAD},
+       {SN_LoadScalarVector128, OP_SSE_MOVSS},
+       {SN_LoadVector128, OP_SSE_LOADU, 1 /* alignment */},
+       {SN_Max, OP_XOP_X_X_X, SIMD_OP_SSE_MAXPS},
+       {SN_MaxScalar, OP_XOP_X_X_X, SIMD_OP_SSE_MAXSS},
+       {SN_Min, OP_XOP_X_X_X, SIMD_OP_SSE_MINPS},
+       {SN_MinScalar, OP_XOP_X_X_X, SIMD_OP_SSE_MINSS},
+       {SN_MoveHighToLow, OP_SSE_MOVEHL},
+       {SN_MoveLowToHigh, OP_SSE_MOVELH},
+       {SN_MoveMask, OP_SSE_MOVMSK},
+       {SN_MoveScalar, OP_SSE_MOVS2},
+       {SN_Multiply, OP_XBINOP, OP_FMUL},
+       {SN_MultiplyScalar, OP_SSE_MULSS},
+       {SN_Or, OP_SSE_OR},
+       {SN_Prefetch0, OP_SSE_PREFETCHT0},
+       {SN_Prefetch1, OP_SSE_PREFETCHT1},
+       {SN_Prefetch2, OP_SSE_PREFETCHT2},
+       {SN_PrefetchNonTemporal, OP_SSE_PREFETCHNTA},
+       {SN_Reciprocal, OP_XOP_X_X, SIMD_OP_SSE_RCPPS},
+       {SN_ReciprocalScalar, 0, SIMD_OP_SSE_RCPSS},
+       {SN_ReciprocalSqrt, OP_XOP_X_X, SIMD_OP_SSE_RSQRTPS},
+       {SN_ReciprocalSqrtScalar, 0, SIMD_OP_SSE_RSQRTSS},
+       {SN_Sqrt, OP_XOP_X_X, SIMD_OP_SSE_SQRTPS},
+       {SN_SqrtScalar, 0, SIMD_OP_SSE_SQRTSS},
+       {SN_Shuffle},
+       {SN_Store, OP_SSE_STORE, 1 /* alignment */},
+       {SN_StoreAligned, OP_SSE_STORE, 16 /* alignment */},
+       {SN_StoreAlignedNonTemporal, OP_SSE_MOVNTPS},
+       {SN_StoreFence, OP_XOP, SIMD_OP_SSE_SFENCE},
+       {SN_StoreHigh, OP_SSE_MOVHPS_STORE},
+       {SN_StoreLow, OP_SSE_MOVLPS_STORE},
+       {SN_StoreScalar, OP_SSE_MOVSS_STORE},
+       {SN_Subtract, OP_XBINOP, OP_FSUB},
+       {SN_SubtractScalar, OP_SSE_SUBSS},
+       {SN_UnpackHigh, OP_SSE_UNPACKHI},
+       {SN_UnpackLow, OP_SSE_UNPACKLO},
+       {SN_Xor, OP_SSE_XOR},
+       {SN_get_IsSupported}
 };
 
 static guint16 sse2_methods [] = {
        SN_Add,
        SN_AddSaturate,
+       SN_AddScalar,
        SN_And,
        SN_AndNot,
+       SN_Average,
        SN_CompareEqual,
        SN_CompareGreaterThan,
        SN_CompareLessThan,
        SN_CompareNotEqual,
+       SN_CompareScalarEqual,
+       SN_ConvertScalarToVector128Double,
        SN_ConvertScalarToVector128Int32,
        SN_ConvertScalarToVector128Int64,
        SN_ConvertScalarToVector128UInt32,
        SN_ConvertScalarToVector128UInt64,
+       SN_ConvertToInt64,
+       SN_ConvertToInt64WithTruncation,
        SN_ConvertToUInt32,
        SN_ConvertToUInt64,
        SN_LoadAlignedVector128,
@@ -751,50 +855,28 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
        gboolean supported, is_64bit;
        MonoClass *klass = cmethod->klass;
        MonoTypeEnum arg0_type = fsig->param_count > 0 ? get_underlying_type (fsig->params [0]) : MONO_TYPE_VOID;
+       SimdIntrinsic *info;
+       gboolean is_corlib = m_class_get_image (cfg->method->klass) == mono_get_corlib ();
 
        if (is_hw_intrinsics_class (klass, "Sse", &is_64bit)) {
                if (!COMPILE_LLVM (cfg))
                        return NULL;
-               id = lookup_intrins (sse_methods, sizeof (sse_methods), cmethod);
-               if (id == -1)
+               info = lookup_intrins_info (sse_methods, sizeof (sse_methods), cmethod);
+               if (!info)
                        return NULL;
+               int id = info->id;
+
+               supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE) != 0;
+
+               /* Common case */
+               if (info->op != 0)
+                       return emit_simd_ins_for_sig (cfg, klass, info->op, info->instc0, arg0_type, fsig, args);
 
-               supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE) != 0 &&
-                       m_class_get_image (cfg->method->klass) == mono_get_corlib (); // We only support the subset used by corelib
-               
                switch (id) {
                case SN_get_IsSupported:
                        EMIT_NEW_ICONST (cfg, ins, supported ? 1 : 0);
                        ins->type = STACK_I4;
                        return ins;
-               case SN_LoadAlignedVector128:
-                       return emit_simd_ins_for_sig (cfg, klass, OP_SSE_LOADU, 16 /*alignment*/, arg0_type, fsig, args);
-               case SN_LoadVector128:
-                       return emit_simd_ins_for_sig (cfg, klass, OP_SSE_LOADU, 1 /*alignment*/, arg0_type, fsig, args);
-               case SN_MoveMask:
-                       return emit_simd_ins_for_sig (cfg, klass, OP_SSE_MOVMSK, -1, arg0_type, fsig, args);
-               case SN_MoveScalar:
-                       return emit_simd_ins_for_sig (cfg, klass, OP_SSE_MOVS2, -1, arg0_type, fsig, args);
-               case SN_CompareNotEqual:
-                       return emit_simd_ins_for_sig (cfg, klass, OP_XCOMPARE_FP, CMP_NE, arg0_type, fsig, args);
-               case SN_CompareEqual:
-                       return emit_simd_ins_for_sig (cfg, klass, OP_XCOMPARE_FP, CMP_EQ, arg0_type, fsig, args);
-               case SN_And:
-                       return emit_simd_ins_for_sig (cfg, klass, OP_SSE_AND, -1, arg0_type, fsig, args);
-               case SN_AndNot:
-                       return emit_simd_ins_for_sig (cfg, klass, OP_SSE_ANDN, -1, arg0_type, fsig, args);
-               case SN_Or:
-                       return emit_simd_ins_for_sig (cfg, klass, OP_SSE_OR, -1, arg0_type, fsig, args);
-               case SN_Xor:
-                       return emit_simd_ins_for_sig (cfg, klass, OP_SSE_XOR, -1, arg0_type, fsig, args);
-               case SN_Multiply:
-                       return emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, OP_FMUL, arg0_type, fsig, args);
-               case SN_Divide:
-                       return emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, OP_FDIV, arg0_type, fsig, args);
-               case SN_Add:
-                       return emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, OP_FADD, arg0_type, fsig, args);
-               case SN_Subtract:
-                       return emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, OP_FSUB, arg0_type, fsig, args);
                case SN_Shuffle: {
                        if (args [2]->opcode != OP_ICONST) {
                                mono_cfg_set_exception (cfg, MONO_EXCEPTION_MONO_ERROR);
@@ -804,18 +886,23 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
                        }
                        return emit_simd_ins_for_sig (cfg, klass, OP_SSE_SHUFFLE, args [2]->inst_c0 /*mask*/, arg0_type, fsig, args);
                }
-               case SN_MoveHighToLow:
-                       return emit_simd_ins_for_sig (cfg, klass, OP_SSE_MOVEHL, -1, arg0_type, fsig, args);
-               case SN_MoveLowToHigh:
-                       return emit_simd_ins_for_sig (cfg, klass, OP_SSE_MOVELH, -1, arg0_type, fsig, args);
-               case SN_UnpackLow:
-                       return emit_simd_ins_for_sig (cfg, klass, OP_SSE_UNPACKLO, -1, arg0_type, fsig, args);
-               case SN_UnpackHigh:
-                       return emit_simd_ins_for_sig (cfg, klass, OP_SSE_UNPACKHI, -1, arg0_type, fsig, args);
-               case SN_Store:
-                       return emit_simd_ins_for_sig (cfg, klass, OP_SSE_STORE, 1 /*alignment*/, arg0_type, fsig, args);
-               case SN_StoreAligned:
-                       return emit_simd_ins_for_sig (cfg, klass, OP_SSE_STORE, 16 /*alignment*/, arg0_type, fsig, args);
+               case SN_ConvertScalarToVector128Single:
+                       if (fsig->params [1]->type == MONO_TYPE_I4)
+                               return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_I4, SIMD_OP_SSE_CVTSI2SS, 0, fsig, args);
+                       else if (fsig->params [1]->type == MONO_TYPE_I8)
+                               return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_I8, SIMD_OP_SSE_CVTSI2SS64, 0, fsig, args);
+                       else
+                               g_assert_not_reached ();
+                       break;
+               case SN_ReciprocalScalar:
+               case SN_ReciprocalSqrtScalar:
+               case SN_SqrtScalar:
+                       if (fsig->param_count == 1)
+                               return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X, info->instc0, arg0_type, fsig, args);
+                       else
+                               return NULL;
+               case SN_LoadScalarVector128:
+                       return NULL;
                default:
                        return NULL;
                }
@@ -828,8 +915,7 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
                if (id == -1)
                        return NULL;
 
-               supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE2) != 0 &&
-                       m_class_get_image (cfg->method->klass) == mono_get_corlib (); // We only support the subset used by corelib
+               supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE2) != 0 && is_corlib; // We only support the subset used by corelib
                
                switch (id) {
                case SN_get_IsSupported: {
@@ -843,10 +929,19 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
                        return emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, arg0_type == MONO_TYPE_R8 ? OP_FADD : OP_IADD, arg0_type, fsig, args);
                case SN_AddSaturate:
                        return emit_simd_ins_for_sig (cfg, klass, OP_SSE2_ADDS, -1, arg0_type, fsig, args);
+               case SN_AddScalar:
+                       return emit_simd_ins_for_sig (cfg, klass, OP_SSE2_ADDSD, -1, arg0_type, fsig, args);
                case SN_And:
                        return emit_simd_ins_for_sig (cfg, klass, OP_SSE_AND, -1, arg0_type, fsig, args);
                case SN_AndNot:
                        return emit_simd_ins_for_sig (cfg, klass, OP_SSE_ANDN, -1, arg0_type, fsig, args);
+               case SN_Average:
+                       if (arg0_type == MONO_TYPE_U1)
+                               return emit_simd_ins_for_sig (cfg, klass, OP_PAVGB_UN, -1, arg0_type, fsig, args);
+                       else if (arg0_type == MONO_TYPE_U2)
+                               return emit_simd_ins_for_sig (cfg, klass, OP_PAVGW_UN, -1, arg0_type, fsig, args);
+                       else
+                               return NULL;
                case SN_CompareNotEqual:
                        return emit_simd_ins_for_sig (cfg, klass, arg0_type == MONO_TYPE_R8 ? OP_XCOMPARE_FP : OP_XCOMPARE, CMP_NE, arg0_type, fsig, args);
                case SN_CompareEqual:
@@ -855,6 +950,8 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
                        return emit_simd_ins_for_sig (cfg, klass, arg0_type == MONO_TYPE_R8 ? OP_XCOMPARE_FP : OP_XCOMPARE, CMP_GT, arg0_type, fsig, args);
                case SN_CompareLessThan:
                        return emit_simd_ins_for_sig (cfg, klass, arg0_type == MONO_TYPE_R8 ? OP_XCOMPARE_FP : OP_XCOMPARE, CMP_LT, arg0_type, fsig, args);
+               case SN_CompareScalarEqual:
+                       return emit_simd_ins_for_sig (cfg, klass, OP_SSE2_CMPSD, CMP_EQ, arg0_type, fsig, args);
                case SN_ConvertScalarToVector128Int32:
                case SN_ConvertScalarToVector128Int64:
                case SN_ConvertScalarToVector128UInt32:
@@ -924,8 +1021,7 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
                if (id == -1)
                        return NULL;
 
-               supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE3) != 0 &&
-                       m_class_get_image (cfg->method->klass) == mono_get_corlib (); // We only support the subset used by corelib
+               supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE3) != 0 && is_corlib; // We only support the subset used by corelib
 
                switch (id) {
                case SN_get_IsSupported:
@@ -946,8 +1042,7 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
                if (id == -1)
                        return NULL;
 
-               supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSSE3) != 0 &&
-                       m_class_get_image (cfg->method->klass) == mono_get_corlib (); // We only support the subset used by corelib
+               supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSSE3) != 0 && is_corlib; // We only support the subset used by corelib
 
                switch (id) {
                case SN_get_IsSupported:
@@ -968,8 +1063,7 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
                if (id == -1)
                        return NULL;
 
-               supported = COMPILE_LLVM (cfg) && (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE41) != 0 &&
-                       m_class_get_image (cfg->method->klass) == mono_get_corlib (); // We only support the subset used by corelib
+               supported = COMPILE_LLVM (cfg) && (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE41) != 0 && is_corlib; // We only support the subset used by corelib
 
                switch (id) {
                case SN_get_IsSupported:
@@ -1360,7 +1454,8 @@ mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign
 
 #ifdef TARGET_AMD64 // TODO: test and enable for x86 too
        if (!strcmp (class_ns, "System.Runtime.Intrinsics.X86")) {
-               return emit_x86_intrinsics (cfg ,cmethod, fsig, args);
+               MonoInst *ins = emit_x86_intrinsics (cfg, cmethod, fsig, args);
+               return ins;
        }
 #endif
 
index 067be67..64eec13 100644 (file)
@@ -7,6 +7,8 @@ METHOD(LessThan)
 METHOD(LessThanOrEqual)
 METHOD(Min)
 METHOD(Max)
+METHOD(MinScalar)
+METHOD(MaxScalar)
 METHOD(PopCount)
 METHOD(LeadingZeroCount)
 METHOD(get_Count)
@@ -28,8 +30,10 @@ METHOD(op_Multiply)
 METHOD(op_Subtraction)
 // Vector
 METHOD(ConvertToInt32)
+METHOD(ConvertToInt32WithTruncation)
 METHOD(ConvertToUInt32)
 METHOD(ConvertToInt64)
+METHOD(ConvertToInt64WithTruncation)
 METHOD(ConvertToUInt64)
 METHOD(ConvertToSingle)
 METHOD(ConvertToDouble)
@@ -62,27 +66,82 @@ METHOD(ParallelBitDeposit)
 METHOD(ParallelBitExtract)
 // Sse
 METHOD(Add)
+METHOD(CompareGreaterThanOrEqual)
+METHOD(CompareLessThanOrEqual)
 METHOD(CompareNotEqual)
+METHOD(CompareNotGreaterThan)
+METHOD(CompareNotGreaterThanOrEqual)
+METHOD(CompareNotLessThan)
+METHOD(CompareNotLessThanOrEqual)
+METHOD(CompareScalarGreaterThan)
+METHOD(CompareScalarGreaterThanOrEqual)
+METHOD(CompareScalarLessThan)
+METHOD(CompareScalarLessThanOrEqual)
+METHOD(CompareScalarNotEqual)
+METHOD(CompareScalarNotGreaterThan)
+METHOD(CompareScalarNotGreaterThanOrEqual)
+METHOD(CompareScalarNotLessThan)
+METHOD(CompareScalarNotLessThanOrEqual)
+METHOD(CompareScalarOrderedEqual)
+METHOD(CompareScalarOrderedGreaterThan)
+METHOD(CompareScalarOrderedGreaterThanOrEqual)
+METHOD(CompareScalarOrderedLessThan)
+METHOD(CompareScalarOrderedLessThanOrEqual)
+METHOD(CompareScalarOrderedNotEqual)
+METHOD(CompareScalarUnorderedEqual)
+METHOD(CompareScalarUnorderedGreaterThan)
+METHOD(CompareScalarUnorderedGreaterThanOrEqual)
+METHOD(CompareScalarUnorderedLessThan)
+METHOD(CompareScalarUnorderedLessThanOrEqual)
+METHOD(CompareScalarUnorderedNotEqual)
+METHOD(CompareOrdered)
+METHOD(CompareUnordered)
+METHOD(CompareScalarOrdered)
+METHOD(CompareScalarUnordered)
+METHOD(ConvertScalarToVector128Single)
 METHOD(Divide)
+METHOD(DivideScalar)
 METHOD(Store)
+METHOD(StoreFence)
+METHOD(StoreHigh)
+METHOD(StoreLow)
 METHOD(Subtract)
+METHOD(SubtractScalar)
 METHOD(CompareEqual)
+METHOD(LoadHigh)
+METHOD(LoadLow)
 METHOD(LoadVector128)
+METHOD(LoadScalarVector128)
 METHOD(MoveHighToLow)
 METHOD(MoveLowToHigh)
 METHOD(MoveMask)
 METHOD(MoveScalar)
 METHOD(Multiply)
+METHOD(MultiplyScalar)
 METHOD(Shuffle)
 METHOD(UnpackHigh)
 METHOD(UnpackLow)
+METHOD(Prefetch0)
+METHOD(Prefetch1)
+METHOD(Prefetch2)
+METHOD(PrefetchNonTemporal)
+METHOD(Reciprocal)
+METHOD(ReciprocalScalar)
+METHOD(ReciprocalSqrt)
+METHOD(ReciprocalSqrtScalar)
+METHOD(Sqrt)
+METHOD(SqrtScalar)
 // Sse2
 METHOD(AddSaturate)
+METHOD(AddScalar)
 METHOD(And)
+METHOD(Average)
 METHOD(Or)
 METHOD(LoadAlignedVector128)
 METHOD(Xor)
 METHOD(CompareGreaterThan)
+METHOD(CompareScalarEqual)
+METHOD(ConvertScalarToVector128Double)
 METHOD(ConvertScalarToVector128Int32)
 METHOD(ConvertScalarToVector128Int64)
 METHOD(ConvertScalarToVector128UInt32)
@@ -90,10 +149,11 @@ METHOD(ConvertScalarToVector128UInt64)
 METHOD(PackUnsignedSaturate)
 METHOD(StoreScalar)
 METHOD(StoreAligned)
+METHOD(StoreAlignedNonTemporal)
 METHOD(ShiftRightLogical)
 METHOD(CompareLessThan)
 // Sse3
 METHOD(MoveAndDuplicate)
 // Sse41
 METHOD(Insert)
-METHOD(TestZ)
\ No newline at end of file
+METHOD(TestZ)