* [jit] Implement support for all Sse1 intrinsics for netcore.
* Add generic OP_XOP opcodes for opcodes which the JIT doesn't care about.
Add a SimdOp enum to list the operations performed by these opcodes.
* Add a SimdIntrinsic struct so the mapping between the .net methods
and the JIT opcodes can be specified declaratively.
* Add all intrinsics from the Sse class.
* Fix UnpackHigh/UnpackLow.
* Implement missing load/store intrinsics.
* Implement missing opcodes.
* Fix nontemporal metadata.
* Fix MOVNTPS alignment.
* Fix OP_XOP_X_X.
INTRINS_OVR(CTLZ_I64, ctlz)
INTRINS_OVR(CTTZ_I32, cttz)
INTRINS_OVR(CTTZ_I64, cttz)
+INTRINS(PREFETCH, prefetch)
INTRINS(BZHI_I32, x86_bmi_bzhi_32)
INTRINS(BZHI_I64, x86_bmi_bzhi_64)
INTRINS(BEXTR_I32, x86_bmi_bextr_32)
INTRINS(SSE_PSLLI_Q, x86_sse2_pslli_q)
INTRINS(SSE_SQRT_PD, x86_sse2_sqrt_pd)
INTRINS(SSE_SQRT_PS, x86_sse_sqrt_ps)
-INTRINS(SSE_RSQRT_PS, x86_sse_rsqrt_ps)
INTRINS(SSE_RCP_PS, x86_sse_rcp_ps)
+INTRINS(SSE_RSQRT_PS, x86_sse_rsqrt_ps)
+INTRINS(SSE_SQRT_SS, x86_sse_sqrt_ss)
+INTRINS(SSE_RCP_SS, x86_sse_rcp_ss)
+INTRINS(SSE_RSQRT_SS, x86_sse_rsqrt_ss)
INTRINS(SSE_CVTTPD2DQ, x86_sse2_cvttpd2dq)
INTRINS(SSE_CVTTPS2DQ, x86_sse2_cvttps2dq)
INTRINS(SSE_CVTDQ2PS, x86_sse2_cvtdq2ps)
INTRINS(SSE_ROUNDPD, x86_sse41_round_pd)
INTRINS(SSE_PTESTZ, x86_sse41_ptestz)
INTRINS(SSE_INSERTPS, x86_sse41_insertps)
+INTRINS(SSE_SFENCE, x86_sse_sfence)
#if LLVM_API_VERSION >= 800
// these intrinsics were renamed in LLVM 8
INTRINS_OVR(SSE_SADD_SATI8, sadd_sat)
LLVMRealUGE,
LLVMRealULT,
LLVMRealUGT,
+ LLVMRealORD,
+ LLVMRealUNO
};
static MonoLLVMModule aot_module;
}
static LLVMValueRef
-ConstInt32 (int v)
+const_int32 (int v)
{
return LLVMConstInt (LLVMInt32Type (), v, FALSE);
}
}
static void
+set_nontemporal_flag (LLVMValueRef v)
+{
+ LLVMValueRef md_arg;
+ int md_kind;
+ const char *flag_name;
+
+ // FIXME: Cache this
+ flag_name = "nontemporal";
+ md_kind = LLVMGetMDKindID (flag_name, strlen (flag_name));
+ md_arg = const_int32 (1);
+ LLVMSetMetadata (v, md_kind, LLVMMDNode (&md_arg, 1));
+}
+
+static void
set_invariant_load_flag (LLVMValueRef v)
{
LLVMValueRef md_arg;
v = mono_llvm_build_alloca (builder, LLVMInt8Type (), LLVMConstInt (LLVMInt32Type (), size, FALSE), MONO_ARCH_FRAME_ALIGNMENT, "");
if (ins->flags & MONO_INST_INIT)
- emit_memset (ctx, builder, v, ConstInt32 (size), MONO_ARCH_FRAME_ALIGNMENT);
+ emit_memset (ctx, builder, v, const_int32 (size), MONO_ARCH_FRAME_ALIGNMENT);
values [ins->dreg] = v;
break;
if (!addresses [ins->dreg])
addresses [ins->dreg] = build_alloca (ctx, m_class_get_byval_arg (klass));
LLVMValueRef ptr = LLVMBuildBitCast (builder, addresses [ins->dreg], LLVMPointerType (LLVMInt8Type (), 0), "");
- emit_memset (ctx, builder, ptr, ConstInt32 (mono_class_value_size (klass, NULL)), 0);
+ emit_memset (ctx, builder, ptr, const_int32 (mono_class_value_size (klass, NULL)), 0);
break;
}
case OP_DUMMY_VZERO:
values [ins->dreg] = mono_llvm_build_aligned_load (builder, dst_vec, "", FALSE, ins->inst_c0); // inst_c0 is alignment
break;
}
+ case OP_SSE_MOVSS: {
+ LLVMValueRef addr = convert (ctx, lhs, LLVMPointerType (LLVMFloatType (), 0));
+ LLVMValueRef val = mono_llvm_build_load (builder, addr, "", FALSE);
+ values [ins->dreg] = LLVMBuildInsertElement (builder, LLVMConstNull (type_to_sse_type (ins->inst_c1)), val, LLVMConstInt (LLVMInt32Type (), 0, FALSE), "");
+ break;
+ }
+ case OP_SSE_MOVSS_STORE: {
+ LLVMValueRef addr = convert (ctx, lhs, LLVMPointerType (LLVMFloatType (), 0));
+ LLVMValueRef val = LLVMBuildExtractElement (builder, rhs, LLVMConstInt (LLVMInt32Type (), 0, FALSE), "");
+ mono_llvm_build_store (builder, val, addr, FALSE, LLVM_BARRIER_NONE);
+ break;
+ }
+
+ case OP_SSE_MOVLPS_LOAD:
+ case OP_SSE_MOVHPS_LOAD: {
+ /* Load two floats from rhs and store them in the low/high part of lhs */
+ LLVMValueRef addr = rhs;
+ LLVMValueRef addr1 = convert (ctx, addr, LLVMPointerType (LLVMFloatType (), 0));
+ LLVMValueRef addr2 = convert (ctx, LLVMBuildAdd (builder, convert (ctx, addr, IntPtrType ()), convert (ctx, LLVMConstInt (LLVMInt32Type (), 4, FALSE), IntPtrType ()), ""), LLVMPointerType (LLVMFloatType (), 0));
+ LLVMValueRef val1 = mono_llvm_build_load (builder, addr1, "", FALSE);
+ LLVMValueRef val2 = mono_llvm_build_load (builder, addr2, "", FALSE);
+ int index1 = ins->opcode == OP_SSE_MOVLPS_LOAD ? 0 : 2;
+ int index2 = ins->opcode == OP_SSE_MOVLPS_LOAD ? 1 : 3;
+ values [ins->dreg] = LLVMBuildInsertElement (builder, LLVMBuildInsertElement (builder, lhs, val1, LLVMConstInt (LLVMInt32Type (), index1, FALSE), ""), val2, LLVMConstInt (LLVMInt32Type (), index2, FALSE), "");
+ break;
+ }
+
+ case OP_SSE_MOVLPS_STORE:
+ case OP_SSE_MOVHPS_STORE: {
+		/* Store two floats from the low/high part of rhs into lhs */
+ LLVMValueRef addr = lhs;
+ LLVMValueRef addr1 = convert (ctx, addr, LLVMPointerType (LLVMFloatType (), 0));
+ LLVMValueRef addr2 = convert (ctx, LLVMBuildAdd (builder, convert (ctx, addr, IntPtrType ()), convert (ctx, LLVMConstInt (LLVMInt32Type (), 4, FALSE), IntPtrType ()), ""), LLVMPointerType (LLVMFloatType (), 0));
+ int index1 = ins->opcode == OP_SSE_MOVLPS_STORE ? 0 : 2;
+ int index2 = ins->opcode == OP_SSE_MOVLPS_STORE ? 1 : 3;
+ LLVMValueRef val1 = LLVMBuildExtractElement (builder, rhs, LLVMConstInt (LLVMInt32Type (), index1, FALSE), "");
+ LLVMValueRef val2 = LLVMBuildExtractElement (builder, rhs, LLVMConstInt (LLVMInt32Type (), index2, FALSE), "");
+ mono_llvm_build_store (builder, val1, addr1, FALSE, LLVM_BARRIER_NONE);
+ mono_llvm_build_store (builder, val2, addr2, FALSE, LLVM_BARRIER_NONE);
+ break;
+ }
case OP_SSE_STORE: {
LLVMValueRef dst_vec = convert (ctx, lhs, LLVMPointerType (LLVMTypeOf (rhs), 0));
mono_llvm_build_aligned_store (builder, first_elem, dst, FALSE, 1);
break;
}
+ case OP_SSE_MOVNTPS: {
+ LLVMValueRef store = mono_llvm_build_aligned_store (builder, rhs, lhs, FALSE, 16);
+ set_nontemporal_flag (store);
+ break;
+ }
+ case OP_SSE_PREFETCHT0: {
+ LLVMValueRef addr = convert (ctx, lhs, LLVMPointerType (LLVMInt8Type (), 0));
+ LLVMValueRef args [] = { addr, const_int32 (0), const_int32 (3), const_int32 (1) };
+ call_intrins (ctx, INTRINS_PREFETCH, args, "");
+ break;
+ }
+ case OP_SSE_PREFETCHT1: {
+ LLVMValueRef addr = convert (ctx, lhs, LLVMPointerType (LLVMInt8Type (), 0));
+ LLVMValueRef args [] = { addr, const_int32 (0), const_int32 (2), const_int32 (1) };
+ call_intrins (ctx, INTRINS_PREFETCH, args, "");
+ break;
+ }
+ case OP_SSE_PREFETCHT2: {
+ LLVMValueRef addr = convert (ctx, lhs, LLVMPointerType (LLVMInt8Type (), 0));
+ LLVMValueRef args [] = { addr, const_int32 (0), const_int32 (1), const_int32 (1) };
+ call_intrins (ctx, INTRINS_PREFETCH, args, "");
+ break;
+ }
+ case OP_SSE_PREFETCHNTA: {
+ LLVMValueRef addr = convert (ctx, lhs, LLVMPointerType (LLVMInt8Type (), 0));
+ LLVMValueRef args [] = { addr, const_int32 (0), const_int32 (0), const_int32 (1) };
+ call_intrins (ctx, INTRINS_PREFETCH, args, "");
+ break;
+ }
case OP_SSE_SHUFFLE: {
LLVMValueRef shuffle_vec = create_const_vector_4_i32 (
break;
}
- case OP_SSE2_ADDS: {
- gint32 intrinsic = 0;
- if (ins->inst_c1 == MONO_TYPE_I1)
- intrinsic = INTRINS_SSE_SADD_SATI8;
- else if (ins->inst_c1 == MONO_TYPE_U1)
- intrinsic = INTRINS_SSE_UADD_SATI8;
- else if (ins->inst_c1 == MONO_TYPE_I2)
- intrinsic = INTRINS_SSE_SADD_SATI16;
- else if (ins->inst_c1 == MONO_TYPE_U2)
- intrinsic = INTRINS_SSE_UADD_SATI16;
- else
+ case OP_SSE_ADDSS:
+ case OP_SSE_SUBSS:
+ case OP_SSE_DIVSS:
+ case OP_SSE_MULSS:
+ case OP_SSE2_ADDSD: {
+ LLVMValueRef v1 = LLVMBuildExtractElement (builder, lhs, LLVMConstInt (LLVMInt32Type (), 0, FALSE), "");
+ LLVMValueRef v2 = LLVMBuildExtractElement (builder, rhs, LLVMConstInt (LLVMInt32Type (), 0, FALSE), "");
+
+ LLVMValueRef v = NULL;
+ switch (ins->opcode) {
+ case OP_SSE_ADDSS:
+ case OP_SSE2_ADDSD:
+ v = LLVMBuildFAdd (builder, v1, v2, "");
+ break;
+ case OP_SSE_SUBSS:
+ v = LLVMBuildFSub (builder, v1, v2, "");
+ break;
+ case OP_SSE_DIVSS:
+ v = LLVMBuildFDiv (builder, v1, v2, "");
+ break;
+ case OP_SSE_MULSS:
+ v = LLVMBuildFMul (builder, v1, v2, "");
+ break;
+ default:
+ g_assert_not_reached ();
+ }
+ values [ins->dreg] = LLVMBuildInsertElement (builder, lhs, v, LLVMConstInt (LLVMInt32Type (), 0, FALSE), "");
+ break;
+ }
+
+ case OP_SSE_CMPSS:
+ case OP_SSE2_CMPSD: {
+ int imm = -1;
+ switch (ins->inst_c0) {
+ case CMP_EQ: imm = 0; break;
+ case CMP_GT: imm = 6; break;
+ case CMP_GE: imm = 5; break;
+ case CMP_LT: imm = 1; break;
+ case CMP_LE: imm = 2; break;
+ case CMP_NE: imm = 4; break;
+ case CMP_ORD: imm = 7; break;
+ case CMP_UNORD: imm = 3; break;
+ default: g_assert_not_reached (); break;
+ }
+ LLVMValueRef cmp = LLVMConstInt (LLVMInt8Type (), imm, FALSE);
+ LLVMValueRef args [] = { lhs, rhs, cmp };
+ switch (ins->opcode) {
+ case OP_SSE_CMPSS:
+ values [ins->dreg] = call_intrins (ctx, INTRINS_SSE_CMPSS, args, "");
+ break;
+ case OP_SSE2_CMPSD:
+ values [ins->dreg] = call_intrins (ctx, INTRINS_SSE_CMPSD, args, "");
+ break;
+ default:
g_assert_not_reached ();
+ break;
+ }
+ break;
+ }
+ case OP_SSE_COMISS: {
+ LLVMValueRef args [] = { lhs, rhs };
+ IntrinsicId id = (IntrinsicId)0;
+ switch (ins->inst_c0) {
+ case CMP_EQ: id = INTRINS_SSE_COMIEQ_SS; break;
+ case CMP_GT: id = INTRINS_SSE_COMIGT_SS; break;
+ case CMP_GE: id = INTRINS_SSE_COMIGE_SS; break;
+ case CMP_LT: id = INTRINS_SSE_COMILT_SS; break;
+ case CMP_LE: id = INTRINS_SSE_COMILE_SS; break;
+ case CMP_NE: id = INTRINS_SSE_COMINEQ_SS; break;
+ default: g_assert_not_reached (); break;
+ }
+ values [ins->dreg] = call_intrins (ctx, id, args, "");
+ break;
+ }
+ case OP_SSE_UCOMISS: {
+ LLVMValueRef args [] = { lhs, rhs };
+ IntrinsicId id = (IntrinsicId)0;
+ switch (ins->inst_c0) {
+ case CMP_EQ: id = INTRINS_SSE_UCOMIEQ_SS; break;
+ case CMP_GT: id = INTRINS_SSE_UCOMIGT_SS; break;
+ case CMP_GE: id = INTRINS_SSE_UCOMIGE_SS; break;
+ case CMP_LT: id = INTRINS_SSE_UCOMILT_SS; break;
+ case CMP_LE: id = INTRINS_SSE_UCOMILE_SS; break;
+ case CMP_NE: id = INTRINS_SSE_UCOMINEQ_SS; break;
+ default: g_assert_not_reached (); break;
+ }
+ values [ins->dreg] = call_intrins (ctx, id, args, "");
+ break;
+ }
+ case OP_XOP: {
+ IntrinsicId id = (IntrinsicId)0;
+ switch (ins->inst_c0) {
+ case SIMD_OP_SSE_SFENCE: id = INTRINS_SSE_SFENCE; break;
+ default: g_assert_not_reached (); break;
+ }
+ call_intrins (ctx, id, NULL, "");
+ break;
+ }
+ case OP_XOP_X_X: {
+ IntrinsicId id = (IntrinsicId)0;
+ switch (ins->inst_c0) {
+ case SIMD_OP_SSE_SQRTPS: id = INTRINS_SSE_SQRT_PS; break;
+ case SIMD_OP_SSE_RCPPS: id = INTRINS_SSE_RCP_PS; break;
+ case SIMD_OP_SSE_RSQRTPS: id = INTRINS_SSE_RSQRT_PS; break;
+ case SIMD_OP_SSE_SQRTSS: id = INTRINS_SSE_SQRT_SS; break;
+ case SIMD_OP_SSE_RCPSS: id = INTRINS_SSE_RCP_SS; break;
+ case SIMD_OP_SSE_RSQRTSS: id = INTRINS_SSE_RSQRT_SS; break;
+ default: g_assert_not_reached (); break;
+ }
+ values [ins->dreg] = call_intrins (ctx, id, &lhs, "");
+ break;
+ }
+ case OP_XOP_I4_X:
+ case OP_XOP_I8_X: {
+ IntrinsicId id = (IntrinsicId)0;
+ switch (ins->inst_c0) {
+ case SIMD_OP_SSE_CVTSS2SI: id = INTRINS_SSE_CVTSS2SI; break;
+ case SIMD_OP_SSE_CVTTSS2SI: id = INTRINS_SSE_CVTTSS2SI; break;
+ case SIMD_OP_SSE_CVTSS2SI64: id = INTRINS_SSE_CVTSS2SI64; break;
+ case SIMD_OP_SSE_CVTTSS2SI64: id = INTRINS_SSE_CVTTSS2SI64; break;
+ case SIMD_OP_SSE_CVTSD2SI: id = INTRINS_SSE_CVTSD2SI; break;
+ case SIMD_OP_SSE_CVTSD2SI64: id = INTRINS_SSE_CVTSD2SI64; break;
+ case SIMD_OP_SSE_CVTTSD2SI64: id = INTRINS_SSE_CVTTSD2SI64; break;
+ default: g_assert_not_reached (); break;
+ }
+ values [ins->dreg] = call_intrins (ctx, id, &lhs, "");
+ break;
+ }
+ case OP_XOP_X_X_X:
+ case OP_XOP_X_X_I4:
+ case OP_XOP_X_X_I8: {
+ LLVMValueRef args [] = { lhs, rhs };
+ IntrinsicId id = (IntrinsicId)0;
+ switch (ins->inst_c0) {
+ case SIMD_OP_SSE_CVTSI2SS: id = INTRINS_SSE_CVTSI2SS; break;
+ case SIMD_OP_SSE_CVTSI2SS64: id = INTRINS_SSE_CVTSI2SS64; break;
+ case SIMD_OP_SSE_CVTSI2SD: id = INTRINS_SSE_CVTSI2SD; break;
+ case SIMD_OP_SSE_CVTSI2SD64: id = INTRINS_SSE_CVTSI2SD64; break;
+ case SIMD_OP_SSE_MAXPS: id = INTRINS_SSE_MAXPS; break;
+ case SIMD_OP_SSE_MAXSS: id = INTRINS_SSE_MAXSS; break;
+ case SIMD_OP_SSE_MINPS: id = INTRINS_SSE_MINPS; break;
+ case SIMD_OP_SSE_MINSS: id = INTRINS_SSE_MINSS; break;
+ default: g_assert_not_reached (); break;
+ }
+ values [ins->dreg] = call_intrins (ctx, id, args, "");
+ break;
+ }
+
+ case OP_SSE2_ADDS: {
+ IntrinsicId id = (IntrinsicId)0;
+ switch (ins->inst_c1) {
+ case MONO_TYPE_I1: id = INTRINS_SSE_SADD_SATI8; break;
+ case MONO_TYPE_U1: id = INTRINS_SSE_UADD_SATI8; break;
+ case MONO_TYPE_I2: id = INTRINS_SSE_SADD_SATI16; break;
+ case MONO_TYPE_U2: id = INTRINS_SSE_UADD_SATI16; break;
+ default: g_assert_not_reached (); break;
+ }
LLVMValueRef args [2];
args [0] = convert (ctx, lhs, type_to_sse_type (ins->inst_c1));
args [1] = convert (ctx, rhs, type_to_sse_type (ins->inst_c1));
values [ins->dreg] = convert (ctx,
- call_intrins (ctx, intrinsic, args, dname),
+ call_intrins (ctx, id, args, dname),
type_to_sse_type (ins->inst_c1));
break;
}
MINI_OP(OP_SSE_OR, "sse_or", XREG, XREG, XREG)
MINI_OP(OP_SSE_XOR, "sse_xor", XREG, XREG, XREG)
MINI_OP(OP_SSE_ANDN, "sse_andn", XREG, XREG, XREG)
+MINI_OP(OP_SSE_ADDSS, "sse_addss", XREG, XREG, XREG)
+MINI_OP(OP_SSE_SUBSS, "sse_subss", XREG, XREG, XREG)
+MINI_OP(OP_SSE_DIVSS, "sse_divss", XREG, XREG, XREG)
+MINI_OP(OP_SSE_MULSS, "sse_mulss", XREG, XREG, XREG)
+MINI_OP(OP_SSE_CMPSS, "sse_cmpss", XREG, XREG, XREG)
+MINI_OP(OP_SSE_COMISS, "sse_comiss", IREG, XREG, XREG)
+MINI_OP(OP_SSE_UCOMISS, "sse_ucomiss", IREG, XREG, XREG)
+MINI_OP(OP_SSE_MOVSS, "sse_movss", XREG, IREG, NONE)
+MINI_OP(OP_SSE_MOVSS_STORE, "sse_movss_store", NONE, IREG, XREG)
+MINI_OP(OP_SSE_MOVHPS_LOAD, "sse_movhps_load", XREG, XREG, IREG)
+MINI_OP(OP_SSE_MOVLPS_LOAD, "sse_movlps_load", XREG, XREG, IREG)
+MINI_OP(OP_SSE_MOVHPS_STORE, "sse_movhps_store", NONE, IREG, XREG)
+MINI_OP(OP_SSE_MOVLPS_STORE, "sse_movlps_store", NONE, IREG, XREG)
+MINI_OP(OP_SSE_MOVNTPS, "sse_movntps", NONE, IREG, XREG)
+MINI_OP(OP_SSE_PREFETCHT0, "sse_prefetcht0", NONE, IREG, NONE)
+MINI_OP(OP_SSE_PREFETCHT1, "sse_prefetcht1", NONE, IREG, NONE)
+MINI_OP(OP_SSE_PREFETCHT2, "sse_prefetcht2", NONE, IREG, NONE)
+MINI_OP(OP_SSE_PREFETCHNTA, "sse_prefetchnta", NONE, IREG, NONE)
/* sse 2 */
MINI_OP(OP_SSE2_PACKUS, "sse2_packus", XREG, XREG, XREG)
MINI_OP(OP_SSE2_SRLI, "sse2_srli", XREG, XREG, XREG)
MINI_OP(OP_SSE2_SHUFFLE, "sse2_shuffle", XREG, XREG, XREG)
MINI_OP(OP_SSE2_ADDS, "sse2_adds", XREG, XREG, XREG)
+MINI_OP(OP_SSE2_ADDSD, "sse2_addsd", XREG, XREG, XREG)
+MINI_OP(OP_SSE2_CMPSD, "sse2_cmpsd", XREG, XREG, XREG)
+MINI_OP(OP_SSE2_COMIEQ_SD, "sse2_comieq_sd", XREG, XREG, XREG)
/* sse 3 */
MINI_OP(OP_SSE3_MOVDDUP, "sse3_movddup", XREG, XREG, NONE)
/* Per element compate, inst_c0 contains a CompRelation */
MINI_OP(OP_XCOMPARE, "xcompare", XREG, XREG, XREG)
MINI_OP(OP_XCOMPARE_FP, "xcompare_fp", XREG, XREG, XREG)
-/* Binary op, inst_c0 contains the operation */
+
+/*
+ * Generic SIMD operations, the rest of the JIT doesn't care about the exact operation.
+ */
MINI_OP(OP_XBINOP, "xbinop", XREG, XREG, XREG)
+/* inst_c0 contains a SimdOp, inst_c1 might contain additional data */
+MINI_OP(OP_XOP, "xop", NONE, NONE, NONE)
+MINI_OP(OP_XOP_X_X, "xop_x_x", XREG, XREG, NONE)
+MINI_OP(OP_XOP_I4_X, "xop_i4_x", IREG, XREG, NONE)
+MINI_OP(OP_XOP_I8_X, "xop_i8_x", LREG, XREG, NONE)
+MINI_OP(OP_XOP_X_X_X, "xop_x_x_x", XREG, XREG, XREG)
+MINI_OP(OP_XOP_X_X_I4, "xop_x_x_i4", XREG, XREG, IREG)
+MINI_OP(OP_XOP_X_X_I8, "xop_x_x_i8", XREG, XREG, LREG)
+
MINI_OP(OP_XCAST, "xcast", XREG, XREG, NONE)
/* Extract element of vector */
/* The index is assumed to be in range */
MINI_OP(OP_LZCNT64, "lzcnt64", LREG, LREG, NONE)
MINI_OP(OP_POPCNT32, "popcnt32", IREG, IREG, NONE)
MINI_OP(OP_POPCNT64, "popcnt64", LREG, LREG, NONE)
-
-
CMP_LE_UN,
CMP_GE_UN,
CMP_LT_UN,
- CMP_GT_UN
+ CMP_GT_UN,
+ CMP_ORD,
+ CMP_UNORD
} CompRelation;
typedef enum {
SIMD_PREFETCH_MODE_2,
};
+/* SIMD operations */
+typedef enum {
+ SIMD_OP_SSE_CVTSS2SI,
+ SIMD_OP_SSE_CVTTSS2SI,
+ SIMD_OP_SSE_CVTSS2SI64,
+ SIMD_OP_SSE_CVTTSS2SI64,
+ SIMD_OP_SSE_CVTSD2SI,
+ SIMD_OP_SSE_CVTSD2SI64,
+ SIMD_OP_SSE_CVTTSD2SI64,
+ SIMD_OP_SSE_CVTSI2SS,
+ SIMD_OP_SSE_CVTSI2SS64,
+ SIMD_OP_SSE_CVTSI2SD,
+ SIMD_OP_SSE_CVTSI2SD64,
+ SIMD_OP_SSE_MAXPS,
+ SIMD_OP_SSE_MAXSS,
+ SIMD_OP_SSE_MINPS,
+ SIMD_OP_SSE_MINSS,
+ SIMD_OP_SSE_SFENCE,
+ SIMD_OP_SSE_SQRTPS,
+ SIMD_OP_SSE_RCPPS,
+ SIMD_OP_SSE_RSQRTPS,
+ SIMD_OP_SSE_SQRTSS,
+ SIMD_OP_SSE_RCPSS,
+ SIMD_OP_SSE_RSQRTSS
+} SimdOp;
+
const char *mono_arch_xregname (int reg);
guint32 mono_arch_cpu_enumerate_simd_versions (void);
MonoCPUFeatures mono_arch_get_cpu_features (void);
static int register_size;
+typedef struct {
+ // One of the SN_ constants
+ guint16 id;
+ // ins->opcode
+ int op;
+ // ins->inst_c0
+ int instc0;
+} SimdIntrinsic;
+
void
mono_simd_intrinsics_init (void)
{
}
static int
+simd_intrinsic_info_compare_by_name (const void *key, const void *value)
+{
+ SimdIntrinsic *info = (SimdIntrinsic*)value;
+ return strcmp ((const char*)key, method_name (info->id));
+}
+
+static int
lookup_intrins (guint16 *intrinsics, int size, MonoMethod *cmethod)
{
const guint16 *result = (const guint16 *)mono_binary_search (cmethod->name, intrinsics, size / sizeof (guint16), sizeof (guint16), &simd_intrinsic_compare_by_name);
-
-#if FALSE
- for (int i = 0; i < (size / sizeof (guint16)) - 1; ++i) {
- if (method_name (intrinsics [i])[0] > method_name (intrinsics [i + 1])[0]) {
- printf ("%s %s\n",method_name (intrinsics [i]), method_name (intrinsics [i + 1]));
- g_assert_not_reached ();
- }
- }
-#endif
if (result == NULL)
return -1;
return (int)*result;
}
+static SimdIntrinsic*
+lookup_intrins_info (SimdIntrinsic *intrinsics, int size, MonoMethod *cmethod)
+{
+#if 0
+ for (int i = 0; i < (size / sizeof (SimdIntrinsic)) - 1; ++i) {
+ const char *n1 = method_name (intrinsics [i].id);
+ const char *n2 = method_name (intrinsics [i + 1].id);
+ int len1 = strlen (n1);
+ int len2 = strlen (n2);
+ for (int j = 0; j < len1 && j < len2; ++j) {
+ if (n1 [j] > n2 [j]) {
+ printf ("%s %s\n", n1, n2);
+ g_assert_not_reached ();
+ } else if (n1 [j] < n2 [j]) {
+ break;
+ }
+ }
+ }
+#endif
+
+ return (SimdIntrinsic *)mono_binary_search (cmethod->name, intrinsics, size / sizeof (SimdIntrinsic), sizeof (SimdIntrinsic), &simd_intrinsic_info_compare_by_name);
+}
+
static int
type_to_expand_op (MonoType *type)
{
} else if (spec [MONO_INST_DEST] == 'i') {
ins->dreg = alloc_ireg (cfg);
ins->type = STACK_I4;
+ } else if (spec [MONO_INST_DEST] == 'l') {
+ ins->dreg = alloc_lreg (cfg);
+ ins->type = STACK_I8;
}
ins->sreg1 = sreg1;
ins->sreg2 = sreg2;
#ifdef TARGET_AMD64
-static guint16 sse_methods [] = {
- SN_Add,
- SN_And,
- SN_AndNot,
- SN_CompareEqual,
- SN_CompareNotEqual,
- SN_Divide,
- SN_LoadAlignedVector128,
- SN_LoadVector128,
- SN_MoveHighToLow,
- SN_MoveLowToHigh,
- SN_MoveMask,
- SN_MoveScalar,
- SN_Multiply,
- SN_Or,
- SN_Shuffle,
- SN_Store,
- SN_StoreAligned,
- SN_Subtract,
- SN_UnpackHigh,
- SN_UnpackLow,
- SN_Xor,
- SN_get_IsSupported
+static SimdIntrinsic sse_methods [] = {
+ {SN_Add, OP_XBINOP, OP_FADD},
+ {SN_AddScalar, OP_SSE_ADDSS},
+ {SN_And, OP_SSE_AND},
+ {SN_AndNot, OP_SSE_ANDN},
+ {SN_CompareEqual, OP_XCOMPARE_FP, CMP_EQ},
+	{SN_CompareGreaterThan, OP_XCOMPARE_FP, CMP_GT},
+ {SN_CompareGreaterThanOrEqual, OP_XCOMPARE_FP, CMP_GE},
+ {SN_CompareLessThan, OP_XCOMPARE_FP, CMP_LT},
+ {SN_CompareLessThanOrEqual, OP_XCOMPARE_FP, CMP_LE},
+ {SN_CompareNotEqual, OP_XCOMPARE_FP, CMP_NE},
+ {SN_CompareNotGreaterThan, OP_XCOMPARE_FP, CMP_LE},
+ {SN_CompareNotGreaterThanOrEqual, OP_XCOMPARE_FP, CMP_LT},
+ {SN_CompareNotLessThan, OP_XCOMPARE_FP, CMP_GE},
+ {SN_CompareNotLessThanOrEqual, OP_XCOMPARE_FP, CMP_GT},
+ {SN_CompareOrdered, OP_XCOMPARE_FP, CMP_ORD},
+ {SN_CompareScalarEqual, OP_SSE_CMPSS, CMP_EQ},
+ {SN_CompareScalarGreaterThan, OP_SSE_CMPSS, CMP_GT},
+ {SN_CompareScalarGreaterThanOrEqual, OP_SSE_CMPSS, CMP_GE},
+ {SN_CompareScalarLessThan, OP_SSE_CMPSS, CMP_LT},
+ {SN_CompareScalarLessThanOrEqual, OP_SSE_CMPSS, CMP_LE},
+ {SN_CompareScalarNotEqual, OP_SSE_CMPSS, CMP_NE},
+ {SN_CompareScalarNotGreaterThan, OP_SSE_CMPSS, CMP_LE},
+ {SN_CompareScalarNotGreaterThanOrEqual, OP_SSE_CMPSS, CMP_LT},
+ {SN_CompareScalarNotLessThan, OP_SSE_CMPSS, CMP_GE},
+ {SN_CompareScalarNotLessThanOrEqual, OP_SSE_CMPSS, CMP_GT},
+ {SN_CompareScalarOrdered, OP_SSE_CMPSS, CMP_ORD},
+ {SN_CompareScalarOrderedEqual, OP_SSE_COMISS, CMP_EQ},
+ {SN_CompareScalarOrderedGreaterThan, OP_SSE_COMISS, CMP_GT},
+ {SN_CompareScalarOrderedGreaterThanOrEqual, OP_SSE_COMISS, CMP_GE},
+ {SN_CompareScalarOrderedLessThan, OP_SSE_COMISS, CMP_LT},
+ {SN_CompareScalarOrderedLessThanOrEqual, OP_SSE_COMISS, CMP_LE},
+ {SN_CompareScalarOrderedNotEqual, OP_SSE_COMISS, CMP_NE},
+ {SN_CompareScalarUnordered, OP_SSE_CMPSS, CMP_UNORD},
+ {SN_CompareScalarUnorderedEqual, OP_SSE_UCOMISS, CMP_EQ},
+ {SN_CompareScalarUnorderedGreaterThan, OP_SSE_UCOMISS, CMP_GT},
+ {SN_CompareScalarUnorderedGreaterThanOrEqual, OP_SSE_UCOMISS, CMP_GE},
+ {SN_CompareScalarUnorderedLessThan, OP_SSE_UCOMISS, CMP_LT},
+ {SN_CompareScalarUnorderedLessThanOrEqual, OP_SSE_UCOMISS, CMP_LE},
+ {SN_CompareScalarUnorderedNotEqual, OP_SSE_UCOMISS, CMP_NE},
+ {SN_CompareUnordered, OP_XCOMPARE_FP, CMP_UNORD},
+ {SN_ConvertScalarToVector128Single},
+ {SN_ConvertToInt32, OP_XOP_I4_X, SIMD_OP_SSE_CVTSS2SI},
+ {SN_ConvertToInt32WithTruncation, OP_XOP_I4_X, SIMD_OP_SSE_CVTTSS2SI},
+ {SN_ConvertToInt64, OP_XOP_I8_X, SIMD_OP_SSE_CVTSS2SI64},
+ {SN_ConvertToInt64WithTruncation, OP_XOP_I8_X, SIMD_OP_SSE_CVTTSS2SI64},
+ {SN_Divide, OP_XBINOP, OP_FDIV},
+ {SN_DivideScalar, OP_SSE_DIVSS},
+ {SN_LoadAlignedVector128, OP_SSE_LOADU, 16 /* alignment */},
+ {SN_LoadHigh, OP_SSE_MOVHPS_LOAD},
+ {SN_LoadLow, OP_SSE_MOVLPS_LOAD},
+ {SN_LoadScalarVector128, OP_SSE_MOVSS},
+ {SN_LoadVector128, OP_SSE_LOADU, 1 /* alignment */},
+ {SN_Max, OP_XOP_X_X_X, SIMD_OP_SSE_MAXPS},
+ {SN_MaxScalar, OP_XOP_X_X_X, SIMD_OP_SSE_MAXSS},
+ {SN_Min, OP_XOP_X_X_X, SIMD_OP_SSE_MINPS},
+ {SN_MinScalar, OP_XOP_X_X_X, SIMD_OP_SSE_MINSS},
+ {SN_MoveHighToLow, OP_SSE_MOVEHL},
+ {SN_MoveLowToHigh, OP_SSE_MOVELH},
+ {SN_MoveMask, OP_SSE_MOVMSK},
+ {SN_MoveScalar, OP_SSE_MOVS2},
+ {SN_Multiply, OP_XBINOP, OP_FMUL},
+ {SN_MultiplyScalar, OP_SSE_MULSS},
+ {SN_Or, OP_SSE_OR},
+ {SN_Prefetch0, OP_SSE_PREFETCHT0},
+ {SN_Prefetch1, OP_SSE_PREFETCHT1},
+ {SN_Prefetch2, OP_SSE_PREFETCHT2},
+ {SN_PrefetchNonTemporal, OP_SSE_PREFETCHNTA},
+ {SN_Reciprocal, OP_XOP_X_X, SIMD_OP_SSE_RCPPS},
+ {SN_ReciprocalScalar, 0, SIMD_OP_SSE_RCPSS},
+ {SN_ReciprocalSqrt, OP_XOP_X_X, SIMD_OP_SSE_RSQRTPS},
+ {SN_ReciprocalSqrtScalar, 0, SIMD_OP_SSE_RSQRTSS},
+ {SN_Sqrt, OP_XOP_X_X, SIMD_OP_SSE_SQRTPS},
+ {SN_SqrtScalar, 0, SIMD_OP_SSE_SQRTSS},
+ {SN_Shuffle},
+ {SN_Store, OP_SSE_STORE, 1 /* alignment */},
+ {SN_StoreAligned, OP_SSE_STORE, 16 /* alignment */},
+ {SN_StoreAlignedNonTemporal, OP_SSE_MOVNTPS},
+ {SN_StoreFence, OP_XOP, SIMD_OP_SSE_SFENCE},
+ {SN_StoreHigh, OP_SSE_MOVHPS_STORE},
+ {SN_StoreLow, OP_SSE_MOVLPS_STORE},
+ {SN_StoreScalar, OP_SSE_MOVSS_STORE},
+ {SN_Subtract, OP_XBINOP, OP_FSUB},
+ {SN_SubtractScalar, OP_SSE_SUBSS},
+ {SN_UnpackHigh, OP_SSE_UNPACKHI},
+ {SN_UnpackLow, OP_SSE_UNPACKLO},
+ {SN_Xor, OP_SSE_XOR},
+ {SN_get_IsSupported}
};
static guint16 sse2_methods [] = {
SN_Add,
SN_AddSaturate,
+ SN_AddScalar,
SN_And,
SN_AndNot,
+ SN_Average,
SN_CompareEqual,
SN_CompareGreaterThan,
SN_CompareLessThan,
SN_CompareNotEqual,
+ SN_CompareScalarEqual,
+ SN_ConvertScalarToVector128Double,
SN_ConvertScalarToVector128Int32,
SN_ConvertScalarToVector128Int64,
SN_ConvertScalarToVector128UInt32,
SN_ConvertScalarToVector128UInt64,
+ SN_ConvertToInt64,
+ SN_ConvertToInt64WithTruncation,
SN_ConvertToUInt32,
SN_ConvertToUInt64,
SN_LoadAlignedVector128,
gboolean supported, is_64bit;
MonoClass *klass = cmethod->klass;
MonoTypeEnum arg0_type = fsig->param_count > 0 ? get_underlying_type (fsig->params [0]) : MONO_TYPE_VOID;
+ SimdIntrinsic *info;
+ gboolean is_corlib = m_class_get_image (cfg->method->klass) == mono_get_corlib ();
if (is_hw_intrinsics_class (klass, "Sse", &is_64bit)) {
if (!COMPILE_LLVM (cfg))
return NULL;
- id = lookup_intrins (sse_methods, sizeof (sse_methods), cmethod);
- if (id == -1)
+ info = lookup_intrins_info (sse_methods, sizeof (sse_methods), cmethod);
+ if (!info)
return NULL;
+ int id = info->id;
+
+ supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE) != 0;
+
+ /* Common case */
+ if (info->op != 0)
+ return emit_simd_ins_for_sig (cfg, klass, info->op, info->instc0, arg0_type, fsig, args);
- supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE) != 0 &&
- m_class_get_image (cfg->method->klass) == mono_get_corlib (); // We only support the subset used by corelib
-
switch (id) {
case SN_get_IsSupported:
EMIT_NEW_ICONST (cfg, ins, supported ? 1 : 0);
ins->type = STACK_I4;
return ins;
- case SN_LoadAlignedVector128:
- return emit_simd_ins_for_sig (cfg, klass, OP_SSE_LOADU, 16 /*alignment*/, arg0_type, fsig, args);
- case SN_LoadVector128:
- return emit_simd_ins_for_sig (cfg, klass, OP_SSE_LOADU, 1 /*alignment*/, arg0_type, fsig, args);
- case SN_MoveMask:
- return emit_simd_ins_for_sig (cfg, klass, OP_SSE_MOVMSK, -1, arg0_type, fsig, args);
- case SN_MoveScalar:
- return emit_simd_ins_for_sig (cfg, klass, OP_SSE_MOVS2, -1, arg0_type, fsig, args);
- case SN_CompareNotEqual:
- return emit_simd_ins_for_sig (cfg, klass, OP_XCOMPARE_FP, CMP_NE, arg0_type, fsig, args);
- case SN_CompareEqual:
- return emit_simd_ins_for_sig (cfg, klass, OP_XCOMPARE_FP, CMP_EQ, arg0_type, fsig, args);
- case SN_And:
- return emit_simd_ins_for_sig (cfg, klass, OP_SSE_AND, -1, arg0_type, fsig, args);
- case SN_AndNot:
- return emit_simd_ins_for_sig (cfg, klass, OP_SSE_ANDN, -1, arg0_type, fsig, args);
- case SN_Or:
- return emit_simd_ins_for_sig (cfg, klass, OP_SSE_OR, -1, arg0_type, fsig, args);
- case SN_Xor:
- return emit_simd_ins_for_sig (cfg, klass, OP_SSE_XOR, -1, arg0_type, fsig, args);
- case SN_Multiply:
- return emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, OP_FMUL, arg0_type, fsig, args);
- case SN_Divide:
- return emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, OP_FDIV, arg0_type, fsig, args);
- case SN_Add:
- return emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, OP_FADD, arg0_type, fsig, args);
- case SN_Subtract:
- return emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, OP_FSUB, arg0_type, fsig, args);
case SN_Shuffle: {
if (args [2]->opcode != OP_ICONST) {
mono_cfg_set_exception (cfg, MONO_EXCEPTION_MONO_ERROR);
}
return emit_simd_ins_for_sig (cfg, klass, OP_SSE_SHUFFLE, args [2]->inst_c0 /*mask*/, arg0_type, fsig, args);
}
- case SN_MoveHighToLow:
- return emit_simd_ins_for_sig (cfg, klass, OP_SSE_MOVEHL, -1, arg0_type, fsig, args);
- case SN_MoveLowToHigh:
- return emit_simd_ins_for_sig (cfg, klass, OP_SSE_MOVELH, -1, arg0_type, fsig, args);
- case SN_UnpackLow:
- return emit_simd_ins_for_sig (cfg, klass, OP_SSE_UNPACKLO, -1, arg0_type, fsig, args);
- case SN_UnpackHigh:
- return emit_simd_ins_for_sig (cfg, klass, OP_SSE_UNPACKHI, -1, arg0_type, fsig, args);
- case SN_Store:
- return emit_simd_ins_for_sig (cfg, klass, OP_SSE_STORE, 1 /*alignment*/, arg0_type, fsig, args);
- case SN_StoreAligned:
- return emit_simd_ins_for_sig (cfg, klass, OP_SSE_STORE, 16 /*alignment*/, arg0_type, fsig, args);
+ case SN_ConvertScalarToVector128Single:
+ if (fsig->params [1]->type == MONO_TYPE_I4)
+ return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_I4, SIMD_OP_SSE_CVTSI2SS, 0, fsig, args);
+ else if (fsig->params [1]->type == MONO_TYPE_I8)
+ return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_I8, SIMD_OP_SSE_CVTSI2SS64, 0, fsig, args);
+ else
+ g_assert_not_reached ();
+ break;
+ case SN_ReciprocalScalar:
+ case SN_ReciprocalSqrtScalar:
+ case SN_SqrtScalar:
+ if (fsig->param_count == 1)
+ return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X, info->instc0, arg0_type, fsig, args);
+ else
+ return NULL;
+ case SN_LoadScalarVector128:
+ return NULL;
default:
return NULL;
}
if (id == -1)
return NULL;
- supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE2) != 0 &&
- m_class_get_image (cfg->method->klass) == mono_get_corlib (); // We only support the subset used by corelib
+	supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE2) != 0 && is_corlib; // We only support the subset used by corelib
switch (id) {
case SN_get_IsSupported: {
return emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, arg0_type == MONO_TYPE_R8 ? OP_FADD : OP_IADD, arg0_type, fsig, args);
case SN_AddSaturate:
return emit_simd_ins_for_sig (cfg, klass, OP_SSE2_ADDS, -1, arg0_type, fsig, args);
+ case SN_AddScalar:
+ return emit_simd_ins_for_sig (cfg, klass, OP_SSE2_ADDSD, -1, arg0_type, fsig, args);
case SN_And:
return emit_simd_ins_for_sig (cfg, klass, OP_SSE_AND, -1, arg0_type, fsig, args);
case SN_AndNot:
return emit_simd_ins_for_sig (cfg, klass, OP_SSE_ANDN, -1, arg0_type, fsig, args);
+ case SN_Average:
+ if (arg0_type == MONO_TYPE_U1)
+ return emit_simd_ins_for_sig (cfg, klass, OP_PAVGB_UN, -1, arg0_type, fsig, args);
+ else if (arg0_type == MONO_TYPE_U2)
+ return emit_simd_ins_for_sig (cfg, klass, OP_PAVGW_UN, -1, arg0_type, fsig, args);
+ else
+ return NULL;
case SN_CompareNotEqual:
return emit_simd_ins_for_sig (cfg, klass, arg0_type == MONO_TYPE_R8 ? OP_XCOMPARE_FP : OP_XCOMPARE, CMP_NE, arg0_type, fsig, args);
case SN_CompareEqual:
return emit_simd_ins_for_sig (cfg, klass, arg0_type == MONO_TYPE_R8 ? OP_XCOMPARE_FP : OP_XCOMPARE, CMP_GT, arg0_type, fsig, args);
case SN_CompareLessThan:
return emit_simd_ins_for_sig (cfg, klass, arg0_type == MONO_TYPE_R8 ? OP_XCOMPARE_FP : OP_XCOMPARE, CMP_LT, arg0_type, fsig, args);
+ case SN_CompareScalarEqual:
+ return emit_simd_ins_for_sig (cfg, klass, OP_SSE2_CMPSD, CMP_EQ, arg0_type, fsig, args);
case SN_ConvertScalarToVector128Int32:
case SN_ConvertScalarToVector128Int64:
case SN_ConvertScalarToVector128UInt32:
if (id == -1)
return NULL;
- supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE3) != 0 &&
- m_class_get_image (cfg->method->klass) == mono_get_corlib (); // We only support the subset used by corelib
+ supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE3) != 0 && is_corlib; // We only support the subset used by corelib
switch (id) {
case SN_get_IsSupported:
if (id == -1)
return NULL;
- supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSSE3) != 0 &&
- m_class_get_image (cfg->method->klass) == mono_get_corlib (); // We only support the subset used by corelib
+ supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSSE3) != 0 && is_corlib; // We only support the subset used by corelib
switch (id) {
case SN_get_IsSupported:
if (id == -1)
return NULL;
- supported = COMPILE_LLVM (cfg) && (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE41) != 0 &&
- m_class_get_image (cfg->method->klass) == mono_get_corlib (); // We only support the subset used by corelib
+ supported = COMPILE_LLVM (cfg) && (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE41) != 0 && is_corlib; // We only support the subset used by corelib
switch (id) {
case SN_get_IsSupported:
#ifdef TARGET_AMD64 // TODO: test and enable for x86 too
if (!strcmp (class_ns, "System.Runtime.Intrinsics.X86")) {
- return emit_x86_intrinsics (cfg ,cmethod, fsig, args);
+		MonoInst *ins = emit_x86_intrinsics (cfg, cmethod, fsig, args);
+ return ins;
}
#endif
METHOD(LessThanOrEqual)
METHOD(Min)
METHOD(Max)
+METHOD(MinScalar)
+METHOD(MaxScalar)
METHOD(PopCount)
METHOD(LeadingZeroCount)
METHOD(get_Count)
METHOD(op_Subtraction)
// Vector
METHOD(ConvertToInt32)
+METHOD(ConvertToInt32WithTruncation)
METHOD(ConvertToUInt32)
METHOD(ConvertToInt64)
+METHOD(ConvertToInt64WithTruncation)
METHOD(ConvertToUInt64)
METHOD(ConvertToSingle)
METHOD(ConvertToDouble)
METHOD(ParallelBitExtract)
// Sse
METHOD(Add)
+METHOD(CompareGreaterThanOrEqual)
+METHOD(CompareLessThanOrEqual)
METHOD(CompareNotEqual)
+METHOD(CompareNotGreaterThan)
+METHOD(CompareNotGreaterThanOrEqual)
+METHOD(CompareNotLessThan)
+METHOD(CompareNotLessThanOrEqual)
+METHOD(CompareScalarGreaterThan)
+METHOD(CompareScalarGreaterThanOrEqual)
+METHOD(CompareScalarLessThan)
+METHOD(CompareScalarLessThanOrEqual)
+METHOD(CompareScalarNotEqual)
+METHOD(CompareScalarNotGreaterThan)
+METHOD(CompareScalarNotGreaterThanOrEqual)
+METHOD(CompareScalarNotLessThan)
+METHOD(CompareScalarNotLessThanOrEqual)
+METHOD(CompareScalarOrderedEqual)
+METHOD(CompareScalarOrderedGreaterThan)
+METHOD(CompareScalarOrderedGreaterThanOrEqual)
+METHOD(CompareScalarOrderedLessThan)
+METHOD(CompareScalarOrderedLessThanOrEqual)
+METHOD(CompareScalarOrderedNotEqual)
+METHOD(CompareScalarUnorderedEqual)
+METHOD(CompareScalarUnorderedGreaterThan)
+METHOD(CompareScalarUnorderedGreaterThanOrEqual)
+METHOD(CompareScalarUnorderedLessThan)
+METHOD(CompareScalarUnorderedLessThanOrEqual)
+METHOD(CompareScalarUnorderedNotEqual)
+METHOD(CompareOrdered)
+METHOD(CompareUnordered)
+METHOD(CompareScalarOrdered)
+METHOD(CompareScalarUnordered)
+METHOD(ConvertScalarToVector128Single)
METHOD(Divide)
+METHOD(DivideScalar)
METHOD(Store)
+METHOD(StoreFence)
+METHOD(StoreHigh)
+METHOD(StoreLow)
METHOD(Subtract)
+METHOD(SubtractScalar)
METHOD(CompareEqual)
+METHOD(LoadHigh)
+METHOD(LoadLow)
METHOD(LoadVector128)
+METHOD(LoadScalarVector128)
METHOD(MoveHighToLow)
METHOD(MoveLowToHigh)
METHOD(MoveMask)
METHOD(MoveScalar)
METHOD(Multiply)
+METHOD(MultiplyScalar)
METHOD(Shuffle)
METHOD(UnpackHigh)
METHOD(UnpackLow)
+METHOD(Prefetch0)
+METHOD(Prefetch1)
+METHOD(Prefetch2)
+METHOD(PrefetchNonTemporal)
+METHOD(Reciprocal)
+METHOD(ReciprocalScalar)
+METHOD(ReciprocalSqrt)
+METHOD(ReciprocalSqrtScalar)
+METHOD(Sqrt)
+METHOD(SqrtScalar)
// Sse2
METHOD(AddSaturate)
+METHOD(AddScalar)
METHOD(And)
+METHOD(Average)
METHOD(Or)
METHOD(LoadAlignedVector128)
METHOD(Xor)
METHOD(CompareGreaterThan)
+METHOD(CompareScalarEqual)
+METHOD(ConvertScalarToVector128Double)
METHOD(ConvertScalarToVector128Int32)
METHOD(ConvertScalarToVector128Int64)
METHOD(ConvertScalarToVector128UInt32)
METHOD(PackUnsignedSaturate)
METHOD(StoreScalar)
METHOD(StoreAligned)
+METHOD(StoreAlignedNonTemporal)
METHOD(ShiftRightLogical)
METHOD(CompareLessThan)
// Sse3
METHOD(MoveAndDuplicate)
// Sse41
METHOD(Insert)
-METHOD(TestZ)
\ No newline at end of file
+METHOD(TestZ)