From 81fe6bed00df6986e2387f5cfb68696d4597b9aa Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Tue, 7 Apr 2020 04:25:11 +0300 Subject: [PATCH] [mono] Implement Arm intrinsics: ArmBase, Crc32 (#34240) * Implement Arm.ArmBase and Arm.Crc32 intrinsics --- src/mono/mono/mini/aot-compiler.c | 5 +- src/mono/mono/mini/llvm-intrinsics.h | 12 ++ src/mono/mono/mini/mini-arm64.c | 14 +++ src/mono/mono/mini/mini-arm64.h | 9 ++ src/mono/mono/mini/mini-llvm.c | 93 ++++++++++++++-- src/mono/mono/mini/mini-ops.h | 100 +++++++++-------- src/mono/mono/mini/mini.c | 5 + src/mono/mono/mini/mini.h | 21 +++- src/mono/mono/mini/simd-intrinsics-netcore.c | 157 ++++++++++++++++++++++----- src/mono/mono/mini/simd-methods-netcore.h | 6 + 10 files changed, 340 insertions(+), 82 deletions(-) diff --git a/src/mono/mono/mini/aot-compiler.c b/src/mono/mono/mini/aot-compiler.c index a83aa8b..bab6b97 100644 --- a/src/mono/mono/mini/aot-compiler.c +++ b/src/mono/mono/mini/aot-compiler.c @@ -8122,7 +8122,10 @@ parse_cpu_features (const gchar *attr) feature = (MonoCPUFeatures) (MONO_CPU_X86_FULL_SSEAVX_COMBINED & ~feature); #elif defined(TARGET_ARM64) - // TODO: neon, sha1, sha2, asimd, etc... 
+ if (!strcmp (attr + prefix, "base")) + feature = MONO_CPU_ARM64_BASE; + else if (!strcmp (attr + prefix, "crc")) + feature = MONO_CPU_ARM64_CRC; #elif defined(TARGET_WASM) if (!strcmp (attr + prefix, "simd")) feature = MONO_CPU_WASM_SIMD; diff --git a/src/mono/mono/mini/llvm-intrinsics.h b/src/mono/mono/mini/llvm-intrinsics.h index a0cc84e..ea79ba2 100644 --- a/src/mono/mono/mini/llvm-intrinsics.h +++ b/src/mono/mono/mini/llvm-intrinsics.h @@ -226,6 +226,18 @@ INTRINS_OVR(WASM_ANYTRUE_V8, wasm_anytrue) INTRINS_OVR(WASM_ANYTRUE_V4, wasm_anytrue) INTRINS_OVR(WASM_ANYTRUE_V2, wasm_anytrue) #endif +#if defined(TARGET_ARM64) +INTRINS_OVR(BITREVERSE_I32, bitreverse) +INTRINS_OVR(BITREVERSE_I64, bitreverse) +INTRINS(AARCH64_CRC32B, aarch64_crc32b) +INTRINS(AARCH64_CRC32H, aarch64_crc32h) +INTRINS(AARCH64_CRC32W, aarch64_crc32w) +INTRINS(AARCH64_CRC32X, aarch64_crc32x) +INTRINS(AARCH64_CRC32CB, aarch64_crc32cb) +INTRINS(AARCH64_CRC32CH, aarch64_crc32ch) +INTRINS(AARCH64_CRC32CW, aarch64_crc32cw) +INTRINS(AARCH64_CRC32CX, aarch64_crc32cx) +#endif #undef INTRINS #undef INTRINS_OVR diff --git a/src/mono/mono/mini/mini-arm64.c b/src/mono/mono/mini/mini-arm64.c index aa019c5..8a8a5b0 100644 --- a/src/mono/mono/mini/mini-arm64.c +++ b/src/mono/mono/mini/mini-arm64.c @@ -86,6 +86,20 @@ mono_arch_fregname (int reg) return "unknown fp"; } +const char * +mono_arch_xregname (int reg) +{ + static const char * rnames[] = { + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "v31" + }; + if (reg >= 0 && reg < 32) + return rnames [reg]; + return "unknown"; +} + int mono_arch_get_argument_info (MonoMethodSignature *csig, int param_count, MonoJitArgumentInfo *arg_info) { diff --git a/src/mono/mono/mini/mini-arm64.h b/src/mono/mono/mini/mini-arm64.h index 381160e..41590fb 100644 --- a/src/mono/mono/mini/mini-arm64.h +++ 
b/src/mono/mono/mini/mini-arm64.h @@ -19,6 +19,11 @@ #define MONO_MAX_IREGS 32 #define MONO_MAX_FREGS 32 +#define MONO_MAX_XREGS 32 + +#if !defined(DISABLE_SIMD) && defined(ENABLE_NETCORE) +#define MONO_ARCH_SIMD_INTRINSICS 1 +#endif #define MONO_CONTEXT_SET_LLVM_EXC_REG(ctx, exc) do { (ctx)->regs [0] = (gsize)exc; } while (0) @@ -41,6 +46,10 @@ /* v8..v15 */ #define MONO_ARCH_CALLEE_SAVED_FREGS 0xff00 +#define MONO_ARCH_CALLEE_SAVED_XREGS 0 + +#define MONO_ARCH_CALLEE_XREGS MONO_ARCH_CALLEE_FREGS + #define MONO_ARCH_USE_FPSTACK FALSE #define MONO_ARCH_INST_SREG2_MASK(ins) (0) diff --git a/src/mono/mono/mini/mini-llvm.c b/src/mono/mono/mini/mini-llvm.c index 4392f33..25208f9 100644 --- a/src/mono/mono/mini/mini-llvm.c +++ b/src/mono/mono/mini/mini-llvm.c @@ -8678,14 +8678,6 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb) case OP_POPCNT64: values [ins->dreg] = call_intrins (ctx, INTRINS_CTPOP_I64, &lhs, ""); break; - case OP_LZCNT32: - case OP_LZCNT64: { - LLVMValueRef args [2]; - args [0] = lhs; - args [1] = LLVMConstInt (LLVMInt1Type (), 1, FALSE); - values [ins->dreg] = call_intrins (ctx, ins->opcode == OP_LZCNT32 ? INTRINS_CTLZ_I32 : INTRINS_CTLZ_I64, args, ""); - break; - } case OP_CTTZ32: case OP_CTTZ64: { LLVMValueRef args [2]; @@ -8745,7 +8737,78 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb) break; } #endif /* ENABLE_NETCORE */ -#endif /* SIMD */ +#endif /* defined(TARGET_X86) || defined(TARGET_AMD64) */ + +// Shared between ARM64 and X86 +#if defined(ENABLE_NETCORE) && (defined(TARGET_ARM64) || defined(TARGET_X86) || defined(TARGET_AMD64)) + case OP_LZCNT32: + case OP_LZCNT64: { + LLVMValueRef args [2]; + args [0] = lhs; + args [1] = LLVMConstInt (LLVMInt1Type (), 0, FALSE); /* is_zero_undef = false: a zero input must yield the bit width, not undef */ + values [ins->dreg] = LLVMBuildCall (builder, get_intrins (ctx, ins->opcode == OP_LZCNT32 ? 
INTRINS_CTLZ_I32 : INTRINS_CTLZ_I64), args, 2, ""); + break; + } +#endif + +#if defined(ENABLE_NETCORE) && defined(TARGET_ARM64) + case OP_XOP_I4_I4: + case OP_XOP_I8_I8: { + IntrinsicId id = (IntrinsicId)0; + switch (ins->inst_c0) { + case SIMD_OP_ARM64_RBIT32: id = INTRINS_BITREVERSE_I32; break; + case SIMD_OP_ARM64_RBIT64: id = INTRINS_BITREVERSE_I64; break; + default: g_assert_not_reached (); break; + } + values [ins->dreg] = call_intrins (ctx, id, &lhs, ""); + break; + } + case OP_XOP_I4_I4_I4: + case OP_XOP_I4_I4_I8: { + IntrinsicId id = (IntrinsicId)0; + gboolean zext_last = FALSE; + switch (ins->inst_c0) { + case SIMD_OP_ARM64_CRC32B: id = INTRINS_AARCH64_CRC32B; zext_last = TRUE; break; + case SIMD_OP_ARM64_CRC32H: id = INTRINS_AARCH64_CRC32H; zext_last = TRUE; break; + case SIMD_OP_ARM64_CRC32W: id = INTRINS_AARCH64_CRC32W; zext_last = TRUE; break; + case SIMD_OP_ARM64_CRC32X: id = INTRINS_AARCH64_CRC32X; break; + case SIMD_OP_ARM64_CRC32CB: id = INTRINS_AARCH64_CRC32CB; zext_last = TRUE; break; + case SIMD_OP_ARM64_CRC32CH: id = INTRINS_AARCH64_CRC32CH; zext_last = TRUE; break; + case SIMD_OP_ARM64_CRC32CW: id = INTRINS_AARCH64_CRC32CW; zext_last = TRUE; break; + case SIMD_OP_ARM64_CRC32CX: id = INTRINS_AARCH64_CRC32CX; break; + default: g_assert_not_reached (); break; + } + LLVMValueRef arg1 = rhs; + if (zext_last) + arg1 = LLVMBuildZExt (ctx->builder, arg1, LLVMInt32Type (), ""); + LLVMValueRef args [] = { lhs, arg1 }; + values [ins->dreg] = call_intrins (ctx, id, args, ""); + break; + } + case OP_LSCNT32: + case OP_LSCNT64: { + // %shr = ashr i32 %x, 31 + // %xor = xor i32 %shr, %x + // %mul = shl i32 %xor, 1 + // %add = or i32 %mul, 1 + // %0 = tail call i32 @llvm.ctlz.i32(i32 %add, i1 false) + LLVMValueRef shr = LLVMBuildAShr (builder, lhs, ins->opcode == OP_LSCNT32 ? + LLVMConstInt (LLVMInt32Type (), 31, FALSE) : + LLVMConstInt (LLVMInt64Type (), 63, FALSE), ""); + LLVMValueRef one = ins->opcode == OP_LSCNT32 ? 
+ LLVMConstInt (LLVMInt32Type (), 1, FALSE) : + LLVMConstInt (LLVMInt64Type (), 1, FALSE); + LLVMValueRef xor = LLVMBuildXor (builder, shr, lhs, ""); + LLVMValueRef mul = LLVMBuildShl (builder, xor, one, ""); + LLVMValueRef add = LLVMBuildOr (builder, mul, one, ""); + + LLVMValueRef args [2]; + args [0] = add; + args [1] = LLVMConstInt (LLVMInt1Type (), 0, FALSE); + values [ins->dreg] = LLVMBuildCall (builder, get_intrins (ctx, ins->opcode == OP_LSCNT32 ? INTRINS_CTLZ_I32 : INTRINS_CTLZ_I64), args, 2, ""); + break; + } +#endif case OP_DUMMY_USE: break; @@ -10151,6 +10214,14 @@ add_intrinsic (LLVMModuleRef module, int id) intrins = add_intrins1 (module, id, sse_i8_t); break; #endif +#ifdef TARGET_ARM64 + case INTRINS_BITREVERSE_I32: + intrins = add_intrins1 (module, id, LLVMInt32Type ()); + break; + case INTRINS_BITREVERSE_I64: + intrins = add_intrins1 (module, id, LLVMInt64Type ()); + break; +#endif default: g_assert_not_reached (); break; @@ -11532,8 +11603,12 @@ MonoCPUFeatures mono_llvm_get_cpu_features (void) { "bmi", MONO_CPU_X86_BMI1 }, { "bmi2", MONO_CPU_X86_BMI2 }, #endif +#if defined(TARGET_ARM64) + { "crc", MONO_CPU_ARM64_CRC }, +#endif }; if (!cpu_features) cpu_features = MONO_CPU_INITED | (MonoCPUFeatures)mono_llvm_check_cpu_features (flags_map, G_N_ELEMENTS (flags_map)); + return cpu_features; } diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h index 95e50fd..a7279a1 100644 --- a/src/mono/mono/mini/mini-ops.h +++ b/src/mono/mono/mini/mini-ops.h @@ -785,6 +785,53 @@ MINI_OP(OP_NOT_NULL, "not_null", NONE, IREG, NONE) /* SIMD opcodes. 
*/ +#if defined(TARGET_X86) || defined(TARGET_AMD64) || defined(TARGET_WASM) || defined(TARGET_ARM64) + +MINI_OP(OP_EXTRACT_I4, "extract_i4", IREG, XREG, NONE) +MINI_OP(OP_ICONV_TO_R4_RAW, "iconv_to_r4_raw", FREG, IREG, NONE) + +MINI_OP(OP_EXTRACT_I2, "extract_i2", IREG, XREG, NONE) +MINI_OP(OP_EXTRACT_U2, "extract_u2", IREG, XREG, NONE) +MINI_OP(OP_EXTRACT_I1, "extract_i1", IREG, XREG, NONE) +MINI_OP(OP_EXTRACT_U1, "extract_u1", IREG, XREG, NONE) +MINI_OP(OP_EXTRACT_R4, "extract_r4", FREG, XREG, NONE) +MINI_OP(OP_EXTRACT_R8, "extract_r8", FREG, XREG, NONE) +MINI_OP(OP_EXTRACT_I8, "extract_i8", LREG, XREG, NONE) + +/* Used by LLVM */ +MINI_OP(OP_INSERT_I1, "insert_i1", XREG, XREG, IREG) +MINI_OP(OP_INSERT_I2, "insert_i2", XREG, XREG, IREG) +MINI_OP(OP_INSERT_I4, "insert_i4", XREG, XREG, IREG) +MINI_OP(OP_INSERT_I8, "insert_i8", XREG, XREG, LREG) +MINI_OP(OP_INSERT_R4, "insert_r4", XREG, XREG, FREG) +MINI_OP(OP_INSERT_R8, "insert_r8", XREG, XREG, FREG) + +MINI_OP(OP_EXTRACTX_U2, "extractx_u2", IREG, XREG, NONE) + +/*these slow ops are modeled around the availability of a fast 2 bytes insert op*/ +/*insertx_u1_slow takes old value and new value as source regs */ +MINI_OP(OP_INSERTX_U1_SLOW, "insertx_u1_slow", XREG, IREG, IREG) +/*insertx_i4_slow takes target xreg and new value as source regs */ +MINI_OP(OP_INSERTX_I4_SLOW, "insertx_i4_slow", XREG, XREG, IREG) + +MINI_OP(OP_INSERTX_R4_SLOW, "insertx_r4_slow", XREG, XREG, FREG) +MINI_OP(OP_INSERTX_R8_SLOW, "insertx_r8_slow", XREG, XREG, FREG) +MINI_OP(OP_INSERTX_I8_SLOW, "insertx_i8_slow", XREG, XREG, LREG) + +MINI_OP(OP_FCONV_TO_R4_X, "fconv_to_r4_x", XREG, FREG, NONE) +MINI_OP(OP_FCONV_TO_R8_X, "fconv_to_r8_x", XREG, FREG, NONE) +MINI_OP(OP_XCONV_R8_TO_I4, "xconv_r8_to_i4", IREG, XREG, NONE) +MINI_OP(OP_ICONV_TO_X, "iconv_to_x", XREG, IREG, NONE) + +MINI_OP(OP_EXPAND_I1, "expand_i1", XREG, IREG, NONE) +MINI_OP(OP_EXPAND_I2, "expand_i2", XREG, IREG, NONE) +MINI_OP(OP_EXPAND_I4, "expand_i4", XREG, IREG, NONE) 
+MINI_OP(OP_EXPAND_R4, "expand_r4", XREG, FREG, NONE) +MINI_OP(OP_EXPAND_I8, "expand_i8", XREG, IREG, NONE) +MINI_OP(OP_EXPAND_R8, "expand_r8", XREG, FREG, NONE) + +#endif + #if defined(TARGET_X86) || defined(TARGET_AMD64) || defined(TARGET_WASM) MINI_OP(OP_ADDPS, "addps", XREG, XREG, XREG) @@ -945,49 +992,6 @@ MINI_OP(OP_PSHLD_REG, "pshld_reg", XREG, XREG, XREG) MINI_OP(OP_PSHLQ, "pshlq", XREG, XREG, NONE) MINI_OP(OP_PSHLQ_REG, "pshlq_reg", XREG, XREG, XREG) -MINI_OP(OP_EXTRACT_I4, "extract_i4", IREG, XREG, NONE) -MINI_OP(OP_ICONV_TO_R4_RAW, "iconv_to_r4_raw", FREG, IREG, NONE) - -MINI_OP(OP_EXTRACT_I2, "extract_i2", IREG, XREG, NONE) -MINI_OP(OP_EXTRACT_U2, "extract_u2", IREG, XREG, NONE) -MINI_OP(OP_EXTRACT_I1, "extract_i1", IREG, XREG, NONE) -MINI_OP(OP_EXTRACT_U1, "extract_u1", IREG, XREG, NONE) -MINI_OP(OP_EXTRACT_R4, "extract_r4", FREG, XREG, NONE) -MINI_OP(OP_EXTRACT_R8, "extract_r8", FREG, XREG, NONE) -MINI_OP(OP_EXTRACT_I8, "extract_i8", LREG, XREG, NONE) - -/* Used by LLVM */ -MINI_OP(OP_INSERT_I1, "insert_i1", XREG, XREG, IREG) -MINI_OP(OP_INSERT_I2, "insert_i2", XREG, XREG, IREG) -MINI_OP(OP_INSERT_I4, "insert_i4", XREG, XREG, IREG) -MINI_OP(OP_INSERT_I8, "insert_i8", XREG, XREG, LREG) -MINI_OP(OP_INSERT_R4, "insert_r4", XREG, XREG, FREG) -MINI_OP(OP_INSERT_R8, "insert_r8", XREG, XREG, FREG) - -MINI_OP(OP_EXTRACTX_U2, "extractx_u2", IREG, XREG, NONE) - -/*these slow ops are modeled around the availability of a fast 2 bytes insert op*/ -/*insertx_u1_slow takes old value and new value as source regs */ -MINI_OP(OP_INSERTX_U1_SLOW, "insertx_u1_slow", XREG, IREG, IREG) -/*insertx_i4_slow takes target xreg and new value as source regs */ -MINI_OP(OP_INSERTX_I4_SLOW, "insertx_i4_slow", XREG, XREG, IREG) - -MINI_OP(OP_INSERTX_R4_SLOW, "insertx_r4_slow", XREG, XREG, FREG) -MINI_OP(OP_INSERTX_R8_SLOW, "insertx_r8_slow", XREG, XREG, FREG) -MINI_OP(OP_INSERTX_I8_SLOW, "insertx_i8_slow", XREG, XREG, LREG) - -MINI_OP(OP_FCONV_TO_R4_X, "fconv_to_r4_x", XREG, FREG, 
NONE) -MINI_OP(OP_FCONV_TO_R8_X, "fconv_to_r8_x", XREG, FREG, NONE) -MINI_OP(OP_XCONV_R8_TO_I4, "xconv_r8_to_i4", IREG, XREG, NONE) -MINI_OP(OP_ICONV_TO_X, "iconv_to_x", XREG, IREG, NONE) - -MINI_OP(OP_EXPAND_I1, "expand_i1", XREG, IREG, NONE) -MINI_OP(OP_EXPAND_I2, "expand_i2", XREG, IREG, NONE) -MINI_OP(OP_EXPAND_I4, "expand_i4", XREG, IREG, NONE) -MINI_OP(OP_EXPAND_R4, "expand_r4", XREG, FREG, NONE) -MINI_OP(OP_EXPAND_I8, "expand_i8", XREG, IREG, NONE) -MINI_OP(OP_EXPAND_R8, "expand_r8", XREG, FREG, NONE) - MINI_OP(OP_PREFETCH_MEMBASE, "prefetch_membase", NONE, IREG, NONE) MINI_OP(OP_CVTDQ2PD, "cvtdq2pd", XREG, XREG, NONE) @@ -1522,6 +1526,11 @@ MINI_OP(OP_XOP_I8_X, "xop_i8_x", LREG, XREG, NONE) MINI_OP(OP_XOP_X_X_X, "xop_x_x_x", XREG, XREG, XREG) MINI_OP(OP_XOP_X_X_I4, "xop_x_x_i4", XREG, XREG, IREG) MINI_OP(OP_XOP_X_X_I8, "xop_x_x_i8", XREG, XREG, LREG) +MINI_OP(OP_XOP_I4_I8, "xop_i4_i8", IREG, LREG, NONE) +MINI_OP(OP_XOP_I8_I8, "xop_i8_i8", LREG, LREG, NONE) +MINI_OP(OP_XOP_I4_I4, "xop_i4_i4", IREG, IREG, NONE) +MINI_OP(OP_XOP_I4_I4_I4, "xop_i4_i4_i4", IREG, IREG, IREG) +MINI_OP(OP_XOP_I4_I4_I8, "xop_i4_i4_i8", IREG, IREG, LREG) MINI_OP(OP_XCAST, "xcast", XREG, XREG, NONE) /* Extract element of vector */ @@ -1546,3 +1555,8 @@ MINI_OP(OP_LZCNT32, "lzcnt32", IREG, IREG, NONE) MINI_OP(OP_LZCNT64, "lzcnt64", LREG, LREG, NONE) MINI_OP(OP_POPCNT32, "popcnt32", IREG, IREG, NONE) MINI_OP(OP_POPCNT64, "popcnt64", LREG, LREG, NONE) + +#ifdef TARGET_ARM64 +MINI_OP(OP_LSCNT32, "lscnt32", IREG, IREG, NONE) +MINI_OP(OP_LSCNT64, "lscnt64", LREG, LREG, NONE) +#endif // TARGET_ARM64 diff --git a/src/mono/mono/mini/mini.c b/src/mono/mono/mini/mini.c index 8fb697c..12b5b78 100644 --- a/src/mono/mono/mini/mini.c +++ b/src/mono/mono/mini/mini.c @@ -4353,6 +4353,11 @@ mini_get_cpu_features (MonoCompile* cfg) } #endif +#if defined(TARGET_ARM64) + // All Arm64 devices have this set + features |= MONO_CPU_ARM64_BASE; +#endif + // apply parameters passed via -mattr return (features | 
mono_cpu_features_enabled) & ~mono_cpu_features_disabled; } diff --git a/src/mono/mono/mini/mini.h b/src/mono/mono/mini/mini.h index b9321bc..e5c36f5 100644 --- a/src/mono/mono/mini/mini.h +++ b/src/mono/mono/mini/mini.h @@ -313,7 +313,12 @@ enum { #define MONO_IS_REAL_MOVE(ins) (((ins)->opcode == OP_MOVE) || ((ins)->opcode == OP_FMOVE) || ((ins)->opcode == OP_XMOVE) || ((ins)->opcode == OP_RMOVE)) #define MONO_IS_ZERO(ins) (((ins)->opcode == OP_VZERO) || ((ins)->opcode == OP_XZERO)) +#ifdef TARGET_ARM64 +// FIXME: enable for Arm64 +#define MONO_CLASS_IS_SIMD(cfg, klass) (0) +#else #define MONO_CLASS_IS_SIMD(cfg, klass) (((cfg)->opt & MONO_OPT_SIMD) && m_class_is_simd_type (klass)) +#endif #else @@ -2844,6 +2849,10 @@ typedef enum { #ifdef TARGET_WASM MONO_CPU_WASM_SIMD = 1 << 1, #endif +#ifdef TARGET_ARM64 + MONO_CPU_ARM64_BASE = 1 << 1, + MONO_CPU_ARM64_CRC = 1 << 2, +#endif } MonoCPUFeatures; G_ENUM_FUNCTIONS (MonoCPUFeatures) @@ -2938,7 +2947,17 @@ typedef enum { SIMD_OP_SSE_PSIGND, SIMD_OP_SSE_PMADDUBSW, SIMD_OP_SSE_PMULHRSW, - SIMD_OP_SSE_LDDQU + SIMD_OP_SSE_LDDQU, + SIMD_OP_ARM64_CRC32B, + SIMD_OP_ARM64_CRC32H, + SIMD_OP_ARM64_CRC32W, + SIMD_OP_ARM64_CRC32X, + SIMD_OP_ARM64_CRC32CB, + SIMD_OP_ARM64_CRC32CH, + SIMD_OP_ARM64_CRC32CW, + SIMD_OP_ARM64_CRC32CX, + SIMD_OP_ARM64_RBIT32, + SIMD_OP_ARM64_RBIT64 } SimdOp; const char *mono_arch_xregname (int reg); diff --git a/src/mono/mono/mini/simd-intrinsics-netcore.c b/src/mono/mono/mini/simd-intrinsics-netcore.c index 6bb6eb2..a1acfc7 100644 --- a/src/mono/mono/mini/simd-intrinsics-netcore.c +++ b/src/mono/mono/mini/simd-intrinsics-netcore.c @@ -129,31 +129,6 @@ lookup_intrins_info (SimdIntrinsic *intrinsics, int size, MonoMethod *cmethod) return (SimdIntrinsic *)mono_binary_search (cmethod->name, intrinsics, size / sizeof (SimdIntrinsic), sizeof (SimdIntrinsic), &simd_intrinsic_info_compare_by_name); } -static int -type_to_expand_op (MonoType *type) -{ - switch (type->type) { - case MONO_TYPE_I1: - case 
MONO_TYPE_U1: - return OP_EXPAND_I1; - case MONO_TYPE_I2: - case MONO_TYPE_U2: - return OP_EXPAND_I2; - case MONO_TYPE_I4: - case MONO_TYPE_U4: - return OP_EXPAND_I4; - case MONO_TYPE_I8: - case MONO_TYPE_U8: - return OP_EXPAND_I8; - case MONO_TYPE_R4: - return OP_EXPAND_R4; - case MONO_TYPE_R8: - return OP_EXPAND_R8; - default: - g_assert_not_reached (); - } -} - /* * Return a simd vreg for the simd value represented by SRC. * SRC is the 'this' argument to methods. @@ -291,6 +266,33 @@ get_vector_t_elem_type (MonoType *vector_type) return etype; } +#ifdef TARGET_AMD64 + +static int +type_to_expand_op (MonoType *type) +{ + switch (type->type) { + case MONO_TYPE_I1: + case MONO_TYPE_U1: + return OP_EXPAND_I1; + case MONO_TYPE_I2: + case MONO_TYPE_U2: + return OP_EXPAND_I2; + case MONO_TYPE_I4: + case MONO_TYPE_U4: + return OP_EXPAND_I4; + case MONO_TYPE_I8: + case MONO_TYPE_U8: + return OP_EXPAND_I8; + case MONO_TYPE_R4: + return OP_EXPAND_R4; + case MONO_TYPE_R8: + return OP_EXPAND_R8; + default: + g_assert_not_reached (); + } +} + static guint16 vector_methods [] = { SN_ConvertToDouble, SN_ConvertToInt32, @@ -669,6 +671,95 @@ emit_sys_numerics_vector_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSig return NULL; } +#endif // TARGET_AMD64 + +#ifdef TARGET_ARM64 + +static SimdIntrinsic armbase_methods [] = { + {SN_LeadingSignCount}, + {SN_LeadingZeroCount}, + {SN_ReverseElementBits}, + {SN_get_IsSupported} +}; + +static SimdIntrinsic crc32_methods [] = { + {SN_ComputeCrc32}, + {SN_ComputeCrc32C}, + {SN_get_IsSupported} +}; + +static MonoInst* +emit_arm64_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args) +{ + // Arm64 intrinsics are LLVM-only + if (!COMPILE_LLVM (cfg)) + return NULL; + + MonoInst *ins; + gboolean supported, is_64bit; + MonoClass *klass = cmethod->klass; + MonoTypeEnum arg0_type = fsig->param_count > 0 ? 
get_underlying_type (fsig->params [0]) : MONO_TYPE_VOID; + gboolean arg0_i32 = (arg0_type == MONO_TYPE_I4) || (arg0_type == MONO_TYPE_U4); + SimdIntrinsic *info; + + if (is_hw_intrinsics_class (klass, "ArmBase", &is_64bit)) { + info = lookup_intrins_info (armbase_methods, sizeof (armbase_methods), cmethod); + if (!info) + return NULL; + + supported = (mini_get_cpu_features (cfg) & MONO_CPU_ARM64_BASE) != 0; + + switch (info->id) { + case SN_get_IsSupported: + EMIT_NEW_ICONST (cfg, ins, supported ? 1 : 0); + ins->type = STACK_I4; + return ins; + case SN_LeadingZeroCount: + return emit_simd_ins_for_sig (cfg, klass, arg0_i32 ? OP_LZCNT32 : OP_LZCNT64, 0, arg0_type, fsig, args); + case SN_LeadingSignCount: + return emit_simd_ins_for_sig (cfg, klass, arg0_i32 ? OP_LSCNT32 : OP_LSCNT64, 0, arg0_type, fsig, args); + case SN_ReverseElementBits: + return emit_simd_ins_for_sig (cfg, klass, + (is_64bit ? OP_XOP_I8_I8 : OP_XOP_I4_I4), + (is_64bit ? SIMD_OP_ARM64_RBIT64 : SIMD_OP_ARM64_RBIT32), + arg0_type, fsig, args); + default: + g_assert_not_reached (); // if a new API is added we need to either implement it or change IsSupported to false + } + } + + if (is_hw_intrinsics_class (klass, "Crc32", &is_64bit)) { + info = lookup_intrins_info (crc32_methods, sizeof (crc32_methods), cmethod); + if (!info) + return NULL; + + supported = (mini_get_cpu_features (cfg) & MONO_CPU_ARM64_CRC) != 0; + + switch (info->id) { + case SN_get_IsSupported: + EMIT_NEW_ICONST (cfg, ins, supported ? 1 : 0); + ins->type = STACK_I4; + return ins; + case SN_ComputeCrc32: + case SN_ComputeCrc32C: { + SimdOp op = (SimdOp)0; + gboolean is_c = info->id == SN_ComputeCrc32C; + switch (get_underlying_type (fsig->params [1])) { + case MONO_TYPE_U1: op = is_c ? SIMD_OP_ARM64_CRC32CB : SIMD_OP_ARM64_CRC32B; break; + case MONO_TYPE_U2: op = is_c ? SIMD_OP_ARM64_CRC32CH : SIMD_OP_ARM64_CRC32H; break; + case MONO_TYPE_U4: op = is_c ? 
SIMD_OP_ARM64_CRC32CW : SIMD_OP_ARM64_CRC32W; break; + case MONO_TYPE_U8: op = is_c ? SIMD_OP_ARM64_CRC32CX : SIMD_OP_ARM64_CRC32X; break; + default: g_assert_not_reached (); break; + } + return emit_simd_ins_for_sig (cfg, klass, is_64bit ? OP_XOP_I4_I4_I8 : OP_XOP_I4_I4_I4, op, arg0_type, fsig, args); + } + default: + g_assert_not_reached (); // if a new API is added we need to either implement it or change IsSupported to false + } + } + return NULL; +} +#endif // TARGET_ARM64 #ifdef TARGET_AMD64 @@ -1632,7 +1723,6 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature return NULL; } -#endif static guint16 vector_128_methods [] = { SN_AsByte, @@ -1792,6 +1882,8 @@ emit_vector256_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fs return NULL; } +#endif // TARGET_AMD64 + MonoInst* mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args) { @@ -1809,12 +1901,20 @@ mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign if (m_class_get_nested_in (cmethod->klass)) class_ns = m_class_get_name_space (m_class_get_nested_in (cmethod->klass)); +#ifdef TARGET_ARM64 + if (!strcmp (class_ns, "System.Runtime.Intrinsics.Arm")) { + MonoInst *ins = emit_arm64_intrinsics (cfg, cmethod, fsig, args); + return ins; + } +#endif // TARGET_ARM64 + #ifdef TARGET_AMD64 // TODO: test and enable for x86 too if (!strcmp (class_ns, "System.Runtime.Intrinsics.X86")) { - MonoInst *ins = emit_x86_intrinsics (cfg ,cmethod, fsig, args); + MonoInst *ins = emit_x86_intrinsics (cfg, cmethod, fsig, args); return ins; } -#endif + + // FIXME: implement Vector64, Vector128 and Vector for Arm64 if (!strcmp (class_ns, "System.Runtime.Intrinsics")) { if (!strcmp (class_name, "Vector128`1")) @@ -1831,6 +1931,7 @@ mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign if (!strcmp (class_name, "Vector`1")) return emit_sys_numerics_vector_t (cfg, cmethod, fsig, args); 
} +#endif // TARGET_AMD64 return NULL; } diff --git a/src/mono/mono/mini/simd-methods-netcore.h b/src/mono/mono/mini/simd-methods-netcore.h index c63a9cc..d8871d0 100644 --- a/src/mono/mono/mini/simd-methods-netcore.h +++ b/src/mono/mono/mini/simd-methods-netcore.h @@ -194,3 +194,9 @@ METHOD(Insert) METHOD(TestZ) // Sse42 METHOD(Crc32) +// ArmBase +METHOD(LeadingSignCount) +METHOD(ReverseElementBits) +// Crc32 +METHOD(ComputeCrc32) +METHOD(ComputeCrc32C) -- 2.7.4