From: Filip Navara Date: Sun, 8 Sep 2019 19:57:21 +0000 (+0200) Subject: [netcore][x64] Implement lowering of new SIMD OPs into SSE opcodes (mono/mono#16672) X-Git-Tag: submit/tizen/20210909.063632~10331^2~5^2~544 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=1eab0fec0c9e674ff14457bc33d4988ccc13e8ae;p=platform%2Fupstream%2Fdotnet%2Fruntime.git [netcore][x64] Implement lowering of new SIMD OPs into SSE opcodes (mono/mono#16672) * [netcore][x64] Implement lowering of new SIMD OPs into SSE2 or SSE4.1 opcodes. * Implement LZCNT/POPCNT in mini JIT * Fix C++ build Commit migrated from https://github.com/mono/mono/commit/cf2c8575e15c53153add54453c97d1d37a6db719 --- diff --git a/src/mono/mono/arch/amd64/amd64-codegen.h b/src/mono/mono/arch/amd64/amd64-codegen.h index 8295a91..7ac86b7 100644 --- a/src/mono/mono/arch/amd64/amd64-codegen.h +++ b/src/mono/mono/arch/amd64/amd64-codegen.h @@ -1165,6 +1165,9 @@ typedef union { #define amd64_sse_prefetch_reg_membase(inst, arg, basereg, disp) emit_sse_reg_membase_op2((inst), (arg), (basereg), (disp), 0x0f, 0x18) +#define amd64_sse_lzcnt_reg_reg_size(inst, dreg, reg, size) emit_sse_reg_reg_size((inst), (dreg), (reg), 0xf3, 0x0f, 0xbd, (size)) +#define amd64_sse_popcnt_reg_reg_size(inst, dreg, reg, size) emit_sse_reg_reg_size((inst), (dreg), (reg), 0xf3, 0x0f, 0xb8, (size)) + /* Generated from x86-codegen.h */ #define amd64_breakpoint_size(inst,size) do { x86_breakpoint(inst); } while (0) diff --git a/src/mono/mono/mini/cpu-amd64.md b/src/mono/mono/mini/cpu-amd64.md index b58cfb2..ecf358c 100644 --- a/src/mono/mono/mini/cpu-amd64.md +++ b/src/mono/mono/mini/cpu-amd64.md @@ -827,3 +827,8 @@ generic_class_init: src1:A len:32 clob:c get_last_error: dest:i len:32 fill_prof_call_ctx: src1:i len:128 + +lzcnt32: dest:i src1:i len:16 +lzcnt64: dest:i src1:i len:16 +popcnt32: dest:i src1:i len:16 +popcnt64: dest:i src1:i len:16 diff --git a/src/mono/mono/mini/decompose.c b/src/mono/mono/mini/decompose.c index c55f52c..392f89c 100644 --- a/src/mono/mono/mini/decompose.c +++ b/src/mono/mono/mini/decompose.c @@ -1557,7 +1557,7 @@ mono_decompose_array_access_opts (MonoCompile *cfg) MONO_EMIT_NEW_UNALU (cfg, OP_SEXT_I4, index2_reg, ins->sreg2); MONO_EMIT_DEFAULT_BOUNDS_CHECK (cfg, ins->sreg1, ins->inst_imm, index2_reg, ins->flags & MONO_INST_FAULT, ins->inst_p0); } else { - MONO_ARCH_EMIT_BOUNDS_CHECK (cfg, ins->sreg1, ins->inst_imm, ins->sreg2); + MONO_ARCH_EMIT_BOUNDS_CHECK (cfg, ins->sreg1, ins->inst_imm, ins->sreg2, ins->inst_p0); } break; case OP_NEWARR: diff --git a/src/mono/mono/mini/ir-emit.h b/src/mono/mono/mini/ir-emit.h index 76443ec..0fd3acc 100644 --- a/src/mono/mono/mini/ir-emit.h +++ b/src/mono/mono/mini/ir-emit.h @@ -955,23 +955,24 @@ static int ccount = 0; else \ MONO_EMIT_NEW_LOAD_MEMBASE_OP_FLAGS (cfg, OP_LOADI4_MEMBASE, _length_reg, array_reg, offset, MONO_INST_INVARIANT_LOAD); \ MONO_EMIT_NEW_BIALU (cfg, OP_COMPARE, -1, _length_reg, index_reg); \ - MONO_EMIT_NEW_COND_EXC (cfg, LE_UN, (const char*)(ex_name ? 
ex_name : "IndexOutOfRangeException")); \ + MONO_EMIT_NEW_COND_EXC (cfg, LE_UN, ex_name); \ } while (0) #ifndef MONO_ARCH_EMIT_BOUNDS_CHECK -#define MONO_ARCH_EMIT_BOUNDS_CHECK(cfg, array_reg, offset, index_reg) MONO_EMIT_DEFAULT_BOUNDS_CHECK ((cfg), (array_reg), (offset), (index_reg), TRUE, NULL) +#define MONO_ARCH_EMIT_BOUNDS_CHECK(cfg, array_reg, offset, index_reg, ex_name) MONO_EMIT_DEFAULT_BOUNDS_CHECK ((cfg), (array_reg), (offset), (index_reg), TRUE, ex_name) #endif static inline void mini_emit_bounds_check_offset (MonoCompile *cfg, int array_reg, int array_length_offset, int index_reg, const char *ex_name) { if (!(cfg->opt & MONO_OPT_UNSAFE)) { + ex_name = ex_name ? ex_name : "IndexOutOfRangeException"; if (!(cfg->opt & MONO_OPT_ABCREM)) { MONO_EMIT_NULL_CHECK (cfg, array_reg, FALSE); if (COMPILE_LLVM (cfg)) MONO_EMIT_DEFAULT_BOUNDS_CHECK ((cfg), (array_reg), (array_length_offset), (index_reg), TRUE, ex_name); else - MONO_ARCH_EMIT_BOUNDS_CHECK ((cfg), (array_reg), (array_length_offset), (index_reg)); + MONO_ARCH_EMIT_BOUNDS_CHECK ((cfg), (array_reg), (array_length_offset), (index_reg), ex_name); } else { MonoInst *ins; MONO_INST_NEW ((cfg), ins, OP_BOUNDS_CHECK); diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index 0bead30..7f56fd9 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -1502,6 +1502,20 @@ mono_arch_cpu_enumerate_simd_versions (void) return sse_opts; } +MonoCPUFeatures +mono_arch_get_cpu_features (void) +{ + guint64 features = MONO_CPU_INITED; + + if (mono_hwcap_x86_has_popcnt) + features |= MONO_CPU_X86_POPCNT; + + if (mono_hwcap_x86_has_lzcnt) + features |= MONO_CPU_X86_LZCNT; + + return (MonoCPUFeatures)features; +} + #ifndef DISABLE_JIT GList * @@ -3375,10 +3389,257 @@ mono_arch_peephole_pass_2 (MonoCompile *cfg, MonoBasicBlock *bb) #define NEW_INS(cfg,ins,dest,op) do { \ MONO_INST_NEW ((cfg), (dest), (op)); \ - (dest)->cil_code = (ins)->cil_code; \ - mono_bblock_insert_before_ins (bb, ins, (dest)); \ + (dest)->cil_code = (ins)->cil_code; \ + mono_bblock_insert_before_ins (bb, ins, (dest)); \ } while (0) +#define NEW_SIMD_INS(cfg,ins,dest,op,d,s1,s2) do { \ + MONO_INST_NEW ((cfg), (dest), (op)); \ + (dest)->cil_code = (ins)->cil_code; \ + (dest)->dreg = d; \ + (dest)->sreg1 = s1; \ + (dest)->sreg2 = s2; \ + (dest)->type = STACK_VTYPE; \ + (dest)->klass = ins->klass; \ + mono_bblock_insert_before_ins (bb, ins, (dest)); \ + } while (0) + +static int +simd_type_to_comp_op (int t) +{ + switch (t) { + case MONO_TYPE_I1: + case MONO_TYPE_U1: + return OP_PCMPEQB; + case MONO_TYPE_I2: + case MONO_TYPE_U2: + return OP_PCMPEQW; + case MONO_TYPE_I4: + case MONO_TYPE_U4: + return OP_PCMPEQD; + case MONO_TYPE_I8: + case MONO_TYPE_U8: + return OP_PCMPEQQ; // SSE 4.1 + default: + g_assert_not_reached (); + return -1; + } +} + +static int +simd_type_to_sub_op (int t) +{ + switch (t) { + case MONO_TYPE_I1: + case MONO_TYPE_U1: + return OP_PSUBB; + case MONO_TYPE_I2: + case MONO_TYPE_U2: + return OP_PSUBW; + case MONO_TYPE_I4: + case MONO_TYPE_U4: + return OP_PSUBD; + case MONO_TYPE_I8: + case MONO_TYPE_U8: + return OP_PSUBQ; + default: + g_assert_not_reached (); + return -1; + } +} + +static int +simd_type_to_shl_op (int t) +{ + switch (t) { + case MONO_TYPE_I2: + case MONO_TYPE_U2: + return OP_PSHLW; + case MONO_TYPE_I4: + case MONO_TYPE_U4: + return OP_PSHLD; + case MONO_TYPE_I8: + case MONO_TYPE_U8: + return OP_PSHLQ; + default: + g_assert_not_reached (); + return -1; + } +} + +static int +simd_type_to_gt_op (int 
t) +{ + switch (t) { + case MONO_TYPE_I1: + case MONO_TYPE_U1: + return OP_PCMPGTB; + case MONO_TYPE_I2: + case MONO_TYPE_U2: + return OP_PCMPGTW; + case MONO_TYPE_I4: + case MONO_TYPE_U4: + return OP_PCMPGTD; + case MONO_TYPE_I8: + case MONO_TYPE_U8: + return OP_PCMPGTQ; // SSE 4.2 + default: + g_assert_not_reached (); + return -1; + } +} + +static int +simd_type_to_max_un_op (int t) +{ + switch (t) { + case MONO_TYPE_U1: + return OP_PMAXB_UN; + case MONO_TYPE_U2: + return OP_PMAXW_UN; // SSE 4.1 + case MONO_TYPE_U4: + return OP_PMAXD_UN; // SSE 4.1 + //case MONO_TYPE_U8: + // return OP_PMAXQ_UN; // AVX + default: + g_assert_not_reached (); + return -1; + } +} + +static int +simd_type_to_add_op (int t) +{ + switch (t) { + case MONO_TYPE_I1: + case MONO_TYPE_U1: + return OP_PADDB; + case MONO_TYPE_I2: + case MONO_TYPE_U2: + return OP_PADDW; + case MONO_TYPE_I4: + case MONO_TYPE_U4: + return OP_PADDD; + case MONO_TYPE_I8: + case MONO_TYPE_U8: + return OP_PADDQ; + default: + g_assert_not_reached (); + return -1; + } +} + +static void +emit_simd_comp_op (MonoCompile *cfg, MonoBasicBlock *bb, MonoInst *ins, int type, int dreg, int sreg1, int sreg2) +{ + MonoInst *temp; + + if (!mono_hwcap_x86_has_sse42 && (ins->inst_c1 == MONO_TYPE_I8 || ins->inst_c1 == MONO_TYPE_U8)) { + int temp_reg1 = mono_alloc_ireg (cfg); + int temp_reg2 = mono_alloc_ireg (cfg); + + NEW_SIMD_INS (cfg, ins, temp, OP_PCMPEQD, temp_reg1, sreg1, sreg2); + NEW_SIMD_INS (cfg, ins, temp, OP_PSHUFLED, temp_reg2, temp_reg1, -1); + temp->inst_c0 = 0xB1; + NEW_SIMD_INS (cfg, ins, temp, OP_ANDPD, dreg, temp_reg1, temp_reg2); + } else { + NEW_SIMD_INS (cfg, ins, temp, simd_type_to_comp_op (type), dreg, sreg1, sreg2); + } +} + +static void +emit_simd_gt_op (MonoCompile *cfg, MonoBasicBlock *bb, MonoInst *ins, int type, int dreg, int sreg1, int sreg2); + +static void +emit_simd_gt_un_op (MonoCompile *cfg, MonoBasicBlock *bb, MonoInst *ins, int type, int dreg, int sreg1, int sreg2) +{ + MonoInst *temp; + + switch (type) { + case MONO_TYPE_U2: + case MONO_TYPE_U4: + if (mono_hwcap_x86_has_sse41) + goto USE_MAX; + goto USE_SIGNED_GT; + + case MONO_TYPE_U1: + USE_MAX: { + // dreg = max(sreg1, sreg2) != sreg2 + + int temp_reg1 = mono_alloc_ireg (cfg); + int temp_reg2 = mono_alloc_ireg (cfg); + int temp_reg3 = mono_alloc_ireg (cfg); + + NEW_SIMD_INS (cfg, ins, temp, simd_type_to_max_un_op (type), temp_reg1, sreg1, sreg2); + emit_simd_comp_op (cfg, bb, ins, ins->inst_c1, temp_reg2, temp_reg1, ins->sreg2); + NEW_SIMD_INS (cfg, ins, temp, OP_XONES, temp_reg3, -1, -1); + NEW_SIMD_INS (cfg, ins, temp, OP_XORPD, dreg, temp_reg2, temp_reg3); + break; + } + + case MONO_TYPE_U8: + USE_SIGNED_GT: { + // convert to signed integer by subtracting (1 << (size - 1)) from each operand + // and then use signed comparison + + int temp_c0 = mono_alloc_ireg (cfg); + int temp_c80 = mono_alloc_ireg (cfg); + int temp_s1 = mono_alloc_ireg (cfg); + int temp_s2 = mono_alloc_ireg (cfg); + + NEW_SIMD_INS (cfg, ins, temp, OP_XONES, temp_c0, -1, -1); + NEW_SIMD_INS (cfg, ins, temp, simd_type_to_shl_op (type), temp_c80, temp_c0, -1); + temp->inst_imm = type == MONO_TYPE_U2 ? 15 : (type == MONO_TYPE_U4 ? 
31 : 63); + NEW_SIMD_INS (cfg, ins, temp, simd_type_to_sub_op (type), temp_s1, sreg1, temp_c80); + NEW_SIMD_INS (cfg, ins, temp, simd_type_to_sub_op (type), temp_s2, sreg2, temp_c80); + emit_simd_gt_op (cfg, bb, ins, type, dreg, temp_s1, temp_s2); + break; + } + } +} + +static void +emit_simd_gt_op (MonoCompile *cfg, MonoBasicBlock *bb, MonoInst *ins, int type, int dreg, int sreg1, int sreg2) +{ + MonoInst *temp; + + if (!mono_hwcap_x86_has_sse42 && (type == MONO_TYPE_I8 || type == MONO_TYPE_U8)) { + // Decompose 64-bit greater than to 32-bit + // + // t = (v1 > v2) + // u = (v1 == v2) + // v = (v1 > v2) unsigned + // + // z = shuffle(t, (3, 3, 1, 1)) + // t1 = shuffle(v, (2, 2, 0, 0)) + // u1 = shuffle(u, (3, 3, 1, 1)) + // w = and(t1, u1) + // result = bitwise_or(z, w) + + int temp_t = mono_alloc_ireg (cfg); + int temp_u = mono_alloc_ireg (cfg); + int temp_v = mono_alloc_ireg (cfg); + int temp_z = temp_t; + int temp_t1 = temp_v; + int temp_u1 = temp_u; + int temp_w = temp_t1; + + NEW_SIMD_INS (cfg, ins, temp, OP_PCMPGTD, temp_t, sreg1, sreg2); + NEW_SIMD_INS (cfg, ins, temp, OP_PCMPEQD, temp_u, sreg1, sreg2); + emit_simd_gt_un_op (cfg, bb, ins, MONO_TYPE_U4, temp_v, sreg1, sreg2); + NEW_SIMD_INS (cfg, ins, temp, OP_PSHUFLED, temp_z, temp_t, -1); + temp->inst_c0 = 0xF5; + NEW_SIMD_INS (cfg, ins, temp, OP_PSHUFLED, temp_t1, temp_v, -1); + temp->inst_c0 = 0xA0; + NEW_SIMD_INS (cfg, ins, temp, OP_PSHUFLED, temp_u1, temp_u, -1); + temp->inst_c0 = 0xF5; + NEW_SIMD_INS (cfg, ins, temp, OP_ANDPD, temp_w, temp_t1, temp_u1); + NEW_SIMD_INS (cfg, ins, temp, OP_ORPD, dreg, temp_z, temp_w); + } else { + NEW_SIMD_INS (cfg, ins, temp, simd_type_to_gt_op (type), dreg, sreg1, sreg2); + } +} + /* * mono_arch_lowering_pass: * @@ -3444,27 +3705,204 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb) break; #ifdef MONO_ARCH_SIMD_INTRINSICS case OP_EXPAND_I1: { + int temp_reg1 = mono_alloc_ireg (cfg); + int temp_reg2 = mono_alloc_ireg (cfg); + int original_reg = ins->sreg1; + + NEW_INS (cfg, ins, temp, OP_ICONV_TO_U1); + temp->sreg1 = original_reg; + temp->dreg = temp_reg1; + + NEW_INS (cfg, ins, temp, OP_SHL_IMM); + temp->sreg1 = temp_reg1; + temp->dreg = temp_reg2; + temp->inst_imm = 8; + + NEW_INS (cfg, ins, temp, OP_LOR); + temp->sreg1 = temp->dreg = temp_reg2; + temp->sreg2 = temp_reg1; + + ins->opcode = OP_EXPAND_I2; + ins->sreg1 = temp_reg2; + break; + } + + case OP_XEQUAL: { + int temp_reg1 = mono_alloc_ireg (cfg); + int temp_reg2 = mono_alloc_ireg (cfg); + + NEW_SIMD_INS (cfg, ins, temp, OP_PCMPEQD, temp_reg1, ins->sreg1, ins->sreg2); + NEW_SIMD_INS (cfg, ins, temp, OP_EXTRACT_MASK, temp_reg2, temp_reg1, -1); + temp->type = STACK_I4; + NEW_INS (cfg, ins, temp, OP_COMPARE_IMM); + temp->sreg1 = temp_reg2; + temp->inst_imm = 0xFFFF; + temp->klass = ins->klass; + ins->opcode = OP_CEQ; + ins->sreg1 = -1; + ins->sreg2 = -1; + break; + } + + case OP_XCOMPARE: { + int temp_reg; + + switch (ins->inst_c0) + { + case CMP_EQ: + emit_simd_comp_op (cfg, bb, ins, ins->inst_c1, ins->dreg, ins->sreg1, ins->sreg2); + NULLIFY_INS (ins); + break; + + case CMP_NE: { int temp_reg1 = mono_alloc_ireg (cfg); int temp_reg2 = mono_alloc_ireg (cfg); - int original_reg = ins->sreg1; - NEW_INS (cfg, ins, temp, OP_ICONV_TO_U1); - temp->sreg1 = original_reg; - temp->dreg = temp_reg1; + emit_simd_comp_op (cfg, bb, ins, ins->inst_c1, temp_reg1, ins->sreg1, ins->sreg2); + NEW_SIMD_INS (cfg, ins, temp, OP_XONES, temp_reg2, -1, -1); + ins->opcode = OP_XORPD; + ins->sreg1 = temp_reg1; + ins->sreg2 = temp_reg2; + break; + } 
- NEW_INS (cfg, ins, temp, OP_SHL_IMM); - temp->sreg1 = temp_reg1; - temp->dreg = temp_reg2; - temp->inst_imm = 8; + case CMP_LT: + temp_reg = ins->sreg1; + ins->sreg1 = ins->sreg2; + ins->sreg2 = temp_reg; + case CMP_GT: + emit_simd_gt_op (cfg, bb, ins, ins->inst_c1, ins->dreg, ins->sreg1, ins->sreg2); + NULLIFY_INS (ins); + break; - NEW_INS (cfg, ins, temp, OP_LOR); - temp->sreg1 = temp->dreg = temp_reg2; - temp->sreg2 = temp_reg1; + case CMP_LE: + temp_reg = ins->sreg1; + ins->sreg1 = ins->sreg2; + ins->sreg2 = temp_reg; + case CMP_GE: { + int temp_reg1 = mono_alloc_ireg (cfg); + int temp_reg2 = mono_alloc_ireg (cfg); - ins->opcode = OP_EXPAND_I2; - ins->sreg1 = temp_reg2; + emit_simd_gt_op (cfg, bb, ins, ins->inst_c1, temp_reg1, ins->sreg1, ins->sreg2); + emit_simd_comp_op (cfg, bb, ins, ins->inst_c1, temp_reg2, ins->sreg1, ins->sreg2); + ins->opcode = OP_POR; + ins->sreg1 = temp_reg1; + ins->sreg2 = temp_reg2; + break; } + + case CMP_LE_UN: + temp_reg = ins->sreg1; + ins->sreg1 = ins->sreg2; + ins->sreg2 = temp_reg; + case CMP_GE_UN: + if (mono_hwcap_x86_has_sse41 && ins->inst_c1 != MONO_TYPE_U8) { + int temp_reg1 = mono_alloc_ireg (cfg); + + NEW_SIMD_INS (cfg, ins, temp, simd_type_to_max_un_op (ins->inst_c1), temp_reg1, ins->sreg1, ins->sreg2); + emit_simd_comp_op (cfg, bb, ins, ins->inst_c1, ins->dreg, temp_reg1, ins->sreg1); + NULLIFY_INS (ins); + } else { + int temp_reg1 = mono_alloc_ireg (cfg); + int temp_reg2 = mono_alloc_ireg (cfg); + + emit_simd_gt_un_op (cfg, bb, ins, ins->inst_c1, temp_reg1, ins->sreg1, ins->sreg2); + emit_simd_comp_op (cfg, bb, ins, ins->inst_c1, temp_reg2, ins->sreg1, ins->sreg2); + ins->opcode = OP_POR; + ins->sreg1 = temp_reg1; + ins->sreg2 = temp_reg2; + } + break; + + case CMP_LT_UN: + temp_reg = ins->sreg1; + ins->sreg1 = ins->sreg2; + ins->sreg2 = temp_reg; + case CMP_GT_UN: { + emit_simd_gt_un_op (cfg, bb, ins, ins->inst_c1, ins->dreg, ins->sreg1, ins->sreg2); + NULLIFY_INS (ins); + break; + } + + default: + g_assert_not_reached(); + break; + } + + ins->type = STACK_VTYPE; + ins->inst_c0 = 0; break; + } + + case OP_XCOMPARE_FP: { + ins->opcode = ins->inst_c1 == MONO_TYPE_R4 ? OP_COMPPS : OP_COMPPD; + + switch (ins->inst_c0) + { + case CMP_EQ: ins->inst_c0 = 0; break; + case CMP_NE: ins->inst_c0 = 4; break; + case CMP_LT: ins->inst_c0 = 1; break; + case CMP_LE: ins->inst_c0 = 2; break; + case CMP_GT: ins->inst_c0 = 6; break; + case CMP_GE: ins->inst_c0 = 5; break; + default: + g_assert_not_reached(); + break; + } + + break; + } + + case OP_XCAST: { + ins->opcode = OP_XMOVE; + break; + } + + case OP_XBINOP: { + switch (ins->inst_c0) + { + case OP_ISUB: + ins->opcode = simd_type_to_sub_op (ins->inst_c1); + break; + case OP_IADD: + ins->opcode = simd_type_to_add_op (ins->inst_c1); + break; + case OP_IAND: + ins->opcode = OP_ANDPD; + break; + case OP_IXOR: + ins->opcode = OP_XORPD; + break; + case OP_IOR: + ins->opcode = OP_ORPD; + break; + case OP_FSUB: + ins->opcode = ins->inst_c1 == MONO_TYPE_R8 ? OP_SUBPD : OP_SUBPS; + break; + case OP_FADD: + ins->opcode = ins->inst_c1 == MONO_TYPE_R8 ? OP_ADDPD : OP_ADDPS; + break; + case OP_FDIV: + ins->opcode = ins->inst_c1 == MONO_TYPE_R8 ? OP_DIVPD : OP_DIVPS; + break; + case OP_FMUL: + ins->opcode = ins->inst_c1 == MONO_TYPE_R8 ? 
OP_MULPD : OP_MULPS; + break; + default: + g_assert_not_reached(); + break; + } + break; + } + + case OP_XEXTRACT_R4: + case OP_XEXTRACT_R8: + case OP_XEXTRACT_I32: + case OP_XEXTRACT_I64: { + // TODO + g_assert_not_reached(); + break; + } #endif default: break; @@ -4137,7 +4575,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) break; case OP_COMPARE_IMM: #if defined(MONO_ARCH_ILP32) - /* Comparison of pointer immediates should be 4 bytes to avoid sign-extend problems */ + /* Comparison of pointer immediates should be 4 bytes to avoid sign-extend problems */ g_assert (amd64_is_imm32 (ins->inst_imm)); amd64_alu_reg_imm_size (code, X86_CMP, ins->sreg1, ins->inst_imm, 4); break; @@ -6774,6 +7212,20 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) amd64_sse_roundpd_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0); break; #endif + + case OP_LZCNT32: + amd64_sse_lzcnt_reg_reg_size (code, ins->dreg, ins->sreg1, 4); + break; + case OP_LZCNT64: + amd64_sse_lzcnt_reg_reg_size (code, ins->dreg, ins->sreg1, 8); + break; + case OP_POPCNT32: + amd64_sse_popcnt_reg_reg_size (code, ins->dreg, ins->sreg1, 4); + break; + case OP_POPCNT64: + amd64_sse_popcnt_reg_reg_size (code, ins->dreg, ins->sreg1, 8); + break; + case OP_LIVERANGE_START: { if (cfg->verbose_level > 1) printf ("R%d START=0x%x\n", MONO_VARINFO (cfg, ins->inst_c0)->vreg, (int)(code - cfg->native_code)); diff --git a/src/mono/mono/mini/mini-amd64.h b/src/mono/mono/mini/mini-amd64.h index d000c15..7caef06 100644 --- a/src/mono/mono/mini/mini-amd64.h +++ b/src/mono/mono/mini/mini-amd64.h @@ -485,14 +485,14 @@ typedef struct { /* Used for optimization, not complete */ #define MONO_ARCH_IS_OP_MEMBASE(opcode) ((opcode) == OP_X86_PUSH_MEMBASE) -#define MONO_ARCH_EMIT_BOUNDS_CHECK(cfg, array_reg, offset, index_reg) do { \ +#define MONO_ARCH_EMIT_BOUNDS_CHECK(cfg, array_reg, offset, index_reg, ex_name) do { \ MonoInst *inst; \ MONO_INST_NEW ((cfg), inst, OP_AMD64_ICOMPARE_MEMBASE_REG); \ inst->inst_basereg = array_reg; \ inst->inst_offset = offset; \ inst->sreg2 = index_reg; \ MONO_ADD_INS ((cfg)->cbb, inst); \ - MONO_EMIT_NEW_COND_EXC (cfg, LE_UN, "IndexOutOfRangeException"); \ + MONO_EMIT_NEW_COND_EXC (cfg, LE_UN, ex_name); \ } while (0) // Does the ABI have a volatile non-parameter register, so tailcall diff --git a/src/mono/mono/mini/mini-x86.h b/src/mono/mono/mini/mini-x86.h index 55f6850..d64c37f 100644 --- a/src/mono/mono/mini/mini-x86.h +++ b/src/mono/mono/mini/mini-x86.h @@ -68,9 +68,11 @@ LONG CALLBACK seh_handler(EXCEPTION_POINTERS* ep); #define MONO_ARCH_SUPPORT_TASKLETS 1 #ifndef DISABLE_SIMD +#ifndef ENABLE_NETCORE #define MONO_ARCH_SIMD_INTRINSICS 1 #define MONO_ARCH_NEED_SIMD_BANK 1 #endif +#endif /* we should lower this size and make sure we don't call heavy stack users in the segv handler */ #if defined(__APPLE__) @@ -242,14 +244,14 @@ typedef struct { /* Used for optimization, not complete */ #define MONO_ARCH_IS_OP_MEMBASE(opcode) ((opcode) == OP_X86_PUSH_MEMBASE) -#define MONO_ARCH_EMIT_BOUNDS_CHECK(cfg, array_reg, offset, index_reg) do { \ +#define MONO_ARCH_EMIT_BOUNDS_CHECK(cfg, array_reg, offset, index_reg, ex_name) do { \ MonoInst *inst; \ MONO_INST_NEW ((cfg), inst, OP_X86_COMPARE_MEMBASE_REG); \ inst->inst_basereg = array_reg; \ inst->inst_offset = offset; \ inst->sreg2 = index_reg; \ MONO_ADD_INS ((cfg)->cbb, inst); \ - MONO_EMIT_NEW_COND_EXC (cfg, LE_UN, "IndexOutOfRangeException"); \ + MONO_EMIT_NEW_COND_EXC (cfg, LE_UN, ex_name); \ } while (0) // Does the ABI have a 
volatile non-parameter register, so tailcall diff --git a/src/mono/mono/mini/mini.h b/src/mono/mono/mini/mini.h index cd442e0..1b8a826 100644 --- a/src/mono/mono/mini/mini.h +++ b/src/mono/mono/mini/mini.h @@ -2823,6 +2823,7 @@ enum { const char *mono_arch_xregname (int reg); guint32 mono_arch_cpu_enumerate_simd_versions (void); +MonoCPUFeatures mono_arch_get_cpu_features (void); #ifdef MONO_ARCH_SIMD_INTRINSICS void mono_simd_simplify_indirection (MonoCompile *cfg); diff --git a/src/mono/mono/mini/simd-intrinsics-netcore.c b/src/mono/mono/mini/simd-intrinsics-netcore.c index 20146a3..9bd6299 100644 --- a/src/mono/mono/mini/simd-intrinsics-netcore.c +++ b/src/mono/mono/mini/simd-intrinsics-netcore.c @@ -59,6 +59,8 @@ get_cpu_features (void) { #ifdef ENABLE_LLVM return mono_llvm_get_cpu_features (); +#elif defined(TARGET_AMD64) + return mono_arch_get_cpu_features (); #else return (MonoCPUFeatures)0; #endif @@ -311,6 +313,8 @@ emit_sys_numerics_vector_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSig return emit_xcompare (cfg, klass, etype, ins, ins); } case SN_get_Item: + if (!COMPILE_LLVM (cfg)) + return NULL; MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, args [1]->dreg, len); MONO_EMIT_NEW_COND_EXC (cfg, GE_UN, "IndexOutOfRangeException"); int opcode = -1; @@ -438,7 +442,8 @@ emit_sys_numerics_vector_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSig if (id == SN_op_Inequality) { int sreg = ins->dreg; int dreg = alloc_ireg (cfg); - EMIT_NEW_UNALU (cfg, ins, OP_INOT, dreg, sreg); + MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, sreg, 0); + EMIT_NEW_UNALU (cfg, ins, OP_CEQ, dreg, -1); } return ins; case SN_GreaterThan: @@ -477,6 +482,7 @@ emit_sys_numerics_vector_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSig if (!(fsig->param_count == 2 && mono_metadata_type_equal (fsig->ret, type) && mono_metadata_type_equal (fsig->params [0], type) && mono_metadata_type_equal (fsig->params [1], type))) return NULL; ins = emit_simd_ins (cfg, klass, OP_XBINOP, args [0]->dreg, args [1]->dreg); + ins->inst_c1 = etype->type; if (etype->type == MONO_TYPE_R4 || etype->type == MONO_TYPE_R8) { switch (id) { case SN_op_Addition: @@ -622,6 +628,8 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature // We only support the subset used by corelib if (m_class_get_image (cfg->method->klass) != mono_get_corlib ()) return NULL; + if (!COMPILE_LLVM (cfg)) + return NULL; id = lookup_intrins (bmi1_methods, sizeof (bmi1_methods), cmethod); g_assert (id != -1); supported = (get_cpu_features () & MONO_CPU_X86_BMI1) != 0; @@ -647,6 +655,8 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature // We only support the subset used by corelib if (m_class_get_image (cfg->method->klass) != mono_get_corlib ()) return NULL; + if (!COMPILE_LLVM (cfg)) + return NULL; id = lookup_intrins (bmi2_methods, sizeof (bmi2_methods), cmethod); g_assert (id != -1); supported = (get_cpu_features () & MONO_CPU_X86_BMI2) != 0; @@ -782,8 +792,6 @@ mono_emit_simd_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign if (image != mono_get_corlib ()) return NULL; - if (!COMPILE_LLVM (cfg)) - return NULL; // FIXME: if (cfg->compile_aot) return NULL; diff --git a/src/mono/mono/utils/mono-hwcap-vars.h b/src/mono/mono/utils/mono-hwcap-vars.h index 1596ca4..a7846db 100644 --- a/src/mono/mono/utils/mono-hwcap-vars.h +++ b/src/mono/mono/utils/mono-hwcap-vars.h @@ -74,5 +74,8 @@ MONO_HWCAP_VAR(x86_has_ssse3) MONO_HWCAP_VAR(x86_has_sse41) MONO_HWCAP_VAR(x86_has_sse42) 
MONO_HWCAP_VAR(x86_has_sse4a) +MONO_HWCAP_VAR(x86_has_lzcnt) +MONO_HWCAP_VAR(x86_has_popcnt) +MONO_HWCAP_VAR(x86_has_avx) #endif diff --git a/src/mono/mono/utils/mono-hwcap-x86.c b/src/mono/mono/utils/mono-hwcap-x86.c index 4e103e0..7bbaa2a 100644 --- a/src/mono/mono/utils/mono-hwcap-x86.c +++ b/src/mono/mono/utils/mono-hwcap-x86.c @@ -136,6 +136,12 @@ mono_hwcap_arch_init (void) if (ecx & (1 << 20)) mono_hwcap_x86_has_sse42 = TRUE; + + if (ecx & (1 << 23)) + mono_hwcap_x86_has_popcnt = TRUE; + + if (ecx & (1 << 28)) + mono_hwcap_x86_has_avx = TRUE; } if (cpuid (0x80000000, &eax, &ebx, &ecx, &edx)) { @@ -147,6 +153,11 @@ mono_hwcap_arch_init (void) } } + if (cpuid (0x80000001, &eax, &ebx, &ecx, &edx)) { + if (ecx & (1 << 5)) + mono_hwcap_x86_has_lzcnt = TRUE; + } + #if defined(HAVE_UNISTD_H) && defined(HAVE_ACCESS) mono_hwcap_x86_is_xen = !access ("/proc/xen", F_OK); #endif
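
Note on the unsigned SIMD comparisons above: SSE2 only provides signed packed compares (PCMPGTB/W/D), so emit_simd_gt_un_op biases both operands by 1 << (bits - 1), i.e. flips their sign bits, and then issues a signed greater-than. Below is a minimal scalar C sketch of that equivalence, for illustration only; the function name and test values are made up here and are not part of the patch.

/* Illustration only: the sign-bias trick used by emit_simd_gt_un_op,
 * shown on scalar 32-bit values.  The unsigned order of a and b equals
 * the signed order of (a - 0x80000000) and (b - 0x80000000), assuming
 * the usual two's-complement wraparound on conversion to int32_t. */
#include <assert.h>
#include <stdint.h>

static int
scalar_gt_un_via_signed (uint32_t a, uint32_t b)
{
	int32_t sa = (int32_t)(a - 0x80000000u); /* flip the sign bit of each operand */
	int32_t sb = (int32_t)(b - 0x80000000u);
	return sa > sb;                          /* signed compare now yields the unsigned result */
}

int
main (void)
{
	assert (scalar_gt_un_via_signed (0xFFFFFFFFu, 1u) == 1);  /* unsigned: UINT32_MAX > 1 */
	assert (scalar_gt_un_via_signed (0u, 0x80000000u) == 0);  /* unsigned: 0 < 2^31 */
	assert (scalar_gt_un_via_signed (7u, 7u) == 0);
	return 0;
}

The same identity is what lets the CMP_GT_UN/CMP_LT_UN cases in mono_arch_lowering_pass fall back to the signed PCMPGT* path when the PMAX*_UN (SSE4.1) route is not available.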