[mono] Implement Sse41, Pclmulqdq, Aes and enable Sse41 intrinsics (#34866)
author Egor Bogatov <egorbo@gmail.com>
Mon, 13 Apr 2020 21:37:05 +0000 (00:37 +0300)
committer GitHub <noreply@github.com>
Mon, 13 Apr 2020 21:37:05 +0000 (00:37 +0300)
* Implement Sse41, Pclmulqdq and Aes, enable Sse42.

src/mono/mono/mini/cpu-amd64.md
src/mono/mono/mini/intrinsics.c
src/mono/mono/mini/llvm-intrinsics.h
src/mono/mono/mini/mini-amd64.c
src/mono/mono/mini/mini-llvm.c
src/mono/mono/mini/mini-ops.h
src/mono/mono/mini/mini.h
src/mono/mono/mini/simd-intrinsics-netcore.c
src/mono/mono/mini/simd-methods-netcore.h

index 85252a9..06abb8e 100644 (file)
@@ -815,7 +815,7 @@ expand_i8: dest:x src1:i len:11
 expand_r4: dest:x src1:f len:16
 expand_r8: dest:x src1:f len:13
 
-roundpd: dest:x src1:x len:10
+roundp: dest:x src1:x len:10
 
 liverange_start: len:0
 liverange_end: len:0
index 38b7150..9f41b3a 100644 (file)
@@ -141,8 +141,9 @@ llvm_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign
                                // to align with CoreCLR behavior
                                int xreg = alloc_xreg (cfg);
                                EMIT_NEW_UNALU (cfg, ins, OP_FCONV_TO_R4_X, xreg, args [0]->dreg);
-                               EMIT_NEW_UNALU (cfg, ins, OP_SSE41_ROUNDSS, xreg, xreg);
+                               EMIT_NEW_UNALU (cfg, ins, OP_SSE41_ROUNDS, xreg, xreg);
                                ins->inst_c0 = 0x4; // vroundss xmm0, xmm0, xmm0, 0x4 (mode for rounding)
+                               ins->inst_c1 = MONO_TYPE_R4;
                                int dreg = alloc_freg (cfg);
                                EMIT_NEW_UNALU (cfg, ins, OP_EXTRACT_R4, dreg, xreg);
                                return ins;
index ea79ba2..74b2ed3 100644 (file)
@@ -174,7 +174,10 @@ INTRINS(SSE_PAUSE, x86_sse2_pause)
 INTRINS(SSE_MASKMOVDQU, x86_sse2_maskmov_dqu)
 INTRINS(SSE_PSHUFB, x86_ssse3_pshuf_b_128)
 INTRINS(SSE_DPPS, x86_sse41_dpps)
+INTRINS(SSE_DPPD, x86_sse41_dppd)
 INTRINS(SSE_ROUNDSS, x86_sse41_round_ss)
+INTRINS(SSE_ROUNDSD, x86_sse41_round_sd)
+INTRINS(SSE_ROUNDPS, x86_sse41_round_ps)
 INTRINS(SSE_ROUNDPD, x86_sse41_round_pd)
 INTRINS(SSE_PTESTZ, x86_sse41_ptestz)
 INTRINS(SSE_INSERTPS, x86_sse41_insertps)
@@ -197,6 +200,22 @@ INTRINS(SSE_CRC32_32_8, x86_sse42_crc32_32_8)
 INTRINS(SSE_CRC32_32_16, x86_sse42_crc32_32_16)
 INTRINS(SSE_CRC32_32_32, x86_sse42_crc32_32_32)
 INTRINS(SSE_CRC32_64_64, x86_sse42_crc32_64_64)
+INTRINS(SSE_TESTC, x86_sse41_ptestc)
+INTRINS(SSE_TESTNZ, x86_sse41_ptestnzc)
+INTRINS(SSE_TESTZ, x86_sse41_ptestz)
+INTRINS(SSE_PBLENDVB, x86_sse41_pblendvb)
+INTRINS(SSE_BLENDVPS, x86_sse41_blendvps)
+INTRINS(SSE_BLENDVPD, x86_sse41_blendvpd)
+INTRINS(SSE_PMULDQ, x86_sse41_pmuldq)
+INTRINS(SSE_PHMINPOSUW, x86_sse41_phminposuw)
+INTRINS(SSE_MPSADBW, x86_sse41_mpsadbw)
+INTRINS(PCLMULQDQ, x86_pclmulqdq)
+INTRINS(AESNI_AESKEYGENASSIST, x86_aesni_aeskeygenassist)
+INTRINS(AESNI_AESDEC, x86_aesni_aesdec)
+INTRINS(AESNI_AESDECLAST, x86_aesni_aesdeclast)
+INTRINS(AESNI_AESENC, x86_aesni_aesenc)
+INTRINS(AESNI_AESENCLAST, x86_aesni_aesenclast)
+INTRINS(AESNI_AESIMC, x86_aesni_aesimc)
 #if LLVM_API_VERSION >= 800
        // these intrinsics were renamed in LLVM 8
 INTRINS_OVR(SSE_SADD_SATI8, sadd_sat)
index 4ea484f..f997469 100644 (file)
@@ -7326,9 +7326,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
                        amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0x44);
                        break;
-               case OP_SSE41_ROUNDPD:
-                       amd64_sse_roundpd_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0);
+               case OP_SSE41_ROUNDP: {
+                       if (ins->inst_c1 == MONO_TYPE_R8)
+                               amd64_sse_roundpd_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0);
+                       else
+                               g_assert_not_reached (); // roundps, but it's not used anywhere for non-llvm back-end yet.
                        break;
+               }
 #endif
 
                case OP_LZCNT32:
@@ -8867,8 +8871,9 @@ mono_arch_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMetho
                        if (mode != -1) {
                                int xreg = alloc_xreg (cfg);
                                EMIT_NEW_UNALU (cfg, ins, OP_FCONV_TO_R8_X, xreg, args [0]->dreg);
-                               EMIT_NEW_UNALU (cfg, ins, OP_SSE41_ROUNDPD, xreg, xreg);
+                               EMIT_NEW_UNALU (cfg, ins, OP_SSE41_ROUNDP, xreg, xreg);
                                ins->inst_c0 = mode;
+                               ins->inst_c1 = MONO_TYPE_R8;
                                int dreg = alloc_freg (cfg);
                                EMIT_NEW_UNALU (cfg, ins, OP_EXTRACT_R8, dreg, xreg);
                                return ins;
index 664582a..f5aef1f 100644 (file)
@@ -7993,6 +7993,7 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
                }
                case OP_XOP_X_I:
                case OP_XOP_X_X: {
+                       gboolean to_i1_t = FALSE;
                        IntrinsicId id = (IntrinsicId)0;
                        switch (ins->inst_c0) {
                        case SIMD_OP_SSE_SQRTPS: id = INTRINS_SSE_SQRT_PS; break;
@@ -8000,9 +8001,14 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
                        case SIMD_OP_SSE_RSQRTPS: id = INTRINS_SSE_RSQRT_PS; break;
                        case SIMD_OP_SSE_SQRTPD: id = INTRINS_SSE_SQRT_PD; break;
                        case SIMD_OP_SSE_LDDQU: id = INTRINS_SSE_LDU_DQ; break;
+                       case SIMD_OP_SSE_PHMINPOSUW: id = INTRINS_SSE_PHMINPOSUW; to_i1_t = TRUE; break;
+                       case SIMD_OP_AES_IMC: id = INTRINS_AESNI_AESIMC; break;
                        default: g_assert_not_reached (); break;
                        }
-                       values [ins->dreg] = call_intrins (ctx, id, &lhs, "");
+                       LLVMValueRef arg0 = lhs;
+                       if (to_i1_t)
+                               arg0 = convert (ctx, arg0, sse_i1_t);
+                       values [ins->dreg] = call_intrins (ctx, id, &arg0, "");
                        break;
                }
                case OP_XOP_I4_X:
@@ -8022,6 +8028,32 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
                        values [ins->dreg] = call_intrins (ctx, id, &lhs, "");
                        break;
                }
+               case OP_XOP_I4_X_X: {
+                       gboolean to_i8_t = FALSE;
+                       gboolean ret_bool = FALSE;
+                       IntrinsicId id = (IntrinsicId)0;
+                       switch (ins->inst_c0) {
+                       case SIMD_OP_SSE_TESTC:  id = INTRINS_SSE_TESTC;  to_i8_t = TRUE; ret_bool = TRUE; break;
+                       case SIMD_OP_SSE_TESTZ:  id = INTRINS_SSE_TESTZ;  to_i8_t = TRUE; ret_bool = TRUE; break;
+                       case SIMD_OP_SSE_TESTNZ: id = INTRINS_SSE_TESTNZ; to_i8_t = TRUE; ret_bool = TRUE; break;
+                       default: g_assert_not_reached (); break;
+                       }
+                       LLVMValueRef args [] = { lhs, rhs };
+                       if (to_i8_t) {
+                               args [0] = convert (ctx, args [0], sse_i8_t);
+                               args [1] = convert (ctx, args [1], sse_i8_t);
+                       }
+                       
+                       LLVMValueRef call = call_intrins (ctx, id, args, "");
+                       if (ret_bool) {
+                               // if return type is bool (it's still i32) we need to normalize it to 1/0
+                               LLVMValueRef cmp_zero = LLVMBuildICmp (builder, LLVMIntNE, call, LLVMConstInt (LLVMInt32Type (), 0, FALSE), "");
+                               values [ins->dreg] = LLVMBuildZExt (builder, cmp_zero, LLVMInt8Type (), "");
+                       } else {
+                               values [ins->dreg] = call;
+                       }
+                       break;
+               }
                case OP_XOP_X_X_X:
                case OP_XOP_X_X_I4:
                case OP_XOP_X_X_I8: {
@@ -8076,6 +8108,11 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
                        case SIMD_OP_SSE_PSIGND: id = INTRINS_SSE_PSIGND; break;
                        case SIMD_OP_SSE_PMADDUBSW: id = INTRINS_SSE_PMADDUBSW; break;
                        case SIMD_OP_SSE_PMULHRSW: id = INTRINS_SSE_PMULHRSW; break;
+                       case SIMD_OP_SSE_PACKUSDW: id = INTRINS_SSE_PACKUSDW; break;
+                       case SIMD_OP_AES_DEC: id = INTRINS_AESNI_AESDEC; break;
+                       case SIMD_OP_AES_DECLAST: id = INTRINS_AESNI_AESDECLAST; break;
+                       case SIMD_OP_AES_ENC: id = INTRINS_AESNI_AESENC; break;
+                       case SIMD_OP_AES_ENCLAST: id = INTRINS_AESNI_AESENCLAST; break;
                        default: g_assert_not_reached (); break;
                        }
                        values [ins->dreg] = call_intrins (ctx, id, args, "");
@@ -8405,25 +8442,38 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
                        values [ins->dreg] = LLVMBuildInsertElement (builder, vector, val, insert_pos, "");
                        break;
                }
-
-               case OP_SSE41_ROUNDSS: {
+               case OP_SSE41_ROUNDP: {
+                       LLVMValueRef args [] = { lhs, LLVMConstInt (LLVMInt32Type (), ins->inst_c0, FALSE) };
+                       values [ins->dreg] = call_intrins (ctx, ins->inst_c1 == MONO_TYPE_R4 ? INTRINS_SSE_ROUNDPS : INTRINS_SSE_ROUNDPD, args, dname);
+                       break;
+               }
+               case OP_SSE41_ROUNDS: {
                        LLVMValueRef args [3];
-
                        args [0] = lhs;
-                       args [1] = lhs;
+                       args [1] = ins->sreg2 != -1 ? values [ins->sreg2] : lhs;
                        args [2] = LLVMConstInt (LLVMInt32Type (), ins->inst_c0, FALSE);
-
-                       values [ins->dreg] = call_intrins (ctx, INTRINS_SSE_ROUNDSS, args, dname);
+                       values [ins->dreg] = call_intrins (ctx, ins->inst_c1 == MONO_TYPE_R4 ? INTRINS_SSE_ROUNDSS : INTRINS_SSE_ROUNDSD, args, dname);
                        break;
                }
 
-               case OP_SSE41_ROUNDPD: {
-                       LLVMValueRef args [3];
+               case OP_SSE41_DPPS_IMM: {
+                       LLVMValueRef args [] = { lhs, rhs, LLVMConstInt (LLVMInt8Type (), ins->inst_c0, FALSE) };
+                       values [ins->dreg] = call_intrins (ctx, INTRINS_SSE_DPPS, args, dname);
+                       break;
+               }
 
-                       args [0] = lhs;
-                       args [1] = LLVMConstInt (LLVMInt32Type (), ins->inst_c0, FALSE);
+               case OP_SSE41_DPPD_IMM: {
+                       LLVMValueRef args [] = { lhs, rhs, LLVMConstInt (LLVMInt8Type (), ins->inst_c0, FALSE) };
+                       values [ins->dreg] = call_intrins (ctx, INTRINS_SSE_DPPD, args, dname);
+                       break;
+               }
 
-                       values [ins->dreg] = call_intrins (ctx, INTRINS_SSE_ROUNDPD, args, dname);
+               case OP_SSE41_MPSADBW_IMM: {
+                       LLVMValueRef args [3];
+                       args [0] = LLVMBuildBitCast (ctx->builder, lhs, sse_i1_t, "");
+                       args [1] = LLVMBuildBitCast (ctx->builder, rhs, sse_i1_t, "");
+                       args [2] = LLVMConstInt (LLVMInt8Type (), ins->inst_c0, FALSE);
+                       values [ins->dreg] = call_intrins (ctx, INTRINS_SSE_MPSADBW, args, dname);
                        break;
                }
 
@@ -8445,16 +8495,105 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
                        break;
                }
 
-               case OP_SSE41_PTESTZ: {
-                       LLVMValueRef args [2];
-                       args [0] = convert (ctx, lhs, sse_i8_t);
-                       args [1] = convert (ctx, rhs, sse_i8_t);
-                       LLVMValueRef call = call_intrins (ctx, INTRINS_SSE_PTESTZ, args, dname);
-                       LLVMValueRef cmp_zero = LLVMBuildICmp (builder, LLVMIntNE, call, LLVMConstInt (LLVMInt32Type (), 0, FALSE), "");
-                       values [ins->dreg] = LLVMBuildZExt (builder, cmp_zero, LLVMInt8Type (), "");
+               case OP_SSE41_BLEND_IMM: {
+                       int nelem = LLVMGetVectorSize (LLVMTypeOf (lhs));
+                       g_assert(nelem >= 2 && nelem <= 8); // I2, U2, R4, R8
+                       
+                       int mask_values [8];
+                       for (int i = 0; i < nelem; i++) {
+                               // n-bit in inst_c0 (control byte) is set to 1
+                               gboolean bit_set = ((ins->inst_c0 & ( 1 << i )) >> i);
+                               mask_values [i] = i + (bit_set ? 1 : 0) * nelem;
+                       }
+                       
+                       LLVMValueRef mask = create_const_vector_i32 (mask_values, nelem);
+                       values [ins->dreg] = LLVMBuildShuffleVector (builder, lhs, rhs, mask, "");
+                       break;
+               }
+
+               case OP_SSE41_BLENDV: {
+                       LLVMValueRef args [] = { lhs, rhs, values [ins->sreg3] };
+                       if (ins->inst_c1 == MONO_TYPE_R4) {
+                               values [ins->dreg] = call_intrins (ctx, INTRINS_SSE_BLENDVPS, args, dname);
+                       } else if (ins->inst_c1 == MONO_TYPE_R8) {
+                               values [ins->dreg] = call_intrins (ctx, INTRINS_SSE_BLENDVPD, args, dname);
+                       } else {
+                               // for other non-fp type just convert to <16 x i8> and pass to @llvm.x86.sse41.pblendvb
+                               args [0] = LLVMBuildBitCast (ctx->builder, args [0], sse_i1_t, "");
+                               args [1] = LLVMBuildBitCast (ctx->builder, args [1], sse_i1_t, "");
+                               args [2] = LLVMBuildBitCast (ctx->builder, args [2], sse_i1_t, "");
+                               values [ins->dreg] = call_intrins (ctx, INTRINS_SSE_PBLENDVB, args, dname);
+                       }
                        break;
                }
 
+               case OP_SSE_CVTII: {
+                       gboolean is_signed = (ins->inst_c1 == MONO_TYPE_I1) || 
+                               (ins->inst_c1 == MONO_TYPE_I2) || (ins->inst_c1 == MONO_TYPE_I4);
+
+                       LLVMTypeRef vec_type;
+                       if ((ins->inst_c1 == MONO_TYPE_I1) || (ins->inst_c1 == MONO_TYPE_U1))
+                               vec_type = sse_i1_t;
+                       else if ((ins->inst_c1 == MONO_TYPE_I2) || (ins->inst_c1 == MONO_TYPE_U2))
+                               vec_type = sse_i2_t;
+                       else
+                               vec_type = sse_i4_t;
+
+                       LLVMValueRef value;
+                       if (LLVMGetTypeKind (LLVMTypeOf (lhs)) != LLVMVectorTypeKind) {
+                               LLVMValueRef bitcasted = LLVMBuildBitCast (ctx->builder, lhs, LLVMPointerType (vec_type, 0), "");
+                               value = mono_llvm_build_aligned_load (builder, bitcasted, "", FALSE, 1);
+                       } else {
+                               value = LLVMBuildBitCast (ctx->builder, lhs, vec_type, "");
+                       }
+
+                       const int mask_values [] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+                       LLVMValueRef mask_vec;
+                       LLVMTypeRef dst_type;
+                       if (ins->inst_c0 == MONO_TYPE_I2) {
+                               mask_vec = create_const_vector_i32 (mask_values, 8);
+                               dst_type = sse_i2_t;
+                       } else if (ins->inst_c0 == MONO_TYPE_I4) {
+                               mask_vec = create_const_vector_i32 (mask_values, 4);
+                               dst_type = sse_i4_t;
+                       } else {
+                               g_assert (ins->inst_c0 == MONO_TYPE_I8);
+                               mask_vec = create_const_vector_i32 (mask_values, 2);
+                               dst_type = sse_i8_t;
+                       }
+
+                       LLVMValueRef shuffled = LLVMBuildShuffleVector (builder, value,
+                               LLVMGetUndef (vec_type), mask_vec, "");
+
+                       if (is_signed)
+                               values [ins->dreg] = LLVMBuildSExt (ctx->builder, shuffled, dst_type, "");
+                       else
+                               values [ins->dreg] = LLVMBuildZExt (ctx->builder, shuffled, dst_type, "");
+                       break;
+               }
+
+               case OP_SSE41_LOADANT: {
+                       LLVMValueRef dst_ptr = convert (ctx, lhs, LLVMPointerType (primitive_type_to_llvm_type (inst_c1_type (ins)), 0));
+                       LLVMValueRef dst_vec = LLVMBuildBitCast (builder, dst_ptr, LLVMPointerType (type_to_sse_type (ins->inst_c1), 0), "");
+                       LLVMValueRef load = mono_llvm_build_aligned_load (builder, dst_vec, "", FALSE, 16);
+                       set_nontemporal_flag (load);
+                       values [ins->dreg] = load;
+                       break;
+               }
+
+               case OP_SSE41_MUL: {
+                       // NOTE: LLVM 7 and later use shifts here
+                       // however, pmuldq is still available so I guess it's fine to keep using it
+                       LLVMValueRef args [] = { lhs, rhs };
+                       values [ins->dreg] = call_intrins (ctx, INTRINS_SSE_PMULDQ, args, dname);
+                       break;  
+               }
+
+               case OP_SSE41_MULLO: {
+                       values [ins->dreg] = LLVMBuildMul (ctx->builder, lhs, rhs, "");
+                       break;  
+               }
+
                case OP_SSE42_CRC32:
                case OP_SSE42_CRC64: {
                        LLVMValueRef args [2];
@@ -8471,6 +8610,18 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
                        values [ins->dreg] = call_intrins (ctx, id, args, "");
                        break;
                }
+
+               case OP_PCLMULQDQ_IMM: {
+                       LLVMValueRef args [] = { lhs, rhs, LLVMConstInt (LLVMInt8Type (), ins->inst_c0, FALSE) };
+                       values [ins->dreg] = call_intrins (ctx, INTRINS_PCLMULQDQ, args, "");
+                       break;
+               }
+
+               case OP_AES_KEYGEN_IMM: {
+                       LLVMValueRef args [] = { lhs, LLVMConstInt (LLVMInt8Type (), ins->inst_c0, FALSE) };
+                       values [ins->dreg] = call_intrins (ctx, INTRINS_AESNI_AESKEYGENASSIST, args, "");
+                       break;
+               }
 #endif
 
 #ifdef ENABLE_NETCORE
@@ -8636,42 +8787,9 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
                case OP_XEXTRACT_I32:
                case OP_XEXTRACT_I64:
                case OP_XEXTRACT_R8:
-               case OP_XEXTRACT_R4: {
-                       LLVMBasicBlockRef bbs [64];
-                       LLVMValueRef switch_ins;
-                       LLVMValueRef phi_values [64];
-                       int nelems = LLVMGetVectorSize (LLVMTypeOf (lhs));
-                       int i;
-
-                       g_assert (nelems <= 64);
-                       for (i = 0; i < nelems; ++i)
-                               bbs [i] = gen_bb (ctx, "XEXTRACT_CASE_BB");
-                       cbb = gen_bb (ctx, "XEXTRACT_COND_BB");
-
-                       switch_ins = LLVMBuildSwitch (builder, LLVMBuildAnd (builder, rhs, const_int32 (0xf), ""), bbs [0], 0);
-                       for (i = 0; i < nelems; ++i) {
-                               LLVMAddCase (switch_ins, LLVMConstInt (LLVMInt32Type (), i, FALSE), bbs [i]);
-                               LLVMPositionBuilderAtEnd (builder, bbs [i]);
-                               phi_values [i] = LLVMBuildExtractElement (builder, lhs, LLVMConstInt (LLVMInt32Type (), i, FALSE), "");
-                               LLVMBuildBr (builder, cbb);
-                       }
-
-                       LLVMPositionBuilderAtEnd (builder, cbb);
-                       values [ins->dreg] = LLVMBuildPhi (builder, LLVMTypeOf (phi_values [0]), "");
-                       LLVMAddIncoming (values [ins->dreg], phi_values, bbs, nelems);
-
-                       MonoTypeEnum type = (MonoTypeEnum)ins->inst_c0;
-                       switch (type) {
-                       case MONO_TYPE_U1:
-                       case MONO_TYPE_U2:
-                               values [ins->dreg] = LLVMBuildZExt (ctx->builder, values [ins->dreg], LLVMInt32Type (), "");
-                               break;
-                       default:
-                               break;
-                       }
-                       ctx->bblocks [bb->block_num].end_bblock = cbb;
+               case OP_XEXTRACT_R4:
+                       values [ins->dreg] = LLVMBuildExtractElement (builder, lhs, rhs, "");
                        break;
-               }
                case OP_POPCNT32:
                        values [ins->dreg] = call_intrins (ctx, INTRINS_CTPOP_I32, &lhs, "");
                        break;
index a7279a1..15baeb1 100644 (file)
@@ -1099,15 +1099,29 @@ MINI_OP(OP_SSSE3_SHUFFLE, "ssse3_shuffle", XREG, XREG, XREG)
 MINI_OP3(OP_SSSE3_ALIGNR, "ssse3_alignr", XREG, XREG, XREG, IREG)
 
 /* sse 4.1 */
-/* inst_c0 is the rounding mode: 0 = round, 1 = floor, 2 = ceiling */
-MINI_OP(OP_SSE41_ROUNDPD, "roundpd", XREG, XREG, NONE)
-MINI_OP(OP_SSE41_ROUNDSS, "roundss", XREG, XREG, NONE)
+MINI_OP(OP_SSE41_ROUNDP, "roundp", XREG, XREG, NONE) // packed, inst_c0 - mode, inst_c1 - r4 or r8
+MINI_OP(OP_SSE41_ROUNDS, "rounds", XREG, XREG, NONE) // scalar, inst_c0 - mode, inst_c1 - r4 or r8
 MINI_OP3(OP_SSE41_INSERT, "sse41_insert", XREG, XREG, XREG, IREG)
-MINI_OP(OP_SSE41_PTESTZ, "sse41_ptestz", IREG, XREG, XREG)
+MINI_OP3(OP_SSE41_BLENDV, "sse41_blendv", XREG, XREG, XREG, XREG)
+MINI_OP(OP_SSE41_BLEND_IMM, "sse41_blend", XREG, XREG, XREG)
+MINI_OP(OP_SSE41_LOADANT, "sse41_loadant", XREG, XREG, NONE)
+MINI_OP(OP_SSE41_MUL, "sse41_mul", XREG, XREG, XREG)
+MINI_OP(OP_SSE41_MULLO, "sse41_mullo", XREG, XREG, XREG)
+MINI_OP(OP_SSE_CVTII, "sse_cvtii", XREG, XREG, NONE)
+MINI_OP(OP_SSE41_DPPS_IMM, "sse_dpps", XREG, XREG, XREG)
+MINI_OP(OP_SSE41_DPPD_IMM, "sse_dppd", XREG, XREG, XREG)
+MINI_OP(OP_SSE41_MPSADBW_IMM, "sse_mpsadbw", XREG, XREG, XREG)
+
+/* pclmulqdq */
+MINI_OP(OP_PCLMULQDQ_IMM, "pclmulqdq", XREG, XREG, XREG)
+
+/* aes */
+MINI_OP(OP_AES_KEYGEN_IMM, "aes_keygen", XREG, XREG, NONE)
 
 /* sse 4.2 */
 MINI_OP(OP_SSE42_CRC32, "sse42_crc32", IREG, IREG, IREG)
 MINI_OP(OP_SSE42_CRC64, "sse42_crc64", LREG, LREG, LREG)
+MINI_OP(OP_SSE42_PTESTZ, "sse42_ptestc", IREG, XREG, XREG)
 
 /* Intel BMI1 */
 /* Count trailing zeroes, return 32/64 if the input is 0 */
@@ -1522,6 +1536,7 @@ MINI_OP(OP_XOP, "xop", NONE, NONE, NONE)
 MINI_OP(OP_XOP_X_I, "xop_x_i", XREG, IREG, NONE)
 MINI_OP(OP_XOP_X_X, "xop_x_x", XREG, XREG, NONE)
 MINI_OP(OP_XOP_I4_X, "xop_i4_x", IREG, XREG, NONE)
+MINI_OP(OP_XOP_I4_X_X, "xop_i4_x_x", IREG, XREG, XREG)
 MINI_OP(OP_XOP_I8_X, "xop_i8_x", LREG, XREG, NONE)
 MINI_OP(OP_XOP_X_X_X, "xop_x_x_x", XREG, XREG, XREG)
 MINI_OP(OP_XOP_X_X_I4, "xop_x_x_i4", XREG, XREG, IREG)
index e5c36f5..0e632d0 100644 (file)
@@ -2948,6 +2948,16 @@ typedef enum {
        SIMD_OP_SSE_PMADDUBSW,
        SIMD_OP_SSE_PMULHRSW,
        SIMD_OP_SSE_LDDQU,
+       SIMD_OP_SSE_TESTC,
+       SIMD_OP_SSE_TESTNZ,
+       SIMD_OP_SSE_TESTZ,
+       SIMD_OP_SSE_PACKUSDW,
+       SIMD_OP_SSE_PHMINPOSUW,
+       SIMD_OP_AES_IMC,
+       SIMD_OP_AES_ENC,
+       SIMD_OP_AES_ENCLAST,
+       SIMD_OP_AES_DEC,
+       SIMD_OP_AES_DECLAST,
        SIMD_OP_ARM64_CRC32B,
        SIMD_OP_ARM64_CRC32H,
        SIMD_OP_ARM64_CRC32W,
index a1acfc7..ea4c1a9 100644 (file)
@@ -189,6 +189,9 @@ emit_simd_ins (MonoCompile *cfg, MonoClass *klass, int opcode, int sreg1, int sr
        } else if (spec [MONO_INST_DEST] == 'l') {
                ins->dreg = alloc_lreg (cfg);
                ins->type = STACK_I8;
+       } else if (spec [MONO_INST_DEST] == 'f') {
+               ins->dreg = alloc_freg (cfg);
+               ins->type = STACK_R8;
        }
        ins->sreg1 = sreg1;
        ins->sreg2 = sreg2;
@@ -673,6 +676,14 @@ emit_sys_numerics_vector_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSig
 }
 #endif // !TARGET_ARM64
 
+static MonoInst*
+emit_invalid_operation (MonoCompile *cfg, const char* message)
+{
+       mono_cfg_set_exception (cfg, MONO_EXCEPTION_MONO_ERROR);
+       mono_error_set_generic_error (cfg->error, "System", "InvalidOperationException", "%s", message);
+       return NULL;
+}
+
 #ifdef TARGET_ARM64
 
 static SimdIntrinsic armbase_methods [] = {
@@ -992,10 +1003,40 @@ static SimdIntrinsic ssse3_methods [] = {
 };
 
 static SimdIntrinsic sse41_methods [] = {
+       {SN_Blend},
+       {SN_BlendVariable},
+       {SN_Ceiling, OP_SSE41_ROUNDP, 10 /*round mode*/},
+       {SN_CeilingScalar, OP_SSE41_ROUNDS, 10 /*round mode*/},
+       {SN_CompareEqual, OP_XCOMPARE, CMP_EQ},
+       {SN_ConvertToVector128Int16, OP_SSE_CVTII, MONO_TYPE_I2},
+       {SN_ConvertToVector128Int32, OP_SSE_CVTII, MONO_TYPE_I4},
+       {SN_ConvertToVector128Int64, OP_SSE_CVTII, MONO_TYPE_I8},
+       {SN_DotProduct},
+       {SN_Extract},
+       {SN_Floor, OP_SSE41_ROUNDP, 9 /*round mode*/},
+       {SN_FloorScalar, OP_SSE41_ROUNDS, 9 /*round mode*/},
        {SN_Insert},
+       {SN_LoadAlignedVector128NonTemporal, OP_SSE41_LOADANT},
        {SN_Max, OP_XBINOP, OP_IMAX},
        {SN_Min, OP_XBINOP, OP_IMIN},
-       {SN_TestZ, OP_SSE41_PTESTZ},
+       {SN_MinHorizontal, OP_XOP_X_X, SIMD_OP_SSE_PHMINPOSUW},
+       {SN_MultipleSumAbsoluteDifferences},
+       {SN_Multiply, OP_SSE41_MUL},
+       {SN_MultiplyLow, OP_SSE41_MULLO},
+       {SN_PackUnsignedSaturate, OP_XOP_X_X_X, SIMD_OP_SSE_PACKUSDW},
+       {SN_RoundCurrentDirection, OP_SSE41_ROUNDP, 4 /*round mode*/},
+       {SN_RoundCurrentDirectionScalar, OP_SSE41_ROUNDS, 4 /*round mode*/},
+       {SN_RoundToNearestInteger, OP_SSE41_ROUNDP, 8 /*round mode*/},
+       {SN_RoundToNearestIntegerScalar, OP_SSE41_ROUNDS, 8 /*round mode*/},
+       {SN_RoundToNegativeInfinity, OP_SSE41_ROUNDP, 9 /*round mode*/},
+       {SN_RoundToNegativeInfinityScalar, OP_SSE41_ROUNDS, 9 /*round mode*/},
+       {SN_RoundToPositiveInfinity, OP_SSE41_ROUNDP, 10 /*round mode*/},
+       {SN_RoundToPositiveInfinityScalar, OP_SSE41_ROUNDS, 10 /*round mode*/},
+       {SN_RoundToZero, OP_SSE41_ROUNDP, 11 /*round mode*/},
+       {SN_RoundToZeroScalar, OP_SSE41_ROUNDS, 11 /*round mode*/},
+       {SN_TestC, OP_XOP_I4_X_X, SIMD_OP_SSE_TESTC},
+       {SN_TestNotZAndNotC, OP_XOP_I4_X_X, SIMD_OP_SSE_TESTNZ},
+       {SN_TestZ, OP_XOP_I4_X_X, SIMD_OP_SSE_TESTZ},
        {SN_get_IsSupported}
 };
 
@@ -1005,6 +1046,21 @@ static SimdIntrinsic sse42_methods [] = {
        {SN_get_IsSupported}
 };
 
+static SimdIntrinsic pclmulqdq_methods [] = {
+       {SN_CarrylessMultiply},
+       {SN_get_IsSupported}
+};
+
+static SimdIntrinsic aes_methods [] = {
+       {SN_Decrypt, OP_XOP_X_X_X, SIMD_OP_AES_DEC},
+       {SN_DecryptLast, OP_XOP_X_X_X, SIMD_OP_AES_DECLAST},
+       {SN_Encrypt, OP_XOP_X_X_X, SIMD_OP_AES_ENC},
+       {SN_EncryptLast, OP_XOP_X_X_X, SIMD_OP_AES_ENCLAST},
+       {SN_InverseMixColumns, OP_XOP_X_X, SIMD_OP_AES_IMC},
+       {SN_KeygenAssist},
+       {SN_get_IsSupported}
+};
+
 static SimdIntrinsic popcnt_methods [] = {
        {SN_PopCount},
        {SN_get_IsSupported}
@@ -1041,7 +1097,6 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
        MonoClass *klass = cmethod->klass;
        MonoTypeEnum arg0_type = fsig->param_count > 0 ? get_underlying_type (fsig->params [0]) : MONO_TYPE_VOID;
        SimdIntrinsic *info;
-       gboolean is_corlib = m_class_get_image (cfg->method->klass) == mono_get_corlib ();
 
        if (is_hw_intrinsics_class (klass, "Sse", &is_64bit)) {
                if (!COMPILE_LLVM (cfg))
@@ -1062,17 +1117,11 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
                        EMIT_NEW_ICONST (cfg, ins, supported ? 1 : 0);
                        ins->type = STACK_I4;
                        return ins;
-               case SN_Shuffle: {
-                       if (fsig->param_count != 3)
-                               return NULL;
-                       if (args [2]->opcode != OP_ICONST) {
-                               mono_cfg_set_exception (cfg, MONO_EXCEPTION_MONO_ERROR);
-                               mono_error_set_generic_error (cfg->error, "System", 
-                                       "InvalidOperationException", "mask in Sse.Shuffle must be constant.");
-                               return NULL;
-                       }
-                       return emit_simd_ins_for_sig (cfg, klass, OP_SSE_SHUFFLE, args [2]->inst_c0 /*mask*/, arg0_type, fsig, args);
-               }
+               case SN_Shuffle:
+                       if (args [2]->opcode == OP_ICONST)
+                               return emit_simd_ins_for_sig (cfg, klass, OP_SSE_SHUFFLE, args [2]->inst_c0, arg0_type, fsig, args);
+                       // FIXME: handle non-constant mask (generate a switch)
+                       return emit_invalid_operation (cfg, "mask in Sse.Shuffle must be constant");
                case SN_ConvertScalarToVector128Single: {
                        int op = 0;
                        switch (fsig->params [1]->type) {
@@ -1114,7 +1163,6 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
                        return NULL;
                int id = info->id;
 
-               // Some intrinsics are missing
                supported = (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE2) != 0;
 
                /* Common case */
@@ -1441,9 +1489,7 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
                case SN_AlignRight:
                        if (args [2]->opcode == OP_ICONST)
                                return emit_simd_ins_for_sig (cfg, klass, OP_SSSE3_ALIGNR, args [2]->inst_c0, arg0_type, fsig, args);
-                       else
-                               // FIXME: non-constant mask (generate switch)
-                               return NULL;
+                       return emit_invalid_operation (cfg, "mask in Ssse3.AlignRight must be constant");
                case SN_HorizontalAdd:
                        if (arg0_type == MONO_TYPE_I2)
                                return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X_X, SIMD_OP_SSE_PHADDW, arg0_type, fsig, args);
@@ -1476,21 +1522,50 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
                if (info->op != 0)
                        return emit_simd_ins_for_sig (cfg, klass, info->op, info->instc0, arg0_type, fsig, args);
 
-               supported = COMPILE_LLVM (cfg) && (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE41) != 0 && is_corlib; // We only support the subset used by corelib
+               supported = COMPILE_LLVM (cfg) && (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE41) != 0;
 
                switch (id) {
                case SN_get_IsSupported:
                        EMIT_NEW_ICONST (cfg, ins, supported ? 1 : 0);
                        ins->type = STACK_I4;
                        return ins;
-               case SN_Insert:
-                       if (args [2]->opcode != OP_ICONST) {
-                               mono_cfg_set_exception (cfg, MONO_EXCEPTION_MONO_ERROR);
-                               mono_error_set_generic_error (cfg->error, "System", 
-                                       "InvalidOperationException", "index in Sse41.Insert must be constant.");
-                               return NULL;
+               case SN_DotProduct:
+                       if (args [2]->opcode == OP_ICONST && arg0_type == MONO_TYPE_R4)
+                               return emit_simd_ins_for_sig (cfg, klass, OP_SSE41_DPPS_IMM, args [2]->inst_c0, arg0_type, fsig, args);
+                       else if (args [2]->opcode == OP_ICONST && arg0_type == MONO_TYPE_R8)
+                               return emit_simd_ins_for_sig (cfg, klass, OP_SSE41_DPPD_IMM, args [2]->inst_c0, arg0_type, fsig, args);
+                       // FIXME: handle non-constant control byte (generate a switch)
+                       return emit_invalid_operation (cfg, "control byte in Sse41.DotProduct must be constant");
+               case SN_MultipleSumAbsoluteDifferences:
+                       if (args [2]->opcode == OP_ICONST)
+                               return emit_simd_ins_for_sig (cfg, klass, OP_SSE41_MPSADBW_IMM, args [2]->inst_c0, arg0_type, fsig, args);
+                       // FIXME: handle non-constant control byte (generate a switch)
+                       return emit_invalid_operation (cfg, "control byte in Sse41.MultipleSumAbsoluteDifferences must be constant");
+               case SN_Blend:
+                       if (args [2]->opcode == OP_ICONST)
+                               return emit_simd_ins_for_sig (cfg, klass, OP_SSE41_BLEND_IMM, args [2]->inst_c0, arg0_type, fsig, args);
+                       // FIXME: handle non-constant control byte (generate a switch)
+                       return emit_invalid_operation (cfg, "control byte in Sse41.Blend must be constant");
+               case SN_BlendVariable:
+                       return emit_simd_ins_for_sig (cfg, klass, OP_SSE41_BLENDV, -1, arg0_type, fsig, args);
+               case SN_Extract: {
+                       int op = 0;
+                       switch (arg0_type) {
+                       case MONO_TYPE_U1:
+                       case MONO_TYPE_U4:
+                       case MONO_TYPE_I4: op = OP_XEXTRACT_I32; break;
+                       case MONO_TYPE_I8:
+                       case MONO_TYPE_U8: op = OP_XEXTRACT_I64; break;
+                       case MONO_TYPE_R4: op = OP_XEXTRACT_R4; break;
+                       default: g_assert_not_reached(); break;
                        }
-                       return emit_simd_ins_for_sig (cfg, klass, OP_SSE41_INSERT, -1, arg0_type, fsig, args);
+                       return emit_simd_ins_for_sig (cfg, klass, op, arg0_type, 0, fsig, args);
+               }
+               case SN_Insert:
+                       if (args [2]->opcode == OP_ICONST)
+                               return emit_simd_ins_for_sig (cfg, klass, OP_SSE41_INSERT, -1, arg0_type, fsig, args);
+                       // FIXME: handle non-constant index (generate a switch)
+                       return emit_invalid_operation (cfg, "index in Sse41.Insert must be constant");
                default:
                        g_assert_not_reached ();
                        break;
@@ -1509,8 +1584,7 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
                if (info->op != 0)
                        return emit_simd_ins_for_sig (cfg, klass, info->op, info->instc0, arg0_type, fsig, args);
 
-               // FIXME: remove is_corlib check once Sse41 is implemented
-               supported = COMPILE_LLVM (cfg) && (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE42) != 0 && is_corlib; 
+               supported = COMPILE_LLVM (cfg) && (mini_get_cpu_features (cfg) & MONO_CPU_X86_SSE42) != 0; 
 
                switch (id) {
                case SN_get_IsSupported:
@@ -1529,6 +1603,68 @@ emit_x86_intrinsics (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature
                }
        }
 
+       if (is_hw_intrinsics_class (klass, "Pclmulqdq", &is_64bit)) {
+               if (!COMPILE_LLVM (cfg))
+                       return NULL;
+               info = lookup_intrins_info (pclmulqdq_methods, sizeof (pclmulqdq_methods), cmethod);
+               if (!info)
+                       return NULL;
+               int id = info->id;
+
+               /* Common case */
+               if (info->op != 0)
+                       return emit_simd_ins_for_sig (cfg, klass, info->op, info->instc0, arg0_type, fsig, args);
+
+               supported = COMPILE_LLVM (cfg) && (mini_get_cpu_features (cfg) & MONO_CPU_X86_PCLMUL) != 0; 
+
+               switch (id) {
+               case SN_CarrylessMultiply: {
+                       if (args [2]->opcode == OP_ICONST)
+                               return emit_simd_ins_for_sig (cfg, klass, OP_PCLMULQDQ_IMM, args [2]->inst_c0, arg0_type, fsig, args);
+                       // FIXME: handle non-constant control byte (generate a switch)
+                       return emit_invalid_operation (cfg, "index in Pclmulqdq.CarrylessMultiply must be constant");
+               }
+               case SN_get_IsSupported:
+                       EMIT_NEW_ICONST (cfg, ins, supported ? 1 : 0);
+                       ins->type = STACK_I4;
+                       return ins;
+               default:
+                       g_assert_not_reached ();
+                       break;
+               }
+       }
+
+       if (is_hw_intrinsics_class (klass, "Aes", &is_64bit)) {
+               if (!COMPILE_LLVM (cfg))
+                       return NULL;
+               info = lookup_intrins_info (aes_methods, sizeof (aes_methods), cmethod);
+               if (!info)
+                       return NULL;
+               int id = info->id;
+
+               /* Common case */
+               if (info->op != 0)
+                       return emit_simd_ins_for_sig (cfg, klass, info->op, info->instc0, arg0_type, fsig, args);
+
+               supported = COMPILE_LLVM (cfg) && (mini_get_cpu_features (cfg) & MONO_CPU_X86_AES) != 0; 
+
+               switch (id) {
+               case SN_get_IsSupported:
+                       EMIT_NEW_ICONST (cfg, ins, supported ? 1 : 0);
+                       ins->type = STACK_I4;
+                       return ins;
+               case SN_KeygenAssist: {
+                       if (args [1]->opcode == OP_ICONST)
+                               return emit_simd_ins_for_sig (cfg, klass, OP_AES_KEYGEN_IMM, args [1]->inst_c0, arg0_type, fsig, args);
+                       // FIXME: handle non-constant control byte (generate a switch)
+                       return emit_invalid_operation (cfg, "control byte in Aes.KeygenAssist must be constant");
+               }
+               default:
+                       g_assert_not_reached ();
+                       break;
+               }
+       }
+
        if (is_hw_intrinsics_class (klass, "Popcnt", &is_64bit)) {
                info = lookup_intrins_info (popcnt_methods, sizeof (popcnt_methods), cmethod);
                if (!info)
index d8871d0..fceba00 100644 (file)
@@ -190,10 +190,43 @@ METHOD(HorizontalSubtractSaturate)
 METHOD(MultiplyHighRoundScale)
 METHOD(Sign)
 // Sse41
+METHOD(Blend)
+METHOD(BlendVariable)
+METHOD(Ceiling)
+METHOD(CeilingScalar)
+METHOD(ConvertToVector128Int16)
+METHOD(ConvertToVector128Int64)
+METHOD(Floor)
+METHOD(FloorScalar)
 METHOD(Insert)
+METHOD(LoadAlignedVector128NonTemporal)
+METHOD(RoundCurrentDirectionScalar)
+METHOD(RoundToNearestInteger)
+METHOD(RoundToNearestIntegerScalar)
+METHOD(RoundToNegativeInfinity)
+METHOD(RoundToNegativeInfinityScalar)
+METHOD(RoundToPositiveInfinity)
+METHOD(RoundToPositiveInfinityScalar)
+METHOD(RoundToZero)
+METHOD(RoundToZeroScalar)
+METHOD(RoundCurrentDirection)
+METHOD(MinHorizontal)
+METHOD(TestC)
+METHOD(TestNotZAndNotC)
 METHOD(TestZ)
+METHOD(DotProduct)
+METHOD(MultipleSumAbsoluteDifferences)
 // Sse42
 METHOD(Crc32)
+// Aes
+METHOD(Decrypt)
+METHOD(DecryptLast)
+METHOD(Encrypt)
+METHOD(EncryptLast)
+METHOD(InverseMixColumns)
+METHOD(KeygenAssist)
+// Pclmulqdq
+METHOD(CarrylessMultiply)
 // ArmBase
 METHOD(LeadingSignCount)
 METHOD(ReverseElementBits)