[mono] Adding support for Vector128::ExtractMostSignificantBits intrinsics on amd64...
author Matous Kozak <55735845+matouskozak@users.noreply.github.com>
Thu, 10 Aug 2023 08:47:51 +0000 (10:47 +0200)
committer GitHub <noreply@github.com>
Thu, 10 Aug 2023 08:47:51 +0000 (10:47 +0200)
* Extract MSB amd64

* add SSSE3 check

src/mono/mono/arch/amd64/amd64-codegen.h
src/mono/mono/mini/cpu-amd64.mdesc
src/mono/mono/mini/mini-amd64.c
src/mono/mono/mini/simd-intrinsics.c

index 5ebb5ae..304ff3c 100644 (file)
@@ -895,6 +895,7 @@ typedef union {
 
 #define amd64_sse_movsldup_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0xf3, 0x0f, 0x12)
 
+#define amd64_sse_pshufb_reg_reg(inst,dreg,reg) emit_sse_reg_reg_op4((inst), (dreg), (reg), 0x66, 0x0f, 0x38, 0x00) /* PSHUFB, reg-reg form: emits bytes 66 0F 38 00 via the 4-opcode-byte emitter; used for OP_SSSE3_SHUFFLE */
 
 #define amd64_sse_pshufhw_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_imm((inst), (dreg), (reg), 0xf3, 0x0f, 0x70, (imm))
 
@@ -947,6 +948,10 @@ typedef union {
 
 #define amd64_sse_pmovmskb_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xd7)
 
+#define amd64_sse_movmskps_reg_reg(inst,dreg,reg) emit_sse_reg_reg_op2((inst), (dreg), (reg), 0x0f, 0x50) /* MOVMSKPS, reg-reg form: opcode bytes 0F 50 with no leading 0x66 byte, hence the _op2 emitter; used by OP_SSE_MOVMSK for MONO_TYPE_R4 */
+
+#define amd64_sse_movmskpd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x50) /* MOVMSKPD, reg-reg form: bytes 66 0F 50 (same 0F 50 opcode as movmskps plus the 0x66 byte); used by OP_SSE_MOVMSK for MONO_TYPE_R8 */
+
 
 #define amd64_sse_pand_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xdb)
 
index 0842ac8..06321f8 100644 (file)
@@ -827,6 +827,8 @@ expand_r4: dest:x src1:f len:16
 expand_r8: dest:x src1:f len:13
 xop_x_x_x: dest:x src1:x src2:x len:16 clob:1
 xop_x_x: dest:x src1:x len:16 clob:1
+sse_movmsk: dest:i src1:x len:5
+ssse3_shuffle: dest:x src1:x src2:x len:6 clob:1
 sse41_dpps_imm: dest:x src1:x src2:x len:7 clob:1
 sse41_dppd_imm: dest:x src1:x src2:x len:7 clob:1
 vector_andnot: dest:x src1:x src2:x len:7 clob:1
index fb84039..1a2f9ff 100644 (file)
@@ -7521,6 +7521,23 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1);
                        amd64_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0x44);
                        break;
+               case OP_SSE_MOVMSK: {
+                       switch (ins->inst_c1) {
+                       case MONO_TYPE_R4:
+                               amd64_sse_movmskps_reg_reg (code, ins->dreg, ins->sreg1);
+                               break;
+                       case MONO_TYPE_R8:
+                               amd64_sse_movmskpd_reg_reg (code, ins->dreg, ins->sreg1);
+                               break;
+                       default:
+                               amd64_sse_pmovmskb_reg_reg (code, ins->dreg, ins->sreg1);
+                               break;
+                       }
+                       break;
+               }
+               case OP_SSSE3_SHUFFLE:
+                       amd64_sse_pshufb_reg_reg (code, ins->dreg, ins->sreg2);
+                       break;
                case OP_SSE41_ROUNDP: {
                        if (ins->inst_c1 == MONO_TYPE_R8)
                                amd64_sse_roundpd_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0);
index 4f05655..10f1dc9 100644 (file)
@@ -1839,7 +1839,51 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
                }
                return result_ins;
 #elif defined(TARGET_AMD64)
-               return NULL;
+               int type = MONO_TYPE_I1;
+
+               switch (arg0_type) {
+                       case MONO_TYPE_U2:
+                       case MONO_TYPE_I2: {
+                               if (!is_SIMD_feature_supported (cfg, MONO_CPU_X86_SSSE3)) 
+                                       return NULL;
+                                       
+                               type = type_enum_is_unsigned (arg0_type) ? MONO_TYPE_U1 : MONO_TYPE_I1;
+                               MonoClass* arg_class = mono_class_from_mono_type_internal (fsig->params [0]);
+
+                               guint64 shuffle_mask[2];
+                               shuffle_mask[0] = 0x0F0D0B0907050301; // Place odd bytes in the lower half of vector
+                               shuffle_mask[1] = 0x8080808080808080; // Zero the upper half
+
+                               MonoInst* shuffle_vec = emit_xconst_v128 (cfg, arg_class, (guint8*)shuffle_mask);
+                               shuffle_vec->klass = arg_class;
+
+                               args [0] = emit_simd_ins (cfg, klass, OP_SSSE3_SHUFFLE, args [0]->dreg, shuffle_vec->dreg);
+                               args [0]->inst_c1 = type;
+                               break;
+                       }
+#if TARGET_SIZEOF_VOID_P == 4
+                       case MONO_TYPE_I:
+                       case MONO_TYPE_U:
+#endif
+                       case MONO_TYPE_U4:
+                       case MONO_TYPE_I4:
+                       case MONO_TYPE_R4: {
+                               type = MONO_TYPE_R4;
+                               break;
+                       }
+#if TARGET_SIZEOF_VOID_P == 8
+                       case MONO_TYPE_I:
+                       case MONO_TYPE_U:
+#endif
+                       case MONO_TYPE_U8:
+                       case MONO_TYPE_I8:
+                       case MONO_TYPE_R8: {
+                               type = MONO_TYPE_R8;
+                               break;
+                       }
+               }
+
+               return emit_simd_ins_for_sig (cfg, klass, OP_SSE_MOVMSK, -1, type, fsig, args);
 #endif
        }
        case SN_GetElement: {