ix86_tune_features[X86_TUNE_EXPAND_ABS]
#define TARGET_V2DF_REDUCTION_PREFER_HADDPD \
ix86_tune_features[X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD]
+/* Nonzero if a zero-idiom should be emitted before certain insns to
+   break a false dependence on the destination register on the GLC
+   micro-architecture (see X86_TUNE_DEST_FALSE_DEP_FOR_GLC).  */
+#define TARGET_DEST_FALSE_DEP_FOR_GLC \
+ ix86_tune_features[X86_TUNE_DEST_FALSE_DEP_FOR_GLC]
/* Feature tests against the various architecture variations. */
enum ix86_arch_indices {
(match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")]
UNSPEC_COMPLEX_F_C_MUL))]
"TARGET_AVX512FP16 && <round_mode512bit_condition>"
- "v<complexopname><ssemodesuffix>\t{<round_maskc_op3>%2, %1, %0<maskc_operand3>|%0<maskc_operand3>, %1, %2<round_maskc_op3>}"
+{
+  /* Break the GLC false dependence on the destination with a zero-idiom;
+     only when the insn does not already read operand 0 (no merge-masking).
+     No {att|intel} braces: all operands are %x0, so both dialects agree,
+     and a brace group without '|' would emit nothing for -masm=intel.  */
+  if (TARGET_DEST_FALSE_DEP_FOR_GLC
+      && <maskc_dest_false_dep_for_glc_cond>)
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
+  return "v<complexopname><ssemodesuffix>\t{<round_maskc_op3>%2, %1, %0<maskc_operand3>|%0<maskc_operand3>, %1, %2<round_maskc_op3>}";
+}
[(set_attr "type" "ssemul")
(set_attr "mode" "<MODE>")])
(match_dup 1)
(const_int 3)))]
"TARGET_AVX512FP16"
- "v<complexopname>sh\t{<round_scalarc_mask_op3>%2, %1, %0<mask_scalarc_operand3>|%0<mask_scalarc_operand3>, %1, %2<round_scalarc_mask_op3>}"
+{
+  /* Zero the destination first to break its false dependence on GLC.
+     The vxorps template must not use dialect braces: with no '|'
+     alternative, -masm=intel would drop the operand list entirely.  */
+  if (TARGET_DEST_FALSE_DEP_FOR_GLC
+      && <mask_scalarc_dest_false_dep_for_glc_cond>)
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
+  return "v<complexopname>sh\t{<round_scalarc_mask_op3>%2, %1, %0<mask_scalarc_operand3>|%0<mask_scalarc_operand3>, %1, %2<round_scalarc_mask_op3>}";
+}
[(set_attr "type" "ssemul")
(set_attr "mode" "V8HF")])
(match_operand:VI8_AVX512VL 2 "bcst_vector_operand" "vmBr")))]
"TARGET_AVX512DQ && <mask_mode512bit_condition>
&& ix86_binary_operator_ok (MULT, <MODE>mode, operands)"
- "vpmullq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+{
+  /* For GLC, emit a zero-idiom to break the destination's false
+     dependence, unless the insn already reads it (merge-masking or
+     source overlap).  Plain template, no {att|intel} braces: a brace
+     group without '|' emits nothing under -masm=intel.  */
+  if (TARGET_DEST_FALSE_DEP_FOR_GLC
+      && <mask3_dest_false_dep_for_glc_cond>
+      && !reg_mentioned_p (operands[0], operands[1])
+      && !reg_mentioned_p (operands[0], operands[2]))
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
+  return "vpmullq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}";
+}
[(set_attr "type" "sseimul")
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
(match_operand:<sseintvecmode> 2 "register_operand" "v")]
UNSPEC_VPERMVAR))]
"TARGET_AVX2 && <mask_mode512bit_condition>"
- "vperm<ssemodesuffix>\t{%1, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1}"
+{
+  /* Break the GLC false dependence on the destination unless the insn
+     already reads it.  The zero-idiom template deliberately has no
+     dialect braces: all three operands are %x0, and a brace group
+     without '|' would output nothing for -masm=intel.  */
+  if (TARGET_DEST_FALSE_DEP_FOR_GLC
+      && <mask3_dest_false_dep_for_glc_cond>
+      && !reg_mentioned_p (operands[0], operands[1])
+      && !reg_mentioned_p (operands[0], operands[2]))
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
+  return "vperm<ssemodesuffix>\t{%1, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1}";
+}
[(set_attr "type" "sselog")
(set_attr "prefix" "<mask_prefix2>")
(set_attr "mode" "<sseinsnmode>")])
mask |= INTVAL (operands[4]) << 4;
mask |= INTVAL (operands[5]) << 6;
operands[2] = GEN_INT (mask);
+  /* GLC false-dependence workaround; vxorps without dialect braces so
+     -masm=intel keeps the operand list.  */
+  if (TARGET_DEST_FALSE_DEP_FOR_GLC
+      && <mask6_dest_false_dep_for_glc_cond>
+      && !reg_mentioned_p (operands[0], operands[1]))
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
return "vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand6>|%0<mask_operand6>, %1, %2}";
}
[(set_attr "type" "sselog")
mask |= INTVAL (operands[4]) << 4;
mask |= INTVAL (operands[5]) << 6;
operands[2] = GEN_INT (mask);
+  /* GLC false-dependence workaround; vxorps without dialect braces so
+     -masm=intel keeps the operand list.  */
+  if (TARGET_DEST_FALSE_DEP_FOR_GLC
+      && <mask10_dest_false_dep_for_glc_cond>
+      && !reg_mentioned_p (operands[0], operands[1]))
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
return "vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand10>|%0<mask_operand10>, %1, %2}";
}
[(set_attr "type" "sselog")
(match_operand:SI 3 "const_0_to_15_operand")]
UNSPEC_RANGE))]
"TARGET_AVX512DQ && <round_saeonly_mode512bit_condition>"
- "vrange<ssemodesuffix>\t{%3, <round_saeonly_mask_op4>%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2<round_saeonly_mask_op4>, %3}"
+{
+  /* Zero the destination to break its GLC false dependence unless the
+     insn already reads it.  No {att|intel} braces on the zero-idiom:
+     a brace group without '|' emits nothing under -masm=intel.  */
+  if (TARGET_DEST_FALSE_DEP_FOR_GLC
+      && <mask4_dest_false_dep_for_glc_cond>
+      && !reg_mentioned_p (operands[0], operands[1])
+      && !reg_mentioned_p (operands[0], operands[2]))
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
+  return "vrange<ssemodesuffix>\t{%3, <round_saeonly_mask_op4>%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2<round_saeonly_mask_op4>, %3}";
+}
[(set_attr "type" "sse")
(set_attr "prefix" "evex")
(set_attr "mode" "<MODE>")])
(match_dup 1)
(const_int 1)))]
"TARGET_AVX512DQ"
- "vrange<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}"
+{
+  /* Zero the destination to break its GLC false dependence unless the
+     insn already reads it.  vxorps is dialect-neutral here (all three
+     operands are %x0), so no {att|intel} braces — a brace group
+     without '|' would drop the operands for -masm=intel.  */
+  if (TARGET_DEST_FALSE_DEP_FOR_GLC
+      && <mask_scalar4_dest_false_dep_for_glc_cond>
+      && !reg_mentioned_p (operands[0], operands[1])
+      && !reg_mentioned_p (operands[0], operands[2]))
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
+  return "vrange<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
+}
[(set_attr "type" "sse")
(set_attr "prefix" "evex")
(set_attr "mode" "<MODE>")])
(match_operand:SI 2 "const_0_to_15_operand")]
UNSPEC_GETMANT))]
"TARGET_AVX512F"
- "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}";
+{
+  /* Only a memory source leaves the destination's old value live long
+     enough to matter; break the GLC false dependence then.  The
+     zero-idiom has no dialect braces: a brace group without '|'
+     outputs nothing under -masm=intel.  */
+  if (TARGET_DEST_FALSE_DEP_FOR_GLC
+      && <mask3_dest_false_dep_for_glc_cond>
+      && MEM_P (operands[1]))
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
+  return "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}";
+}
[(set_attr "prefix" "evex")
(set_attr "mode" "<MODE>")])
(match_dup 1)
(const_int 1)))]
"TARGET_AVX512F"
- "vgetmant<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
+{
+  /* Zero the destination to break its GLC false dependence unless the
+     insn already reads it.  No {att|intel} braces on the zero-idiom:
+     without a '|' alternative it would emit no operands for
+     -masm=intel.  */
+  if (TARGET_DEST_FALSE_DEP_FOR_GLC
+      && <mask_scalar4_dest_false_dep_for_glc_cond>
+      && !reg_mentioned_p (operands[0], operands[1])
+      && !reg_mentioned_p (operands[0], operands[2]))
+    output_asm_insn ("vxorps\t%x0, %x0, %x0", operands);
+  return "vgetmant<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}";
+}
[(set_attr "prefix" "evex")
(set_attr "mode" "<ssescalarmode>")])
(define_subst_attr "mask_prefix4" "mask" "orig,orig,vex" "evex,evex,evex")
(define_subst_attr "bcst_mask_prefix4" "mask" "orig,orig,maybe_evex" "evex,evex,evex")
(define_subst_attr "mask_expand_op3" "mask" "3" "5")
+;; Condition guarding the GLC false-dependence zero-idiom: always true
+;; for the unmasked form; for the masked form, true only under
+;; zero-masking (merge operand is const0, so the insn does not already
+;; read the destination).  GNU style: space before the call parenthesis.
+(define_subst_attr "mask3_dest_false_dep_for_glc_cond" "mask" "1" "operands[3] == CONST0_RTX (<MODE>mode)")
+(define_subst_attr "mask4_dest_false_dep_for_glc_cond" "mask" "1" "operands[4] == CONST0_RTX (<MODE>mode)")
+(define_subst_attr "mask6_dest_false_dep_for_glc_cond" "mask" "1" "operands[6] == CONST0_RTX (<MODE>mode)")
+(define_subst_attr "mask10_dest_false_dep_for_glc_cond" "mask" "1" "operands[10] == CONST0_RTX (<MODE>mode)")
+(define_subst_attr "maskc_dest_false_dep_for_glc_cond" "maskc" "1" "operands[3] == CONST0_RTX (<MODE>mode)")
(define_subst "mask"
[(set (match_operand:SUBST_V 0)
(define_subst_attr "mask_scalar_operand3" "mask_scalar" "" "%{%4%}%N3")
(define_subst_attr "mask_scalar_operand4" "mask_scalar" "" "%{%5%}%N4")
(define_subst_attr "mask_scalarcz_operand4" "mask_scalarcz" "" "%{%5%}%N4")
+;; Scalar variants of the GLC false-dependence guard: true for the
+;; unmasked form, and under zero-masking only for the masked form.
+;; GNU style: space before the call parenthesis.
+(define_subst_attr "mask_scalar4_dest_false_dep_for_glc_cond" "mask_scalar" "1" "operands[4] == CONST0_RTX (<MODE>mode)")
+(define_subst_attr "mask_scalarc_dest_false_dep_for_glc_cond" "mask_scalarc" "1" "operands[3] == CONST0_RTX (V8HFmode)")
(define_subst "mask_scalar"
[(set (match_operand:SUBST_V 0)
m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
| m_BDVER | m_ZNVER | m_ALDERLAKE | m_GENERIC)
+/* X86_TUNE_DEST_FALSE_DEP_FOR_GLC: This knob inserts zero-idiom before
+   several insns (vperm*, vrange*, vgetmant*, vpmullq and the FP16
+   complex-multiply insns) to break false dependency on the dest
+   register for GLC micro-architecture.  */
+DEF_TUNE (X86_TUNE_DEST_FALSE_DEP_FOR_GLC,
+ "dest_false_dep_for_glc", m_SAPPHIRERAPIDS | m_ALDERLAKE)
+
/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
are resolved on SSE register parts instead of whole registers, so we may
maintain just lower part of scalar values in proper format leaving the
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -mtune=generic -mtune-ctrl=dest_false_dep_for_glc -O2" } */
+
+
+#include <immintrin.h>
+
+extern __m256i i1, i2, i3, i4;
+extern __m256d d1, d2;
+extern __m256 f1, f2;
+
+/* Every destination differs from every source, so each vperm below is
+   expected to be preceded by a dependency-breaking vxorps (the
+   dg-final below counts 4).  */
+void vperm_test (void)
+{
+  i3 = _mm256_permutevar8x32_epi32 (i1, i2);
+  i4 = _mm256_permute4x64_epi64 (i1, 12);
+  d2 = _mm256_permute4x64_pd (d1, 12);
+  f2 = _mm256_permutevar8x32_ps (f1, i2);
+}
+
+/* { dg-final { scan-assembler-times "vxorps" 4 } } */
+/* { dg-final { scan-assembler-times "vpermd" 1 } } */
+/* { dg-final { scan-assembler-times "vpermq" 1 } } */
+/* { dg-final { scan-assembler-times "vpermpd" 1 } } */
+/* { dg-final { scan-assembler-times "vpermps" 1 } } */
+
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-mavx512dq -mavx512vl -mtune=generic -mtune-ctrl=dest_false_dep_for_glc -O2" } */
+
+#include <immintrin.h>
+
+extern __m512i i1;
+extern __m256i i2;
+extern __m128i i3;
+extern __m512d d1, d11;
+extern __m256d d2;
+extern __m128d d3, d33;
+extern __m512 f1, f11;
+extern __m256 f2;
+extern __m128 f3, f33;
+
+__mmask32 m32;
+__mmask16 m16;
+__mmask8 m8;
+
+/* Exercise vpmullq at all three vector widths with the unmasked,
+   merge-masked and zero-masked forms; the vxorps count in dg-final
+   reflects which forms get the GLC dependency-breaking idiom.  */
+void mullo_test (void)
+{
+  i1 = _mm512_mullo_epi64 (i1, i1);
+  i1 = _mm512_mask_mullo_epi64 (i1, m8, i1, i1);
+  i1 = _mm512_maskz_mullo_epi64 (m8, i1, i1);
+  i2 = _mm256_mullo_epi64 (i2, i2);
+  i2 = _mm256_mask_mullo_epi64 (i2, m8, i2, i2);
+  i2 = _mm256_maskz_mullo_epi64 (m8, i2, i2);
+  i3 = _mm_mullo_epi64 (i3, i3);
+  i3 = _mm_mask_mullo_epi64 (i3, m8, i3, i3);
+  i3 = _mm_maskz_mullo_epi64 (m8, i3, i3);
+}
+
+/* Exercise vrange{pd,ps,sd,ss} across widths, rounding variants and
+   all masking forms; dg-final counts the resulting vxorps idioms.  */
+void range_test (void)
+{
+  d1 = _mm512_range_pd (d1, d11, 15);
+  d11 = _mm512_range_round_pd (d11, d1, 15, 8);
+  d1 = _mm512_mask_range_pd (d1, m8, d11, d11, 15);
+  d11 = _mm512_mask_range_round_pd (d11, m8, d1, d1, 15, 8);
+  d1 = _mm512_maskz_range_pd (m8, d11, d11, 15);
+  d11 = _mm512_maskz_range_round_pd (m8, d1, d1, 15, 8);
+  d2 = _mm256_range_pd (d2, d2, 15);
+  d2 = _mm256_mask_range_pd (d2, m8, d2, d2, 15);
+  d2 = _mm256_maskz_range_pd (m8, d2, d2, 15);
+  d3 = _mm_range_pd (d3, d3, 15);
+  d3 = _mm_mask_range_pd (d3, m8, d3, d3, 15);
+  d3 = _mm_maskz_range_pd (m8, d3, d3, 15);
+  d33 = _mm_range_sd (d33, d33, 15);
+  d33 = _mm_mask_range_sd (d33, m8, d33, d33, 15);
+  d33 = _mm_maskz_range_sd (m8, d33, d33, 15);
+
+  f1 = _mm512_range_ps (f1, f11, 15);
+  f11 = _mm512_range_round_ps (f11, f1, 15, 8);
+  f1 = _mm512_mask_range_ps (f1, m16, f11, f11, 15);
+  f11 = _mm512_mask_range_round_ps (f11, m16, f1, f1, 15, 8);
+  f1 = _mm512_maskz_range_ps (m16, f11, f11, 15);
+  f11 = _mm512_maskz_range_round_ps (m16, f1, f1, 15, 8);
+  f2 = _mm256_range_ps (f2, f2, 15);
+  f2 = _mm256_mask_range_ps (f2, m8, f2, f2, 15);
+  f2 = _mm256_maskz_range_ps (m8, f2, f2, 15);
+  f3 = _mm_range_ps (f3, f3, 15);
+  f3 = _mm_mask_range_ps (f3, m8, f3, f3, 15);
+  f3 = _mm_maskz_range_ps (m8, f3, f3, 15);
+  f33 = _mm_range_ss (f33, f33, 15);
+  f33 = _mm_mask_range_ss (f33, m8, f33, f33, 15);
+  f33 = _mm_maskz_range_ss (m8, f33, f33, 15);
+}
+
+/* { dg-final { scan-assembler-times "vxorps" 26 } } */
+/* { dg-final { scan-assembler-times "vpmullq" 9 } } */
+/* { dg-final { scan-assembler-times "vrangepd" 12 } } */
+/* { dg-final { scan-assembler-times "vrangesd" 3 } } */
+/* { dg-final { scan-assembler-times "vrangeps" 12 } } */
+/* { dg-final { scan-assembler-times "vrangess" 3 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -mtune=generic -mtune-ctrl=dest_false_dep_for_glc -O2" } */
+
+#include <immintrin.h>
+
+extern __m512i i1, i2, i3;
+extern __m512d d1, d11, *pd1;
+extern __m128d d2;
+extern __m512 f1, *pf1;
+extern __m128 f2;
+volatile __m512d *pd11;
+
+__mmask16 m16;
+__mmask8 m8;
+
+/* Exercise 512-bit vperm{d,q,ps,pd} in immediate and variable forms
+   with unmasked, merge-masked and zero-masked variants; dg-final
+   counts both the insns and the GLC vxorps idioms.  */
+void vperm_test (void)
+{
+  d1 = _mm512_permutex_pd (d1, 12);
+  d1 = _mm512_mask_permutex_pd (d1, m8, d1, 13);
+  d1 = _mm512_maskz_permutex_pd (m8, d1, 14);
+  d11 = _mm512_permutexvar_pd (i1, d11);
+  d11 = _mm512_mask_permutexvar_pd (d11, m8, i2, d11);
+  d11 = _mm512_maskz_permutexvar_pd (m8, i3, d11);
+
+  f1 = _mm512_permutexvar_ps (i1, f1);
+  f1 = _mm512_mask_permutexvar_ps (f1, m16, i1, f1);
+  f1 = _mm512_maskz_permutexvar_ps (m16, i1, f1);
+
+  i3 = _mm512_permutexvar_epi64 (i3, i3);
+  i3 = _mm512_mask_permutexvar_epi64 (i3, m8, i1, i1);
+  i3 = _mm512_maskz_permutexvar_epi64 (m8, i3, i1);
+  i1 = _mm512_permutex_epi64 (i3, 12);
+  i1 = _mm512_mask_permutex_epi64 (i1, m8, i1, 12);
+  i1 = _mm512_maskz_permutex_epi64 (m8, i1, 12);
+
+  i2 = _mm512_permutexvar_epi32 (i2, i2);
+  i2 = _mm512_mask_permutexvar_epi32 (i2, m16, i2, i2);
+  i3 = _mm512_maskz_permutexvar_epi32 (m16, i3, i3);
+}
+
+/* Exercise vgetmant{pd,ps,sd,ss}.  The packed forms load from memory
+   (*pd1, *pf1) because the insn pattern only emits the GLC vxorps for
+   a memory source; the scalar forms use register operands.  */
+void getmant_test (void)
+{
+  d1 = _mm512_getmant_pd (*pd1, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  d1 = _mm512_getmant_round_pd (*pd11, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src, 8);
+  d1 = _mm512_mask_getmant_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  d1 = _mm512_mask_getmant_round_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src, 8);
+  d1 = _mm512_maskz_getmant_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  d1 = _mm512_maskz_getmant_round_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src, 8);
+  f1 = _mm512_getmant_ps (*pf1, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  f1 = _mm512_getmant_round_ps (*pf1, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src, 8);
+  f1 = _mm512_mask_getmant_ps (f1, m16, *pf1, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  f1 = _mm512_mask_getmant_round_ps (f1, m16, *pf1, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src, 8);
+  f1 = _mm512_maskz_getmant_ps (m16, *pf1, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  f1 = _mm512_maskz_getmant_round_ps (m16, *pf1, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src, 8);
+
+  d2 = _mm_getmant_sd (d2, d2, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  d2 = _mm_getmant_round_sd (d2, d2, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src, 8);
+  d2 = _mm_mask_getmant_sd (d2, m8, d2, d2, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  d2 = _mm_mask_getmant_round_sd (d2, m8, d2, d2, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src, 8);
+  d2 = _mm_maskz_getmant_sd (m8, d2, d2, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  d2 = _mm_maskz_getmant_round_sd (m8, d2, d2, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src, 8);
+  f2 = _mm_getmant_ss (f2, f2, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  f2 = _mm_getmant_round_ss (f2, f2, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src, 8);
+  f2 = _mm_mask_getmant_ss (f2, m8, f2, f2, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  f2 = _mm_mask_getmant_round_ss (f2, m8, f2, f2, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src, 8);
+  f2 = _mm_maskz_getmant_ss (m8, f2, f2, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  f2 = _mm_maskz_getmant_round_ss (m8, f2, f2, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src, 8);
+
+}
+
+/* { dg-final { scan-assembler-times "vxorps" 22 } } */
+/* { dg-final { scan-assembler-times "vpermd" 3 } } */
+/* { dg-final { scan-assembler-times "vpermq" 6 } } */
+/* { dg-final { scan-assembler-times "vpermps" 3 } } */
+/* { dg-final { scan-assembler-times "vpermpd" 6 } } */
+/* { dg-final { scan-assembler-times "vgetmantpd" 6 } } */
+/* { dg-final { scan-assembler-times "vgetmantps" 6 } } */
+/* { dg-final { scan-assembler-times "vgetmantsd" 6 } } */
+/* { dg-final { scan-assembler-times "vgetmantss" 6 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -mtune=generic -mtune-ctrl=dest_false_dep_for_glc -O2" } */
+
+#include <immintrin.h>
+
+extern __m512h h1;
+extern __m256h h2;
+extern __m128h h3;
+
+__mmask32 m32;
+__mmask16 m16;
+__mmask8 m8;
+
+/* Exercise vfmulcph/vfmulcsh (plain, rounding, merge-masked and
+   zero-masked forms); dg-final counts the GLC vxorps idioms.  */
+void complex_mul_test (void)
+{
+  h1 = _mm512_fmul_pch (h1, h1);
+  h1 = _mm512_fmul_round_pch (h1, h1, 8);
+  h1 = _mm512_mask_fmul_pch (h1, m32, h1, h1);
+  h1 = _mm512_mask_fmul_round_pch (h1, m32, h1, h1, 8);
+  h1 = _mm512_maskz_fmul_pch (m32, h1, h1);
+  h1 = _mm512_maskz_fmul_round_pch (m32, h1, h1, 11);
+
+  h3 = _mm_fmul_sch (h3, h3);
+  h3 = _mm_fmul_round_sch (h3, h3, 8);
+  h3 = _mm_mask_fmul_sch (h3, m8, h3, h3);
+  h3 = _mm_mask_fmul_round_sch (h3, m8, h3, h3, 8);
+  h3 = _mm_maskz_fmul_sch (m8, h3, h3);
+  h3 = _mm_maskz_fmul_round_sch (m8, h3, h3, 11);
+}
+
+/* Exercise vgetmantsh in unmasked, merge-masked and zero-masked
+   forms.  */
+void vgetmant_test (void)
+{
+  h3 = _mm_getmant_sh (h3, h3, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  h3 = _mm_mask_getmant_sh (h3, m8, h3, h3, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  h3 = _mm_maskz_getmant_sh (m8, h3, h3, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+}
+
+/* { dg-final { scan-assembler-times "vxorps" 10 } } */
+/* { dg-final { scan-assembler-times "vfmulcph" 6 } } */
+/* { dg-final { scan-assembler-times "vfmulcsh" 6 } } */
+/* { dg-final { scan-assembler-times "vgetmantsh" 3 } } */
+
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -mtune-ctrl=dest_false_dep_for_glc -O2" } */
+
+#include <immintrin.h>
+
+extern __m256h h1;
+extern __m128h h2;
+
+__mmask16 m16;
+__mmask8 m8;
+
+/* Exercise the 256-bit and 128-bit vfmulcph forms (unmasked,
+   merge-masked and zero-masked); dg-final counts the GLC vxorps
+   idioms.  */
+void complex_mul_test (void)
+{
+  h1 = _mm256_fmul_pch (h1, h1);
+  h1 = _mm256_mask_fmul_pch (h1, m16, h1, h1);
+  h1 = _mm256_maskz_fmul_pch (m16, h1, h1);
+  h2 = _mm_fmul_pch (h2, h2);
+  h2 = _mm_mask_fmul_pch (h2, m16, h2, h2);
+  h2 = _mm_maskz_fmul_pch (m16, h2, h2);
+}
+
+/* { dg-final { scan-assembler-times "vxorps" 4 } } */
+/* { dg-final { scan-assembler-times "vfmulcph" 6 } } */
+
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -mtune=generic -mavx512vl -mtune-ctrl=dest_false_dep_for_glc -O2" } */
+
+
+#include <immintrin.h>
+
+extern __m256i i1, i2, i3;
+extern __m256d d1, d11, *pd1;
+extern __m128d d2, *pd2;
+extern __m256 f1, *pf1;
+extern __m128 f2, *pf2;
+
+__mmask16 m16;
+__mmask8 m8;
+
+/* Exercise the 256-bit vperm{d,q,ps,pd} immediate and variable forms
+   with unmasked, merge-masked and zero-masked variants; dg-final
+   counts both the insns and the GLC vxorps idioms.  */
+void vperm_test (void)
+{
+  d1 = _mm256_permutex_pd (d1, 12);
+  d1 = _mm256_mask_permutex_pd (d1, m8, d1, 12);
+  d1 = _mm256_maskz_permutex_pd (m8, d1, 12);
+  d11 = _mm256_permutexvar_pd (i1, d11);
+  d11 = _mm256_mask_permutexvar_pd (d11, m8, i1, d11);
+  d11 = _mm256_maskz_permutexvar_pd (m8, i1, d11);
+
+  f1 = _mm256_permutexvar_ps (i1, f1);
+  f1 = _mm256_mask_permutexvar_ps (f1, m8, i1, f1);
+  f1 = _mm256_maskz_permutexvar_ps (m8, i1, f1);
+
+  i1 = _mm256_permutexvar_epi64 (i1, i1);
+  i1 = _mm256_mask_permutexvar_epi64 (i1, m8, i1, i1);
+  i1 = _mm256_maskz_permutexvar_epi64 (m8, i1, i1);
+  i1 = _mm256_permutex_epi64 (i1, 12);
+  i1 = _mm256_mask_permutex_epi64 (i1, m8, i1, 12);
+  i1 = _mm256_maskz_permutex_epi64 (m8, i1, 12);
+
+  i2 = _mm256_permutexvar_epi32 (i2, i2);
+  i2 = _mm256_mask_permutexvar_epi32 (i2, m8, i2, i2);
+  i3 = _mm256_maskz_permutexvar_epi32 (m8, i3, i3);
+}
+
+/* Exercise the 256-bit and 128-bit vgetmant{pd,ps} forms.  Sources are
+   memory loads (*pd1, *pd2, *pf1, *pf2) because the insn pattern only
+   emits the GLC vxorps for a memory source.  */
+void getmant_test (void)
+{
+  d1 = _mm256_getmant_pd (*pd1, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  d1 = _mm256_mask_getmant_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  d1 = _mm256_maskz_getmant_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  d2 = _mm_getmant_pd (*pd2, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  d2 = _mm_mask_getmant_pd (d2, m8, *pd2, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  d2 = _mm_maskz_getmant_pd (m8, *pd2, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  f1 = _mm256_getmant_ps (*pf1, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  f1 = _mm256_mask_getmant_ps (f1, m8, *pf1, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  f1 = _mm256_maskz_getmant_ps (m8, *pf1, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  f2 = _mm_getmant_ps (*pf2, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  f2 = _mm_mask_getmant_ps (f2, m8, *pf2, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+  f2 = _mm_maskz_getmant_ps (m8, *pf2, _MM_MANT_NORM_p75_1p5,
+ _MM_MANT_SIGN_src);
+}
+
+/* { dg-final { scan-assembler-times "vxorps" 19 } } */
+/* { dg-final { scan-assembler-times "vpermpd" 6 } } */
+/* { dg-final { scan-assembler-times "vpermps" 3 } } */
+/* { dg-final { scan-assembler-times "vpermq" 6 } } */
+/* { dg-final { scan-assembler-times "vpermd" 3 } } */
+/* { dg-final { scan-assembler-times "vgetmantpd" 6 } } */
+/* { dg-final { scan-assembler-times "vgetmantps" 6 } } */
+