TARGET_BUILTIN(__builtin_ia32_vfmaddcph256_maskz, "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
TARGET_BUILTIN(__builtin_ia32_vfmaddcph512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16")
TARGET_BUILTIN(__builtin_ia32_vfmaddcph512_maskz, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddcph512_mask3, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16")
TARGET_BUILTIN(__builtin_ia32_vfcmaddcph128_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
TARGET_BUILTIN(__builtin_ia32_vfcmaddcph128_maskz, "V4fV4fV4fV4fUc", "ncV:128:", "avx512fp16,avx512vl")
TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_mask, "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
TARGET_BUILTIN(__builtin_ia32_vfcmaddcph256_maskz, "V8fV8fV8fV8fUc", "ncV:256:", "avx512fp16,avx512vl")
TARGET_BUILTIN(__builtin_ia32_vfcmaddcph512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16")
TARGET_BUILTIN(__builtin_ia32_vfcmaddcph512_maskz, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfcmaddcph512_mask3, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16")
TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_maskz, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_maskz, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfmaddcsh_round_mask3, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
+TARGET_BUILTIN(__builtin_ia32_vfcmaddcsh_round_mask3, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
TARGET_BUILTIN(__builtin_ia32_vfmulcsh_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
TARGET_BUILTIN(__builtin_ia32_vfcmulcsh_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512fp16")
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_selectps_128(
- __U,
- __builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C,
- (__mmask8)__U, _MM_FROUND_CUR_DIRECTION),
- (__v4sf)__A);
+ return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
+ (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
- return (__m128h)_mm_move_ss((__m128)__C,
- (__m128)__builtin_ia32_vfcmaddcsh_mask(
- (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U,
- _MM_FROUND_CUR_DIRECTION));
+ return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
}
#define _mm_fcmadd_round_sch(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
      (__mmask8)-1, (int)(R)))
#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
- ((__m128h)__builtin_ia32_selectps_128( \
- (__mmask8)(U & 1), \
- __builtin_ia32_vfcmaddcsh_mask( \
- (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
- (__mmask8)(U), (int)(R)), \
- (__v4sf)(__m128h)(A)))
+ ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
+ (__mmask8)(U), (int)(R)))
#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
      (__mmask8)(U), (int)(R)))
#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
- ((__m128h)_mm_move_ss((__m128)(C), \
- (__m128)__builtin_ia32_vfcmaddcsh_mask( \
- (__v4sf)(A), (__v4sf)(B), (__v4sf)(C), (U), (R))))
+ ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
+ (__mmask8)(U), (int)(R)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
__m128h __B,
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_selectps_128(
- __U,
- __builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C,
- (__mmask8)__U, _MM_FROUND_CUR_DIRECTION),
- (__v4sf)__A);
+ return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
+ (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_MM_FROUND_CUR_DIRECTION);
}
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
#define _mm_fmadd_round_sch(A, B, C, R) \
((__m128h)__builtin_ia32_vfmaddcsh_mask( \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
(__mmask8)-1, (int)(R)))
#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
- ((__m128h)__builtin_ia32_selectps_128( \
- (__mmask8)(U & 1), \
- __builtin_ia32_vfmaddcsh_mask( \
- (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
- (__mmask8)(U), (int)(R)), \
- (__v4sf)(__m128h)(A)))
+ ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
+ (__mmask8)(U), (int)(R)))
#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
(__mmask8)(U), (int)(R)))
+#define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
+ ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
+ (__mmask8)(U), (int)(R)))
+
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
__m128h __B) {
return (__m128h)__builtin_ia32_vfcmulcsh_mask(
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
__m512h __B,
__m512h __C) {
- return (__m512h)__builtin_ia32_vfcmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
- (__v16sf)__C, (__mmask16)-1,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
+ (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
+ _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
- return (__m512h)__builtin_ia32_selectps_512(
- __U,
- __builtin_ia32_vfcmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
- (__v16sf)__C, (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION),
- (__v16sf)__A);
+ return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
+ (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
- return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
+ return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
(__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
_MM_FROUND_CUR_DIRECTION);
}
}
#define _mm512_fcmadd_round_pch(A, B, C, R) \
- ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
+ ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
(__mmask16)-1, (int)(R)))
#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
- ((__m512h)__builtin_ia32_selectps_512( \
- (__mmask16)(U), \
- __builtin_ia32_vfcmaddcph512_mask( \
- (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
- (__mmask16)(U), (int)(R)), \
- (__v16sf)(__m512h)(A)))
+ ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
+ (__mmask16)(U), (int)(R)))
#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
- ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
+ ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
(__mmask16)(U), (int)(R)))
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
__m512h __B,
__m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
- (__v16sf)__C, (__mmask16)-1,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
+ (__v16sf)__C, (__mmask16)-1,
+ _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
- return (__m512h)__builtin_ia32_selectps_512(
- __U,
- __builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__C,
- (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION),
- (__v16sf)__A);
+ return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
+ (__v16sf)__C, (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
- return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
- (__v16sf)__C, (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
+ (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
}
#define _mm512_fmadd_round_pch(A, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
+ ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
(__mmask16)-1, (int)(R)))
#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
- ((__m512h)__builtin_ia32_selectps_512( \
- (__mmask16)(U), \
- __builtin_ia32_vfmaddcph512_mask( \
- (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
- (__mmask16)(U), (int)(R)), \
- (__v16sf)(__m512h)(A)))
+ ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
+ (__mmask16)(U), (int)(R)))
#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
- ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
+ ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
(__mmask16)(U), (int)(R)))
// CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
// CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
// CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
- // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
// CHECK: %{{.*}} = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 %{{.*}}, i32 4)
- // CHECK: %{{.*}} = extractelement <4 x float> %{{.*}}, i32 0
- // CHECK: %{{.*}} = insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+ // CHECK: %{{.*}} = shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
// CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
return _mm_mask3_fcmadd_sch(__A, __B, __C, __U);
}
// CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
// CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
// CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
- // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
// CHECK: %{{.*}} = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 %{{.*}}, i32 11)
- // CHECK: %{{.*}} = extractelement <4 x float> %{{.*}}, i32 0
- // CHECK: %{{.*}} = insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+ // CHECK: %{{.*}} = shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
// CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
return _mm_mask3_fcmadd_round_sch(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}
return _mm_maskz_fmadd_sch(__U, __A, __B, __C);
}
+__m128h test_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm_mask3_fmadd_sch
+ // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 %{{.*}}, i32 4)
+ // CHECK: %{{.*}} = shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
+ return _mm_mask3_fmadd_sch(__A, __B, __C, __U);
+}
+
__m128h test_mm_fmadd_round_sch(__m128h __A, __m128h __B, __m128h __C) {
// CHECK-LABEL: @test_mm_fmadd_round_sch
// CHECK: @llvm.x86.avx512fp16.mask.vfmadd.csh
return _mm_maskz_fmadd_round_sch(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}
+__m128h test_mm_mask3_fmadd_round_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ // CHECK-LABEL: @test_mm_mask3_fmadd_round_sch
+ // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
+ // CHECK: %{{.*}} = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 %{{.*}}, i32 11)
+ // CHECK: %{{.*}} = shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
+ return _mm_mask3_fmadd_round_sch(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
__m128h test_mm_fcmul_sch(__m128h __A, __m128h __B) {
// CHECK-LABEL: @test_mm_fcmul_sch
// CHECK: @llvm.x86.avx512fp16.mask.vfcmul.csh