From 638426fc36e08ccee78605a4d8136757ca0faf12 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 10 Jul 2018 00:37:25 +0000
Subject: [PATCH] [X86] Add __builtin_ia32_selectss_128 and
 __builtin_ia32_selectsd_128 that are suitable for use in scalar mask
 intrinsics.

These builtins convert the i8 mask argument to <8 x i1>, extract an i1,
and then emit a select instruction. This replaces the '(__U & 1)' and
ternary-operator pattern used in some of the intrinsics. The old sequence
was lowered to a scalar 'and' plus compare. The new sequence uses an i1
vector that will interoperate better with other mask intrinsics.

This removes the need to handle div_ss/sd specially in CGBuiltin.cpp. A
follow-up patch will add the GCCBuiltin name back in llvm and remove the
custom handling.

I made some adjustments to the legacy move_ss/sd intrinsics, which are
reused here, so that they do a simpler extract and insert instead of two
extracts and two inserts or a shuffle.

llvm-svn: 336622
---
 clang/include/clang/Basic/BuiltinsX86.def |   2 +
 clang/lib/CodeGen/CGBuiltin.cpp           |   7 ++
 clang/lib/Headers/avx512fintrin.h         |  82 ++++---
 clang/lib/Headers/emmintrin.h             |   3 +-
 clang/lib/Headers/xmmintrin.h             |   3 +-
 clang/test/CodeGen/avx512f-builtins.c     | 196 ++++++++++++++++------
 clang/test/CodeGen/sse-builtins.c         |   3 +-
 clang/test/CodeGen/sse2-builtins.c        |   4 +-
 8 files changed, 162 insertions(+), 138 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index 0b08e03..e98f7d61 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -1809,6 +1809,8 @@ TARGET_BUILTIN(__builtin_ia32_selectps_512, "V16fUsV16fV16f", "ncV:512:", "avx51
 TARGET_BUILTIN(__builtin_ia32_selectpd_128, "V2dUcV2dV2d", "ncV:128:", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_selectpd_256, "V4dUcV4dV4d", "ncV:256:", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_selectpd_512, "V8dUcV8dV8d", "ncV:512:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_selectss_128, "V4fUcV4fV4f", "ncV:128:", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_selectsd_128, "V2dUcV2dV2d", "ncV:128:", "avx512f")
 
 // MONITORX/MWAITX
 TARGET_BUILTIN(__builtin_ia32_monitorx, "vv*UiUi", "n", "mwaitx")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 6b1198d..ba0519b 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -9832,6 +9832,13 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_selectpd_256:
   case X86::BI__builtin_ia32_selectpd_512:
     return EmitX86Select(*this, Ops[0], Ops[1], Ops[2]);
+  case X86::BI__builtin_ia32_selectss_128:
+  case X86::BI__builtin_ia32_selectsd_128: {
+    Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
+    Value *B = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
+    A = EmitX86ScalarSelect(*this, Ops[0], A, B);
+    return Builder.CreateInsertElement(Ops[1], A, (uint64_t)0);
+  }
   case X86::BI__builtin_ia32_cmpb128_mask:
   case X86::BI__builtin_ia32_cmpb256_mask:
   case X86::BI__builtin_ia32_cmpb512_mask:
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index 9024168..f69620b 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -1899,15 +1899,13 @@ _mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
   __A = _mm_add_ss(__A, __B);
-  __A[0] = (__U & 1) ? __A[0] : __W[0];
-  return __A;
+  return __builtin_ia32_selectss_128(__U, __A, __W);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) {
   __A = _mm_add_ss(__A, __B);
-  __A[0] = (__U & 1) ? __A[0] : 0;
-  return __A;
+  return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
 }
 
 #define _mm_add_round_ss(A, B, R) \
@@ -1931,15 +1929,13 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
   __A = _mm_add_sd(__A, __B);
-  __A[0] = (__U & 1) ? __A[0] : __W[0];
-  return __A;
+  return __builtin_ia32_selectsd_128(__U, __A, __W);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) {
   __A = _mm_add_sd(__A, __B);
-  __A[0] = (__U & 1) ? __A[0] : 0;
-  return __A;
+  return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
 }
 #define _mm_add_round_sd(A, B, R) \
   (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
@@ -2018,15 +2014,13 @@ _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
   __A = _mm_sub_ss(__A, __B);
-  __A[0] = (__U & 1) ? __A[0] : __W[0];
-  return __A;
+  return __builtin_ia32_selectss_128(__U, __A, __W);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) {
   __A = _mm_sub_ss(__A, __B);
-  __A[0] = (__U & 1) ? __A[0] : 0;
-  return __A;
+  return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
 }
 #define _mm_sub_round_ss(A, B, R) \
   (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
@@ -2049,15 +2043,13 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
   __A = _mm_sub_sd(__A, __B);
-  __A[0] = (__U & 1) ? __A[0] : __W[0];
-  return __A;
+  return __builtin_ia32_selectsd_128(__U, __A, __W);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) {
   __A = _mm_sub_sd(__A, __B);
-  __A[0] = (__U & 1) ? __A[0] : 0;
-  return __A;
+  return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
 }
 
 #define _mm_sub_round_sd(A, B, R) \
@@ -2137,15 +2129,13 @@ _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
   __A = _mm_mul_ss(__A, __B);
-  __A[0] = (__U & 1) ? __A[0] : __W[0];
-  return __A;
+  return __builtin_ia32_selectss_128(__U, __A, __W);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) {
   __A = _mm_mul_ss(__A, __B);
-  __A[0] = (__U & 1) ? __A[0] : 0;
-  return __A;
+  return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
 }
 #define _mm_mul_round_ss(A, B, R) \
   (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
@@ -2168,15 +2158,13 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
   __A = _mm_mul_sd(__A, __B);
-  __A[0] = (__U & 1) ? __A[0] : __W[0];
-  return __A;
+  return __builtin_ia32_selectsd_128(__U, __A, __W);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) {
   __A = _mm_mul_sd(__A, __B);
-  __A[0] = (__U & 1) ? __A[0] : 0;
-  return __A;
+  return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
 }
 
 #define _mm_mul_round_sd(A, B, R) \
@@ -2255,20 +2243,14 @@ _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_divss_round_mask ((__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_div_ss(__A, __B);
+  return __builtin_ia32_selectss_128(__U, __A, __W);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) {
-  return (__m128) __builtin_ia32_divss_round_mask ((__v4sf) __A,
-                (__v4sf) __B,
-                (__v4sf) _mm_setzero_ps (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_div_ss(__A, __B);
+  return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
 }
 
 #define _mm_div_round_ss(A, B, R) \
@@ -2291,20 +2273,14 @@ _mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) {
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_divsd_round_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df) __W,
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_div_sd(__A, __B);
+  return __builtin_ia32_selectsd_128(__U, __A, __W);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) {
-  return (__m128d) __builtin_ia32_divsd_round_mask ((__v2df) __A,
-                (__v2df) __B,
-                (__v2df) _mm_setzero_pd (),
-                (__mmask8) __U,
-                _MM_FROUND_CUR_DIRECTION);
+  __A = _mm_div_sd(__A, __B);
+  return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
 }
 
 #define _mm_div_round_sd(A, B, R) \
@@ -8639,33 +8615,27 @@ _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
-  __m128 res = __A;
-  res[0] = (__U & 1) ? __B[0] : __W[0];
-  return res;
+  return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
 {
-  __m128 res = __A;
-  res[0] = (__U & 1) ? __B[0] : 0;
-  return res;
+  return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B),
+                                     _mm_setzero_ps());
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
-  __m128d res = __A;
-  res[0] = (__U & 1) ? __B[0] : __W[0];
-  return res;
+  return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
 {
-  __m128d res = __A;
-  res[0] = (__U & 1) ? __B[0] : 0;
-  return res;
+  return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B),
+                                     _mm_setzero_pd());
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS128
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index be88c65..f0ea7cd 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -1900,7 +1900,8 @@ _mm_setzero_pd(void)
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_move_sd(__m128d __a, __m128d __b)
 {
-  return __extension__ (__m128d){ __b[0], __a[1] };
+  __a[0] = __b[0];
+  return __a;
 }
 
 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 0bc7650..17af172 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -2671,7 +2671,8 @@ _mm_unpacklo_ps(__m128 __a, __m128 __b)
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_move_ss(__m128 __a, __m128 __b)
 {
-  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 4, 1, 2, 3);
+  __a[0] = __b[0];
+  return __a;
 }
 
 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
diff --git a/clang/test/CodeGen/avx512f-builtins.c b/clang/test/CodeGen/avx512f-builtins.c
index be5fcb2..2beae24 100644
--- a/clang/test/CodeGen/avx512f-builtins.c
+++ b/clang/test/CodeGen/avx512f-builtins.c
@@ -3150,11 +3150,12 @@ __m128 test_mm_mask_add_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
   // CHECK: extractelement <4 x float> %{{.*}}, i32 0
   // CHECK: fadd float %{{.*}}, %{{.*}}
   // CHECK: insertelement <4 x float> %{{.*}}, i32 0
-  // CHECK: and i32 {{.*}}, 1
-  // CHECK: icmp ne i32 %{{.*}}, 0
-  // CHECK: br {{.*}}, {{.*}}, {{.*}}
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+  // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
+  // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
   return _mm_mask_add_ss(__W,__U,__A,__B);
 }
@@ -3164,10 +3165,12 @@ __m128 test_mm_maskz_add_ss(__mmask8 __U, __m128 __A, __m128 __B) {
   // CHECK: extractelement <4 x float> %{{.*}}, i32 0
   // CHECK: fadd float %{{.*}}, %{{.*}}
   // CHECK: insertelement <4 x float> %{{.*}}, i32 0
-  // CHECK: and i32 {{.*}}, 1
-  // CHECK: icmp ne i32 %{{.*}}, 0
-  // CHECK: br {{.*}}, {{.*}}, {{.*}}
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+  // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
+  // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
   return _mm_maskz_add_ss(__U,__A,__B);
 }
 __m128d test_mm_add_round_sd(__m128d __A, __m128d __B) {
@@ -3192,11 +3195,12 @@ __m128d test_mm_mask_add_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   // CHECK: fadd double %{{.*}}, %{{.*}}
   // CHECK: insertelement <2 x double> {{.*}}, i32 0
-  // CHECK: and i32 {{.*}}, 1
-  // CHECK: icmp ne i32 %{{.*}}, 0
-  // CHECK: br {{.*}}, {{.*}}, {{.*}}
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+  // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
+  // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
   return _mm_mask_add_sd(__W,__U,__A,__B);
 }
@@ -3206,10 +3210,12 @@ __m128d test_mm_maskz_add_sd(__mmask8 __U, __m128d __A, __m128d __B) {
   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   // CHECK: fadd double %{{.*}}, %{{.*}}
   // CHECK: insertelement <2 x double> {{.*}}, i32 0
-  // CHECK: and i32 {{.*}}, 1
-  // CHECK: icmp ne i32 %{{.*}}, 0
-  // CHECK: br {{.*}}, {{.*}}, {{.*}}
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+  // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
+  // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
   return _mm_maskz_add_sd(__U,__A,__B);
 }
 __m512d test_mm512_sub_round_pd(__m512d __A, __m512d __B) {
@@ -3292,11 +3298,12 @@ __m128 test_mm_mask_sub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
   // CHECK: extractelement <4 x float> %{{.*}}, i32 0
   // CHECK: fsub float %{{.*}}, %{{.*}}
   // CHECK: insertelement <4 x float> {{.*}}, i32 0
-  // CHECK: and i32 {{.*}}, 1
-  // CHECK: icmp ne i32 %{{.*}}, 0
-  // CHECK: br {{.*}}, {{.*}}, {{.*}}
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+  // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
+  // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
   return _mm_mask_sub_ss(__W,__U,__A,__B);
 }
@@ -3306,10 +3313,12 @@ __m128 test_mm_maskz_sub_ss(__mmask8 __U, __m128 __A, __m128 __B) {
   // CHECK: extractelement <4 x float> %{{.*}}, i32 0
   // CHECK: fsub float %{{.*}}, %{{.*}}
   // CHECK: insertelement <4 x float> {{.*}}, i32 0
-  // CHECK: and i32 {{.*}}, 1
-  // CHECK: icmp ne i32 %{{.*}}, 0
-  // CHECK: br {{.*}}, {{.*}}, {{.*}}
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+  // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
+  // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
   return _mm_maskz_sub_ss(__U,__A,__B);
 }
 __m128d test_mm_sub_round_sd(__m128d __A, __m128d __B) {
@@ -3334,11 +3343,12 @@ __m128d test_mm_mask_sub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   // CHECK: fsub double %{{.*}}, %{{.*}}
   // CHECK: insertelement <2 x double> {{.*}}, i32 0
-  // CHECK: and i32 {{.*}}, 1
-  // CHECK: icmp ne i32 %{{.*}}, 0
-  // CHECK: br {{.*}}, {{.*}}, {{.*}}
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+  // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
+  // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
   return _mm_mask_sub_sd(__W,__U,__A,__B);
 }
@@ -3348,10 +3358,12 @@ __m128d test_mm_maskz_sub_sd(__mmask8 __U, __m128d __A, __m128d __B) {
   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   // CHECK: fsub double %{{.*}}, %{{.*}}
   // CHECK: insertelement <2 x double> {{.*}}, i32 0
-  // CHECK: and i32 {{.*}}, 1
-  // CHECK: icmp ne i32 %{{.*}}, 0
-  // CHECK: br {{.*}}, {{.*}}, {{.*}}
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+  // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
+  // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
   return _mm_maskz_sub_sd(__U,__A,__B);
 }
 __m512d test_mm512_mul_round_pd(__m512d __A, __m512d __B) {
@@ -3434,11 +3446,12 @@ __m128 test_mm_mask_mul_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
   // CHECK: extractelement <4 x float> %{{.*}}, i32 0
   // CHECK: fmul float %{{.*}}, %{{.*}}
   // CHECK: insertelement <4 x float> {{.*}}, i32 0
-  // CHECK: and i32 {{.*}}, 1
-  // CHECK: icmp ne i32 %{{.*}}, 0
-  // CHECK: br {{.*}}, {{.*}}, {{.*}}
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+  // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
+  // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
   return _mm_mask_mul_ss(__W,__U,__A,__B);
 }
@@ -3448,10 +3461,12 @@ __m128 test_mm_maskz_mul_ss(__mmask8 __U, __m128 __A, __m128 __B) {
   // CHECK: extractelement <4 x float> %{{.*}}, i32 0
   // CHECK: fmul float %{{.*}}, %{{.*}}
   // CHECK: insertelement <4 x float> {{.*}}, i32 0
-  // CHECK: and i32 {{.*}}, 1
-  // CHECK: icmp ne i32 %{{.*}}, 0
-  // CHECK: br {{.*}}, {{.*}}, {{.*}}
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+  // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
+  // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
   return _mm_maskz_mul_ss(__U,__A,__B);
 }
 __m128d test_mm_mul_round_sd(__m128d __A, __m128d __B) {
@@ -3476,11 +3491,12 @@ __m128d test_mm_mask_mul_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   // CHECK: fmul double %{{.*}}, %{{.*}}
   // CHECK: insertelement <2 x double> {{.*}}, i32 0
-  // CHECK: and i32 {{.*}}, 1
-  // CHECK: icmp ne i32 %{{.*}}, 0
-  // CHECK: br {{.*}}, {{.*}}, {{.*}}
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+  // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
+  // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
   return _mm_mask_mul_sd(__W,__U,__A,__B);
 }
@@ -3490,10 +3506,12 @@ __m128d test_mm_maskz_mul_sd(__mmask8 __U, __m128d __A, __m128d __B) {
   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   // CHECK: fmul double %{{.*}}, %{{.*}}
   // CHECK: insertelement <2 x double> {{.*}}, i32 0
-  // CHECK: and i32 {{.*}}, 1
-  // CHECK: icmp ne i32 %{{.*}}, 0
-  // CHECK: br {{.*}}, {{.*}}, {{.*}}
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+  // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
+  // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
   return _mm_maskz_mul_sd(__U,__A,__B);
 }
 __m512d test_mm512_div_round_pd(__m512d __A, __m512d __B) {
@@ -3581,10 +3599,12 @@ __m128 test_mm_maskz_div_round_ss(__mmask8 __U, __m128 __A, __m128 __B) {
 }
 __m128 test_mm_mask_div_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
   // CHECK-LABEL: @test_mm_mask_div_ss
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: fdiv float %{{.*}}, %{{.*}}
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   // CHECK: extractelement <4 x float> %{{.*}}, i64 0
   // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: fdiv float %{{.*}}, %{{.*}}
   // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
   // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
   // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
@@ -3593,10 +3613,12 @@
   return _mm_mask_div_ss(__W,__U,__A,__B);
 }
 __m128 test_mm_maskz_div_ss(__mmask8 __U, __m128 __A, __m128 __B) {
   // CHECK-LABEL: @test_mm_maskz_div_ss
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: fdiv float %{{.*}}, %{{.*}}
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   // CHECK: extractelement <4 x float> %{{.*}}, i64 0
   // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: fdiv float %{{.*}}, %{{.*}}
   // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
   // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
   // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
   return _mm_maskz_div_ss(__U,__A,__B);
 }
@@ -3620,10 +3642,12 @@ __m128d test_mm_maskz_div_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
 }
 __m128d test_mm_mask_div_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
   // CHECK-LABEL: @test_mm_mask_div_sd
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: fdiv double %{{.*}}, %{{.*}}
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   // CHECK: extractelement <2 x double> %{{.*}}, i64 0
   // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: fdiv double %{{.*}}, %{{.*}}
   // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
   // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
   // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
@@ -3632,10 +3656,12 @@
   return _mm_mask_div_sd(__W,__U,__A,__B);
 }
 __m128d test_mm_maskz_div_sd(__mmask8 __U, __m128d __A, __m128d __B) {
   // CHECK-LABEL: @test_mm_maskz_div_sd
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: fdiv double %{{.*}}, %{{.*}}
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   // CHECK: extractelement <2 x double> %{{.*}}, i64 0
   // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: fdiv double %{{.*}}, %{{.*}}
   // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
   // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
   // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
   return _mm_maskz_div_sd(__U,__A,__B);
 }
@@ -10531,38 +10557,56 @@ int test_mm512_mask2int(__mmask16 __a)
 __m128 test_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
   // CHECK-LABEL: @test_mm_mask_move_ss
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: phi float [ %{{.*}}, %{{.*}} ], [ %{{.*}}, %{{.*}} ]
-  // CHECK: insertelement <4 x float> %{{.*}}, float %cond.i, i32 0
+  // CHECK: [[EXT:%.*]] = extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float [[EXT]], i32 0
+  // CHECK: [[A:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
+  // CHECK-NEXT: [[B:%.*]] = extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+  // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, float [[A]], float [[B]]
+  // CHECK-NEXT: insertelement <4 x float> [[VEC]], float [[SEL]], i64 0
   return _mm_mask_move_ss ( __W, __U, __A, __B);
 }
 
 __m128 test_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
 {
   // CHECK-LABEL: @test_mm_maskz_move_ss
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: phi float [ %{{.*}}, %{{.*}} ], [ 0.000000e+00, %{{.*}} ]
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  // CHECK: [[EXT:%.*]] = extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float [[EXT]], i32 0
+  // CHECK: [[A:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
+  // CHECK-NEXT: [[B:%.*]] = extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+  // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, float [[A]], float [[B]]
+  // CHECK-NEXT: insertelement <4 x float> [[VEC]], float [[SEL]], i64 0
   return _mm_maskz_move_ss (__U, __A, __B);
 }
 
 __m128d test_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
   // CHECK-LABEL: @test_mm_mask_move_sd
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: phi double [ %{{.*}}, %{{.*}} ], [ %{{.*}}, %{{.*}} ]
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
+  // CHECK: [[EXT:%.*]] = extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double [[EXT]], i32 0
+  // CHECK: [[A:%.*]] = extractelement <2 x double> [[VEC:%.*]], i64 0
+  // CHECK-NEXT: [[B:%.*]] = extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+  // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, double [[A]], double [[B]]
+  // CHECK-NEXT: insertelement <2 x double> [[VEC]], double [[SEL]], i64 0
   return _mm_mask_move_sd ( __W, __U, __A, __B);
 }
 
 __m128d test_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
 {
   // CHECK-LABEL: @test_mm_maskz_move_sd
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: phi double [ %{{.*}}, %{{.*}} ], [ 0.000000e+00, %{{.*}} ]
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
+  // CHECK: [[EXT:%.*]] = extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double [[EXT]], i32 0
+  // CHECK: [[A:%.*]] = extractelement <2 x double> [[VEC:%.*]], i64 0
+  // CHECK-NEXT: [[B:%.*]] = extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
+  // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, double [[A]], double [[B]]
+  // CHECK-NEXT: insertelement <2 x double> [[VEC]], double [[SEL]], i64 0
   return _mm_maskz_move_sd (__U, __A, __B);
 }
diff --git a/clang/test/CodeGen/sse-builtins.c b/clang/test/CodeGen/sse-builtins.c
index b7c7a7f..e980148 100644
--- a/clang/test/CodeGen/sse-builtins.c
+++ b/clang/test/CodeGen/sse-builtins.c
@@ -450,7 +450,8 @@ __m128 test_mm_min_ss(__m128 A, __m128 B) {
 
 __m128 test_mm_move_ss(__m128 A, __m128 B) {
   // CHECK-LABEL: test_mm_move_ss
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32>
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_move_ss(A, B);
 }
 
diff --git a/clang/test/CodeGen/sse2-builtins.c b/clang/test/CodeGen/sse2-builtins.c
index dbf79c4..fe7f7cc 100644
--- a/clang/test/CodeGen/sse2-builtins.c
+++ b/clang/test/CodeGen/sse2-builtins.c
@@ -794,9 +794,7 @@ __m128i test_mm_move_epi64(__m128i A) {
 
 __m128d test_mm_move_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_move_sd
   // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_move_sd(A, B);
 }
-- 
2.7.4
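
P.S. For readers following along, a minimal standalone sketch of the masked
scalar semantics these builtins implement (this program is illustrative and
not part of the commit; the file name and build flags are assumptions, and
the input values are made up). Bit 0 of the i8 mask picks between element 0
of the operation result and element 0 of the passthrough (or zero) vector;
elements 1..N-1 always come from the first source.

  /* demo.c -- illustrative only. Assumed build: cc -mavx512f demo.c */
  #include <immintrin.h>
  #include <stdio.h>

  int main(void) {
    __m128 a = _mm_set_ss(1.0f);   /* {1, 0, 0, 0} */
    __m128 b = _mm_set_ss(2.0f);   /* {2, 0, 0, 0} */
    __m128 w = _mm_set_ss(9.0f);   /* passthrough vector */

    /* Bit 0 set: element 0 is the add result, 1 + 2 = 3. */
    __m128 r1 = _mm_mask_add_ss(w, (__mmask8)1, a, b);
    /* Bit 0 clear: element 0 is taken from the passthrough, 9. */
    __m128 r0 = _mm_mask_add_ss(w, (__mmask8)0, a, b);
    /* The maskz form zeroes element 0 when the bit is clear. */
    __m128 rz = _mm_maskz_add_ss((__mmask8)0, a, b);

    printf("%.1f %.1f %.1f\n", _mm_cvtss_f32(r1), _mm_cvtss_f32(r0),
           _mm_cvtss_f32(rz));     /* prints: 3.0 9.0 0.0 */
    return 0;
  }

In IR terms (per the CHECK lines above), each call now lowers to an
extractelement of lane 0 from the result and from the passthrough, a bitcast
of the i8 mask to <8 x i1>, an extractelement of bit 0, a select on that i1,
and an insertelement back into lane 0 -- with no branch.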