From 74ac0eda685e2a2e286b02cb679b68fd57c636b2 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 10 May 2018 05:43:43 +0000
Subject: [PATCH] [X86] Change the implementation of scalar masked load/store
 intrinsics to not use a 512-bit intermediate vector.

This is unnecessary for AVX512VL supporting CPUs like SKX. We can just emit a
128-bit masked load/store here no matter what. The backend will widen it to
512-bits on KNL CPUs.

Fixes the frontend portion of PR37386. Need to fix the backend to optimize the
new sequences well.

llvm-svn: 331958
---
 clang/include/clang/Basic/BuiltinsX86.def |  8 +++----
 clang/lib/CodeGen/CGBuiltin.cpp           |  4 ++--
 clang/lib/Headers/avx512fintrin.h         | 36 +++++++++----------------------
 clang/test/CodeGen/avx512f-builtins.c     | 12 +++++------
 4 files changed, 22 insertions(+), 38 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index 891b943..8d0e06f 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -1523,10 +1523,10 @@ TARGET_BUILTIN(__builtin_ia32_fixupimmps128_maskz, "V4fV4fV4fV4iIiUc", "nc", "av
 TARGET_BUILTIN(__builtin_ia32_fixupimmps256_mask, "V8fV8fV8fV8iIiUc", "nc", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_fixupimmps256_maskz, "V8fV8fV8fV8iIiUc", "nc", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_loadapd128_mask, "V2dV2d*V2dUc", "n", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadsd128_mask, "V8dV8d*V8dUc", "n", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_loadsd128_mask, "V2dV2d*V2dUc", "n", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_loadapd256_mask, "V4dV4d*V4dUc", "n", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_loadaps128_mask, "V4fV4f*V4fUc", "n", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_loadss128_mask, "V16fV16f*V16fUs", "n", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_loadss128_mask, "V4fV4f*V4fUc", "n", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_loadaps256_mask, "V8fV8f*V8fUc", "n", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_loaddqudi128_mask, "V2LLiV2LLi*V2LLiUc", "n", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_loaddqudi256_mask, "V4LLiV4LLi*V4LLiUc", "n", "avx512vl")
@@ -1543,10 +1543,10 @@ TARGET_BUILTIN(__builtin_ia32_storedquhi256_mask, "vV16s*V16sUs", "n", "avx512vl
 TARGET_BUILTIN(__builtin_ia32_storedquqi128_mask, "vV16c*V16cUs", "n", "avx512vl,avx512bw")
 TARGET_BUILTIN(__builtin_ia32_storedquqi256_mask, "vV32c*V32cUi", "n", "avx512vl,avx512bw")
 TARGET_BUILTIN(__builtin_ia32_storeapd128_mask, "vV2d*V2dUc", "n", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storesd128_mask, "vV8d*V8dUc", "n", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_storesd128_mask, "vV2d*V2dUc", "n", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_storeapd256_mask, "vV4d*V4dUc", "n", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_storeaps128_mask, "vV4f*V4fUc", "n", "avx512vl")
-TARGET_BUILTIN(__builtin_ia32_storess128_mask, "vV16f*V16fUs", "n", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_storess128_mask, "vV4f*V4fUc", "n", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_storeaps256_mask, "vV8f*V8fUc", "n", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_storedqudi128_mask, "vV2LLi*V2LLiUc", "n", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_storedqudi256_mask, "vV4LLi*V4LLiUc", "n", "avx512vl")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index dfb9370..4d3bbd6 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -8735,7 +8735,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,

   case X86::BI__builtin_ia32_storess128_mask:
   case X86::BI__builtin_ia32_storesd128_mask: {
-    return EmitX86MaskedStore(*this, Ops, 16);
+    return EmitX86MaskedStore(*this, Ops, 1);
   }
   case X86::BI__builtin_ia32_vpopcntb_128:
   case X86::BI__builtin_ia32_vpopcntd_128:
@@ -8819,7 +8819,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,

   case X86::BI__builtin_ia32_loadss128_mask:
   case X86::BI__builtin_ia32_loadsd128_mask:
-    return EmitX86MaskedLoad(*this, Ops, 16);
+    return EmitX86MaskedLoad(*this, Ops, 1);

   case X86::BI__builtin_ia32_loadaps128_mask:
   case X86::BI__builtin_ia32_loadaps256_mask:
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index 78fae67..1d5cc35 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -9091,17 +9091,13 @@ _mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
 {
-  __builtin_ia32_storess128_mask ((__v16sf *)__W,
-                                  (__v16sf) _mm512_castps128_ps512(__A),
-                                  (__mmask16) __U & (__mmask16)1);
+  __builtin_ia32_storess128_mask ((__v4sf *)__W, __A, __U & 1);
 }

 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
 {
-  __builtin_ia32_storesd128_mask ((__v8df *)__W,
-                                  (__v8df) _mm512_castpd128_pd512(__A),
-                                  (__mmask8) __U & 1);
+  __builtin_ia32_storesd128_mask ((__v2df *)__W, __A, __U & 1);
 }

 static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -9111,21 +9107,15 @@ _mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A)
                                  (__v4sf) {0.0, 0.0, 0.0, 0.0},
                                  0, 4, 4, 4);

-  return (__m128) __builtin_shufflevector(
-                  __builtin_ia32_loadss128_mask ((__v16sf *) __A,
-                                                 (__v16sf) _mm512_castps128_ps512(src),
-                                                 (__mmask16) __U & 1),
-                  _mm512_undefined_ps(), 0, 1, 2, 3);
+  return (__m128) __builtin_ia32_loadss128_mask ((__v4sf *) __A, src, __U & 1);
 }

 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_load_ss (__mmask8 __U, const float* __A)
 {
-  return (__m128) __builtin_shufflevector(
-                  __builtin_ia32_loadss128_mask ((__v16sf *) __A,
-                                                 (__v16sf) _mm512_setzero_ps(),
-                                                 (__mmask16) __U & 1),
-                  _mm512_undefined_ps(), 0, 1, 2, 3);
+  return (__m128)__builtin_ia32_loadss128_mask ((__v4sf *) __A,
+                                                (__v4sf) _mm_setzero_ps(),
+                                                __U & 1);
 }

 static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -9134,21 +9124,15 @@ _mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A)
   __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W,
                                                  (__v2df) {0.0, 0.0}, 0, 2);

-  return (__m128d) __builtin_shufflevector(
-                   __builtin_ia32_loadsd128_mask ((__v8df *) __A,
-                                                  (__v8df) _mm512_castpd128_pd512(src),
-                                                  (__mmask8) __U & 1),
-                   _mm512_undefined_pd(), 0, 1);
+  return (__m128d) __builtin_ia32_loadsd128_mask ((__v2df *) __A, src, __U & 1);
 }

 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_load_sd (__mmask8 __U, const double* __A)
 {
-  return (__m128d) __builtin_shufflevector(
-                   __builtin_ia32_loadsd128_mask ((__v8df *) __A,
-                                                  (__v8df) _mm512_setzero_pd(),
-                                                  (__mmask8) __U & 1),
-                   _mm512_undefined_pd(), 0, 1);
+  return (__m128d) __builtin_ia32_loadsd128_mask ((__v2df *) __A,
+                                                  (__v2df) _mm_setzero_pd(),
+                                                  __U & 1);
 }

 #define _mm512_shuffle_epi32(A, I) __extension__ ({ \
diff --git a/clang/test/CodeGen/avx512f-builtins.c b/clang/test/CodeGen/avx512f-builtins.c
index 5a3cc4e..4699335d 100644
--- a/clang/test/CodeGen/avx512f-builtins.c
+++ b/clang/test/CodeGen/avx512f-builtins.c
@@ -8456,42 +8456,42 @@ __m128d test_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
 void test_mm_mask_store_ss(float * __P, __mmask8 __U,
                            __m128 __A) {
   // CHECK-LABEL: @test_mm_mask_store_ss
-  // CHECK: call void @llvm.masked.store.v16f32.p0v16f32(
+  // CHECK: call void @llvm.masked.store.v4f32.p0v4f32(
   _mm_mask_store_ss(__P, __U, __A);
 }

 void test_mm_mask_store_sd(double * __P, __mmask8 __U,
                            __m128d __A) {
   // CHECK-LABEL: @test_mm_mask_store_sd
-  // CHECK: call void @llvm.masked.store.v8f64.p0v8f64(
+  // CHECK: call void @llvm.masked.store.v2f64.p0v2f64(
   _mm_mask_store_sd(__P, __U, __A);
 }

 __m128 test_mm_mask_load_ss(__m128 __A, __mmask8 __U, const float* __W)
 {
   // CHECK-LABEL: @test_mm_mask_load_ss
-  // CHECK: call <16 x float> @llvm.masked.load.v16f32.p0v16f32(
+  // CHECK: call <4 x float> @llvm.masked.load.v4f32.p0v4f32(
   return _mm_mask_load_ss(__A, __U, __W);
 }

 __m128 test_mm_maskz_load_ss (__mmask8 __U, const float * __W)
 {
   // CHECK-LABEL: @test_mm_maskz_load_ss
-  // CHECK: call <16 x float> @llvm.masked.load.v16f32.p0v16f32(
+  // CHECK: call <4 x float> @llvm.masked.load.v4f32.p0v4f32(
   return _mm_maskz_load_ss (__U, __W);
 }

 __m128d test_mm_mask_load_sd (__m128d __A, __mmask8 __U, const double * __W)
 {
   // CHECK-LABEL: @test_mm_mask_load_sd
-  // CHECK: call <8 x double> @llvm.masked.load.v8f64.p0v8f64(
+  // CHECK: call <2 x double> @llvm.masked.load.v2f64.p0v2f64(
   return _mm_mask_load_sd (__A, __U, __W);
 }

 __m128d test_mm_maskz_load_sd (__mmask8 __U, const double * __W)
 {
   // CHECK-LABEL: @test_mm_maskz_load_sd
-  // CHECK: call <8 x double> @llvm.masked.load.v8f64.p0v8f64(
+  // CHECK: call <2 x double> @llvm.masked.load.v2f64.p0v2f64(
   return _mm_maskz_load_sd (__U, __W);
 }

-- 
2.7.4
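
Usage sketch (not part of the patch; file name, variable names, and values below are illustrative only): the snippet exercises the scalar masked load/store intrinsics whose lowering changes above. With this patch each call compiles to a 128-bit @llvm.masked.load / @llvm.masked.store with only lane 0 active (matching the updated CHECK lines), instead of the old 512-bit widened sequence. Assumes a compiler with AVX512F support, e.g. clang -O2 -mavx512f.

/* scalar_masked_example.c - illustrative only; build with: clang -O2 -mavx512f */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  float src[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float dst[4] = {0.0f, 0.0f, 0.0f, 0.0f};
  __m128 passthru = _mm_set1_ps(-1.0f);

  /* Mask bit 0 set: src[0] is loaded into lane 0, upper lanes are zeroed. */
  __m128 a = _mm_maskz_load_ss(0x1, src);

  /* Mask bit 0 clear: no load is performed; the result keeps lane 0 of the
     pass-through vector, with the upper lanes zeroed. */
  __m128 b = _mm_mask_load_ss(passthru, 0x0, src);

  /* Mask bit 0 set: only the low float of 'a' is written to dst[0]. */
  _mm_mask_store_ss(dst, 0x1, a);

  printf("a0=%f b0=%f dst0=%f\n", _mm_cvtss_f32(a), _mm_cvtss_f32(b), dst[0]);
  return 0;
}

Only mask bit 0 is consulted by these intrinsics (the headers mask with __U & 1), so the example uses 0x1 and 0x0 for the two interesting cases.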