From 42a4d0822e529b39cdc808e78ee0931bc2eb120e Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 31 Aug 2018 20:41:06 +0000
Subject: [PATCH] [X86] Add k-mask conversion and load/store intrinsics to
 match gcc and icc.

This adds:
_cvtmask8_u32, _cvtmask16_u32, _cvtmask32_u32, _cvtmask64_u64
_cvtu32_mask8, _cvtu32_mask16, _cvtu32_mask32, _cvtu64_mask64
_load_mask8, _load_mask16, _load_mask32, _load_mask64
_store_mask8, _store_mask16, _store_mask32, _store_mask64

These are currently missing from the Intel Intrinsics Guide webpage.

llvm-svn: 341251
---
 clang/include/clang/Basic/BuiltinsX86.def |  4 +++
 clang/lib/CodeGen/CGBuiltin.cpp           | 11 +++++++
 clang/lib/Headers/avx512bwintrin.h        | 40 +++++++++++++++++++++++
 clang/lib/Headers/avx512dqintrin.h        | 20 ++++++++++++
 clang/lib/Headers/avx512fintrin.h         | 20 ++++++++++++
 clang/test/CodeGen/avx512bw-builtins.c    | 54 +++++++++++++++++++++++++++++++
 clang/test/CodeGen/avx512dq-builtins.c    | 29 +++++++++++++++++
 clang/test/CodeGen/avx512f-builtins.c     | 29 +++++++++++++++++
 8 files changed, 207 insertions(+)

diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index 5b90b73..11c3305 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -1778,6 +1778,10 @@ TARGET_BUILTIN(__builtin_ia32_kshiftriqi, "UcUcIUi", "nc", "avx512dq")
 TARGET_BUILTIN(__builtin_ia32_kshiftrihi, "UsUsIUi", "nc", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_kshiftrisi, "UiUiIUi", "nc", "avx512bw")
 TARGET_BUILTIN(__builtin_ia32_kshiftridi, "ULLiULLiIUi", "nc", "avx512bw")
+TARGET_BUILTIN(__builtin_ia32_kmovb, "UcUc", "nc", "avx512dq")
+TARGET_BUILTIN(__builtin_ia32_kmovw, "UsUs", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_kmovd, "UiUi", "nc", "avx512bw")
+TARGET_BUILTIN(__builtin_ia32_kmovq, "ULLiULLi", "nc", "avx512bw")
 TARGET_BUILTIN(__builtin_ia32_palignr512, "V64cV64cV64cIi", "ncV:512:", "avx512bw")
 TARGET_BUILTIN(__builtin_ia32_dbpsadbw128, "V8sV16cV16cIi", "ncV:128:", "avx512bw,avx512vl")
 TARGET_BUILTIN(__builtin_ia32_dbpsadbw256, "V16sV32cV32cIi", "ncV:256:", "avx512bw,avx512vl")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 50e9b5e..58f8b7d 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -10137,6 +10137,17 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     return Builder.CreateBitCast(Builder.CreateNot(Res), Ops[0]->getType());
   }

+  case X86::BI__builtin_ia32_kmovb:
+  case X86::BI__builtin_ia32_kmovw:
+  case X86::BI__builtin_ia32_kmovd:
+  case X86::BI__builtin_ia32_kmovq: {
+    // Bitcast to vXi1 type and then back to integer. This gets the mask
+    // register type into the IR, but might be optimized out depending on
+    // what's around it.
+    unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
+    Value *Res = getMaskVecValue(*this, Ops[0], NumElts);
+    return Builder.CreateBitCast(Res, Ops[0]->getType());
+  }

   case X86::BI__builtin_ia32_kunpckdi:
   case X86::BI__builtin_ia32_kunpcksi:
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index d7a03c96..740f5cd 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -167,6 +167,46 @@ _kadd_mask64(__mmask64 __A, __mmask64 __B)
 #define _kshiftri_mask64(A, I) \
   (__mmask64)__builtin_ia32_kshiftridi((__mmask64)(A), (unsigned int)(I))

+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_cvtmask32_u32(__mmask32 __A) {
+  return (unsigned int)__builtin_ia32_kmovd((__mmask32)__A);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_cvtmask64_u64(__mmask64 __A) {
+  return (unsigned long long)__builtin_ia32_kmovq((__mmask64)__A);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_cvtu32_mask32(unsigned int __A) {
+  return (__mmask32)__builtin_ia32_kmovd((__mmask32)__A);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_cvtu64_mask64(unsigned long long __A) {
+  return (__mmask64)__builtin_ia32_kmovq((__mmask64)__A);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_load_mask32(__mmask32 *__A) {
+  return (__mmask32)__builtin_ia32_kmovd(*(__mmask32 *)__A);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_load_mask64(__mmask64 *__A) {
+  return (__mmask64)__builtin_ia32_kmovq(*(__mmask64 *)__A);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_store_mask32(__mmask32 *__A, __mmask32 __B) {
+  *(__mmask32 *)__A = __builtin_ia32_kmovd((__mmask32)__B);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_store_mask64(__mmask64 *__A, __mmask64 __B) {
+  *(__mmask64 *)__A = __builtin_ia32_kmovq((__mmask64)__B);
+}
+
 /* Integer compare */

 #define _mm512_cmp_epi8_mask(a, b, p) \
diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h
index b54e547..7e025e9 100644
--- a/clang/lib/Headers/avx512dqintrin.h
+++ b/clang/lib/Headers/avx512dqintrin.h
@@ -104,6 +104,26 @@ _kadd_mask16(__mmask16 __A, __mmask16 __B)
 #define _kshiftri_mask8(A, I) \
   (__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(A), (unsigned int)(I))

+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_cvtmask8_u32(__mmask8 __A) {
+  return (unsigned int)__builtin_ia32_kmovb((__mmask8)__A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_cvtu32_mask8(unsigned int __A) {
+  return (__mmask8)__builtin_ia32_kmovb((__mmask8)__A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_load_mask8(__mmask8 *__A) {
+  return (__mmask8)__builtin_ia32_kmovb(*(__mmask8 *)__A);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_store_mask8(__mmask8 *__A, __mmask8 __B) {
+  *(__mmask8 *)__A = __builtin_ia32_kmovb((__mmask8)__B);
+}
+
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mullo_epi64 (__m512i __A, __m512i __B) {
   return (__m512i) ((__v8du) __A * (__v8du) __B);
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index 262f46c..d00e553 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -8400,6 +8400,26 @@ _mm512_kxor (__mmask16 __A, __mmask16 __B)
 #define _kshiftri_mask16(A, I) \
   (__mmask16)__builtin_ia32_kshiftrihi((__mmask16)(A), (unsigned int)(I))

+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_cvtmask16_u32(__mmask16 __A) {
+  return (unsigned int)__builtin_ia32_kmovw((__mmask16)__A);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_cvtu32_mask16(unsigned int __A) {
+  return (__mmask16)__builtin_ia32_kmovw((__mmask16)__A);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_load_mask16(__mmask16 *__A) {
+  return (__mmask16)__builtin_ia32_kmovw(*(__mmask16 *)__A);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_store_mask16(__mmask16 *__A, __mmask16 __B) {
+  *(__mmask16 *)__A = __builtin_ia32_kmovw((__mmask16)__B);
+}
+
 static __inline__ void __DEFAULT_FN_ATTRS512
 _mm512_stream_si512 (__m512i * __P, __m512i __A)
 {
diff --git a/clang/test/CodeGen/avx512bw-builtins.c b/clang/test/CodeGen/avx512bw-builtins.c
index 2075a0c..22bc974 100644
--- a/clang/test/CodeGen/avx512bw-builtins.c
+++ b/clang/test/CodeGen/avx512bw-builtins.c
@@ -280,6 +280,60 @@ __mmask64 test_kshiftri_mask64(__m512i A, __m512i B, __m512i C, __m512i D) {
   return _mm512_mask_cmpneq_epu8_mask(_kshiftri_mask64(_mm512_cmpneq_epu8_mask(A, B), 32), C, D);
 }

+unsigned int test_cvtmask32_u32(__m512i A, __m512i B) {
+  // CHECK-LABEL: @test_cvtmask32_u32
+  // CHECK: bitcast <32 x i1> %{{.*}} to i32
+  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+  return _cvtmask32_u32(_mm512_cmpneq_epu16_mask(A, B));
+}
+
+unsigned long long test_cvtmask64_u64(__m512i A, __m512i B) {
+  // CHECK-LABEL: @test_cvtmask64_u64
+  // CHECK: bitcast <64 x i1> %{{.*}} to i64
+  // CHECK: bitcast i64 %{{.*}} to <64 x i1>
+  return _cvtmask64_u64(_mm512_cmpneq_epu8_mask(A, B));
+}
+
+__mmask32 test_cvtu32_mask32(__m512i A, __m512i B, unsigned int C) {
+  // CHECK-LABEL: @test_cvtu32_mask32
+  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
+  return _mm512_mask_cmpneq_epu16_mask(_cvtu32_mask32(C), A, B);
+}
+
+__mmask64 test_cvtu64_mask64(__m512i A, __m512i B, unsigned long long C) {
+  // CHECK-LABEL: @test_cvtu64_mask64
+  // CHECK: bitcast i64 %{{.*}} to <64 x i1>
+  return _mm512_mask_cmpneq_epu8_mask(_cvtu64_mask64(C), A, B);
+}
+
+__mmask32 test_load_mask32(__mmask32 *A, __m512i B, __m512i C) {
+  // CHECK-LABEL: @test_load_mask32
+  // CHECK: [[LOAD:%.*]] = load i32, i32* %{{.*}}
+  // CHECK: bitcast i32 [[LOAD]] to <32 x i1>
+  return _mm512_mask_cmpneq_epu16_mask(_load_mask32(A), B, C);
+}
+
+__mmask64 test_load_mask64(__mmask64 *A, __m512i B, __m512i C) {
+  // CHECK-LABEL: @test_load_mask64
+  // CHECK: [[LOAD:%.*]] = load i64, i64* %{{.*}}
+  // CHECK: bitcast i64 [[LOAD]] to <64 x i1>
+  return _mm512_mask_cmpneq_epu8_mask(_load_mask64(A), B, C);
+}
+
+void test_store_mask32(__mmask32 *A, __m512i B, __m512i C) {
+  // CHECK-LABEL: @test_store_mask32
+  // CHECK: bitcast <32 x i1> %{{.*}} to i32
+  // CHECK: store i32 %{{.*}}, i32* %{{.*}}
+  _store_mask32(A, _mm512_cmpneq_epu16_mask(B, C));
+}
+
+void test_store_mask64(__mmask64 *A, __m512i B, __m512i C) {
+  // CHECK-LABEL: @test_store_mask64
+  // CHECK: bitcast <64 x i1> %{{.*}} to i64
+  // CHECK: store i64 %{{.*}}, i64* %{{.*}}
+  _store_mask64(A, _mm512_cmpneq_epu8_mask(B, C));
+}
+
 __mmask64 test_mm512_cmpeq_epi8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpeq_epi8_mask
   // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
diff --git a/clang/test/CodeGen/avx512dq-builtins.c b/clang/test/CodeGen/avx512dq-builtins.c
index 6d2f93d..77f5833 100644
--- a/clang/test/CodeGen/avx512dq-builtins.c
+++ b/clang/test/CodeGen/avx512dq-builtins.c
@@ -152,6 +152,35 @@ __mmask8 test_kshiftri_mask8(__m512i A, __m512i B, __m512i C, __m512i D) {
   return _mm512_mask_cmpneq_epu64_mask(_kshiftri_mask8(_mm512_cmpneq_epu64_mask(A, B), 2), C, D);
 }

+unsigned int test_cvtmask8_u32(__m512i A, __m512i B) {
+  // CHECK-LABEL: @test_cvtmask8_u32
+  // CHECK: bitcast <8 x i1> %{{.*}} to i8
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: zext i8 %{{.*}} to i32
+  return _cvtmask8_u32(_mm512_cmpneq_epu64_mask(A, B));
+}
+
+__mmask8 test_cvtu32_mask8(__m512i A, __m512i B, unsigned int C) {
+  // CHECK-LABEL: @test_cvtu32_mask8
+  // CHECK: trunc i32 %{{.*}} to i8
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
+  return _mm512_mask_cmpneq_epu64_mask(_cvtu32_mask8(C), A, B);
+}
+
+__mmask8 test_load_mask8(__mmask8 *A, __m512i B, __m512i C) {
+  // CHECK-LABEL: @test_load_mask8
+  // CHECK: [[LOAD:%.*]] = load i8, i8* %{{.*}}
+  // CHECK: bitcast i8 [[LOAD]] to <8 x i1>
+  return _mm512_mask_cmpneq_epu64_mask(_load_mask8(A), B, C);
+}
+
+void test_store_mask8(__mmask8 *A, __m512i B, __m512i C) {
+  // CHECK-LABEL: @test_store_mask8
+  // CHECK: bitcast <8 x i1> %{{.*}} to i8
+  // CHECK: store i8 %{{.*}}, i8* %{{.*}}
+  _store_mask8(A, _mm512_cmpneq_epu64_mask(B, C));
+}
+
 __m512i test_mm512_mullo_epi64 (__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mullo_epi64
   // CHECK: mul <8 x i64>
diff --git a/clang/test/CodeGen/avx512f-builtins.c b/clang/test/CodeGen/avx512f-builtins.c
index 8ddabc5..058591e 100644
--- a/clang/test/CodeGen/avx512f-builtins.c
+++ b/clang/test/CodeGen/avx512f-builtins.c
@@ -8312,6 +8312,35 @@ __mmask16 test_kshiftri_mask16(__m512i A, __m512i B, __m512i C, __m512i D) {
   return _mm512_mask_cmpneq_epu32_mask(_kshiftri_mask16(_mm512_cmpneq_epu32_mask(A, B), 1), C, D);
 }

+unsigned int test_cvtmask16_u32(__m512i A, __m512i B) {
+  // CHECK-LABEL: @test_cvtmask16_u32
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+  // CHECK: zext i16 %{{.*}} to i32
+  return _cvtmask16_u32(_mm512_cmpneq_epu32_mask(A, B));
+}
+
+__mmask16 test_cvtu32_mask16(__m512i A, __m512i B, unsigned int C) {
+  // CHECK-LABEL: @test_cvtu32_mask16
+  // CHECK: trunc i32 %{{.*}} to i16
+  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
+  return _mm512_mask_cmpneq_epu32_mask(_cvtu32_mask16(C), A, B);
+}
+
+__mmask16 test_load_mask16(__mmask16 *A, __m512i B, __m512i C) {
+  // CHECK-LABEL: @test_load_mask16
+  // CHECK: [[LOAD:%.*]] = load i16, i16* %{{.*}}
+  // CHECK: bitcast i16 [[LOAD]] to <16 x i1>
+  return _mm512_mask_cmpneq_epu32_mask(_load_mask16(A), B, C);
+}
+
+void test_store_mask16(__mmask16 *A, __m512i B, __m512i C) {
+  // CHECK-LABEL: @test_store_mask16
+  // CHECK: bitcast <16 x i1> %{{.*}} to i16
+  // CHECK: store i16 %{{.*}}, i16* %{{.*}}
+  _store_mask16(A, _mm512_cmpneq_epu32_mask(B, C));
+}
+
 void test_mm512_stream_si512(__m512i * __P, __m512i __A) {
   // CHECK-LABEL: @test_mm512_stream_si512
   // CHECK: store <8 x i64> %{{.*}}, <8 x i64>* %{{.*}}, align 64, !nontemporal
--
2.7.4
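Usage sketch (illustrative, not part of the patch): a minimal example of how the new conversion and load/store intrinsics compose, assuming these headers and a build with -mavx512bw. The function and variable names below are invented for the example; whether the round trip actually stays in a k-register and lowers to KMOV instructions depends on the surrounding code, as the comment added to CGBuiltin.cpp notes.

#include <immintrin.h>

/* Round-trip a 32-bit compare mask through memory and a GPR using the new
   intrinsics; _mm512_cmpneq_epu16_mask and the 32-bit mask forms require
   AVX512BW. */
unsigned int mask_roundtrip(__m512i a, __m512i b, __mmask32 *spill) {
  __mmask32 m = _mm512_cmpneq_epu16_mask(a, b); /* compare -> 32-bit k-mask */
  _store_mask32(spill, m);                      /* mask -> memory           */
  __mmask32 r = _load_mask32(spill);            /* memory -> mask           */
  return _cvtmask32_u32(r);                     /* mask -> unsigned int     */
}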