From: Simon Pilgrim
Date: Wed, 27 Apr 2016 09:53:09 +0000 (+0000)
Subject: [InstCombine][SSE] Added DemandedBits tests for MOVMSK instructions
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d2ea708739864a65f6483dd1e66a2951e8170ba3;p=platform%2Fupstream%2Fllvm.git

[InstCombine][SSE] Added DemandedBits tests for MOVMSK instructions

MOVMSK zeros the upper bits of the gpr - we should be able to use this.

llvm-svn: 267686
---

diff --git a/llvm/test/Transforms/InstCombine/x86-movmsk.ll b/llvm/test/Transforms/InstCombine/x86-movmsk.ll
new file mode 100644
index 0000000..0e23218
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/x86-movmsk.ll
@@ -0,0 +1,137 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+;
+; DemandedBits - MOVMSK zeros the upper bits of the result.
+; TODO - we can get the 'and' for free
+;
+
+define i32 @test_upper_x86_sse_movmsk_ps(<4 x float> %a0) {
+; CHECK-LABEL: @test_upper_x86_sse_movmsk_ps(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 15
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
+  %2 = and i32 %1, 15
+  ret i32 %2
+}
+
+define i32 @test_upper_x86_sse2_movmsk_pd(<2 x double> %a0) {
+; CHECK-LABEL: @test_upper_x86_sse2_movmsk_pd(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
+  %2 = and i32 %1, 3
+  ret i32 %2
+}
+
+define i32 @test_upper_x86_sse2_pmovmskb_128(<16 x i8> %a0) {
+; CHECK-LABEL: @test_upper_x86_sse2_pmovmskb_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0)
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 65535
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0)
+  %2 = and i32 %1, 65535
+  ret i32 %2
+}
+
+define i32 @test_upper_x86_avx_movmsk_ps_256(<8 x float> %a0) {
+; CHECK-LABEL: @test_upper_x86_avx_movmsk_ps_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 255
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
+  %2 = and i32 %1, 255
+  ret i32 %2
+}
+
+define i32 @test_upper_x86_avx_movmsk_pd_256(<4 x double> %a0) {
+; CHECK-LABEL: @test_upper_x86_avx_movmsk_pd_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 15
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
+  %2 = and i32 %1, 15
+  ret i32 %2
+}
+
+; llvm.x86.avx2.pmovmskb uses the whole of the 32-bit register.
+
+;
+; DemandedBits - If we don't use the lower bits then we can just return zero.
+; TODO - just return zero
+;
+
+define i32 @test_lower_x86_sse_movmsk_ps(<4 x float> %a0) {
+; CHECK-LABEL: @test_lower_x86_sse_movmsk_ps(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -16
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
+  %2 = and i32 %1, -16
+  ret i32 %2
+}
+
+define i32 @test_lower_x86_sse2_movmsk_pd(<2 x double> %a0) {
+; CHECK-LABEL: @test_lower_x86_sse2_movmsk_pd(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -4
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
+  %2 = and i32 %1, -4
+  ret i32 %2
+}
+
+define i32 @test_lower_x86_sse2_pmovmskb_128(<16 x i8> %a0) {
+; CHECK-LABEL: @test_lower_x86_sse2_pmovmskb_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0)
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -65536
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0)
+  %2 = and i32 %1, -65536
+  ret i32 %2
+}
+
+define i32 @test_lower_x86_avx_movmsk_ps_256(<8 x float> %a0) {
+; CHECK-LABEL: @test_lower_x86_avx_movmsk_ps_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -256
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
+  %2 = and i32 %1, -256
+  ret i32 %2
+}
+
+define i32 @test_lower_x86_avx_movmsk_pd_256(<4 x double> %a0) {
+; CHECK-LABEL: @test_lower_x86_avx_movmsk_pd_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -16
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
+  %2 = and i32 %1, -16
+  ret i32 %2
+}
+
+; llvm.x86.avx2.pmovmskb uses the whole of the 32-bit register.
+
+
+declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>)
+declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>)
+declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>)
+
+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
+declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>)
+declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>)
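
The TODO comments in the test describe the fold this commit is preparing for: each of these movmsk intrinsics can set at most lane-count bits of its i32 result, so all higher bits are known zero. As a minimal sketch (not produced by the compiler at this revision, and the @expected_* names are hypothetical), the IR that instcombine could emit once the DemandedBits handling lands would look like:

; 'and i32 %1, 15' keeps every bit movmsk.ps can set, so the mask is redundant.
define i32 @expected_upper_x86_sse_movmsk_ps(<4 x float> %a0) {
  %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
  ret i32 %1
}

; 'and i32 %1, -16' demands only the known-zero upper bits, so the whole
; expression folds to a constant zero.
define i32 @expected_lower_x86_sse_movmsk_ps(<4 x float> %a0) {
  ret i32 0
}

declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>)

The same reasoning applies per intrinsic with its own mask width (2 bits for movmsk.pd, 16 for pmovmskb.128, 8/4 for the AVX variants); avx2.pmovmskb is excluded because it can set all 32 bits.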