From 0b83e14804c46aaf8ba40863bb6d1a3cf175b997 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 22 Jan 2020 13:44:36 +0000 Subject: [PATCH] [ARM] MVE Gather Scatter cost model tests. NFC --- .../CostModel/ARM/mve-gather-scatter-cost.ll | 498 +++++++++++++++++++++ 1 file changed, 498 insertions(+) create mode 100644 llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll diff --git a/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll b/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll new file mode 100644 index 0000000..139b84a --- /dev/null +++ b/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll @@ -0,0 +1,498 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -S -mtriple=thumbv8.1-m.main-none-eabi -mattr=+mve.fp -cost-model -analyze -enable-arm-maskedgatscat | FileCheck %s + +define i32 @masked_gather() { +; CHECK-LABEL: 'masked_gather' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 4, <4 x i1> undef, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 4, <2 x i1> undef, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 4, <16 x i1> undef, <16 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 4, <8 x i1> undef, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 4, <4 x i1> undef, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 4, <2 x i1> undef, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F16 = call <16 x half> @llvm.masked.gather.v16f16.v16p0f16(<16 x half*> undef, i32 2, <16 x i1> undef, <16 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> undef, i32 2, <8 x i1> undef, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> undef, i32 2, <4 x i1> undef, <4 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = call <2 x half> @llvm.masked.gather.v2f16.v2p0f16(<2 x half*> undef, i32 2, <2 x i1> undef, <2 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 4, <4 x i1> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 4, <2 x i1> undef, <2 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 4, <16 x i1> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 4, <8 x i1> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 4, <4 x i1> undef, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 4, <2 x i1> undef, <2 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 2, <16 x i1> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 2, <8 x i1> undef, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 2, <4 x i1> undef, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> undef, i32 2, <2 x i1> undef, <2 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> undef, i32 1, <4 x i1> undef, <4 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I8 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> undef, i32 1, <2 x i1> undef, <2 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 +; + %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 4, <4 x i1> undef, <4 x double> undef) + %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 4, <2 x i1> undef, <2 x double> undef) + + %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 4, <16 x i1> undef, <16 x float> undef) + %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 4, <8 x i1> undef, <8 x float> undef) + %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 4, <4 x i1> undef, <4 x float> undef) + %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 4, <2 x i1> undef, <2 x float> undef) + + %V16F16 = call <16 x half> @llvm.masked.gather.v16f16.v16p0f16(<16 x half*> undef, i32 2, <16 x i1> undef, <16 x half> undef) + %V8F16 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> undef, i32 2, <8 x i1> undef, <8 x half> undef) + %V4F16 = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> undef, i32 2, <4 x i1> undef, <4 x half> undef) + %V2F16 = call <2 x half> @llvm.masked.gather.v2f16.v2p0f16(<2 x half*> undef, i32 2, <2 x i1> undef, <2 x half> undef) + + %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 4, <4 x i1> undef, <4 x i64> undef) + %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 4, <2 x i1> undef, <2 x i64> undef) + + %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 4, <16 x i1> undef, <16 x i32> undef) + %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 4, <8 x i1> undef, <8 x i32> undef) + %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 4, <4 x i1> undef, <4 x i32> undef) + %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 4, <2 x i1> undef, <2 x i32> undef) + + %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 2, <16 x i1> undef, <16 x i16> undef) + %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 2, <8 x i1> undef, <8 x i16> undef) + %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 2, <4 x i1> undef, <4 x i16> undef) + %V2I16 = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> undef, i32 2, <2 x i1> undef, <2 x i16> undef) + + %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef) + %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef) + %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef) + %V4I8 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> undef, i32 1, <4 x i1> undef, <4 x i8> undef) + %V2I8 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> undef, i32 1, <2 x i1> undef, <2 x i8> undef) + + ret i32 0 +} + +define i32 @masked_scatter() { +; CHECK-LABEL: 'masked_scatter' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 4, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 4, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 4, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 4, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 4, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 4, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f16.v16p0f16(<16 x half> undef, <16 x half*> undef, i32 2, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> undef, <8 x half*> undef, i32 2, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> undef, <4 x half*> undef, i32 2, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v2f16.v2p0f16(<2 x half> undef, <2 x half*> undef, i32 2, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 4, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 4, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 4, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 4, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 4, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 4, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 2, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 2, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 2, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> undef, <2 x i16*> undef, i32 2, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> undef, <4 x i8*> undef, i32 1, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> undef, <2 x i8*> undef, i32 1, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 +; + call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 4, <4 x i1> undef) + call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 4, <2 x i1> undef) + + call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 4, <16 x i1> undef) + call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 4, <8 x i1> undef) + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 4, <4 x i1> undef) + call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 4, <2 x i1> undef) + + call void @llvm.masked.scatter.v16f16.v16p0f16(<16 x half> undef, <16 x half*> undef, i32 2, <16 x i1> undef) + call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> undef, <8 x half*> undef, i32 2, <8 x i1> undef) + call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> undef, <4 x half*> undef, i32 2, <4 x i1> undef) + call void @llvm.masked.scatter.v2f16.v2p0f16(<2 x half> undef, <2 x half*> undef, i32 2, <2 x i1> undef) + + call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 4, <4 x i1> undef) + call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 4, <2 x i1> undef) + + call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 4, <16 x i1> undef) + call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 4, <8 x i1> undef) + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 4, <4 x i1> undef) + call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 4, <2 x i1> undef) + + call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 2, <16 x i1> undef) + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 2, <8 x i1> undef) + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 2, <4 x i1> undef) + call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> undef, <2 x i16*> undef, i32 2, <2 x i1> undef) + + call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef) + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> undef, <4 x i8*> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> undef, <2 x i8*> undef, i32 1, <2 x i1> undef) + + ret i32 0 +} + +define void @gep_v4i32(i32* %base, i16* %base16, i8* %base8, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1> %mask) { +; CHECK-LABEL: 'gep_v4i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i32, i32* %base, <4 x i32> %ind32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep1, i32 4, <4 x i1> %mask, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res1, <4 x i32*> %gep1, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indzext = zext <4 x i16> %ind16 to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i32, i32* %base, <4 x i32> %indzext +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep2, i32 4, <4 x i1> %mask, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res2, <4 x i32*> %gep2, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indsext = sext <4 x i16> %ind16 to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i32, i32* %base, <4 x i32> %indsext +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res3 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep3, i32 4, <4 x i1> %mask, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res3, <4 x i32*> %gep3, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepu = getelementptr i32, i32* %base, <4 x i32> %ind32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %resu = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gepu, i32 1, <4 x i1> %mask, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %resu, <4 x i32*> %gepu, i32 1, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <4 x i32> %indzext +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <4 x i8*> %gepos to <4 x i32*> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %resos = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %geposb, i32 4, <4 x i1> %mask, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %resos, <4 x i32*> %geposb, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, i16* %base16, <4 x i32> %indzext +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <4 x i16*> %gepbs to <4 x i32*> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %resbs = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gepbsb, i32 4, <4 x i1> %mask, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %resbs, <4 x i32*> %gepbsb, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %gep1 = getelementptr i32, i32* %base, <4 x i32> %ind32 + %res1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep1, i32 4, <4 x i1> %mask, <4 x i32> undef) + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res1, <4 x i32*> %gep1, i32 4, <4 x i1> %mask) + + %indzext = zext <4 x i16> %ind16 to <4 x i32> + %gep2 = getelementptr i32, i32* %base, <4 x i32> %indzext + %res2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep2, i32 4, <4 x i1> %mask, <4 x i32> undef) + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res2, <4 x i32*> %gep2, i32 4, <4 x i1> %mask) + + %indsext = sext <4 x i16> %ind16 to <4 x i32> + %gep3 = getelementptr i32, i32* %base, <4 x i32> %indsext + %res3 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep3, i32 4, <4 x i1> %mask, <4 x i32> undef) + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res3, <4 x i32*> %gep3, i32 4, <4 x i1> %mask) + + ; unaligned + %gepu = getelementptr i32, i32* %base, <4 x i32> %ind32 + %resu = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gepu, i32 1, <4 x i1> %mask, <4 x i32> undef) + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %resu, <4 x i32*> %gepu, i32 1, <4 x i1> %mask) + + ; 1 scale + %gepos = getelementptr i8, i8* %base8, <4 x i32> %indzext + %geposb = bitcast <4 x i8*> %gepos to <4 x i32*> + %resos = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %geposb, i32 4, <4 x i1> %mask, <4 x i32> undef) + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %resos, <4 x i32*> %geposb, i32 4, <4 x i1> %mask) + + ; bad scale (but doesn't really matter because i32) + %gepbs = getelementptr i16, i16* %base16, <4 x i32> %indzext + %gepbsb = bitcast <4 x i16*> %gepbs to <4 x i32*> + %resbs = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gepbsb, i32 4, <4 x i1> %mask, <4 x i32> undef) + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %resbs, <4 x i32*> %gepbsb, i32 4, <4 x i1> %mask) + ret void +} + +define void @gep_v4f32(float* %base, i16* %base16, i8* %base8, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1> %mask) { +; CHECK-LABEL: 'gep_v4f32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep1 = getelementptr float, float* %base, <4 x i32> %ind32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res1 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep1, i32 4, <4 x i1> %mask, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res1, <4 x float*> %gep1, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indzext = zext <4 x i16> %ind16 to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep2 = getelementptr float, float* %base, <4 x i32> %indzext +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res2 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep2, i32 4, <4 x i1> %mask, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res2, <4 x float*> %gep2, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indsext = sext <4 x i16> %ind16 to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep3 = getelementptr float, float* %base, <4 x i32> %indsext +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res3 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep3, i32 4, <4 x i1> %mask, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res3, <4 x float*> %gep3, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gepu = getelementptr float, float* %base, <4 x i32> %ind32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %resu = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gepu, i32 1, <4 x i1> %mask, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %resu, <4 x float*> %gepu, i32 1, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <4 x i32> %indzext +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <4 x i8*> %gepos to <4 x float*> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %resos = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %geposb, i32 4, <4 x i1> %mask, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %resos, <4 x float*> %geposb, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, i16* %base16, <4 x i32> %indzext +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <4 x i16*> %gepbs to <4 x float*> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %resbs = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gepbsb, i32 4, <4 x i1> %mask, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %resbs, <4 x float*> %gepbsb, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %gep1 = getelementptr float, float* %base, <4 x i32> %ind32 + %res1 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep1, i32 4, <4 x i1> %mask, <4 x float> undef) + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res1, <4 x float*> %gep1, i32 4, <4 x i1> %mask) + + %indzext = zext <4 x i16> %ind16 to <4 x i32> + %gep2 = getelementptr float, float* %base, <4 x i32> %indzext + %res2 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep2, i32 4, <4 x i1> %mask, <4 x float> undef) + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res2, <4 x float*> %gep2, i32 4, <4 x i1> %mask) + + %indsext = sext <4 x i16> %ind16 to <4 x i32> + %gep3 = getelementptr float, float* %base, <4 x i32> %indsext + %res3 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep3, i32 4, <4 x i1> %mask, <4 x float> undef) + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res3, <4 x float*> %gep3, i32 4, <4 x i1> %mask) + + ; unaligned + %gepu = getelementptr float, float* %base, <4 x i32> %ind32 + %resu = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gepu, i32 1, <4 x i1> %mask, <4 x float> undef) + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %resu, <4 x float*> %gepu, i32 1, <4 x i1> %mask) + + ; 1 scale + %gepos = getelementptr i8, i8* %base8, <4 x i32> %indzext + %geposb = bitcast <4 x i8*> %gepos to <4 x float*> + %resos = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %geposb, i32 4, <4 x i1> %mask, <4 x float> undef) + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %resos, <4 x float*> %geposb, i32 4, <4 x i1> %mask) + + ; bad scale (but doesn't really matter because i32) + %gepbs = getelementptr i16, i16* %base16, <4 x i32> %indzext + %gepbsb = bitcast <4 x i16*> %gepbs to <4 x float*> + %resbs = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gepbsb, i32 4, <4 x i1> %mask, <4 x float> undef) + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %resbs, <4 x float*> %gepbsb, i32 4, <4 x i1> %mask) + ret void +} + +define void @gep_v4i16(i16* %base, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1> %mask) { +; CHECK-LABEL: 'gep_v4i16' +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i16, i16* %base, <4 x i32> %ind32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res1 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep1, i32 2, <4 x i1> %mask, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res1, <4 x i16*> %gep1, i32 2, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indzext = zext <4 x i16> %ind16 to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i16, i16* %base, <4 x i32> %indzext +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res2 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep2, i32 2, <4 x i1> %mask, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res2, <4 x i16*> %gep2, i32 2, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indsext = sext <4 x i16> %ind16 to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i16, i16* %base, <4 x i32> %indsext +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res3 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep3, i32 2, <4 x i1> %mask, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res3, <4 x i16*> %gep3, i32 2, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %gep1 = getelementptr i16, i16* %base, <4 x i32> %ind32 + %res1 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep1, i32 2, <4 x i1> %mask, <4 x i16> undef) + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res1, <4 x i16*> %gep1, i32 2, <4 x i1> %mask) + + %indzext = zext <4 x i16> %ind16 to <4 x i32> + %gep2 = getelementptr i16, i16* %base, <4 x i32> %indzext + %res2 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep2, i32 2, <4 x i1> %mask, <4 x i16> undef) + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res2, <4 x i16*> %gep2, i32 2, <4 x i1> %mask) + + %indsext = sext <4 x i16> %ind16 to <4 x i32> + %gep3 = getelementptr i16, i16* %base, <4 x i32> %indsext + %res3 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep3, i32 2, <4 x i1> %mask, <4 x i16> undef) + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res3, <4 x i16*> %gep3, i32 2, <4 x i1> %mask) + ret void +} + +define void @gep_v8i16(i16* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, <8 x i16> %ind16, <8 x i1> %mask) { +; CHECK-LABEL: 'gep_v8i16' +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i16, i16* %base, <8 x i32> %ind32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res1 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep1, i32 2, <8 x i1> %mask, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res1, <8 x i16*> %gep1, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indzext = zext <8 x i16> %ind16 to <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i16, i16* %base, <8 x i32> %indzext +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res2 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep2, i32 2, <8 x i1> %mask, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res2, <8 x i16*> %gep2, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indsext = sext <8 x i16> %ind16 to <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i16, i16* %base, <8 x i32> %indsext +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res3 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep3, i32 2, <8 x i1> %mask, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res3, <8 x i16*> %gep3, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %resu = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep2, i32 1, <8 x i1> %mask, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resu, <8 x i16*> %gep2, i32 1, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <8 x i32> %indzext +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <8 x i8*> %gepos to <8 x i16*> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %resos = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %geposb, i32 2, <8 x i1> %mask, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resos, <8 x i16*> %geposb, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i32, i32* %base32, <8 x i32> %indzext +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <8 x i32*> %gepbs to <8 x i16*> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %resbs = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gepbsb, i32 2, <8 x i1> %mask, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resbs, <8 x i16*> %gepbsb, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %gep1 = getelementptr i16, i16* %base, <8 x i32> %ind32 + %res1 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep1, i32 2, <8 x i1> %mask, <8 x i16> undef) + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res1, <8 x i16*> %gep1, i32 2, <8 x i1> %mask) + + %indzext = zext <8 x i16> %ind16 to <8 x i32> + %gep2 = getelementptr i16, i16* %base, <8 x i32> %indzext + %res2 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep2, i32 2, <8 x i1> %mask, <8 x i16> undef) + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res2, <8 x i16*> %gep2, i32 2, <8 x i1> %mask) + + %indsext = sext <8 x i16> %ind16 to <8 x i32> + %gep3 = getelementptr i16, i16* %base, <8 x i32> %indsext + %res3 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep3, i32 2, <8 x i1> %mask, <8 x i16> undef) + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res3, <8 x i16*> %gep3, i32 2, <8 x i1> %mask) + + ; unaligned + %resu = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep2, i32 1, <8 x i1> %mask, <8 x i16> undef) + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resu, <8 x i16*> %gep2, i32 1, <8 x i1> %mask) + + ; 1 scale + %gepos = getelementptr i8, i8* %base8, <8 x i32> %indzext + %geposb = bitcast <8 x i8*> %gepos to <8 x i16*> + %resos = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %geposb, i32 2, <8 x i1> %mask, <8 x i16> undef) + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resos, <8 x i16*> %geposb, i32 2, <8 x i1> %mask) + + ; bad scale + %gepbs = getelementptr i32, i32* %base32, <8 x i32> %indzext + %gepbsb = bitcast <8 x i32*> %gepbs to <8 x i16*> + %resbs = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gepbsb, i32 2, <8 x i1> %mask, <8 x i16> undef) + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resbs, <8 x i16*> %gepbsb, i32 2, <8 x i1> %mask) + ret void +} + +define void @gep_v8f16(half* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, <8 x i16> %ind16, <8 x i1> %mask) { +; CHECK-LABEL: 'gep_v8f16' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep1 = getelementptr half, half* %base, <8 x i32> %ind32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res1 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep1, i32 2, <8 x i1> %mask, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res1, <8 x half*> %gep1, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indzext = zext <8 x i16> %ind16 to <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep2 = getelementptr half, half* %base, <8 x i32> %indzext +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res2 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep2, i32 2, <8 x i1> %mask, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res2, <8 x half*> %gep2, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indsext = sext <8 x i16> %ind16 to <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep3 = getelementptr half, half* %base, <8 x i32> %indsext +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res3 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep3, i32 2, <8 x i1> %mask, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res3, <8 x half*> %gep3, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %resu = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep2, i32 1, <8 x i1> %mask, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resu, <8 x half*> %gep2, i32 1, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <8 x i32> %indzext +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <8 x i8*> %gepos to <8 x half*> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %resos = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %geposb, i32 2, <8 x i1> %mask, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resos, <8 x half*> %geposb, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i32, i32* %base32, <8 x i32> %indzext +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <8 x i32*> %gepbs to <8 x half*> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %resbs = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gepbsb, i32 2, <8 x i1> %mask, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resbs, <8 x half*> %gepbsb, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %gep1 = getelementptr half, half* %base, <8 x i32> %ind32 + %res1 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep1, i32 2, <8 x i1> %mask, <8 x half> undef) + call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res1, <8 x half*> %gep1, i32 2, <8 x i1> %mask) + + %indzext = zext <8 x i16> %ind16 to <8 x i32> + %gep2 = getelementptr half, half* %base, <8 x i32> %indzext + %res2 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep2, i32 2, <8 x i1> %mask, <8 x half> undef) + call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res2, <8 x half*> %gep2, i32 2, <8 x i1> %mask) + + %indsext = sext <8 x i16> %ind16 to <8 x i32> + %gep3 = getelementptr half, half* %base, <8 x i32> %indsext + %res3 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep3, i32 2, <8 x i1> %mask, <8 x half> undef) + call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res3, <8 x half*> %gep3, i32 2, <8 x i1> %mask) + + ; unaligned + %resu = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep2, i32 1, <8 x i1> %mask, <8 x half> undef) + call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resu, <8 x half*> %gep2, i32 1, <8 x i1> %mask) + + ; 1 scale + %gepos = getelementptr i8, i8* %base8, <8 x i32> %indzext + %geposb = bitcast <8 x i8*> %gepos to <8 x half*> + %resos = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %geposb, i32 2, <8 x i1> %mask, <8 x half> undef) + call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resos, <8 x half*> %geposb, i32 2, <8 x i1> %mask) + + ; bad scale + %gepbs = getelementptr i32, i32* %base32, <8 x i32> %indzext + %gepbsb = bitcast <8 x i32*> %gepbs to <8 x half*> + %resbs = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gepbsb, i32 2, <8 x i1> %mask, <8 x half> undef) + call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resbs, <8 x half*> %gepbsb, i32 2, <8 x i1> %mask) + ret void +} + +define void @gep_v16i8(i8* %base, i16* %base16, <16 x i8> %ind8, <16 x i32> %ind32, <16 x i1> %mask) { +; CHECK-LABEL: 'gep_v16i8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i8, i8* %base, <16 x i32> %ind32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res1 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep1, i32 1, <16 x i1> %mask, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res1, <16 x i8*> %gep1, i32 2, <16 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %indzext = zext <16 x i8> %ind8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i8, i8* %base, <16 x i32> %indzext +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res2 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep2, i32 2, <16 x i1> %mask, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res2, <16 x i8*> %gep2, i32 2, <16 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %indsext = sext <16 x i8> %ind8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i8, i8* %base, <16 x i32> %indsext +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res3 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep3, i32 2, <16 x i1> %mask, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res3, <16 x i8*> %gep3, i32 2, <16 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, i16* %base16, <16 x i32> %indzext +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <16 x i16*> %gepbs to <16 x i8*> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gepbsb, i32 2, <16 x i1> %mask, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %resbs, <16 x i8*> %gepbsb, i32 2, <16 x i1> %mask) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %gep1 = getelementptr i8, i8* %base, <16 x i32> %ind32 + %res1 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep1, i32 1, <16 x i1> %mask, <16 x i8> undef) + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res1, <16 x i8*> %gep1, i32 2, <16 x i1> %mask) + + %indzext = zext <16 x i8> %ind8 to <16 x i32> + %gep2 = getelementptr i8, i8* %base, <16 x i32> %indzext + %res2 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep2, i32 2, <16 x i1> %mask, <16 x i8> undef) + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res2, <16 x i8*> %gep2, i32 2, <16 x i1> %mask) + + %indsext = sext <16 x i8> %ind8 to <16 x i32> + %gep3 = getelementptr i8, i8* %base, <16 x i32> %indsext + %res3 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep3, i32 2, <16 x i1> %mask, <16 x i8> undef) + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res3, <16 x i8*> %gep3, i32 2, <16 x i1> %mask) + + ; Bad scale + %gepbs = getelementptr i16, i16* %base16, <16 x i32> %indzext + %gepbsb = bitcast <16 x i16*> %gepbs to <16 x i8*> + %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gepbsb, i32 2, <16 x i1> %mask, <16 x i8> undef) + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %resbs, <16 x i8*> %gepbsb, i32 2, <16 x i1> %mask) + ret void +} + +declare <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*>, i32, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x i1>, <2 x double>) + +declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*>, i32, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>) +declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>) +declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>) + +declare <16 x half> @llvm.masked.gather.v16f16.v16p0f16(<16 x half*>, i32, <16 x i1>, <16 x half>) +declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>) +declare <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*>, i32, <4 x i1>, <4 x half>) +declare <2 x half> @llvm.masked.gather.v2f16.v2p0f16(<2 x half*>, i32, <2 x i1>, <2 x half>) + +declare <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*>, i32, <4 x i1>, <4 x i64>) +declare <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>) + +declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>) +declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>, i32, <8 x i1>, <8 x i32>) +declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) +declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>) + +declare <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*>, i32, <16 x i1>, <16 x i16>) +declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>) +declare <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*>, i32, <2 x i1>, <2 x i16>) + +declare <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*>, i32, <32 x i1>, <32 x i8>) +declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) +declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>) +declare <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*>, i32, <2 x i1>, <2 x i8>) + +declare void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double>, <4 x double*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double>, <2 x double*>, i32, <2 x i1>) + +declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float>, <16 x float*>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float>, <8 x float*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float>, <2 x float*>, i32, <2 x i1>) + +declare void @llvm.masked.scatter.v16f16.v16p0f16(<16 x half>, <16 x half*>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half>, <8 x half*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2f16.v2p0f16(<2 x half>, <2 x half*>, i32, <2 x i1>) + +declare void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64>, <4 x i64*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64>, <2 x i64*>, i32, <2 x i1>) + +declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>, <16 x i32*>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32>, <8 x i32*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32>, <2 x i32*>, i32, <2 x i1>) + +declare void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16>, <16 x i16*>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16>, <2 x i16*>, i32, <2 x i1>) + +declare void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8>, <32 x i8*>, i32, <32 x i1>) +declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8>, <2 x i8*>, i32, <2 x i1>) -- 2.7.4