From 6bb4b2d00221658b3fac421957e2905a13579c68 Mon Sep 17 00:00:00 2001
From: Valery N Dmitriev <valery.n.dmitriev@intel.com>
Date: Fri, 30 Dec 2022 11:07:04 -0800
Subject: [PATCH] [NFC] Test case intended to cover SLP cost for chain with
 masked gather loads.

SLP produces two masked gather loads, one feeding the other.
For the first set of scalar loads, the GEP indices are all constant.
The result of the second gather load is then fed into a reduction (as a seed).

Differential Revision: https://reviews.llvm.org/D140785
---
 .../X86/remark_gather-load-redux-cost.ll           | 78 ++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
new file mode 100644
index 0000000..cd9265e
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=x86_64 -mcpu=skylake-avx512 -passes=slp-vectorizer -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: FileCheck --input-file=%t --check-prefix=YAML %s
+; Eight i32 loads from %p, each indexed by a value loaded from an odd element of %addr, are summed.
+define i32 @test(ptr noalias %p, ptr noalias %addr) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32> <i32 15, i32 13, i32 11, i32 9, i32 7, i32 5, i32 3, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, <8 x ptr> [[TMP5]], <8 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison)
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP7]])
+; CHECK-NEXT:    ret i32 [[TMP8]]
+;
+; YAML:      --- !Passed
+  ; YAML-NEXT: Pass:            slp-vectorizer
+  ; YAML-NEXT: Name:            VectorizedHorizontalReduction
+  ; YAML-NEXT: Function:        test
+  ; YAML-NEXT: Args:
+  ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
+  ; YAML-NEXT:   - Cost:            '-17'
+  ; YAML-NEXT:   - String:          ' and with tree size '
+  ; YAML-NEXT:   - TreeSize:        '7'
+entry:
+  %off0.1 = getelementptr inbounds i32, ptr %addr, i32 1 ; address of i32 element 1 of %addr (constant index)
+  %idx0 = load i32, ptr %off0.1, align 8 ; loaded value is used as the index into %p; the expected first gather also uses alignment 8
+  %gep0 = getelementptr inbounds i32, ptr %p, i32 %idx0 ; address of i32 element %idx0 of %p
+  %ld0 = load i32, ptr %gep0, align 4 ; value that feeds the add chain below; the expected second gather also uses alignment 4
+  ; The remaining seven groups repeat this pattern with constant indices 3, 5, ..., 15 into %addr.
+  %off1.3 = getelementptr inbounds i32, ptr %addr, i32 3
+  %idx1 = load i32, ptr %off1.3, align 8
+  %gep1 = getelementptr inbounds i32, ptr %p, i32 %idx1
+  %ld1 = load i32, ptr %gep1, align 4
+
+  %off2.5 = getelementptr inbounds i32, ptr %addr, i32 5
+  %idx2 = load i32, ptr %off2.5, align 8
+  %gep2 = getelementptr inbounds i32, ptr %p, i32 %idx2
+  %ld2 = load i32, ptr %gep2, align 4
+
+  %off3.7 = getelementptr inbounds i32, ptr %addr, i32 7
+  %idx3 = load i32, ptr %off3.7, align 8
+  %gep3 = getelementptr inbounds i32, ptr %p, i32 %idx3
+  %ld3 = load i32, ptr %gep3, align 4
+
+  %off4.9 = getelementptr inbounds i32, ptr %addr, i32 9
+  %idx4 = load i32, ptr %off4.9, align 8
+  %gep4 = getelementptr inbounds i32, ptr %p, i32 %idx4
+  %ld4 = load i32, ptr %gep4, align 4
+
+  %off5.11 = getelementptr inbounds i32, ptr %addr, i32 11
+  %idx5 = load i32, ptr %off5.11, align 8
+  %gep5 = getelementptr inbounds i32, ptr %p, i32 %idx5
+  %ld5 = load i32, ptr %gep5, align 4
+
+  %off6.13 = getelementptr inbounds i32, ptr %addr, i32 13
+  %idx6 = load i32, ptr %off6.13, align 8
+  %gep6 = getelementptr inbounds i32, ptr %p, i32 %idx6
+  %ld6 = load i32, ptr %gep6, align 4
+
+  %off7.15 = getelementptr inbounds i32, ptr %addr, i32 15
+  %idx7 = load i32, ptr %off7.15, align 8
+  %gep7 = getelementptr inbounds i32, ptr %p, i32 %idx7
+  %ld7 = load i32, ptr %gep7, align 4
+  ; Sum all eight loaded values with a linear chain of adds; the final sum is the return value.
+  %add0 = add nsw i32 %ld1, %ld0
+  %add1 = add nsw i32 %add0, %ld2
+  %add2 = add nsw i32 %add1, %ld3
+  %add3 = add nsw i32 %add2, %ld4
+  %add4 = add nsw i32 %add3, %ld5
+  %add5 = add nsw i32 %add4, %ld6
+  %add6 = add nsw i32 %add5, %ld7
+
+  ret i32 %add6
+}
-- 
2.7.4