From a0217bda3836138a793afff76c65ef1041689fd9 Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Wed, 21 Jul 2021 00:24:47 +0300
Subject: [PATCH] [NFC][VectorCombine] Add tests for widening of partial vector
 load

---
 .../Transforms/VectorCombine/X86/load-widening.ll  | 224 +++++++++++++++++++++
 1 file changed, 224 insertions(+)
 create mode 100644 llvm/test/Transforms/VectorCombine/X86/load-widening.ll

diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
new file mode 100644
index 0000000..6c14b36
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -0,0 +1,224 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 --data-layout="e-m:e-i64:64-f80:128-n8:16:32:64-S128" | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 --data-layout="e-m:e-i64:64-f80:128-n8:16:32:64-S128" | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 --data-layout="E-m:e-i64:64-f80:128-n8:16:32:64-S128" | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 --data-layout="E-m:e-i64:64-f80:128-n8:16:32:64-S128" | FileCheck %s --check-prefixes=CHECK
+
+;-------------------------------------------------------------------------------
+; Here we know we can load 128 bits as per dereferenceability and alignment.
+
+; We don't widen scalar loads per-se: this is a plain `float` load (not a <1 x float> vector), 4 of the 16 dereferenceable bytes.
+define float @scalar(float* align 16 dereferenceable(16) %p) {
+; CHECK-LABEL: @scalar(
+; CHECK-NEXT:    [[R:%.*]] = load float, float* [[P:%.*]], align 16
+; CHECK-NEXT:    ret float [[R]]
+;
+  %r = load float, float* %p, align 16
+  ret float %r
+}
+
+; We don't widen single-element vector loads; these get scalarized.
+define <1 x float> @vec_with_1elt(<1 x float>* align 16 dereferenceable(16) %p) {
+; CHECK-LABEL: @vec_with_1elt(
+; CHECK-NEXT:    [[R:%.*]] = load <1 x float>, <1 x float>* [[P:%.*]], align 16
+; CHECK-NEXT:    ret <1 x float> [[R]]
+;
+  %r = load <1 x float>, <1 x float>* %p, align 16 ; loads 4 of the 16 dereferenceable bytes
+  ret <1 x float> %r
+}
+
+define <2 x float> @vec_with_2elts(<2 x float>* align 16 dereferenceable(16) %p) {
+; CHECK-LABEL: @vec_with_2elts(
+; CHECK-NEXT:    [[R:%.*]] = load <2 x float>, <2 x float>* [[P:%.*]], align 16
+; CHECK-NEXT:    ret <2 x float> [[R]]
+;
+  %r = load <2 x float>, <2 x float>* %p, align 16 ; loads 8 of the 16 dereferenceable bytes (partial 128-bit vector)
+  ret <2 x float> %r
+}
+
+define <3 x float> @vec_with_3elts(<3 x float>* align 16 dereferenceable(16) %p) {
+; CHECK-LABEL: @vec_with_3elts(
+; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, <3 x float>* [[P:%.*]], align 16
+; CHECK-NEXT:    ret <3 x float> [[R]]
+;
+  %r = load <3 x float>, <3 x float>* %p, align 16 ; loads 12 of the 16 dereferenceable bytes (partial 128-bit vector)
+  ret <3 x float> %r
+}
+
+; Full 128-bit vector load: all 16 dereferenceable bytes are already loaded, so there is nothing to widen.
+define <4 x float> @vec_with_4elts(<4 x float>* align 16 dereferenceable(16) %p) {
+; CHECK-LABEL: @vec_with_4elts(
+; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %r = load <4 x float>, <4 x float>* %p, align 16
+  ret <4 x float> %r
+}
+
+; <5 x float> is 20 bytes but only 16 bytes are marked dereferenceable, so we don't know we can load 256 bits.
+define <5 x float> @vec_with_5elts(<5 x float>* align 16 dereferenceable(16) %p) {
+; CHECK-LABEL: @vec_with_5elts(
+; CHECK-NEXT:    [[R:%.*]] = load <5 x float>, <5 x float>* [[P:%.*]], align 16
+; CHECK-NEXT:    ret <5 x float> [[R]]
+;
+  %r = load <5 x float>, <5 x float>* %p, align 16
+  ret <5 x float> %r
+}
+
+;-------------------------------------------------------------------------------
+
+; All 16 bytes are dereferenceable, so we can load 128 bits; the fact that the pointer is only 8-byte aligned isn't relevant.
+define <3 x float> @vec_with_3elts_underaligned(<3 x float>* align 8 dereferenceable(16) %p) {
+; CHECK-LABEL: @vec_with_3elts_underaligned(
+; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, <3 x float>* [[P:%.*]], align 8
+; CHECK-NEXT:    ret <3 x float> [[R]]
+;
+  %r = load <3 x float>, <3 x float>* %p, align 8 ; loads 12 of the 16 dereferenceable bytes
+  ret <3 x float> %r
+}
+
+; Only 12 bytes are marked dereferenceable, so that alone doesn't tell us we can load 128 bits; but since the pointer is 16-byte aligned, we can still do a wide load.
+; FIXME: this should still get widened.
+define <3 x float> @vec_with_3elts_underdereferenceable(<3 x float>* align 16 dereferenceable(12) %p) {
+; CHECK-LABEL: @vec_with_3elts_underdereferenceable(
+; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, <3 x float>* [[P:%.*]], align 16
+; CHECK-NEXT:    ret <3 x float> [[R]]
+;
+  %r = load <3 x float>, <3 x float>* %p, align 16 ; loads exactly the 12 dereferenceable bytes
+  ret <3 x float> %r
+}
+
+; With only 8-byte alignment and only 12 dereferenceable bytes, we can't tell if we can load 128 bits.
+define <3 x float> @vec_with_3elts_underaligned_underdereferenceable(<3 x float>* align 8 dereferenceable(12) %p) {
+; CHECK-LABEL: @vec_with_3elts_underaligned_underdereferenceable(
+; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, <3 x float>* [[P:%.*]], align 8
+; CHECK-NEXT:    ret <3 x float> [[R]]
+;
+  %r = load <3 x float>, <3 x float>* %p, align 8 ; loads exactly the 12 dereferenceable bytes
+  ret <3 x float> %r
+}
+
+;-------------------------------------------------------------------------------
+; Here we know we can load 256 bits as per dereferenceability and alignment.
+
+define <1 x float> @vec_with_1elt_256bits(<1 x float>* align 32 dereferenceable(32) %p) {
+; CHECK-LABEL: @vec_with_1elt_256bits(
+; CHECK-NEXT:    [[R:%.*]] = load <1 x float>, <1 x float>* [[P:%.*]], align 32
+; CHECK-NEXT:    ret <1 x float> [[R]]
+;
+  %r = load <1 x float>, <1 x float>* %p, align 32 ; single-element vector: loads 4 of the 32 dereferenceable bytes
+  ret <1 x float> %r
+}
+
+define <2 x float> @vec_with_2elts_256bits(<2 x float>* align 32 dereferenceable(32) %p) {
+; CHECK-LABEL: @vec_with_2elts_256bits(
+; CHECK-NEXT:    [[R:%.*]] = load <2 x float>, <2 x float>* [[P:%.*]], align 32
+; CHECK-NEXT:    ret <2 x float> [[R]]
+;
+  %r = load <2 x float>, <2 x float>* %p, align 32 ; loads 8 of the 32 dereferenceable bytes
+  ret <2 x float> %r
+}
+
+define <3 x float> @vec_with_3elts_256bits(<3 x float>* align 32 dereferenceable(32) %p) {
+; CHECK-LABEL: @vec_with_3elts_256bits(
+; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, <3 x float>* [[P:%.*]], align 32
+; CHECK-NEXT:    ret <3 x float> [[R]]
+;
+  %r = load <3 x float>, <3 x float>* %p, align 32 ; loads 12 of the 32 dereferenceable bytes
+  ret <3 x float> %r
+}
+
+define <4 x float> @vec_with_4elts_256bits(<4 x float>* align 32 dereferenceable(32) %p) {
+; CHECK-LABEL: @vec_with_4elts_256bits(
+; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 32
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %r = load <4 x float>, <4 x float>* %p, align 32 ; a full 128-bit vector: loads 16 of the 32 dereferenceable bytes
+  ret <4 x float> %r
+}
+
+define <5 x float> @vec_with_5elts_256bits(<5 x float>* align 32 dereferenceable(32) %p) {
+; CHECK-LABEL: @vec_with_5elts_256bits(
+; CHECK-NEXT:    [[R:%.*]] = load <5 x float>, <5 x float>* [[P:%.*]], align 32
+; CHECK-NEXT:    ret <5 x float> [[R]]
+;
+  %r = load <5 x float>, <5 x float>* %p, align 32 ; loads 20 of the 32 dereferenceable bytes (partial 256-bit vector)
+  ret <5 x float> %r
+}
+
+define <6 x float> @vec_with_6elts_256bits(<6 x float>* align 32 dereferenceable(32) %p) {
+; CHECK-LABEL: @vec_with_6elts_256bits(
+; CHECK-NEXT:    [[R:%.*]] = load <6 x float>, <6 x float>* [[P:%.*]], align 32
+; CHECK-NEXT:    ret <6 x float> [[R]]
+;
+  %r = load <6 x float>, <6 x float>* %p, align 32 ; loads 24 of the 32 dereferenceable bytes (partial 256-bit vector)
+  ret <6 x float> %r
+}
+
+define <7 x float> @vec_with_7elts_256bits(<7 x float>* align 32 dereferenceable(32) %p) {
+; CHECK-LABEL: @vec_with_7elts_256bits(
+; CHECK-NEXT:    [[R:%.*]] = load <7 x float>, <7 x float>* [[P:%.*]], align 32
+; CHECK-NEXT:    ret <7 x float> [[R]]
+;
+  %r = load <7 x float>, <7 x float>* %p, align 32 ; loads 28 of the 32 dereferenceable bytes (partial 256-bit vector)
+  ret <7 x float> %r
+}
+
+; Full 256-bit vector load: all 32 dereferenceable bytes are already loaded, so there is nothing to widen.
+define <8 x float> @vec_with_8elts_256bits(<8 x float>* align 32 dereferenceable(32) %p) {
+; CHECK-LABEL: @vec_with_8elts_256bits(
+; CHECK-NEXT:    [[R:%.*]] = load <8 x float>, <8 x float>* [[P:%.*]], align 32
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
+  %r = load <8 x float>, <8 x float>* %p, align 32
+  ret <8 x float> %r
+}
+
+; <9 x float> is 36 bytes but only 32 bytes are marked dereferenceable, so we can't tell if we can load more than 256 bits.
+define <9 x float> @vec_with_9elts_256bits(<9 x float>* align 32 dereferenceable(32) %p) {
+; CHECK-LABEL: @vec_with_9elts_256bits(
+; CHECK-NEXT:    [[R:%.*]] = load <9 x float>, <9 x float>* [[P:%.*]], align 32
+; CHECK-NEXT:    ret <9 x float> [[R]]
+;
+  %r = load <9 x float>, <9 x float>* %p, align 32
+  ret <9 x float> %r
+}
+
+;-------------------------------------------------------------------------------
+
+; Weird types we don't deal with.
+define <2 x i7> @vec_with_two_subbyte_elts(<2 x i7>* align 16 dereferenceable(16) %p) {
+; CHECK-LABEL: @vec_with_two_subbyte_elts(
+; CHECK-NEXT:    [[R:%.*]] = load <2 x i7>, <2 x i7>* [[P:%.*]], align 16
+; CHECK-NEXT:    ret <2 x i7> [[R]]
+;
+  %r = load <2 x i7>, <2 x i7>* %p, align 16 ; i7 elements are narrower than one byte
+  ret <2 x i7> %r
+}
+
+define <2 x i9> @vec_with_two_nonbyte_sized_elts(<2 x i9>* align 16 dereferenceable(16) %p) {
+; CHECK-LABEL: @vec_with_two_nonbyte_sized_elts(
+; CHECK-NEXT:    [[R:%.*]] = load <2 x i9>, <2 x i9>* [[P:%.*]], align 16
+; CHECK-NEXT:    ret <2 x i9> [[R]]
+;
+  %r = load <2 x i9>, <2 x i9>* %p, align 16 ; i9 elements are wider than a byte but not a whole number of bytes
+  ret <2 x i9> %r
+}
+
+define <2 x i24> @vec_with_two_nonpoweroftwo_sized_elts(<2 x i24>* align 16 dereferenceable(16) %p) {
+; CHECK-LABEL: @vec_with_two_nonpoweroftwo_sized_elts(
+; CHECK-NEXT:    [[R:%.*]] = load <2 x i24>, <2 x i24>* [[P:%.*]], align 16
+; CHECK-NEXT:    ret <2 x i24> [[R]]
+;
+  %r = load <2 x i24>, <2 x i24>* %p, align 16 ; i24 elements are a whole number of bytes, but not a power-of-two width
+  ret <2 x i24> %r
+}
+
+define <2 x float> @vec_with_2elts_addressspace(<2 x float> addrspace(2)* align 16 dereferenceable(16) %p) {
+; CHECK-LABEL: @vec_with_2elts_addressspace(
+; CHECK-NEXT:    [[R:%.*]] = load <2 x float>, <2 x float> addrspace(2)* [[P:%.*]], align 16
+; CHECK-NEXT:    ret <2 x float> [[R]]
+;
+  %r = load <2 x float>, <2 x float> addrspace(2)* %p, align 16 ; same as @vec_with_2elts, but the pointer is in address space 2
+  ret <2 x float> %r
+}
-- 
2.7.4