From: Matt Arsenault
Date: Wed, 31 Aug 2016 21:52:27 +0000 (+0000)
Subject: AMDGPU: Fix introducing stack access on unaligned v16i8
X-Git-Tag: llvmorg-4.0.0-rc1~10940
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b50eb8dc2b3cb0005eb0187e1cb9fe7a68a0a070;p=platform%2Fupstream%2Fllvm.git

AMDGPU: Fix introducing stack access on unaligned v16i8

llvm-svn: 280298
---
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 32e40a2..682157b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2012,6 +2012,9 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
   // problems during legalization, the emitted instructions to pack and unpack
   // the bytes again are not eliminated in the case of an unaligned copy.
   if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+    if (VT.isVector())
+      return scalarizeVectorLoad(LN, DAG);
+
     SDValue Ops[2];
     std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
     return DAG.getMergeValues(Ops, SDLoc(N));
@@ -2060,8 +2063,12 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
   // order problems during legalization, the emitted instructions to pack and
   // unpack the bytes again are not eliminated in the case of an unaligned
   // copy.
-  if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast))
+  if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+    if (VT.isVector())
+      return scalarizeVectorStore(SN, DAG);
+
     return expandUnalignedStore(SN, DAG);
+  }
 
   if (!IsFast)
     return SDValue();
diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
index 08bdc3a..adba6bb 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
@@ -1,15 +1,13 @@
-; XFAIL: *
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s
-; XUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
 ;
 ; EG-LABEL: {{^}}sext_in_reg_v2i1_in_v2i32_other_amount:
-; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: MEM_{{.*}} MSKOR [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
 ; EG-NOT: BFE
 ; EG: ADD_INT
 ; EG: LSHL
-; EG: ASHR [[RES]]
+; EG: ASHR
 ; EG: LSHL
-; EG: ASHR [[RES]]
+; EG: ASHR
 ; EG: LSHR {{\*?}} [[ADDR]]
 
 ; Works with the align 2 removed
diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
index 129748a..453c981 100644
--- a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -552,4 +552,53 @@ define void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspac
   ret void
 }
 
+; SI-LABEL: {{^}}local_load_align1_v16i8:
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI: ScratchSize: 0{{$}}
+define void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(3)* %in) #0 {
+  %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 1
+  store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}local_store_align1_v16i8:
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+
+; SI: ScratchSize: 0{{$}}
+define void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out) #0 {
+  store <16 x i8> zeroinitializer, <16 x i8> addrspace(3)* %out, align 1
+  ret void
+}
+
 attributes #0 = { nounwind }
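
For reference, a minimal standalone reproducer sketch (hypothetical function name, not part of the commit; it simply mirrors the two tests added above). Before this change, an align-1 <16 x i8> load or store was expanded through expandUnalignedLoad/expandUnalignedStore, and the leftover pack/unpack code survived as scratch (stack) traffic; with the scalarizeVectorLoad/scalarizeVectorStore path the same IR should compile to plain byte accesses (ds_read_u8/ds_write_b8 for local memory) and report ScratchSize: 0. Something like "llc -march=amdgcn -verify-machineinstrs" over this file can be used to inspect the output.

; Hypothetical reproducer, assuming the 2016-era typed-pointer IR syntax used by the tests above.
define void @copy_unaligned_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
  %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 1    ; unaligned vector load, scalarized to 16 byte loads
  store <16 x i8> %ld, <16 x i8> addrspace(3)* %out, align 1    ; unaligned vector store, scalarized to 16 byte stores
  ret void
}

attributes #0 = { nounwind }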