From 0d9673cff2955a21e7cd6d0623db464b90912d8b Mon Sep 17 00:00:00 2001
From: Yaxun Liu <Yaxun.Liu@amd.com>
Date: Sat, 4 Nov 2017 17:37:43 +0000
Subject: [PATCH] [AMDGPU] Remove hardcoded address space value from
 AMDGPULibFunc

AMDGPULibFunc hardcodes address space values of the old address space mapping,
which causes invalid addrspacecast instructions and undefined functions in
APPSDK sample MonteCarloAsianDP.

This patch fixes that.

Differential Revision: https://reviews.llvm.org/D39616

llvm-svn: 317409
---
 llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp     |   4 +-
 llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp      |  29 +++---
 llvm/lib/Target/AMDGPU/AMDGPULibFunc.h        |  20 +++--
 llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll | 122 +++++++++++++-------------
 4 files changed, 90 insertions(+), 85 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index e7e5475..97cce26 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1337,7 +1337,8 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
   // for OpenCL 2.0 we have only generic implementation of sincos
   // function.
   AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo);
-  nf.getLeads()[0].PtrKind = AMDGPULibFunc::GENERIC;
+  const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*M);
+  nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AS.FLAT_ADDRESS);
   Function *Fsincos = dyn_cast_or_null<Function>(getFunction(M, nf));
   if (!Fsincos) return false;
 
@@ -1350,7 +1351,6 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
   // The allocaInst allocates the memory in private address space. This need
   // to be bitcasted to point to the address space of cos pointer type.
   // In OpenCL 2.0 this is generic, while in 1.2 that is private.
-  const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*M);
   if (PTy->getPointerAddressSpace() != AS.PRIVATE_ADDRESS)
     P = B.CreateAddrSpaceCast(Alloc, PTy);
   CallInst *Call = CreateCallEx2(B, Fsincos, UI->getArgOperand(0), P);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
index 919e8c1..4671273 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "AMDGPU.h"
 #include "AMDGPULibFunc.h"
 #include <llvm/ADT/SmallString.h>
 #include <llvm/ADT/SmallVector.h>
@@ -458,13 +459,16 @@ AMDGPULibFunc::Param ParamIterator::getNextParam() {
       P.ArgType = AMDGPULibFunc::I32;
       break;
 
-    case E_CONSTPTR_SWAPGL:
-      switch (P.PtrKind & AMDGPULibFunc::ADDR_SPACE) {
-      case AMDGPULibFunc::GLOBAL: P.PtrKind = AMDGPULibFunc::LOCAL; break;
-      case AMDGPULibFunc::LOCAL:  P.PtrKind = AMDGPULibFunc::GLOBAL; break;
+    case E_CONSTPTR_SWAPGL: {
+      unsigned AS = AMDGPULibFunc::getAddrSpaceFromEPtrKind(P.PtrKind);
+      switch (AS) {
+      case AMDGPUAS::GLOBAL_ADDRESS: AS = AMDGPUAS::LOCAL_ADDRESS; break;
+      case AMDGPUAS::LOCAL_ADDRESS:  AS = AMDGPUAS::GLOBAL_ADDRESS; break;
       }
+      P.PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AS);
       P.PtrKind |= AMDGPULibFunc::CONST;
       break;
+    }
 
     default: llvm_unreachable("Unhandeled param rule");
     }
@@ -590,19 +594,14 @@ bool ItaniumParamParser::parseItaniumParam(StringRef& param,
   if (eatTerm(param, 'P')) {
     if (eatTerm(param, 'K')) res.PtrKind |= AMDGPULibFunc::CONST;
     if (eatTerm(param, 'V')) res.PtrKind |= AMDGPULibFunc::VOLATILE;
+    unsigned AS;
     if (!eatTerm(param, "U3AS")) {
-      res.PtrKind |= AMDGPULibFunc::PRIVATE;
+      AS = 0;
     } else {
-      switch(param.front()) {
-      case '1': res.PtrKind |= AMDGPULibFunc::GLOBAL;  break;
-      case '2': res.PtrKind |= AMDGPULibFunc::READONLY;break;
-      case '3': res.PtrKind |= AMDGPULibFunc::LOCAL;   break;
-      case '4': res.PtrKind |= AMDGPULibFunc::GENERIC; break;
-      case '5': res.PtrKind |= AMDGPULibFunc::OTHER;   break;
-      default: return false;
-      }
+      AS = param.front() - '0';
       drop_front(param, 1);
     }
+    res.PtrKind |= AMDGPULibFuncBase::getEPtrKindFromAddrSpace(AS);
   } else {
     res.PtrKind = AMDGPULibFunc::BYVALUE;
   }
@@ -837,7 +836,9 @@ public:
       os << 'P';
       if (p.PtrKind & AMDGPULibFunc::CONST) os << 'K';
       if (p.PtrKind & AMDGPULibFunc::VOLATILE) os << 'V';
-      int AS = UseAddrSpace ? (p.PtrKind & AMDGPULibFunc::ADDR_SPACE)-1 : 0;
+      unsigned AS = UseAddrSpace
+                        ? AMDGPULibFuncBase::getAddrSpaceFromEPtrKind(p.PtrKind)
+                        : 0;
       if (AS != 0) os << "U3AS" << AS;
       Ptr = p;
       p.PtrKind = 0;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
index 8f4297b..5405bc6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
@@ -283,14 +283,7 @@ public:
 
   enum EPtrKind {
     BYVALUE = 0,
-    PRIVATE,
-    GLOBAL,
-    READONLY,
-    LOCAL,
-    GENERIC,
-    OTHER,
-
-    ADDR_SPACE = 0xF,
+    ADDR_SPACE = 0xF, // Address space takes value 0x1 ~ 0xF.
     CONST      = 0x10,
     VOLATILE   = 0x20
   };
@@ -315,6 +308,17 @@ public:
   static bool isMangled(EFuncId Id) {
     return static_cast<unsigned>(Id) <= static_cast<unsigned>(EI_LAST_MANGLED);
   }
+
+  static unsigned getEPtrKindFromAddrSpace(unsigned AS) {
+    assert(((AS + 1) & ~ADDR_SPACE) == 0);
+    return AS + 1;
+  }
+
+  static unsigned getAddrSpaceFromEPtrKind(unsigned Kind) {
+    Kind = Kind & ADDR_SPACE;
+    assert(Kind >= 1);
+    return Kind - 1;
+  }
 };
 
 class AMDGPULibFuncImpl : public AMDGPULibFuncBase {
diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index 47eb9a9..aa6c183 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -1,11 +1,11 @@
-; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-POSTLINK %s
-; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-PRELINK %s
-; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-use-native -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-NATIVE %s
+; RUN: opt -S -O1 -mtriple=amdgcn---amdgiz -amdgpu-simplify-libcall <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-POSTLINK %s
+; RUN: opt -S -O1 -mtriple=amdgcn---amdgiz -amdgpu-simplify-libcall -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-PRELINK %s
+; RUN: opt -S -O1 -mtriple=amdgcn---amdgiz -amdgpu-use-native -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-NATIVE %s
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos
 ; GCN-POSTLINK: tail call fast float @_Z3sinf(
 ; GCN-POSTLINK: tail call fast float @_Z3cosf(
-; GCN-PRELINK: call fast float @_Z6sincosfPU3AS4f(
+; GCN-PRELINK: call fast float @_Z6sincosfPf(
 ; GCN-NATIVE: tail call fast float @_Z10native_sinf(
 ; GCN-NATIVE: tail call fast float @_Z10native_cosf(
 define amdgpu_kernel void @test_sincos(float addrspace(1)* nocapture %a) {
@@ -26,7 +26,7 @@ declare float @_Z3cosf(float)
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v2
 ; GCN-POSTLINK: tail call fast <2 x float> @_Z3sinDv2_f(
 ; GCN-POSTLINK: tail call fast <2 x float> @_Z3cosDv2_f(
-; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPU3AS4S_(
+; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPS_(
 ; GCN-NATIVE: tail call fast <2 x float> @_Z10native_sinDv2_f(
 ; GCN-NATIVE: tail call fast <2 x float> @_Z10native_cosDv2_f(
 define amdgpu_kernel void @test_sincos_v2(<2 x float> addrspace(1)* nocapture %a) {
@@ -47,7 +47,7 @@ declare <2 x float> @_Z3cosDv2_f(<2 x float>)
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v3
 ; GCN-POSTLINK: tail call fast <3 x float> @_Z3sinDv3_f(
 ; GCN-POSTLINK: tail call fast <3 x float> @_Z3cosDv3_f(
-; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPU3AS4S_(
+; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPS_(
 ; GCN-NATIVE: tail call fast <3 x float> @_Z10native_sinDv3_f(
 ; GCN-NATIVE: tail call fast <3 x float> @_Z10native_cosDv3_f(
 define amdgpu_kernel void @test_sincos_v3(<3 x float> addrspace(1)* nocapture %a) {
@@ -73,7 +73,7 @@ declare <3 x float> @_Z3cosDv3_f(<3 x float>)
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v4
 ; GCN-POSTLINK: tail call fast <4 x float> @_Z3sinDv4_f(
 ; GCN-POSTLINK: tail call fast <4 x float> @_Z3cosDv4_f(
-; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPU3AS4S_(
+; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPS_(
 ; GCN-NATIVE: tail call fast <4 x float> @_Z10native_sinDv4_f(
 ; GCN-NATIVE: tail call fast <4 x float> @_Z10native_cosDv4_f(
 define amdgpu_kernel void @test_sincos_v4(<4 x float> addrspace(1)* nocapture %a) {
@@ -94,7 +94,7 @@ declare <4 x float> @_Z3cosDv4_f(<4 x float>)
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v8
 ; GCN-POSTLINK: tail call fast <8 x float> @_Z3sinDv8_f(
 ; GCN-POSTLINK: tail call fast <8 x float> @_Z3cosDv8_f(
-; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPU3AS4S_(
+; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPS_(
 ; GCN-NATIVE: tail call fast <8 x float> @_Z10native_sinDv8_f(
 ; GCN-NATIVE: tail call fast <8 x float> @_Z10native_cosDv8_f(
 define amdgpu_kernel void @test_sincos_v8(<8 x float> addrspace(1)* nocapture %a) {
@@ -115,7 +115,7 @@ declare <8 x float> @_Z3cosDv8_f(<8 x float>)
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v16
 ; GCN-POSTLINK: tail call fast <16 x float> @_Z3sinDv16_f(
 ; GCN-POSTLINK: tail call fast <16 x float> @_Z3cosDv16_f(
-; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPU3AS4S_(
+; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPS_(
 ; GCN-NATIVE: tail call fast <16 x float> @_Z10native_sinDv16_f(
 ; GCN-NATIVE: tail call fast <16 x float> @_Z10native_cosDv16_f(
 define amdgpu_kernel void @test_sincos_v16(<16 x float> addrspace(1)* nocapture %a) {
@@ -685,101 +685,101 @@ define amdgpu_kernel void @test_use_native_sincos(float addrspace(1)* %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
   %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
-  %tmp1 = addrspacecast float addrspace(1)* %arrayidx1 to float addrspace(4)*
-  %call = tail call fast float @_Z6sincosfPU3AS4f(float %tmp, float addrspace(4)* %tmp1)
+  %tmp1 = addrspacecast float addrspace(1)* %arrayidx1 to float*
+  %call = tail call fast float @_Z6sincosfPf(float %tmp, float* %tmp1)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
 
-declare float @_Z6sincosfPU3AS4f(float, float addrspace(4)*)
+declare float @_Z6sincosfPf(float, float*)
 
 %opencl.pipe_t = type opaque
 %opencl.reserve_id_t = type opaque
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr)
-; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND:[0-9]+]]
-; GCN-PRELINK: call i32 @__read_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 2, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32* %{{.*}}) #[[NOUNWIND:[0-9]+]]
+; GCN-PRELINK: call i32 @__read_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t addrspace(5)* %{{.*}}, i32 2, i32* %{{.*}}) #[[NOUNWIND]]
 define amdgpu_kernel void @test_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) local_unnamed_addr {
 entry:
   %tmp = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
-  %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8 addrspace(4)*
-  %tmp2 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0
-  %tmp3 = tail call %opencl.reserve_id_t* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4)
-  %tmp4 = tail call i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 2, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0
-  tail call void @__commit_read_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 4, i32 4)
+  %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8*
+  %tmp2 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8* %tmp1, i32 4, i32 4) #0
+  %tmp3 = tail call %opencl.reserve_id_t addrspace(5)* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4)
+  %tmp4 = tail call i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 2, i8* %tmp1, i32 4, i32 4) #0
+  tail call void @__commit_read_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 4, i32 4)
   ret void
 }
 
-declare i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)*, i8 addrspace(4)*, i32, i32)
+declare i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)*, i8*, i32, i32)
 
-declare %opencl.reserve_id_t* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32)
+declare %opencl.reserve_id_t addrspace(5)* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32)
 
-declare i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i8 addrspace(4)*, i32, i32)
+declare i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i8*, i32, i32)
 
-declare void @__commit_read_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i32)
+declare void @__commit_read_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i32)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr)
-; GCN-PRELINK: call i32 @__write_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__write_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 2, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__write_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__write_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t addrspace(5)* %{{.*}}, i32 2, i32* %{{.*}}) #[[NOUNWIND]]
 define amdgpu_kernel void @test_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) local_unnamed_addr {
 entry:
   %tmp = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
-  %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8 addrspace(4)*
-  %tmp2 = tail call i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0
-  %tmp3 = tail call %opencl.reserve_id_t* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) #0
-  %tmp4 = tail call i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 2, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0
-  tail call void @__commit_write_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 4, i32 4) #0
+  %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8*
+  %tmp2 = tail call i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8* %tmp1, i32 4, i32 4) #0
+  %tmp3 = tail call %opencl.reserve_id_t addrspace(5)* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) #0
+  %tmp4 = tail call i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 2, i8* %tmp1, i32 4, i32 4) #0
+  tail call void @__commit_write_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 4, i32 4) #0
   ret void
 }
 
-declare i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)*, i8 addrspace(4)*, i32, i32) local_unnamed_addr
+declare i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)*, i8*, i32, i32) local_unnamed_addr
 
-declare %opencl.reserve_id_t* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32) local_unnamed_addr
+declare %opencl.reserve_id_t addrspace(5)* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32) local_unnamed_addr
 
-declare i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i8 addrspace(4)*, i32, i32) local_unnamed_addr
+declare i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i8*, i32, i32) local_unnamed_addr
 
-declare void @__commit_write_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i32) local_unnamed_addr
+declare void @__commit_write_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i32) local_unnamed_addr
 
 %struct.S = type { [100 x i32] }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pipe_size
-; GCN-PRELINK: call i32 @__read_pipe_2_1(%opencl.pipe_t addrspace(1)* %{{.*}} i8 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_2(%opencl.pipe_t addrspace(1)* %{{.*}} i16 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}} i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_8(%opencl.pipe_t addrspace(1)* %{{.*}} i64 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_16(%opencl.pipe_t addrspace(1)* %{{.*}}, <2 x i64> addrspace(4)* %{{.*}}) #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_32(%opencl.pipe_t addrspace(1)* %{{.*}}, <4 x i64> addrspace(4)* %{{.*}} #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_64(%opencl.pipe_t addrspace(1)* %{{.*}}, <8 x i64> addrspace(4)* %{{.*}} #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_128(%opencl.pipe_t addrspace(1)* %{{.*}}, <16 x i64> addrspace(4)* %{{.*}} #[[NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %{{.*}}, i8 addrspace(4)* %{{.*}} i32 400, i32 4) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_1(%opencl.pipe_t addrspace(1)* %{{.*}} i8* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_2(%opencl.pipe_t addrspace(1)* %{{.*}} i16* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}} i32* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_8(%opencl.pipe_t addrspace(1)* %{{.*}} i64* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_16(%opencl.pipe_t addrspace(1)* %{{.*}}, <2 x i64>* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_32(%opencl.pipe_t addrspace(1)* %{{.*}}, <4 x i64>* %{{.*}} #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_64(%opencl.pipe_t addrspace(1)* %{{.*}}, <8 x i64>* %{{.*}} #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_128(%opencl.pipe_t addrspace(1)* %{{.*}}, <16 x i64>* %{{.*}} #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %{{.*}}, i8* %{{.*}} i32 400, i32 4) #[[NOUNWIND]]
 define amdgpu_kernel void @test_pipe_size(%opencl.pipe_t addrspace(1)* %p1, i8 addrspace(1)* %ptr1, %opencl.pipe_t addrspace(1)* %p2, i16 addrspace(1)* %ptr2, %opencl.pipe_t addrspace(1)* %p4, i32 addrspace(1)* %ptr4, %opencl.pipe_t addrspace(1)* %p8, i64 addrspace(1)* %ptr8, %opencl.pipe_t addrspace(1)* %p16, <2 x i64> addrspace(1)* %ptr16, %opencl.pipe_t addrspace(1)* %p32, <4 x i64> addrspace(1)* %ptr32, %opencl.pipe_t addrspace(1)* %p64, <8 x i64> addrspace(1)* %ptr64, %opencl.pipe_t addrspace(1)* %p128, <16 x i64> addrspace(1)* %ptr128, %opencl.pipe_t addrspace(1)* %pu, %struct.S addrspace(1)* %ptru) local_unnamed_addr #0 {
 entry:
-  %tmp = addrspacecast i8 addrspace(1)* %ptr1 to i8 addrspace(4)*
-  %tmp1 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p1, i8 addrspace(4)* %tmp, i32 1, i32 1) #0
+  %tmp = addrspacecast i8 addrspace(1)* %ptr1 to i8*
+  %tmp1 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p1, i8* %tmp, i32 1, i32 1) #0
   %tmp2 = bitcast i16 addrspace(1)* %ptr2 to i8 addrspace(1)*
-  %tmp3 = addrspacecast i8 addrspace(1)* %tmp2 to i8 addrspace(4)*
-  %tmp4 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p2, i8 addrspace(4)* %tmp3, i32 2, i32 2) #0
+  %tmp3 = addrspacecast i8 addrspace(1)* %tmp2 to i8*
+  %tmp4 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p2, i8* %tmp3, i32 2, i32 2) #0
   %tmp5 = bitcast i32 addrspace(1)* %ptr4 to i8 addrspace(1)*
-  %tmp6 = addrspacecast i8 addrspace(1)* %tmp5 to i8 addrspace(4)*
-  %tmp7 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p4, i8 addrspace(4)* %tmp6, i32 4, i32 4) #0
+  %tmp6 = addrspacecast i8 addrspace(1)* %tmp5 to i8*
+  %tmp7 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p4, i8* %tmp6, i32 4, i32 4) #0
   %tmp8 = bitcast i64 addrspace(1)* %ptr8 to i8 addrspace(1)*
-  %tmp9 = addrspacecast i8 addrspace(1)* %tmp8 to i8 addrspace(4)*
-  %tmp10 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p8, i8 addrspace(4)* %tmp9, i32 8, i32 8) #0
+  %tmp9 = addrspacecast i8 addrspace(1)* %tmp8 to i8*
+  %tmp10 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p8, i8* %tmp9, i32 8, i32 8) #0
   %tmp11 = bitcast <2 x i64> addrspace(1)* %ptr16 to i8 addrspace(1)*
-  %tmp12 = addrspacecast i8 addrspace(1)* %tmp11 to i8 addrspace(4)*
-  %tmp13 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p16, i8 addrspace(4)* %tmp12, i32 16, i32 16) #0
+  %tmp12 = addrspacecast i8 addrspace(1)* %tmp11 to i8*
+  %tmp13 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p16, i8* %tmp12, i32 16, i32 16) #0
   %tmp14 = bitcast <4 x i64> addrspace(1)* %ptr32 to i8 addrspace(1)*
-  %tmp15 = addrspacecast i8 addrspace(1)* %tmp14 to i8 addrspace(4)*
-  %tmp16 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p32, i8 addrspace(4)* %tmp15, i32 32, i32 32) #0
+  %tmp15 = addrspacecast i8 addrspace(1)* %tmp14 to i8*
+  %tmp16 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p32, i8* %tmp15, i32 32, i32 32) #0
   %tmp17 = bitcast <8 x i64> addrspace(1)* %ptr64 to i8 addrspace(1)*
-  %tmp18 = addrspacecast i8 addrspace(1)* %tmp17 to i8 addrspace(4)*
-  %tmp19 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p64, i8 addrspace(4)* %tmp18, i32 64, i32 64) #0
+  %tmp18 = addrspacecast i8 addrspace(1)* %tmp17 to i8*
+  %tmp19 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p64, i8* %tmp18, i32 64, i32 64) #0
   %tmp20 = bitcast <16 x i64> addrspace(1)* %ptr128 to i8 addrspace(1)*
-  %tmp21 = addrspacecast i8 addrspace(1)* %tmp20 to i8 addrspace(4)*
-  %tmp22 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p128, i8 addrspace(4)* %tmp21, i32 128, i32 128) #0
+  %tmp21 = addrspacecast i8 addrspace(1)* %tmp20 to i8*
+  %tmp22 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p128, i8* %tmp21, i32 128, i32 128) #0
   %tmp23 = bitcast %struct.S addrspace(1)* %ptru to i8 addrspace(1)*
-  %tmp24 = addrspacecast i8 addrspace(1)* %tmp23 to i8 addrspace(4)*
-  %tmp25 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %pu, i8 addrspace(4)* %tmp24, i32 400, i32 4) #0
+  %tmp24 = addrspacecast i8 addrspace(1)* %tmp23 to i8*
+  %tmp25 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %pu, i8* %tmp24, i32 400, i32 4) #0
   ret void
 }
 
-- 
2.7.4