--- /dev/null
+; ModuleID = 'add.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry: ; <2 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; undef base: both lanes are written before the ret
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2 ; both lanes populated
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry: ; <3 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; undef base: all three lanes are written before the ret
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone ; lane 2
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4 ; all three lanes populated
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry: ; <4 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; undef base: all four lanes are written before the ret
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone ; lane 2
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone ; lane 3
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6 ; all four lanes populated
+}
+
+define ptx_kernel i32 @add(i32 %x, i32 %y) nounwind readnone noinline {
+entry: ; returns the sum of the two i32 arguments
+ %add = add i32 %y, %x ; plain add with no nsw/nuw flags, so overflow wraps
+ ret i32 %add
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{i32 (i32, i32)* @add}
+++ /dev/null
-; ModuleID = 'add.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel i32 @add(i32 %x, i32 %y) nounwind readnone noinline {
-entry:
- %add = add i32 %y, %x
- ret i32 %add
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{i32 (i32, i32)* @add}
--- /dev/null
+; ModuleID = 'add2.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.big = type { i32, i32 }
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry: ; <2 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; undef base: both lanes are written before the ret
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2 ; both lanes populated
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry: ; <3 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; undef base: all three lanes are written before the ret
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone ; lane 2
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4 ; all three lanes populated
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry: ; <4 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; undef base: all four lanes are written before the ret
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone ; lane 2
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone ; lane 3
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6 ; all four lanes populated
+}
+
+define ptx_kernel void @add(%struct.big* noalias nocapture sret %agg.result, i32 %x, i32 %y) nounwind noinline {
+entry: ; fills the sret struct: field 0 = %y + %x, field 1 = (%x + 10) - %y
+ %add = add i32 %y, %x
+ %sub = add i32 %x, 10 ; despite the name, this is an add of the constant 10
+ %add1 = sub i32 %sub, %y ; despite the name, this is a subtract: (%x + 10) - %y
+ %agg.result.0 = getelementptr inbounds %struct.big* %agg.result, i32 0, i32 0 ; address of field 0
+ store i32 %add, i32* %agg.result.0, align 4
+ %agg.result.1 = getelementptr inbounds %struct.big* %agg.result, i32 0, i32 1 ; address of field 1
+ store i32 %add1, i32* %agg.result.1, align 4
+ ret void ; the result is delivered through %agg.result
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (%struct.big*, i32, i32)* @add}
+++ /dev/null
-; ModuleID = 'add2.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-%struct.big = type { i32, i32 }
-
-define ptx_kernel void @add(%struct.big* noalias nocapture sret %agg.result, i32 %x, i32 %y) nounwind noinline {
-entry:
- %add = add i32 %y, %x
- %sub = add i32 %x, 10
- %add1 = sub i32 %sub, %y
- %agg.result.0 = getelementptr inbounds %struct.big* %agg.result, i32 0, i32 0
- store i32 %add, i32* %agg.result.0, align 4
- %agg.result.1 = getelementptr inbounds %struct.big* %agg.result, i32 0, i32 1
- store i32 %add1, i32* %agg.result.1, align 4
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (%struct.big*, i32, i32)* @add}
--- /dev/null
+; ModuleID = 'cmp.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry: ; <2 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; undef base: both lanes are written before the ret
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2 ; both lanes populated
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry: ; <3 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; undef base: all three lanes are written before the ret
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone ; lane 2
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4 ; all three lanes populated
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry: ; <4 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; undef base: all four lanes are written before the ret
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone ; lane 2
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone ; lane 3
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6 ; all four lanes populated
+}
+
+define ptx_kernel void @test_cmp(i8 addrspace(1)* nocapture %dst, i32 %x, i32 %y, float %z, float %w) nounwind noinline {
+entry: ; stores 1 to %dst when (%x < %y, signed) or (%z > %w, ordered) holds, otherwise 0
+ %cmp = icmp slt i32 %x, %y
+ %conv = zext i1 %cmp to i32 ; 0 or 1
+ %cmp1 = fcmp ogt float %z, %w ; ordered compare: false when either operand is NaN
+ %add = sext i1 %cmp1 to i32 ; 0 or -1; named %add but no addition happens on this line
+ %tobool = icmp ne i32 %conv, %add ; {0,1} vs {0,-1} are equal only when both compares are false
+ %frombool = zext i1 %tobool to i8
+ store i8 %frombool, i8 addrspace(1)* %dst, align 1, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i8 addrspace(1)*, i32, i32, float, float)* @test_cmp}
+!1 = metadata !{metadata !"bool", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-; ModuleID = 'cmp.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @test_cmp(i8* nocapture %dst, i32 %x, i32 %y, float %z, float %w) nounwind noinline {
-entry:
- %cmp = icmp slt i32 %x, %y
- %conv = zext i1 %cmp to i32
- %cmp1 = fcmp ogt float %z, %w
- %add = sext i1 %cmp1 to i32
- %tobool = icmp ne i32 %conv, %add
- %frombool = zext i1 %tobool to i8
- store i8 %frombool, i8* %dst, align 1, !tbaa !1
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i8*, i32, i32, float, float)* @test_cmp}
-!1 = metadata !{metadata !"bool", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
--- /dev/null
+; ModuleID = 'cmp_cvt.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry: ; <2 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; undef base: both lanes are written before the ret
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2 ; both lanes populated
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry: ; <3 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; undef base: all three lanes are written before the ret
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone ; lane 2
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4 ; all three lanes populated
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry: ; <4 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; undef base: all four lanes are written before the ret
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone ; lane 2
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone ; lane 3
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6 ; all four lanes populated
+}
+
+define ptx_kernel void @cmp_cvt(i32 addrspace(1)* nocapture %dst, i32 %x, i32 %y) nounwind noinline {
+get_local_id.exit: ; stores ((%y + %x) < result of @__gen_ocl_get_local_id0, unsigned) as 0/1 into %dst
+ %add = add nsw i32 %y, %x
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone ; only declared in this module; its body is not visible here
+ %cmp = icmp ult i32 %add, %call.i ; unsigned compare, even though the add above carries nsw
+ %conv = zext i1 %cmp to i32 ; 0 or 1
+ store i32 %conv, i32 addrspace(1)* %dst, align 4, !tbaa !1
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32, i32)* @cmp_cvt}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-; ModuleID = 'cmp_cvt.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @cmp_cvt(i32* nocapture %dst, i32 %x, i32 %y) nounwind noinline {
-get_local_id.exit:
- %add = add nsw i32 %y, %x
- %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
- %cmp = icmp ult i32 %add, %call.i
- %conv = zext i1 %cmp to i32
- store i32 %conv, i32* %dst, align 4, !tbaa !1
- ret void
-}
-
-declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i32*, i32, i32)* @cmp_cvt}
-!1 = metadata !{metadata !"int", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
-clang -emit-llvm -O3 -ccc-host-triple ptx32 -c $1.cl -o $1.o
-llvm-dis $1.o
-rm $1.o
-mv $1.o.ll $1.ll
+#!/bin/bash
+# Compile the source file given as $1 to textual LLVM IR for the ptx32 target.
+# Steps: clang emits bitcode to $1.o, llvm-dis disassembles it to $1.o.ll,
+# the bitcode is deleted and the listing is renamed to $1.ll.
+#
+# Stop at the first failing command so that a failed compile never goes on to
+# disassemble, delete or rename a stale $1.o left behind by an earlier run.
+set -e
+if [ $# -lt 1 ]; then
+ echo "usage: $0 <source-file>" >&2
+ exit 1
+fi
+# Every expansion is quoted so paths with spaces or glob characters survive.
+clang -emit-llvm -O3 -ccc-host-triple ptx32 -c "$1" -o "$1.o"
+llvm-dis "$1.o"
+rm "$1.o"
+mv "$1.o.ll" "$1.ll"
--- /dev/null
+; ModuleID = 'complex_struct.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.my_struct = type { i32, [5 x %struct.hop] }
+%struct.hop = type { float, float }
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry: ; <2 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; undef base: both lanes are written before the ret
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2 ; both lanes populated
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry: ; <3 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; undef base: all three lanes are written before the ret
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone ; lane 2
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4 ; all three lanes populated
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry: ; <4 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; undef base: all four lanes are written before the ret
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone ; lane 2
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone ; lane 3
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6 ; all four lanes populated
+}
+
+define ptx_kernel void @struct_cl(%struct.my_struct addrspace(1)* nocapture %dst, %struct.my_struct addrspace(1)* nocapture %src) nounwind noinline {
+entry: ; copies a single float from a nested field of %src[1] into a nested field of %dst[0]
+ %x = getelementptr inbounds %struct.my_struct addrspace(1)* %src, i32 1, i32 1, i32 3, i32 0 ; second my_struct, field 1 (the [5 x hop] array), element 3, first float
+ %0 = load float addrspace(1)* %x, align 4, !tbaa !1
+ %y = getelementptr inbounds %struct.my_struct addrspace(1)* %dst, i32 0, i32 1, i32 2, i32 1 ; first my_struct, field 1 (the [5 x hop] array), element 2, second float
+ store float %0, float addrspace(1)* %y, align 4, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (%struct.my_struct addrspace(1)*, %struct.my_struct addrspace(1)*)* @struct_cl}
+!1 = metadata !{metadata !"float", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
--- /dev/null
+; ModuleID = 'complex_struct.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.my_struct = type { i32, [5 x %struct.hop] }
+%struct.hop = type { float, float }
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry: ; <2 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; undef base: both lanes are written before the ret
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2 ; both lanes populated
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry: ; <3 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; undef base: all three lanes are written before the ret
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone ; lane 2
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4 ; all three lanes populated
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry: ; <4 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; undef base: all four lanes are written before the ret
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone ; lane 2
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone ; lane 3
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6 ; all four lanes populated
+}
+
+define ptx_kernel void @struct_cl(%struct.my_struct addrspace(1)* nocapture %dst, %struct.my_struct addrspace(1)* nocapture %src) nounwind noinline {
+entry: ; copies a single float from a nested field of %src[1] into a nested field of %dst[0]
+ %x = getelementptr inbounds %struct.my_struct addrspace(1)* %src, i32 1, i32 1, i32 3, i32 0 ; second my_struct, field 1 (the [5 x hop] array), element 3, first float
+ %0 = load float addrspace(1)* %x, align 4, !tbaa !1
+ %y = getelementptr inbounds %struct.my_struct addrspace(1)* %dst, i32 0, i32 1, i32 2, i32 1 ; first my_struct, field 1 (the [5 x hop] array), element 2, second float
+ store float %0, float addrspace(1)* %y, align 4, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (%struct.my_struct addrspace(1)*, %struct.my_struct addrspace(1)*)* @struct_cl}
+!1 = metadata !{metadata !"float", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
--- /dev/null
+; ModuleID = 'cycle.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry: ; <2 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; undef base: both lanes are written before the ret
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2 ; both lanes populated
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry: ; <3 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; undef base: all three lanes are written before the ret
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone ; lane 2
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4 ; all three lanes populated
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry: ; <4 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; undef base: all four lanes are written before the ret
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone ; lane 2
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone ; lane 3
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6 ; all four lanes populated
+}
+
+define ptx_kernel void @cycle(i32 addrspace(1)* nocapture %dst) noreturn nounwind readnone noinline {
+entry: ; jumps straight into the self-looping block; %dst is never used
+ br label %hop0
+
+hop0: ; preds = %hop0, %entry
+ br label %hop0 ; unconditional self-branch: the function has no ret, matching its noreturn attribute
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*)* @cycle}
+++ /dev/null
-; ModuleID = 'cycle.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @cycle(i32* nocapture %dst) noreturn nounwind readnone noinline {
-entry:
- br label %hop0
-
-hop0: ; preds = %hop0, %entry
- br label %hop0
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i32*)* @cycle}
+++ /dev/null
-; ModuleID = 'void.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @hop() nounwind readnone noinline {
-entry:
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void ()* @hop}
--- /dev/null
+; ModuleID = 'extract.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry: ; <2 x float> overload: forwards lane i of %a, %b, %c to the scalar @_Z3madfff and rebuilds the vector
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone ; lane 0
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; undef base: both lanes are written before the ret
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone ; lane 1
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2 ; both lanes populated
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+; Three-lane vector wrapper around the scalar @_Z3madfff (only a declaration
+; of the callee appears in this module). The mangled name encodes `mad` taking
+; three 3-element float vectors. Each lane of %a, %b, %c is extracted, passed
+; to the scalar callee, and the result inserted into the returned <3 x float>.
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ ; lane 0
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ ; result vector starts as undef; every lane is overwritten before the return
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ ; lane 1
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ ; lane 2
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+; Four-lane vector wrapper around the scalar @_Z3madfff (only a declaration of
+; the callee appears in this module). The mangled name encodes `mad` taking
+; three 4-element float vectors. Each lane of %a, %b, %c is extracted, passed
+; to the scalar callee, and the result inserted into the returned <4 x float>.
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ ; lane 0
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ ; result vector starts as undef; every lane is overwritten before the return
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ ; lane 1
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ ; lane 2
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ ; lane 3
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+; Kernel @extract: loads one <4 x i32> from %src (addrspace 1), keeps only its
+; lane 0, and stores the vector { src.lane0, 1, 2, 3 } to %dst (addrspace 1).
+; The scalar argument %c is not used in the body.
+define ptx_kernel void @extract(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src, i32 %c) nounwind noinline {
+entry:
+ %0 = load <4 x i32> addrspace(1)* %src, align 16, !tbaa !1
+ ; only element 0 of the loaded vector is consumed
+ %1 = extractelement <4 x i32> %0, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %1, i32 0
+ ; lanes 1..3 are filled with the constants 1, 2, 3
+ %vecinit1 = insertelement <4 x i32> %vecinit, i32 1, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit1, i32 2, i32 2
+ %vecinit3 = insertelement <4 x i32> %vecinit2, i32 3, i32 3
+ store <4 x i32> %vecinit3, <4 x i32> addrspace(1)* %dst, align 16, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x i32> addrspace(1)*, <4 x i32> addrspace(1)*, i32)* @extract}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-; ModuleID = 'extract.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @extract(<4 x i32>* nocapture %dst, <4 x i32>* nocapture %src, i32 %c) nounwind noinline {
-entry:
- %0 = load <4 x i32>* %src, align 16, !tbaa !1
- %1 = extractelement <4 x i32> %0, i32 0
- %vecinit = insertelement <4 x i32> undef, i32 %1, i32 0
- %vecinit1 = insertelement <4 x i32> %vecinit, i32 1, i32 1
- %vecinit2 = insertelement <4 x i32> %vecinit1, i32 2, i32 2
- %vecinit3 = insertelement <4 x i32> %vecinit2, i32 3, i32 3
- store <4 x i32> %vecinit3, <4 x i32>* %dst, align 16, !tbaa !1
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (<4 x i32>*, <4 x i32>*, i32)* @extract}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
--- /dev/null
+; ModuleID = 'function.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+; Two-lane vector wrapper around the scalar @_Z3madfff (only a declaration of
+; the callee appears in this module). The mangled name encodes `mad` taking
+; three 2-element float vectors. Each lane of %a, %b, %c is extracted, passed
+; to the scalar callee, and the result inserted into the returned <2 x float>.
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ ; lane 0
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ ; result vector starts as undef; every lane is overwritten before the return
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ ; lane 1
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+; Three-lane vector wrapper around the scalar @_Z3madfff (only a declaration
+; of the callee appears in this module). The mangled name encodes `mad` taking
+; three 3-element float vectors. Each lane of %a, %b, %c is extracted, passed
+; to the scalar callee, and the result inserted into the returned <3 x float>.
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ ; lane 0
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ ; result vector starts as undef; every lane is overwritten before the return
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ ; lane 1
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ ; lane 2
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+; Four-lane vector wrapper around the scalar @_Z3madfff (only a declaration of
+; the callee appears in this module). The mangled name encodes `mad` taking
+; three 4-element float vectors. Each lane of %a, %b, %c is extracted, passed
+; to the scalar callee, and the result inserted into the returned <4 x float>.
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ ; lane 0
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ ; result vector starts as undef; every lane is overwritten before the return
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ ; lane 1
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ ; lane 2
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ ; lane 3
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+; Device function @write: stores the constant 1 through %dst (addrspace 1).
+; It uses the ptx_device calling convention and is not listed in
+; !opencl.kernels for this module.
+define ptx_device void @write(i32 addrspace(1)* nocapture %dst) nounwind {
+entry:
+ store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1
+ ret void
+}
+
+; Kernel @write2: stores the constant 1 to dst[0] and then to dst[x], where
+; %dst is an addrspace(1) i32 pointer and %x is the i32 element index.
+define ptx_kernel void @write2(i32 addrspace(1)* nocapture %dst, i32 %x) nounwind noinline {
+entry:
+ ; first store goes through %dst itself (element 0)
+ store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1
+ ; second store goes to element %x, addressed with an i32-typed GEP
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %x
+ store i32 1, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32)* @write2}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-; ModuleID = 'function.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_device void @write(i32 addrspace(1)* nocapture %dst) nounwind {
-entry:
- store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1
- ret void
-}
-
-define ptx_kernel void @write2(i32 addrspace(1)* nocapture %dst, i32 %x) nounwind noinline {
-entry:
- store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1
- %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %x
- store i32 1, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i32 addrspace(1)*, i32)* @write2}
-!1 = metadata !{metadata !"int", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
--- /dev/null
+; ModuleID = 'function_param.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.struct0 = type { [5 x i32], i32, i32, i32 }
+
+; Two-lane vector wrapper around the scalar @_Z3madfff (only a declaration of
+; the callee appears in this module). The mangled name encodes `mad` taking
+; three 2-element float vectors. Each lane of %a, %b, %c is extracted, passed
+; to the scalar callee, and the result inserted into the returned <2 x float>.
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ ; lane 0
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ ; result vector starts as undef; every lane is overwritten before the return
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ ; lane 1
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+; Three-lane vector wrapper around the scalar @_Z3madfff (only a declaration
+; of the callee appears in this module). The mangled name encodes `mad` taking
+; three 3-element float vectors. Each lane of %a, %b, %c is extracted, passed
+; to the scalar callee, and the result inserted into the returned <3 x float>.
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ ; lane 0
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ ; result vector starts as undef; every lane is overwritten before the return
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ ; lane 1
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ ; lane 2
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+; Four-lane vector wrapper around the scalar @_Z3madfff (only a declaration of
+; the callee appears in this module). The mangled name encodes `mad` taking
+; three 4-element float vectors. Each lane of %a, %b, %c is extracted, passed
+; to the scalar callee, and the result inserted into the returned <4 x float>.
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ ; lane 0
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ ; result vector starts as undef; every lane is overwritten before the return
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ ; lane 1
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ ; lane 2
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ ; lane 3
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+; Kernel @param: takes an addrspace(1) struct pointer (%dst), a by-value
+; struct (%s), an addrspace(4) i32 pointer (%h) and two i32 scalars.
+;   1. s.<field 0>[4] += h[4] + x      (written into the byval copy)
+;   2. memcpy 32 bytes from %s to %dst
+;   3. dst.<field 2> += y
+; %struct.struct0 is { [5 x i32], i32, i32, i32 }, i.e. eight i32 members.
+define ptx_kernel void @param(%struct.struct0 addrspace(1)* nocapture %dst, %struct.struct0* nocapture byval %s, i32 addrspace(4)* nocapture %h, i32 %x, i32 %y) nounwind noinline {
+entry:
+ ; read h[4] from address space 4
+ %arrayidx = getelementptr inbounds i32 addrspace(4)* %h, i32 4
+ %0 = load i32 addrspace(4)* %arrayidx, align 4, !tbaa !1
+ ; address of element 4 of the leading [5 x i32] array inside the byval struct
+ %arrayidx1 = getelementptr inbounds %struct.struct0* %s, i32 0, i32 0, i32 4
+ %1 = load i32* %arrayidx1, align 4, !tbaa !1
+ %add = add i32 %0, %x
+ %add2 = add i32 %add, %1
+ store i32 %add2, i32* %arrayidx1, align 4, !tbaa !1
+ ; copy the (just modified) struct to %dst: length 32, alignment 4, non-volatile
+ %2 = bitcast %struct.struct0 addrspace(1)* %dst to i8 addrspace(1)*
+ %3 = bitcast %struct.struct0* %s to i8*
+ tail call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* %2, i8* %3, i32 32, i32 4, i1 false)
+ ; read-modify-write of struct field index 2 in the destination copy
+ %y5 = getelementptr inbounds %struct.struct0 addrspace(1)* %dst, i32 0, i32 2
+ %4 = load i32 addrspace(1)* %y5, align 4, !tbaa !1
+ %add6 = add nsw i32 %4, %y
+ store i32 %add6, i32 addrspace(1)* %y5, align 4, !tbaa !1
+ ret void
+}
+
+declare void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture, i32, i32, i1) nounwind
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (%struct.struct0 addrspace(1)*, %struct.struct0*, i32 addrspace(4)*, i32, i32)* @param}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-; ModuleID = 'function_param.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-%struct.struct0 = type { [5 x i32], i32, i32, i32 }
-
-define ptx_kernel void @param(%struct.struct0 addrspace(1)* nocapture %dst, %struct.struct0* nocapture byval %s, i32 addrspace(4)* nocapture %h, i32 %x, i32 %y) nounwind noinline {
-entry:
- %arrayidx = getelementptr inbounds i32 addrspace(4)* %h, i32 4
- %0 = load i32 addrspace(4)* %arrayidx, align 4, !tbaa !1
- %arrayidx1 = getelementptr inbounds %struct.struct0* %s, i32 0, i32 0, i32 4
- %1 = load i32* %arrayidx1, align 4, !tbaa !1
- %add = add i32 %0, %x
- %add2 = add i32 %add, %1
- store i32 %add2, i32* %arrayidx1, align 4, !tbaa !1
- %2 = bitcast %struct.struct0 addrspace(1)* %dst to i8 addrspace(1)*
- %3 = bitcast %struct.struct0* %s to i8*
- tail call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* %2, i8* %3, i32 32, i32 4, i1 false)
- %y5 = getelementptr inbounds %struct.struct0 addrspace(1)* %dst, i32 0, i32 2
- %4 = load i32 addrspace(1)* %y5, align 4, !tbaa !1
- %add6 = add nsw i32 %4, %y
- store i32 %add6, i32 addrspace(1)* %y5, align 4, !tbaa !1
- ret void
-}
-
-declare void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture, i32, i32, i1) nounwind
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (%struct.struct0 addrspace(1)*, %struct.struct0*, i32 addrspace(4)*, i32, i32)* @param}
-!1 = metadata !{metadata !"int", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
--- /dev/null
+; ModuleID = 'get_global_id.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+; Two-lane vector wrapper around the scalar @_Z3madfff (only a declaration of
+; the callee appears in this module). The mangled name encodes `mad` taking
+; three 2-element float vectors. Each lane of %a, %b, %c is extracted, passed
+; to the scalar callee, and the result inserted into the returned <2 x float>.
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ ; lane 0
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ ; result vector starts as undef; every lane is overwritten before the return
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ ; lane 1
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+; Three-lane vector wrapper around the scalar @_Z3madfff (only a declaration
+; of the callee appears in this module). The mangled name encodes `mad` taking
+; three 3-element float vectors. Each lane of %a, %b, %c is extracted, passed
+; to the scalar callee, and the result inserted into the returned <3 x float>.
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ ; lane 0
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ ; result vector starts as undef; every lane is overwritten before the return
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ ; lane 1
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ ; lane 2
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+; Four-lane vector wrapper around the scalar @_Z3madfff (only a declaration of
+; the callee appears in this module). The mangled name encodes `mad` taking
+; three 4-element float vectors. Each lane of %a, %b, %c is extracted, passed
+; to the scalar callee, and the result inserted into the returned <4 x float>.
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ ; lane 0
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ ; result vector starts as undef; every lane is overwritten before the return
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ ; lane 1
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ ; lane 2
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ ; lane 3
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+; Kernel @test_global_id: queries @__gen_ocl_get_local_id0 and
+; @__gen_ocl_get_global_id0 (both only declared in this module), then writes
+;   dst[global_id] = local id with its low 16 bits sign-extended to i32
+;   p[global_id]   = local id, unmodified
+; Both %dst and %p are addrspace(1) i32 pointers.
+define ptx_kernel void @test_global_id(i32 addrspace(1)* nocapture %dst, i32 addrspace(1)* nocapture %p) nounwind noinline {
+get_global_id.exit13:
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ ; shl 16 followed by ashr 16 keeps the low 16 bits and sign-extends them;
+ ; `exact` holds because the shift left zeroed the bits that ashr discards
+ %sext = shl i32 %call.i, 16
+ %conv1 = ashr exact i32 %sext, 16
+ %call.i6 = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+ ; both stores are indexed by the global id
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %call.i6
+ store i32 %conv1, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %arrayidx5 = getelementptr inbounds i32 addrspace(1)* %p, i32 %call.i6
+ store i32 %call.i, i32 addrspace(1)* %arrayidx5, align 4, !tbaa !1
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32 addrspace(1)*)* @test_global_id}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-; ModuleID = 'get_global_id.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @test_global_id(i32* nocapture %dst, i32* nocapture %p) nounwind noinline {
-get_global_id.exit13:
- %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
- %sext = shl i32 %call.i, 16
- %conv1 = ashr exact i32 %sext, 16
- %call.i6 = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds i32* %dst, i32 %call.i6
- store i32 %conv1, i32* %arrayidx, align 4, !tbaa !1
- %arrayidx5 = getelementptr inbounds i32* %p, i32 %call.i6
- store i32 %call.i, i32* %arrayidx5, align 4, !tbaa !1
- ret void
-}
-
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
-
-declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i32*, i32*)* @test_global_id}
-!1 = metadata !{metadata !"int", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-; ModuleID = 'gg.ll'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-%struct.my_struct = type { i32, [2 x i32] }
-
-@g = addrspace(1) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 4
-@struct_cl.array = internal addrspace(4) global [256 x %struct.my_struct] zeroinitializer, align 4
-
-define ptx_kernel void @struct_cl(%struct.my_struct* byval %s, i32 %x, i32* %mem, i32 %y) nounwind noinline {
-entry:
- %x.addr = alloca i32, align 4
- %mem.addr = alloca i32*, align 4
- %y.addr = alloca i32, align 4
- %i = alloca i32, align 4
- store i32 %x, i32* %x.addr, align 4
- store i32* %mem, i32** %mem.addr, align 4
- store i32 %y, i32* %y.addr, align 4
- store i32 0, i32* %i, align 4
- br label %for.cond
-
-for.cond: ; preds = %for.inc, %entry
- %0 = load i32* %i, align 4
- %cmp = icmp slt i32 %0, 256
- br i1 %cmp, label %for.body, label %for.end
-
-for.body: ; preds = %for.cond
- %1 = load i32* %i, align 4
- %2 = load i32* %i, align 4
- %arrayidx = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %2
- %a = getelementptr inbounds %struct.my_struct addrspace(4)* %arrayidx, i32 0, i32 0
- store i32 %1, i32 addrspace(4)* %a, align 4
- %3 = load i32* %i, align 4
- %4 = load i32* %i, align 4
- %arrayidx1 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %4
- %b = getelementptr inbounds %struct.my_struct addrspace(4)* %arrayidx1, i32 0, i32 1
- %arrayidx2 = getelementptr inbounds [2 x i32] addrspace(4)* %b, i32 0, i32 0
- store i32 %3, i32 addrspace(4)* %arrayidx2, align 4
- %5 = load i32* %i, align 4
- %add = add nsw i32 %5, 1
- %6 = load i32* %i, align 4
- %arrayidx3 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %6
- %b4 = getelementptr inbounds %struct.my_struct addrspace(4)* %arrayidx3, i32 0, i32 1
- %arrayidx5 = getelementptr inbounds [2 x i32] addrspace(4)* %b4, i32 0, i32 0
- store i32 %add, i32 addrspace(4)* %arrayidx5, align 4
- br label %for.inc
-
-for.inc: ; preds = %for.body
- %7 = load i32* %i, align 4
- %inc = add nsw i32 %7, 1
- store i32 %inc, i32* %i, align 4
- br label %for.cond
-
-for.end: ; preds = %for.cond
- %8 = load i32* %y.addr, align 4
- %arrayidx6 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %8
- %9 = bitcast %struct.my_struct addrspace(4)* %arrayidx6 to i8 addrspace(4)*
- call void @llvm.memcpy.p4i8.p4i8.i32(i8 addrspace(4)* bitcast ([256 x %struct.my_struct] addrspace(4)* @struct_cl.array to i8 addrspace(4)*), i8 addrspace(4)* %9, i32 12, i32 4, i1 false)
- %a7 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 0
- %10 = load i32* %a7, align 4
- %11 = load i32* %x.addr, align 4
- %arrayidx8 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %11
- %a9 = getelementptr inbounds %struct.my_struct addrspace(4)* %arrayidx8, i32 0, i32 0
- %12 = load i32 addrspace(4)* %a9, align 4
- %add10 = add nsw i32 %10, %12
- %13 = load i32* %x.addr, align 4
- %add11 = add nsw i32 %13, 1
- %arrayidx12 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %add11
- %b13 = getelementptr inbounds %struct.my_struct addrspace(4)* %arrayidx12, i32 0, i32 1
- %arrayidx14 = getelementptr inbounds [2 x i32] addrspace(4)* %b13, i32 0, i32 0
- %14 = load i32 addrspace(4)* %arrayidx14, align 4
- %add15 = add nsw i32 %add10, %14
- %15 = load i32* %x.addr, align 4
- %arrayidx16 = getelementptr inbounds [4 x i32] addrspace(1)* @g, i32 0, i32 %15
- %16 = load i32 addrspace(1)* %arrayidx16, align 4
- %add17 = add nsw i32 %add15, %16
- %17 = load i32 addrspace(1)* getelementptr inbounds ([4 x i32] addrspace(1)* @g, i32 0, i32 3), align 4
- %add18 = add nsw i32 %add17, %17
- %18 = load i32** %mem.addr, align 4
- %arrayidx19 = getelementptr inbounds i32* %18, i32 0
- store i32 %add18, i32* %arrayidx19
- ret void
-}
-
-declare void @llvm.memcpy.p4i8.p4i8.i32(i8 addrspace(4)* nocapture, i8 addrspace(4)* nocapture, i32, i32, i1) nounwind
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (%struct.my_struct*, i32, i32*, i32)* @struct_cl}
--- /dev/null
+; ModuleID = 'insert.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+; Two-lane vector wrapper around the scalar @_Z3madfff (only a declaration of
+; the callee appears in this module). The mangled name encodes `mad` taking
+; three 2-element float vectors. Each lane of %a, %b, %c is extracted, passed
+; to the scalar callee, and the result inserted into the returned <2 x float>.
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ ; lane 0
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ ; result vector starts as undef; every lane is overwritten before the return
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ ; lane 1
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+; Three-lane vector wrapper around the scalar @_Z3madfff (only a declaration
+; of the callee appears in this module). The mangled name encodes `mad` taking
+; three 3-element float vectors. Each lane of %a, %b, %c is extracted, passed
+; to the scalar callee, and the result inserted into the returned <3 x float>.
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ ; lane 0
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ ; result vector starts as undef; every lane is overwritten before the return
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ ; lane 1
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ ; lane 2
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+; Four-lane vector wrapper around the scalar @_Z3madfff (only a declaration of
+; the callee appears in this module). The mangled name encodes `mad` taking
+; three 4-element float vectors. Each lane of %a, %b, %c is extracted, passed
+; to the scalar callee, and the result inserted into the returned <4 x float>.
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ ; lane 0
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ ; result vector starts as undef; every lane is overwritten before the return
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ ; lane 1
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ ; lane 2
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ ; lane 3
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+; Kernel @insert: loads one <4 x i32> from %src (addrspace 1), overwrites its
+; lane 2 with the constant 1, and stores the result both back to %src and to
+; %dst. The scalar argument %c is not used in the body.
+define ptx_kernel void @insert(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src, i32 %c) nounwind noinline {
+entry:
+ %0 = load <4 x i32> addrspace(1)* %src, align 16
+ ; insertelement operands: vector, value (1), lane index (2)
+ %1 = insertelement <4 x i32> %0, i32 1, i32 2
+ ; the source buffer is updated in place before the copy to %dst
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %src, align 16
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %dst, align 16, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x i32> addrspace(1)*, <4 x i32> addrspace(1)*, i32)* @insert}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-; ModuleID = 'insert.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @insert(<4 x i32>* nocapture %dst, <4 x i32>* nocapture %src, i32 %c) nounwind noinline {
-entry:
- %0 = load <4 x i32>* %src, align 16
- %1 = insertelement <4 x i32> %0, i32 1, i32 2
- store <4 x i32> %1, <4 x i32>* %src, align 16
- store <4 x i32> %1, <4 x i32>* %dst, align 16, !tbaa !1
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (<4 x i32>*, <4 x i32>*, i32)* @insert}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-; ModuleID = 'load_store.o'
+; ModuleID = 'load_store.cl.o'
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
--- /dev/null
+; ModuleID = 'loop.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.big = type { [10 x i32] }
+
+; Two-lane vector wrapper around the scalar @_Z3madfff (only a declaration of
+; the callee appears in this module). The mangled name encodes `mad` taking
+; three 2-element float vectors. Each lane of %a, %b, %c is extracted, passed
+; to the scalar callee, and the result inserted into the returned <2 x float>.
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ ; lane 0
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ ; result vector starts as undef; every lane is overwritten before the return
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ ; lane 1
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+; Three-lane vector wrapper around the scalar @_Z3madfff (only a declaration
+; of the callee appears in this module). The mangled name encodes `mad` taking
+; three 3-element float vectors. Each lane of %a, %b, %c is extracted, passed
+; to the scalar callee, and the result inserted into the returned <3 x float>.
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ ; lane 0
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ ; result vector starts as undef; every lane is overwritten before the return
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ ; lane 1
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ ; lane 2
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry: ; 4-lane wrapper: calls scalar @_Z3madfff (only declared in this module; presumably OpenCL mad -- confirm) once per lane of %a, %b, %c and returns the four results as a vector
+ %0 = extractelement <4 x float> %a, i32 0 ; lane 0 of each operand
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; result starts as undef; all four lanes are written before the ret
+ %3 = extractelement <4 x float> %a, i32 1 ; lane 1 of each operand
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2 ; lane 2 of each operand
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3 ; lane 3 of each operand
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
+entry: ; Kernel: adds 1 to each of dst[0] .. dst[x-1]; %b is never read
+ %cmp2 = icmp eq i32 %x, 0 ; zero-trip guard: x == 0 skips the loop
+ br i1 %cmp2, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i32 [ %inc1, %for.body ], [ 0, %entry ] ; loop counter i, 0 on entry
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %i.03 ; &dst[i]
+ %0 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc = add nsw i32 %0, 1 ; dst[i] + 1
+ store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc1 = add nsw i32 %i.03, 1 ; i + 1
+ %exitcond = icmp eq i32 %inc1, %x ; leave once i + 1 == x
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
--- /dev/null
+; ModuleID = 'loop2.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.big = type { [10 x i32] }
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry: ; 2-lane wrapper: calls scalar @_Z3madfff (only declared in this module; presumably OpenCL mad -- confirm) once per lane of %a, %b, %c and returns the two results as a vector
+ %0 = extractelement <2 x float> %a, i32 0 ; lane 0 of each operand
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; result starts as undef; both lanes are written before the ret
+ %3 = extractelement <2 x float> %a, i32 1 ; lane 1 of each operand
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry: ; 3-lane wrapper: calls scalar @_Z3madfff (only declared in this module; presumably OpenCL mad -- confirm) once per lane of %a, %b, %c and returns the three results as a vector
+ %0 = extractelement <3 x float> %a, i32 0 ; lane 0 of each operand
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; result starts as undef; all three lanes are written before the ret
+ %3 = extractelement <3 x float> %a, i32 1 ; lane 1 of each operand
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2 ; lane 2 of each operand
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry: ; 4-lane wrapper: calls scalar @_Z3madfff (only declared in this module; presumably OpenCL mad -- confirm) once per lane of %a, %b, %c and returns the four results as a vector
+ %0 = extractelement <4 x float> %a, i32 0 ; lane 0 of each operand
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; result starts as undef; all four lanes are written before the ret
+ %3 = extractelement <4 x float> %a, i32 1 ; lane 1 of each operand
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2 ; lane 2 of each operand
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3 ; lane 3 of each operand
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
+entry: ; Kernel: for i in [0, x) stores dst[i] + (dst[i+1] > 0 ? 1 : 2) into dst[i]; %b is never read
+ %cmp6 = icmp eq i32 %x, 0 ; zero-trip guard: x == 0 skips the loop
+ br i1 %cmp6, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ %.pre = load i32 addrspace(1)* %dst, align 4, !tbaa !1 ; dst[0], loaded once before the loop
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %0 = phi i32 [ %.pre, %for.body.lr.ph ], [ %1, %for.body ] ; carried copy of dst[i]: the preloaded dst[0] first, afterwards the dst[i+1] load of the previous iteration
+ %i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ] ; loop counter i
+ %add = add nsw i32 %i.07, 1 ; i + 1
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add ; &dst[i+1]
+ %1 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 ; reads dst[i+1]; on the last iteration this is dst[x]
+ %cmp1 = icmp sgt i32 %1, 0
+ %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %i.07 ; &dst[i]
+ %storemerge.v = select i1 %cmp1, i32 1, i32 2 ; increment is 1 when dst[i+1] > 0, else 2
+ %storemerge = add i32 %storemerge.v, %0
+ store i32 %storemerge, i32 addrspace(1)* %arrayidx2, align 4
+ %exitcond = icmp eq i32 %add, %x ; leave once i + 1 == x
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-; ModuleID = 'loop2.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-%struct.big = type { [10 x i32] }
-
-define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
-entry:
- %cmp6 = icmp eq i32 %x, 0
- br i1 %cmp6, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph: ; preds = %entry
- %.pre = load i32 addrspace(1)* %dst, align 4, !tbaa !1
- br label %for.body
-
-for.body: ; preds = %for.body, %for.body.lr.ph
- %0 = phi i32 [ %.pre, %for.body.lr.ph ], [ %1, %for.body ]
- %i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ]
- %add = add nsw i32 %i.07, 1
- %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add
- %1 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
- %cmp1 = icmp sgt i32 %1, 0
- %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %i.07
- %storemerge.v = select i1 %cmp1, i32 1, i32 2
- %storemerge = add i32 %storemerge.v, %0
- store i32 %storemerge, i32 addrspace(1)* %arrayidx2, align 4
- %exitcond = icmp eq i32 %add, %x
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body, %entry
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add}
-!1 = metadata !{metadata !"int", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
--- /dev/null
+#include "stdlib.h"
+
+struct big { int x[10]; }; // ten ints; add() below takes one by value and never reads it
+
+__kernel void add(__global int *dst, unsigned int x, struct big b)
+{ // Increments dst[get_local_id(0) + i] for every i in [0, x); b is never read.
+ for (int i = 0; i < x; ++i) dst[get_local_id(0) + i]++; // NOTE(review): i is signed while x is unsigned; ++i would overflow if x > INT_MAX -- confirm callers keep x small
+}
+
--- /dev/null
+; ModuleID = 'loop3.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.big = type { [10 x i32] }
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry: ; 2-lane wrapper: calls scalar @_Z3madfff (only declared in this module; presumably OpenCL mad -- confirm) once per lane of %a, %b, %c and returns the two results as a vector
+ %0 = extractelement <2 x float> %a, i32 0 ; lane 0 of each operand
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; result starts as undef; both lanes are written before the ret
+ %3 = extractelement <2 x float> %a, i32 1 ; lane 1 of each operand
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry: ; 3-lane wrapper: calls scalar @_Z3madfff (only declared in this module; presumably OpenCL mad -- confirm) once per lane of %a, %b, %c and returns the three results as a vector
+ %0 = extractelement <3 x float> %a, i32 0 ; lane 0 of each operand
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; result starts as undef; all three lanes are written before the ret
+ %3 = extractelement <3 x float> %a, i32 1 ; lane 1 of each operand
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2 ; lane 2 of each operand
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry: ; 4-lane wrapper: calls scalar @_Z3madfff (only declared in this module; presumably OpenCL mad -- confirm) once per lane of %a, %b, %c and returns the four results as a vector
+ %0 = extractelement <4 x float> %a, i32 0 ; lane 0 of each operand
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; result starts as undef; all four lanes are written before the ret
+ %3 = extractelement <4 x float> %a, i32 1 ; lane 1 of each operand
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2 ; lane 2 of each operand
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3 ; lane 3 of each operand
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
+entry: ; Kernel: adds 1 to dst[lid0 + i] for i in [0, x), where lid0 is the result of @__gen_ocl_get_local_id0; %b is never read
+ %cmp2 = icmp eq i32 %x, 0 ; zero-trip guard: x == 0 skips the loop and the lid0 call
+ br i1 %cmp2, label %for.end, label %get_local_id.exit.lr.ph
+
+get_local_id.exit.lr.ph: ; preds = %entry
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone ; lid0, fetched once outside the loop
+ br label %get_local_id.exit
+
+get_local_id.exit: ; preds = %get_local_id.exit, %get_local_id.exit.lr.ph
+ %i.03 = phi i32 [ 0, %get_local_id.exit.lr.ph ], [ %inc1, %get_local_id.exit ] ; loop counter i, 0 on entry
+ %add = add i32 %call.i, %i.03 ; lid0 + i
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add ; &dst[lid0 + i]
+ %0 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc = add nsw i32 %0, 1 ; element value + 1
+ store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc1 = add nsw i32 %i.03, 1 ; i + 1
+ %exitcond = icmp eq i32 %inc1, %x ; leave once i + 1 == x
+ br i1 %exitcond, label %for.end, label %get_local_id.exit
+
+for.end: ; preds = %get_local_id.exit, %entry
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
-; ModuleID = 'loop.o'
+; ModuleID = 'loop3.o'
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
entry:
%cmp2 = icmp eq i32 %x, 0
- br i1 %cmp2, label %for.end, label %for.body
+ br i1 %cmp2, label %for.end, label %get_local_id.exit.lr.ph
-for.body: ; preds = %for.body, %entry
- %i.03 = phi i32 [ %inc1, %for.body ], [ 0, %entry ]
- %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %i.03
+get_local_id.exit.lr.ph: ; preds = %entry
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ br label %get_local_id.exit
+
+get_local_id.exit: ; preds = %get_local_id.exit, %get_local_id.exit.lr.ph
+ %i.03 = phi i32 [ 0, %get_local_id.exit.lr.ph ], [ %inc1, %get_local_id.exit ]
+ %add = add i32 %call.i, %i.03
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add
%0 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
%inc = add nsw i32 %0, 1
store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
%inc1 = add nsw i32 %i.03, 1
%exitcond = icmp eq i32 %inc1, %x
- br i1 %exitcond, label %for.end, label %for.body
+ br i1 %exitcond, label %for.end, label %get_local_id.exit
-for.end: ; preds = %for.body, %entry
+for.end: ; preds = %get_local_id.exit, %entry
ret void
}
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+
!opencl.kernels = !{!0}
!0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add}
--- /dev/null
+#include "stdlib.h"
+
+struct big { int x[10]; }; // ten ints; add() below takes one by value and never reads it
+
+__kernel void add(__global int *dst, unsigned int x, struct big b)
+{ // Work-items with get_local_id(1) > 4 increment dst[get_local_id(0) + i] for i in [0, x); all others increment dst[get_local_id(0) + i + x] for i in [0, 2*x). b is never read.
+ if (get_local_id(1) > 4)
+ for (int i = 0; i < x; ++i) dst[get_local_id(0) + i]++; // NOTE(review): i is signed while x is unsigned; ++i would overflow if x > INT_MAX -- confirm callers keep x small
+ else
+ for (int i = 0; i < 2*x; ++i) dst[get_local_id(0) + i + x]++; // 2*x is computed in unsigned int, so it wraps for x >= 2^31
+}
+
--- /dev/null
+; ModuleID = 'loop4.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.big = type { [10 x i32] }
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry: ; 2-lane wrapper: calls scalar @_Z3madfff (only declared in this module; presumably OpenCL mad -- confirm) once per lane of %a, %b, %c and returns the two results as a vector
+ %0 = extractelement <2 x float> %a, i32 0 ; lane 0 of each operand
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; result starts as undef; both lanes are written before the ret
+ %3 = extractelement <2 x float> %a, i32 1 ; lane 1 of each operand
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry: ; 3-lane wrapper: calls scalar @_Z3madfff (only declared in this module; presumably OpenCL mad -- confirm) once per lane of %a, %b, %c and returns the three results as a vector
+ %0 = extractelement <3 x float> %a, i32 0 ; lane 0 of each operand
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; result starts as undef; all three lanes are written before the ret
+ %3 = extractelement <3 x float> %a, i32 1 ; lane 1 of each operand
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2 ; lane 2 of each operand
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry: ; 4-lane wrapper: calls scalar @_Z3madfff (only declared in this module; presumably OpenCL mad -- confirm) once per lane of %a, %b, %c and returns the four results as a vector
+ %0 = extractelement <4 x float> %a, i32 0 ; lane 0 of each operand
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; result starts as undef; all four lanes are written before the ret
+ %3 = extractelement <4 x float> %a, i32 1 ; lane 1 of each operand
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2 ; lane 2 of each operand
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3 ; lane 3 of each operand
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
+get_local_id.exit: ; Kernel: if @__gen_ocl_get_local_id1() > 4 (unsigned), adds 1 to dst[lid0 + i] for i in [0, x); otherwise adds 1 to dst[lid0 + x + i] for i in [0, 2*x); lid0 is the result of @__gen_ocl_get_local_id0; %b is never read
+ %call3.i = tail call ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone
+ %cmp = icmp ugt i32 %call3.i, 4 ; unsigned compare selects which of the two loops runs
+ br i1 %cmp, label %for.cond.preheader, label %for.cond5.preheader
+
+for.cond.preheader: ; preds = %get_local_id.exit
+ %cmp124 = icmp eq i32 %x, 0 ; zero-trip guard for the x-iteration loop
+ br i1 %cmp124, label %if.end, label %get_local_id.exit17.lr.ph
+
+get_local_id.exit17.lr.ph: ; preds = %for.cond.preheader
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone ; lid0, fetched once outside the loop
+ br label %get_local_id.exit17
+
+for.cond5.preheader: ; preds = %get_local_id.exit
+ %mul.mask = and i32 %x, 2147483647 ; low 31 bits of x: zero exactly when 2*x is 0 in i32
+ %cmp621 = icmp eq i32 %mul.mask, 0 ; zero-trip guard for the 2*x-iteration loop
+ br i1 %cmp621, label %if.end, label %get_local_id.exit20.lr.ph
+
+get_local_id.exit20.lr.ph: ; preds = %for.cond5.preheader
+ %call.i18 = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone ; lid0, fetched once outside the loop
+ %0 = shl i32 %x, 1 ; trip count 2*x
+ br label %get_local_id.exit20
+
+get_local_id.exit17: ; preds = %get_local_id.exit17, %get_local_id.exit17.lr.ph
+ %i.025 = phi i32 [ 0, %get_local_id.exit17.lr.ph ], [ %inc3, %get_local_id.exit17 ] ; loop counter i of the first loop
+ %add = add i32 %call.i, %i.025 ; lid0 + i
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add ; &dst[lid0 + i]
+ %1 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc = add nsw i32 %1, 1 ; element value + 1
+ store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc3 = add nsw i32 %i.025, 1 ; i + 1
+ %exitcond26 = icmp eq i32 %inc3, %x ; leave once i + 1 == x
+ br i1 %exitcond26, label %if.end, label %get_local_id.exit17
+
+get_local_id.exit20: ; preds = %get_local_id.exit20, %get_local_id.exit20.lr.ph
+ %i4.022 = phi i32 [ 0, %get_local_id.exit20.lr.ph ], [ %inc14, %get_local_id.exit20 ] ; loop counter i of the second loop
+ %add9 = add i32 %i4.022, %x ; i + x
+ %add10 = add i32 %add9, %call.i18 ; i + x + lid0
+ %arrayidx11 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add10 ; &dst[lid0 + x + i]
+ %2 = load i32 addrspace(1)* %arrayidx11, align 4, !tbaa !1
+ %inc12 = add nsw i32 %2, 1 ; element value + 1
+ store i32 %inc12, i32 addrspace(1)* %arrayidx11, align 4, !tbaa !1
+ %inc14 = add nsw i32 %i4.022, 1 ; i + 1
+ %exitcond = icmp eq i32 %inc14, %0 ; leave once i + 1 == 2*x
+ br i1 %exitcond, label %if.end, label %get_local_id.exit20
+
+if.end: ; preds = %get_local_id.exit20, %get_local_id.exit17, %for.cond5.preheader, %for.cond.preheader
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
--- /dev/null
+; ModuleID = 'loop4.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.big = type { [10 x i32] }
+
+define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
+get_local_id.exit: ; Kernel: if @__gen_ocl_get_local_id1() > 4 (unsigned), adds 1 to dst[lid0 + i] for i in [0, x); otherwise adds 1 to dst[lid0 + x + i] for i in [0, 2*x); lid0 is the result of @__gen_ocl_get_local_id0; %b is never read
+ %call3.i = tail call ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone
+ %cmp = icmp ugt i32 %call3.i, 4 ; unsigned compare selects which of the two loops runs
+ br i1 %cmp, label %for.cond.preheader, label %for.cond5.preheader
+
+for.cond.preheader: ; preds = %get_local_id.exit
+ %cmp124 = icmp eq i32 %x, 0 ; zero-trip guard for the x-iteration loop
+ br i1 %cmp124, label %if.end, label %get_local_id.exit17.lr.ph
+
+get_local_id.exit17.lr.ph: ; preds = %for.cond.preheader
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone ; lid0, fetched once outside the loop
+ br label %get_local_id.exit17
+
+for.cond5.preheader: ; preds = %get_local_id.exit
+ %mul.mask = and i32 %x, 2147483647 ; low 31 bits of x: zero exactly when 2*x is 0 in i32
+ %cmp621 = icmp eq i32 %mul.mask, 0 ; zero-trip guard for the 2*x-iteration loop
+ br i1 %cmp621, label %if.end, label %get_local_id.exit20.lr.ph
+
+get_local_id.exit20.lr.ph: ; preds = %for.cond5.preheader
+ %call.i18 = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone ; lid0, fetched once outside the loop
+ %0 = shl i32 %x, 1 ; trip count 2*x
+ br label %get_local_id.exit20
+
+get_local_id.exit17: ; preds = %get_local_id.exit17, %get_local_id.exit17.lr.ph
+ %i.025 = phi i32 [ 0, %get_local_id.exit17.lr.ph ], [ %inc3, %get_local_id.exit17 ] ; loop counter i of the first loop
+ %add = add i32 %call.i, %i.025 ; lid0 + i
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add ; &dst[lid0 + i]
+ %1 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc = add nsw i32 %1, 1 ; element value + 1
+ store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc3 = add nsw i32 %i.025, 1 ; i + 1
+ %exitcond26 = icmp eq i32 %inc3, %x ; leave once i + 1 == x
+ br i1 %exitcond26, label %if.end, label %get_local_id.exit17
+
+get_local_id.exit20: ; preds = %get_local_id.exit20, %get_local_id.exit20.lr.ph
+ %i4.022 = phi i32 [ 0, %get_local_id.exit20.lr.ph ], [ %inc14, %get_local_id.exit20 ] ; loop counter i of the second loop
+ %add9 = add i32 %i4.022, %x ; i + x
+ %add10 = add i32 %add9, %call.i18 ; i + x + lid0
+ %arrayidx11 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add10 ; &dst[lid0 + x + i]
+ %2 = load i32 addrspace(1)* %arrayidx11, align 4, !tbaa !1
+ %inc12 = add nsw i32 %2, 1 ; element value + 1
+ store i32 %inc12, i32 addrspace(1)* %arrayidx11, align 4, !tbaa !1
+ %inc14 = add nsw i32 %i4.022, 1 ; i + 1
+ %exitcond = icmp eq i32 %inc14, %0 ; leave once i + 1 == 2*x
+ br i1 %exitcond, label %if.end, label %get_local_id.exit20
+
+if.end: ; preds = %get_local_id.exit20, %get_local_id.exit17, %for.cond5.preheader, %for.cond.preheader
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
--- /dev/null
+#include "stdlib.h"
+
+struct big { int x[10]; }; // ten ints; add() below takes one by value and never reads it
+
+__kernel void add(__global int *dst0, __global int *dst1, unsigned int x, int y, struct big b)
+{ // Picks dst0 when y > 0, otherwise dst1; then work-items with get_local_id(1) > 4 increment dst[get_local_id(0) + i] for i in [0, x) and all others increment dst[get_local_id(0) + i + x] for i in [0, 2*x). b is never read.
+ __global int *dst = NULL; // overwritten on both branches below, so NULL is never dereferenced
+ if (y > 0)
+ dst = dst0;
+ else
+ dst = dst1;
+ if (get_local_id(1) > 4)
+ for (int i = 0; i < x; ++i) dst[get_local_id(0) + i]++; // NOTE(review): i is signed while x is unsigned; ++i would overflow if x > INT_MAX -- confirm callers keep x small
+ else
+ for (int i = 0; i < 2*x; ++i) dst[get_local_id(0) + i + x]++; // 2*x is computed in unsigned int, so it wraps for x >= 2^31
+}
+
--- /dev/null
+; ModuleID = 'loop5.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.big = type { [10 x i32] }
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry: ; 2-lane wrapper: calls scalar @_Z3madfff (only declared in this module; presumably OpenCL mad -- confirm) once per lane of %a, %b, %c and returns the two results as a vector
+ %0 = extractelement <2 x float> %a, i32 0 ; lane 0 of each operand
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; result starts as undef; both lanes are written before the ret
+ %3 = extractelement <2 x float> %a, i32 1 ; lane 1 of each operand
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry: ; 3-lane wrapper: calls scalar @_Z3madfff (only declared in this module; presumably OpenCL mad -- confirm) once per lane of %a, %b, %c and returns the three results as a vector
+ %0 = extractelement <3 x float> %a, i32 0 ; lane 0 of each operand
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; result starts as undef; all three lanes are written before the ret
+ %3 = extractelement <3 x float> %a, i32 1 ; lane 1 of each operand
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2 ; lane 2 of each operand
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry: ; 4-lane wrapper: calls scalar @_Z3madfff (only declared in this module; presumably OpenCL mad -- confirm) once per lane of %a, %b, %c and returns the four results as a vector
+ %0 = extractelement <4 x float> %a, i32 0 ; lane 0 of each operand
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; result starts as undef; all four lanes are written before the ret
+ %3 = extractelement <4 x float> %a, i32 1 ; lane 1 of each operand
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2 ; lane 2 of each operand
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3 ; lane 3 of each operand
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst0, i32 addrspace(1)* nocapture %dst1, i32 %x, i32 %y, %struct.big* nocapture byval %b) nounwind noinline {
+get_local_id.exit: ; Kernel: dst = (y > 0) ? dst0 : dst1; if @__gen_ocl_get_local_id1() > 4 (unsigned), adds 1 to dst[lid0 + i] for i in [0, x); otherwise adds 1 to dst[lid0 + x + i] for i in [0, 2*x); lid0 is the result of @__gen_ocl_get_local_id0; %b is never read
+ %cmp = icmp sgt i32 %y, 0
+ %dst0.dst1 = select i1 %cmp, i32 addrspace(1)* %dst0, i32 addrspace(1)* %dst1 ; buffer both loops write to: dst0 when y > 0, else dst1
+ %call3.i = tail call ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone
+ %cmp1 = icmp ugt i32 %call3.i, 4 ; unsigned compare selects which of the two loops runs
+ br i1 %cmp1, label %for.cond.preheader, label %for.cond8.preheader
+
+for.cond.preheader: ; preds = %get_local_id.exit
+ %cmp328 = icmp eq i32 %x, 0 ; zero-trip guard for the x-iteration loop
+ br i1 %cmp328, label %if.end19, label %get_local_id.exit21.lr.ph
+
+get_local_id.exit21.lr.ph: ; preds = %for.cond.preheader
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone ; lid0, fetched once outside the loop
+ br label %get_local_id.exit21
+
+for.cond8.preheader: ; preds = %get_local_id.exit
+ %mul.mask = and i32 %x, 2147483647 ; low 31 bits of x: zero exactly when 2*x is 0 in i32
+ %cmp925 = icmp eq i32 %mul.mask, 0 ; zero-trip guard for the 2*x-iteration loop
+ br i1 %cmp925, label %if.end19, label %get_local_id.exit24.lr.ph
+
+get_local_id.exit24.lr.ph: ; preds = %for.cond8.preheader
+ %call.i22 = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone ; lid0, fetched once outside the loop
+ %0 = shl i32 %x, 1 ; trip count 2*x
+ br label %get_local_id.exit24
+
+get_local_id.exit21: ; preds = %get_local_id.exit21, %get_local_id.exit21.lr.ph
+ %i.029 = phi i32 [ 0, %get_local_id.exit21.lr.ph ], [ %inc5, %get_local_id.exit21 ] ; loop counter i of the first loop
+ %add = add i32 %call.i, %i.029 ; lid0 + i
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst0.dst1, i32 %add ; &dst[lid0 + i]
+ %1 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc = add nsw i32 %1, 1 ; element value + 1
+ store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc5 = add nsw i32 %i.029, 1 ; i + 1
+ %exitcond30 = icmp eq i32 %inc5, %x ; leave once i + 1 == x
+ br i1 %exitcond30, label %if.end19, label %get_local_id.exit21
+
+get_local_id.exit24: ; preds = %get_local_id.exit24, %get_local_id.exit24.lr.ph
+ %i7.026 = phi i32 [ 0, %get_local_id.exit24.lr.ph ], [ %inc17, %get_local_id.exit24 ] ; loop counter i of the second loop
+ %add12 = add i32 %i7.026, %x ; i + x
+ %add13 = add i32 %add12, %call.i22 ; i + x + lid0
+ %arrayidx14 = getelementptr inbounds i32 addrspace(1)* %dst0.dst1, i32 %add13 ; &dst[lid0 + x + i]
+ %2 = load i32 addrspace(1)* %arrayidx14, align 4, !tbaa !1
+ %inc15 = add nsw i32 %2, 1 ; element value + 1
+ store i32 %inc15, i32 addrspace(1)* %arrayidx14, align 4, !tbaa !1
+ %inc17 = add nsw i32 %i7.026, 1 ; i + 1
+ %exitcond = icmp eq i32 %inc17, %0 ; leave once i + 1 == 2*x
+ br i1 %exitcond, label %if.end19, label %get_local_id.exit24
+
+if.end19: ; preds = %get_local_id.exit24, %get_local_id.exit21, %for.cond8.preheader, %for.cond.preheader
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32, i32, %struct.big*)* @add}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
--- /dev/null
+; ModuleID = 'loop5.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.big = type { [10 x i32] }
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry: ; 2-lane wrapper: calls scalar @_Z3madfff (only declared in this module; presumably OpenCL mad -- confirm) once per lane of %a, %b, %c and returns the two results as a vector
+ %0 = extractelement <2 x float> %a, i32 0 ; lane 0 of each operand
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; result starts as undef; both lanes are written before the ret
+ %3 = extractelement <2 x float> %a, i32 1 ; lane 1 of each operand
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { ; 3-lane wrapper: calls scalar @_Z3madfff once per lane of %a/%b/%c
+entry:
+ %0 = extractelement <3 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; result starts as undef; fill lane 0
+ %3 = extractelement <3 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 ; fill lane 1
+ %6 = extractelement <3 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 ; fill lane 2
+ ret <3 x float> %vecinit4 ; every lane has been written
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { ; 4-lane wrapper: calls scalar @_Z3madfff once per lane of %a/%b/%c
+entry:
+ %0 = extractelement <4 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; result starts as undef; fill lane 0
+ %3 = extractelement <4 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 ; fill lane 1
+ %6 = extractelement <4 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 ; fill lane 2
+ %9 = extractelement <4 x float> %a, i32 3 ; lane 3 operands
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 ; fill lane 3
+ ret <4 x float> %vecinit6 ; every lane has been written
+}
+
+define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst0, i32 addrspace(1)* nocapture %dst1, i32 %x, i32 %y, %struct.big* nocapture byval %b) nounwind noinline { ; kernel: increments a run of i32 elements in %dst0 or %dst1 via one of two loops; %b is not referenced in the body
+get_local_id.exit:
+ %cmp = icmp sgt i32 %y, 0
+ %dst0.dst1 = select i1 %cmp, i32 addrspace(1)* %dst0, i32 addrspace(1)* %dst1 ; target buffer: %dst0 when %y > 0 (signed), otherwise %dst1
+ %call3.i = tail call ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone
+ %cmp1 = icmp ugt i32 %call3.i, 4 ; unsigned compare of the local_id1 result against 4
+ br i1 %cmp1, label %for.cond.preheader, label %for.cond8.preheader ; greater than 4 takes the first loop, otherwise the second
+
+for.cond.preheader: ; preds = %get_local_id.exit
+ %cmp328 = icmp eq i32 %x, 0 ; first loop runs %x times; skip it entirely when %x is 0
+ br i1 %cmp328, label %if.end19, label %get_local_id.exit21.lr.ph
+
+get_local_id.exit21.lr.ph: ; preds = %for.cond.preheader
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ br label %get_local_id.exit21
+
+for.cond8.preheader: ; preds = %get_local_id.exit
+ %mul.mask = and i32 %x, 2147483647 ; low 31 bits of %x
+ %cmp925 = icmp eq i32 %mul.mask, 0 ; zero here means the trip count (%x << 1, computed below) would be 0
+ br i1 %cmp925, label %if.end19, label %get_local_id.exit24.lr.ph
+
+get_local_id.exit24.lr.ph: ; preds = %for.cond8.preheader
+ %call.i22 = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ %0 = shl i32 %x, 1 ; trip count of the second loop: %x * 2 (wrapping)
+ br label %get_local_id.exit24
+
+get_local_id.exit21: ; preds = %get_local_id.exit21, %get_local_id.exit21.lr.ph
+ %i.029 = phi i32 [ 0, %get_local_id.exit21.lr.ph ], [ %inc5, %get_local_id.exit21 ]
+ %add = add i32 %call.i, %i.029 ; element index = local_id0 result + loop counter
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst0.dst1, i32 %add
+ %1 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc = add nsw i32 %1, 1 ; element value + 1
+ store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc5 = add nsw i32 %i.029, 1
+ %exitcond30 = icmp eq i32 %inc5, %x ; leave after %x iterations
+ br i1 %exitcond30, label %if.end19, label %get_local_id.exit21
+
+get_local_id.exit24: ; preds = %get_local_id.exit24, %get_local_id.exit24.lr.ph
+ %i7.026 = phi i32 [ 0, %get_local_id.exit24.lr.ph ], [ %inc17, %get_local_id.exit24 ]
+ %add12 = add i32 %i7.026, %x
+ %add13 = add i32 %add12, %call.i22 ; element index = loop counter + %x + local_id0 result
+ %arrayidx14 = getelementptr inbounds i32 addrspace(1)* %dst0.dst1, i32 %add13
+ %2 = load i32 addrspace(1)* %arrayidx14, align 4, !tbaa !1
+ %inc15 = add nsw i32 %2, 1 ; element value + 1
+ store i32 %inc15, i32 addrspace(1)* %arrayidx14, align 4, !tbaa !1
+ %inc17 = add nsw i32 %i7.026, 1
+ %exitcond = icmp eq i32 %inc17, %0 ; leave after %x << 1 iterations
+ br i1 %exitcond, label %if.end19, label %get_local_id.exit24
+
+if.end19: ; preds = %get_local_id.exit24, %get_local_id.exit21, %for.cond8.preheader, %for.cond.preheader
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32, i32, %struct.big*)* @add}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-#include "stdlib.h"
-__attribute__((pure, overloadable)) int mad(int,int,int);
-__attribute__((pure, overloadable)) float mad(float,float,float);
-__attribute__((pure, overloadable)) float4 mad(float4,float4,float4);
-
-__kernel void add(__global int *dst, unsigned int x, float z)
-{
- for (int i = 0; i < x; ++i) {
- int y = mad(dst[i], 2, 3);
- y = mad(dst[i], 2, 3);
- float z = mad((float) dst[i], 2.f, 3.f);
- float4 z0 = mad((float4) dst[i], (float4)(0.f,1.f,2.f,3.f), (float4)3.f);
- float4 x0 = z0 * (float4) 2.f;
- dst[i] = y + (int) z + x0.x + x0.y + x0.z;
- }
-}
-
-
--- /dev/null
+; ModuleID = 'mad.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { ; 2-lane wrapper: calls scalar @_Z3madfff once per lane of %a/%b/%c
+entry:
+ %0 = extractelement <2 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; result starts as undef; fill lane 0
+ %3 = extractelement <2 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 ; fill lane 1
+ ret <2 x float> %vecinit2 ; every lane has been written
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { ; 3-lane wrapper: calls scalar @_Z3madfff once per lane of %a/%b/%c
+entry:
+ %0 = extractelement <3 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; result starts as undef; fill lane 0
+ %3 = extractelement <3 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 ; fill lane 1
+ %6 = extractelement <3 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 ; fill lane 2
+ ret <3 x float> %vecinit4 ; every lane has been written
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { ; 4-lane wrapper: calls scalar @_Z3madfff once per lane of %a/%b/%c
+entry:
+ %0 = extractelement <4 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; result starts as undef; fill lane 0
+ %3 = extractelement <4 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 ; fill lane 1
+ %6 = extractelement <4 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 ; fill lane 2
+ %9 = extractelement <4 x float> %a, i32 3 ; lane 3 operands
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 ; fill lane 3
+ ret <4 x float> %vecinit6 ; every lane has been written
+}
+
+define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, float %z) nounwind noinline { ; kernel: rewrites %dst[0 .. %x) in place from integer and float mad results; %z is not referenced in the body
+entry:
+ %cmp16 = icmp eq i32 %x, 0 ; nothing to do when %x is 0
+ br i1 %cmp16, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.017 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %i.017
+ %0 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 ; current element, reused for every call below
+ %call2 = tail call ptx_device i32 @_Z3madiii(i32 %0, i32 2, i32 3) nounwind readonly ; integer mad overload with constants 2 and 3
+ %conv = sitofp i32 %0 to float
+ %call5 = tail call ptx_device float @_Z3madfff(float %conv, float 2.000000e+00, float 3.000000e+00) nounwind readnone ; scalar float mad; the result is also used as vector lane 2 below
+ %call.i = tail call ptx_device float @_Z3madfff(float %conv, float 0.000000e+00, float 3.000000e+00) nounwind readnone ; vector lane 0 (second operand 0.0)
+ %vecinit.i = insertelement <4 x float> undef, float %call.i, i32 0
+ %call1.i = tail call ptx_device float @_Z3madfff(float %conv, float 1.000000e+00, float 3.000000e+00) nounwind readnone ; vector lane 1 (second operand 1.0)
+ %vecinit2.i = insertelement <4 x float> %vecinit.i, float %call1.i, i32 1
+ %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %call5, i32 2 ; lane 2 reuses %call5: same callee and operands (%conv, 2.0, 3.0)
+ %call5.i = tail call ptx_device float @_Z3madfff(float %conv, float 3.000000e+00, float 3.000000e+00) nounwind readnone ; vector lane 3 (second operand 3.0)
+ %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %call5.i, i32 3
+ %mul = fmul <4 x float> %vecinit6.i, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> ; scale all four lanes by 2.0
+ %conv9 = fptosi float %call5 to i32
+ %add = add nsw i32 %conv9, %call2 ; integer sum of the truncated float mad and the integer mad
+ %conv10 = sitofp i32 %add to float
+ %1 = extractelement <4 x float> %mul, i32 0
+ %add11 = fadd float %conv10, %1
+ %2 = extractelement <4 x float> %mul, i32 1
+ %add12 = fadd float %add11, %2
+ %3 = extractelement <4 x float> %mul, i32 2
+ %add13 = fadd float %add12, %3 ; only lanes 0, 1 and 2 of %mul are summed; lane 3 is not read
+ %conv14 = fptosi float %add13 to i32
+ store i32 %conv14, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 ; overwrite the element that was loaded
+ %inc = add nsw i32 %i.017, 1
+ %exitcond = icmp eq i32 %inc, %x ; leave after %x iterations
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare ptx_device i32 @_Z3madiii(i32, i32, i32) nounwind readonly
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32, float)* @add}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-; ModuleID = 'mad.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @add(i32* nocapture %dst, i32 %x, float %z) nounwind noinline {
-entry:
- %cmp16 = icmp eq i32 %x, 0
- br i1 %cmp16, label %for.end, label %for.body
-
-for.body: ; preds = %for.body, %entry
- %i.017 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
- %arrayidx = getelementptr inbounds i32* %dst, i32 %i.017
- %0 = load i32* %arrayidx, align 4, !tbaa !1
- %call2 = tail call ptx_device i32 @_Z3madiii(i32 %0, i32 2, i32 3) nounwind readonly
- %conv = sitofp i32 %0 to float
- %call5 = tail call ptx_device float @_Z3madfff(float %conv, float 2.000000e+00, float 3.000000e+00) nounwind readonly
- %1 = insertelement <4 x float> undef, float %conv, i32 0
- %splat = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
- %call8 = tail call ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %splat, <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>) nounwind readonly
- %mul = fmul <4 x float> %call8, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
- %conv9 = fptosi float %call5 to i32
- %add = add nsw i32 %conv9, %call2
- %conv10 = sitofp i32 %add to float
- %2 = extractelement <4 x float> %mul, i32 0
- %add11 = fadd float %conv10, %2
- %3 = extractelement <4 x float> %mul, i32 1
- %add12 = fadd float %add11, %3
- %4 = extractelement <4 x float> %mul, i32 2
- %add13 = fadd float %add12, %4
- %conv14 = fptosi float %add13 to i32
- store i32 %conv14, i32* %arrayidx, align 4, !tbaa !1
- %inc = add nsw i32 %i.017, 1
- %exitcond = icmp eq i32 %inc, %x
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body, %entry
- ret void
-}
-
-declare ptx_device i32 @_Z3madiii(i32, i32, i32) nounwind readonly
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readonly
-
-declare ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float>, <4 x float>, <4 x float>) nounwind readonly
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i32*, i32, float)* @add}
-!1 = metadata !{metadata !"int", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
--- /dev/null
+; ModuleID = 'select.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { ; 2-lane wrapper: calls scalar @_Z3madfff once per lane of %a/%b/%c
+entry:
+ %0 = extractelement <2 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; result starts as undef; fill lane 0
+ %3 = extractelement <2 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 ; fill lane 1
+ ret <2 x float> %vecinit2 ; every lane has been written
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { ; 3-lane wrapper: calls scalar @_Z3madfff once per lane of %a/%b/%c
+entry:
+ %0 = extractelement <3 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; result starts as undef; fill lane 0
+ %3 = extractelement <3 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 ; fill lane 1
+ %6 = extractelement <3 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 ; fill lane 2
+ ret <3 x float> %vecinit4 ; every lane has been written
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { ; 4-lane wrapper: calls scalar @_Z3madfff once per lane of %a/%b/%c
+entry:
+ %0 = extractelement <4 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; result starts as undef; fill lane 0
+ %3 = extractelement <4 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 ; fill lane 1
+ %6 = extractelement <4 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 ; fill lane 2
+ %9 = extractelement <4 x float> %a, i32 3 ; lane 3 operands
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 ; fill lane 3
+ ret <4 x float> %vecinit6 ; every lane has been written
+}
+
+define ptx_kernel void @test_select(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src0, <4 x i32> addrspace(1)* nocapture %src1) nounwind noinline { ; kernel: per-lane choice between %src0[0] and %src0[1], stored to %dst; %src1 is not referenced in the body
+entry:
+ %0 = load <4 x i32> addrspace(1)* %src0, align 16, !tbaa !1 ; first vector: %src0[0]
+ %arrayidx1 = getelementptr inbounds <4 x i32> addrspace(1)* %src0, i32 1
+ %1 = load <4 x i32> addrspace(1)* %arrayidx1, align 16, !tbaa !1 ; second vector: %src0[1]
+ %2 = extractelement <4 x i32> %0, i32 0
+ %3 = extractelement <4 x i32> %1, i32 0
+ %4 = extractelement <4 x i32> %0, i32 1
+ %5 = extractelement <4 x i32> %1, i32 1
+ %6 = extractelement <4 x i32> %0, i32 2
+ %7 = extractelement <4 x i32> %1, i32 2
+ %8 = extractelement <4 x i32> %0, i32 3
+ %9 = extractelement <4 x i32> %1, i32 3
+ %tobool.i = icmp slt i32 %3, 0 ; lane 0: is the second vector's element negative?
+ %cond1.i = select i1 %tobool.i, i32 %3, i32 %2 ; negative keeps the second vector's element, otherwise the first vector's
+ %10 = insertelement <4 x i32> undef, i32 %cond1.i, i32 0
+ %tobool3.i = icmp slt i32 %5, 0 ; lane 1, same rule
+ %cond7.i = select i1 %tobool3.i, i32 %5, i32 %4
+ %11 = insertelement <4 x i32> %10, i32 %cond7.i, i32 1
+ %tobool9.i = icmp slt i32 %7, 0 ; lane 2, same rule
+ %cond13.i = select i1 %tobool9.i, i32 %7, i32 %6
+ %12 = insertelement <4 x i32> %11, i32 %cond13.i, i32 2
+ %tobool15.i = icmp slt i32 %9, 0 ; lane 3, same rule
+ %cond19.i = select i1 %tobool15.i, i32 %9, i32 %8
+ %13 = insertelement <4 x i32> %12, i32 %cond19.i, i32 3
+ store <4 x i32> %13, <4 x i32> addrspace(1)* %dst, align 16, !tbaa !1 ; write the assembled vector to %dst[0]
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x i32> addrspace(1)*, <4 x i32> addrspace(1)*, <4 x i32> addrspace(1)*)* @test_select}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-; ModuleID = 'select.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @test_select(<4 x i32>* nocapture %dst, <4 x i32>* nocapture %src0, <4 x i32>* nocapture %src1) nounwind noinline {
-entry:
- %0 = load <4 x i32>* %src0, align 16, !tbaa !1
- %arrayidx1 = getelementptr inbounds <4 x i32>* %src0, i32 1
- %1 = load <4 x i32>* %arrayidx1, align 16, !tbaa !1
- %2 = extractelement <4 x i32> %0, i32 0
- %3 = extractelement <4 x i32> %1, i32 0
- %4 = extractelement <4 x i32> %0, i32 1
- %5 = extractelement <4 x i32> %1, i32 1
- %6 = extractelement <4 x i32> %0, i32 2
- %7 = extractelement <4 x i32> %1, i32 2
- %8 = extractelement <4 x i32> %0, i32 3
- %9 = extractelement <4 x i32> %1, i32 3
- %tobool.i = icmp slt i32 %3, 0
- %cond1.i = select i1 %tobool.i, i32 %3, i32 %2
- %10 = insertelement <4 x i32> undef, i32 %cond1.i, i32 0
- %tobool3.i = icmp slt i32 %5, 0
- %cond7.i = select i1 %tobool3.i, i32 %5, i32 %4
- %11 = insertelement <4 x i32> %10, i32 %cond7.i, i32 1
- %tobool9.i = icmp slt i32 %7, 0
- %cond13.i = select i1 %tobool9.i, i32 %7, i32 %6
- %12 = insertelement <4 x i32> %11, i32 %cond13.i, i32 2
- %tobool15.i = icmp slt i32 %9, 0
- %cond19.i = select i1 %tobool15.i, i32 %9, i32 %8
- %13 = insertelement <4 x i32> %12, i32 %cond19.i, i32 3
- store <4 x i32> %13, <4 x i32>* %dst, align 16, !tbaa !1
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (<4 x i32>*, <4 x i32>*, <4 x i32>*)* @test_select}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
--- /dev/null
+; ModuleID = 'short.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { ; 2-lane wrapper: calls scalar @_Z3madfff once per lane of %a/%b/%c
+entry:
+ %0 = extractelement <2 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; result starts as undef; fill lane 0
+ %3 = extractelement <2 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 ; fill lane 1
+ ret <2 x float> %vecinit2 ; every lane has been written
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { ; 3-lane wrapper: calls scalar @_Z3madfff once per lane of %a/%b/%c
+entry:
+ %0 = extractelement <3 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; result starts as undef; fill lane 0
+ %3 = extractelement <3 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 ; fill lane 1
+ %6 = extractelement <3 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 ; fill lane 2
+ ret <3 x float> %vecinit4 ; every lane has been written
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { ; 4-lane wrapper: calls scalar @_Z3madfff once per lane of %a/%b/%c
+entry:
+ %0 = extractelement <4 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; result starts as undef; fill lane 0
+ %3 = extractelement <4 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 ; fill lane 1
+ %6 = extractelement <4 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 ; fill lane 2
+ %9 = extractelement <4 x float> %a, i32 3 ; lane 3 operands
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 ; fill lane 3
+ ret <4 x float> %vecinit6 ; every lane has been written
+}
+
+define ptx_kernel void @short_write(i16 addrspace(1)* nocapture %dst, i16 %x, i16 %y) nounwind noinline { ; kernel: stores the 16-bit sum of %x and %y to the first element of %dst
+entry:
+ %add = add i16 %y, %x ; plain i16 add, no nsw/nuw flags
+ store i16 %add, i16 addrspace(1)* %dst, align 2, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i16 addrspace(1)*, i16, i16)* @short_write}
+!1 = metadata !{metadata !"short", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-; ModuleID = 'short.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @short_write(i16* nocapture %dst, i16 %x, i16 %y) nounwind noinline {
-entry:
- %add = add i16 %y, %x
- store i16 %add, i16* %dst, align 2, !tbaa !1
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i16*, i16, i16)* @short_write}
-!1 = metadata !{metadata !"short", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
--- /dev/null
+; ModuleID = 'shuffle.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { ; 2-lane wrapper: calls scalar @_Z3madfff once per lane of %a/%b/%c
+entry:
+ %0 = extractelement <2 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; result starts as undef; fill lane 0
+ %3 = extractelement <2 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 ; fill lane 1
+ ret <2 x float> %vecinit2 ; every lane has been written
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { ; 3-lane wrapper: calls scalar @_Z3madfff once per lane of %a/%b/%c
+entry:
+ %0 = extractelement <3 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; result starts as undef; fill lane 0
+ %3 = extractelement <3 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 ; fill lane 1
+ %6 = extractelement <3 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 ; fill lane 2
+ ret <3 x float> %vecinit4 ; every lane has been written
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { ; 4-lane wrapper: calls scalar @_Z3madfff once per lane of %a/%b/%c
+entry:
+ %0 = extractelement <4 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; result starts as undef; fill lane 0
+ %3 = extractelement <4 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 ; fill lane 1
+ %6 = extractelement <4 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 ; fill lane 2
+ %9 = extractelement <4 x float> %a, i32 3 ; lane 3 operands
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 ; fill lane 3
+ ret <4 x float> %vecinit6 ; every lane has been written
+}
+
+define ptx_kernel void @shuffle(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src, i32 %c) nounwind noinline { ; kernel: copies the first <4 x i32> of %src to %dst with lanes 2 and 3 exchanged; %c is not referenced in the body
+entry:
+ %0 = load <4 x i32> addrspace(1)* %src, align 16, !tbaa !1
+ %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2> ; mask keeps lanes 0 and 1, swaps lanes 2 and 3; the undef operand is never selected
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %dst, align 16, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x i32> addrspace(1)*, <4 x i32> addrspace(1)*, i32)* @shuffle}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-; ModuleID = 'shuffle.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @shuffle(<4 x i32>* nocapture %dst, <4 x i32>* nocapture %src, i32 %c) nounwind noinline {
-entry:
- %0 = load <4 x i32>* %src, align 16, !tbaa !1
- %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
- store <4 x i32> %1, <4 x i32>* %dst, align 16, !tbaa !1
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (<4 x i32>*, <4 x i32>*, i32)* @shuffle}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
--- /dev/null
+; ModuleID = 'simple_float4.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { ; 2-lane wrapper: calls scalar @_Z3madfff once per lane of %a/%b/%c
+entry:
+ %0 = extractelement <2 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; result starts as undef; fill lane 0
+ %3 = extractelement <2 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 ; fill lane 1
+ ret <2 x float> %vecinit2 ; every lane has been written
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { ; 3-lane wrapper: calls scalar @_Z3madfff once per lane of %a/%b/%c
+entry:
+ %0 = extractelement <3 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; result starts as undef; fill lane 0
+ %3 = extractelement <3 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 ; fill lane 1
+ %6 = extractelement <3 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 ; fill lane 2
+ ret <3 x float> %vecinit4 ; every lane has been written
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { ; 4-lane wrapper: calls scalar @_Z3madfff once per lane of %a/%b/%c
+entry:
+ %0 = extractelement <4 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; result starts as undef; fill lane 0
+ %3 = extractelement <4 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 ; fill lane 1
+ %6 = extractelement <4 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 ; fill lane 2
+ %9 = extractelement <4 x float> %a, i32 3 ; lane 3 operands
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 ; fill lane 3
+ ret <4 x float> %vecinit6 ; every lane has been written
+}
+
+; Kernel: dst[id] = src[id], where id is the value returned by
+; @__gen_ocl_get_global_id0. Both pointers are in addrspace(1) and each
+; element is a <4 x float> accessed with 16-byte alignment.
+define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src) nounwind noinline {
+get_global_id.exit5:
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+ %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i
+ %0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1
+ %arrayidx2 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i
+ store <4 x float> %0, <4 x float> addrspace(1)* %arrayidx2, align 16, !tbaa !1
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x float> addrspace(1)*, <4 x float> addrspace(1)*)* @simple_float4}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-; ModuleID = 'simple_float4.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @simple_float4(<4 x float>* nocapture %dst, <4 x float>* nocapture %src) nounwind noinline {
-get_global_id.exit5:
- %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds <4 x float>* %src, i32 %call.i
- %0 = load <4 x float>* %arrayidx, align 16, !tbaa !1
- %arrayidx2 = getelementptr inbounds <4 x float>* %dst, i32 %call.i
- store <4 x float> %0, <4 x float>* %arrayidx2, align 16, !tbaa !1
- ret void
-}
-
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (<4 x float>*, <4 x float>*)* @simple_float4}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
--- /dev/null
+; ModuleID = 'simple_float4_2.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+; mad(float2, float2, float2), Itanium-mangled as _Z3madDv2_fS_S_.
+; Scalarized: for each of the 2 lanes, extracts that lane from %a, %b and %c,
+; calls the scalar @_Z3madfff on the three floats, and inserts the result into
+; the same lane of the returned vector.
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+; mad(float3, float3, float3), Itanium-mangled as _Z3madDv3_fS_S_.
+; Scalarized: for each of the 3 lanes, extracts that lane from %a, %b and %c,
+; calls the scalar @_Z3madfff on the three floats, and inserts the result into
+; the same lane of the returned vector.
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+; mad(float4, float4, float4), Itanium-mangled as _Z3madDv4_fS_S_.
+; Scalarized: for each of the 4 lanes, extracts that lane from %a, %b and %c,
+; calls the scalar @_Z3madfff on the three floats, and inserts the result into
+; the same lane of the returned vector.
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+; Kernel: dst[id] = src[id] * src[id] (lane-wise fmul of the loaded vector with
+; itself), where id is the value returned by @__gen_ocl_get_global_id0.
+; Both pointers are in addrspace(1).
+define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src) nounwind noinline {
+get_global_id.exit10:
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+ %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i
+ %0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1
+ %mul = fmul <4 x float> %0, %0
+ %arrayidx4 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i
+ store <4 x float> %mul, <4 x float> addrspace(1)* %arrayidx4, align 16, !tbaa !1
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x float> addrspace(1)*, <4 x float> addrspace(1)*)* @simple_float4}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-; ModuleID = 'simple_float4_2.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @simple_float4(<4 x float>* nocapture %dst, <4 x float>* nocapture %src) nounwind noinline {
-get_global_id.exit10:
- %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds <4 x float>* %src, i32 %call.i
- %0 = load <4 x float>* %arrayidx, align 16, !tbaa !1
- %mul = fmul <4 x float> %0, %0
- %arrayidx4 = getelementptr inbounds <4 x float>* %dst, i32 %call.i
- store <4 x float> %mul, <4 x float>* %arrayidx4, align 16, !tbaa !1
- ret void
-}
-
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (<4 x float>*, <4 x float>*)* @simple_float4}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
__kernel void simple_float4(__global float4 *dst, __global float4 *src, bool b)
{
- dst[get_global_id(0)] = select(b, src[get_global_id(0)], src[get_global_id(1)]);
+ /* select(x, y, mask) returns y in every lane whose mask has its most
+  * significant bit set and x otherwise. (int4)(b) would only produce 0 or 1,
+  * which never sets that bit, so b is widened to an all-ones mask here.
+  * Operands are ordered so that b == true yields src[get_global_id(0)],
+  * matching the previous select(b, ...) form of this kernel. */
+ dst[get_global_id(0)] = select(src[get_global_id(1)], src[get_global_id(0)], (int4)(b ? -1 : 0));
dst[get_global_id(0)] += (float4) (src[2].x, 1.f, 2.f, 3.f);
}
--- /dev/null
+; ModuleID = 'simple_float4_3.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+; mad(float2, float2, float2), Itanium-mangled as _Z3madDv2_fS_S_.
+; Scalarized: for each of the 2 lanes, extracts that lane from %a, %b and %c,
+; calls the scalar @_Z3madfff on the three floats, and inserts the result into
+; the same lane of the returned vector.
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+; mad(float3, float3, float3), Itanium-mangled as _Z3madDv3_fS_S_.
+; Scalarized: for each of the 3 lanes, extracts that lane from %a, %b and %c,
+; calls the scalar @_Z3madfff on the three floats, and inserts the result into
+; the same lane of the returned vector.
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+; mad(float4, float4, float4), Itanium-mangled as _Z3madDv4_fS_S_.
+; Scalarized: for each of the 4 lanes, extracts that lane from %a, %b and %c,
+; calls the scalar @_Z3madfff on the three floats, and inserts the result into
+; the same lane of the returned vector.
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+; Kernel taking (dst, src, i1 %b). With id = @__gen_ocl_get_global_id0():
+;   1. stores src[id] to dst[id];
+;   2. builds the vector (src[2] lane 0, 1.0, 2.0, 3.0);
+;   3. stores src[id] + that vector to dst[id] again.
+; NOTE: %b is never read in this body, and no select instruction or
+; get_global_id1 call is present; the result does not depend on %b.
+define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src, i1 %b) nounwind noinline {
+get_global_id.exit16:
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+ %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i
+ %0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1
+ %arrayidx5 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i
+ store <4 x float> %0, <4 x float> addrspace(1)* %arrayidx5, align 16, !tbaa !1
+ ; Fixed element src[2]; only its lane 0 is used below.
+ %arrayidx6 = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 2
+ %1 = load <4 x float> addrspace(1)* %arrayidx6, align 16
+ %2 = extractelement <4 x float> %1, i32 0
+ %vecinit = insertelement <4 x float> undef, float %2, i32 0
+ %vecinit7 = insertelement <4 x float> %vecinit, float 1.000000e+00, i32 1
+ %vecinit8 = insertelement <4 x float> %vecinit7, float 2.000000e+00, i32 2
+ %vecinit9 = insertelement <4 x float> %vecinit8, float 3.000000e+00, i32 3
+ %add = fadd <4 x float> %0, %vecinit9
+ store <4 x float> %add, <4 x float> addrspace(1)* %arrayidx5, align 16, !tbaa !1
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x float> addrspace(1)*, <4 x float> addrspace(1)*, i1)* @simple_float4}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-; ModuleID = 'simple_float4_3.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @simple_float4(<4 x float>* nocapture %dst, <4 x float>* nocapture %src, i1 %b) nounwind noinline {
-get_global_id.exit16:
- %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds <4 x float>* %src, i32 %call.i
- %0 = load <4 x float>* %arrayidx, align 16, !tbaa !1
- %call3.i = tail call ptx_device i32 @__gen_ocl_get_global_id1() nounwind readnone
- %arrayidx2 = getelementptr inbounds <4 x float>* %src, i32 %call3.i
- %1 = load <4 x float>* %arrayidx2, align 16, !tbaa !1
- %x.y.i = select i1 %b, <4 x float> %0, <4 x float> %1
- %arrayidx5 = getelementptr inbounds <4 x float>* %dst, i32 %call.i
- store <4 x float> %x.y.i, <4 x float>* %arrayidx5, align 16, !tbaa !1
- %arrayidx6 = getelementptr inbounds <4 x float>* %src, i32 2
- %2 = load <4 x float>* %arrayidx6, align 16
- %3 = extractelement <4 x float> %2, i32 0
- %vecinit = insertelement <4 x float> undef, float %3, i32 0
- %vecinit7 = insertelement <4 x float> %vecinit, float 1.000000e+00, i32 1
- %vecinit8 = insertelement <4 x float> %vecinit7, float 2.000000e+00, i32 2
- %vecinit9 = insertelement <4 x float> %vecinit8, float 3.000000e+00, i32 3
- %add = fadd <4 x float> %x.y.i, %vecinit9
- store <4 x float> %add, <4 x float>* %arrayidx5, align 16, !tbaa !1
- ret void
-}
-
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
-
-declare ptx_device i32 @__gen_ocl_get_global_id1() nounwind readnone
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (<4 x float>*, <4 x float>*, i1)* @simple_float4}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
__attribute__((pure,const)) unsigned int __gen_ocl_get_local_id0(void);
__attribute__((pure,const)) unsigned int __gen_ocl_get_local_id1(void);
__attribute__((pure,const)) unsigned int __gen_ocl_get_local_id2(void);
+__attribute__ ((pure,const,overloadable)) float mad(float a, float b, float c);
inline unsigned get_global_id(unsigned int dim) {
if (dim == 0) return __gen_ocl_get_global_id0();
typedef bool bool3 __attribute__((ext_vector_type(3)));
typedef bool bool4 __attribute__((ext_vector_type(4)));
-__attribute__((overloadable)) inline int4 select(int4 src0, int4 src1, int4 cond) {
- int4 dst;
- const int x0 = src0.x; // Fix performance issue with CLANG
- const int x1 = src1.x;
- const int y0 = src0.y;
- const int y1 = src1.y;
- const int z0 = src0.z;
- const int z1 = src1.z;
- const int w0 = src0.w;
- const int w1 = src1.w;
+// Declares a 4-component overload of select(src0, src1, cond): each component
+// of the result is taken from src1 when (cond component & MASK) is non-zero and
+// from src0 otherwise. The per-component ternaries are meant to be folded by
+// LLVM into select instructions.
+#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \
+__attribute__((overloadable)) \
+inline TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \
+ TYPE4 dst; \
+ const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \
+ const TYPE x1 = src1.x; \
+ const TYPE y0 = src0.y; \
+ const TYPE y1 = src1.y; \
+ const TYPE z0 = src0.z; \
+ const TYPE z1 = src1.z; \
+ const TYPE w0 = src0.w; \
+ const TYPE w1 = src1.w; \
+ \
+ /* MASK is parenthesized so an expression argument keeps its meaning. */ \
+ dst.x = (cond.x & (MASK)) ? x1 : x0; \
+ dst.y = (cond.y & (MASK)) ? y1 : y0; \
+ dst.z = (cond.z & (MASK)) ? z1 : z0; \
+ dst.w = (cond.w & (MASK)) ? w1 : w0; \
+ return dst; \
+}
+// MASK 0x80000000 tests the most significant bit of each 32-bit cond component.
+DECL_SELECT4(int4, int, int4, 0x80000000)
+DECL_SELECT4(float4, float, int4, 0x80000000)
+#undef DECL_SELECT4
- dst.x = (cond.x & 0x80000000) ? x1 : x0;
- dst.y = (cond.y & 0x80000000) ? y1 : y0;
- dst.z = (cond.z & 0x80000000) ? z1 : z0;
- dst.w = (cond.w & 0x80000000) ? w1 : w0;
- return dst;
+/* float2 mad: each component is computed by the scalar mad overload. */
+__attribute__((overloadable)) float2 mad(float2 a, float2 b, float2 c) {
+ const float lane_x = mad(a.x, b.x, c.x);
+ const float lane_y = mad(a.y, b.y, c.y);
+ return (float2)(lane_x, lane_y);
+}
+/* float3 mad: each component is computed by the scalar mad overload. */
+__attribute__((overloadable)) float3 mad(float3 a, float3 b, float3 c) {
+ const float lane_x = mad(a.x, b.x, c.x);
+ const float lane_y = mad(a.y, b.y, c.y);
+ const float lane_z = mad(a.z, b.z, c.z);
+ return (float3)(lane_x, lane_y, lane_z);
+}
+/* float4 mad: each component is computed by the scalar mad overload. */
+__attribute__((overloadable)) float4 mad(float4 a, float4 b, float4 c) {
+ const float lane_x = mad(a.x, b.x, c.x);
+ const float lane_y = mad(a.y, b.y, c.y);
+ const float lane_z = mad(a.z, b.z, c.z);
+ const float lane_w = mad(a.w, b.w, c.w);
+ return (float4)(lane_x, lane_y, lane_z, lane_w);
}
#define __private __attribute__((address_space(0)))
#define __global __attribute__((address_space(1)))
#define __constant __attribute__((address_space(2)))
-#define __local __attribute__((address_space(3)))
+//#define __local __attribute__((address_space(3)))
#define global __global
-#define local __local
+//#define local __local
#define constant __constant
#define private __private
+#define NULL ((void*)0)
--- /dev/null
+; ModuleID = 'store.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+; mad(float2, float2, float2), Itanium-mangled as _Z3madDv2_fS_S_.
+; Scalarized: for each of the 2 lanes, extracts that lane from %a, %b and %c,
+; calls the scalar @_Z3madfff on the three floats, and inserts the result into
+; the same lane of the returned vector.
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+; mad(float3, float3, float3), Itanium-mangled as _Z3madDv3_fS_S_.
+; Scalarized: for each of the 3 lanes, extracts that lane from %a, %b and %c,
+; calls the scalar @_Z3madfff on the three floats, and inserts the result into
+; the same lane of the returned vector.
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+; mad(float4, float4, float4), Itanium-mangled as _Z3madDv4_fS_S_.
+; Scalarized: for each of the 4 lanes, extracts that lane from %a, %b and %c,
+; calls the scalar @_Z3madfff on the three floats, and inserts the result into
+; the same lane of the returned vector.
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+; Kernel: writes the constant i32 1 to *%dst (addrspace(1)).
+; %dst0 (addrspace(4)) and %x are not referenced in the body.
+define ptx_kernel void @store(i32 addrspace(1)* nocapture %dst, i32 addrspace(4)* nocapture %dst0, i32 %x) nounwind noinline {
+entry:
+ store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32 addrspace(4)*, i32)* @store}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-; ModuleID = 'store.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @store(i32* nocapture %dst, i32 addrspace(4)* nocapture %dst0, i32 %x) nounwind noinline {
-entry:
- store i32 1, i32* %dst, align 4, !tbaa !1
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i32*, i32 addrspace(4)*, i32)* @store}
-!1 = metadata !{metadata !"int", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
--- /dev/null
+; ModuleID = 'struct.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.my_struct = type { i32, [2 x i32] }
+
+@g = addrspace(2) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 4
+@struct_cl.hop = internal addrspace(4) unnamed_addr global %struct.my_struct zeroinitializer, align 4
+@struct_cl.array = internal addrspace(4) global [256 x %struct.my_struct] zeroinitializer, align 4
+
+; mad(float2, float2, float2), Itanium-mangled as _Z3madDv2_fS_S_.
+; Scalarized: for each of the 2 lanes, extracts that lane from %a, %b and %c,
+; calls the scalar @_Z3madfff on the three floats, and inserts the result into
+; the same lane of the returned vector.
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+; mad(float3, float3, float3), Itanium-mangled as _Z3madDv3_fS_S_.
+; Scalarized: for each of the 3 lanes, extracts that lane from %a, %b and %c,
+; calls the scalar @_Z3madfff on the three floats, and inserts the result into
+; the same lane of the returned vector.
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+; mad(float4, float4, float4), Itanium-mangled as _Z3madDv4_fS_S_.
+; Scalarized: for each of the 4 lanes, extracts that lane from %a, %b and %c,
+; calls the scalar @_Z3madfff on the three floats, and inserts the result into
+; the same lane of the returned vector.
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+; Kernel struct_cl(%s byval, %x, %mem, %y). Steps visible in the body, with
+; "array" = @struct_cl.array and "hop" = @struct_cl.hop (both addrspace(4)):
+;   1. for.body: for i in [0, 256): array[i] field 0 = i and
+;      array[i] field 1, element 0 = i + 1.
+;   2. Copies 12 bytes into hop from array[0] when %y == 0, otherwise from
+;      array[%y + 1].
+;   3. Copies 12 bytes from hop back over the start of array.
+;   4. Stores (%s field 0) + 3 + (array[%x] field 0)
+;      + (array[%x + 1] field 1, element 0) + @g[%x] to *%mem (addrspace(1)).
+define ptx_kernel void @struct_cl(%struct.my_struct* nocapture byval %s, i32 %x, i32 addrspace(1)* nocapture %mem, i32 %y) nounwind noinline {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.023 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %a = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %i.023, i32 0
+ store i32 %i.023, i32 addrspace(4)* %a, align 4, !tbaa !1
+ %arrayidx2 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %i.023, i32 1, i32 0
+ ; %add is both the value stored to field 1[0] and the next induction value.
+ %add = add nsw i32 %i.023, 1
+ store i32 %add, i32 addrspace(4)* %arrayidx2, align 4, !tbaa !1
+ %exitcond = icmp eq i32 %add, 256
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ %cmp6 = icmp eq i32 %y, 0
+ br i1 %cmp6, label %if.then, label %if.else
+
+if.then: ; preds = %for.end
+ ; %y == 0: the source is the base of the array, i.e. element 0.
+ tail call void @llvm.memcpy.p4i8.p4i8.i32(i8 addrspace(4)* bitcast (%struct.my_struct addrspace(4)* @struct_cl.hop to i8 addrspace(4)*), i8 addrspace(4)* bitcast ([256 x %struct.my_struct] addrspace(4)* @struct_cl.array to i8 addrspace(4)*), i32 12, i32 4, i1 false)
+ br label %if.end
+
+if.else: ; preds = %for.end
+ %add8 = add nsw i32 %y, 1
+ %arrayidx9 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %add8
+ %0 = bitcast %struct.my_struct addrspace(4)* %arrayidx9 to i8 addrspace(4)*
+ tail call void @llvm.memcpy.p4i8.p4i8.i32(i8 addrspace(4)* bitcast (%struct.my_struct addrspace(4)* @struct_cl.hop to i8 addrspace(4)*), i8 addrspace(4)* %0, i32 12, i32 4, i1 false)
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ tail call void @llvm.memcpy.p4i8.p4i8.i32(i8 addrspace(4)* bitcast ([256 x %struct.my_struct] addrspace(4)* @struct_cl.array to i8 addrspace(4)*), i8 addrspace(4)* bitcast (%struct.my_struct addrspace(4)* @struct_cl.hop to i8 addrspace(4)*), i32 12, i32 4, i1 false)
+ %a10 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 0
+ %1 = load i32* %a10, align 4, !tbaa !1
+ %a12 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %x, i32 0
+ %2 = load i32 addrspace(4)* %a12, align 4, !tbaa !1
+ %add14 = add nsw i32 %x, 1
+ %arrayidx17 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %add14, i32 1, i32 0
+ %3 = load i32 addrspace(4)* %arrayidx17, align 4, !tbaa !1
+ %arrayidx19 = getelementptr inbounds [4 x i32] addrspace(2)* @g, i32 0, i32 %x
+ %4 = load i32 addrspace(2)* %arrayidx19, align 4, !tbaa !1
+ ; Sum of the four loaded values plus the constant 3.
+ %add13 = add i32 %1, 3
+ %add18 = add i32 %add13, %2
+ %add20 = add i32 %add18, %3
+ %add21 = add i32 %add20, %4
+ store i32 %add21, i32 addrspace(1)* %mem, align 4, !tbaa !1
+ ret void
+}
+
+declare void @llvm.memcpy.p4i8.p4i8.i32(i8 addrspace(4)* nocapture, i8 addrspace(4)* nocapture, i32, i32, i1) nounwind
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (%struct.my_struct*, i32, i32 addrspace(1)*, i32)* @struct_cl}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-; ModuleID = 'struct.o'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-%struct.my_struct = type { i32, [2 x i32] }
-
-@g = constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 16
-@struct_cl.hop = internal global %struct.my_struct zeroinitializer, align 4
-@struct_cl.array = internal global [256 x %struct.my_struct] zeroinitializer, align 16
-
-define void @struct_cl(i64 %s.coerce0, i32 %s.coerce1, i32 %x, i32* %mem, i32 %y) nounwind uwtable {
-entry:
- %s = alloca %struct.my_struct, align 8
- %x.addr = alloca i32, align 4
- %mem.addr = alloca i32*, align 8
- %y.addr = alloca i32, align 4
- %i = alloca i32, align 4
- %0 = bitcast %struct.my_struct* %s to { i64, i32 }*
- %1 = getelementptr { i64, i32 }* %0, i32 0, i32 0
- store i64 %s.coerce0, i64* %1
- %2 = getelementptr { i64, i32 }* %0, i32 0, i32 1
- store i32 %s.coerce1, i32* %2
- store i32 %x, i32* %x.addr, align 4
- store i32* %mem, i32** %mem.addr, align 8
- store i32 %y, i32* %y.addr, align 4
- store i32 0, i32* %i, align 4
- br label %for.cond
-
-for.cond: ; preds = %for.inc, %entry
- %3 = load i32* %i, align 4
- %cmp = icmp slt i32 %3, 256
- br i1 %cmp, label %for.body, label %for.end
-
-for.body: ; preds = %for.cond
- %4 = load i32* %i, align 4
- %5 = load i32* %i, align 4
- %idxprom = sext i32 %5 to i64
- %arrayidx = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom
- %a = getelementptr inbounds %struct.my_struct* %arrayidx, i32 0, i32 0
- store i32 %4, i32* %a, align 4
- %6 = load i32* %i, align 4
- %7 = load i32* %i, align 4
- %idxprom1 = sext i32 %7 to i64
- %arrayidx2 = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom1
- %b = getelementptr inbounds %struct.my_struct* %arrayidx2, i32 0, i32 1
- %arrayidx3 = getelementptr inbounds [2 x i32]* %b, i32 0, i64 0
- store i32 %6, i32* %arrayidx3, align 4
- %8 = load i32* %i, align 4
- %add = add nsw i32 %8, 1
- %9 = load i32* %i, align 4
- %idxprom4 = sext i32 %9 to i64
- %arrayidx5 = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom4
- %b6 = getelementptr inbounds %struct.my_struct* %arrayidx5, i32 0, i32 1
- %arrayidx7 = getelementptr inbounds [2 x i32]* %b6, i32 0, i64 0
- store i32 %add, i32* %arrayidx7, align 4
- br label %for.inc
-
-for.inc: ; preds = %for.body
- %10 = load i32* %i, align 4
- %inc = add nsw i32 %10, 1
- store i32 %inc, i32* %i, align 4
- br label %for.cond
-
-for.end: ; preds = %for.cond
- %11 = load i32* %y.addr, align 4
- %cmp8 = icmp eq i32 %11, 0
- br i1 %cmp8, label %if.then, label %if.else
-
-if.then: ; preds = %for.end
- %12 = load i32* %y.addr, align 4
- %idxprom9 = sext i32 %12 to i64
- %arrayidx10 = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom9
- %13 = bitcast %struct.my_struct* %arrayidx10 to i8*
- call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.my_struct* @struct_cl.hop to i8*), i8* %13, i64 12, i32 4, i1 false)
- br label %if.end
-
-if.else: ; preds = %for.end
- %14 = load i32* %y.addr, align 4
- %add11 = add nsw i32 %14, 1
- %idxprom12 = sext i32 %add11 to i64
- %arrayidx13 = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom12
- %15 = bitcast %struct.my_struct* %arrayidx13 to i8*
- call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.my_struct* @struct_cl.hop to i8*), i8* %15, i64 12, i32 4, i1 false)
- br label %if.end
-
-if.end: ; preds = %if.else, %if.then
- call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast ([256 x %struct.my_struct]* @struct_cl.array to i8*), i8* bitcast (%struct.my_struct* @struct_cl.hop to i8*), i64 12, i32 4, i1 false)
- %a14 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 0
- %16 = load i32* %a14, align 4
- %17 = load i32* %x.addr, align 4
- %idxprom15 = sext i32 %17 to i64
- %arrayidx16 = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom15
- %a17 = getelementptr inbounds %struct.my_struct* %arrayidx16, i32 0, i32 0
- %18 = load i32* %a17, align 4
- %add18 = add nsw i32 %16, %18
- %19 = load i32* %x.addr, align 4
- %add19 = add nsw i32 %19, 1
- %idxprom20 = sext i32 %add19 to i64
- %arrayidx21 = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom20
- %b22 = getelementptr inbounds %struct.my_struct* %arrayidx21, i32 0, i32 1
- %arrayidx23 = getelementptr inbounds [2 x i32]* %b22, i32 0, i64 0
- %20 = load i32* %arrayidx23, align 4
- %add24 = add nsw i32 %add18, %20
- %21 = load i32* %x.addr, align 4
- %idxprom25 = sext i32 %21 to i64
- %arrayidx26 = getelementptr inbounds [4 x i32]* @g, i32 0, i64 %idxprom25
- %22 = load i32* %arrayidx26, align 4
- %add27 = add nsw i32 %add24, %22
- %23 = load i32* getelementptr inbounds ([4 x i32]* @g, i32 0, i64 3), align 4
- %add28 = add nsw i32 %add27, %23
- %24 = load i32** %mem.addr, align 8
- %arrayidx29 = getelementptr inbounds i32* %24, i64 0
- store i32 %add28, i32* %arrayidx29
- ret void
-}
-
-declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i64, i32, i32, i32*, i32)* @struct_cl}
--- /dev/null
+; ModuleID = 'struct2.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.my_struct = type { i32, [2 x i32] }
+
+@g = addrspace(2) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 4
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { ; lane-wise wrapper for <2 x float>: one scalar @_Z3madfff call per lane of %a, %b, %c
+entry:
+ %0 = extractelement <2 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; result starts as undef, lane 0 filled here
+ %3 = extractelement <2 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 ; lane 1 filled, vector complete
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { ; lane-wise wrapper for <3 x float>: one scalar @_Z3madfff call per lane of %a, %b, %c
+entry:
+ %0 = extractelement <3 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; result starts as undef, lane 0 filled here
+ %3 = extractelement <3 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 ; lane 2 filled, vector complete
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { ; lane-wise wrapper for <4 x float>: one scalar @_Z3madfff call per lane of %a, %b, %c
+entry:
+ %0 = extractelement <4 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; result starts as undef, lane 0 filled here
+ %3 = extractelement <4 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3 ; lane 3 operands
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 ; lane 3 filled, vector complete
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @struct_cl(%struct.my_struct* nocapture byval %s, i32 %x, %struct.my_struct addrspace(1)* nocapture %mem, i32 %y) nounwind noinline { ; kernel: writes one my_struct to %mem - a copy of byval %s when %y != 0, else the constants {1, [2, 2]}; %x is not read
+entry:
+ %cmp = icmp eq i32 %y, 0
+ br i1 %cmp, label %if.end, label %if.else ; %y == 0 skips the loads and goes straight to the stores
+
+if.else: ; preds = %entry
+ %s.0 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 0 ; field 0 of %s
+ %tmp4 = load i32* %s.0, align 4
+ %s.1.0 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 1, i32 0 ; field 1, element 0
+ %tmp5 = load i32* %s.1.0, align 4
+ %s.1.1 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 1, i32 1 ; field 1, element 1
+ %tmp6 = load i32* %s.1.1, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %entry
+ %hop.1.1.0 = phi i32 [ %tmp6, %if.else ], [ 2, %entry ] ; each phi picks the value loaded from %s or the constant used when %y == 0
+ %hop.1.0.0 = phi i32 [ %tmp5, %if.else ], [ 2, %entry ]
+ %hop.0.0 = phi i32 [ %tmp4, %if.else ], [ 1, %entry ]
+ %mem.0 = getelementptr inbounds %struct.my_struct addrspace(1)* %mem, i32 0, i32 0
+ store i32 %hop.0.0, i32 addrspace(1)* %mem.0, align 4 ; field-by-field store of the selected struct into %mem
+ %mem.1.0 = getelementptr inbounds %struct.my_struct addrspace(1)* %mem, i32 0, i32 1, i32 0
+ store i32 %hop.1.0.0, i32 addrspace(1)* %mem.1.0, align 4
+ %mem.1.1 = getelementptr inbounds %struct.my_struct addrspace(1)* %mem, i32 0, i32 1, i32 1
+ store i32 %hop.1.1.0, i32 addrspace(1)* %mem.1.1, align 4
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (%struct.my_struct*, i32, %struct.my_struct addrspace(1)*, i32)* @struct_cl}
+++ /dev/null
-; ModuleID = 'struct2.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-%struct.my_struct = type { i32, [2 x i32] }
-
-@g = addrspace(1) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 4
-
-define ptx_kernel void @struct_cl(%struct.my_struct* nocapture byval %s, i32 %x, %struct.my_struct* nocapture %mem, i32 %y) nounwind noinline {
-entry:
- %cmp = icmp eq i32 %y, 0
- br i1 %cmp, label %if.end, label %if.else
-
-if.else: ; preds = %entry
- %s.0 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 0
- %tmp4 = load i32* %s.0, align 4
- %s.1.0 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 1, i32 0
- %tmp5 = load i32* %s.1.0, align 4
- %s.1.1 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 1, i32 1
- %tmp6 = load i32* %s.1.1, align 4
- br label %if.end
-
-if.end: ; preds = %if.else, %entry
- %hop.1.1.0 = phi i32 [ %tmp6, %if.else ], [ 2, %entry ]
- %hop.1.0.0 = phi i32 [ %tmp5, %if.else ], [ 2, %entry ]
- %hop.0.0 = phi i32 [ %tmp4, %if.else ], [ 1, %entry ]
- %mem.0 = getelementptr inbounds %struct.my_struct* %mem, i32 0, i32 0
- store i32 %hop.0.0, i32* %mem.0, align 4
- %mem.1.0 = getelementptr inbounds %struct.my_struct* %mem, i32 0, i32 1, i32 0
- store i32 %hop.1.0.0, i32* %mem.1.0, align 4
- %mem.1.1 = getelementptr inbounds %struct.my_struct* %mem, i32 0, i32 1, i32 1
- store i32 %hop.1.1.0, i32* %mem.1.1, align 4
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (%struct.my_struct*, i32, %struct.my_struct*, i32)* @struct_cl}
--- /dev/null
+; ModuleID = 'test_select.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { ; lane-wise wrapper for <2 x float>: one scalar @_Z3madfff call per lane of %a, %b, %c
+entry:
+ %0 = extractelement <2 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; result starts as undef, lane 0 filled here
+ %3 = extractelement <2 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 ; lane 1 filled, vector complete
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { ; lane-wise wrapper for <3 x float>: one scalar @_Z3madfff call per lane of %a, %b, %c
+entry:
+ %0 = extractelement <3 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; result starts as undef, lane 0 filled here
+ %3 = extractelement <3 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 ; lane 2 filled, vector complete
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { ; lane-wise wrapper for <4 x float>: one scalar @_Z3madfff call per lane of %a, %b, %c
+entry:
+ %0 = extractelement <4 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; result starts as undef, lane 0 filled here
+ %3 = extractelement <4 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3 ; lane 3 operands
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 ; lane 3 filled, vector complete
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @test_select(i32 addrspace(1)* nocapture %dst, i32 addrspace(1)* nocapture %src) nounwind noinline { ; kernel: %dst[id] = (%src[id] > 1) ? 1 : 2, with id taken from @__gen_ocl_get_global_id0
+get_global_id.exit7:
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone ; id, used to index both buffers
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %src, i32 %call.i
+ %0 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %cmp = icmp sgt i32 %0, 1 ; signed comparison against 1
+ %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %call.i
+ %. = select i1 %cmp, i32 1, i32 2 ; branch-free choice between the two constants
+ store i32 %., i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32 addrspace(1)*)* @test_select}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-; ModuleID = 'test_select.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @test_select(i32* nocapture %dst, i32* nocapture %src) nounwind noinline {
-get_global_id.exit7:
- %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds i32* %src, i32 %call.i
- %0 = load i32* %arrayidx, align 4, !tbaa !1
- %cmp = icmp sgt i32 %0, 1
- %arrayidx2 = getelementptr inbounds i32* %dst, i32 %call.i
- %. = select i1 %cmp, i32 1, i32 2
- store i32 %., i32* %arrayidx2, align 4
- ret void
-}
-
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i32*, i32*)* @test_select}
-!1 = metadata !{metadata !"int", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
--- /dev/null
+; ModuleID = 'undefined.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { ; lane-wise wrapper for <2 x float>: one scalar @_Z3madfff call per lane of %a, %b, %c
+entry:
+ %0 = extractelement <2 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; result starts as undef, lane 0 filled here
+ %3 = extractelement <2 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 ; lane 1 filled, vector complete
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { ; lane-wise wrapper for <3 x float>: one scalar @_Z3madfff call per lane of %a, %b, %c
+entry:
+ %0 = extractelement <3 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; result starts as undef, lane 0 filled here
+ %3 = extractelement <3 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 ; lane 2 filled, vector complete
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { ; lane-wise wrapper for <4 x float>: one scalar @_Z3madfff call per lane of %a, %b, %c
+entry:
+ %0 = extractelement <4 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; result starts as undef, lane 0 filled here
+ %3 = extractelement <4 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3 ; lane 3 operands
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 ; lane 3 filled, vector complete
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @undefined(i32 addrspace(1)* nocapture %dst) nounwind noinline { ; kernel: unconditionally stores the constant 1 through %dst
+entry:
+ store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1 ; the only memory access in the kernel
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*)* @undefined}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
+++ /dev/null
-; ModuleID = 'undefined.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @undefined(i32* %dst) nounwind noinline {
-entry:
- %dst.addr = alloca i32*, align 4
- %x = alloca i32, align 4
- store i32* %dst, i32** %dst.addr, align 4
- %0 = load i32* %x, align 4
- %cmp = icmp eq i32 %0, 0
- br i1 %cmp, label %if.then, label %if.else
-
-if.then: ; preds = %entry
- %1 = load i32** %dst.addr, align 4
- %arrayidx = getelementptr inbounds i32* %1, i32 0
- store i32 0, i32* %arrayidx
- br label %if.end
-
-if.else: ; preds = %entry
- %2 = load i32** %dst.addr, align 4
- %arrayidx1 = getelementptr inbounds i32* %2, i32 0
- store i32 1, i32* %arrayidx1
- br label %if.end
-
-if.end: ; preds = %if.else, %if.then
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i32*)* @undefined}
--- /dev/null
+#include "stdlib.h"
+
+/* Each work-item adds the constant vector (0, 1, 2, 3) to the src element
+ * selected by get_global_id(0) and stores the sum at the same index of dst. */
+__kernel void simple_float4(__global float4 *dst, __global float4 *src)
+{
+ const float4 increment = (float4)(0.f,1.f,2.f,3.f);
+ dst[get_global_id(0)] = src[get_global_id(0)] + increment;
+}
+
--- /dev/null
+; ModuleID = 'vector_constant.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { ; lane-wise wrapper for <2 x float>: one scalar @_Z3madfff call per lane of %a, %b, %c
+entry:
+ %0 = extractelement <2 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; result starts as undef, lane 0 filled here
+ %3 = extractelement <2 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 ; lane 1 filled, vector complete
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { ; lane-wise wrapper for <3 x float>: one scalar @_Z3madfff call per lane of %a, %b, %c
+entry:
+ %0 = extractelement <3 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; result starts as undef, lane 0 filled here
+ %3 = extractelement <3 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 ; lane 2 filled, vector complete
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { ; lane-wise wrapper for <4 x float>: one scalar @_Z3madfff call per lane of %a, %b, %c
+entry:
+ %0 = extractelement <4 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; result starts as undef, lane 0 filled here
+ %3 = extractelement <4 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3 ; lane 3 operands
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 ; lane 3 filled, vector complete
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src) nounwind noinline { ; kernel: %dst[id] = %src[id] + <0, 1, 2, 3>, with id taken from @__gen_ocl_get_global_id0
+get_global_id.exit5:
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone ; id, used to index both buffers
+ %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i
+ %0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1
+ %add = fadd <4 x float> %0, <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00> ; the vector constant appears as an immediate operand
+ %arrayidx2 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i
+ store <4 x float> %add, <4 x float> addrspace(1)* %arrayidx2, align 16, !tbaa !1
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x float> addrspace(1)*, <4 x float> addrspace(1)*)* @simple_float4}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
--- /dev/null
+; ModuleID = 'void.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { ; lane-wise wrapper for <2 x float>: one scalar @_Z3madfff call per lane of %a, %b, %c
+entry:
+ %0 = extractelement <2 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0 ; result starts as undef, lane 0 filled here
+ %3 = extractelement <2 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 ; lane 1 filled, vector complete
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { ; lane-wise wrapper for <3 x float>: one scalar @_Z3madfff call per lane of %a, %b, %c
+entry:
+ %0 = extractelement <3 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0 ; result starts as undef, lane 0 filled here
+ %3 = extractelement <3 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 ; lane 2 filled, vector complete
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { ; lane-wise wrapper for <4 x float>: one scalar @_Z3madfff call per lane of %a, %b, %c
+entry:
+ %0 = extractelement <4 x float> %a, i32 0 ; lane 0 operands
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0 ; result starts as undef, lane 0 filled here
+ %3 = extractelement <4 x float> %a, i32 1 ; lane 1 operands
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2 ; lane 2 operands
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3 ; lane 3 operands
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 ; lane 3 filled, vector complete
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @hop() nounwind readnone noinline { ; kernel with no arguments and an empty body
+entry:
+ ret void ; returns immediately; no loads, stores or calls
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void ()* @hop}
+++ /dev/null
-; ModuleID = 'void.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @hop() nounwind readnone noinline {
-entry:
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void ()* @hop}
}
}
+ // Index of the first special register: 0 under the OCL profile, ~0u for
+ // any other profile
+ uint32_t Function::getFirstSpecialReg(void) const {
+   if (this->profile != PROFILE_OCL)
+     return ~0u;
+   return 0u;
+ }
+
+ // Number of special registers: ocl::regNum under the OCL profile, ~0u for
+ // any other profile
+ uint32_t Function::getSpecialRegNum(void) const {
+   if (this->profile != PROFILE_OCL)
+     return ~0u;
+   return ocl::regNum;
+ }
+
void Function::computeCFG(void) {
// Clear possible previously computed CFG
this->foreachBlock([this](BasicBlock &bb) {
jumpToNext = NULL;
}
if (bb.last == NULL) return;
- GBE_ASSERT(bb.last->isMemberOf<BranchInstruction>() == true);
+ if (bb.last->isMemberOf<BranchInstruction>() == false) {
+ jumpToNext = &bb;
+ return;
+ }
const BranchInstruction &insn = cast<BranchInstruction>(*bb.last);
if (insn.getOpcode() == OP_BRA) {
const LabelIndex label = insn.getLabelIndex();
GBE_ASSERT(blocks[ID] != NULL);
return *blocks[ID];
}
+ /*! Get the first index of the special registers and number of them */
+ uint32_t getFirstSpecialReg(void) const;
+ uint32_t getSpecialRegNum(void) const;
+ /*! Indicate if the given register is a special one, i.e. if its index lies
+  *  in [getFirstSpecialReg(), getFirstSpecialReg() + getSpecialRegNum())
+  */
+ INLINE bool isSpecialReg(const Register& reg) const {
+ const uint32_t ID = uint32_t(reg);
+ const uint32_t firstID = this->getFirstSpecialReg();
+ const uint32_t specialNum = this->getSpecialRegNum();
+ // Same bound expression as the loops that enumerate the special registers
+ return ID >= firstID && ID < firstID + specialNum;
+ }
/*! Create a new label (still not bound to a basic block) */
LabelIndex newLabel(void);
/*! Create the control flow graph */
/*! Return the complete liveness info */
INLINE const Info &getLivenessInfo(void) const { return liveness; }
/*! Return the complete block info */
- INLINE const BlockInfo &getBlockInfo(const BasicBlock &bb) const {
- auto it = liveness.find(&bb);
+ INLINE const BlockInfo &getBlockInfo(const BasicBlock *bb) const {
+ auto it = liveness.find(bb);
GBE_ASSERT(it != liveness.end() && it->second != NULL);
return *it->second;
}
/*! Initialize liveOut with the instruction destination values */
void initializeInstructionDst(void);
/*! Initialize liveOut with the function argument */
- void initializeFunctionInput(void);
+ void initializeFunctionInputAndSpecialReg(void);
/*! Iterate to completely transfer the liveness and get the def sets */
void iterateLiveOut(void);
};
liveness(liveness), dag(dag)
{
this->initializeInstructionDst();
- this->initializeFunctionInput();
+ this->initializeFunctionInputAndSpecialReg();
this->iterateLiveOut();
}
for (auto def = predDef.begin(); def != predDef.end(); ++def)
udChain.insert(*def);
}
+
+ // If this is the top block we must take into account both function
+ // arguments and special registers
+ const Function &fn = bb.getParent();
+ if (fn.isEntryBlock(bb) == false) return;
+
+ // Is it a function input? Do not return when it is not: the register must
+ // still be tested against the special registers below
+ const FunctionInput *input = fn.getInput(reg);
+ if (input != NULL) {
+   ValueDef *def = (ValueDef *) dag.getDefAddress(input);
+   udChain.insert(def);
+ }
+
+ // Is it a special register?
+ if (fn.isSpecialReg(reg) == true) {
+   ValueDef *def = (ValueDef *) dag.getDefAddress(reg);
+   udChain.insert(def);
+ }
}
void LiveOutSet::initializeInstructionDst(void) {
defMap.insert(std::make_pair(&bb, blockDefMap));
// We only consider liveout registers
- const auto &info = this->liveness.getBlockInfo(bb);
+ const auto &info = this->liveness.getBlockInfo(&bb);
const auto &liveOut = info.liveOut;
for (auto it = liveOut.begin(); it != liveOut.end(); ++it) {
GBE_ASSERT(blockDefMap->find(*it) == blockDefMap->end());
const uint32_t dstNum = insn.getDstNum();
for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
const Register reg = insn.getDstIndex(fn, dstID);
- std::cout << "reg" << reg << std::endl;
// We only take the most recent definition
if (defined.contains(reg) == true) continue;
// Not in LiveOut, so does not matter
});
}
- void LiveOutSet::initializeFunctionInput(void) {
+ void LiveOutSet::initializeFunctionInputAndSpecialReg(void) {
const Function &fn = liveness.getFunction();
const uint32_t inputNum = fn.inputNum();
// The first block must also transfer the function arguments
const BasicBlock &top = fn.getBlock(0);
- const Liveness::BlockInfo &info = this->liveness.getBlockInfo(top);
+ const Liveness::BlockInfo &info = this->liveness.getBlockInfo(&top);
GBE_ASSERT(defMap.contains(&top) == true);
auto blockDefMap = defMap.find(&top)->second;
GBE_ASSERT(it != blockDefMap->end());
it->second->insert(def);
}
+
+ // Now transfer the special registers that are not over-written
+ const uint32_t firstID = fn.getFirstSpecialReg();
+ const uint32_t specialNum = fn.getSpecialRegNum();
+ for (uint32_t regID = firstID; regID < firstID + specialNum; ++regID) {
+ const Register reg(regID);
+ // Do not transfer dead values
+ if (info.inLiveOut(reg) == false) continue;
+ // If we overwrite it, do not transfer the initial value
+ if (info.inVarKill(reg) == true) continue;
+ ValueDef *def = (ValueDef*) this->dag.getDefAddress(reg);
+ auto it = blockDefMap->find(reg);
+ GBE_ASSERT(it != blockDefMap->end());
+ it->second->insert(def);
+ }
}
void LiveOutSet::iterateLiveOut(void) {
});
// Iterate over all alive registers to get their definitions
- out << "LiveSet:" << std::endl;
const LiveOutSet::BlockDefMap *defMap = it->second;
+ if (defMap->size() > 0) out << "LiveSet:" << std::endl;
for (auto regIt = defMap->begin(); regIt != defMap->end(); ++regIt) {
const Register reg = regIt->first;
const LiveOutSet::RegDefSet *set = regIt->second;
for (auto def = set->begin(); def != set->end(); ++def) {
const ValueDef::Type type = (*def)->getType();
- if (type == ValueDef::FUNCTION_INPUT)
+ if (type == ValueDef::DEF_FN_INPUT)
out << "%" << reg << ": " << "function input" << std::endl;
- else if (type == ValueDef::INSTRUCTION_DST) {
+ else if (type == ValueDef::DEF_SPECIAL_REG)
+ out << "%" << reg << ": " << "special register" << std::endl;
+ else {
const Instruction *insn = (*def)->getInstruction();
out << "%" << reg << ": " << insn << " " << *insn << std::endl;
}
duGraph.insert(std::make_pair(*valueDef, duEmpty));
}
+ // Special registers are also definitions
+ const uint32_t firstID = fn.getFirstSpecialReg();
+ const uint32_t specialNum = fn.getSpecialRegNum();
+ for (uint32_t regID = firstID; regID < firstID + specialNum; ++regID) {
+ const Register reg(regID);
+ ValueDef *valueDef = this->newValueDef(reg);
+ defName.insert(std::make_pair(*valueDef, valueDef));
+ duGraph.insert(std::make_pair(*valueDef, duEmpty));
+ }
+
// We create the liveOutSet to help us transfer the definitions
LiveOutSet liveOutSet(liveness, *this);
GBE_ASSERT(it != defName.end() && it->second != NULL);
return it->second;
}
+ /*! Get the pointer to the special register definition *as stored in the DAG* */
+ const ValueDef *FunctionDAG::getDefAddress(const Register &reg) const {
+ // Build a temporary key only to look the stored definition up in defName
+ const ValueDef def(reg);
+ auto it = defName.find(def);
+ // The definition must have been registered beforehand
+ GBE_ASSERT(it != defName.end() && it->second != NULL);
+ return it->second;
+ }
const ValueUse *FunctionDAG::getUseAddress(const Instruction *insn, uint32_t srcID) const {
const ValueUse use(insn, srcID);
auto it = useName.find(use);
const Register reg = insn.getSrcIndex(fn, srcID);
const auto &defs = dag.getDef(&insn, srcID);
for (auto it = defs.begin(); it != defs.end(); ++it) {
- if ((*it)->getType() == ValueDef::FUNCTION_INPUT)
+ if ((*it)->getType() == ValueDef::DEF_FN_INPUT)
out << " %" << reg << " # function argument" << std::endl;
- else if ((*it)->getType() == ValueDef::INSTRUCTION_DST) {
+ else if ((*it)->getType() == ValueDef::DEF_SPECIAL_REG)
+ out << " %" << reg << " # special register" << std::endl;
+ else {
const Instruction *other = (*it)->getInstruction();
out << " %" << reg << " " << other << ": " << *other << std::endl;
}
public:
/*! Discriminates the kind of values */
enum Type : uint32_t {
- FUNCTION_INPUT = 0,
- INSTRUCTION_DST = 1
+ DEF_FN_INPUT = 0,
+ DEF_INSN_DST = 1,
+ DEF_SPECIAL_REG = 2
};
/*! Build a value from an instruction destination */
- ValueDef(const Instruction *insn, uint32_t dstID = 0u) :
- type(INSTRUCTION_DST)
+ explicit ValueDef(const Instruction *insn, uint32_t dstID = 0u) :
+ type(DEF_INSN_DST)
{
this->data.insn = insn;
this->data.dstID = dstID;
}
/*! Build a value from a function argument */
- ValueDef(const FunctionInput *input) : type(FUNCTION_INPUT) {
+ explicit ValueDef(const FunctionInput *input) : type(DEF_FN_INPUT) {
this->data.input = input;
}
+ /*! Build a value from a special register */
+ explicit ValueDef(const Register &reg) : type(DEF_SPECIAL_REG) {
+ // Only the register index is kept for this kind of definition
+ this->data.regID = uint32_t(reg);
+ }
/*! Get the type of the value */
INLINE Type getType(void) const { return type; }
/*! Get the instruction (only if this is a instruction value) */
INLINE const Instruction *getInstruction(void) const {
- GBE_ASSERT(type == INSTRUCTION_DST);
+ GBE_ASSERT(type == DEF_INSN_DST);
return data.insn;
}
/*! Get the destination ID (only if this is a instruction value) */
INLINE uint32_t getDstID(void) const {
- GBE_ASSERT(type == INSTRUCTION_DST);
+ GBE_ASSERT(type == DEF_INSN_DST);
return data.dstID;
}
/*! Get the function input (only if this is a function argument) */
INLINE const FunctionInput *getFunctionInput(void) const {
- GBE_ASSERT(type == FUNCTION_INPUT);
+ GBE_ASSERT(type == DEF_FN_INPUT);
return data.input;
}
+ /*! Get the register (only if this is a special register value) */
+ INLINE Register getRegister(void) const {
+ GBE_ASSERT(type == DEF_SPECIAL_REG);
+ return Register(data.regID);
+ }
private:
/*! Instruction or function argument */
const Instruction *insn; //<! Instruction itself
uint32_t dstID; //<! Which destination we take into account
};
- /*! ... function argument */
+ /*! ... function argument or ... */
const FunctionInput *input;
+ /*! ... special register */
+ uint32_t regID;
} data;
/*!< Function argument or instruction dst? */
Type type;
const ValueDef::Type type0 = def0.getType();
const ValueDef::Type type1 = def1.getType();
if (type0 != type1) return uint32_t(type0) < uint32_t(type1);
- if (type0 == ValueDef::FUNCTION_INPUT) {
+ if (type0 == ValueDef::DEF_FN_INPUT) {
const FunctionInput *in0 = def0.getFunctionInput();
const FunctionInput *in1 = def1.getFunctionInput();
return uintptr_t(in0) < uintptr_t(in1);
+ } else if (type0 == ValueDef::DEF_SPECIAL_REG) {
+ const Register reg0 = def0.getRegister();
+ const Register reg1 = def1.getRegister();
+ return uint32_t(reg0) < uint32_t(reg1);
} else {
const Instruction *insn0 = def0.getInstruction();
const Instruction *insn1 = def1.getInstruction();
{
public:
/*! Build a value use */
- ValueUse(const Instruction *insn, uint32_t srcID = 0u) :
+ explicit ValueUse(const Instruction *insn, uint32_t srcID = 0u) :
insn(insn), srcID(srcID) {}
/*! Get the instruction of the use */
const Instruction *getInstruction(void) const { return insn; }
const DUChain &getUse(const Instruction *insn, uint32_t dstID) const;
/*! Get the du-chain for the given function input */
const DUChain &getUse(const FunctionInput *input) const;
+ /*! Get the du-chain for the given special register */
+ const DUChain &getUse(const Register &reg) const;
/*! Get the ud-chain for the instruction and source */
const UDChain &getDef(const Instruction *insn, uint32_t srcID) const;
/*! Get the pointer to the definition *as stored in the DAG* */
const ValueDef *getDefAddress(const Instruction *insn, uint32_t dstID) const;
/*! Get the pointer to the definition *as stored in the DAG* */
const ValueDef *getDefAddress(const FunctionInput *input) const;
+ /*! Get the pointer to the special register definition *as stored in the DAG* */
+ const ValueDef *getDefAddress(const Register &reg) const;
/*! Get the pointer to the use *as stored in the DAG* */
const ValueUse *getUseAddress(const Instruction *insn, uint32_t srcID) const;
/*! Get the function we have the graph for */
/*! Return a valid register from an operand (can use LOADI to make one) */
INLINE ir::Register getRegister(Value *value, uint32_t index = 0);
/*! Create a new immediate from a constant */
- ir::ImmediateIndex newImmediate(Constant *CPV);
+ ir::ImmediateIndex newImmediate(Constant *CPV, uint32_t index = 0);
/*! Insert a new label index when this is a scalar value */
INLINE void newLabelIndex(const BasicBlock *bb);
/*! Inspect the terminator instruction and try to see if we should invert
}
template <typename U, typename T>
- static U processConstant(Constant *CPV, T doIt)
+ static U processConstant(Constant *CPV, T doIt, uint32_t index = 0u)
{
if (dyn_cast<ConstantExpr>(CPV))
GBE_ASSERTM(false, "Unsupported constant expression");
else if (isa<UndefValue>(CPV) && CPV->getType()->isSingleValueType())
GBE_ASSERTM(false, "Unsupported constant expression");
+ if (ConstantVector *CV = dyn_cast<ConstantVector>(CPV)) {
+ const uint32_t elemNum = CV->getNumOperands();
+ GBE_ASSERTM(index < elemNum, "Out-of-bound constant vector access");
+ CPV = cast<Constant>(CV->getOperand(index));
+ }
+
// Integers
if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
Type* Ty = CI->getType();
ir::Context &ctx;
};
- ir::ImmediateIndex GenWriter::newImmediate(Constant *CPV) {
- return processConstant<ir::ImmediateIndex>(CPV, NewImmediateFunctor(ctx));
+ ir::ImmediateIndex GenWriter::newImmediate(Constant *CPV, uint32_t index) {
+ return processConstant<ir::ImmediateIndex>(CPV, NewImmediateFunctor(ctx), index);
}
void GenWriter::newRegister(Value *value) {
};
}
- ir::Register GenWriter::getRegister(Value *value, uint32_t index) {
+ ir::Register GenWriter::getRegister(Value *value, uint32_t elemID) {
Constant *CPV = dyn_cast<Constant>(value);
- if (CPV && !isa<GlobalValue>(CPV)) {
- const ir::ImmediateIndex index = this->newImmediate(CPV);
- const ir::Immediate imm = ctx.getImmediate(index);
+ if (CPV) {
+ GBE_ASSERT(isa<GlobalValue>(CPV) == false);
+ const ir::ImmediateIndex immIndex = this->newImmediate(CPV, elemID);
+ const ir::Immediate imm = ctx.getImmediate(immIndex);
const ir::Register reg = ctx.reg(getFamily(imm.type));
- ctx.LOADI(imm.type, reg, index);
+ ctx.LOADI(imm.type, reg, immIndex);
return reg;
}
else
- return regTranslator.getScalar(value, index);
+ return regTranslator.getScalar(value, elemID);
}
void GenWriter::newLabelIndex(const BasicBlock *bb) {
BasicBlock *target = I.getSuccessor(0);
if (llvm::next(Function::iterator(bb)) != Function::iterator(target)) {
GBE_ASSERT(labelMap.find(target) != labelMap.end());
- const ir::LabelIndex labelIndex = labelMap[bb];
+ const ir::LabelIndex labelIndex = labelMap[target];
ctx.BRA(labelIndex);
}
}
* \author Benjamin Segovia <benjamin.segovia@intel.com>
*/
+/* THIS CODE IS DERIVED FROM LLVM PTX BACKEND. CODE IS HERE:
+ * http://sourceforge.net/scm/?type=git&group_id=319085
+ * Note that the LICENSE is GPL
+ */
+
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
case Type::VectorTyID:
{
const VectorType* VecTy = cast<VectorType>(Ty);
- return VecTy->getNumElements() * getTypeByteSize(unit, VecTy->getElementType());
+ uint32_t elemNum = VecTy->getNumElements();
+ if (elemNum == 3) elemNum = 4; // OCL spec
+ return elemNum * getTypeByteSize(unit, VecTy->getElementType());
}
case Type::PointerTyID:
case Type::IntegerTyID:
ir::Liveness liveness(fn);
ir::FunctionDAG dag(liveness);
// std::cout << liveness << std::endl;
- // std::cout << dag << std::endl;
+ std::cout << dag << std::endl;
});
}
} /* namespace gbe */
GBE_ASSERT(dummyKernel != NULL);
fclose(dummyKernel);
- UTEST_EXPECT_SUCCESS(utestLLVM2Gen("loop.ll"));
+ UTEST_EXPECT_SUCCESS(utestLLVM2Gen("complex_struct.ll"));
+ //UTEST_EXPECT_SUCCESS(utestLLVM2Gen("vector_constant.ll"));
+ //UTEST_EXPECT_SUCCESS(utestLLVM2Gen("loop5.ll"));
+ //UTEST_EXPECT_SUCCESS(utestLLVM2Gen("loop4.ll"));
+ //UTEST_EXPECT_SUCCESS(utestLLVM2Gen("loop3.ll"));
+ //UTEST_EXPECT_SUCCESS(utestLLVM2Gen("loop.ll"));
//UTEST_EXPECT_SUCCESS(utestLLVM2Gen("function_param.ll"));
//UTEST_EXPECT_SUCCESS(utestLLVM2Gen("function.ll"));
//UTEST_EXPECT_SUCCESS(utestLLVM2Gen("mad.ll"));
-#if 0
- UTEST_EXPECT_SUCCESS(utestLLVM2Gen("select.ll"));
- UTEST_EXPECT_SUCCESS(utestLLVM2Gen("shuffle.ll"));
- UTEST_EXPECT_SUCCESS(utestLLVM2Gen("extract.ll"));
- UTEST_EXPECT_SUCCESS(utestLLVM2Gen("insert.ll"));
- UTEST_EXPECT_SUCCESS(utestLLVM2Gen("add.ll"));
- UTEST_EXPECT_SUCCESS(utestLLVM2Gen("load_store.ll"));
- UTEST_EXPECT_SUCCESS(utestLLVM2Gen("add2.ll"));
- UTEST_EXPECT_SUCCESS(utestLLVM2Gen("get_global_id.ll"));
- UTEST_EXPECT_SUCCESS(utestLLVM2Gen("simple_float4.ll"));
- UTEST_EXPECT_SUCCESS(utestLLVM2Gen("simple_float4_2.ll"));
- UTEST_EXPECT_SUCCESS(utestLLVM2Gen("void.ll"));
- UTEST_EXPECT_SUCCESS(utestLLVM2Gen("cmp_cvt.ll"));
+#if 1
+ UTEST_EXPECT_SUCCESS(utestLLVM2Gen("select.cl.ll"));
+ UTEST_EXPECT_SUCCESS(utestLLVM2Gen("shuffle.cl.ll"));
+ UTEST_EXPECT_SUCCESS(utestLLVM2Gen("extract.cl.ll"));
+ UTEST_EXPECT_SUCCESS(utestLLVM2Gen("insert.cl.ll"));
+ UTEST_EXPECT_SUCCESS(utestLLVM2Gen("add.cl.ll"));
+ UTEST_EXPECT_SUCCESS(utestLLVM2Gen("load_store.cl.ll"));
+ UTEST_EXPECT_SUCCESS(utestLLVM2Gen("add2.cl.ll"));
+ UTEST_EXPECT_SUCCESS(utestLLVM2Gen("get_global_id.cl.ll"));
+ UTEST_EXPECT_SUCCESS(utestLLVM2Gen("simple_float4.cl.ll"));
+ UTEST_EXPECT_SUCCESS(utestLLVM2Gen("simple_float4_2.cl.ll"));
+ UTEST_EXPECT_SUCCESS(utestLLVM2Gen("void.cl.ll"));
+ UTEST_EXPECT_SUCCESS(utestLLVM2Gen("cmp_cvt.cl.ll"));
#endif
}