target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, i32 %y) nounwind noinline {
entry:
%add = add i32 %y, %x
%struct.big = type { i32, i32 }
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @add(%struct.big addrspace(1)* nocapture %b, i32 %x, i32 %y) nounwind noinline {
entry:
%add = add i32 %y, %x
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @test_cmp(i8 addrspace(1)* nocapture %dst, i32 %x, i32 %y, float %z, float %w) nounwind noinline {
entry:
%cmp = icmp slt i32 %x, %y
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @cmp_cvt(i32 addrspace(1)* nocapture %dst, i32 %x, i32 %y) nounwind noinline {
get_local_id.exit:
%add = add nsw i32 %y, %x
%struct.my_struct = type { i32, [5 x %struct.hop] }
%struct.hop = type { float, float }
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @struct_cl(%struct.my_struct addrspace(1)* nocapture %dst, %struct.my_struct addrspace(1)* nocapture %src) nounwind noinline {
entry:
%x = getelementptr inbounds %struct.my_struct addrspace(1)* %src, i32 1, i32 1, i32 3, i32 0
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @cycle(i32 addrspace(1)* nocapture %dst) noreturn nounwind readnone noinline {
entry:
br label %hop0
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @extract(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src, i32 %c) nounwind noinline {
entry:
%0 = load <4 x i32> addrspace(1)* %src, align 16, !tbaa !1
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_device void @write(i32 addrspace(1)* nocapture %dst) nounwind {
entry:
store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1
%struct.struct0 = type { [5 x i32], i32, i32, i32 }
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @param(%struct.struct0 addrspace(1)* nocapture %dst, %struct.struct0* nocapture byval %s, i32 addrspace(4)* nocapture %h, i32 %x, i32 %y) nounwind noinline {
entry:
%arrayidx = getelementptr inbounds i32 addrspace(4)* %h, i32 4
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @test_global_id(i32 addrspace(1)* nocapture %dst, i32 addrspace(1)* nocapture %p) nounwind noinline {
-get_global_id.exit13:
+get_global_id.exit17:
%call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
%sext = shl i32 %call.i, 16
%conv1 = ashr exact i32 %sext, 16
- %call.i6 = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %call.i6
+ %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+ %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+ %mul.i = mul i32 %call.i10.i, %call.i3.i
+ %add.i = add i32 %mul.i, %call.i
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add.i
store i32 %conv1, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
- %arrayidx5 = getelementptr inbounds i32 addrspace(1)* %p, i32 %call.i6
+ %arrayidx5 = getelementptr inbounds i32 addrspace(1)* %p, i32 %add.i
store i32 %call.i, i32 addrspace(1)* %arrayidx5, align 4, !tbaa !1
ret void
}
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @insert(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src, i32 %c) nounwind noinline {
entry:
%0 = load <4 x i32> addrspace(1)* %src, align 16
%struct.big = type { [10 x i32] }
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
entry:
%cmp2 = icmp eq i32 %x, 0
%struct.big = type { [10 x i32] }
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
entry:
%cmp6 = icmp eq i32 %x, 0
%struct.big = type { [10 x i32] }
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
entry:
%cmp2 = icmp eq i32 %x, 0
%struct.big = type { [10 x i32] }
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
get_local_id.exit:
%call3.i = tail call ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone
%struct.big = type { [10 x i32] }
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst0, i32 addrspace(1)* nocapture %dst1, i32 %x, i32 %y, %struct.big* nocapture byval %b) nounwind noinline {
get_local_id.exit:
%cmp = icmp sgt i32 %y, 0
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @test_select(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src0, <4 x i32> addrspace(1)* nocapture %src1) nounwind noinline {
entry:
%0 = load <4 x i32> addrspace(1)* %src0, align 16, !tbaa !1
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @short_write(i16 addrspace(1)* nocapture %dst, i16 %x, i16 %y) nounwind noinline {
entry:
%add = add i16 %y, %x
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @shuffle(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src, i32 %c) nounwind noinline {
entry:
%0 = load <4 x i32> addrspace(1)* %src, align 16, !tbaa !1
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src) nounwind noinline {
-get_global_id.exit5:
- %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i
+get_global_id.exit11:
+ %call.i.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+ %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+ %mul.i = mul i32 %call.i10.i, %call.i3.i
+ %add.i = add i32 %mul.i, %call.i.i
+ %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %add.i
%0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1
- %arrayidx2 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i
+ %arrayidx2 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %add.i
store <4 x float> %0, <4 x float> addrspace(1)* %arrayidx2, align 16, !tbaa !1
ret void
}
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
!opencl.kernels = !{!0}
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src) nounwind noinline {
-get_global_id.exit10:
- %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i
+get_global_id.exit22:
+ %call.i.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+ %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+ %mul.i = mul i32 %call.i10.i, %call.i3.i
+ %add.i = add i32 %mul.i, %call.i.i
+ %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %add.i
%0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1
%mul = fmul <4 x float> %0, %0
- %arrayidx4 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i
+ %arrayidx4 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %add.i
store <4 x float> %mul, <4 x float> addrspace(1)* %arrayidx4, align 16, !tbaa !1
ret void
}
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
!opencl.kernels = !{!0}
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src, i1 %b) nounwind noinline {
-get_global_id.exit16:
- %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i
+get_global_id.exit35:
+ %call.i.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+ %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+ %mul.i = mul i32 %call.i10.i, %call.i3.i
+ %add.i = add i32 %mul.i, %call.i.i
+ %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %add.i
%0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1
- %arrayidx5 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i
+ %arrayidx5 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %add.i
store <4 x float> %0, <4 x float> addrspace(1)* %arrayidx5, align 16, !tbaa !1
%arrayidx6 = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 2
%1 = load <4 x float> addrspace(1)* %arrayidx6, align 16
ret void
}
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
!opencl.kernels = !{!0}
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @store(i32 addrspace(1)* nocapture %dst, i32 addrspace(4)* nocapture %dst0, i32 %x) nounwind noinline {
entry:
store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1
@struct_cl.hop = internal addrspace(4) unnamed_addr global %struct.my_struct zeroinitializer, align 4
@struct_cl.array = internal addrspace(4) global [256 x %struct.my_struct] zeroinitializer, align 4
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @struct_cl(%struct.my_struct* nocapture byval %s, i32 %x, i32 addrspace(1)* nocapture %mem, i32 %y) nounwind noinline {
entry:
br label %for.body
@g = addrspace(2) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 4
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @struct_cl(%struct.my_struct* nocapture byval %s, i32 %x, %struct.my_struct addrspace(1)* nocapture %mem, i32 %y) nounwind noinline {
entry:
%cmp = icmp eq i32 %y, 0
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @test_select(i32 addrspace(1)* nocapture %dst, i32 addrspace(1)* nocapture %src) nounwind noinline {
-get_global_id.exit7:
- %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds i32 addrspace(1)* %src, i32 %call.i
+get_global_id.exit13:
+ %call.i.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+ %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+ %mul.i = mul i32 %call.i10.i, %call.i3.i
+ %add.i = add i32 %mul.i, %call.i.i
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %src, i32 %add.i
%0 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
%cmp = icmp sgt i32 %0, 1
- %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %call.i
+ %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add.i
%. = select i1 %cmp, i32 1, i32 2
store i32 %., i32 addrspace(1)* %arrayidx2, align 4
ret void
}
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
!opencl.kernels = !{!0}
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @undefined(i32 addrspace(1)* nocapture %dst) nounwind noinline {
entry:
store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src) nounwind noinline {
-get_global_id.exit5:
- %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i
+get_global_id.exit11:
+ %call.i.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+ %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+ %mul.i = mul i32 %call.i10.i, %call.i3.i
+ %add.i = add i32 %mul.i, %call.i.i
+ %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %add.i
%0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1
%add = fadd <4 x float> %0, <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
- %arrayidx2 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i
+ %arrayidx2 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %add.i
store <4 x float> %add, <4 x float> addrspace(1)* %arrayidx2, align 16, !tbaa !1
ret void
}
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
!opencl.kernels = !{!0}
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @hop() nounwind readnone noinline {
entry:
ret void
#include "ir/profile.hpp"
#include <algorithm>
-
namespace gbe
{
Context::Context(const ir::Unit &unit, const std::string &name) :
unit(unit), fn(*unit.getFunction(name)), name(name), liveness(NULL), dag(NULL)
- { GBE_ASSERT(unit.getPointerSize() == ir::POINTER_32_BITS); }
+ { GBE_ASSERT(unit.getPointerSize() == ir::POINTER_32_BITS);
+ this->simdWidth = 16; /* XXX fixed to 16 for now; an environment variable should select it to start with */
+ }
Context::~Context(void) {}
Kernel *Context::compileKernel(void) {
// Go over all the instructions and find the special register value we need
// to push
#define INSERT_REG(SPECIAL_REG, PATCH) \
- else if (reg == ir::ocl::SPECIAL_REG) { \
+ if (reg == ir::ocl::SPECIAL_REG) { \
if (specialRegs.find(reg) != specialRegs.end()) continue; \
const PatchInfo patch(GBE_CURBE_##PATCH, 0, kernel->curbeSize); \
kernel->patches.push_back(patch); \
kernel->curbeSize += ptrSize; \
- }
+ } else
set<ir::Register> specialRegs; // already inserted registers
fn.foreachInstruction([&](const ir::Instruction &insn) {
const uint32_t srcNum = insn.getSrcNum();
const ir::Register reg = insn.getSrcIndex(fn, srcID);
if (fn.isSpecialReg(reg) == false) continue;
- if (0);
INSERT_REG(lsize0, LOCAL_SIZE_X)
INSERT_REG(lsize1, LOCAL_SIZE_Y)
INSERT_REG(lsize2, LOCAL_SIZE_Z)
INSERT_REG(goffset2, GLOBAL_OFFSET_Z)
INSERT_REG(numgroup0, GROUP_NUM_X)
INSERT_REG(numgroup1, GROUP_NUM_Y)
- INSERT_REG(numgroup2, GROUP_NUM_Z)
+ INSERT_REG(numgroup2, GROUP_NUM_Z);
}
});
}
}
+  /*! Indicate if a register is scalar or not. Only valid for the OCL profile.
+   *  A register is reported as scalar when it is a function input or one of
+   *  the group id, group number, local size, global size or global offset
+   *  special registers. Every other register is reported as non-scalar
+   */
+  bool Context::isScalarReg(const ir::Register &reg) const {
+    GBE_ASSERT(fn.getProfile() == ir::Profile::PROFILE_OCL);
+    // Function inputs are always scalar
+    if (fn.getInput(reg) != NULL)
+      return true;
+    // Special registers listed here are scalar as well
+    if (reg == ir::ocl::groupid0  ||
+        reg == ir::ocl::groupid1  ||
+        reg == ir::ocl::groupid2  ||
+        reg == ir::ocl::numgroup0 ||
+        reg == ir::ocl::numgroup1 ||
+        reg == ir::ocl::numgroup2 ||
+        reg == ir::ocl::lsize0    ||
+        reg == ir::ocl::lsize1    ||
+        reg == ir::ocl::lsize2    ||
+        reg == ir::ocl::gsize0    ||
+        reg == ir::ocl::gsize1    ||
+        reg == ir::ocl::gsize2    ||
+        reg == ir::ocl::goffset0  ||
+        reg == ir::ocl::goffset1  ||
+        reg == ir::ocl::goffset2)
+      return true;
+    return false;
+  }
} /* namespace gbe */
class Unit; // Contains the complete program
class Function; // We compile a function into a kernel
+ class Register; // Type-safe index of an ir register
class Liveness; // Describes liveness of each ir function register
class FunctionDAG; // Describes the instruction dependencies
void buildPatchList(void);
/*! Build the list of arguments to set to launch the kernel */
void buildArgList(void);
+ /*! Indicate if a register is scalar (true) or not (false) */
+ bool isScalarReg(const ir::Register &reg) const;
/*! Build the instruction stream */
virtual void emitCode(void) = 0;
/*! Allocate a new empty kernel */
Kernel *kernel; //!< Kernel we are building
ir::Liveness *liveness; //!< Liveness info for the variables
ir::FunctionDAG *dag; //!< Complete DAG of values on the function
+ uint32_t simdWidth; //!< Number of lanes per HW threads
};
} /* namespace gbe */
#ifndef __GBE_SIMULATOR_H__
#define __GBE_SIMULATOR_H__
+#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* Gen simulator that runs the c++ produced by the back-end */
typedef struct _gbe_simulator *gbe_simulator;
-/* Return the base address of the global / constant memory space */
+/* Get / set the base address of the global / constant memory space */
typedef void *(sim_get_base_address_cb)(gbe_simulator);
-/* Set the base address of the global / constant memory space */
typedef void (sim_set_base_address_cb)(gbe_simulator, void*);
-/* Set the base address of the constant buffer */
+/* Get / set the base address of the constant buffer */
typedef void *(sim_get_curbe_address_cb)(gbe_simulator);
-/* Set the base address of the global / constant memory space */
typedef void (sim_set_curbe_address_cb)(gbe_simulator, void*);
+/* Get / set per-thread curbe size */
+typedef void (sim_set_curbe_size_cb)(gbe_simulator, size_t);
+typedef size_t (sim_get_curbe_size_cb)(gbe_simulator);
struct _gbe_simulator {
sim_set_base_address_cb *set_base_address;
sim_get_base_address_cb *get_base_address;
sim_set_curbe_address_cb *set_curbe_address;
sim_get_curbe_address_cb *get_curbe_address;
+ sim_set_curbe_size_cb *set_curbe_size;
+ sim_get_curbe_size_cb *get_curbe_size;
};
#ifdef __cplusplus
"#ifndef __GBE_SIMULATOR_H__\n"
"#define __GBE_SIMULATOR_H__\n"
"\n"
+"#include <stdlib.h>\n"
"#ifdef __cplusplus\n"
"extern \"C\" {\n"
"#endif /* __cplusplus */\n"
"\n"
"/* Gen simulator that runs the c++ produced by the back-end */\n"
"typedef struct _gbe_simulator *gbe_simulator;\n"
-"/* Return the base address of the global / constant memory space */\n"
+"/* Get / set the base address of the global / constant memory space */\n"
"typedef void *(sim_get_base_address_cb)(gbe_simulator);\n"
-"/* Set the base address of the global / constant memory space */\n"
"typedef void (sim_set_base_address_cb)(gbe_simulator, void*);\n"
-"/* Set the base address of the constant buffer */\n"
+"/* Get / set the base address of the constant buffer */\n"
"typedef void *(sim_get_curbe_address_cb)(gbe_simulator);\n"
-"/* Set the base address of the global / constant memory space */\n"
"typedef void (sim_set_curbe_address_cb)(gbe_simulator, void*);\n"
+"/* Get / set per-thread curbe size */\n"
+"typedef void (sim_set_curbe_size_cb)(gbe_simulator, size_t);\n"
+"typedef size_t (sim_get_curbe_size_cb)(gbe_simulator);\n"
"struct _gbe_simulator {\n"
" sim_set_base_address_cb *set_base_address;\n"
" sim_get_base_address_cb *get_base_address;\n"
" sim_set_curbe_address_cb *set_curbe_address;\n"
" sim_get_curbe_address_cb *get_curbe_address;\n"
+" sim_set_curbe_size_cb *set_curbe_size;\n"
+" sim_get_curbe_size_cb *get_curbe_size;\n"
"};\n"
"\n"
"#ifdef __cplusplus\n"
*/
#include "backend/sim_context.hpp"
#include "backend/sim_program.hpp"
+#include "ir/function.hpp"
#include <cstring>
#include <cstdio>
-#include <fstream>
#include <dlfcn.h>
namespace gbe
extern std::string simulator_str;
extern std::string sim_vector_str;
+  /*! Write one c++ variable declaration "_<regID>" per ir register into the
+   *  output stream. The three group id registers get no declaration here
+   *  (presumably because the emitted entry point receives them as arguments;
+   *  confirm against emitCode). Only the DWORD family is supported
+   */
+  void SimContext::emitRegisters(void) {
+    GBE_ASSERT(fn.getProfile() == ir::PROFILE_OCL);
+    const uint32_t total = fn.regNum();
+    for (uint32_t regID = 0; regID < total; ++regID) {
+      const ir::Register reg(regID);
+      const bool isGroupID = reg == ir::ocl::groupid0 ||
+                             reg == ir::ocl::groupid1 ||
+                             reg == ir::ocl::groupid2;
+      if (isGroupID)
+        continue;
+      const ir::RegisterData regData = fn.getRegisterData(reg);
+      switch (regData.family) {
+        case ir::FAMILY_DWORD:
+          // Type first (scalar or one value per lane), then the common suffix
+          if (isScalarReg(reg))
+            o << "scalar_dw";
+          else
+            o << "simd" << simdWidth << "dw";
+          o << " _" << regID << ";\n";
+          break;
+        case ir::FAMILY_BOOL:
+        case ir::FAMILY_BYTE:
+        case ir::FAMILY_WORD:
+        case ir::FAMILY_QWORD:
+          NOT_IMPLEMENTED;
+          break;
+      }
+    }
+  }
+
+ void SimContext::loadCurbe(void) { // stub: walks the inputs but emits nothing yet
+ // Right now the curbe is only made of the function input arguments
+ const uint32_t inputNum = fn.inputNum();
+ for (uint32_t inputID = 0; inputID < inputNum; ++inputID) {
+ // TODO the per-input load is not written yet: the loop body is empty
+ }
+ }
+
void SimContext::emitCode(void) {
SimKernel *simKernel = static_cast<SimKernel*>(this->kernel);
char srcStr[L_tmpnam+1], libStr[L_tmpnam+1];
const std::string srcName = std::string(tmpnam_r(srcStr)) + ".cpp"; /* unsafe! */
const std::string libName = std::string(tmpnam_r(libStr)) + ".so"; /* unsafe! */
-
+ std::cout << fn;
/* Output the code first */
- std::ofstream ostream;
- ostream.open(srcName);
- ostream << simulator_str << std::endl;
- ostream << sim_vector_str << std::endl;
- ostream << "#include <stdint.h>\n";
- ostream << "extern \"C\" void " << name
- << "(gbe_simulator sim, uint32_t thread, uint32_t group_x, uint32_t group_y, uint32_t group_z)" << std::endl
- << "{}"
- << std::endl << std::endl;
- ostream.close();
+ o.open(srcName);
+ o << simulator_str << std::endl;
+ o << sim_vector_str << std::endl;
+ o << "#include <stdint.h>\n";
+ o << "extern \"C\" void " << name
+ << "(gbe_simulator sim, uint32_t tid, scalar_dw _3, scalar_dw _4, scalar_dw _5)\n"
+ << "{\n"
+ << "const size_t curbe_sz = sim->get_curbe_size(sim);\n"
+ << "const char *curbe = (const char*) sim->get_curbe_address(sim) + curbe_sz * tid;\n";
+ this->emitRegisters();
+ o << "}\n";
+ o << std::endl;
+ o.close();
/* Compile the function */
std::cout << "# source: " << srcName << " library: " << libName << std::endl;
#define __GBE_SIM_CONTEXT_HPP__
#include <string>
+#include <fstream>
#include "backend/context.hpp"
namespace gbe
~SimContext(void);
/*! Implements base class */
virtual void emitCode(void);
+ /*! Emit all the register declarations */
+ void emitRegisters(void);
+ /*! Load the curbe data into the registers */
+ void loadCurbe(void);
/*! Implements base class */
virtual Kernel *allocateKernel(void);
+ std::ofstream o; //!< Where to output the c++ string
};
} /* namespace gbe */
PROFILE_OCL = 1
};
- // Will be pre-initialized
+ // Will be pre-initialized based on its profile
class Function;
/*! Registers used for ocl */
* file. We enforce type safety with this class
*/
TYPE_SAFE(Register, uint16_t)
+  /*! Order two registers by comparing their underlying values */
+  INLINE bool operator< (const Register &r0, const Register &r1) {
+    const auto lhs = r0.value();
+    const auto rhs = r1.value();
+    return lhs < rhs;
+  }
/*! Tuple is the position of the first register in the tuple vector. We
* enforce type safety with this class
/*! Performs the double look-up to get the set of defs per register */
RegDefSet &getDefSet(const BasicBlock *bb, const Register &reg);
/*! Build a UD-chain as the union of the predecessor chains */
- void makeUDChain(UDChain &udChain, const BasicBlock &bb, const Register &reg);
+ void makeDefSet(DefSet &udChain, const BasicBlock &bb, const Register &reg);
/*! Fast per register definition set allocation */
DECL_POOL(RegDefSet, regDefSetPool);
/*! Fast register sets allocation */
return *defIt->second;
}
- void LiveOutSet::makeUDChain(UDChain &udChain,
+ void LiveOutSet::makeDefSet(DefSet &udChain,
const BasicBlock &bb,
const Register &reg)
{
fn(liveness.getFunction())
{
// We first start with empty chains
- udEmpty = this->newUDChain();
- duEmpty = this->newDUChain();
+ udEmpty = this->newDefSet();
+ duEmpty = this->newUseSet();
// First create the chains and insert them in their respective maps
fn.foreachInstruction([this, udEmpty, duEmpty](const Instruction &insn) {
// Build UD chains traversing the blocks top to bottom
fn.foreachBlock([&](const BasicBlock &bb) {
// Track the allocated chains to be able to reuse them
- map<Register, UDChain*> allocated;
+ map<Register, DefSet*> allocated;
// Some chains may be not used (ie they are dead). We track them to be
// able to deallocate them later
- set<UDChain*> unused;
+ set<DefSet*> unused;
// For each instruction build the UD chains
bb.foreach([&](const Instruction &insn) {
}
// Create a new one from the predecessor chains (upward used value)
else {
- UDChain *udChain = this->newUDChain();
- liveOutSet.makeUDChain(*udChain, bb, src);
+ DefSet *udChain = this->newDefSet();
+ liveOutSet.makeDefSet(*udChain, bb, src);
allocated.insert(std::make_pair(src, udChain));
ud->second = udChain;
}
for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
const Register dst = insn.getDstIndex(fn, dstID);
ValueDef *def = (ValueDef *) this->getDefAddress(&insn, dstID);
- UDChain *udChain = this->newUDChain();
+ DefSet *udChain = this->newDefSet();
udChain->insert(def);
unused.insert(udChain);
// Remove the previous definition if any
// Deallocate unused chains
for (auto it = unused.begin(); it != unused.end(); ++it)
- this->deleteUDChain(*it);
+ this->deleteDefSet(*it);
});
// Build the DU chains from the UD ones
const auto &defs = this->getDef(&insn, srcID);
for (auto def = defs.begin(); def != defs.end(); ++def) {
auto uses = duGraph.find(**def);
- DUChain *du = uses->second;
GBE_ASSERT(uses != duGraph.end());
+ UseSet *du = uses->second; // read only once the assertion has validated the iterator
if (du == duEmpty) {
duGraph.erase(**def);
- du = this->newDUChain();
+ du = this->newUseSet();
duGraph.insert(std::make_pair(**def, du));
}
du->insert(use);
}
}
});
- std::cout << liveOutSet;
+
+ // Allocate the set of uses and defs per register
+ const uint32_t regNum = fn.regNum();
+ for (uint32_t regID = 0; regID < regNum; ++regID) {
+ const Register reg(regID);
+ UseSet *useSet = GBE_NEW(UseSet);
+ DefSet *defSet = GBE_NEW(DefSet);
+ regUse.insert(std::make_pair(reg, useSet));
+ regDef.insert(std::make_pair(reg, defSet));
+ }
+
+ // Fill use sets (one per register)
+ for (auto &useSet : duGraph) {
+ for (auto use : *useSet.second) {
+ const Register reg = use->getRegister();
+ auto it = regUse.find(reg);
+ GBE_ASSERT(it != regUse.end() && it->second != NULL);
+ it->second->insert(use);
+ }
+ }
+
+ // Fill def sets (one per register)
+ for (auto &defSet : udGraph) {
+ for (auto def : *defSet.second) {
+ const Register reg = def->getRegister();
+ auto it = regDef.find(reg);
+ GBE_ASSERT(it != regDef.end() && it->second != NULL);
+ it->second->insert(def);
+ }
+ }
}
/*! Helper to deallocate objects */
set<void*> destroyed;
// Release the empty ud-chains and du-chains
- PTR_RELEASE(UDChain, udEmpty);
- PTR_RELEASE(DUChain, duEmpty);
+ PTR_RELEASE(DefSet, udEmpty);
+ PTR_RELEASE(UseSet, duEmpty);
// We free all the ud-chains
for (auto it = udGraph.begin(); it != udGraph.end(); ++it) {
if (destroyed.contains(defs)) continue;
for (auto def = defs->begin(); def != defs->end(); ++def)
PTR_RELEASE(ValueDef, *def);
- PTR_RELEASE(UDChain, defs);
+ PTR_RELEASE(DefSet, defs);
}
// We free all the du-chains
if (destroyed.contains(uses)) continue;
for (auto use = uses->begin(); use != uses->end(); ++use)
PTR_RELEASE(ValueUse, *use);
- PTR_RELEASE(DUChain, uses);
+ PTR_RELEASE(UseSet, uses);
}
+
+ // Release all the use and definition sets per register
+ for (auto it = regUse.begin(); it != regUse.end(); ++it)
+ GBE_SAFE_DELETE(it->second);
+ for (auto it = regDef.begin(); it != regDef.end(); ++it)
+ GBE_SAFE_DELETE(it->second);
}
#undef PTR_RELEASE
- const DUChain &FunctionDAG::getUse(const Instruction *insn, uint32_t dstID) const {
+ const UseSet &FunctionDAG::getUse(const Instruction *insn, uint32_t dstID) const {
const ValueDef def(insn, dstID);
auto it = duGraph.find(def);
GBE_ASSERT(it != duGraph.end());
return *it->second;
}
- const DUChain &FunctionDAG::getUse(const FunctionInput *input) const {
+ const UseSet &FunctionDAG::getUse(const FunctionInput *input) const {
const ValueDef def(input);
auto it = duGraph.find(def);
GBE_ASSERT(it != duGraph.end());
return *it->second;
}
- const UDChain &FunctionDAG::getDef(const Instruction *insn, uint32_t srcID) const {
+ const DefSet &FunctionDAG::getDef(const Instruction *insn, uint32_t srcID) const {
const ValueUse use(insn, srcID);
auto it = udGraph.find(use);
GBE_ASSERT(it != udGraph.end());
GBE_ASSERT(type == DEF_FN_INPUT);
return data.input;
}
- /*! Get the register */
- INLINE Register getRegister(void) const {
+ /*! Get the special register */
+ INLINE Register getSpecialReg(void) const {
GBE_ASSERT(type == DEF_SPECIAL_REG);
return Register(data.regID);
}
+    /*! Register written by this definition, whatever the definition kind:
+     *  the input register, the special register itself, or the destination
+     *  dstID of the defining instruction
+     */
+    INLINE Register getRegister(void) const {
+      if (type == DEF_FN_INPUT)
+        return data.input->reg;
+      if (type == DEF_SPECIAL_REG)
+        return Register(data.regID);
+      // Any other kind is handled as an instruction destination
+      const Function &fn = data.insn->getParent()->getParent();
+      return data.insn->getDstIndex(fn, data.dstID);
+    }
private:
/*! Instruction or function argument */
const FunctionInput *in1 = def1.getFunctionInput();
return uintptr_t(in0) < uintptr_t(in1);
} else if (type0 == ValueDef::DEF_SPECIAL_REG) {
- const Register reg0 = def0.getRegister();
- const Register reg1 = def1.getRegister();
+ const Register reg0 = def0.getSpecialReg();
+ const Register reg1 = def1.getSpecialReg();
return uint32_t(reg0) < uint32_t(reg1);
} else {
const Instruction *insn0 = def0.getInstruction();
const Instruction *getInstruction(void) const { return insn; }
/*! Get the source index for this use */
uint32_t getSrcID(void) const { return srcID; }
+    /*! Register read by this use, i.e. source srcID of the instruction */
+    Register getRegister(void) const {
+      const Function &fn = insn->getParent()->getParent();
+      return insn->getSrcIndex(fn, srcID);
+    }
private:
const Instruction *insn; //!< Instruction where the value is used
uint32_t srcID; //!< Index of the source in the instruction
}
/*! All uses of a definition */
- typedef set<ValueUse*> DUChain;
+ typedef set<ValueUse*> UseSet;
/*! All possible definitions for a use */
- typedef set<ValueDef*> UDChain;
+ typedef set<ValueDef*> DefSet;
/*! Get the chains (in both directions) for the complete program */
class FunctionDAG : public NonCopyable
/*! Free all the resources */
~FunctionDAG(void);
/*! Get the du-chain for the given instruction and destination */
- const DUChain &getUse(const Instruction *insn, uint32_t dstID) const;
+ const UseSet &getUse(const Instruction *insn, uint32_t dstID) const;
/*! Get the du-chain for the given function input */
- const DUChain &getUse(const FunctionInput *input) const;
+ const UseSet &getUse(const FunctionInput *input) const;
/*! Get the du-chain for the given special register */
- const DUChain &getUse(const Register &reg) const;
+ const UseSet &getUse(const Register &reg) const;
/*! Get the ud-chain for the instruction and source */
- const UDChain &getDef(const Instruction *insn, uint32_t srcID) const;
+ const DefSet &getDef(const Instruction *insn, uint32_t srcID) const;
/*! Get the pointer to the definition *as stored in the DAG* */
const ValueDef *getDefAddress(const Instruction *insn, uint32_t dstID) const;
/*! Get the pointer to the definition *as stored in the DAG* */
const ValueUse *getUseAddress(const Instruction *insn, uint32_t srcID) const;
/*! Get the function we have the graph for */
const Function &getFunction(void) const { return fn; }
- /*! The UDChain for each definition use */
- typedef map<ValueUse, UDChain*> UDGraph;
- /*! The DUChain for each definition */
- typedef map<ValueDef, DUChain*> DUGraph;
+ /*! The DefSet for each definition use */
+ typedef map<ValueUse, DefSet*> UDGraph;
+ /*! The UseSet for each definition */
+ typedef map<ValueDef, UseSet*> DUGraph;
private:
UDGraph udGraph; //!< All the UD chains
DUGraph duGraph; //!< All the DU chains
- UDChain *udEmpty; //!< Void use set
- DUChain *duEmpty; //!< Void def set
+ DefSet *udEmpty; //!< Empty def set the ud-chains start with
+ UseSet *duEmpty; //!< Empty use set the du-chains start with
ValueDef *undefined; //!< Undefined value
map<ValueUse, ValueUse*> useName; //!< Get the ValueUse pointer from the value
map<ValueDef, ValueDef*> defName; //!< Get the ValueDef pointer from the value
+ map<Register, UseSet*> regUse; //!< All uses of registers
+ map<Register, DefSet*> regDef; //!< All defs of registers
DECL_POOL(ValueDef, valueDefPool); //!< Fast ValueDef allocation
DECL_POOL(ValueUse, valueUsePool); //!< Fast ValueUse allocation
- DECL_POOL(UDChain, udChainPool); //!< Fast UDChain allocation
- DECL_POOL(DUChain, duChainPool); //!< Fast DUChain allocation
+ DECL_POOL(DefSet, udChainPool); //!< Fast DefSet allocation
+ DECL_POOL(UseSet, duChainPool); //!< Fast UseSet allocation
const Function &fn; //!< Function we are referring to
GBE_CLASS(FunctionDAG); //!< Use internal allocators
};
+ std::string(file)
+ ", function " + std::string(fn)
+ ", line " + std::string(lineString);
- assert(0);
+ // assert(0);
throw Exception(str);
}
} /* namespace gbe */
UTEST_EXPECT_SUCCESS(utestScatterGather());
}
-
UTEST_REGISTER(utestVector)