From 2774bad1124215571ab154afcb5478c78cf46344 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Thu, 2 Dec 2021 12:26:59 +0000
Subject: [PATCH] [AMDGPU] Change llvm.amdgcn.image.bvh.intersect.ray to take vec3 args

The ray_origin, ray_dir and ray_inv_dir arguments should all be vec3 to
match how the hardware instruction works.

Don't change the API of the corresponding OpenCL builtins.

Differential Revision: https://reviews.llvm.org/D115032
---
 clang/lib/CodeGen/CGBuiltin.cpp                    |   9 +
 .../CodeGenOpenCL/builtins-amdgcn-raytracing.cl    |   8 +-
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td           |   4 +-
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp     |   8 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp          |   4 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll | 558 ++++++++++-----------
 .../CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll    | 184 +++----
 7 files changed, 377 insertions(+), 398 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 0a98b5b..44a24d2 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16592,6 +16592,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     llvm::Value *RayInverseDir = EmitScalarExpr(E->getArg(4));
     llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(5));
 
+    // The builtins take these arguments as vec4 where the last element is
+    // ignored. The intrinsic takes them as vec3.
+    RayOrigin = Builder.CreateShuffleVector(RayOrigin, RayOrigin,
+                                            ArrayRef<int>{0, 1, 2});
+    RayDir =
+        Builder.CreateShuffleVector(RayDir, RayDir, ArrayRef<int>{0, 1, 2});
+    RayInverseDir = Builder.CreateShuffleVector(RayInverseDir, RayInverseDir,
+                                                ArrayRef<int>{0, 1, 2});
+
     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_image_bvh_intersect_ray,
                                    {NodePtr->getType(), RayDir->getType()});
     return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl
index 805d17a..3c90c9a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl
@@ -19,7 +19,7 @@ typedef double double4 __attribute__((ext_vector_type(4)));
 typedef half half4 __attribute__((ext_vector_type(4)));
 typedef uint uint4 __attribute__((ext_vector_type(4)));
 
-// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32
+// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f32
 // ISA: image_bvh_intersect_ray
 void test_image_bvh_intersect_ray(global uint4* out, uint node_ptr,
   float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir,
@@ -29,7 +29,7 @@ void test_image_bvh_intersect_ray(global uint4* out, uint node_ptr,
            ray_origin, ray_dir, ray_inv_dir, texture_descr);
 }
 
-// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16
+// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f16
 // ISA: image_bvh_intersect_ray
 void test_image_bvh_intersect_ray_h(global uint4* out, uint node_ptr,
   float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir,
@@ -39,7 +39,7 @@ void test_image_bvh_intersect_ray_h(global uint4* out, uint node_ptr,
            ray_origin, ray_dir, ray_inv_dir, texture_descr);
 }
 
-// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32
+// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v3f32
 // ISA: image_bvh_intersect_ray
 void test_image_bvh_intersect_ray_l(global uint4* out, ulong node_ptr,
   float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir,
@@ -49,7 +49,7 @@ void test_image_bvh_intersect_ray_l(global uint4* out, ulong node_ptr,
            ray_origin, ray_dir, ray_inv_dir, texture_descr);
 }
 
-// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16
+// CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v3f16
 // ISA: image_bvh_intersect_ray
 void test_image_bvh_intersect_ray_lh(global uint4* out, ulong node_ptr,
   float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir,
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 0a44670..91c2dc0 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1789,9 +1789,11 @@ def int_amdgcn_global_atomic_csub : AMDGPUGlobalAtomicRtn<llvm_i32_ty>;
 
 // uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>,
 //                                           <ray_dir>, <ray_inv_dir>, <texture_descr>
+// <node_ptr> is i32 or i64.
+// <ray_dir> and <ray_inv_dir> are both v3f16 or both v3f32.
 def int_amdgcn_image_bvh_intersect_ray :
   Intrinsic<[llvm_v4i32_ty],
-            [llvm_anyint_ty, llvm_float_ty, llvm_v4f32_ty, llvm_anyvector_ty,
+            [llvm_anyint_ty, llvm_float_ty, llvm_v3f32_ty, llvm_anyvector_ty,
              LLVMMatchType<1>, llvm_v4i32_ty],
             [IntrReadMem, IntrWillReturn]>;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 1f898f2b..e4d70ad 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4869,8 +4869,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
   }
   Ops.push_back(RayExtent);
 
-  auto packLanes = [&Ops, &S32, &B] (Register Src) {
-    auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
+  auto packLanes = [&Ops, &S32, &B](Register Src) {
+    auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
     Ops.push_back(Unmerge.getReg(0));
     Ops.push_back(Unmerge.getReg(1));
     Ops.push_back(Unmerge.getReg(2));
@@ -4878,8 +4878,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
   packLanes(RayOrigin);
 
   if (IsA16) {
-    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
-    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
+    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
+    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
     Register R1 = MRI.createGenericVirtualRegister(S32);
     Register R2 = MRI.createGenericVirtualRegister(S32);
     Register R3 = MRI.createGenericVirtualRegister(S32);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 91b63c0..ae193fb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7503,8 +7503,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
 
     assert(NodePtr.getValueType() == MVT::i32 ||
            NodePtr.getValueType() == MVT::i64);
-    assert(RayDir.getValueType() == MVT::v4f16 ||
-           RayDir.getValueType() == MVT::v4f32);
+    assert(RayDir.getValueType() == MVT::v3f16 ||
+           RayDir.getValueType() == MVT::v3f32);
 
     if (!Subtarget->hasGFX10_AEncoding()) {
       emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 82a4ece..0236ebc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -3,37 +3,25 @@
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1013 %s
 ; RUN: not --crash llc -global-isel
-march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s -; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr) -; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr) -; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr) -; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr) +; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr) +; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr) +; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr) +; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr) -declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>) -declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>) -declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>) -declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>) +declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>) +declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>) +declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>) +declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>) declare i32 @llvm.amdgcn.workitem.id.x() -define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) { -; GFX1030-LABEL: image_bvh_intersect_ray: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[0:3] -; GFX1030-NEXT: s_waitcnt vmcnt(0) -; GFX1030-NEXT: ; return to shader part epilog -; -; GFX1013-LABEL: image_bvh_intersect_ray: -; GFX1013: ; %bb.0: -; GFX1013-NEXT: v_mov_b32_e32 v5, v6 -; GFX1013-NEXT: v_mov_b32_e32 v6, v7 -; GFX1013-NEXT: v_mov_b32_e32 v7, v8 -; GFX1013-NEXT: v_mov_b32_e32 v8, v10 -; GFX1013-NEXT: v_mov_b32_e32 v9, v11 -; GFX1013-NEXT: v_mov_b32_e32 v10, v12 -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3] -; GFX1013-NEXT: s_waitcnt vmcnt(0) -; GFX1013-NEXT: ; return to shader part epilog +define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) { +; 
GCN-LABEL: image_bvh_intersect_ray: +; GCN: ; %bb.0: +; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog ; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } @@ -44,60 +32,48 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_flat(i32 %node_ptr, float ; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog - %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0 - %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1 - %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2 - %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0 - %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1 - %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2 - %ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0 - %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1 - %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2 - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2 + %ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0 + %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1 + %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2 + %ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0 + %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1 + %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2 + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } -define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { +define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { ; GCN-LABEL: image_bvh_intersect_ray_a16: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s4, 0xffff -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GCN-NEXT: v_and_b32_e32 v10, s4, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_and_b32_e32 v9, s4, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 
v9, 16, v5 +; GCN-NEXT: v_and_b32_e32 v10, s4, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_and_b32_e32 v8, s4, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_and_or_b32 v5, v6, s4, v5 -; GCN-NEXT: v_and_or_b32 v6, v7, s4, v10 -; GCN-NEXT: v_lshl_or_b32 v7, v9, 16, v8 +; GCN-NEXT: v_lshl_or_b32 v7, v8, 16, v7 +; GCN-NEXT: v_and_or_b32 v5, v5, s4, v9 +; GCN-NEXT: v_and_or_b32 v6, v6, s4, v10 ; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } -define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) { -; GFX1030-LABEL: image_bvh64_intersect_ray: -; GFX1030: ; %bb.0: -; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[0:3] -; GFX1030-NEXT: s_waitcnt vmcnt(0) -; GFX1030-NEXT: ; return to shader part epilog -; -; GFX1013-LABEL: image_bvh64_intersect_ray: -; GFX1013: ; %bb.0: -; GFX1013-NEXT: v_mov_b32_e32 v6, v7 -; GFX1013-NEXT: v_mov_b32_e32 v7, v8 -; GFX1013-NEXT: v_mov_b32_e32 v8, v9 -; GFX1013-NEXT: v_mov_b32_e32 v9, v11 -; GFX1013-NEXT: v_mov_b32_e32 v10, v12 -; GFX1013-NEXT: v_mov_b32_e32 v11, v13 -; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] -; GFX1013-NEXT: s_waitcnt vmcnt(0) -; GFX1013-NEXT: ; return to shader part epilog - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) +define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) { +; GCN-LABEL: image_bvh64_intersect_ray: +; GCN: ; %bb.0: +; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } @@ -109,67 +85,70 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_flat(<2 x i32> %node_ptr ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %node_ptr = bitcast <2 x i32> %node_ptr_vec to i64 - %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0 - %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1 - %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2 - %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0 - %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1 - %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2 - %ray_inv_dir0 = insertelement <4 x float> undef, float 
%ray_inv_dir_x, i32 0 - %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1 - %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2 - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2 + %ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0 + %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1 + %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2 + %ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0 + %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1 + %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2 + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } -define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { +define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { ; GCN-LABEL: image_bvh64_intersect_ray_a16: ; GCN: ; %bb.0: ; GCN-NEXT: s_mov_b32 s4, 0xffff -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_and_b32_e32 v11, s4, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_and_b32_e32 v10, s4, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; GCN-NEXT: v_and_b32_e32 v11, s4, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_and_b32_e32 v9, s4, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_and_or_b32 v6, v7, s4, v6 -; GCN-NEXT: v_and_or_b32 v7, v8, s4, v11 -; GCN-NEXT: v_lshl_or_b32 v8, v10, 16, v9 +; GCN-NEXT: v_lshl_or_b32 v8, v9, 16, v8 +; GCN-NEXT: v_and_or_b32 v6, v6, s4, v10 +; GCN-NEXT: v_and_or_b32 v7, v7, s4, v11 ; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } -define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) { +define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: 
image_bvh_intersect_ray_vgpr_descr: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: v_mov_b32_e32 v18, v0 -; GFX1030-NEXT: v_mov_b32_e32 v19, v1 -; GFX1030-NEXT: v_mov_b32_e32 v20, v2 -; GFX1030-NEXT: v_mov_b32_e32 v21, v3 -; GFX1030-NEXT: v_mov_b32_e32 v22, v4 -; GFX1030-NEXT: v_mov_b32_e32 v23, v6 -; GFX1030-NEXT: v_mov_b32_e32 v24, v7 -; GFX1030-NEXT: v_mov_b32_e32 v25, v8 -; GFX1030-NEXT: v_mov_b32_e32 v26, v10 -; GFX1030-NEXT: v_mov_b32_e32 v27, v11 -; GFX1030-NEXT: v_mov_b32_e32 v28, v12 +; GFX1030-NEXT: v_mov_b32_e32 v15, v0 +; GFX1030-NEXT: v_mov_b32_e32 v16, v1 +; GFX1030-NEXT: v_mov_b32_e32 v17, v2 +; GFX1030-NEXT: v_mov_b32_e32 v18, v3 +; GFX1030-NEXT: v_mov_b32_e32 v19, v4 +; GFX1030-NEXT: v_mov_b32_e32 v20, v5 +; GFX1030-NEXT: v_mov_b32_e32 v21, v6 +; GFX1030-NEXT: v_mov_b32_e32 v22, v7 +; GFX1030-NEXT: v_mov_b32_e32 v23, v8 +; GFX1030-NEXT: v_mov_b32_e32 v24, v9 +; GFX1030-NEXT: v_mov_b32_e32 v25, v10 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GFX1030-NEXT: v_readfirstlane_b32 s4, v14 -; GFX1030-NEXT: v_readfirstlane_b32 s5, v15 -; GFX1030-NEXT: v_readfirstlane_b32 s6, v16 -; GFX1030-NEXT: v_readfirstlane_b32 s7, v17 -; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15] -; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17] +; GFX1030-NEXT: v_readfirstlane_b32 s4, v11 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1030-NEXT: v_readfirstlane_b32 s6, v13 +; GFX1030-NEXT: v_readfirstlane_b32 s7, v14 +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] +; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] ; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[18:33], s[4:7] -; GFX1030-NEXT: ; implicit-def: $vgpr14_vgpr15 +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[15:30], s[4:7] +; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1030-NEXT: ; implicit-def: $vgpr15 +; GFX1030-NEXT: ; implicit-def: $vgpr16 +; GFX1030-NEXT: ; implicit-def: $vgpr17 ; GFX1030-NEXT: ; implicit-def: $vgpr18 ; GFX1030-NEXT: ; implicit-def: $vgpr19 ; GFX1030-NEXT: ; implicit-def: $vgpr20 @@ -178,10 +157,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; GFX1030-NEXT: ; implicit-def: $vgpr23 ; GFX1030-NEXT: ; implicit-def: $vgpr24 ; GFX1030-NEXT: ; implicit-def: $vgpr25 -; GFX1030-NEXT: ; implicit-def: $vgpr26 -; GFX1030-NEXT: ; implicit-def: $vgpr27 -; GFX1030-NEXT: ; implicit-def: $vgpr28 -; GFX1030-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16_vgpr17 +; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14 ; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1030-NEXT: s_cbranch_execnz .LBB6_1 ; GFX1030-NEXT: ; %bb.2: @@ -191,28 +167,24 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; ; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: v_mov_b32_e32 v5, v6 -; GFX1013-NEXT: v_mov_b32_e32 v6, v7 -; GFX1013-NEXT: v_mov_b32_e32 v7, v8 -; GFX1013-NEXT: v_mov_b32_e32 v8, v10 -; GFX1013-NEXT: v_mov_b32_e32 v9, v11 -; GFX1013-NEXT: v_mov_b32_e32 v10, v12 -; GFX1013-NEXT: v_mov_b32_e32 v18, v14 -; GFX1013-NEXT: v_mov_b32_e32 v19, v15 +; GFX1013-NEXT: v_mov_b32_e32 v16, v11 +; GFX1013-NEXT: v_mov_b32_e32 v17, v12 +; GFX1013-NEXT: v_mov_b32_e32 v18, v13 +; GFX1013-NEXT: v_mov_b32_e32 v19, v14 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GFX1013-NEXT: v_readfirstlane_b32 s4, v18 -; GFX1013-NEXT: 
v_readfirstlane_b32 s5, v19 -; GFX1013-NEXT: v_readfirstlane_b32 s6, v16 -; GFX1013-NEXT: v_readfirstlane_b32 s7, v17 -; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[18:19] -; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17] +; GFX1013-NEXT: v_readfirstlane_b32 s4, v16 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v17 +; GFX1013-NEXT: v_readfirstlane_b32 s6, v18 +; GFX1013-NEXT: v_readfirstlane_b32 s7, v19 +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17] +; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[18:19] ; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 ; GFX1013-NEXT: image_bvh_intersect_ray v[20:23], v[0:15], s[4:7] +; GFX1013-NEXT: ; implicit-def: $vgpr16_vgpr17 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX1013-NEXT: ; implicit-def: $vgpr18_vgpr19 -; GFX1013-NEXT: ; implicit-def: $vgpr14_vgpr15_vgpr16_vgpr17 +; GFX1013-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB6_1 @@ -224,41 +196,42 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, ; GFX1013-NEXT: v_mov_b32_e32 v2, v22 ; GFX1013-NEXT: v_mov_b32_e32 v3, v23 ; GFX1013-NEXT: ; return to shader part epilog - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } -define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) { +define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_mov_b32 s0, 0xffff -; GFX1030-NEXT: v_mov_b32_e32 v14, v0 -; GFX1030-NEXT: v_mov_b32_e32 v15, v1 -; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX1030-NEXT: v_and_b32_e32 v1, s0, v8 -; GFX1030-NEXT: v_mov_b32_e32 v16, v2 -; GFX1030-NEXT: v_mov_b32_e32 v17, v3 -; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX1030-NEXT: v_mov_b32_e32 v13, v0 +; GFX1030-NEXT: v_mov_b32_e32 v14, v1 +; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX1030-NEXT: v_and_b32_e32 v1, s0, v7 +; GFX1030-NEXT: v_mov_b32_e32 v15, v2 +; GFX1030-NEXT: v_mov_b32_e32 v16, v3 +; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 16, v7 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_and_b32_e32 v3, s0, v9 -; GFX1030-NEXT: v_mov_b32_e32 v18, v4 +; GFX1030-NEXT: v_and_b32_e32 v3, s0, v8 +; GFX1030-NEXT: v_mov_b32_e32 v17, v4 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo -; GFX1030-NEXT: v_and_or_b32 v19, v6, s0, v0 -; GFX1030-NEXT: v_and_or_b32 v20, v7, s0, v1 -; GFX1030-NEXT: v_lshl_or_b32 v21, v3, 16, v2 +; GFX1030-NEXT: v_and_or_b32 v18, v5, s0, v0 +; GFX1030-NEXT: v_and_or_b32 v19, v6, s0, v1 +; GFX1030-NEXT: v_lshl_or_b32 v20, v3, 16, v2 ; GFX1030-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; 
GFX1030-NEXT: v_readfirstlane_b32 s4, v10 -; GFX1030-NEXT: v_readfirstlane_b32 s5, v11 -; GFX1030-NEXT: v_readfirstlane_b32 s6, v12 -; GFX1030-NEXT: v_readfirstlane_b32 s7, v13 -; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] -; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] +; GFX1030-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1030-NEXT: v_readfirstlane_b32 s6, v11 +; GFX1030-NEXT: v_readfirstlane_b32 s7, v12 +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12] ; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[14:21], s[4:7] a16 -; GFX1030-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[13:20], s[4:7] a16 +; GFX1030-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1030-NEXT: ; implicit-def: $vgpr13 ; GFX1030-NEXT: ; implicit-def: $vgpr14 ; GFX1030-NEXT: ; implicit-def: $vgpr15 ; GFX1030-NEXT: ; implicit-def: $vgpr16 @@ -266,8 +239,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p ; GFX1030-NEXT: ; implicit-def: $vgpr18 ; GFX1030-NEXT: ; implicit-def: $vgpr19 ; GFX1030-NEXT: ; implicit-def: $vgpr20 -; GFX1030-NEXT: ; implicit-def: $vgpr21 -; GFX1030-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 +; GFX1030-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12 ; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1030-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1030-NEXT: ; %bb.2: @@ -278,72 +250,75 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p ; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: s_mov_b32 s0, 0xffff -; GFX1013-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX1013-NEXT: v_and_b32_e32 v14, s0, v8 -; GFX1013-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX1013-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX1013-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX1013-NEXT: v_and_b32_e32 v14, s0, v7 +; GFX1013-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX1013-NEXT: v_and_b32_e32 v8, s0, v8 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo -; GFX1013-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1013-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX1013-NEXT: v_and_or_b32 v5, v6, s0, v5 -; GFX1013-NEXT: v_and_or_b32 v6, v7, s0, v14 -; GFX1013-NEXT: v_lshl_or_b32 v7, v9, 16, v8 +; GFX1013-NEXT: v_lshl_or_b32 v7, v8, 16, v7 +; GFX1013-NEXT: v_and_or_b32 v5, v5, s0, v13 +; GFX1013-NEXT: v_and_or_b32 v6, v6, s0, v14 ; GFX1013-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX1013-NEXT: v_readfirstlane_b32 s4, v10 -; GFX1013-NEXT: v_readfirstlane_b32 s5, v11 -; GFX1013-NEXT: v_readfirstlane_b32 s6, v12 -; GFX1013-NEXT: v_readfirstlane_b32 s7, v13 -; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] -; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] +; GFX1013-NEXT: v_readfirstlane_b32 s4, v9 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v10 +; GFX1013-NEXT: v_readfirstlane_b32 s6, v11 +; GFX1013-NEXT: v_readfirstlane_b32 s7, v12 +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12] ; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1013-NEXT: image_bvh_intersect_ray v[14:17], v[0:7], s[4:7] a16 -; GFX1013-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX1013-NEXT: image_bvh_intersect_ray v[13:16], v[0:7], s[4:7] a16 +; GFX1013-NEXT: ; implicit-def: $vgpr9_vgpr10 ; 
GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GFX1013-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 +; GFX1013-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB7_1 ; GFX1013-NEXT: ; %bb.2: ; GFX1013-NEXT: s_mov_b32 exec_lo, s1 ; GFX1013-NEXT: s_waitcnt vmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, v14 -; GFX1013-NEXT: v_mov_b32_e32 v1, v15 -; GFX1013-NEXT: v_mov_b32_e32 v2, v16 -; GFX1013-NEXT: v_mov_b32_e32 v3, v17 +; GFX1013-NEXT: v_mov_b32_e32 v0, v13 +; GFX1013-NEXT: v_mov_b32_e32 v1, v14 +; GFX1013-NEXT: v_mov_b32_e32 v2, v15 +; GFX1013-NEXT: v_mov_b32_e32 v3, v16 ; GFX1013-NEXT: ; return to shader part epilog - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } -define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) { +define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh64_intersect_ray_vgpr_descr: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: v_mov_b32_e32 v19, v0 -; GFX1030-NEXT: v_mov_b32_e32 v20, v1 -; GFX1030-NEXT: v_mov_b32_e32 v21, v2 -; GFX1030-NEXT: v_mov_b32_e32 v22, v3 -; GFX1030-NEXT: v_mov_b32_e32 v23, v4 -; GFX1030-NEXT: v_mov_b32_e32 v24, v5 -; GFX1030-NEXT: v_mov_b32_e32 v25, v7 -; GFX1030-NEXT: v_mov_b32_e32 v26, v8 -; GFX1030-NEXT: v_mov_b32_e32 v27, v9 -; GFX1030-NEXT: v_mov_b32_e32 v28, v11 -; GFX1030-NEXT: v_mov_b32_e32 v29, v12 -; GFX1030-NEXT: v_mov_b32_e32 v30, v13 +; GFX1030-NEXT: v_mov_b32_e32 v16, v0 +; GFX1030-NEXT: v_mov_b32_e32 v17, v1 +; GFX1030-NEXT: v_mov_b32_e32 v18, v2 +; GFX1030-NEXT: v_mov_b32_e32 v19, v3 +; GFX1030-NEXT: v_mov_b32_e32 v20, v4 +; GFX1030-NEXT: v_mov_b32_e32 v21, v5 +; GFX1030-NEXT: v_mov_b32_e32 v22, v6 +; GFX1030-NEXT: v_mov_b32_e32 v23, v7 +; GFX1030-NEXT: v_mov_b32_e32 v24, v8 +; GFX1030-NEXT: v_mov_b32_e32 v25, v9 +; GFX1030-NEXT: v_mov_b32_e32 v26, v10 +; GFX1030-NEXT: v_mov_b32_e32 v27, v11 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX1030-NEXT: v_readfirstlane_b32 s4, v15 -; GFX1030-NEXT: v_readfirstlane_b32 s5, v16 -; GFX1030-NEXT: v_readfirstlane_b32 s6, v17 -; GFX1030-NEXT: v_readfirstlane_b32 s7, v18 -; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16] -; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18] +; GFX1030-NEXT: v_readfirstlane_b32 s4, v12 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v13 +; GFX1030-NEXT: v_readfirstlane_b32 s6, v14 +; GFX1030-NEXT: v_readfirstlane_b32 s7, v15 +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13] +; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15] ; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[19:34], s[4:7] -; GFX1030-NEXT: ; implicit-def: $vgpr15_vgpr16 +; GFX1030-NEXT: 
image_bvh64_intersect_ray v[0:3], v[16:31], s[4:7] +; GFX1030-NEXT: ; implicit-def: $vgpr12_vgpr13 +; GFX1030-NEXT: ; implicit-def: $vgpr16 +; GFX1030-NEXT: ; implicit-def: $vgpr17 +; GFX1030-NEXT: ; implicit-def: $vgpr18 ; GFX1030-NEXT: ; implicit-def: $vgpr19 ; GFX1030-NEXT: ; implicit-def: $vgpr20 ; GFX1030-NEXT: ; implicit-def: $vgpr21 @@ -353,10 +328,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; GFX1030-NEXT: ; implicit-def: $vgpr25 ; GFX1030-NEXT: ; implicit-def: $vgpr26 ; GFX1030-NEXT: ; implicit-def: $vgpr27 -; GFX1030-NEXT: ; implicit-def: $vgpr28 -; GFX1030-NEXT: ; implicit-def: $vgpr29 -; GFX1030-NEXT: ; implicit-def: $vgpr30 -; GFX1030-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18 +; GFX1030-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15 ; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1030-NEXT: s_cbranch_execnz .LBB8_1 ; GFX1030-NEXT: ; %bb.2: @@ -366,75 +338,72 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr ; ; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: v_mov_b32_e32 v6, v7 -; GFX1013-NEXT: v_mov_b32_e32 v7, v8 -; GFX1013-NEXT: v_mov_b32_e32 v8, v9 -; GFX1013-NEXT: v_mov_b32_e32 v9, v11 -; GFX1013-NEXT: v_mov_b32_e32 v10, v12 -; GFX1013-NEXT: v_mov_b32_e32 v11, v13 +; GFX1013-NEXT: v_mov_b32_e32 v16, v12 +; GFX1013-NEXT: v_mov_b32_e32 v17, v13 +; GFX1013-NEXT: v_mov_b32_e32 v18, v14 ; GFX1013-NEXT: v_mov_b32_e32 v19, v15 -; GFX1013-NEXT: v_mov_b32_e32 v20, v16 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX1013-NEXT: v_readfirstlane_b32 s4, v19 -; GFX1013-NEXT: v_readfirstlane_b32 s5, v20 -; GFX1013-NEXT: v_readfirstlane_b32 s6, v17 -; GFX1013-NEXT: v_readfirstlane_b32 s7, v18 -; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[19:20] -; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18] +; GFX1013-NEXT: v_readfirstlane_b32 s4, v16 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v17 +; GFX1013-NEXT: v_readfirstlane_b32 s6, v18 +; GFX1013-NEXT: v_readfirstlane_b32 s7, v19 +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17] +; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[18:19] ; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1013-NEXT: image_bvh64_intersect_ray v[21:24], v[0:15], s[4:7] +; GFX1013-NEXT: image_bvh64_intersect_ray v[20:23], v[0:15], s[4:7] +; GFX1013-NEXT: ; implicit-def: $vgpr16_vgpr17 ; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX1013-NEXT: ; implicit-def: $vgpr19_vgpr20 -; GFX1013-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18 +; GFX1013-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB8_1 ; GFX1013-NEXT: ; %bb.2: ; GFX1013-NEXT: s_mov_b32 exec_lo, s1 ; GFX1013-NEXT: s_waitcnt vmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, v21 -; GFX1013-NEXT: v_mov_b32_e32 v1, v22 -; GFX1013-NEXT: v_mov_b32_e32 v2, v23 -; GFX1013-NEXT: v_mov_b32_e32 v3, v24 +; GFX1013-NEXT: v_mov_b32_e32 v0, v20 +; GFX1013-NEXT: v_mov_b32_e32 v1, v21 +; GFX1013-NEXT: v_mov_b32_e32 v2, v22 +; GFX1013-NEXT: v_mov_b32_e32 v3, v23 ; GFX1013-NEXT: ; return to shader part epilog - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> 
%ray_inv_dir, <4 x i32> %tdescr) + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } -define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) { +define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_mov_b32 s0, 0xffff -; GFX1030-NEXT: v_mov_b32_e32 v15, v0 -; GFX1030-NEXT: v_mov_b32_e32 v16, v1 -; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX1030-NEXT: v_and_b32_e32 v1, s0, v9 -; GFX1030-NEXT: v_mov_b32_e32 v17, v2 -; GFX1030-NEXT: v_mov_b32_e32 v18, v3 -; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; GFX1030-NEXT: v_mov_b32_e32 v14, v0 +; GFX1030-NEXT: v_mov_b32_e32 v15, v1 +; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX1030-NEXT: v_and_b32_e32 v1, s0, v8 +; GFX1030-NEXT: v_mov_b32_e32 v16, v2 +; GFX1030-NEXT: v_mov_b32_e32 v17, v3 +; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_and_b32_e32 v3, s0, v10 -; GFX1030-NEXT: v_mov_b32_e32 v19, v4 -; GFX1030-NEXT: v_mov_b32_e32 v20, v5 -; GFX1030-NEXT: v_and_or_b32 v21, v7, s0, v0 -; GFX1030-NEXT: v_and_or_b32 v22, v8, s0, v1 -; GFX1030-NEXT: v_lshl_or_b32 v23, v3, 16, v2 +; GFX1030-NEXT: v_and_b32_e32 v3, s0, v9 +; GFX1030-NEXT: v_mov_b32_e32 v18, v4 +; GFX1030-NEXT: v_mov_b32_e32 v19, v5 +; GFX1030-NEXT: v_and_or_b32 v20, v6, s0, v0 +; GFX1030-NEXT: v_and_or_b32 v21, v7, s0, v1 +; GFX1030-NEXT: v_lshl_or_b32 v22, v3, 16, v2 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX1030-NEXT: v_readfirstlane_b32 s4, v11 -; GFX1030-NEXT: v_readfirstlane_b32 s5, v12 -; GFX1030-NEXT: v_readfirstlane_b32 s6, v13 -; GFX1030-NEXT: v_readfirstlane_b32 s7, v14 -; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] -; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] +; GFX1030-NEXT: v_readfirstlane_b32 s4, v10 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v11 +; GFX1030-NEXT: v_readfirstlane_b32 s6, v12 +; GFX1030-NEXT: v_readfirstlane_b32 s7, v13 +; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] +; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] ; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[15:30], s[4:7] a16 -; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[14:29], s[4:7] a16 +; GFX1030-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX1030-NEXT: ; implicit-def: $vgpr14 ; GFX1030-NEXT: ; implicit-def: $vgpr15 ; GFX1030-NEXT: ; implicit-def: $vgpr16 ; GFX1030-NEXT: ; implicit-def: $vgpr17 @@ -443,8 +412,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX1030-NEXT: ; implicit-def: $vgpr20 ; GFX1030-NEXT: ; implicit-def: $vgpr21 ; GFX1030-NEXT: ; implicit-def: $vgpr22 -; GFX1030-NEXT: ; implicit-def: $vgpr23 -; GFX1030-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14 +; GFX1030-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 ; 
GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1030-NEXT: s_cbranch_execnz .LBB9_1 ; GFX1030-NEXT: ; %bb.2: @@ -455,20 +423,20 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: s_mov_b32 s0, 0xffff -; GFX1013-NEXT: v_mov_b32_e32 v16, v11 -; GFX1013-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GFX1013-NEXT: v_and_b32_e32 v11, s0, v9 -; GFX1013-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX1013-NEXT: v_and_b32_e32 v10, s0, v10 -; GFX1013-NEXT: v_mov_b32_e32 v17, v12 -; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX1013-NEXT: v_mov_b32_e32 v16, v10 +; GFX1013-NEXT: v_mov_b32_e32 v17, v11 +; GFX1013-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; GFX1013-NEXT: v_and_b32_e32 v11, s0, v8 +; GFX1013-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX1013-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX1013-NEXT: v_mov_b32_e32 v18, v12 +; GFX1013-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX1013-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX1013-NEXT: v_mov_b32_e32 v18, v13 -; GFX1013-NEXT: v_mov_b32_e32 v19, v14 +; GFX1013-NEXT: v_mov_b32_e32 v19, v13 +; GFX1013-NEXT: v_lshl_or_b32 v8, v9, 16, v8 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo -; GFX1013-NEXT: v_and_or_b32 v6, v7, s0, v6 -; GFX1013-NEXT: v_and_or_b32 v7, v8, s0, v11 -; GFX1013-NEXT: v_lshl_or_b32 v8, v10, 16, v9 +; GFX1013-NEXT: v_and_or_b32 v6, v6, s0, v10 +; GFX1013-NEXT: v_and_or_b32 v7, v7, s0, v11 ; GFX1013-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v16 ; GFX1013-NEXT: v_readfirstlane_b32 s5, v17 @@ -493,7 +461,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX1013-NEXT: v_mov_b32_e32 v2, v22 ; GFX1013-NEXT: v_mov_b32_e32 v3, v23 ; GFX1013-NEXT: ; return to shader part epilog - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r } @@ -567,16 +535,16 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr %node_ptr = load i32, i32* %gep_node_ptr, align 4 %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid %ray_extent = load float, float* %gep_ray, align 4 - %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0 - %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1 - %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2 - %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0 - %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1 - %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2 - %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0 - %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1 - %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2 - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1 + %ray_origin = insertelement <3 x 
float> %ray_origin1, float 2.0, i32 2 + %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0 + %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1 + %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2 + %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0 + %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1 + %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2 + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) store <4 x i32> %v, <4 x i32>* undef ret void } @@ -680,16 +648,16 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node %node_ptr = load i32, i32* %gep_node_ptr, align 4 %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid %ray_extent = load float, float* %gep_ray, align 4 - %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0 - %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1 - %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2 - %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0 - %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1 - %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2 - %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0 - %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1 - %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2 - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) + %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2 + %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0 + %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1 + %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2 + %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0 + %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1 + %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2 + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) store <4 x i32> %v, <4 x i32>* undef ret void } @@ -755,16 +723,16 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray, %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid %ray_extent = load float, float* %gep_ray, align 4 - %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0 - %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1 - %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2 - %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0 - %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1 - %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2 - %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0 - %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1 - %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2 - %v = call 
<4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) + %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2 + %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0 + %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1 + %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2 + %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0 + %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1 + %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2 + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) store <4 x i32> %v, <4 x i32>* undef ret void } @@ -860,16 +828,16 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_ %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid %ray_extent = load float, float* %gep_ray, align 4 - %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0 - %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1 - %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2 - %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0 - %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1 - %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2 - %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0 - %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1 - %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2 - %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) + %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2 + %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0 + %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1 + %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2 + %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0 + %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1 + %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2 + %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) store <4 x i32> %v, <4 x i32>* undef ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index 7842369..24b236b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -3,15 +3,15 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s ; RUN: not --crash llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s -; uint4 
llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
-; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
-; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
-; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
+; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
-declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
-declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
-declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
-declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
+declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
+declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)
+declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
+declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)
 ; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget
 ; Arguments are flattened to represent the actual VGPR_A layout, so we have no
@@ -23,43 +23,43 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: ; return to shader part epilog
 main_body:
- %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0
- %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1
- %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2
- %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0
- %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1
- %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2
- %ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
- %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
- %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
- %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+ %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
+ %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
+ %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
+ %ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0
+ %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
+ %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
+ %ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0
+ %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
+ %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
+ %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
 %r = bitcast <4 x i32> %v to <4 x float>
 ret <4 x float> %r
 }
-define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, float inreg %ray_extent, <4 x float> inreg %ray_origin, <4 x half> inreg %ray_dir, <4 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
+define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, float inreg %ray_extent, <3 x float> inreg %ray_origin, <3 x half> inreg %ray_dir, <3 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
 ; GCN-LABEL: image_bvh_intersect_ray_a16:
 ; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_lshr_b32 s5, s8, 16
-; GCN-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GCN-NEXT: s_pack_ll_b32_b16 s5, s5, s9
+; GCN-NEXT: s_mov_b32 s15, s12
+; GCN-NEXT: s_mov_b32 s12, s9
+; GCN-NEXT: s_lshr_b32 s9, s7, 16
+; GCN-NEXT: s_pack_ll_b32_b16 s6, s6, s7
+; GCN-NEXT: s_pack_ll_b32_b16 s7, s9, s8
 ; GCN-NEXT: v_mov_b32_e32 v0, s0
 ; GCN-NEXT: v_mov_b32_e32 v1, s1
 ; GCN-NEXT: v_mov_b32_e32 v2, s2
 ; GCN-NEXT: v_mov_b32_e32 v3, s3
 ; GCN-NEXT: v_mov_b32_e32 v4, s4
-; GCN-NEXT: v_mov_b32_e32 v5, s6
-; GCN-NEXT: v_mov_b32_e32 v6, s7
-; GCN-NEXT: v_mov_b32_e32 v7, s5
-; GCN-NEXT: s_mov_b32 s15, s13
-; GCN-NEXT: s_mov_b32 s14, s12
-; GCN-NEXT: s_mov_b32 s13, s11
-; GCN-NEXT: s_mov_b32 s12, s10
+; GCN-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NEXT: v_mov_b32_e32 v6, s6
+; GCN-NEXT: v_mov_b32_e32 v7, s7
+; GCN-NEXT: s_mov_b32 s14, s11
+; GCN-NEXT: s_mov_b32 s13, s10
 ; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[12:15] a16
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: ; return to shader part epilog
 main_body:
- %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+ %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
 %r = bitcast <4 x i32> %v to <4 x float>
 ret <4 x float> %r
 }
@@ -74,44 +74,44 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(<2 x i32> %node_ptr_vec,
 ; GCN-NEXT: ; return to shader part epilog
 main_body:
 %node_ptr = bitcast <2 x i32> %node_ptr_vec to i64
- %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0
- %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1
- %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2
- %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0
- %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1
- %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2
- %ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
- %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
- %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
- %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+ %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
+ %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
+ %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
+ %ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0
+ %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
+ %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
+ %ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0
+ %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
+ %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
+ %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
 %r = bitcast <4 x i32> %v to <4 x float>
 ret <4 x float> %r
 }
-define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, float inreg %ray_extent, <4 x float> inreg %ray_origin, <4 x half> inreg %ray_dir, <4 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
+define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, float inreg %ray_extent, <3 x float> inreg %ray_origin, <3 x half> inreg %ray_dir, <3 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
 ; GCN-LABEL: image_bvh64_intersect_ray_a16:
 ; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_lshr_b32 s6, s9, 16
-; GCN-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GCN-NEXT: s_pack_ll_b32_b16 s6, s6, s10
+; GCN-NEXT: s_mov_b32 s14, s12
+; GCN-NEXT: s_mov_b32 s12, s10
+; GCN-NEXT: s_lshr_b32 s10, s8, 16
+; GCN-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GCN-NEXT: s_pack_ll_b32_b16 s8, s10, s9
 ; GCN-NEXT: v_mov_b32_e32 v0, s0
 ; GCN-NEXT: v_mov_b32_e32 v1, s1
 ; GCN-NEXT: v_mov_b32_e32 v2, s2
 ; GCN-NEXT: v_mov_b32_e32 v3, s3
 ; GCN-NEXT: v_mov_b32_e32 v4, s4
 ; GCN-NEXT: v_mov_b32_e32 v5, s5
-; GCN-NEXT: v_mov_b32_e32 v6, s7
-; GCN-NEXT: v_mov_b32_e32 v7, s8
-; GCN-NEXT: v_mov_b32_e32 v8, s6
-; GCN-NEXT: s_mov_b32 s15, s14
-; GCN-NEXT: s_mov_b32 s14, s13
-; GCN-NEXT: s_mov_b32 s13, s12
-; GCN-NEXT: s_mov_b32 s12, s11
+; GCN-NEXT: v_mov_b32_e32 v6, s6
+; GCN-NEXT: v_mov_b32_e32 v7, s7
+; GCN-NEXT: v_mov_b32_e32 v8, s8
+; GCN-NEXT: s_mov_b32 s15, s13
+; GCN-NEXT: s_mov_b32 s13, s11
 ; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[12:15] a16
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: ; return to shader part epilog
 main_body:
- %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+ %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
 %r = bitcast <4 x i32> %v to <4 x float>
 ret <4 x float> %r
 }
@@ -178,16 +178,16 @@ main_body:
 %node_ptr = load i32, i32* %gep_node_ptr, align 4
 %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
 %ray_extent = load float, float* %gep_ray, align 4
- %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
- %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
- %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
- %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0
- %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1
- %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2
- %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0
- %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1
- %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2
- %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+ %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
+ %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
+ %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
+ %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
+ %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
+ %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
+ %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
+ %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
+ %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
+ %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
 store <4 x i32> %v, <4 x i32>* undef
 ret void
 }
@@ -246,16 +246,16 @@ main_body:
 %node_ptr = load i32, i32* %gep_node_ptr, align 4
 %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
 %ray_extent = load float, float* %gep_ray, align 4
- %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
- %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
- %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
- %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0
- %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1
- %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2
- %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0
- %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1
- %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2
- %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+ %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
+ %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
+ %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
+ %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
+ %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
+ %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
+ %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
+ %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
+ %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
+ %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
 store <4 x i32> %v, <4 x i32>* undef
 ret void
 }
@@ -316,16 +316,16 @@ main_body:
 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
 %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
 %ray_extent = load float, float* %gep_ray, align 4
- %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
- %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
- %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
- %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0
- %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1
- %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2
- %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0
- %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1
- %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2
- %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
+ %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
+ %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
+ %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
+ %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
+ %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
+ %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
+ %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
+ %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
+ %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
+ %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
 store <4 x i32> %v, <4 x i32>* undef
 ret void
 }
@@ -380,16 +380,16 @@ main_body:
 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
 %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
 %ray_extent = load float, float* %gep_ray, align 4
- %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
- %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
- %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
- %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0
- %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1
- %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2
- %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0
- %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1
- %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2
- %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
+ %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
+ %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
+ %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
+ %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
+ %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
+ %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
+ %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
+ %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
+ %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
+ %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
 store <4 x i32> %v, <4 x i32>* undef
 ret void
 }
-- 
2.7.4