AMDGPU: Add baseline test for propagating amdgpu-waves-per-eu
author Matt Arsenault <Matthew.Arsenault@amd.com>
Sat, 3 Jun 2023 20:44:19 +0000 (16:44 -0400)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Thu, 15 Jun 2023 17:16:25 +0000 (13:16 -0400)
llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll [new file with mode: 0644]

diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
new file mode 100644 (file)
index 0000000..bba350b
--- /dev/null
@@ -0,0 +1,424 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 2
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck %s
+
+; Check propagation of amdgpu-waves-per-eu attribute.
+
+; Called from kernels with 1,8 and 2,8 => 1,8
+define internal void @default_to_1_8_a() {
+; CHECK-LABEL: define internal void @default_to_1_8_a
+; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define amdgpu_kernel void @kernel_1_8() #0 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_1_8
+; CHECK-SAME: () #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    call void @default_to_1_8_a()
+; CHECK-NEXT:    ret void
+;
+  call void @default_to_1_8_a()
+  ret void
+}
+
+; Called from a single kernel with 1,2
+define internal void @default_to_1_2() {
+; CHECK-LABEL: define internal void @default_to_1_2
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define amdgpu_kernel void @kernel_1_2() #1 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_1_2
+; CHECK-SAME: () #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    call void @default_to_1_2()
+; CHECK-NEXT:    call void @flat_group_1_1()
+; CHECK-NEXT:    call void @default_to_1_8_b()
+; CHECK-NEXT:    call void @flat_group_2_8()
+; CHECK-NEXT:    ret void
+;
+  call void @default_to_1_2()
+  call void @flat_group_1_1()
+  call void @default_to_1_8_b()
+  call void @flat_group_2_8()
+  ret void
+}
+
+; Called from a single kernel with 1,4
+define internal void @default_to_1_4() {
+; CHECK-LABEL: define internal void @default_to_1_4
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define amdgpu_kernel void @kernel_1_4() #2 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_1_4
+; CHECK-SAME: () #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT:    call void @default_to_1_4()
+; CHECK-NEXT:    ret void
+;
+  call void @default_to_1_4()
+  ret void
+}
+
+; Called from kernels with 2,9 and 9,9
+define internal void @default_to_2_9() {
+; CHECK-LABEL: define internal void @default_to_2_9
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; This already has strict bounds but is called from kernels with wider
+; bounds, so it should not be changed.
+define internal void @flat_group_1_1() #3 {
+; CHECK-LABEL: define internal void @flat_group_1_1
+; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; 2,8 -> 2,2
+define internal void @flat_group_2_8() #4 {
+; CHECK-LABEL: define internal void @flat_group_2_8
+; CHECK-SAME: () #[[ATTR5:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; 9,10 -> 9,9
+define internal void @flat_group_9_10() #5 {
+; CHECK-LABEL: define internal void @flat_group_9_10
+; CHECK-SAME: () #[[ATTR6:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define amdgpu_kernel void @kernel_2_9() #6 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_2_9
+; CHECK-SAME: () #[[ATTR7:[0-9]+]] {
+; CHECK-NEXT:    call void @default_to_2_9()
+; CHECK-NEXT:    call void @flat_group_1_1()
+; CHECK-NEXT:    ret void
+;
+  call void @default_to_2_9()
+  call void @flat_group_1_1()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_9_9() #7 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_9_9
+; CHECK-SAME: () #[[ATTR8:[0-9]+]] {
+; CHECK-NEXT:    call void @default_to_2_9()
+; CHECK-NEXT:    call void @flat_group_9_10()
+; CHECK-NEXT:    ret void
+;
+  call void @default_to_2_9()
+  call void @flat_group_9_10()
+  ret void
+}
+
+; Called from kernels with 2,8 and 1,2 => 1,8
+define internal void @default_to_1_8_b() {
+; CHECK-LABEL: define internal void @default_to_1_8_b
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; The kernel's lower bound is higher than the callee's lower bound, so
+; this should probably be illegal.
+define amdgpu_kernel void @kernel_2_8() #4 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_2_8
+; CHECK-SAME: () #[[ATTR5]] {
+; CHECK-NEXT:    call void @default_to_1_8_a()
+; CHECK-NEXT:    call void @default_to_1_8_b()
+; CHECK-NEXT:    ret void
+;
+  call void @default_to_1_8_a()
+  call void @default_to_1_8_b()
+  ret void
+}
+
+; 1,2 -> 2,2
+define internal void @merge_cycle_0() #1 {
+; CHECK-LABEL: define internal void @merge_cycle_0
+; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-NEXT:    call void @merge_cycle_1()
+; CHECK-NEXT:    ret void
+;
+  call void @merge_cycle_1()
+  ret void
+}
+
+; Called directly from 1,2 (merge_cycle_0); the 3,8 kernel reaches it through the cycle
+; 2,8 -> 2,8
+define internal void @merge_cycle_1() #4 {
+; CHECK-LABEL: define internal void @merge_cycle_1
+; CHECK-SAME: () #[[ATTR5]] {
+; CHECK-NEXT:    call void @merge_cycle_0()
+; CHECK-NEXT:    ret void
+;
+  call void @merge_cycle_0()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_3_8() #8 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_3_8
+; CHECK-SAME: () #[[ATTR9:[0-9]+]] {
+; CHECK-NEXT:    call void @merge_cycle_0()
+; CHECK-NEXT:    call void @default_captured_address()
+; CHECK-NEXT:    call void @externally_visible_default()
+; CHECK-NEXT:    [[F32:%.*]] = call float @bitcasted_function()
+; CHECK-NEXT:    ret void
+;
+  call void @merge_cycle_0()
+  call void @default_captured_address()
+  call void @externally_visible_default()
+  %f32 = call float @bitcasted_function()
+  ret void
+}
+
+define internal void @default_captured_address() {
+; CHECK-LABEL: define internal void @default_captured_address
+; CHECK-SAME: () #[[ATTR10:[0-9]+]] {
+; CHECK-NEXT:    store volatile ptr @default_captured_address, ptr undef, align 8
+; CHECK-NEXT:    ret void
+;
+  store volatile ptr @default_captured_address, ptr undef, align 8
+  ret void
+}
+
+define void @externally_visible_default() {
+; CHECK-LABEL: define void @externally_visible_default
+; CHECK-SAME: () #[[ATTR10]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; 1,10 -> 3,8
+define internal i32 @bitcasted_function() {
+; CHECK-LABEL: define internal i32 @bitcasted_function
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:    ret i32 0
+;
+  ret i32 0
+}
+
+define internal void @called_from_invalid_bounds_0() {
+; CHECK-LABEL: define internal void @called_from_invalid_bounds_0
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define internal void @called_from_invalid_bounds_1() {
+; CHECK-LABEL: define internal void @called_from_invalid_bounds_1
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; Invalid range for amdgpu-waves-per-eu
+define amdgpu_kernel void @kernel_invalid_bounds_0_8() #9 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_0_8
+; CHECK-SAME: () #[[ATTR11:[0-9]+]] {
+; CHECK-NEXT:    call void @called_from_invalid_bounds_0()
+; CHECK-NEXT:    ret void
+;
+  call void @called_from_invalid_bounds_0()
+  ret void
+}
+
+; Invalid range for amdgpu-waves-per-eu
+define amdgpu_kernel void @kernel_invalid_bounds_1_123() #10 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_1_123
+; CHECK-SAME: () #[[ATTR12:[0-9]+]] {
+; CHECK-NEXT:    call void @called_from_invalid_bounds_1()
+; CHECK-NEXT:    ret void
+;
+  call void @called_from_invalid_bounds_1()
+  ret void
+}
+
+; XXX - Why is the maximum not 6?
+; The 512 maximum workgroup size implies a minimum occupancy of 2. The
+; implied minimum waves-per-eu should not be 3
+; -> 2,10
+define void @larger_group_size_implies_lower_minimum() #11 {
+; CHECK-LABEL: define void @larger_group_size_implies_lower_minimum
+; CHECK-SAME: () #[[ATTR13:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define amdgpu_kernel void @kernel_3_6() #12 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_3_6
+; CHECK-SAME: () #[[ATTR14:[0-9]+]] {
+; CHECK-NEXT:    call void @larger_group_size_implies_lower_minimum()
+; CHECK-NEXT:    ret void
+;
+  call void @larger_group_size_implies_lower_minimum()
+  ret void
+}
+
+; 3,6 -> 6,9
+define internal void @refine_upper_func_3_6() #13 {
+; CHECK-LABEL: define internal void @refine_upper_func_3_6
+; CHECK-SAME: () #[[ATTR15:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; 4,8 -> 6,8
+define internal void @refine_lower_func_4_8() #14 {
+; CHECK-LABEL: define internal void @refine_lower_func_4_8
+; CHECK-SAME: () #[[ATTR16:[0-9]+]] {
+; CHECK-NEXT:    call void @refine_upper_func_3_6()
+; CHECK-NEXT:    ret void
+;
+  call void @refine_upper_func_3_6()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_foo_6_8() #15 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_foo_6_8
+; CHECK-SAME: () #[[ATTR17:[0-9]+]] {
+; CHECK-NEXT:    call void @refine_upper_func_3_6()
+; CHECK-NEXT:    call void @refine_lower_func_4_8()
+; CHECK-NEXT:    call void @func_9_10_a()
+; CHECK-NEXT:    ret void
+;
+  call void @refine_upper_func_3_6()
+  call void @refine_lower_func_4_8()
+  call void @func_9_10_a()
+  ret void
+}
+
+; 5,5 -> 5,5
+define internal void @func_5_5() #16 {
+; CHECK-LABEL: define internal void @func_5_5
+; CHECK-SAME: () #[[ATTR18:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; 5,8 -> 8,8
+define internal void @func_5_8() #17 {
+; CHECK-LABEL: define internal void @func_5_8
+; CHECK-SAME: () #[[ATTR19:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; 9,10 -> 9,10
+define internal void @func_9_10_a() #18 {
+; CHECK-LABEL: define internal void @func_9_10_a
+; CHECK-SAME: () #[[ATTR20:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; 9,10 -> 9,9
+define internal void @func_9_10_b() #18 {
+; CHECK-LABEL: define internal void @func_9_10_b
+; CHECK-SAME: () #[[ATTR20]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define amdgpu_kernel void @kernel_bar_8_9() #19 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_bar_8_9
+; CHECK-SAME: () #[[ATTR21:[0-9]+]] {
+; CHECK-NEXT:    call void @refine_upper_func_3_6()
+; CHECK-NEXT:    call void @func_5_5()
+; CHECK-NEXT:    call void @func_9_10_b()
+; CHECK-NEXT:    call void @func_5_8()
+; CHECK-NEXT:    call void @externally_visible()
+; CHECK-NEXT:    ret void
+;
+  call void @refine_upper_func_3_6()
+  call void @func_5_5()
+  call void @func_9_10_b()
+  call void @func_5_8()
+  call void @externally_visible()
+  ret void
+}
+
+; This is an optimization hint based on users, so it's not strictly
+; required that all callers be visible.
+define void @externally_visible() {
+; CHECK-LABEL: define void @externally_visible
+; CHECK-SAME: () #[[ATTR10]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+
+; Use a 1 wave workgroup so that the workgroup size has no effect on
+; the implied waves per EU.
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="1,8" }
+attributes #1 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="1,2" }
+attributes #2 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="1,4" }
+attributes #3 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="1,1" }
+attributes #4 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="2,8" }
+attributes #5 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="9,10" }
+attributes #6 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="2,9" }
+attributes #7 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="9,9" }
+attributes #8 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="3,8" }
+attributes #9 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="0,8" }
+attributes #10 = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="1,123" }
+attributes #11 = { "amdgpu-flat-work-group-size"="1,512" }
+attributes #12 = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-waves-per-eu"="3,6" }
+attributes #13 = { "amdgpu-waves-per-eu"="3,6" }
+attributes #14 = { "amdgpu-waves-per-eu"="4,8" }
+attributes #15 = { "amdgpu-waves-per-eu"="6,8" }
+attributes #16 = { "amdgpu-waves-per-eu"="5,5" }
+attributes #17 = { "amdgpu-waves-per-eu"="5,8" }
+attributes #18 = { "amdgpu-waves-per-eu"="9,10" }
+attributes #19 = { "amdgpu-waves-per-eu"="8,9" }
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR9]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR10]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR12]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR13]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR14]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR15]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR16]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR17]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR18]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR19]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR20]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR21]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" }
+;.