From: Thomas Symalla <thomas.symalla@amd.com>
Date: Tue, 18 Oct 2022 14:54:01 +0000 (+0200)
Subject: [NFC][AMDGPU] Add tests for dependent v_bfi instructions.
X-Git-Tag: upstream/17.0.6~30244
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=09fbdde42cc1aa09fbfc11c0b2d2be8a28cc91db;p=platform%2Fupstream%2Fllvm.git

[NFC][AMDGPU] Add tests for dependent v_bfi instructions.

This commit adds a few tests for the codegen of nested v_bfi
instructions. Such instruction sequences are generated when the
canonical form of bitfieldInsert is used and the resulting
sequences are then transformed by SimplifyDemandedBits.

This is a pre-commit for a change which enables the backend to
lower these instruction sequences into v_bfi instructions.
---

diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index 3dc4225..6c0a183 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -1907,70 +1907,3 @@ entry:
   store i64 %scalar.use, i64 addrspace(1)* undef
   ret void
 }
-
-define i32 @v_bfi_seq_i32(i32 %x, i32 %y, i32 %z) {
-; GFX7-LABEL: v_bfi_seq_i32:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GFX7-NEXT:    s_mov_b32 s4, 0xffc00
-; GFX7-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX7-NEXT:    v_bfi_b32 v2, s4, v1, v2
-; GFX7-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GFX7-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_bfi_seq_i32:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GFX8-NEXT:    s_mov_b32 s4, 0xffc00
-; GFX8-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_bfi_b32 v2, s4, v1, v2
-; GFX8-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GFX8-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_bfi_seq_i32:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX10-NEXT:    v_bfi_b32 v1, 0xffc00, v1, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-GISEL-LABEL: v_bfi_seq_i32:
-; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, 0xffc00, v1
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, 0xfff003ff, v2
-; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_bfi_seq_i32:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xfff003ff, v2
-; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0xffc00, v1, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
-; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %1 = shl i32 %x, 20
-  %2 = and i32 %y, 1047552
-  %3 = and i32 %z, -1047553
-  %4 = or i32 %2, %3
-  %5 = xor i32 %1, %y
-  %6 = and i32 %5, 1072693248
-  %7 = xor i32 %6, %4
-  ret i32 %7
-}
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
new file mode 100644
index 0000000..c81b04b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
+
+; When translating sequences like
+; bitfieldInsert(bitfieldInsert(...)),
+; where the result of one bitfieldInsert is the base of another,
+; the SimplifyDemandedBits transform run during InstCombine
+; can merge the inverted mask of the outer bitfieldInsert with the
+; inverted mask of the inner bitfieldInsert. Where possible,
+; e.g. if the constants are disjoint and the original inverted mask of
+; the outer bitfieldInsert can be reconstructed, aim to generate multiple
+; v_bfi instructions.
+define float @v_bfi_single_nesting_level (float %x, float %y, float %z) { ; inserts %y into bits 10-19 and %x * 1023.0 into bits 20-29 of the base %z * 1023.0
+; GFX10-LABEL: v_bfi_single_nesting_level:
+; GFX10:       ; %bb.0: ; %.entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xc00003ff, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
+; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+.entry:
+  %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03 ; base value: %z scaled by 1023.0
+  %mul.base.i32 = fptoui float %mul.base to i32 ; base as an unsigned 32-bit integer
+  %y.i32 = fptoui float %y to i32 ; inner insert value (unscaled)
+  %shl.inner.insert = shl i32 %y.i32, 10 ; align the inner insert value with bit 10
+  %bfi1.and = and i32 %shl.inner.insert, 1047552 ; 1047552 = 0x000ffc00: inner insert field, bits 10-19
+  %bfi1.andnot = and i32 %mul.base.i32, -1073740801 ; -1073740801 = 0xc00003ff = ~(0x000ffc00 | 0x3ff00000): the base keeps only bits 0-9 and 30-31, so both insert fields are cleared here
+  %bfi1.or = or i32 %bfi1.and, %bfi1.andnot ; inner insert: field bits from %y, remaining bits from the masked base
+  %mul.outer.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03 ; outer insert value: %x scaled by 1023.0
+  %mul.outer.insert.i32 = fptoui float %mul.outer.insert to i32
+  %shl.outer.insert = shl i32 %mul.outer.insert.i32, 20 ; align the outer insert value with bit 20
+  %and.outer = and i32 %shl.outer.insert, 1072693248 ; 1072693248 = 0x3ff00000: outer insert field, bits 20-29
+  %or.outer = or i32 %bfi1.or, %and.outer ; outer insert: bits 20-29 were already cleared from the base by the 0xc00003ff mask, so this OR completes it
+  %result = bitcast i32 %or.outer to float ; reinterpret the packed bits as float for the return value
+  ret float %result
+}
+
+define float @v_bfi_no_nesting(float %x, float %y, float %z) { ; and/or sequence whose base mask (0xc0000400) overlaps the bits 10-19 field; presumably the negative case that must not be treated as nested inserts
+; GFX10-LABEL: v_bfi_no_nesting:
+; GFX10:       ; %bb.0: ; %.entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xc0000400, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
+; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+.entry:
+  %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03 ; base value: %z scaled by 1023.0
+  %mul.base.i32 = fptoui float %mul.base to i32 ; base as an unsigned 32-bit integer
+  %y.i32 = fptoui float %y to i32 ; first OR operand source (unscaled)
+  %shl.inner.insert = shl i32 %y.i32, 10 ; align %y with bit 10
+  %inner.and = and i32 %shl.inner.insert, 1047552 ; 1047552 = 0x000ffc00: bits 10-19
+  %inner.and2 = and i32 %mul.base.i32, -1073740800 ; -1073740800 = 0xc0000400: keeps bit 10 and bits 30-31; bit 10 overlaps the 0x000ffc00 field, so this is not the complement of the two field masks
+  %inner.or = or i32 %inner.and, %inner.and2 ; bit 10 can come from either operand here
+  %mul.outer.insert = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03 ; %x scaled by 1023.0
+  %mul.outer.insert.i32 = fptoui float %mul.outer.insert to i32
+  %shl.outer.insert = shl i32 %mul.outer.insert.i32, 20 ; align the scaled %x with bit 20
+  %and.outer = and i32 %shl.outer.insert, 1072693248 ; 1072693248 = 0x3ff00000: bits 20-29
+  %or.outer = or i32 %inner.or, %and.outer ; combine all three masked values
+  %result = bitcast i32 %or.outer to float ; reinterpret the packed bits as float for the return value
+  ret float %result
+}
+
+define float @v_bfi_two_levels(float %x, float %y, float %z) { ; inserts %y into bits 5-9 and bits 10-19, and %x * 1023.0 into bits 20-29, of the base %z
+; GFX10-LABEL: v_bfi_two_levels:
+; GFX10:       ; %bb.0: ; %.entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 5, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xc000001f, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
+; GFX10-NEXT:    v_and_or_b32 v2, 0x3e0, v3, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffc00, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
+; GFX10-NEXT:    v_or3_b32 v0, v2, v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+.entry:
+  %y.i32 = fptoui float %y to i32 ; source of both the inner and the middle insert value
+  %shl.insert.inner = shl i32 %y.i32, 5 ; align %y with bit 5
+  %and.insert.inner = and i32 %shl.insert.inner, 992 ; 992 = 0x000003e0: inner insert field, bits 5-9
+  %z.i32 = fptoui float %z to i32 ; base value (not scaled in this test)
+  %base.inner = and i32 %z.i32, -1073741793 ; -1073741793 = 0xc000001f = ~(0x3e0 | 0xffc00 | 0x3ff00000): the base keeps only bits 0-4 and 30-31, so all three insert fields are cleared here
+  %or.inner = or i32 %and.insert.inner , %base.inner ; inner insert into the masked base
+  %shl.insert.mid = shl i32 %y.i32, 10 ; align %y with bit 10
+  %and.insert.mid = and i32 %shl.insert.mid, 1047552 ; 1047552 = 0x000ffc00: middle insert field, bits 10-19
+  %or.mid = or i32 %or.inner, %and.insert.mid ; middle insert; its field was already cleared from the base
+  %fmul.insert.outer = fmul reassoc nnan nsz arcp contract afn float %x, 1.023000e+03 ; outer insert value: %x scaled by 1023.0
+  %cast.insert.outer = fptoui float %fmul.insert.outer to i32
+  %shl.insert.outer = shl i32 %cast.insert.outer, 20 ; align the outer insert value with bit 20
+  %and.insert.outer = and i32 %shl.insert.outer, 1072693248 ; 1072693248 = 0x3ff00000: outer insert field, bits 20-29
+  %or.outer = or i32 %or.mid, %and.insert.outer ; outer insert; its field was already cleared from the base
+  %result = bitcast i32 %or.outer to float ; reinterpret the packed bits as float for the return value
+  ret float %result
+}