From cf0aa0b66ccec87b0ac14f632998074e507be950 Mon Sep 17 00:00:00 2001
From: Stefan Pintilie
Date: Tue, 13 Jul 2021 21:15:30 -0500
Subject: [PATCH] [NFC][PowerPC] Added test to check register allocation for ACC registers

ACC registers are a combination of 4 consecutive vector registers and
therefore sometimes require special treatment for register allocation.

This patch only adds a test.
---
 llvm/test/CodeGen/PowerPC/ppc64-acc-regalloc.ll | 352 ++++++++++++++++++++++++
 1 file changed, 352 insertions(+)
 create mode 100644 llvm/test/CodeGen/PowerPC/ppc64-acc-regalloc.ll

diff --git a/llvm/test/CodeGen/PowerPC/ppc64-acc-regalloc.ll b/llvm/test/CodeGen/PowerPC/ppc64-acc-regalloc.ll
new file mode 100644
index 0000000..ace652d
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/ppc64-acc-regalloc.ll
@@ -0,0 +1,352 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s \
+; RUN: | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \
+; RUN: -ppc-track-subreg-liveness < %s | FileCheck %s --check-prefix=TRACKLIVE
+
+%0 = type <{ double }>
+%1 = type <{ double }>
+
+define void @acc_regalloc(i32* %arg, [0 x %0]* %arg1, [0 x %1]* %arg2) local_unnamed_addr {
+; CHECK-LABEL: acc_regalloc:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: lwz r3, 0(r3)
+; CHECK-NEXT: lxv vs0, 0(0)
+; CHECK-NEXT: xxlxor vs2, vs2, vs2
+; CHECK-NEXT: xxlxor vs3, vs3, vs3
+; CHECK-NEXT: stfd f14, -144(r1) # 8-byte Folded Spill
+; CHECK-NEXT: stfd f15, -136(r1) # 8-byte Folded Spill
+; CHECK-NEXT: xxlxor v2, v2, v2
+; CHECK-NEXT: li r6, 1
+; CHECK-NEXT: li r4, 16
+; CHECK-NEXT: stfd f16, -128(r1) # 8-byte Folded Spill
+; CHECK-NEXT: stfd f17, -120(r1) # 8-byte Folded Spill
+; CHECK-NEXT: extswsli r3, r3, 3
+; CHECK-NEXT: stfd f18, -112(r1) # 8-byte Folded Spill
+; CHECK-NEXT: stfd f19, -104(r1) # 8-byte Folded Spill
+; CHECK-NEXT: xvmaddadp vs3, vs0, vs3
+; CHECK-NEXT: lxvdsx vs1, 0, r3
+; CHECK-NEXT: xvmaddadp vs2, vs1, vs2
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: .LBB0_1: # %bb9
+; CHECK-NEXT: #
+; CHECK-NEXT: addi r6, r6, 2
+; CHECK-NEXT: lxv vs5, -64(r5)
+; CHECK-NEXT: lxv vs6, -16(r5)
+; CHECK-NEXT: lxv vs4, 16(0)
+; CHECK-NEXT: xxlor v7, vs2, vs2
+; CHECK-NEXT: xxlxor v8, v8, v8
+; CHECK-NEXT: xxlxor v1, v1, v1
+; CHECK-NEXT: mulld r6, r6, r3
+; CHECK-NEXT: xvmaddadp v7, vs5, v2
+; CHECK-NEXT: xxlxor v6, v6, v6
+; CHECK-NEXT: xvmaddadp v8, vs6, v8
+; CHECK-NEXT: xvmaddadp v1, vs4, vs1
+; CHECK-NEXT: xvmuldp v0, vs4, v2
+; CHECK-NEXT: xvmaddadp v1, v2, v2
+; CHECK-NEXT: xvmaddadp v0, v2, v2
+; CHECK-NEXT: lxvdsx v4, r6, r4
+; CHECK-NEXT: xvmaddadp v6, vs5, v6
+; CHECK-NEXT: li r6, 0
+; CHECK-NEXT: xvmuldp v9, vs6, v4
+; CHECK-NEXT: xvmuldp v3, vs5, v4
+; CHECK-NEXT: xvmuldp v11, vs0, v4
+; CHECK-NEXT: vmr v10, v2
+; CHECK-NEXT: xvmuldp v5, v4, v2
+; CHECK-NEXT: vmr v4, v2
+; CHECK-NEXT: xxlor vs18, v8, v8
+; CHECK-NEXT: xxlor vs4, v2, v2
+; CHECK-NEXT: xxlor vs12, v10, v10
+; CHECK-NEXT: xxlor vs13, v11, v11
+; CHECK-NEXT: xxlor v10, vs3, vs3
+; CHECK-NEXT: xxlor vs8, v4, v4
+; CHECK-NEXT: xxlor vs9, v5, v5
+; CHECK-NEXT: xxlor vs10, v0, v0
+; CHECK-NEXT: xxlor vs11, v1, v1
+; CHECK-NEXT: xxmtacc acc2
+; CHECK-NEXT: xxlor vs19, v9, v9
+; CHECK-NEXT: vmr v8, v2
+; CHECK-NEXT: xxlor vs5, v3, v3
+; CHECK-NEXT: xxlor vs6, v6, v6
+; CHECK-NEXT: xxlor vs7, v7, v7
+;
CHECK-NEXT: xxlor vs14, v10, v10 +; CHECK-NEXT: xxlor vs15, v11, v11 +; CHECK-NEXT: xxlor vs16, v8, v8 +; CHECK-NEXT: xxlor vs17, v9, v9 +; CHECK-NEXT: xxmtacc acc1 +; CHECK-NEXT: xxmtacc acc3 +; CHECK-NEXT: xvf64gerpp acc1, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc2, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc3, vsp34, vs0 +; CHECK-NEXT: xxmtacc acc4 +; CHECK-NEXT: xvf64gerpp acc4, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc1, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc2, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc3, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc4, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc1, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc2, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc3, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc4, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc1, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc2, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc3, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc4, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc1, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc2, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc3, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc4, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc1, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc2, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc3, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc4, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc1, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc2, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc3, vsp34, vs0 +; CHECK-NEXT: xvf64gerpp acc4, vsp34, vs0 +; CHECK-NEXT: xxmfacc acc1 +; CHECK-NEXT: xxmfacc acc2 +; CHECK-NEXT: xxmfacc acc3 +; CHECK-NEXT: xxmfacc acc4 +; CHECK-NEXT: stxv vs5, 0(r3) +; CHECK-NEXT: stxv vs13, 32(r3) +; CHECK-NEXT: stxv vs8, 16(0) +; CHECK-NEXT: stxv vs16, 48(0) +; CHECK-NEXT: b .LBB0_1 +; +; TRACKLIVE-LABEL: acc_regalloc: +; TRACKLIVE: # %bb.0: # %bb +; TRACKLIVE-NEXT: lwz r3, 0(r3) +; TRACKLIVE-NEXT: lxv vs0, 0(0) +; TRACKLIVE-NEXT: xxlxor vs2, vs2, vs2 +; TRACKLIVE-NEXT: xxlxor vs3, vs3, vs3 +; TRACKLIVE-NEXT: stfd f14, -144(r1) # 8-byte Folded Spill +; TRACKLIVE-NEXT: stfd f15, -136(r1) # 8-byte Folded Spill +; TRACKLIVE-NEXT: xxlxor v2, v2, v2 +; TRACKLIVE-NEXT: li r6, 1 +; TRACKLIVE-NEXT: li r4, 16 +; TRACKLIVE-NEXT: stfd f16, -128(r1) # 8-byte Folded Spill +; TRACKLIVE-NEXT: stfd f17, -120(r1) # 8-byte Folded Spill +; TRACKLIVE-NEXT: extswsli r3, r3, 3 +; TRACKLIVE-NEXT: stfd f18, -112(r1) # 8-byte Folded Spill +; TRACKLIVE-NEXT: stfd f19, -104(r1) # 8-byte Folded Spill +; TRACKLIVE-NEXT: xvmaddadp vs3, vs0, vs3 +; TRACKLIVE-NEXT: lxvdsx vs1, 0, r3 +; TRACKLIVE-NEXT: xvmaddadp vs2, vs1, vs2 +; TRACKLIVE-NEXT: .p2align 4 +; TRACKLIVE-NEXT: .LBB0_1: # %bb9 +; TRACKLIVE-NEXT: # +; TRACKLIVE-NEXT: addi r6, r6, 2 +; TRACKLIVE-NEXT: lxv vs4, 16(0) +; TRACKLIVE-NEXT: xxlxor v1, v1, v1 +; TRACKLIVE-NEXT: lxv vs6, -16(r5) +; TRACKLIVE-NEXT: lxv vs5, -64(r5) +; TRACKLIVE-NEXT: xxlxor v8, v8, v8 +; TRACKLIVE-NEXT: xxlor v7, vs2, vs2 +; TRACKLIVE-NEXT: xxlxor v6, v6, v6 +; TRACKLIVE-NEXT: mulld r6, r6, r3 +; TRACKLIVE-NEXT: vmr v10, v2 +; TRACKLIVE-NEXT: xxlor vs8, v10, v10 +; TRACKLIVE-NEXT: xvmaddadp v1, vs4, vs1 +; TRACKLIVE-NEXT: xvmuldp v0, vs4, v2 +; TRACKLIVE-NEXT: xvmaddadp v8, vs6, v8 +; TRACKLIVE-NEXT: xvmaddadp v7, vs5, v2 +; TRACKLIVE-NEXT: xvmaddadp v6, vs5, v6 +; TRACKLIVE-NEXT: xxlor vs4, v2, v2 +; TRACKLIVE-NEXT: lxvdsx v4, r6, r4 +; TRACKLIVE-NEXT: li r6, 0 +; TRACKLIVE-NEXT: xvmaddadp v1, v2, v2 +; TRACKLIVE-NEXT: xvmaddadp v0, v2, v2 +; TRACKLIVE-NEXT: xxlor vs18, v8, v8 +; TRACKLIVE-NEXT: vmr v8, v2 +; TRACKLIVE-NEXT: xxlor vs7, v7, v7 +; TRACKLIVE-NEXT: xxlor vs16, v8, v8 +; TRACKLIVE-NEXT: xvmuldp 
v3, vs5, v4 +; TRACKLIVE-NEXT: xvmuldp v5, vs0, v4 +; TRACKLIVE-NEXT: xvmuldp v9, vs6, v4 +; TRACKLIVE-NEXT: xvmuldp v11, v4, v2 +; TRACKLIVE-NEXT: vmr v4, v2 +; TRACKLIVE-NEXT: xxlor vs6, v6, v6 +; TRACKLIVE-NEXT: xxlor vs12, v4, v4 +; TRACKLIVE-NEXT: xxlor v4, vs3, vs3 +; TRACKLIVE-NEXT: xxlor vs10, v0, v0 +; TRACKLIVE-NEXT: xxlor vs11, v1, v1 +; TRACKLIVE-NEXT: xxlor vs14, v4, v4 +; TRACKLIVE-NEXT: xxlor vs5, v3, v3 +; TRACKLIVE-NEXT: xxlor vs9, v11, v11 +; TRACKLIVE-NEXT: xxlor vs13, v5, v5 +; TRACKLIVE-NEXT: xxlor vs15, v5, v5 +; TRACKLIVE-NEXT: xxlor vs19, v9, v9 +; TRACKLIVE-NEXT: xxlor vs17, v9, v9 +; TRACKLIVE-NEXT: xxmtacc acc1 +; TRACKLIVE-NEXT: xxmtacc acc2 +; TRACKLIVE-NEXT: xxmtacc acc3 +; TRACKLIVE-NEXT: xxmtacc acc4 +; TRACKLIVE-NEXT: xvf64gerpp acc1, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc2, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc3, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc4, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc1, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc2, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc3, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc4, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc1, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc2, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc3, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc4, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc1, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc2, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc3, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc4, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc1, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc2, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc3, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc4, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc1, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc2, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc3, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc4, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc1, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc2, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc3, vsp34, vs0 +; TRACKLIVE-NEXT: xvf64gerpp acc4, vsp34, vs0 +; TRACKLIVE-NEXT: xxmfacc acc1 +; TRACKLIVE-NEXT: xxmfacc acc2 +; TRACKLIVE-NEXT: xxmfacc acc3 +; TRACKLIVE-NEXT: xxmfacc acc4 +; TRACKLIVE-NEXT: stxv vs5, 0(r3) +; TRACKLIVE-NEXT: stxv vs13, 32(r3) +; TRACKLIVE-NEXT: stxv vs8, 16(0) +; TRACKLIVE-NEXT: stxv vs16, 48(0) +; TRACKLIVE-NEXT: b .LBB0_1 +bb: + %i = load i32, i32* %arg, align 4 + %i3 = sext i32 %i to i64 + %i4 = shl nsw i64 %i3, 3 + %i5 = bitcast [0 x %0]* %arg1 to i8* + %i6 = getelementptr i8, i8* %i5, i64 undef + %i7 = getelementptr [0 x %1], [0 x %1]* %arg2, i64 0, i64 -8 + %i8 = getelementptr i8, i8* %i6, i64 undef + br label %bb9 + +bb9: ; preds = %bb95, %bb + %i10 = phi i64 [ 1, %bb ], [ 0, %bb95 ] + %i11 = getelementptr %1, %1* null, i64 2 + %i12 = bitcast %1* %i11 to <2 x double>* + %i13 = load <2 x double>, <2 x double>* %i12, align 1 + %i14 = add nuw nsw i64 %i10, 2 + %i15 = getelementptr inbounds %1, %1* %i7, i64 undef + %i16 = bitcast %1* %i15 to <2 x double>* + %i17 = load <2 x double>, <2 x double>* %i16, align 1 + %i18 = load <2 x double>, <2 x double>* null, align 1 + %i19 = getelementptr %1, %1* %i15, i64 6 + %i20 = bitcast %1* %i19 to <2 x double>* + %i21 = load <2 x double>, <2 x double>* %i20, align 1 + %i22 = load i64, i64* undef, align 8 + %i23 = insertelement <2 x i64> poison, i64 %i22, i32 0 + %i24 = bitcast <2 x i64> %i23 to <2 x double> + %i25 = shufflevector <2 x double> %i24, <2 x double> undef, <2 x i32> zeroinitializer + %i26 = mul i64 %i14, %i4 + %i27 = getelementptr i8, i8* null, 
i64 %i26 + %i28 = getelementptr inbounds i8, i8* %i27, i64 0 + %i29 = getelementptr i8, i8* %i28, i64 16 + %i30 = bitcast i8* %i29 to i64* + %i31 = load i64, i64* %i30, align 8 + %i32 = insertelement <2 x i64> poison, i64 %i31, i32 0 + %i33 = bitcast <2 x i64> %i32 to <2 x double> + %i34 = shufflevector <2 x double> %i33, <2 x double> undef, <2 x i32> zeroinitializer + %i35 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> zeroinitializer, <2 x double> %i25, <2 x double> zeroinitializer) + %i36 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %i13, <2 x double> %i25, <2 x double> zeroinitializer) + %i37 = fmul contract <2 x double> %i13, zeroinitializer + %i38 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %i17, <2 x double> zeroinitializer, <2 x double> %i35) + %i39 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, <2 x double> %i36) + %i40 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %i17, <2 x double> zeroinitializer, <2 x double> zeroinitializer) + %i41 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, <2 x double> %i37) + %i42 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %i18, <2 x double> zeroinitializer, <2 x double> zeroinitializer) + %i43 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %i21, <2 x double> zeroinitializer, <2 x double> zeroinitializer) + %i44 = fmul contract <2 x double> %i17, %i34 + %i45 = fmul contract <2 x double> zeroinitializer, %i34 + %i46 = fmul contract <2 x double> %i18, %i34 + %i47 = fmul contract <2 x double> %i21, %i34 + %i48 = bitcast <2 x double> %i44 to <16 x i8> + %i49 = bitcast <2 x double> %i40 to <16 x i8> + %i50 = bitcast <2 x double> %i38 to <16 x i8> + %i51 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> zeroinitializer, <16 x i8> %i48, <16 x i8> %i49, <16 x i8> %i50) + %i52 = bitcast <2 x double> %i45 to <16 x i8> + %i53 = bitcast <2 x double> %i41 to <16 x i8> + %i54 = bitcast <2 x double> %i39 to <16 x i8> + %i55 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> zeroinitializer, <16 x i8> %i52, <16 x i8> %i53, <16 x i8> %i54) + %i56 = bitcast <2 x double> %i46 to <16 x i8> + %i57 = bitcast <2 x double> %i42 to <16 x i8> + %i58 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> zeroinitializer, <16 x i8> %i56, <16 x i8> %i57, <16 x i8> %i56) + %i59 = bitcast <2 x double> %i47 to <16 x i8> + %i60 = bitcast <2 x double> %i43 to <16 x i8> + %i61 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> zeroinitializer, <16 x i8> %i59, <16 x i8> %i60, <16 x i8> %i59) + %i62 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i51, <256 x i1> undef, <16 x i8> undef) + %i63 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i55, <256 x i1> undef, <16 x i8> undef) + %i64 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i58, <256 x i1> undef, <16 x i8> undef) + %i65 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i61, <256 x i1> undef, <16 x i8> undef) + %i66 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i62, <256 x i1> undef, <16 x i8> undef) + %i67 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i63, <256 x i1> undef, <16 x i8> undef) + %i68 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i64, <256 x i1> undef, <16 x i8> undef) + %i69 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i65, <256 x i1> undef, <16 x i8> 
undef) + %i70 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i66, <256 x i1> undef, <16 x i8> undef) + %i71 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i67, <256 x i1> undef, <16 x i8> undef) + %i72 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i68, <256 x i1> undef, <16 x i8> undef) + %i73 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i69, <256 x i1> undef, <16 x i8> undef) + %i74 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i70, <256 x i1> undef, <16 x i8> undef) + %i75 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i71, <256 x i1> undef, <16 x i8> undef) + %i76 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i72, <256 x i1> undef, <16 x i8> undef) + %i77 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i73, <256 x i1> undef, <16 x i8> undef) + %i78 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i74, <256 x i1> undef, <16 x i8> undef) + %i79 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i75, <256 x i1> undef, <16 x i8> undef) + %i80 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i76, <256 x i1> undef, <16 x i8> undef) + %i81 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i77, <256 x i1> undef, <16 x i8> undef) + br label %bb82 + +bb82: ; preds = %bb82, %bb9 + %i83 = phi <512 x i1> [ %i94, %bb82 ], [ %i81, %bb9 ] + %i84 = phi <512 x i1> [ %i93, %bb82 ], [ %i80, %bb9 ] + %i85 = phi <512 x i1> [ %i92, %bb82 ], [ %i79, %bb9 ] + %i86 = phi <512 x i1> [ %i91, %bb82 ], [ %i78, %bb9 ] + %i87 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i86, <256 x i1> undef, <16 x i8> undef) + %i88 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i85, <256 x i1> undef, <16 x i8> undef) + %i89 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i84, <256 x i1> undef, <16 x i8> undef) + %i90 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i83, <256 x i1> undef, <16 x i8> undef) + %i91 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i87, <256 x i1> undef, <16 x i8> undef) + %i92 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i88, <256 x i1> undef, <16 x i8> undef) + %i93 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i89, <256 x i1> undef, <16 x i8> undef) + %i94 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %i90, <256 x i1> undef, <16 x i8> undef) + br i1 undef, label %bb95, label %bb82 + +bb95: ; preds = %bb82 + %i96 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %i91) + %i97 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %i96, 2 + %i98 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %i92) + %i99 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %i98, 3 + %i100 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %i93) + %i101 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %i100, 2 + %i102 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %i94) + %i103 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %i102, 3 + %i104 = getelementptr inbounds i8, i8* %i8, i64 undef + %i105 = bitcast i8* %i104 to <16 x i8>* + store <16 x i8> %i97, <16 x i8>* %i105, align 1 + %i106 = getelementptr i8, i8* %i104, i64 32 + %i107 = bitcast i8* %i106 to <16 x i8>* + store <16 x i8> %i101, <16 x i8>* %i107, align 1 + %i108 = 
getelementptr i8, i8* null, i64 16 + %i109 = bitcast i8* %i108 to <16 x i8>* + store <16 x i8> %i99, <16 x i8>* %i109, align 1 + %i110 = getelementptr i8, i8* null, i64 48 + %i111 = bitcast i8* %i110 to <16 x i8>* + store <16 x i8> %i103, <16 x i8>* %i111, align 1 + br label %bb9 +} + +declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) +declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) +declare <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1>, <256 x i1>, <16 x i8>) +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1>) + -- 2.7.4
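Background note (outside the patch): on Power10, each ACC register overlaps an
aligned group of four consecutive vector-scalar registers (acc0 = vs0-vs3,
acc1 = vs4-vs7, ..., acc7 = vs28-vs31). That is why the generated code above
copies values into vs4-vs19 with xxlor before priming acc1-acc4 with xxmtacc,
and why the register allocator needs the special treatment this test exercises.
The sketch below is a hypothetical, stand-alone illustration, not part of the
test; it uses the simpler llvm.ppc.mma.xvf32gerpp intrinsic rather than the
xvf64gerpp calls above, and shows the smallest IR that forces one accumulator,
and therefore four consecutive VSRs, to be allocated:

  ; One <512 x i1> value corresponds to one ACC register (four VSRs).
  declare <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1>, <16 x i8>, <16 x i8>)

  define void @single_acc(<512 x i1>* %ptr, <16 x i8> %a, <16 x i8> %b) {
  entry:
    ; Load the 512-bit accumulator from memory.
    %acc = load <512 x i1>, <512 x i1>* %ptr, align 64
    ; The rank-2 update accumulates in place, so an aligned group of four
    ; VSRs must stay live for the duration of the update.
    %upd = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %acc, <16 x i8> %a, <16 x i8> %b)
    store <512 x i1> %upd, <512 x i1>* %ptr, align 64
    ret void
  }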