From 673b4ad64577e3336cb8109869919b21341e0d74 Mon Sep 17 00:00:00 2001
From: KAWASHIMA Takahiro
Date: Mon, 12 Dec 2022 14:25:31 +0900
Subject: [PATCH] [AArch64] Add FP16 instructions to isAssociativeAndCommutative

The `-mcpu=` option in `llvm/test/CodeGen/AArch64/machine-combiner.ll` is
changed to `neoverse-n2` so that FP16 and SVE/SVE2 instructions can be used.
As a result, register allocation and/or instruction scheduling changes
slightly and some existing `CHECK` lines need to be updated.

Differential Revision: https://reviews.llvm.org/D139809
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |   9 +++
 llvm/test/CodeGen/AArch64/machine-combiner.ll | 111 +++++++++++++++++++++++---
 2 files changed, 109 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 684e75d..a22a67a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4951,19 +4951,28 @@ bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
   switch (Inst.getOpcode()) {
   // == Floating-point types ==
   // -- Floating-point instructions --
+  case AArch64::FADDHrr:
   case AArch64::FADDSrr:
   case AArch64::FADDDrr:
+  case AArch64::FMULHrr:
   case AArch64::FMULSrr:
   case AArch64::FMULDrr:
+  case AArch64::FMULX16:
   case AArch64::FMULX32:
   case AArch64::FMULX64:
   // -- Advanced SIMD instructions --
+  case AArch64::FADDv4f16:
+  case AArch64::FADDv8f16:
   case AArch64::FADDv2f32:
   case AArch64::FADDv4f32:
   case AArch64::FADDv2f64:
+  case AArch64::FMULv4f16:
+  case AArch64::FMULv8f16:
   case AArch64::FMULv2f32:
   case AArch64::FMULv4f32:
   case AArch64::FMULv2f64:
+  case AArch64::FMULXv4f16:
+  case AArch64::FMULXv8f16:
   case AArch64::FMULXv2f32:
   case AArch64::FMULXv4f32:
   case AArch64::FMULXv2f64:
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner.ll b/llvm/test/CodeGen/AArch64/machine-combiner.ll
index 87bf7c2..f5fcdda 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner.ll
+++ b/llvm/test/CodeGen/AArch64/machine-combiner.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-STD
-; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math < %s | FileCheck %s --check-prefixes=CHECK,CHECK-UNSAFE
+; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=neoverse-n2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-STD
+; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=neoverse-n2 -enable-unsafe-fp-math < %s | FileCheck %s --check-prefixes=CHECK,CHECK-UNSAFE
 
 ; Incremental updates of the instruction depths should be enough for this test
 ; case.
-; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math \
+; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=neoverse-n2 -enable-unsafe-fp-math \
 ; RUN:   -machine-combiner-inc-threshold=0 -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefixes=CHECK,CHECK-UNSAFE
 
 ; Verify that the first two adds are independent regardless of how the inputs are
@@ -189,8 +189,8 @@ define float @reassociate_muls1(float %x0, float %x1, float %x2, float %x3) {
 ; CHECK-STD-LABEL: reassociate_muls1:
 ; CHECK-STD:       // %bb.0:
 ; CHECK-STD-NEXT:    fdiv s0, s0, s1
-; CHECK-STD-NEXT:    fmul s1, s2, s0
-; CHECK-STD-NEXT:    fmul s0, s3, s1
+; CHECK-STD-NEXT:    fmul s0, s2, s0
+; CHECK-STD-NEXT:    fmul s0, s3, s0
 ; CHECK-STD-NEXT:    ret
 ;
 ; CHECK-UNSAFE-LABEL: reassociate_muls1:
@@ -233,8 +233,8 @@ define double @reassociate_muls_double(double %x0, double %x1, double %x2, doubl
 ; CHECK-STD-LABEL: reassociate_muls_double:
 ; CHECK-STD:       // %bb.0:
 ; CHECK-STD-NEXT:    fdiv d0, d0, d1
-; CHECK-STD-NEXT:    fmul d1, d2, d0
-; CHECK-STD-NEXT:    fmul d0, d3, d1
+; CHECK-STD-NEXT:    fmul d0, d2, d0
+; CHECK-STD-NEXT:    fmul d0, d3, d0
 ; CHECK-STD-NEXT:    ret
 ;
 ; CHECK-UNSAFE-LABEL: reassociate_muls_double:
@@ -249,6 +249,50 @@ define double @reassociate_muls_double(double %x0, double %x1, double %x2, doubl
   ret double %t2
 }
 
+; Verify that scalar half-precision adds are reassociated.
+
+define half @reassociate_adds_half(half %x0, half %x1, half %x2, half %x3) {
+; CHECK-STD-LABEL: reassociate_adds_half:
+; CHECK-STD:       // %bb.0:
+; CHECK-STD-NEXT:    fdiv h0, h0, h1
+; CHECK-STD-NEXT:    fadd h0, h2, h0
+; CHECK-STD-NEXT:    fadd h0, h3, h0
+; CHECK-STD-NEXT:    ret
+;
+; CHECK-UNSAFE-LABEL: reassociate_adds_half:
+; CHECK-UNSAFE:       // %bb.0:
+; CHECK-UNSAFE-NEXT:    fdiv h0, h0, h1
+; CHECK-UNSAFE-NEXT:    fadd h1, h3, h2
+; CHECK-UNSAFE-NEXT:    fadd h0, h1, h0
+; CHECK-UNSAFE-NEXT:    ret
+  %t0 = fdiv half %x0, %x1
+  %t1 = fadd half %x2, %t0
+  %t2 = fadd half %x3, %t1
+  ret half %t2
+}
+
+; Verify that scalar half-precision multiplies are reassociated.
+
+define half @reassociate_muls_half(half %x0, half %x1, half %x2, half %x3) {
+; CHECK-STD-LABEL: reassociate_muls_half:
+; CHECK-STD:       // %bb.0:
+; CHECK-STD-NEXT:    fdiv h0, h0, h1
+; CHECK-STD-NEXT:    fmul h0, h2, h0
+; CHECK-STD-NEXT:    fmul h0, h3, h0
+; CHECK-STD-NEXT:    ret
+;
+; CHECK-UNSAFE-LABEL: reassociate_muls_half:
+; CHECK-UNSAFE:       // %bb.0:
+; CHECK-UNSAFE-NEXT:    fdiv h0, h0, h1
+; CHECK-UNSAFE-NEXT:    fmul h1, h3, h2
+; CHECK-UNSAFE-NEXT:    fmul h0, h1, h0
+; CHECK-UNSAFE-NEXT:    ret
+  %t0 = fdiv half %x0, %x1
+  %t1 = fmul half %x2, %t0
+  %t2 = fmul half %x3, %t1
+  ret half %t2
+}
+
 ; Verify that scalar integer adds are reassociated.
 
 define i32 @reassociate_adds_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
@@ -399,6 +443,51 @@ define <4 x float> @vector_reassociate_adds4(<4 x float> %x0, <4 x float> %x1, <
   %t2 = fadd <4 x float> %x3, %t1
   ret <4 x float> %t2
 }
+
+; Verify that 64-bit vector half-precision adds are reassociated.
+
+define <4 x half> @reassociate_adds_v4f16(<4 x half> %x0, <4 x half> %x1, <4 x half> %x2, <4 x half> %x3) {
+; CHECK-STD-LABEL: reassociate_adds_v4f16:
+; CHECK-STD:       // %bb.0:
+; CHECK-STD-NEXT:    fadd v0.4h, v0.4h, v1.4h
+; CHECK-STD-NEXT:    fadd v0.4h, v2.4h, v0.4h
+; CHECK-STD-NEXT:    fadd v0.4h, v3.4h, v0.4h
+; CHECK-STD-NEXT:    ret
+;
+; CHECK-UNSAFE-LABEL: reassociate_adds_v4f16:
+; CHECK-UNSAFE:       // %bb.0:
+; CHECK-UNSAFE-NEXT:    fadd v0.4h, v0.4h, v1.4h
+; CHECK-UNSAFE-NEXT:    fadd v1.4h, v3.4h, v2.4h
+; CHECK-UNSAFE-NEXT:    fadd v0.4h, v1.4h, v0.4h
+; CHECK-UNSAFE-NEXT:    ret
+  %t0 = fadd <4 x half> %x0, %x1
+  %t1 = fadd <4 x half> %x2, %t0
+  %t2 = fadd <4 x half> %x3, %t1
+  ret <4 x half> %t2
+}
+
+; Verify that 128-bit vector half-precision multiplies are reassociated.
+
+define <8 x half> @reassociate_muls_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) {
+; CHECK-STD-LABEL: reassociate_muls_v8f16:
+; CHECK-STD:       // %bb.0:
+; CHECK-STD-NEXT:    fadd v0.8h, v0.8h, v1.8h
+; CHECK-STD-NEXT:    fmul v0.8h, v2.8h, v0.8h
+; CHECK-STD-NEXT:    fmul v0.8h, v3.8h, v0.8h
+; CHECK-STD-NEXT:    ret
+;
+; CHECK-UNSAFE-LABEL: reassociate_muls_v8f16:
+; CHECK-UNSAFE:       // %bb.0:
+; CHECK-UNSAFE-NEXT:    fadd v0.8h, v0.8h, v1.8h
+; CHECK-UNSAFE-NEXT:    fmul v1.8h, v3.8h, v2.8h
+; CHECK-UNSAFE-NEXT:    fmul v0.8h, v1.8h, v0.8h
+; CHECK-UNSAFE-NEXT:    ret
+  %t0 = fadd <8 x half> %x0, %x1
+  %t1 = fmul <8 x half> %x2, %t0
+  %t2 = fmul <8 x half> %x3, %t1
+  ret <8 x half> %t2
+}
+
 ; Verify that 128-bit vector single-precision multiplies are reassociated.
 
 define <4 x float> @reassociate_muls_v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
@@ -492,9 +581,9 @@ define double @reassociate_adds_from_calls() {
 ; CHECK-UNSAFE-NEXT:    fmov d10, d0
 ; CHECK-UNSAFE-NEXT:    bl bar
 ; CHECK-UNSAFE-NEXT:    fadd d1, d8, d9
-; CHECK-UNSAFE-NEXT:    fadd d0, d10, d0
-; CHECK-UNSAFE-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
 ; CHECK-UNSAFE-NEXT:    ldp d9, d8, [sp, #8] // 16-byte Folded Reload
+; CHECK-UNSAFE-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-UNSAFE-NEXT:    fadd d0, d10, d0
 ; CHECK-UNSAFE-NEXT:    fadd d0, d1, d0
 ; CHECK-UNSAFE-NEXT:    ldr d10, [sp], #32 // 8-byte Folded Reload
 ; CHECK-UNSAFE-NEXT:    ret
@@ -527,9 +616,9 @@ define double @already_reassociated() {
 ; CHECK-NEXT:    fmov d10, d0
 ; CHECK-NEXT:    bl bar
 ; CHECK-NEXT:    fadd d1, d8, d9
-; CHECK-NEXT:    fadd d0, d10, d0
-; CHECK-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #8] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT:    fadd d0, d10, d0
 ; CHECK-NEXT:    fadd d0, d1, d0
 ; CHECK-NEXT:    ldr d10, [sp], #32 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
-- 
2.7.4
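
For context, a note on what the change enables (not part of the patch itself): listing these opcodes in isAssociativeAndCommutative lets the MachineCombiner reassociate chains of half-precision adds and multiplies to shorten the critical path, as it already does for f32/f64. A minimal sketch in C of the kind of source that benefits; the file name, function name, and compiler flags below are illustrative assumptions, not taken from the patch:

  /* sum4_f16.c - hypothetical reproducer; an assumed build command would be:
   *   clang -O2 -ffast-math --target=aarch64-linux-gnu -march=armv8.2-a+fp16 -S sum4_f16.c
   * The source expression forms the serial chain ((x0 + x1) + x2) + x3; with
   * reassociation enabled, the machine combiner can rewrite it as
   * (x0 + x1) + (x2 + x3), reducing the depth of the fadd dependency chain. */
  _Float16 sum4_f16(_Float16 x0, _Float16 x1, _Float16 x2, _Float16 x3) {
    return ((x0 + x1) + x2) + x3;
  }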