From: David Green Date: Tue, 20 Sep 2022 16:09:14 +0000 (+0100) Subject: [AArch64] Enable LSLFast for modern OoO cpus X-Git-Tag: upstream/17.0.6~33017 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=cb375e8c1f393b53d6516950969a5caac42bf178;p=platform%2Fupstream%2Fllvm.git [AArch64] Enable LSLFast for modern OoO cpus This patch enables the LSLFast feature for Cortex-A76, Cortex-A77, Cortex-A78, Cortex-A78C, Cortex-A710, Cortex-X1, Cortex-X2, Neoverse N1, Neoverse N2, Neoverse V1 and the Neoverse 512TVB pseudo-cpu, in line with the software optimization guides for those CPUs. Differential Revision: https://reviews.llvm.org/D134273 --- diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index a156b8f..5ba5498 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -717,17 +717,20 @@ def TuneA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75", def TuneA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76", "Cortex-A76 ARM processors", [ - FeatureFuseAES]>; + FeatureFuseAES, + FeatureLSLFast]>; def TuneA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77", "Cortex-A77 ARM processors", [ FeatureCmpBccFusion, - FeatureFuseAES]>; + FeatureFuseAES, + FeatureLSLFast]>; def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78", "Cortex-A78 ARM processors", [ FeatureCmpBccFusion, FeatureFuseAES, + FeatureLSLFast, FeaturePostRAScheduler]>; def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily", @@ -735,13 +738,15 @@ def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily", "Cortex-A78C ARM processors", [ FeatureCmpBccFusion, FeatureFuseAES, + FeatureLSLFast, FeaturePostRAScheduler]>; def TuneA710 : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710", "Cortex-A710 ARM processors", [ + FeatureCmpBccFusion, FeatureFuseAES, - FeaturePostRAScheduler, - FeatureCmpBccFusion]>; + FeatureLSLFast, + FeaturePostRAScheduler]>; def TuneR82 : SubtargetFeature<"cortex-r82",
"ARMProcFamily", "CortexR82", @@ -752,13 +757,15 @@ def TuneX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1", "Cortex-X1 ARM processors", [ FeatureCmpBccFusion, FeatureFuseAES, + FeatureLSLFast, FeaturePostRAScheduler]>; def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2", "Cortex-X2 ARM processors", [ + FeatureCmpBccFusion, FeatureFuseAES, - FeaturePostRAScheduler, - FeatureCmpBccFusion]>; + FeatureLSLFast, + FeaturePostRAScheduler]>; def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX", "Fujitsu A64FX processors", [ @@ -901,31 +908,32 @@ def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor", def TuneNeoverseE1 : SubtargetFeature<"neoversee1", "ARMProcFamily", "NeoverseE1", "Neoverse E1 ARM processors", [ - FeaturePostRAScheduler, - FeatureFuseAES - ]>; + FeatureFuseAES, + FeaturePostRAScheduler]>; def TuneNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily", "NeoverseN1", "Neoverse N1 ARM processors", [ - FeaturePostRAScheduler, FeatureFuseAES, - FeatureFuseAdrpAdd - ]>; + FeatureFuseAdrpAdd, + FeatureLSLFast, + FeaturePostRAScheduler]>; def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2", "Neoverse N2 ARM processors", [ - FeaturePostRAScheduler, - FeatureFuseAES - ]>; + FeatureFuseAES, + FeatureLSLFast, + FeaturePostRAScheduler]>; + def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Neoverse512TVB", "Neoverse 512-TVB ARM processors", [ - FeaturePostRAScheduler, - FeatureFuseAES - ]>; + FeatureFuseAES, + FeatureLSLFast, + FeaturePostRAScheduler]>; def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1", "Neoverse V1 ARM processors", [ FeatureFuseAES, + FeatureLSLFast, FeaturePostRAScheduler]>; def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira", diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll index 7c65884..25ea393 100644 --- 
a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+lsl-fast | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK0 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+lsl-fast | FileCheck %s --check-prefixes=CHECK,CHECK3 %struct.a = type [256 x i16] %struct.b = type [256 x i32] @@ -7,20 +8,36 @@ declare void @foo() define i16 @halfword(%struct.a* %ctx, i32 %xor72) nounwind { -; CHECK-LABEL: halfword: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: ubfx x21, x1, #9, #8 -; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: ldrh w20, [x0, x21, lsl #1] -; CHECK-NEXT: bl foo -; CHECK-NEXT: mov w0, w20 -; CHECK-NEXT: strh w20, [x19, x21, lsl #1] -; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload -; CHECK-NEXT: ret +; CHECK0-LABEL: halfword: +; CHECK0: // %bb.0: +; CHECK0-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; CHECK0-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK0-NEXT: ubfx x8, x1, #9, #8 +; CHECK0-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK0-NEXT: lsl x21, x8, #1 +; CHECK0-NEXT: mov x19, x0 +; CHECK0-NEXT: ldrh w20, [x0, x21] +; CHECK0-NEXT: bl foo +; CHECK0-NEXT: mov w0, w20 +; CHECK0-NEXT: strh w20, [x19, x21] +; CHECK0-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK3-LABEL: halfword: +; CHECK3: // %bb.0: +; CHECK3-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK3-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK3-NEXT: ubfx x21, x1, #9, #8 +; CHECK3-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK3-NEXT: mov x19, x0 +; CHECK3-NEXT: ldrh w20, [x0, x21, lsl #1] +; CHECK3-NEXT: bl foo +; CHECK3-NEXT: mov w0, w20 +; CHECK3-NEXT: strh w20, [x19, x21, lsl #1] +; CHECK3-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK3-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK3-NEXT: ret %shr81 = lshr i32 %xor72, 9 %conv82 = zext i32 %shr81 to i64 %idxprom83 = and i64 %conv82, 255 @@ -32,20 +49,36 @@ define i16 @halfword(%struct.a* %ctx, i32 %xor72) nounwind { } define i32 @word(%struct.b* %ctx, i32 %xor72) nounwind { -; CHECK-LABEL: word: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: ubfx x21, x1, #9, #8 -; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: ldr w20, [x0, x21, lsl #2] -; CHECK-NEXT: bl foo -; CHECK-NEXT: mov w0, w20 -; CHECK-NEXT: str w20, [x19, x21, lsl #2] -; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload -; CHECK-NEXT: ret +; CHECK0-LABEL: word: +; CHECK0: // %bb.0: +; CHECK0-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; CHECK0-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK0-NEXT: ubfx x8, x1, #9, #8 +; CHECK0-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK0-NEXT: lsl x21, x8, #2 +; CHECK0-NEXT: mov x19, x0 +; CHECK0-NEXT: ldr w20, [x0, x21] +; CHECK0-NEXT: bl foo +; CHECK0-NEXT: mov w0, w20 +; CHECK0-NEXT: str w20, [x19, x21] +; CHECK0-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK3-LABEL: word: +; CHECK3: // %bb.0: +; CHECK3-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK3-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK3-NEXT: ubfx x21, x1, #9, #8 +; CHECK3-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK3-NEXT: mov x19, x0 +; CHECK3-NEXT: ldr w20, [x0, x21, lsl #2] +; CHECK3-NEXT: bl foo +; CHECK3-NEXT: mov w0, w20 +; CHECK3-NEXT: str w20, [x19, x21, lsl #2] +; CHECK3-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK3-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK3-NEXT: ret %shr81 = lshr i32 %xor72, 9 %conv82 = zext i32 %shr81 to i64 %idxprom83 = and i64 %conv82, 255 @@ -57,20 +90,36 @@ define i32 @word(%struct.b* %ctx, i32 %xor72) nounwind { } define i64 @doubleword(%struct.c* %ctx, i32 %xor72) nounwind { -; CHECK-LABEL: doubleword: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: ubfx x21, x1, #9, #8 -; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: ldr x20, [x0, x21, lsl #3] -; CHECK-NEXT: bl foo -; CHECK-NEXT: mov x0, x20 -; CHECK-NEXT: str x20, [x19, x21, lsl #3] -; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload -; CHECK-NEXT: ret +; CHECK0-LABEL: doubleword: +; CHECK0: // %bb.0: +; CHECK0-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; CHECK0-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK0-NEXT: ubfx x8, x1, #9, #8 +; CHECK0-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK0-NEXT: lsl x21, x8, #3 +; CHECK0-NEXT: mov x19, x0 +; CHECK0-NEXT: ldr x20, [x0, x21] +; CHECK0-NEXT: bl foo +; CHECK0-NEXT: mov x0, x20 +; CHECK0-NEXT: str x20, [x19, x21] +; CHECK0-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK3-LABEL: doubleword: +; CHECK3: // %bb.0: +; CHECK3-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK3-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK3-NEXT: ubfx x21, x1, #9, #8 +; CHECK3-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK3-NEXT: mov x19, x0 +; CHECK3-NEXT: ldr x20, [x0, x21, lsl #3] +; CHECK3-NEXT: bl foo +; CHECK3-NEXT: mov x0, x20 +; CHECK3-NEXT: str x20, [x19, x21, lsl #3] +; CHECK3-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK3-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK3-NEXT: ret %shr81 = lshr i32 %xor72, 9 %conv82 = zext i32 %shr81 to i64 %idxprom83 = and i64 %conv82, 255 @@ -112,3 +161,67 @@ exitbb: endbb: ret i64 %mul2 } + +define i64 @gep3(i64 *%p, i64 %b) { +; CHECK0-LABEL: gep3: +; CHECK0: // %bb.0: +; CHECK0-NEXT: lsl x9, x1, #3 +; CHECK0-NEXT: mov x8, x0 +; CHECK0-NEXT: ldr x0, [x0, x9] +; CHECK0-NEXT: str x1, [x8, x9] +; CHECK0-NEXT: ret +; +; CHECK3-LABEL: gep3: +; CHECK3: // %bb.0: +; CHECK3-NEXT: mov x8, x0 +; CHECK3-NEXT: ldr x0, [x0, x1, lsl #3] +; CHECK3-NEXT: str x1, [x8, x1, lsl #3] +; CHECK3-NEXT: ret + %g = getelementptr inbounds i64, i64* %p, i64 %b + %l = load i64, i64* %g + store i64 %b, i64* %g + ret i64 %l +} + +define i128 @gep4(i128 *%p, i128 %a, i64 %b) { +; CHECK-LABEL: gep4: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, x4, lsl #4 +; CHECK-NEXT: ldp x0, x1, [x8] +; CHECK-NEXT: stp x2, x3, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds i128, i128* %p, i64 %b + %l = load i128, i128* %g + store i128 %a, i128* %g + ret i128 %l +} + +define i64 @addlsl3(i64 %a, i64 %b) { +; CHECK-LABEL: addlsl3: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl x8, x0, #3 +; CHECK-NEXT: add x9, x1, x8 +; CHECK-NEXT: sub x8, x1, x8 +; CHECK-NEXT: eor x0, x9, x8 +; CHECK-NEXT: ret + %x = shl i64 %a, 3 + %y = add i64 %b, %x + %z = sub i64 %b, %x + %r = xor i64 %y, %z + ret i64 %r +} + +define i64 @addlsl4(i64 %a, i64 %b) { +; CHECK-LABEL: addlsl4: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl x8, x0, #4 +; CHECK-NEXT: add x9, x1, x8 +; CHECK-NEXT: sub x8, x1, x8 
+; CHECK-NEXT: eor x0, x9, x8 +; CHECK-NEXT: ret + %x = shl i64 %a, 4 + %y = add i64 %b, %x + %z = sub i64 %b, %x + %r = xor i64 %y, %z + ret i64 %r +}