From 8d522d811a6d8ec72b10dbd76ba670678bc4251b Mon Sep 17 00:00:00 2001 From: Evandro Menezes Date: Tue, 15 May 2018 20:41:12 +0000 Subject: [PATCH] [AArch64] Improve single vector lane unscaled stores When storing the 0th lane of a vector, use a simpler and usually more efficient scalar store instead. In this case, also use the unscaled offset. Differential revision: https://reviews.llvm.org/D46762 llvm-svn: 332394 --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 16 +++ llvm/test/CodeGen/AArch64/arm64-st1.ll | 127 ++++++++++++--------- .../test/CodeGen/AArch64/fp16-vector-load-store.ll | 4 +- 3 files changed, 92 insertions(+), 55 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 5f78a32..e815137 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -2477,6 +2477,22 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; +// Match stores from lane 0 to the appropriate subreg's store. +multiclass VecStoreULane0Pat { + defm : VecStoreLane0Pat; +} + +let AddedComplexity = 19 in { + defm : VecStoreULane0Pat; + defm : VecStoreULane0Pat; + defm : VecStoreULane0Pat; + defm : VecStoreULane0Pat; + defm : VecStoreULane0Pat; + defm : VecStoreULane0Pat; +} + //--- // STR mnemonics fall back to STUR for negative or unaligned offsets. 
def : InstAlias<"str $Rt, [$Rn, $offset]", diff --git a/llvm/test/CodeGen/AArch64/arm64-st1.ll b/llvm/test/CodeGen/AArch64/arm64-st1.ll index 55ac018..af234a9 100644 --- a/llvm/test/CodeGen/AArch64/arm64-st1.ll +++ b/llvm/test/CodeGen/AArch64/arm64-st1.ll @@ -4,23 +4,25 @@ define void @st1lane_16b(<16 x i8> %A, i8* %D) { ; CHECK-LABEL: st1lane_16b -; CHECK: st1.b +; CHECK: st1.b { v0 }[1], [x{{[0-9]+}}] + %ptr = getelementptr i8, i8* %D, i64 1 %tmp = extractelement <16 x i8> %A, i32 1 - store i8 %tmp, i8* %D + store i8 %tmp, i8* %ptr ret void } define void @st1lane0_16b(<16 x i8> %A, i8* %D) { ; CHECK-LABEL: st1lane0_16b -; CHECK: st1.b +; CHECK: st1.b { v0 }[0], [x{{[0-9]+}}] + %ptr = getelementptr i8, i8* %D, i64 1 %tmp = extractelement <16 x i8> %A, i32 0 - store i8 %tmp, i8* %D + store i8 %tmp, i8* %ptr ret void } define void @st1lane0u_16b(<16 x i8> %A, i8* %D) { ; CHECK-LABEL: st1lane0u_16b -; CHECK: st1.b +; CHECK: st1.b { v0 }[0], [x{{[0-9]+}}] %ptr = getelementptr i8, i8* %D, i64 -1 %tmp = extractelement <16 x i8> %A, i32 0 store i8 %tmp, i8* %ptr @@ -49,23 +51,25 @@ define void @st1lane0_ro_16b(<16 x i8> %A, i8* %D, i64 %offset) { define void @st1lane_8h(<8 x i16> %A, i16* %D) { ; CHECK-LABEL: st1lane_8h -; CHECK: st1.h +; CHECK: st1.h { v0 }[1], [x{{[0-9]+}}] + %ptr = getelementptr i16, i16* %D, i64 1 %tmp = extractelement <8 x i16> %A, i32 1 - store i16 %tmp, i16* %D + store i16 %tmp, i16* %ptr ret void } define void @st1lane0_8h(<8 x i16> %A, i16* %D) { ; CHECK-LABEL: st1lane0_8h -; CHECK: str +; CHECK: str h0, [x0, #2] + %ptr = getelementptr i16, i16* %D, i64 1 %tmp = extractelement <8 x i16> %A, i32 0 - store i16 %tmp, i16* %D + store i16 %tmp, i16* %ptr ret void } define void @st1lane0u_8h(<8 x i16> %A, i16* %D) { ; CHECK-LABEL: st1lane0u_8h -; CHECK: st1.h +; CHECK: stur h0, [x0, #-2] %ptr = getelementptr i16, i16* %D, i64 -1 %tmp = extractelement <8 x i16> %A, i32 0 store i16 %tmp, i16* %ptr @@ -93,23 +97,25 @@ define void @st1lane0_ro_8h(<8 x 
i16> %A, i16* %D, i64 %offset) { define void @st1lane_4s(<4 x i32> %A, i32* %D) { ; CHECK-LABEL: st1lane_4s -; CHECK: st1.s +; CHECK: st1.s { v0 }[1], [x{{[0-9]+}}] + %ptr = getelementptr i32, i32* %D, i64 1 %tmp = extractelement <4 x i32> %A, i32 1 - store i32 %tmp, i32* %D + store i32 %tmp, i32* %ptr ret void } define void @st1lane0_4s(<4 x i32> %A, i32* %D) { ; CHECK-LABEL: st1lane0_4s -; CHECK: str +; CHECK: str s0, [x0, #4] + %ptr = getelementptr i32, i32* %D, i64 1 %tmp = extractelement <4 x i32> %A, i32 0 - store i32 %tmp, i32* %D + store i32 %tmp, i32* %ptr ret void } define void @st1lane0u_4s(<4 x i32> %A, i32* %D) { ; CHECK-LABEL: st1lane0u_4s -; CHECK: st1.s +; CHECK: stur s0, [x0, #-4] %ptr = getelementptr i32, i32* %D, i64 -1 %tmp = extractelement <4 x i32> %A, i32 0 store i32 %tmp, i32* %ptr @@ -137,23 +143,25 @@ define void @st1lane0_ro_4s(<4 x i32> %A, i32* %D, i64 %offset) { define void @st1lane_4s_float(<4 x float> %A, float* %D) { ; CHECK-LABEL: st1lane_4s_float -; CHECK: st1.s +; CHECK: st1.s { v0 }[1], [x{{[0-9]+}}] + %ptr = getelementptr float, float* %D, i64 1 %tmp = extractelement <4 x float> %A, i32 1 - store float %tmp, float* %D + store float %tmp, float* %ptr ret void } define void @st1lane0_4s_float(<4 x float> %A, float* %D) { ; CHECK-LABEL: st1lane0_4s_float -; CHECK: str +; CHECK: str s0, [x0, #4] + %ptr = getelementptr float, float* %D, i64 1 %tmp = extractelement <4 x float> %A, i32 0 - store float %tmp, float* %D + store float %tmp, float* %ptr ret void } define void @st1lane0u_4s_float(<4 x float> %A, float* %D) { ; CHECK-LABEL: st1lane0u_4s_float -; CHECK: st1.s +; CHECK: stur s0, [x0, #-4] %ptr = getelementptr float, float* %D, i64 -1 %tmp = extractelement <4 x float> %A, i32 0 store float %tmp, float* %ptr @@ -181,23 +189,25 @@ define void @st1lane0_ro_4s_float(<4 x float> %A, float* %D, i64 %offset) { define void @st1lane_2d(<2 x i64> %A, i64* %D) { ; CHECK-LABEL: st1lane_2d -; CHECK: st1.d +; CHECK: st1.d { v0 }[1], 
[x{{[0-9]+}}] + %ptr = getelementptr i64, i64* %D, i64 1 %tmp = extractelement <2 x i64> %A, i32 1 - store i64 %tmp, i64* %D + store i64 %tmp, i64* %ptr ret void } define void @st1lane0_2d(<2 x i64> %A, i64* %D) { ; CHECK-LABEL: st1lane0_2d -; CHECK: str +; CHECK: str d0, [x0, #8] + %ptr = getelementptr i64, i64* %D, i64 1 %tmp = extractelement <2 x i64> %A, i32 0 - store i64 %tmp, i64* %D + store i64 %tmp, i64* %ptr ret void } define void @st1lane0u_2d(<2 x i64> %A, i64* %D) { ; CHECK-LABEL: st1lane0u_2d -; CHECK: st1.d +; CHECK: stur d0, [x0, #-8] %ptr = getelementptr i64, i64* %D, i64 -1 %tmp = extractelement <2 x i64> %A, i32 0 store i64 %tmp, i64* %ptr @@ -225,23 +235,25 @@ define void @st1lane0_ro_2d(<2 x i64> %A, i64* %D, i64 %offset) { define void @st1lane_2d_double(<2 x double> %A, double* %D) { ; CHECK-LABEL: st1lane_2d_double -; CHECK: st1.d +; CHECK: st1.d { v0 }[1], [x{{[0-9]+}}] + %ptr = getelementptr double, double* %D, i64 1 %tmp = extractelement <2 x double> %A, i32 1 - store double %tmp, double* %D + store double %tmp, double* %ptr ret void } define void @st1lane0_2d_double(<2 x double> %A, double* %D) { ; CHECK-LABEL: st1lane0_2d_double -; CHECK: str +; CHECK: str d0, [x0, #8] + %ptr = getelementptr double, double* %D, i64 1 %tmp = extractelement <2 x double> %A, i32 0 - store double %tmp, double* %D + store double %tmp, double* %ptr ret void } define void @st1lane0u_2d_double(<2 x double> %A, double* %D) { ; CHECK-LABEL: st1lane0u_2d_double -; CHECK: st1.d +; CHECK: stur d0, [x0, #-8] %ptr = getelementptr double, double* %D, i64 -1 %tmp = extractelement <2 x double> %A, i32 0 store double %tmp, double* %ptr @@ -269,9 +281,10 @@ define void @st1lane0_ro_2d_double(<2 x double> %A, double* %D, i64 %offset) { define void @st1lane_8b(<8 x i8> %A, i8* %D) { ; CHECK-LABEL: st1lane_8b -; CHECK: st1.b +; CHECK: st1.b { v0 }[1], [x{{[0-9]+}}] + %ptr = getelementptr i8, i8* %D, i64 1 %tmp = extractelement <8 x i8> %A, i32 1 - store i8 %tmp, i8* %D + store 
i8 %tmp, i8* %ptr ret void } @@ -297,23 +310,25 @@ define void @st1lane0_ro_8b(<8 x i8> %A, i8* %D, i64 %offset) { define void @st1lane_4h(<4 x i16> %A, i16* %D) { ; CHECK-LABEL: st1lane_4h -; CHECK: st1.h +; CHECK: st1.h { v0 }[1], [x{{[0-9]+}}] + %ptr = getelementptr i16, i16* %D, i64 1 %tmp = extractelement <4 x i16> %A, i32 1 - store i16 %tmp, i16* %D + store i16 %tmp, i16* %ptr ret void } define void @st1lane0_4h(<4 x i16> %A, i16* %D) { ; CHECK-LABEL: st1lane0_4h -; CHECK: str +; CHECK: str h0, [x0, #2] + %ptr = getelementptr i16, i16* %D, i64 1 %tmp = extractelement <4 x i16> %A, i32 0 - store i16 %tmp, i16* %D + store i16 %tmp, i16* %ptr ret void } define void @st1lane0u_4h(<4 x i16> %A, i16* %D) { ; CHECK-LABEL: st1lane0u_4h -; CHECK: st1.h +; CHECK: stur h0, [x0, #-2] %ptr = getelementptr i16, i16* %D, i64 -1 %tmp = extractelement <4 x i16> %A, i32 0 store i16 %tmp, i16* %ptr @@ -341,23 +356,25 @@ define void @st1lane0_ro_4h(<4 x i16> %A, i16* %D, i64 %offset) { define void @st1lane_2s(<2 x i32> %A, i32* %D) { ; CHECK-LABEL: st1lane_2s -; CHECK: st1.s +; CHECK: st1.s { v0 }[1], [x{{[0-9]+}}] + %ptr = getelementptr i32, i32* %D, i64 1 %tmp = extractelement <2 x i32> %A, i32 1 - store i32 %tmp, i32* %D + store i32 %tmp, i32* %ptr ret void } define void @st1lane0_2s(<2 x i32> %A, i32* %D) { ; CHECK-LABEL: st1lane0_2s -; CHECK: str +; CHECK: str s0, [x0, #4] + %ptr = getelementptr i32, i32* %D, i64 1 %tmp = extractelement <2 x i32> %A, i32 0 - store i32 %tmp, i32* %D + store i32 %tmp, i32* %ptr ret void } define void @st1lane0u_2s(<2 x i32> %A, i32* %D) { ; CHECK-LABEL: st1lane0u_2s -; CHECK: st1.s +; CHECK: stur s0, [x0, #-4] %ptr = getelementptr i32, i32* %D, i64 -1 %tmp = extractelement <2 x i32> %A, i32 0 store i32 %tmp, i32* %ptr @@ -385,23 +402,25 @@ define void @st1lane0_ro_2s(<2 x i32> %A, i32* %D, i64 %offset) { define void @st1lane_2s_float(<2 x float> %A, float* %D) { ; CHECK-LABEL: st1lane_2s_float -; CHECK: st1.s +; CHECK: st1.s { v0 }[1], 
[x{{[0-9]+}}] + %ptr = getelementptr float, float* %D, i64 1 %tmp = extractelement <2 x float> %A, i32 1 - store float %tmp, float* %D + store float %tmp, float* %ptr ret void } define void @st1lane0_2s_float(<2 x float> %A, float* %D) { ; CHECK-LABEL: st1lane0_2s_float -; CHECK: str +; CHECK: str s0, [x0, #4] + %ptr = getelementptr float, float* %D, i64 1 %tmp = extractelement <2 x float> %A, i32 0 - store float %tmp, float* %D + store float %tmp, float* %ptr ret void } define void @st1lane0u_2s_float(<2 x float> %A, float* %D) { ; CHECK-LABEL: st1lane0u_2s_float -; CHECK: st1.s +; CHECK: stur s0, [x0, #-4] %ptr = getelementptr float, float* %D, i64 -1 %tmp = extractelement <2 x float> %A, i32 0 store float %tmp, float* %ptr @@ -429,15 +448,16 @@ define void @st1lane0_ro_2s_float(<2 x float> %A, float* %D, i64 %offset) { define void @st1lane0_1d(<1 x i64> %A, i64* %D) { ; CHECK-LABEL: st1lane0_1d -; CHECK: str +; CHECK: str d0, [x0, #8] + %ptr = getelementptr i64, i64* %D, i64 1 %tmp = extractelement <1 x i64> %A, i32 0 - store i64 %tmp, i64* %D + store i64 %tmp, i64* %ptr ret void } define void @st1lane0u_1d(<1 x i64> %A, i64* %D) { ; CHECK-LABEL: st1lane0u_1d -; CHECK: st1.d +; CHECK: stur d0, [x0, #-8] %ptr = getelementptr i64, i64* %D, i64 -1 %tmp = extractelement <1 x i64> %A, i32 0 store i64 %tmp, i64* %ptr @@ -455,15 +475,16 @@ define void @st1lane0_ro_1d(<1 x i64> %A, i64* %D, i64 %offset) { define void @st1lane0_1d_double(<1 x double> %A, double* %D) { ; CHECK-LABEL: st1lane0_1d_double -; CHECK: str +; CHECK: str d0, [x0, #8] + %ptr = getelementptr double, double* %D, i64 1 %tmp = extractelement <1 x double> %A, i32 0 - store double %tmp, double* %D + store double %tmp, double* %ptr ret void } define void @st1lane0u_1d_double(<1 x double> %A, double* %D) { ; CHECK-LABEL: st1lane0u_1d_double -; CHECK: stur +; CHECK: stur d0, [x0, #-8] %ptr = getelementptr double, double* %D, i64 -1 %tmp = extractelement <1 x double> %A, i32 0 store double %tmp, double* 
%ptr diff --git a/llvm/test/CodeGen/AArch64/fp16-vector-load-store.ll b/llvm/test/CodeGen/AArch64/fp16-vector-load-store.ll index 0eb9dd1..1d1794a 100644 --- a/llvm/test/CodeGen/AArch64/fp16-vector-load-store.ll +++ b/llvm/test/CodeGen/AArch64/fp16-vector-load-store.ll @@ -99,7 +99,7 @@ entry: define void @storeu_lane0_64(half* nocapture %a, <4 x half> %b) #1 { ; CHECK-LABEL: storeu_lane0_64: -; CHECK: st1 { v0.h }[0], [x{{[0-9]+}}] +; CHECK: stur h0, [x{{[0-9]+}}, #-2] entry: %0 = getelementptr half, half* %a, i64 -1 %1 = extractelement <4 x half> %b, i32 0 @@ -148,7 +148,7 @@ entry: define void @storeu_lane0_128(half* nocapture %a, <8 x half> %b) #1 { ; CHECK-LABEL: storeu_lane0_128: -; CHECK: st1 { v0.h }[0], [x{{[0-9]+}}] +; CHECK: stur h0, [x{{[0-9]+}}, #-2] entry: %0 = getelementptr half, half* %a, i64 -1 %1 = extractelement <8 x half> %b, i32 0 -- 2.7.4