From 95c7dd8810b0bc93c0f76a285f1bcc3bd73f6a50 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Thu, 30 Dec 2021 07:57:11 -0800 Subject: [PATCH] Revert "[Hexagon] Don't build two halves of HVX vector in parallel" This reverts commit ba07f300c6d67a2c6dde8eef216b7a77ac4600bb. A build-vector sequence is made of pairs: rotate+insert. When constructing a single vector, this results in a chain of 2*N instructions. The rotate operation is a permute operation, but the insert uses a multiplication resource: insert and rotate can execute in the same cycle, but obviously they cannot operate on the same vector. The original halving idea is still beneficial since it does allow for insert/rotate overlap, and for hiding insert's latency. --- llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp | 67 ++++++++++++++++------ .../CodeGen/Hexagon/autohvx/isel-build-vector.ll | 24 ++++---- 2 files changed, 63 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index a151f3d..569ad8b 100755 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -659,10 +659,10 @@ HexagonTargetLowering::buildHvxVectorReg(ArrayRef Values, // Find most common element to initialize vector with. This is to avoid // unnecessary vinsert/valign for cases where the same value is present // many times. Creates a histogram of the vector's elements to find the - // most common element. + // most common element n. assert(4*Words.size() == Subtarget.getVectorLength()); - SmallVector VecHist(32); - int MaxAt = 0; + int VecHist[32]; + int n = 0; for (unsigned i = 0; i != NumWords; ++i) { VecHist[i] = 0; if (Words[i].isUndef()) @@ -671,29 +671,60 @@ HexagonTargetLowering::buildHvxVectorReg(ArrayRef Values, if (Words[i] == Words[j]) VecHist[i]++; - if (VecHist[i] > VecHist[MaxAt]) - MaxAt = i; + if (VecHist[i] > VecHist[n]) + n = i; } - // If each value is different, don't do splat, just insert them one by one. - bool NoSplat = VecHist[MaxAt] <= 1; - SDValue RotV = NoSplat - ? DAG.getUNDEF(VecTy) - : DAG.getNode(ISD::SPLAT_VECTOR, dl, VecTy, Words[MaxAt]); - int Rn = 0; - for (unsigned i = 0; i != NumWords; ++i) { + SDValue HalfV = getZero(dl, VecTy, DAG); + if (VecHist[n] > 1) { + SDValue SplatV = DAG.getNode(ISD::SPLAT_VECTOR, dl, VecTy, Words[n]); + HalfV = DAG.getNode(HexagonISD::VALIGN, dl, VecTy, + {HalfV, SplatV, DAG.getConstant(HwLen/2, dl, MVT::i32)}); + } + SDValue HalfV0 = HalfV; + SDValue HalfV1 = HalfV; + + // Construct two halves in parallel, then or them together. Rn and Rm count + // number of rotations needed before the next element. One last rotation is + // performed post-loop to position the last element. + int Rn = 0, Rm = 0; + SDValue Sn, Sm; + SDValue N = HalfV0; + SDValue M = HalfV1; + for (unsigned i = 0; i != NumWords/2; ++i) { + // Rotate by element count since last insertion. - if (NoSplat || Words[i] != Words[MaxAt]) { - RotV = DAG.getNode(HexagonISD::VROR, dl, VecTy, - {RotV, DAG.getConstant(Rn, dl, MVT::i32)}); - RotV = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy, {RotV, Words[i]}); + if (Words[i] != Words[n] || VecHist[n] <= 1) { + Sn = DAG.getConstant(Rn, dl, MVT::i32); + HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {N, Sn}); + N = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy, + {HalfV0, Words[i]}); Rn = 0; } + if (Words[i+NumWords/2] != Words[n] || VecHist[n] <= 1) { + Sm = DAG.getConstant(Rm, dl, MVT::i32); + HalfV1 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {M, Sm}); + M = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy, + {HalfV1, Words[i+NumWords/2]}); + Rm = 0; + } Rn += 4; + Rm += 4; } // Perform last rotation. - return DAG.getNode(HexagonISD::VROR, dl, VecTy, - {RotV, DAG.getConstant(Rn, dl, MVT::i32)}); + Sn = DAG.getConstant(Rn+HwLen/2, dl, MVT::i32); + Sm = DAG.getConstant(Rm, dl, MVT::i32); + HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {N, Sn}); + HalfV1 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {M, Sm}); + + SDValue T0 = DAG.getBitcast(tyVector(VecTy, MVT::i32), HalfV0); + SDValue T1 = DAG.getBitcast(tyVector(VecTy, MVT::i32), HalfV1); + + SDValue DstV = DAG.getNode(ISD::OR, dl, ty(T0), {T0, T1}); + + SDValue OutV = + DAG.getBitcast(tyVector(ty(DstV), VecTy.getVectorElementType()), DstV); + return OutV; } SDValue diff --git a/llvm/test/CodeGen/Hexagon/autohvx/isel-build-vector.ll b/llvm/test/CodeGen/Hexagon/autohvx/isel-build-vector.ll index 159001c..e6b8445 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/isel-build-vector.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/isel-build-vector.ll @@ -6,31 +6,35 @@ define <32 x i32> @fred(i32 %a0) #0 { ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { -; CHECK-NEXT: r3:2 = combine(#76,#7) -; CHECK-NEXT: r1 = #12 -; CHECK-NEXT: r4 = #9 +; CHECK-NEXT: r3:2 = combine(#20,#9) +; CHECK-NEXT: v0 = vxor(v0,v0) +; CHECK-NEXT: r1 = #24 +; CHECK-NEXT: r4 = #12 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vror(v0,r1) +; CHECK-NEXT: v1 = vror(v0,r1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0.w = vinsert(r2) -; CHECK-NEXT: r2 = #20 +; CHECK-NEXT: v1.w = vinsert(r2) +; CHECK-NEXT: r4 = #7 +; CHECK-NEXT: r2 = #116 +; CHECK-NEXT: v0 = vror(v0,r4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vror(v0,r3) +; CHECK-NEXT: v0.w = vinsert(r4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0.w = vinsert(r4) +; CHECK-NEXT: v1 = vror(v1,r3) ; CHECK-NEXT: } ; CHECK-NEXT: { +; CHECK-NEXT: v1.w = vinsert(r0) ; CHECK-NEXT: v0 = vror(v0,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0.w = vinsert(r0) +; CHECK-NEXT: v1 = vror(v1,r3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vror(v0,r2) +; CHECK-NEXT: v0 = vor(v0,v1) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: } %v0 = insertelement <32 x i32> undef, i32 undef, i32 0 -- 2.7.4