From 7ab78f34cd3d90fda2c96ecdcca758617477c9ba Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker@arm.com>
Date: Wed, 23 Feb 2022 18:46:21 +0000
Subject: [PATCH] [SVE] Refactor complex immediate pattern used by CPY/DUP.

SelectSVE8BitLslImm didn't account for constant values that have a
larger bit width than the result vector's element type. This only
seems to affect a single corner case when lowering fixed length
vectors, but the code itself is also not consistent with how other
related complex patterns are implemented, so I've taken the
opportunity to refactor the code.

Differential Revision: https://reviews.llvm.org/D120440
---
 llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp    | 75 +++++++++++++---------
 llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td     |  8 +--
 llvm/lib/Target/AArch64/SVEInstrFormats.td         | 71 ++++++++++----------
 .../AArch64/sve-fixed-length-float-compares.ll     | 21 ++----
 4 files changed, 90 insertions(+), 85 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index c8a2624..8fcf4dd 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -204,6 +204,11 @@ public:
     return SelectSVEAddSubImm(N, VT, Imm, Shift);
   }
 
+  template <MVT::SimpleValueType VT>
+  bool SelectSVECpyDupImm(SDValue N, SDValue &Imm, SDValue &Shift) {
+    return SelectSVECpyDupImm(N, VT, Imm, Shift);
+  }
+
   template <MVT::SimpleValueType VT, bool Invert = false>
   bool SelectSVELogicalImm(SDValue N, SDValue &Imm) {
     return SelectSVELogicalImm(N, VT, Imm, Invert);
@@ -357,10 +362,8 @@ private:
 
   bool SelectCMP_SWAP(SDNode *N);
 
-  bool SelectSVE8BitLslImm(SDValue N, SDValue &Imm, SDValue &Shift);
-
   bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift);
-
+  bool SelectSVECpyDupImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift);
   bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm, bool Invert);
 
   bool SelectSVESignedArithImm(SDValue N, SDValue &Imm);
@@ -3129,32 +3132,6 @@ bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
   return true;
 }
 
-bool AArch64DAGToDAGISel::SelectSVE8BitLslImm(SDValue N, SDValue &Base,
-                                              SDValue &Offset) {
-  auto C = dyn_cast<ConstantSDNode>(N);
-  if (!C)
-    return false;
-
-  auto Ty = N->getValueType(0);
-
-  int64_t Imm = C->getSExtValue();
-  SDLoc DL(N);
-
-  if ((Imm >= -128) && (Imm <= 127)) {
-    Base = CurDAG->getTargetConstant(Imm, DL, Ty);
-    Offset = CurDAG->getTargetConstant(0, DL, Ty);
-    return true;
-  }
-
-  if (((Imm % 256) == 0) && (Imm >= -32768) && (Imm <= 32512)) {
-    Base = CurDAG->getTargetConstant(Imm / 256, DL, Ty);
-    Offset = CurDAG->getTargetConstant(8, DL, Ty);
-    return true;
-  }
-
-  return false;
-}
-
 bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift) {
   if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
     const int64_t ImmVal = CNode->getSExtValue();
@@ -3200,6 +3177,46 @@ bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SD
   return false;
 }
 
+bool AArch64DAGToDAGISel::SelectSVECpyDupImm(SDValue N, MVT VT, SDValue &Imm,
+                                             SDValue &Shift) {
+  if (!isa<ConstantSDNode>(N))
+    return false;
+
+  SDLoc DL(N);
+  int64_t Val = cast<ConstantSDNode>(N)
+                    ->getAPIntValue()
+                    .truncOrSelf(VT.getFixedSizeInBits())
+                    .getSExtValue();
+
+  switch (VT.SimpleTy) {
+  case MVT::i8:
+    // All immediates are supported.
+    Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
+    Imm = CurDAG->getTargetConstant(Val & 0xFF, DL, MVT::i32);
+    return true;
+  case MVT::i16:
+  case MVT::i32:
+  case MVT::i64:
+    // Support 8bit signed immediates.
+    if (Val >= -128 && Val <= 127) {
+      Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
+      Imm = CurDAG->getTargetConstant(Val & 0xFF, DL, MVT::i32);
+      return true;
+    }
+    // Support 16bit signed immediates that are a multiple of 256.
+    if (Val >= -32768 && Val <= 32512 && Val % 256 == 0) {
+      Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
+      Imm = CurDAG->getTargetConstant((Val >> 8) & 0xFF, DL, MVT::i32);
+      return true;
+    }
+    break;
+  default:
+    break;
+  }
+
+  return false;
+}
+
 bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) {
   if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
     int64_t ImmVal = CNode->getSExtValue();
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 0bd75e2..6f3883f 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -653,13 +653,13 @@ let Predicates = [HasSVEorStreamingSVE] in {
   def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>;
 
   // Duplicate Int immediate into all vector elements
-  def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))),
+  def : Pat<(nxv16i8 (AArch64dup (i32 (SVECpyDupImm8Pat i32:$a, i32:$b)))),
             (DUP_ZI_B $a, $b)>;
-  def : Pat<(nxv8i16 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))),
+  def : Pat<(nxv8i16 (AArch64dup (i32 (SVECpyDupImm16Pat i32:$a, i32:$b)))),
             (DUP_ZI_H $a, $b)>;
-  def : Pat<(nxv4i32 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))),
+  def : Pat<(nxv4i32 (AArch64dup (i32 (SVECpyDupImm32Pat i32:$a, i32:$b)))),
             (DUP_ZI_S $a, $b)>;
-  def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm64 i32:$a, i32:$b)))),
+  def : Pat<(nxv2i64 (AArch64dup (i64 (SVECpyDupImm64Pat i32:$a, i32:$b)))),
             (DUP_ZI_D $a, $b)>;
 
   // Duplicate immediate FP into all vector elements.
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 694b0fe50..fc130c5 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -199,6 +199,11 @@ def SVEAddSubImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i16>", [
 def SVEAddSubImm32Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i32>", []>;
 def SVEAddSubImm64Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i64>", []>;
 
+def SVECpyDupImm8Pat  : ComplexPattern<i32, 2, "SelectSVECpyDupImm<MVT::i8>", []>;
+def SVECpyDupImm16Pat : ComplexPattern<i32, 2, "SelectSVECpyDupImm<MVT::i16>", []>;
+def SVECpyDupImm32Pat : ComplexPattern<i32, 2, "SelectSVECpyDupImm<MVT::i32>", []>;
+def SVECpyDupImm64Pat : ComplexPattern<i64, 2, "SelectSVECpyDupImm<MVT::i64>", []>;
+
 def SVELogicalImm8Pat  : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i8>", []>;
 def SVELogicalImm16Pat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i16>", []>;
 def SVELogicalImm32Pat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i32>", []>;
@@ -209,14 +214,6 @@ def SVELogicalImm16NotPat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i16
 def SVELogicalImm32NotPat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i32, true>", []>;
 def SVELogicalImm64NotPat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i64, true>", []>;
 
-def SVE8BitLslImm32 : ComplexPattern<i32, 2, "SelectSVE8BitLslImm", []>;
-def SVE8BitLslImm64 : ComplexPattern<i64, 2, "SelectSVE8BitLslImm", []>;
-class SVE8BitLslImm<ValueType ty> {
-  ComplexPattern Pat = !cond(
-    !eq(ty, i32): SVE8BitLslImm32,
-    !eq(ty, i64): SVE8BitLslImm64);
-}
-
 def SVEArithUImm8Pat  : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i8>", []>;
 def SVEArithUImm16Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i16>", []>;
 def SVEArithUImm32Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i32>", []>;
@@ -4623,29 +4620,28 @@ class sve_int_dup_imm_pred<bits<2> sz8_64, bit m, string asm,
 }
 
 multiclass sve_int_dup_imm_pred_merge_inst<
-    bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty,
-    ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> {
+    bits<2> sz8_64, string asm, ZPRRegOp zprty, imm8_opt_lsl cpyimm,
+    ValueType intty, ValueType predty, ValueType scalarty, ComplexPattern cpx> {
   let Constraints = "$Zd = $_Zd" in
   def NAME : sve_int_dup_imm_pred<sz8_64, 1, asm, zprty, "/m",
                                   (ins zprty:$_Zd, PPRAny:$Pg, cpyimm:$imm)>;
   def : InstAlias<"mov $Zd, $Pg/m, $imm",
                   (!cast<Instruction>(NAME) zprty:$Zd, PPRAny:$Pg, cpyimm:$imm), 1>;
-  def : Pat<(intty
-              (vselect predty:$Pg,
-                (intty (AArch64dup (scalarty (SVE8BitLslImm<scalarty>.Pat i32:$imm, i32:$shift)))),
-                intty:$Zd)),
-            (!cast<Instruction>(NAME) zprty:$Zd, $Pg, i32:$imm, i32:$shift)>;
+  def : Pat<(vselect predty:$Pg,
+              (intty (AArch64dup (scalarty (cpx i32:$imm, i32:$shift)))),
+              ZPR:$Zd),
+            (!cast<Instruction>(NAME) $Zd, $Pg, $imm, $shift)>;
 }
 
 multiclass sve_int_dup_imm_pred_merge<string asm> {
-  defm _B : sve_int_dup_imm_pred_merge_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1,
-                                            i32, cpy_imm8_opt_lsl_i8>;
-  defm _H : sve_int_dup_imm_pred_merge_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1,
-                                            i32, cpy_imm8_opt_lsl_i16>;
-  defm _S : sve_int_dup_imm_pred_merge_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1,
-                                            i32, cpy_imm8_opt_lsl_i32>;
-  defm _D : sve_int_dup_imm_pred_merge_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1,
-                                            i64, cpy_imm8_opt_lsl_i64>;
+  defm _B : sve_int_dup_imm_pred_merge_inst<0b00, asm, ZPR8, cpy_imm8_opt_lsl_i8,
+                                            nxv16i8, nxv16i1, i32, SVECpyDupImm8Pat>;
+  defm _H : sve_int_dup_imm_pred_merge_inst<0b01, asm, ZPR16, cpy_imm8_opt_lsl_i16,
+                                            nxv8i16, nxv8i1, i32, SVECpyDupImm16Pat>;
+  defm _S : sve_int_dup_imm_pred_merge_inst<0b10, asm, ZPR32, cpy_imm8_opt_lsl_i32,
+                                            nxv4i32, nxv4i1, i32, SVECpyDupImm32Pat>;
+  defm _D : sve_int_dup_imm_pred_merge_inst<0b11, asm, ZPR64, cpy_imm8_opt_lsl_i64,
+                                            nxv2i64, nxv2i1, i64, SVECpyDupImm64Pat>;
 
   def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
                   (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, 0, 0), 0>;
@@ -4656,8 +4652,8 @@ multiclass sve_int_dup_imm_pred_merge<string asm> {
 }
 
 multiclass sve_int_dup_imm_pred_zero_inst<
-    bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty,
-    ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> {
+    bits<2> sz8_64, string asm, ZPRRegOp zprty, imm8_opt_lsl cpyimm,
+    ValueType intty, ValueType predty, ValueType scalarty, ComplexPattern cpx> {
   def NAME : sve_int_dup_imm_pred<sz8_64, 0, asm, zprty, "/z",
                                  (ins PPRAny:$Pg, cpyimm:$imm)>;
   def : InstAlias<"mov $Zd, $Pg/z, $imm",
@@ -4668,22 +4664,21 @@ multiclass sve_int_dup_imm_pred_zero_inst<
             (!cast<Instruction>(NAME) PPRAny:$Ps1, -1, 0)>;
   def : Pat<(intty (anyext (predty PPRAny:$Ps1))),
             (!cast<Instruction>(NAME) PPRAny:$Ps1, 1, 0)>;
-  def : Pat<(intty
-              (vselect predty:$Pg,
-                (intty (AArch64dup (scalarty (SVE8BitLslImm<scalarty>.Pat i32:$imm, i32:$shift)))),
-                (intty (AArch64dup (scalarty 0))))),
-            (!cast<Instruction>(NAME) $Pg, i32:$imm, i32:$shift)>;
+  def : Pat<(vselect predty:$Pg,
+              (intty (AArch64dup (scalarty (cpx i32:$imm, i32:$shift)))),
+              (intty (AArch64dup (scalarty 0)))),
+            (!cast<Instruction>(NAME) $Pg, $imm, $shift)>;
 }
 
 multiclass sve_int_dup_imm_pred_zero<string asm> {
-  defm _B : sve_int_dup_imm_pred_zero_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1,
-                                           i32, cpy_imm8_opt_lsl_i8>;
-  defm _H : sve_int_dup_imm_pred_zero_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1,
-                                           i32, cpy_imm8_opt_lsl_i16>;
-  defm _S : sve_int_dup_imm_pred_zero_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1,
-                                           i32, cpy_imm8_opt_lsl_i32>;
-  defm _D : sve_int_dup_imm_pred_zero_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1,
-                                           i64, cpy_imm8_opt_lsl_i64>;
+  defm _B : sve_int_dup_imm_pred_zero_inst<0b00, asm, ZPR8, cpy_imm8_opt_lsl_i8,
+                                           nxv16i8, nxv16i1, i32, SVECpyDupImm8Pat>;
+  defm _H : sve_int_dup_imm_pred_zero_inst<0b01, asm, ZPR16, cpy_imm8_opt_lsl_i16,
+                                           nxv8i16, nxv8i1, i32, SVECpyDupImm16Pat>;
+  defm _S : sve_int_dup_imm_pred_zero_inst<0b10, asm, ZPR32, cpy_imm8_opt_lsl_i32,
+                                           nxv4i32, nxv4i1, i32, SVECpyDupImm32Pat>;
+  defm _D : sve_int_dup_imm_pred_zero_inst<0b11, asm, ZPR64, cpy_imm8_opt_lsl_i64,
+                                           nxv2i64, nxv2i1, i64, SVECpyDupImm64Pat>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll
index 3fe8d37..2ab4535 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll
@@ -365,11 +365,10 @@ define void @fcmp_ueq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 ; CHECK-LABEL: fcmp_ueq_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    mov w8, #65535
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    fcmne p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    mov z1.h, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
@@ -412,11 +411,10 @@ define void @fcmp_une_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 ; CHECK-LABEL: fcmp_une_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    mov w8, #65535
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    mov z1.h, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
@@ -459,11 +457,10 @@ define void @fcmp_ugt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 ; CHECK-LABEL: fcmp_ugt_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    mov w8, #65535
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z1.h, z0.h
-; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    mov z1.h, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
@@ -506,11 +503,10 @@ define void @fcmp_ult_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 ; CHECK-LABEL: fcmp_ult_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    mov w8, #65535
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    mov z1.h, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
@@ -553,11 +549,10 @@ define void @fcmp_uge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 ; CHECK-LABEL: fcmp_uge_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    mov w8, #65535
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    fcmgt p1.h, p0/z, z1.h, z0.h
-; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    mov z1.h, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
@@ -600,11 +595,10 @@ define void @fcmp_ule_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 ; CHECK-LABEL: fcmp_ule_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    mov w8, #65535
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    mov z1.h, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
@@ -647,11 +641,10 @@ define void @fcmp_ord_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #
 ; CHECK-LABEL: fcmp_ord_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    mov w8, #65535
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    fcmuo p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    mov z1.h, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
-- 
2.7.4
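
P.S. For anyone following the selection logic: SelectSVECpyDupImm implements
the SVE CPY/DUP immediate form, an 8-bit signed value optionally shifted left
by 8. The sketch below models that rule outside of LLVM; encodeCpyDupImm is an
illustrative name invented for this note, not an LLVM API, and it assumes the
input has already been truncated to the element width and sign-extended, which
the patch now guarantees via APInt::truncOrSelf.

  // Standalone model of the CPY/DUP immediate rule: a value is encodable as
  // an 8-bit signed immediate, optionally shifted left by 8 bits.
  #include <cassert>
  #include <cstdint>
  #include <optional>
  #include <utility>

  // Returns {imm8, shift} if Val is encodable for the given element width.
  std::optional<std::pair<int8_t, int>> encodeCpyDupImm(int64_t Val,
                                                        unsigned ElemBits) {
    if (ElemBits == 8) // i8 elements: every byte value is encodable.
      return std::pair<int8_t, int>(static_cast<int8_t>(Val), 0);
    if (Val >= -128 && Val <= 127) // imm8, lsl #0
      return std::pair<int8_t, int>(static_cast<int8_t>(Val), 0);
    if (Val >= -32768 && Val <= 32512 && Val % 256 == 0) // imm8, lsl #8
      return std::pair<int8_t, int>(static_cast<int8_t>(Val >> 8), 8);
    return std::nullopt; // otherwise the constant needs a register move
  }

  int main() {
    // The fixed-length corner case fixed here: an i32 constant 0xFFFF
    // splatted into a v16i16 vector. Truncated to 16 bits it becomes -1,
    // which encodes as imm8 = -1, so "mov z1.h, #-1" replaces the old
    // "mov w8, #65535; mov z1.h, w8" pair seen in the test diffs above.
    auto E = encodeCpyDupImm(static_cast<int16_t>(0xFFFF), 16);
    assert(E && E->first == -1 && E->second == 0);
    // A shifted example: 0x2200 == 34 << 8 encodes as imm8 = 34, lsl #8.
    auto S = encodeCpyDupImm(0x2200, 16);
    assert(S && S->first == 34 && S->second == 8);
  }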