[SystemZ] Add CodeGen support for v4f32

author Ulrich Weigand <ulrich.weigand@de.ibm.com>

Tue, 5 May 2015 19:27:45 +0000 (19:27 +0000)

committer Ulrich Weigand <ulrich.weigand@de.ibm.com>

Tue, 5 May 2015 19:27:45 +0000 (19:27 +0000)
author Ulrich Weigand <ulrich.weigand@de.ibm.com>
Tue, 5 May 2015 19:27:45 +0000 (19:27 +0000)
committer Ulrich Weigand <ulrich.weigand@de.ibm.com>
Tue, 5 May 2015 19:27:45 +0000 (19:27 +0000)
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h

index 4c0661608be71f009bf22c335d39ff01c0b5e2c4..36ea750ec8dc7c6cbfb960d4ba3c890ca6150efb 100644 (file)
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -71,6 +71,11 @@ inline unsigned getRegAsGR32(unsigned Reg) {
  inline unsigned getRegAsGRH32(unsigned Reg) {
    return GRH32Regs[getFirstReg(Reg)];
  }
+
+// Return the given register as a VR128.
+inline unsigned getRegAsVR128(unsigned Reg) {
+  return VR128Regs[getFirstReg(Reg)];
+}
  } // end namespace SystemZMC
  
  MCCodeEmitter *createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp

index 5f46e6a6313c2fb69141656bb9d9d1a9bf773823..026a75f21403eb2c0914c14ec0bfea06beda4e52 100644 (file)
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -158,6 +158,21 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
        .addReg(SystemZMC::getRegAsGR64(MI->getOperand(2).getReg()));
      break;
  
+  case SystemZ::LFER:
+    LoweredMI = MCInstBuilder(SystemZ::VLGVF)
+      .addReg(SystemZMC::getRegAsGR64(MI->getOperand(0).getReg()))
+      .addReg(SystemZMC::getRegAsVR128(MI->getOperand(1).getReg()))
+      .addReg(0).addImm(0);
+    break;
+
+  case SystemZ::LEFR:
+    LoweredMI = MCInstBuilder(SystemZ::VLVGF)
+      .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+      .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+      .addReg(MI->getOperand(1).getReg())
+      .addReg(0).addImm(0);
+    break;
+
  #define LOWER_LOW(NAME)                                                 \
    case SystemZ::NAME##64: LoweredMI = lowerRILow(MI, SystemZ::NAME); break
  
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td

index 360d348af3a6f1f80e9bdd820023a51fce52897e..a2f996e60dfb73e919ced7938da728b861cd989c 100644 (file)
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -44,7 +44,7 @@ def RetCC_SystemZ : CallingConv<[
  
    // Similarly for vectors, with V24 being the ABI-compliant choice.
    CCIfSubtarget<"hasVector()",
-    CCIfType<[v16i8, v8i16, v4i32, v2i64, v2f64],
+    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
               CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>>
  
    // ABI-compliant code returns long double by reference, but that conversion
@@ -76,13 +76,13 @@ def CC_SystemZ : CallingConv<[
  
    // The first 8 named vector arguments are passed in V24-V31.
    CCIfSubtarget<"hasVector()",
-    CCIfType<[v16i8, v8i16, v4i32, v2i64, v2f64],
+    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
               CCIfFixed<CCAssignToReg<[V24, V26, V28, V30,
                                        V25, V27, V29, V31]>>>>,
  
    // Other vector arguments are passed in 8-byte-aligned 16-byte stack slots.
    CCIfSubtarget<"hasVector()",
-    CCIfType<[v16i8, v8i16, v4i32, v2i64, v2f64],
+    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
               CCAssignToStack<16, 8>>>,
  
    // Other arguments are passed in 8-byte-aligned 8-byte stack slots.
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp

index 5f547439c9aab351ffcce3b3a1d2ae82d999b341..391cb8c6fc99c2aa168eea58bf68ba8815a89b6e 100644 (file)
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -101,6 +101,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
      addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass);
      addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass);
      addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass);
+    addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass);
      addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass);
    }
  
@@ -275,7 +276,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
      if (isTypeLegal(VT)) {
        // These operations are legal for anything that can be stored in a
        // vector register, even if there is no native support for the format
-      // as such.
+      // as such.  In particular, we can do these for v4f32 even though there
+      // are no specific instructions for that format.
        setOperationAction(ISD::LOAD, VT, Legal);
        setOperationAction(ISD::STORE, VT, Legal);
        setOperationAction(ISD::VSELECT, VT, Legal);
@@ -365,11 +367,14 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
    // Handle floating-point vector types.
    if (Subtarget.hasVector()) {
      // Scalar-to-vector conversion is just a subreg.
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
  
      // Some insertions and extractions can be done directly but others
      // need to go via integers.
+    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
  
      // These operations have direct equivalents.
@@ -407,8 +412,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
  
    // We have 64-bit FPR<->GPR moves, but need special handling for
    // 32-bit forms.
-  setOperationAction(ISD::BITCAST, MVT::i32, Custom);
-  setOperationAction(ISD::BITCAST, MVT::f32, Custom);
+  if (!Subtarget.hasVector()) {
+    setOperationAction(ISD::BITCAST, MVT::i32, Custom);
+    setOperationAction(ISD::BITCAST, MVT::f32, Custom);
+  }
  
    // VASTART and VACOPY need to deal with the SystemZ-specific varargs
    // structure, but VAEND is a no-op.
@@ -420,6 +427,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
    setTargetDAGCombine(ISD::SIGN_EXTEND);
    setTargetDAGCombine(ISD::STORE);
    setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::FP_ROUND);
  
    // Handle intrinsics.
    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -855,6 +863,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
        case MVT::v8i16:
        case MVT::v4i32:
        case MVT::v2i64:
+      case MVT::v4f32:
        case MVT::v2f64:
          RC = &SystemZ::VR128BitRegClass;
          break;
@@ -1977,6 +1986,33 @@ static unsigned getVectorComparisonOrInvert(ISD::CondCode CC, bool IsFP,
    return 0;
  }
  
+// Return a v2f64 holding the extended elements Start and Start+1 of v4f32 Op.
+// They are first shuffled into even lanes 0 and 2, which is what VEXTEND reads.
+static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, SDLoc DL,
+                                  SDValue Op) {
+  int Mask[] = { Start, -1, Start + 1, -1 };
+  Op = DAG.getVectorShuffle(MVT::v4f32, DL, Op, DAG.getUNDEF(MVT::v4f32), Mask);
+  return DAG.getNode(SystemZISD::VEXTEND, DL, MVT::v2f64, Op);
+}
+
+// Build a comparison of vectors CmpOp0 and CmpOp1 using opcode Opcode,
+// producing a result of type VT.
+static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, SDLoc DL,
+                            EVT VT, SDValue CmpOp0, SDValue CmpOp1) {
+  // There is no hardware support for v4f32, so extend each operand into two
+  // v2f64s (elements 0-1 and 2-3), compare those and PACK the v2i64 results.
+  if (CmpOp0.getValueType() == MVT::v4f32) {
+    SDValue H0 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp0);
+    SDValue L0 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp0);
+    SDValue H1 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp1);
+    SDValue L1 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp1);
+    SDValue HRes = DAG.getNode(Opcode, DL, MVT::v2i64, H0, H1);
+    SDValue LRes = DAG.getNode(Opcode, DL, MVT::v2i64, L0, L1);
+    return DAG.getNode(SystemZISD::PACK, DL, VT, HRes, LRes);
+  }
+  return DAG.getNode(Opcode, DL, VT, CmpOp0, CmpOp1);
+}
+
  // Lower a vector comparison of type CC between CmpOp0 and CmpOp1, producing
  // an integer mask of type VT.
  static SDValue lowerVectorSETCC(SelectionDAG &DAG, SDLoc DL, EVT VT,
@@ -1991,8 +2027,8 @@ static SDValue lowerVectorSETCC(SelectionDAG &DAG, SDLoc DL, EVT VT,
      Invert = true;
    case ISD::SETO: {
      assert(IsFP && "Unexpected integer comparison");
-    SDValue LT = DAG.getNode(SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
-    SDValue GE = DAG.getNode(SystemZISD::VFCMPHE, DL, VT, CmpOp0, CmpOp1);
+    SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
+    SDValue GE = getVectorCmp(DAG, SystemZISD::VFCMPHE, DL, VT, CmpOp0, CmpOp1);
      Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GE);
      break;
    }
@@ -2002,8 +2038,8 @@ static SDValue lowerVectorSETCC(SelectionDAG &DAG, SDLoc DL, EVT VT,
      Invert = true;
    case ISD::SETONE: {
      assert(IsFP && "Unexpected integer comparison");
-    SDValue LT = DAG.getNode(SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
-    SDValue GT = DAG.getNode(SystemZISD::VFCMPH, DL, VT, CmpOp0, CmpOp1);
+    SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
+    SDValue GT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp0, CmpOp1);
      Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GT);
      break;
    }
@@ -2013,11 +2049,11 @@ static SDValue lowerVectorSETCC(SelectionDAG &DAG, SDLoc DL, EVT VT,
      // there are no cases where both work.
    default:
      if (unsigned Opcode = getVectorComparisonOrInvert(CC, IsFP, Invert))
-      Cmp = DAG.getNode(Opcode, DL, VT, CmpOp0, CmpOp1);
+      Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp0, CmpOp1);
      else {
        CC = ISD::getSetCCSwappedOperands(CC);
        if (unsigned Opcode = getVectorComparisonOrInvert(CC, IsFP, Invert))
-        Cmp = DAG.getNode(Opcode, DL, VT, CmpOp1, CmpOp0);
+        Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp1, CmpOp0);
        else
          llvm_unreachable("Unhandled comparison");
      }
@@ -3621,6 +3657,31 @@ static SDValue buildVector(SelectionDAG &DAG, SDLoc DL, EVT VT,
    if (VT == MVT::v2f64)
      return buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
  
+  // Build v4f32 values directly from the FPRs:
+  //
+  //   <Axxx> <Bxxx> <Cxxx> <Dxxx>
+  //         V              V         VMRHF
+  //      <ABxx>         <CDxx>
+  //                V                 VMRHG
+  //              <ABCD>
+  if (VT == MVT::v4f32) {
+    SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
+    SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[2], Elems[3]);
+    // Avoid unnecessary undefs by reusing the other operand.
+    if (Op01.getOpcode() == ISD::UNDEF)
+      Op01 = Op23;
+    else if (Op23.getOpcode() == ISD::UNDEF)
+      Op23 = Op01;
+    // Merging identical replications is a no-op.
+    if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23)
+      return Op01;
+    Op01 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op01);
+    Op23 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op23);
+    SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH,
+                             DL, MVT::v2i64, Op01, Op23);
+    return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+  }
+
    // Collect the constant terms.
    SmallVector<SDValue, SystemZ::VectorBytes> Constants(NumElements, SDValue());
    SmallVector<bool, SystemZ::VectorBytes> Done(NumElements, false);
@@ -3796,10 +3857,11 @@ SDValue SystemZTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
    SDValue Op2 = Op.getOperand(2);
    EVT VT = Op.getValueType();
  
-  // Insertions into constant indices can be done using VPDI.  However,
-  // if the inserted value is a bitcast or a constant then it's better
-  // to use GPRs, as below.
-  if (Op1.getOpcode() != ISD::BITCAST &&
+  // Insertions into constant indices of a v2f64 can be done using VPDI.
+  // However, if the inserted value is a bitcast or a constant then it's
+  // better to use GPRs, as below.
+  if (VT == MVT::v2f64 &&
+      Op1.getOpcode() != ISD::BITCAST &&
        Op1.getOpcode() != ISD::ConstantFP &&
        Op2.getOpcode() == ISD::Constant) {
      uint64_t Index = dyn_cast<ConstantSDNode>(Op2)->getZExtValue();
@@ -4065,6 +4127,8 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
      OPCODE(VFCMPE);
      OPCODE(VFCMPH);
      OPCODE(VFCMPHE);
+    OPCODE(VEXTEND);
+    OPCODE(VROUND);
      OPCODE(ATOMIC_SWAPW);
      OPCODE(ATOMIC_LOADW_ADD);
      OPCODE(ATOMIC_LOADW_SUB);
@@ -4265,6 +4329,19 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
        }
      }
    }
+  // (z_merge_high 0, 0) -> 0.  This is mostly useful for using VLLEZF
+  // for v4f32.
+  if (Opcode == SystemZISD::MERGE_HIGH) {
+    SDValue Op0 = N->getOperand(0);
+    SDValue Op1 = N->getOperand(1);
+    if (Op0 == Op1) {
+      if (Op0.getOpcode() == ISD::BITCAST)
+        Op0 = Op0.getOperand(0);
+      if (Op0.getOpcode() == SystemZISD::BYTE_MASK &&
+          cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0)
+        return Op1;
+    }
+  }
    // If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better
    // for the extraction to be done on a vMiN value, so that we can use VSTE.
    // If X has wider elements then convert it to:
@@ -4299,6 +4376,49 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
        N->getOperand(0) == N->getOperand(1))
      return DAG.getNode(SystemZISD::REPLICATE, SDLoc(N), N->getValueType(0),
                         N->getOperand(0));
+  // (fround (extract_vector_elt X 0))
+  // (fround (extract_vector_elt X 1)) ->
+  // (extract_vector_elt (VROUND X) 0)
+  // (extract_vector_elt (VROUND X) 2)
+  //
+  // This is a special case since the target doesn't really support v2f32s.
+  if (Opcode == ISD::FP_ROUND) {
+    SDValue Op0 = N->getOperand(0);
+    if (N->getValueType(0) == MVT::f32 &&
+        Op0.hasOneUse() &&
+        Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+        Op0.getOperand(0).getValueType() == MVT::v2f64 &&
+        Op0.getOperand(1).getOpcode() == ISD::Constant &&
+        cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue() == 0) {
+      SDValue Vec = Op0.getOperand(0);
+      for (auto *U : Vec->uses()) {
+        if (U != Op0.getNode() &&
+            U->hasOneUse() &&
+            U->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+            U->getOperand(0) == Vec &&
+            U->getOperand(1).getOpcode() == ISD::Constant &&
+            cast<ConstantSDNode>(U->getOperand(1))->getZExtValue() == 1) {
+          SDValue OtherRound = SDValue(*U->use_begin(), 0);
+          if (OtherRound.getOpcode() == ISD::FP_ROUND &&
+              OtherRound.getOperand(0) == SDValue(U, 0) &&
+              OtherRound.getValueType() == MVT::f32) {
+            SDValue VRound = DAG.getNode(SystemZISD::VROUND, SDLoc(N),
+                                         MVT::v4f32, Vec);
+            DCI.AddToWorklist(VRound.getNode());
+            SDValue Extract1 =
+              DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(U), MVT::f32,
+                          VRound, DAG.getConstant(2, SDLoc(U), MVT::i32));
+            DCI.AddToWorklist(Extract1.getNode());
+            DAG.ReplaceAllUsesOfValueWith(OtherRound, Extract1);
+            SDValue Extract0 =
+              DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32,
+                          VRound, DAG.getConstant(0, SDLoc(Op0), MVT::i32));
+            return Extract0;
+          }
+        }
+      }
+    }
+  }
    return SDValue();
  }
  
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h

index 8319c01fc5e270b2c95f15e5f58ee2b319d93c34..24a3f4bb5d45ba1c9194331e1124932157b60d7c 100644 (file)
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -226,6 +226,14 @@ enum {
    VFCMPH,
    VFCMPHE,
  
+  // Extend the even f32 elements of vector operand 0 to produce a vector
+  // of f64 elements.
+  VEXTEND,
+
+  // Round the f64 elements of vector operand 0 to f32s and store them in the
+  // even elements of the result.
+  VROUND,
+
    // Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or
    // ATOMIC_LOAD_<op>.
    //
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td

index d7bfc12b93810c4664a0344ccba2b67a3b423e1a..dc9dfa801fdf896c56e79fb8dbd0f337000796e6 100644 (file)
--- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -2398,6 +2398,9 @@ class Alias<int size, dag outs, dag ins, list<dag> pattern>
    let isCodeGenOnly = 1;
  }
  
+class UnaryAliasVRS<RegisterOperand cls1, RegisterOperand cls2>
+ : Alias<6, (outs cls1:$src1), (ins cls2:$src2), []>;
+
  // An alias of a BinaryRI, but with different register sizes.
  class BinaryAliasRI<SDPatternOperator operator, RegisterOperand cls,
                      Immediate imm>
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td

index 546974aa5d8f23c6b41a10546d9bf1c1fee470c8..b6c8042b3c821ffea00145e7330865448b24016c 100644 (file)
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -118,6 +118,8 @@ let Predicates = [FeatureVector] in {
    def VLREPH : UnaryVRX<"vlreph", 0xE705, z_replicate_loadi16, v128h, 2, 1>;
    def VLREPF : UnaryVRX<"vlrepf", 0xE705, z_replicate_loadi32, v128f, 4, 2>;
    def VLREPG : UnaryVRX<"vlrepg", 0xE705, z_replicate_loadi64, v128g, 8, 3>;
+  def : Pat<(v4f32 (z_replicate_loadf32 bdxaddr12only:$addr)),
+            (VLREPF bdxaddr12only:$addr)>;
    def : Pat<(v2f64 (z_replicate_loadf64 bdxaddr12only:$addr)),
              (VLREPG bdxaddr12only:$addr)>;
  
@@ -126,6 +128,8 @@ let Predicates = [FeatureVector] in {
    def VLLEZH : UnaryVRX<"vllezh", 0xE704, z_vllezi16, v128h, 2, 1>;
    def VLLEZF : UnaryVRX<"vllezf", 0xE704, z_vllezi32, v128f, 4, 2>;
    def VLLEZG : UnaryVRX<"vllezg", 0xE704, z_vllezi64, v128g, 8, 3>;
+  def : Pat<(v4f32 (z_vllezf32 bdxaddr12only:$addr)),
+            (VLLEZF bdxaddr12only:$addr)>;
    def : Pat<(v2f64 (z_vllezf64 bdxaddr12only:$addr)),
              (VLLEZG bdxaddr12only:$addr)>;
  
@@ -134,6 +138,8 @@ let Predicates = [FeatureVector] in {
    def VLEH : TernaryVRX<"vleh", 0xE701, z_vlei16, v128h, v128h, 2, imm32zx3>;
    def VLEF : TernaryVRX<"vlef", 0xE703, z_vlei32, v128f, v128f, 4, imm32zx2>;
    def VLEG : TernaryVRX<"vleg", 0xE702, z_vlei64, v128g, v128g, 8, imm32zx1>;
+  def : Pat<(z_vlef32 (v4f32 VR128:$val), bdxaddr12only:$addr, imm32zx2:$index),
+            (VLEF VR128:$val, bdxaddr12only:$addr, imm32zx2:$index)>;
    def : Pat<(z_vlef64 (v2f64 VR128:$val), bdxaddr12only:$addr, imm32zx1:$index),
              (VLEG VR128:$val, bdxaddr12only:$addr, imm32zx1:$index)>;
  
@@ -158,6 +164,7 @@ defm : ReplicatePeephole<VLREPB, v16i8, anyextloadi8, i32>;
  defm : ReplicatePeephole<VLREPH, v8i16, anyextloadi16, i32>;
  defm : ReplicatePeephole<VLREPF, v4i32, load, i32>;
  defm : ReplicatePeephole<VLREPG, v2i64, load, i64>;
+defm : ReplicatePeephole<VLREPF, v4f32, load, f32>;
  defm : ReplicatePeephole<VLREPG, v2f64, load, f64>;
  
  //===----------------------------------------------------------------------===//
@@ -179,6 +186,9 @@ let Predicates = [FeatureVector] in {
    def VSTEH : StoreBinaryVRX<"vsteh", 0xE709, z_vstei16, v128h, 2, imm32zx3>;
    def VSTEF : StoreBinaryVRX<"vstef", 0xE70B, z_vstei32, v128f, 4, imm32zx2>;
    def VSTEG : StoreBinaryVRX<"vsteg", 0xE70A, z_vstei64, v128g, 8, imm32zx1>;
+  def : Pat<(z_vstef32 (v4f32 VR128:$val), bdxaddr12only:$addr,
+                       imm32zx2:$index),
+            (VSTEF VR128:$val, bdxaddr12only:$addr, imm32zx2:$index)>;
    def : Pat<(z_vstef64 (v2f64 VR128:$val), bdxaddr12only:$addr,
                         imm32zx1:$index),
              (VSTEG VR128:$val, bdxaddr12only:$addr, imm32zx1:$index)>;
@@ -198,6 +208,7 @@ let Predicates = [FeatureVector] in {
    def VMRHH : BinaryVRRc<"vmrhh", 0xE761, z_merge_high, v128h, v128h, 1>;
    def VMRHF : BinaryVRRc<"vmrhf", 0xE761, z_merge_high, v128f, v128f, 2>;
    def VMRHG : BinaryVRRc<"vmrhg", 0xE761, z_merge_high, v128g, v128g, 3>;
+  def : BinaryRRWithType<VMRHF, VR128, z_merge_high, v4f32>;
    def : BinaryRRWithType<VMRHG, VR128, z_merge_high, v2f64>;
  
    // Merge low.
@@ -205,6 +216,7 @@ let Predicates = [FeatureVector] in {
    def VMRLH : BinaryVRRc<"vmrlh", 0xE760, z_merge_low, v128h, v128h, 1>;
    def VMRLF : BinaryVRRc<"vmrlf", 0xE760, z_merge_low, v128f, v128f, 2>;
    def VMRLG : BinaryVRRc<"vmrlg", 0xE760, z_merge_low, v128g, v128g, 3>;
+  def : BinaryRRWithType<VMRLF, VR128, z_merge_low, v4f32>;
    def : BinaryRRWithType<VMRLG, VR128, z_merge_low, v2f64>;
  
    // Permute.
@@ -218,6 +230,8 @@ let Predicates = [FeatureVector] in {
    def VREPH : BinaryVRIc<"vreph", 0xE74D, z_splat, v128h, v128h, 1>;
    def VREPF : BinaryVRIc<"vrepf", 0xE74D, z_splat, v128f, v128f, 2>;
    def VREPG : BinaryVRIc<"vrepg", 0xE74D, z_splat, v128g, v128g, 3>;
+  def : Pat<(v4f32 (z_splat VR128:$vec, imm32zx16:$index)),
+            (VREPF VR128:$vec, imm32zx16:$index)>;
    def : Pat<(v2f64 (z_splat VR128:$vec, imm32zx16:$index)),
              (VREPG VR128:$vec, imm32zx16:$index)>;
  
@@ -301,6 +315,7 @@ defm : GenericVectorOps<v16i8, v16i8>;
  defm : GenericVectorOps<v8i16, v8i16>;
  defm : GenericVectorOps<v4i32, v4i32>;
  defm : GenericVectorOps<v2i64, v2i64>;
+defm : GenericVectorOps<v4f32, v4i32>;
  defm : GenericVectorOps<v2f64, v2i64>;
  
  //===----------------------------------------------------------------------===//
@@ -797,12 +812,13 @@ let Predicates = [FeatureVector] in {
    defm : VectorRounding<VFIDB, v128db>;
  
    // Load lengthened.
-  def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, null_frag, v128db, v128eb, 2, 0>;
+  def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128eb, 2, 0>;
    def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, null_frag, v64db, v32eb, 2, 8>;
  
    // Load rounded,
    def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128eb, v128db, 3, 0>;
    def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32eb, v64db, 3, 8>;
+  def : Pat<(v4f32 (z_vround (v2f64 VR128:$src))), (VLEDB VR128:$src, 0, 0)>;
  
    // Multiply.
    def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, fmul, v128db, v128db, 3, 0>;
@@ -882,27 +898,38 @@ let Predicates = [FeatureVector] in {
  def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
  
  def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
  
  def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
  
  def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
  
+def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+
  def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
  
  //===----------------------------------------------------------------------===//
  // Replicating scalars
@@ -926,6 +953,14 @@ def : Pat<(v2i64 (z_replicate GR64:$scalar)),
  // Floating-point insertion and extraction
  //===----------------------------------------------------------------------===//
  
+// Moving 32-bit values between GPRs and FPRs can be done using VLVGF
+// and VLGVF.
+def LEFR : UnaryAliasVRS<VR32, GR32>;
+def LFER : UnaryAliasVRS<GR64, VR32>;
+def : Pat<(f32 (bitconvert (i32 GR32:$src))), (LEFR GR32:$src)>;
+def : Pat<(i32 (bitconvert (f32 VR32:$src))),
+          (EXTRACT_SUBREG (LFER VR32:$src), subreg_l32)>;
+
  // Floating-point values are stored in element 0 of the corresponding
  // vector register.  Scalar to vector conversion is just a subreg and
  // scalar replication can just replicate element 0 of the vector register.
@@ -937,6 +972,7 @@ multiclass ScalarToVectorFP<Instruction vrep, ValueType vt, RegisterOperand cls,
              (vrep (INSERT_SUBREG (vt (IMPLICIT_DEF)), cls:$scalar,
                                   subreg), 0)>;
  }
+defm : ScalarToVectorFP<VREPF, v4f32, FP32, subreg_r32>;
  defm : ScalarToVectorFP<VREPG, v2f64, FP64, subreg_r64>;
  
  // Match v2f64 insertions.  The AddedComplexity counters the 3 added by
@@ -951,11 +987,16 @@ let AddedComplexity = 4 in {
                                               subreg_r64), 0)>;
  }
  
-// We extract f64 element X by replicating (for elements other than 0)
-// and then taking a high subreg.  The AddedComplexity counters the 3
-// added by TableGen for the base register operand in VLGV-based integer
+// We extract floating-point element X by replicating (for elements other
+// than 0) and then taking a high subreg.  The AddedComplexity counters the
+// 3 added by TableGen for the base register operand in VLGV-based integer
  // extractions and ensures that this version is strictly better.
  let AddedComplexity = 4 in {
+  def : Pat<(f32 (z_vector_extract (v4f32 VR128:$vec), 0)),
+            (EXTRACT_SUBREG VR128:$vec, subreg_r32)>;
+  def : Pat<(f32 (z_vector_extract (v4f32 VR128:$vec), imm32zx2:$index)),
+            (EXTRACT_SUBREG (VREPF VR128:$vec, imm32zx2:$index), subreg_r32)>;
+
    def : Pat<(f64 (z_vector_extract (v2f64 VR128:$vec), 0)),
              (EXTRACT_SUBREG VR128:$vec, subreg_r64)>;
    def : Pat<(f64 (z_vector_extract (v2f64 VR128:$vec), imm32zx1:$index)),
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td

index 7cf7d862ffec977932883af99b4fa56e9720cdc4..63c217413acc0d8affc275aa24f5a852ad412b6e 100644 (file)
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -91,6 +91,9 @@ def SDT_ZExtractVectorElt   : SDTypeProfile<1, 2,
                                               SDTCisVT<2, i32>]>;
  def SDT_ZReplicate          : SDTypeProfile<1, 1,
                                              [SDTCisVec<0>]>;
+def SDT_ZVecUnaryConv       : SDTypeProfile<1, 1,
+                                            [SDTCisVec<0>,
+                                             SDTCisVec<1>]>;
  def SDT_ZVecBinary          : SDTypeProfile<1, 2,
                                              [SDTCisVec<0>,
                                               SDTCisSameAs<0, 1>,
@@ -203,6 +206,8 @@ def z_vicmphl           : SDNode<"SystemZISD::VICMPHL", SDT_ZVecBinary>;
  def z_vfcmpe            : SDNode<"SystemZISD::VFCMPE", SDT_ZVecBinaryConv>;
  def z_vfcmph            : SDNode<"SystemZISD::VFCMPH", SDT_ZVecBinaryConv>;
  def z_vfcmphe           : SDNode<"SystemZISD::VFCMPHE", SDT_ZVecBinaryConv>;
+def z_vextend           : SDNode<"SystemZISD::VEXTEND", SDT_ZVecUnaryConv>;
+def z_vround            : SDNode<"SystemZISD::VROUND", SDT_ZVecUnaryConv>;
  
  class AtomicWOp<string name, SDTypeProfile profile = SDT_ZAtomicLoadBinaryW>
    : SDNode<"SystemZISD::"##name, profile,
@@ -508,6 +513,7 @@ def z_replicate_loadi8  : z_replicate_load<i32, anyextloadi8>;
  def z_replicate_loadi16 : z_replicate_load<i32, anyextloadi16>;
  def z_replicate_loadi32 : z_replicate_load<i32, load>;
  def z_replicate_loadi64 : z_replicate_load<i64, load>;
+def z_replicate_loadf32 : z_replicate_load<f32, load>;
  def z_replicate_loadf64 : z_replicate_load<f64, load>;
  
  // Load a scalar and insert it into a single element of a vector.
@@ -519,6 +525,7 @@ def z_vlei8  : z_vle<i32, anyextloadi8>;
  def z_vlei16 : z_vle<i32, anyextloadi16>;
  def z_vlei32 : z_vle<i32, load>;
  def z_vlei64 : z_vle<i64, load>;
+def z_vlef32 : z_vle<f32, load>;
  def z_vlef64 : z_vle<f64, load>;
  
  // Load a scalar and insert it into the low element of the high i64 of a
@@ -532,6 +539,17 @@ def z_vllezi16 : z_vllez<i32, anyextloadi16, 3>;
  def z_vllezi32 : z_vllez<i32, load, 1>;
  def z_vllezi64 : PatFrag<(ops node:$addr),
                           (z_join_dwords (i64 (load node:$addr)), (i64 0))>;
+// We use high merges to form a v4f32 from four f32s.  Propagating zero
+// into all elements but index 1 gives this expression.
+def z_vllezf32 : PatFrag<(ops node:$addr),
+                         (bitconvert
+                          (z_merge_high
+                           (v2i64 (bitconvert
+                                   (z_merge_high
+                                    (v4f32 (z_vzero)),
+                                    (v4f32 (scalar_to_vector
+                                            (f32 (load node:$addr))))))),
+                           (v2i64 (z_vzero))))>;
  def z_vllezf64 : PatFrag<(ops node:$addr),
                           (z_merge_high
                            (scalar_to_vector (f64 (load node:$addr))),
@@ -546,6 +564,7 @@ def z_vstei8  : z_vste<i32, truncstorei8>;
  def z_vstei16 : z_vste<i32, truncstorei16>;
  def z_vstei32 : z_vste<i32, store>;
  def z_vstei64 : z_vste<i64, store>;
+def z_vstef32 : z_vste<f32, store>;
  def z_vstef64 : z_vste<f64, store>;
  
  // Arithmetic negation on vectors.
diff --git a/llvm/test/CodeGen/SystemZ/fp-move-09.ll b/llvm/test/CodeGen/SystemZ/fp-move-09.ll

index e4a3e9222d52d0c9398011a3f3ac0d3bc6900117..5e8dce272c23d6fdfad50581ff27acdee63bb7fd 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/fp-move-09.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-move-09.ll
@@ -1,4 +1,4 @@
-; Test moves between FPRs and GPRs for z196 and above.
+; Test moves between FPRs and GPRs for z196 and zEC12.
  ;
  ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
  
diff --git a/llvm/test/CodeGen/SystemZ/fp-move-10.ll b/llvm/test/CodeGen/SystemZ/fp-move-10.ll

new file mode 100644 (file)

index 0000000..602397d
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-move-10.ll
@@ -0,0 +1,61 @@
+; Test moves between FPRs and GPRs for z13 and above.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Check that an i32-to-float bitcast uses a low GR32 and VLVGF (element 0).
+define float @f1(i16 *%ptr) {
+; CHECK-LABEL: f1:
+; CHECK: llh [[REG:%r[0-5]]], 0(%r2)
+; CHECK: oilh [[REG]], 16256
+; CHECK: vlvgf %v0, [[REG]], 0
+; CHECK: br %r14
+  %base = load i16, i16 *%ptr
+  %ext = zext i16 %base to i32
+  %full = or i32 %ext, 1065353216
+  %res = bitcast i32 %full to float
+  ret float %res
+}
+
+; Check that a float-to-i32 bitcast uses VLGVF (element 0) into a low GR32.
+define void @f2(float %val, i8 *%ptr) {
+; CHECK-LABEL: f2:
+; CHECK: vlgvf [[REG:%r[0-5]]], %v0, 0
+; CHECK: stc [[REG]], 0(%r2)
+; CHECK: br %r14
+  %res = bitcast float %val to i32
+  %trunc = trunc i32 %res to i8
+  store i8 %trunc, i8 *%ptr
+  ret void
+}
+
+; Like f2, but the byte is stored conditionally (select between new and old).
+define void @f3(float %val, i8 *%ptr, i32 %which) {
+; CHECK-LABEL: f3:
+; CHECK-DAG: cijlh %r3, 0,
+; CHECK-DAG: vlgvf [[REG:%r[0-5]]], %v0, 0
+; CHECK: stc [[REG]], 0(%r2)
+; CHECK: br %r14
+  %int = bitcast float %val to i32
+  %trunc = trunc i32 %int to i8
+  %old = load i8, i8 *%ptr
+  %cmp = icmp eq i32 %which, 0
+  %res = select i1 %cmp, i8 %trunc, i8 %old
+  store i8 %res, i8 *%ptr
+  ret void
+}
+
+; Like f3, but the conditional store is a 16-bit one (STH rather than STC).
+define void @f4(float %val, i16 *%ptr, i32 %which) {
+; CHECK-LABEL: f4:
+; CHECK-DAG: cijlh %r3, 0,
+; CHECK-DAG: vlgvf [[REG:%r[0-5]]], %v0, 0
+; CHECK: sth [[REG]], 0(%r2)
+; CHECK: br %r14
+  %int = bitcast float %val to i32
+  %trunc = trunc i32 %int to i16
+  %old = load i16, i16 *%ptr
+  %cmp = icmp eq i32 %which, 0
+  %res = select i1 %cmp, i16 %trunc, i16 %old
+  store i16 %res, i16 *%ptr
+  ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-cmp-05.ll b/llvm/test/CodeGen/SystemZ/vec-cmp-05.ll

new file mode 100644 (file)

index 0000000..74e9909
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-cmp-05.ll
@@ -0,0 +1,472 @@
+; Test v4f32 comparisons.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test oeq: VLDEB on each half, VFCEDB on the results, then VPKG to repack.
+define <4 x i32> @f1(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f1:
+; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]]
+; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]]
+; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]]
+; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]]
+; CHECK-DAG: vfcedb [[HIGHRES:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]]
+; CHECK-DAG: vfcedb [[LOWRES:%v[0-9]+]], [[LOW0D]], [[LOW1D]]
+; CHECK: vpkg %v24, [[HIGHRES]], [[LOWRES]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp oeq <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test one: (val1 > val2) | (val2 > val1), i.e. two VFCHDB pairs joined by VO.
+define <4 x i32> @f2(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f2:
+; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]]
+; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]]
+; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]]
+; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]]
+; CHECK-DAG: vfchdb [[HIGHRES0:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]]
+; CHECK-DAG: vfchdb [[LOWRES0:%v[0-9]+]], [[LOW0D]], [[LOW1D]]
+; CHECK-DAG: vfchdb [[HIGHRES1:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]]
+; CHECK-DAG: vfchdb [[LOWRES1:%v[0-9]+]], [[LOW1D]], [[LOW0D]]
+; CHECK-DAG: vpkg [[RES0:%v[0-9]+]], [[HIGHRES0]], [[LOWRES0]]
+; CHECK-DAG: vpkg [[RES1:%v[0-9]+]], [[HIGHRES1]], [[LOWRES1]]
+; CHECK: vo %v24, [[RES1]], [[RES0]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp one <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test ogt: VFCHDB with the operands in their original order.
+define <4 x i32> @f3(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f3:
+; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]]
+; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]]
+; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]]
+; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]]
+; CHECK-DAG: vfchdb [[HIGHRES:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]]
+; CHECK-DAG: vfchdb [[LOWRES:%v[0-9]+]], [[LOW0D]], [[LOW1D]]
+; CHECK: vpkg %v24, [[HIGHRES]], [[LOWRES]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test oge: VFCHEDB with the operands in their original order.
+define <4 x i32> @f4(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f4:
+; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]]
+; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]]
+; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]]
+; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]]
+; CHECK-DAG: vfchedb [[HIGHRES:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]]
+; CHECK-DAG: vfchedb [[LOWRES:%v[0-9]+]], [[LOW0D]], [[LOW1D]]
+; CHECK: vpkg %v24, [[HIGHRES]], [[LOWRES]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp oge <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test ole: VFCHEDB with the operands swapped.
+define <4 x i32> @f5(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f5:
+; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]]
+; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]]
+; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]]
+; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]]
+; CHECK-DAG: vfchedb [[HIGHRES:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]]
+; CHECK-DAG: vfchedb [[LOWRES:%v[0-9]+]], [[LOW1D]], [[LOW0D]]
+; CHECK: vpkg %v24, [[HIGHRES]], [[LOWRES]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ole <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test olt: VFCHDB with the operands swapped.
+define <4 x i32> @f6(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f6:
+; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]]
+; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]]
+; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]]
+; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]]
+; CHECK-DAG: vfchdb [[HIGHRES:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]]
+; CHECK-DAG: vfchdb [[LOWRES:%v[0-9]+]], [[LOW1D]], [[LOW0D]]
+; CHECK: vpkg %v24, [[HIGHRES]], [[LOWRES]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp olt <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test ueq: the inverse of one, so the two VFCHDB results are joined by VNO.
+define <4 x i32> @f7(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f7:
+; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]]
+; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]]
+; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]]
+; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]]
+; CHECK-DAG: vfchdb [[HIGHRES0:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]]
+; CHECK-DAG: vfchdb [[LOWRES0:%v[0-9]+]], [[LOW0D]], [[LOW1D]]
+; CHECK-DAG: vfchdb [[HIGHRES1:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]]
+; CHECK-DAG: vfchdb [[LOWRES1:%v[0-9]+]], [[LOW1D]], [[LOW0D]]
+; CHECK-DAG: vpkg [[RES0:%v[0-9]+]], [[HIGHRES0]], [[LOWRES0]]
+; CHECK-DAG: vpkg [[RES1:%v[0-9]+]], [[HIGHRES1]], [[LOWRES1]]
+; CHECK: vno %v24, [[RES1]], [[RES0]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ueq <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test une: the inverse of oeq, i.e. VFCEDB followed by a VNO of the result.
+define <4 x i32> @f8(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f8:
+; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]]
+; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]]
+; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]]
+; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]]
+; CHECK-DAG: vfcedb [[HIGHRES:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]]
+; CHECK-DAG: vfcedb [[LOWRES:%v[0-9]+]], [[LOW0D]], [[LOW1D]]
+; CHECK: vpkg [[RES:%v[0-9]+]], [[HIGHRES]], [[LOWRES]]
+; CHECK-NEXT: vno %v24, [[RES]], [[RES]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp une <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test ugt: the inverse of ole, i.e. swapped VFCHEDB followed by VNO.
+define <4 x i32> @f9(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f9:
+; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]]
+; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]]
+; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]]
+; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]]
+; CHECK-DAG: vfchedb [[HIGHRES:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]]
+; CHECK-DAG: vfchedb [[LOWRES:%v[0-9]+]], [[LOW1D]], [[LOW0D]]
+; CHECK: vpkg [[RES:%v[0-9]+]], [[HIGHRES]], [[LOWRES]]
+; CHECK-NEXT: vno %v24, [[RES]], [[RES]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ugt <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test uge: the inverse of olt, i.e. swapped VFCHDB followed by VNO.
+define <4 x i32> @f10(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f10:
+; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]]
+; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]]
+; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]]
+; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]]
+; CHECK-DAG: vfchdb [[HIGHRES:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]]
+; CHECK-DAG: vfchdb [[LOWRES:%v[0-9]+]], [[LOW1D]], [[LOW0D]]
+; CHECK: vpkg [[RES:%v[0-9]+]], [[HIGHRES]], [[LOWRES]]
+; CHECK-NEXT: vno %v24, [[RES]], [[RES]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp uge <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test ule: the inverse of ogt, i.e. VFCHDB followed by VNO.
+define <4 x i32> @f11(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f11:
+; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]]
+; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]]
+; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]]
+; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]]
+; CHECK-DAG: vfchdb [[HIGHRES:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]]
+; CHECK-DAG: vfchdb [[LOWRES:%v[0-9]+]], [[LOW0D]], [[LOW1D]]
+; CHECK: vpkg [[RES:%v[0-9]+]], [[HIGHRES]], [[LOWRES]]
+; CHECK-NEXT: vno %v24, [[RES]], [[RES]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ule <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test ult: the inverse of oge, i.e. VFCHEDB followed by VNO.
+define <4 x i32> @f12(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f12:
+; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]]
+; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]]
+; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]]
+; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]]
+; CHECK-DAG: vfchedb [[HIGHRES:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]]
+; CHECK-DAG: vfchedb [[LOWRES:%v[0-9]+]], [[LOW0D]], [[LOW1D]]
+; CHECK: vpkg [[RES:%v[0-9]+]], [[HIGHRES]], [[LOWRES]]
+; CHECK-NEXT: vno %v24, [[RES]], [[RES]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ult <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test ord: (val1 >= val2) | (val2 > val1), i.e. VFCHEDB, swapped VFCHDB, VO.
+define <4 x i32> @f13(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f13:
+; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]]
+; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]]
+; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]]
+; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]]
+; CHECK-DAG: vfchedb [[HIGHRES0:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]]
+; CHECK-DAG: vfchedb [[LOWRES0:%v[0-9]+]], [[LOW0D]], [[LOW1D]]
+; CHECK-DAG: vfchdb [[HIGHRES1:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]]
+; CHECK-DAG: vfchdb [[LOWRES1:%v[0-9]+]], [[LOW1D]], [[LOW0D]]
+; CHECK-DAG: vpkg [[RES0:%v[0-9]+]], [[HIGHRES0]], [[LOWRES0]]
+; CHECK-DAG: vpkg [[RES1:%v[0-9]+]], [[HIGHRES1]], [[LOWRES1]]
+; CHECK: vo %v24, [[RES1]], [[RES0]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ord <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test uno: the inverse of ord, so the two compare results are joined by VNO.
+define <4 x i32> @f14(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f14:
+; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24
+; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26
+; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]]
+; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]]
+; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]]
+; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]]
+; CHECK-DAG: vfchedb [[HIGHRES0:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]]
+; CHECK-DAG: vfchedb [[LOWRES0:%v[0-9]+]], [[LOW0D]], [[LOW1D]]
+; CHECK-DAG: vfchdb [[HIGHRES1:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]]
+; CHECK-DAG: vfchdb [[LOWRES1:%v[0-9]+]], [[LOW1D]], [[LOW0D]]
+; CHECK-DAG: vpkg [[RES0:%v[0-9]+]], [[HIGHRES0]], [[LOWRES0]]
+; CHECK-DAG: vpkg [[RES1:%v[0-9]+]], [[HIGHRES1]], [[LOWRES1]]
+; CHECK: vno %v24, [[RES1]], [[RES0]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp uno <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test oeq selects: the VPKG result is used directly as the VSEL mask.
+define <4 x float> @f15(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f15:
+; CHECK: vpkg [[REG:%v[0-9]+]],
+; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp oeq <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test one selects: the VO result is used directly as the VSEL mask.
+define <4 x float> @f16(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f16:
+; CHECK: vo [[REG:%v[0-9]+]],
+; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp one <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test ogt selects: the VPKG result is used directly as the VSEL mask.
+define <4 x float> @f17(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f17:
+; CHECK: vpkg [[REG:%v[0-9]+]],
+; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test oge selects: the VPKG result is used directly as the VSEL mask.
+define <4 x float> @f18(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f18:
+; CHECK: vpkg [[REG:%v[0-9]+]],
+; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp oge <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test ole selects: the VPKG result is used directly as the VSEL mask.
+define <4 x float> @f19(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f19:
+; CHECK: vpkg [[REG:%v[0-9]+]],
+; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ole <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test olt selects: the VPKG result is used directly as the VSEL mask.
+define <4 x float> @f20(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f20:
+; CHECK: vpkg [[REG:%v[0-9]+]],
+; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp olt <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test ueq selects: VO but no VNO; VSEL takes %v30 before %v28 instead.
+define <4 x float> @f21(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f21:
+; CHECK: vo [[REG:%v[0-9]+]],
+; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ueq <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test une selects: no VNO; VSEL takes %v30 before %v28 instead.
+define <4 x float> @f22(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f22:
+; CHECK: vpkg [[REG:%v[0-9]+]],
+; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp une <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test ugt selects: no VNO; VSEL takes %v30 before %v28 instead.
+define <4 x float> @f23(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f23:
+; CHECK: vpkg [[REG:%v[0-9]+]],
+; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ugt <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test uge selects: no VNO; VSEL takes %v30 before %v28 instead.
+define <4 x float> @f24(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f24:
+; CHECK: vpkg [[REG:%v[0-9]+]],
+; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp uge <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test ule selects: no VNO; VSEL takes %v30 before %v28 instead.
+define <4 x float> @f25(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f25:
+; CHECK: vpkg [[REG:%v[0-9]+]],
+; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ule <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test ult selects: no VNO; VSEL takes %v30 before %v28 instead.
+define <4 x float> @f26(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f26:
+; CHECK: vpkg [[REG:%v[0-9]+]],
+; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ult <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test ord selects: the VO result is used directly as the VSEL mask.
+define <4 x float> @f27(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f27:
+; CHECK: vo [[REG:%v[0-9]+]],
+; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ord <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test uno selects: VO but no VNO; VSEL takes %v30 before %v28 instead.
+define <4 x float> @f28(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f28:
+; CHECK: vo [[REG:%v[0-9]+]],
+; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp uno <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-const-05.ll b/llvm/test/CodeGen/SystemZ/vec-const-05.ll

new file mode 100644 (file)

index 0000000..c482833
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-const-05.ll
@@ -0,0 +1,47 @@
+; Test vector byte masks, v4f32 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test an all-zeros vector (VGBM with mask 0).
+define <4 x float> @f1() {
+; CHECK-LABEL: f1:
+; CHECK: vgbm %v24, 0
+; CHECK: br %r14
+  ret <4 x float> zeroinitializer
+}
+
+; Test an all-ones vector (VGBM with mask 65535, i.e. 0xffff).
+define <4 x float> @f2() {
+; CHECK-LABEL: f2:
+; CHECK: vgbm %v24, 65535
+; CHECK: br %r14
+  ret <4 x float> <float 0xffffffffe0000000, float 0xffffffffe0000000,
+                   float 0xffffffffe0000000, float 0xffffffffe0000000>
+}
+
+; Test a mixed vector (mask 0xc731, i.e. 50993).
+define <4 x float> @f3() {
+; CHECK-LABEL: f3:
+; CHECK: vgbm %v24, 50993
+; CHECK: br %r14
+  ret <4 x float> <float 0xffffe00000000000, float 0x381fffffe0000000,
+                   float 0x379fffe000000000, float 0x371fe00000000000>
+}
+
+; Test that undefs are treated as zero (mask 0xc031, i.e. 49201).
+define <4 x float> @f4() {
+; CHECK-LABEL: f4:
+; CHECK: vgbm %v24, 49201
+; CHECK: br %r14
+  ret <4 x float> <float 0xffffe00000000000, float undef,
+                   float 0x379fffe000000000, float 0x371fe00000000000>
+}
+
+; Test that we don't use VGBM if one of the bytes is not 0 or 0xff (here 0xfe).
+define <4 x float> @f5() {
+; CHECK-LABEL: f5:
+; CHECK-NOT: vgbm
+; CHECK: br %r14
+  ret <4 x float> <float 0xffffe00000000000, float 0x381fffffc0000000,
+                   float 0x379fffe000000000, float 0x371fe00000000000>
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-const-11.ll b/llvm/test/CodeGen/SystemZ/vec-const-11.ll

new file mode 100644 (file)

index 0000000..0c69b88
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-const-11.ll
@@ -0,0 +1,189 @@
+; Test vector replicates, v4f32 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a byte-granularity replicate with the lowest useful value (0x01).
+define <4 x float> @f1() {
+; CHECK-LABEL: f1:
+; CHECK: vrepib %v24, 1
+; CHECK: br %r14
+  ret <4 x float> <float 0x3820202020000000, float 0x3820202020000000,
+                   float 0x3820202020000000, float 0x3820202020000000>
+}
+
+; Test a byte-granularity replicate with an arbitrary value (0xc9).
+define <4 x float> @f2() {
+; CHECK-LABEL: f2:
+; CHECK: vrepib %v24, -55
+; CHECK: br %r14
+  ret <4 x float> <float 0xc139393920000000, float 0xc139393920000000,
+                   float 0xc139393920000000, float 0xc139393920000000>
+}
+
+; Test a byte-granularity replicate with the highest useful value (0xfe).
+define <4 x float> @f3() {
+; CHECK-LABEL: f3:
+; CHECK: vrepib %v24, -2
+; CHECK: br %r14
+  ret <4 x float> <float 0xc7dfdfdfc0000000, float 0xc7dfdfdfc0000000,
+                   float 0xc7dfdfdfc0000000, float 0xc7dfdfdfc0000000>
+}
+
+; Test a halfword-granularity replicate with the lowest useful value (0x0001).
+define <4 x float> @f4() {
+; CHECK-LABEL: f4:
+; CHECK: vrepih %v24, 1
+; CHECK: br %r14
+  ret <4 x float> <float 0x37a0001000000000, float 0x37a0001000000000,
+                   float 0x37a0001000000000, float 0x37a0001000000000>
+}
+
+; Test a halfword-granularity replicate with an arbitrary value (0x6432).
+define <4 x float> @f5() {
+; CHECK-LABEL: f5:
+; CHECK: vrepih %v24, 25650
+; CHECK: br %r14
+  ret <4 x float> <float 0x44864c8640000000, float 0x44864c8640000000,
+                   float 0x44864c8640000000, float 0x44864c8640000000>
+}
+
+; Test a halfword-granularity replicate with the highest useful value (0xfffe).
+define <4 x float> @f6() {
+; CHECK-LABEL: f6:
+; CHECK: vrepih %v24, -2
+; CHECK: br %r14
+  ret <4 x float> <float 0xffffdfffc0000000, float 0xffffdfffc0000000,
+                   float 0xffffdfffc0000000, float 0xffffdfffc0000000>
+}
+
+; Test a word-granularity replicate of 1, the lowest useful positive value.
+define <4 x float> @f7() {
+; CHECK-LABEL: f7:
+; CHECK: vrepif %v24, 1
+; CHECK: br %r14
+  ret <4 x float> <float 0x36a0000000000000, float 0x36a0000000000000,
+                   float 0x36a0000000000000, float 0x36a0000000000000>
+}
+
+; Test a word-granularity replicate with the highest in-range value (32767).
+define <4 x float> @f8() {
+; CHECK-LABEL: f8:
+; CHECK: vrepif %v24, 32767
+; CHECK: br %r14
+  ret <4 x float> <float 0x378fffc000000000, float 0x378fffc000000000,
+                   float 0x378fffc000000000, float 0x378fffc000000000>
+}
+
+; Test a word-granularity replicate with the next highest value (32768).
+; This cannot use VREPIF.
+define <4 x float> @f9() {
+; CHECK-LABEL: f9:
+; CHECK-NOT: vrepif
+; CHECK: br %r14
+  ret <4 x float> <float 0x3790000000000000, float 0x3790000000000000,
+                   float 0x3790000000000000, float 0x3790000000000000>
+}
+
+; Test a word-granularity replicate with the lowest in-range value (-32768).
+define <4 x float> @f10() {
+; CHECK-LABEL: f10:
+; CHECK: vrepif %v24, -32768
+; CHECK: br %r14
+  ret <4 x float> <float 0xfffff00000000000, float 0xfffff00000000000,
+                   float 0xfffff00000000000, float 0xfffff00000000000>
+}
+
+; Test a word-granularity replicate with the next lowest value (-32769).
+; This cannot use VREPIF.
+define <4 x float> @f11() {
+; CHECK-LABEL: f11:
+; CHECK-NOT: vrepif
+; CHECK: br %r14
+  ret <4 x float> <float 0xffffefffe0000000, float 0xffffefffe0000000,
+                   float 0xffffefffe0000000, float 0xffffefffe0000000>
+}
+
+; Test a word-granularity replicate of -2, the highest useful negative value.
+define <4 x float> @f12() {
+; CHECK-LABEL: f12:
+; CHECK: vrepif %v24, -2
+; CHECK: br %r14
+  ret <4 x float> <float 0xffffffffc0000000, float 0xffffffffc0000000,
+                   float 0xffffffffc0000000, float 0xffffffffc0000000>
+}
+
+; Test a doubleword-granularity replicate with the lowest useful positive
+; value, 1 (so the even elements are zero).
+define <4 x float> @f13() {
+; CHECK-LABEL: f13:
+; CHECK: vrepig %v24, 1
+; CHECK: br %r14
+  ret <4 x float> <float 0.0, float 0x36a0000000000000,
+                   float 0.0, float 0x36a0000000000000>
+}
+
+; Test a doubleword-granularity replicate of 32767, the highest in-range value.
+define <4 x float> @f14() {
+; CHECK-LABEL: f14:
+; CHECK: vrepig %v24, 32767
+; CHECK: br %r14
+  ret <4 x float> <float 0.0, float 0x378fffc000000000,
+                   float 0.0, float 0x378fffc000000000>
+}
+
+; Test a doubleword-granularity replicate with the next highest value (32768).
+; This cannot use VREPIG.
+define <4 x float> @f15() {
+; CHECK-LABEL: f15:
+; CHECK-NOT: vrepig
+; CHECK: br %r14
+  ret <4 x float> <float 0.0, float 0x3790000000000000,
+                   float 0.0, float 0x3790000000000000>
+}
+
+; Test a doubleword-granularity replicate of -32768, the lowest in-range value.
+define <4 x float> @f16() {
+; CHECK-LABEL: f16:
+; CHECK: vrepig %v24, -32768
+; CHECK: br %r14
+  ret <4 x float> <float 0xffffffffe0000000, float 0xfffff00000000000,
+                   float 0xffffffffe0000000, float 0xfffff00000000000>
+}
+
+; Test a doubleword-granularity replicate with the next lowest value (-32769).
+; This cannot use VREPIG.
+define <4 x float> @f17() {
+; CHECK-LABEL: f17:
+; CHECK-NOT: vrepig
+; CHECK: br %r14
+  ret <4 x float> <float 0xffffffffe0000000, float 0xffffefffe0000000,
+                   float 0xffffffffe0000000, float 0xffffefffe0000000>
+}
+
+; Test a doubleword-granularity replicate with the highest useful negative
+; value (-2).
+define <4 x float> @f18() {
+; CHECK-LABEL: f18:
+; CHECK: vrepig %v24, -2
+; CHECK: br %r14
+  ret <4 x float> <float 0xffffffffe0000000, float 0xffffffffc0000000,
+                   float 0xffffffffe0000000, float 0xffffffffc0000000>
+}
+
+; Repeat f14 with undefs (elements 0 and 1) optimistically treated as 0, 32767.
+define <4 x float> @f19() {
+; CHECK-LABEL: f19:
+; CHECK: vrepig %v24, 32767
+; CHECK: br %r14
+  ret <4 x float> <float undef, float undef,
+                   float 0.0, float 0x378fffc000000000>
+}
+
+; Repeat f18 with undefs (elements 1 and 2) optimistically treated as -2, -1.
+define <4 x float> @f20() {
+; CHECK-LABEL: f20:
+; CHECK: vrepig %v24, -2
+; CHECK: br %r14
+  ret <4 x float> <float 0xffffffffe0000000, float undef,
+                   float undef, float 0xffffffffc0000000>
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-const-17.ll b/llvm/test/CodeGen/SystemZ/vec-const-17.ll

new file mode 100644 (file)

index 0000000..1306eab
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-const-17.ll
@@ -0,0 +1,95 @@
+; Test vector replicates that use VECTOR GENERATE MASK, v4f32 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a word-granularity replicate with the lowest value that cannot use
+; VREPIF (32768, i.e. a single set bit at position 16).
+define <4 x float> @f1() {
+; CHECK-LABEL: f1:
+; CHECK: vgmf %v24, 16, 16
+; CHECK: br %r14
+  ret <4 x float> <float 0x3790000000000000, float 0x3790000000000000,
+                   float 0x3790000000000000, float 0x3790000000000000>
+}
+
+; Test a word-granularity replicate that has the lower 17 bits (15-31) set.
+define <4 x float> @f2() {
+; CHECK-LABEL: f2:
+; CHECK: vgmf %v24, 15, 31
+; CHECK: br %r14
+  ret <4 x float> <float 0x37affff000000000, float 0x37affff000000000,
+                   float 0x37affff000000000, float 0x37affff000000000>
+}
+
+; Test a word-granularity replicate that has the upper 15 bits (0-14) set.
+define <4 x float> @f3() {
+; CHECK-LABEL: f3:
+; CHECK: vgmf %v24, 0, 14
+; CHECK: br %r14
+  ret <4 x float> <float 0xffffc00000000000, float 0xffffc00000000000,
+                   float 0xffffc00000000000, float 0xffffc00000000000>
+}
+
+; Test a word-granularity replicate that has middle bits (2-8) set, i.e. 1.0.
+define <4 x float> @f4() {
+; CHECK-LABEL: f4:
+; CHECK: vgmf %v24, 2, 8
+; CHECK: br %r14
+  ret <4 x float> <float 0x3ff0000000000000, float 0x3ff0000000000000,
+                   float 0x3ff0000000000000, float 0x3ff0000000000000>
+}
+
+; Test a word-granularity replicate with a wrap-around mask (bits 9-31, 0-1).
+define <4 x float> @f5() {
+; CHECK-LABEL: f5:
+; CHECK: vgmf %v24, 9, 1
+; CHECK: br %r14
+  ret <4 x float> <float 0xc00fffffe0000000, float 0xc00fffffe0000000,
+                   float 0xc00fffffe0000000, float 0xc00fffffe0000000>
+}
+
+; Test a doubleword-granularity replicate with the lowest value that cannot
+; use VREPIG (32768, i.e. a single set bit at position 48).
+define <4 x float> @f6() {
+; CHECK-LABEL: f6:
+; CHECK: vgmg %v24, 48, 48
+; CHECK: br %r14
+  ret <4 x float> <float 0.0, float 0x3790000000000000,
+                   float 0.0, float 0x3790000000000000>
+}
+
+; Test a doubleword-granularity replicate with the lower 22 bits (42-63) set.
+define <4 x float> @f7() {
+; CHECK-LABEL: f7:
+; CHECK: vgmg %v24, 42, 63
+; CHECK: br %r14
+  ret <4 x float> <float 0.0, float 0x37ffffff80000000,
+                   float 0.0, float 0x37ffffff80000000>
+}
+
+; Test a doubleword-granularity replicate with the upper 45 bits (0-44) set.
+define <4 x float> @f8() {
+; CHECK-LABEL: f8:
+; CHECK: vgmg %v24, 0, 44
+; CHECK: br %r14
+  ret <4 x float> <float 0xffffffffe0000000, float 0xffff000000000000,
+                   float 0xffffffffe0000000, float 0xffff000000000000>
+}
+
+; Test a doubleword-granularity replicate with middle bits (34-41) set.
+define <4 x float> @f9() {
+; CHECK-LABEL: f9:
+; CHECK: vgmg %v24, 34, 41
+; CHECK: br %r14
+  ret <4 x float> <float 0.0, float 0x3ff8000000000000,
+                   float 0.0, float 0x3ff8000000000000>
+}
+
+; Test a doubleword-granularity replicate with a wrap-around mask (32-63, 0).
+define <4 x float> @f10() {
+; CHECK-LABEL: f10:
+; CHECK: vgmg %v24, 32, 0
+; CHECK: br %r14
+  ret <4 x float> <float 0x8000000000000000, float 0xffffffffe0000000,
+                   float 0x8000000000000000, float 0xffffffffe0000000>
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-conv-02.ll b/llvm/test/CodeGen/SystemZ/vec-conv-02.ll

new file mode 100644 (file)

index 0000000..ceccfc6
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-conv-02.ll
@@ -0,0 +1,13 @@
+; Test conversions between different-sized float elements.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test cases where both elements of a v2f64 are converted to f32s.
+define void @f1(<2 x double> %val, <2 x float> *%ptr) {
+; CHECK-LABEL: f1:
+; CHECK: vledb {{%v[0-9]+}}, %v24, 0, 0
+; CHECK: br %r14
+  %res = fptrunc <2 x double> %val to <2 x float>
+  store <2 x float> %res, <2 x float> *%ptr
+  ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-01.ll b/llvm/test/CodeGen/SystemZ/vec-move-01.ll

index f9ae13b3ba1118a0991e7e98771d97ec0b174f64..896d24a1d203bdccdbb8d0f3630e831193ab9c4c 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-move-01.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-01.ll
@@ -34,6 +34,14 @@ define <2 x i64> @f4(<2 x i64> %val1, <2 x i64> %val2) {
    ret <2 x i64> %val2
  }
  
+; Test v4f32 moves.
+define <4 x float> @f5(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f5:
+; CHECK: vlr %v24, %v26
+; CHECK: br %r14
+  ret <4 x float> %val2
+}
+
  ; Test v2f64 moves.
  define <2 x double> @f6(<2 x double> %val1, <2 x double> %val2) {
  ; CHECK-LABEL: f6:
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-02.ll b/llvm/test/CodeGen/SystemZ/vec-move-02.ll

index a8c6912f0c739f649315b77884f41174e85341ab..e43676055fada66a66a098a76829fbd459faa67d 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-move-02.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-02.ll
@@ -38,6 +38,15 @@ define <2 x i64> @f4(<2 x i64> *%ptr) {
    ret <2 x i64> %ret
  }
  
+; Test v4f32 loads.
+define <4 x float> @f5(<4 x float> *%ptr) {
+; CHECK-LABEL: f5:
+; CHECK: vl %v24, 0(%r2)
+; CHECK: br %r14
+  %ret = load <4 x float>, <4 x float> *%ptr
+  ret <4 x float> %ret
+}
+
  ; Test v2f64 loads.
  define <2 x double> @f6(<2 x double> *%ptr) {
  ; CHECK-LABEL: f6:
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-03.ll b/llvm/test/CodeGen/SystemZ/vec-move-03.ll

index abd7c939fbedf8e5c0617ea577579dbee6ae310c..1b1f96163a06f5fab3b6a663b95599fa760da450 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-move-03.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-03.ll
@@ -38,6 +38,15 @@ define void @f4(<2 x i64> %val, <2 x i64> *%ptr) {
    ret void
  }
  
+; Test v4f32 stores.
+define void @f5(<4 x float> %val, <4 x float> *%ptr) {
+; CHECK-LABEL: f5:
+; CHECK: vst %v24, 0(%r2)
+; CHECK: br %r14
+  store <4 x float> %val, <4 x float> *%ptr
+  ret void
+}
+
  ; Test v2f64 stores.
  define void @f6(<2 x double> %val, <2 x double> *%ptr) {
  ; CHECK-LABEL: f6:
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-04.ll b/llvm/test/CodeGen/SystemZ/vec-move-04.ll

index 4e75d21dc961488bf2fb01b91b86978f0c5d6572..27c9e5f71f403ac9304f99cdcf5d71ce8417b37c 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-move-04.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-04.ll
@@ -110,6 +110,36 @@ define <2 x i64> @f12(<2 x i64> %val, i64 %element, i32 %index) {
    ret <2 x i64> %ret
  }
  
+; Test v4f32 insertion into the first element.
+define <4 x float> @f13(<4 x float> %val, float %element) {
+; CHECK-LABEL: f13:
+; CHECK: vlgvf [[REG:%r[0-5]]], %v0, 0
+; CHECK: vlvgf %v24, [[REG]], 0
+; CHECK: br %r14
+  %ret = insertelement <4 x float> %val, float %element, i32 0
+  ret <4 x float> %ret
+}
+
+; Test v4f32 insertion into the last element.
+define <4 x float> @f14(<4 x float> %val, float %element) {
+; CHECK-LABEL: f14:
+; CHECK: vlgvf [[REG:%r[0-5]]], %v0, 0
+; CHECK: vlvgf %v24, [[REG]], 3
+; CHECK: br %r14
+  %ret = insertelement <4 x float> %val, float %element, i32 3
+  ret <4 x float> %ret
+}
+
+; Test v4f32 insertion into a variable element.
+define <4 x float> @f15(<4 x float> %val, float %element, i32 %index) {
+; CHECK-LABEL: f15:
+; CHECK: vlgvf [[REG:%r[0-5]]], %v0, 0
+; CHECK: vlvgf %v24, [[REG]], 0(%r2)
+; CHECK: br %r14
+  %ret = insertelement <4 x float> %val, float %element, i32 %index
+  ret <4 x float> %ret
+}
+
  ; Test v2f64 insertion into the first element.
  define <2 x double> @f16(<2 x double> %val, double %element) {
  ; CHECK-LABEL: f16:
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-05.ll b/llvm/test/CodeGen/SystemZ/vec-move-05.ll

index 234157a0abb7e48e07c1d1e574c2b6ac1edf81a6..99871196d685eb240f125eecf8b87c5a73897450 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-move-05.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-05.ll
@@ -150,6 +150,59 @@ define i64 @f16(<2 x i64> %val, i32 %index) {
    ret i64 %ret
  }
  
+; Test v4f32 extraction of element 0.
+define float @f17(<4 x float> %val) {
+; CHECK-LABEL: f17:
+; CHECK: vlr %v0, %v24
+; CHECK: br %r14
+  %ret = extractelement <4 x float> %val, i32 0
+  ret float %ret
+}
+
+; Test v4f32 extraction of element 1.
+define float @f18(<4 x float> %val) {
+; CHECK-LABEL: f18:
+; CHECK: vrepf %v0, %v24, 1
+; CHECK: br %r14
+  %ret = extractelement <4 x float> %val, i32 1
+  ret float %ret
+}
+
+; Test v4f32 extraction of element 2.
+define float @f19(<4 x float> %val) {
+; CHECK-LABEL: f19:
+; CHECK: vrepf %v0, %v24, 2
+; CHECK: br %r14
+  %ret = extractelement <4 x float> %val, i32 2
+  ret float %ret
+}
+
+; Test v4f32 extraction of element 3.
+define float @f20(<4 x float> %val) {
+; CHECK-LABEL: f20:
+; CHECK: vrepf %v0, %v24, 3
+; CHECK: br %r14
+  %ret = extractelement <4 x float> %val, i32 3
+  ret float %ret
+}
+
+; Test v4f32 extraction of an absurd element number.  This must compile
+; but we don't care what it does.
+define float @f21(<4 x float> %val) {
+  %ret = extractelement <4 x float> %val, i32 100000
+  ret float %ret
+}
+
+; Test v4f32 extraction of a variable element.
+define float @f22(<4 x float> %val, i32 %index) {
+; CHECK-LABEL: f22:
+; CHECK: vlgvf [[REG:%r[0-5]]], %v24, 0(%r2)
+; CHECK: vlvgf %v0, [[REG]], 0
+; CHECK: br %r14
+  %ret = extractelement <4 x float> %val, i32 %index
+  ret float %ret
+}
+
  ; Test v2f64 extraction of the first element.
  define double @f23(<2 x double> %val) {
  ; CHECK-LABEL: f23:
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-07.ll b/llvm/test/CodeGen/SystemZ/vec-move-07.ll

index 0cb8a0a1dfc5ed5e63fcc95f71e9518b7c0d6ec3..b0d06f782dee7a5f497ee8846452e54c09ad6473 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-move-07.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-07.ll
@@ -38,7 +38,16 @@ define <2 x i64> @f4(i64 %val) {
    ret <2 x i64> %ret
  }
  
-; Test v2f64, which is just a move.
+; Test v4f32, which is just a move.
+define <4 x float> @f5(float %val) {
+; CHECK-LABEL: f5:
+; CHECK: vlr %v24, %v0
+; CHECK: br %r14
+  %ret = insertelement <4 x float> undef, float %val, i32 0
+  ret <4 x float> %ret
+}
+
+; Likewise v2f64.
  define <2 x double> @f6(double %val) {
  ; CHECK-LABEL: f6:
  ; CHECK: vlr %v24, %v0
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-08.ll b/llvm/test/CodeGen/SystemZ/vec-move-08.ll

index 6148529c225d60980be63b7c247d0749854e2eb2..5396a1edec6abc0384d1baf0effbd110ee664f22 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-move-08.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-08.ll
@@ -214,6 +214,59 @@ define <2 x i64> @f20(<2 x i64> %val, i64 *%ptr, i32 %index) {
    ret <2 x i64> %ret
  }
  
+; Test v4f32 insertion into the first element.
+define <4 x float> @f21(<4 x float> %val, float *%ptr) {
+; CHECK-LABEL: f21:
+; CHECK: vlef %v24, 0(%r2), 0
+; CHECK: br %r14
+  %element = load float, float *%ptr
+  %ret = insertelement <4 x float> %val, float %element, i32 0
+  ret <4 x float> %ret
+}
+
+; Test v4f32 insertion into the last element.
+define <4 x float> @f22(<4 x float> %val, float *%ptr) {
+; CHECK-LABEL: f22:
+; CHECK: vlef %v24, 0(%r2), 3
+; CHECK: br %r14
+  %element = load float, float *%ptr
+  %ret = insertelement <4 x float> %val, float %element, i32 3
+  ret <4 x float> %ret
+}
+
+; Test v4f32 insertion with the highest in-range offset.
+define <4 x float> @f23(<4 x float> %val, float *%base) {
+; CHECK-LABEL: f23:
+; CHECK: vlef %v24, 4092(%r2), 2
+; CHECK: br %r14
+  %ptr = getelementptr float, float *%base, i32 1023
+  %element = load float, float *%ptr
+  %ret = insertelement <4 x float> %val, float %element, i32 2
+  ret <4 x float> %ret
+}
+
+; Test v4f32 insertion with the first out-of-range offset.
+define <4 x float> @f24(<4 x float> %val, float *%base) {
+; CHECK-LABEL: f24:
+; CHECK: aghi %r2, 4096
+; CHECK: vlef %v24, 0(%r2), 1
+; CHECK: br %r14
+  %ptr = getelementptr float, float *%base, i32 1024
+  %element = load float, float *%ptr
+  %ret = insertelement <4 x float> %val, float %element, i32 1
+  ret <4 x float> %ret
+}
+
+; Test v4f32 insertion into a variable element.
+define <4 x float> @f25(<4 x float> %val, float *%ptr, i32 %index) {
+; CHECK-LABEL: f25:
+; CHECK-NOT: vlef
+; CHECK: br %r14
+  %element = load float, float *%ptr
+  %ret = insertelement <4 x float> %val, float %element, i32 %index
+  ret <4 x float> %ret
+}
+
  ; Test v2f64 insertion into the first element.
  define <2 x double> @f26(<2 x double> %val, double *%ptr) {
  ; CHECK-LABEL: f26:
@@ -336,6 +389,34 @@ define <2 x i64> @f35(<2 x i64> %val, <2 x i64> %index, i64 %base) {
    ret <2 x i64> %ret
  }
  
+; Test a v4f32 gather of the first element.
+define <4 x float> @f36(<4 x float> %val, <4 x i32> %index, i64 %base) {
+; CHECK-LABEL: f36:
+; CHECK: vgef %v24, 0(%v26,%r2), 0
+; CHECK: br %r14
+  %elem = extractelement <4 x i32> %index, i32 0
+  %ext = zext i32 %elem to i64
+  %add = add i64 %base, %ext
+  %ptr = inttoptr i64 %add to float *
+  %element = load float, float *%ptr
+  %ret = insertelement <4 x float> %val, float %element, i32 0
+  ret <4 x float> %ret
+}
+
+; Test a v4f32 gather of the last element.
+define <4 x float> @f37(<4 x float> %val, <4 x i32> %index, i64 %base) {
+; CHECK-LABEL: f37:
+; CHECK: vgef %v24, 0(%v26,%r2), 3
+; CHECK: br %r14
+  %elem = extractelement <4 x i32> %index, i32 3
+  %ext = zext i32 %elem to i64
+  %add = add i64 %base, %ext
+  %ptr = inttoptr i64 %add to float *
+  %element = load float, float *%ptr
+  %ret = insertelement <4 x float> %val, float %element, i32 3
+  ret <4 x float> %ret
+}
+
  ; Test a v2f64 gather of the first element.
  define <2 x double> @f38(<2 x double> %val, <2 x i64> %index, i64 %base) {
  ; CHECK-LABEL: f38:
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-09.ll b/llvm/test/CodeGen/SystemZ/vec-move-09.ll

index 78c5454fb55103512adb4b4550a36147116ef6c8..5a53a2d6a198f74cfac3914fa1546fdfdfa79e25 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-move-09.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-09.ll
@@ -236,6 +236,33 @@ define <2 x i64> @f26(<2 x i64> %val, i32 %index) {
    ret <2 x i64> %ret
  }
  
+; Test v4f32 insertion of 0 into the first element.
+define <4 x float> @f27(<4 x float> %val) {
+; CHECK-LABEL: f27:
+; CHECK: vleif %v24, 0, 0
+; CHECK: br %r14
+  %ret = insertelement <4 x float> %val, float 0.0, i32 0
+  ret <4 x float> %ret
+}
+
+; Test v4f32 insertion of 0 into the last element.
+define <4 x float> @f28(<4 x float> %val) {
+; CHECK-LABEL: f28:
+; CHECK: vleif %v24, 0, 3
+; CHECK: br %r14
+  %ret = insertelement <4 x float> %val, float 0.0, i32 3
+  ret <4 x float> %ret
+}
+
+; Test v4f32 insertion of a nonzero value.
+define <4 x float> @f29(<4 x float> %val) {
+; CHECK-LABEL: f29:
+; CHECK-NOT: vleif
+; CHECK: br %r14
+  %ret = insertelement <4 x float> %val, float 1.0, i32 1
+  ret <4 x float> %ret
+}
+
  ; Test v2f64 insertion of 0 into the first element.
  define <2 x double> @f30(<2 x double> %val) {
  ; CHECK-LABEL: f30:
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-10.ll b/llvm/test/CodeGen/SystemZ/vec-move-10.ll

index bc854214bbd4e5da3fa03bce0356ebf631e946c7..894d0c2b41fa06ace4b5586fd6d6c9453675133a 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-move-10.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-10.ll
@@ -258,6 +258,70 @@ define void @f24(<2 x i64> %val, i64 *%ptr, i32 %index) {
    ret void
  }
  
+; Test v4f32 extraction from the first element.
+define void @f25(<4 x float> %val, float *%ptr) {
+; CHECK-LABEL: f25:
+; CHECK: vstef %v24, 0(%r2), 0
+; CHECK: br %r14
+  %element = extractelement <4 x float> %val, i32 0
+  store float %element, float *%ptr
+  ret void
+}
+
+; Test v4f32 extraction from the last element.
+define void @f26(<4 x float> %val, float *%ptr) {
+; CHECK-LABEL: f26:
+; CHECK: vstef %v24, 0(%r2), 3
+; CHECK: br %r14
+  %element = extractelement <4 x float> %val, i32 3
+  store float %element, float *%ptr
+  ret void
+}
+
+; Test v4f32 extraction of an invalid element.  This must compile,
+; but we don't care what it does.
+define void @f27(<4 x float> %val, float *%ptr) {
+; CHECK-LABEL: f27:
+; CHECK-NOT: vstef %v24, 0(%r2), 4
+; CHECK: br %r14
+  %element = extractelement <4 x float> %val, i32 4
+  store float %element, float *%ptr
+  ret void
+}
+
+; Test v4f32 extraction with the highest in-range offset.
+define void @f28(<4 x float> %val, float *%base) {
+; CHECK-LABEL: f28:
+; CHECK: vstef %v24, 4092(%r2), 2
+; CHECK: br %r14
+  %ptr = getelementptr float, float *%base, i32 1023
+  %element = extractelement <4 x float> %val, i32 2
+  store float %element, float *%ptr
+  ret void
+}
+
+; Test v4f32 extraction with the first out-of-range offset.
+define void @f29(<4 x float> %val, float *%base) {
+; CHECK-LABEL: f29:
+; CHECK: aghi %r2, 4096
+; CHECK: vstef %v24, 0(%r2), 1
+; CHECK: br %r14
+  %ptr = getelementptr float, float *%base, i32 1024
+  %element = extractelement <4 x float> %val, i32 1
+  store float %element, float *%ptr
+  ret void
+}
+
+; Test v4f32 extraction from a variable element.
+define void @f30(<4 x float> %val, float *%ptr, i32 %index) {
+; CHECK-LABEL: f30:
+; CHECK-NOT: vstef
+; CHECK: br %r14
+  %element = extractelement <4 x float> %val, i32 %index
+  store float %element, float *%ptr
+  ret void
+}
+
  ; Test v2f64 extraction from the first element.
  define void @f32(<2 x double> %val, double *%ptr) {
  ; CHECK-LABEL: f32:
@@ -380,6 +444,34 @@ define void @f41(<2 x i64> %val, <2 x i64> %index, i64 %base) {
    ret void
  }
  
+; Test a v4f32 scatter of the first element.
+define void @f42(<4 x float> %val, <4 x i32> %index, i64 %base) {
+; CHECK-LABEL: f42:
+; CHECK: vscef %v24, 0(%v26,%r2), 0
+; CHECK: br %r14
+  %elem = extractelement <4 x i32> %index, i32 0
+  %ext = zext i32 %elem to i64
+  %add = add i64 %base, %ext
+  %ptr = inttoptr i64 %add to float *
+  %element = extractelement <4 x float> %val, i32 0
+  store float %element, float *%ptr
+  ret void
+}
+
+; Test a v4f32 scatter of the last element.
+define void @f43(<4 x float> %val, <4 x i32> %index, i64 %base) {
+; CHECK-LABEL: f43:
+; CHECK: vscef %v24, 0(%v26,%r2), 3
+; CHECK: br %r14
+  %elem = extractelement <4 x i32> %index, i32 3
+  %ext = zext i32 %elem to i64
+  %add = add i64 %base, %ext
+  %ptr = inttoptr i64 %add to float *
+  %element = extractelement <4 x float> %val, i32 3
+  store float %element, float *%ptr
+  ret void
+}
+
  ; Test a v2f64 scatter of the first element.
  define void @f44(<2 x double> %val, <2 x i64> %index, i64 %base) {
  ; CHECK-LABEL: f44:
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-11.ll b/llvm/test/CodeGen/SystemZ/vec-move-11.ll

index 07a037ccdf256ff46a35dbaaf2ae519c54c6aa1f..fd9c3d3559f0d65b321aa00fb679ee84f33a148e 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-move-11.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-11.ll
@@ -92,6 +92,15 @@ define <2 x i64> @f10(i64 %val) {
    ret <2 x i64> %ret
  }
  
+; Test v4f32 insertion into an undef.
+define <4 x float> @f11(float %val) {
+; CHECK-LABEL: f11:
+; CHECK: vrepf %v24, %v0, 0
+; CHECK: br %r14
+  %ret = insertelement <4 x float> undef, float %val, i32 2
+  ret <4 x float> %ret
+}
+
  ; Test v2f64 insertion into an undef.
  define <2 x double> @f12(double %val) {
  ; CHECK-LABEL: f12:
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-12.ll b/llvm/test/CodeGen/SystemZ/vec-move-12.ll

index 94b186f46e57f7aa0a501147cd9a9d3dddc80704..bc8ff97f8057f69b4d8a2e9ca160bbc82f97465b 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-move-12.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-12.ll
@@ -102,6 +102,16 @@ define <2 x i64> @f10(i64 *%ptr) {
    ret <2 x i64> %ret
  }
  
+; Test v4f32 insertion into an undef.
+define <4 x float> @f11(float *%ptr) {
+; CHECK-LABEL: f11:
+; CHECK: vlrepf %v24, 0(%r2)
+; CHECK: br %r14
+  %val = load float, float *%ptr
+  %ret = insertelement <4 x float> undef, float %val, i32 2
+  ret <4 x float> %ret
+}
+
  ; Test v2f64 insertion into an undef.
  define <2 x double> @f12(double *%ptr) {
  ; CHECK-LABEL: f12:
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-13.ll b/llvm/test/CodeGen/SystemZ/vec-move-13.ll

index c50c94afb6cfccc202b5bdc4529e86489733de6b..4ad8e3f521005dc3d2dab339b62e522bd0bfecd2 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-move-13.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-13.ll
@@ -46,6 +46,17 @@ define <2 x i64> @f4(i64 %val) {
    ret <2 x i64> %ret
  }
  
+; Test v4f32 insertion into 0.
+define <4 x float> @f5(float %val) {
+; CHECK-LABEL: f5:
+; CHECK: vgbm [[ZERO:%v[0-9]+]], 0
+; CHECK: vmrhf [[REG:%v[0-9]+]], [[ZERO]], %v0
+; CHECK: vmrhg %v24, [[ZERO]], [[REG]]
+; CHECK: br %r14
+  %ret = insertelement <4 x float> zeroinitializer, float %val, i32 3
+  ret <4 x float> %ret
+}
+
  ; Test v2f64 insertion into 0.
  define <2 x double> @f6(double %val) {
  ; CHECK-LABEL: f6:
@@ -55,3 +66,4 @@ define <2 x double> @f6(double %val) {
    %ret = insertelement <2 x double> zeroinitializer, double %val, i32 1
    ret <2 x double> %ret
  }
+
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-14.ll b/llvm/test/CodeGen/SystemZ/vec-move-14.ll

index b48f2175ebead70c5362883cf6d3c9939710cfdc..e41eb9da034653dd9f21ed6b6c19a6df05af7974 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-move-14.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-14.ll
@@ -75,6 +75,16 @@ define <2 x i64> @f7(i64 *%ptr) {
    ret <2 x i64> %ret
  }
  
+; Test VLLEZF with a float.
+define <4 x float> @f8(float *%ptr) {
+; CHECK-LABEL: f8:
+; CHECK: vllezf %v24, 0(%r2)
+; CHECK: br %r14
+  %val = load float, float *%ptr
+  %ret = insertelement <4 x float> zeroinitializer, float %val, i32 1
+  ret <4 x float> %ret
+}
+
  ; Test VLLEZG with a double.
  define <2 x double> @f9(double *%ptr) {
  ; CHECK-LABEL: f9:
diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-01.ll b/llvm/test/CodeGen/SystemZ/vec-perm-01.ll

index c68958a98a254c3841c08e102348be44adcc4c76..4beec05eaece3864ab0e32b618fb73f985843d23 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-perm-01.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-perm-01.ll
@@ -123,6 +123,37 @@ define <2 x i64> @f11(<2 x i64> %val) {
    ret <2 x i64> %ret
  }
  
+; Test v4f32 splat of the first element.
+define <4 x float> @f12(<4 x float> %val) {
+; CHECK-LABEL: f12:
+; CHECK: vrepf %v24, %v24, 0
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val, <4 x float> undef,
+                       <4 x i32> zeroinitializer
+  ret <4 x float> %ret
+}
+
+; Test v4f32 splat of the last element.
+define <4 x float> @f13(<4 x float> %val) {
+; CHECK-LABEL: f13:
+; CHECK: vrepf %v24, %v24, 3
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val, <4 x float> undef,
+                       <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x float> %ret
+}
+
+; Test v4f32 splat of an arbitrary element, using the second operand of
+; the shufflevector.
+define <4 x float> @f14(<4 x float> %val) {
+; CHECK-LABEL: f14:
+; CHECK: vrepf %v24, %v24, 1
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> undef, <4 x float> %val,
+                       <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+  ret <4 x float> %ret
+}
+
  ; Test v2f64 splat of the first element.
  define <2 x double> @f15(<2 x double> %val) {
  ; CHECK-LABEL: f15:
diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-02.ll b/llvm/test/CodeGen/SystemZ/vec-perm-02.ll

index 7158990174bd83076ba163f57c1e072654f3245e..e5c6df8e955a80b48789961784c0371f72e7a039 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-perm-02.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-perm-02.ll
@@ -143,6 +143,40 @@ define <2 x i64> @f11(i64 %scalar) {
    ret <2 x i64> %ret
  }
  
+; Test v4f32 splat of the first element.
+define <4 x float> @f12(float %scalar) {
+; CHECK-LABEL: f12:
+; CHECK: vrepf %v24, %v0, 0
+; CHECK: br %r14
+  %val = insertelement <4 x float> undef, float %scalar, i32 0
+  %ret = shufflevector <4 x float> %val, <4 x float> undef,
+                       <4 x i32> zeroinitializer
+  ret <4 x float> %ret
+}
+
+; Test v4f32 splat of the last element.
+define <4 x float> @f13(float %scalar) {
+; CHECK-LABEL: f13:
+; CHECK: vrepf %v24, %v0, 0
+; CHECK: br %r14
+  %val = insertelement <4 x float> undef, float %scalar, i32 3
+  %ret = shufflevector <4 x float> %val, <4 x float> undef,
+                       <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x float> %ret
+}
+
+; Test v4f32 splat of an arbitrary element, using the second operand of
+; the shufflevector.
+define <4 x float> @f14(float %scalar) {
+; CHECK-LABEL: f14:
+; CHECK: vrepf %v24, %v0, 0
+; CHECK: br %r14
+  %val = insertelement <4 x float> undef, float %scalar, i32 1
+  %ret = shufflevector <4 x float> undef, <4 x float> %val,
+                       <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+  ret <4 x float> %ret
+}
+
  ; Test v2f64 splat of the first element.
  define <2 x double> @f15(double %scalar) {
  ; CHECK-LABEL: f15:
diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-03.ll b/llvm/test/CodeGen/SystemZ/vec-perm-03.ll

index c30a87601a4385b795b0fadefa88775b8cda792d..663815549c33cbd59eef1e63da58b8b3fdb5735b 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-perm-03.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-perm-03.ll
@@ -158,6 +158,44 @@ define <2 x i64> @f12(i64 *%base) {
    ret <2 x i64> %ret
  }
  
+; Test a v4f32 replicating load with no offset.
+define <4 x float> @f13(float *%ptr) {
+; CHECK-LABEL: f13:
+; CHECK: vlrepf %v24, 0(%r2)
+; CHECK: br %r14
+  %scalar = load float, float *%ptr
+  %val = insertelement <4 x float> undef, float %scalar, i32 0
+  %ret = shufflevector <4 x float> %val, <4 x float> undef,
+                       <4 x i32> zeroinitializer
+  ret <4 x float> %ret
+}
+
+; Test a v4f32 replicating load with the maximum in-range offset.
+define <4 x float> @f14(float *%base) {
+; CHECK-LABEL: f14:
+; CHECK: vlrepf %v24, 4092(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr float, float *%base, i64 1023
+  %scalar = load float, float *%ptr
+  %val = insertelement <4 x float> undef, float %scalar, i32 0
+  %ret = shufflevector <4 x float> %val, <4 x float> undef,
+                       <4 x i32> zeroinitializer
+  ret <4 x float> %ret
+}
+
+; Test a v4f32 replicating load with the first out-of-range offset.
+define <4 x float> @f15(float *%base) {
+; CHECK-LABEL: f15:
+; CHECK: aghi %r2, 4096
+; CHECK: vlrepf %v24, 0(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr float, float *%base, i64 1024
+  %scalar = load float, float *%ptr
+  %val = insertelement <4 x float> undef, float %scalar, i32 0
+  %ret = shufflevector <4 x float> %val, <4 x float> undef,
+                       <4 x i32> zeroinitializer
+  ret <4 x float> %ret
+}
  
  ; Test a v2f64 replicating load with no offset.
  define <2 x double> @f16(double *%ptr) {
diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-04.ll b/llvm/test/CodeGen/SystemZ/vec-perm-04.ll

index ca04fdf691321910f2c0251c66988b03da22d57e..0df6f4fbb0126e09f8ba1571253432e2daebd855 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-perm-04.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-perm-04.ll
@@ -159,6 +159,26 @@ define <2 x i64> @f13(<2 x i64> %val1, <2 x i64> %val2) {
    ret <2 x i64> %ret
  }
  
+; Test a canonical v4f32 merge high.
+define <4 x float> @f14(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f14:
+; CHECK: vmrhf %v24, %v24, %v26
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
+                       <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x float> %ret
+}
+
+; Test a reversed v4f32 merge high.
+define <4 x float> @f15(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f15:
+; CHECK: vmrhf %v24, %v26, %v24
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
+                       <4 x i32> <i32 4, i32 0, i32 5, i32 1>
+  ret <4 x float> %ret
+}
+
  ; Test a canonical v2f64 merge high.
  define <2 x double> @f16(<2 x double> %val1, <2 x double> %val2) {
  ; CHECK-LABEL: f16:
diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-05.ll b/llvm/test/CodeGen/SystemZ/vec-perm-05.ll

index f4a46ff4e279827c1a5960bb020bd262ccbf9c32..b585cefbf84551c2bf880ce531e7e6a8452f3736 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-perm-05.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-perm-05.ll
@@ -159,6 +159,26 @@ define <2 x i64> @f13(<2 x i64> %val1, <2 x i64> %val2) {
    ret <2 x i64> %ret
  }
  
+; Test a canonical v4f32 merge low.
+define <4 x float> @f14(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f14:
+; CHECK: vmrlf %v24, %v24, %v26
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
+                       <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  ret <4 x float> %ret
+}
+
+; Test a reversed v4f32 merge low.
+define <4 x float> @f15(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f15:
+; CHECK: vmrlf %v24, %v26, %v24
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
+                       <4 x i32> <i32 6, i32 2, i32 7, i32 3>
+  ret <4 x float> %ret
+}
+
  ; Test a canonical v2f64 merge low.
  define <2 x double> @f16(<2 x double> %val1, <2 x double> %val2) {
  ; CHECK-LABEL: f16:
diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-06.ll b/llvm/test/CodeGen/SystemZ/vec-perm-06.ll

index 298fc60e8519e3e741b3610431f8b1939b29da62..835276a36725a73cf08888faccbdae6a45e8a529 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-perm-06.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-perm-06.ll
@@ -138,3 +138,23 @@ define <4 x i32> @f11(<4 x i32> %val1, <4 x i32> %val2) {
                         <4 x i32> <i32 5, i32 7, i32 1, i32 3>
    ret <4 x i32> %ret
  }
+
+; Test a canonical v4f32 pack.
+define <4 x float> @f12(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f12:
+; CHECK: vpkg %v24, %v24, %v26
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
+                       <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  ret <4 x float> %ret
+}
+
+; Test a reversed v4f32 pack.
+define <4 x float> @f13(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f13:
+; CHECK: vpkg %v24, %v26, %v24
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
+                       <4 x i32> <i32 5, i32 7, i32 1, i32 3>
+  ret <4 x float> %ret
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-07.ll b/llvm/test/CodeGen/SystemZ/vec-perm-07.ll

index 40ca39955243a475cb326f2b90d35478325200eb..9a370af2c0e7fb8995aecabbe4b2bef6119ab950 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-perm-07.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-perm-07.ll
@@ -122,4 +122,24 @@ define <4 x i32> @f10(<4 x i32> %val1, <4 x i32> %val2) {
    ret <4 x i32> %ret
  }
  
+; Test a v4f32 shift with the lowest useful shift amount.
+define <4 x float> @f12(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f12:
+; CHECK: vsldb %v24, %v24, %v26, 4
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
+                       <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x float> %ret
+}
+
+; Test a v4f32 shift with the highest useful shift amount.
+define <4 x float> @f13(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f13:
+; CHECK: vsldb %v24, %v24, %v26, 12
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
+                       <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+  ret <4 x float> %ret
+}
+
  ; We use VPDI for v2i64 shuffles.
diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-08.ll b/llvm/test/CodeGen/SystemZ/vec-perm-08.ll

index b5220ab67126aac68688bf1d6e17a82d693d5d1b..a18ca7b7397543ed5c53a95f06ef976a740d3c6b 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-perm-08.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-perm-08.ll
@@ -129,6 +129,26 @@ define <2 x i64> @f11(<2 x i64> %val1, <2 x i64> %val2) {
    ret <2 x i64> %ret
  }
  
+; Test a high1/low2 permute for v4f32.
+define <4 x float> @f12(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f12:
+; CHECK: vpdi %v24, %v24, %v26, 1
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
+                       <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x float> %ret
+}
+
+; Test a low2/high1 permute for v4f32.
+define <4 x float> @f13(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f13:
+; CHECK: vpdi %v24, %v26, %v24, 4
+; CHECK: br %r14
+  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
+                       <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x float> %ret
+}
+
  ; Test a high1/low2 permute for v2f64.
  define <2 x double> @f14(<2 x double> %val1, <2 x double> %val2) {
  ; CHECK-LABEL: f14:
diff --git a/llvm/test/CodeGen/SystemZ/vec-sub-01.ll b/llvm/test/CodeGen/SystemZ/vec-sub-01.ll

index 24d4ba5a2bdc4a3d8cec045ffd354237cfdd5d83..aabf1c9be4a32eb8c13c60f216fe2a2713839d7a 100644 (file)
--- a/llvm/test/CodeGen/SystemZ/vec-sub-01.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-sub-01.ll
@@ -38,6 +38,33 @@ define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) {
    ret <2 x i64> %ret
  }
  
+; Test a v4f32 subtraction, as an example of an operation that needs to be
+; scalarized and reassembled.  At present there's an unnecessary move that
+; could be avoided with smarter ordering.  It also isn't important whether
+; the VREPFs use the result of the VLRs or use %v24 and %v26 directly.
+define <4 x float> @f5(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f5:
+; CHECK-DAG: vlr %v[[A1:[0-5]]], %v24
+; CHECK-DAG: vlr %v[[A2:[0-5]]], %v26
+; CHECK-DAG: vrepf %v[[B1:[0-5]]], %v[[A1]], 1
+; CHECK-DAG: vrepf %v[[B2:[0-5]]], %v[[A2]], 1
+; CHECK-DAG: vrepf %v[[C1:[0-5]]], %v[[A1]], 2
+; CHECK-DAG: vrepf %v[[C2:[0-5]]], %v[[A2]], 2
+; CHECK-DAG: vrepf %v[[D1:[0-5]]], %v[[A1]], 3
+; CHECK-DAG: vrepf %v[[D2:[0-5]]], %v[[A2]], 3
+; CHECK-DAG: ler %f[[A1copy:[0-5]]], %f[[A1]]
+; CHECK-DAG: sebr %f[[A1copy]], %f[[A2]]
+; CHECK-DAG: sebr %f[[B1]], %f[[B2]]
+; CHECK-DAG: sebr %f[[C1]], %f[[C2]]
+; CHECK-DAG: sebr %f[[D1]], %f[[D2]]
+; CHECK-DAG: vmrhf [[HIGH:%v[0-9]+]], %v[[A1copy]], %v[[B1]]
+; CHECK-DAG: vmrhf [[LOW:%v[0-9]+]], %v[[C1]], %v[[D1]]
+; CHECK: vmrhg %v24, [[HIGH]], [[LOW]]
+; CHECK: br %r14
+  %ret = fsub <4 x float> %val1, %val2
+  ret <4 x float> %ret
+}
+
  ; Test a v2f64 subtraction.
  define <2 x double> @f6(<2 x double> %dummy, <2 x double> %val1,
                          <2 x double> %val2) {
author	Ulrich Weigand <ulrich.weigand@de.ibm.com>
	Tue, 5 May 2015 19:27:45 +0000 (19:27 +0000)
committer	Ulrich Weigand <ulrich.weigand@de.ibm.com>
	Tue, 5 May 2015 19:27:45 +0000 (19:27 +0000)
llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h		patch \| blob \| history
llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp		patch \| blob \| history
llvm/lib/Target/SystemZ/SystemZCallingConv.td		patch \| blob \| history
llvm/lib/Target/SystemZ/SystemZISelLowering.cpp		patch \| blob \| history
llvm/lib/Target/SystemZ/SystemZISelLowering.h		patch \| blob \| history
llvm/lib/Target/SystemZ/SystemZInstrFormats.td		patch \| blob \| history
llvm/lib/Target/SystemZ/SystemZInstrVector.td		patch \| blob \| history
llvm/lib/Target/SystemZ/SystemZOperators.td		patch \| blob \| history
llvm/test/CodeGen/SystemZ/fp-move-09.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/fp-move-10.ll	[new file with mode: 0644]	patch \| blob
llvm/test/CodeGen/SystemZ/vec-cmp-05.ll	[new file with mode: 0644]	patch \| blob
llvm/test/CodeGen/SystemZ/vec-const-05.ll	[new file with mode: 0644]	patch \| blob
llvm/test/CodeGen/SystemZ/vec-const-11.ll	[new file with mode: 0644]	patch \| blob
llvm/test/CodeGen/SystemZ/vec-const-17.ll	[new file with mode: 0644]	patch \| blob
llvm/test/CodeGen/SystemZ/vec-conv-02.ll	[new file with mode: 0644]	patch \| blob
llvm/test/CodeGen/SystemZ/vec-move-01.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-move-02.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-move-03.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-move-04.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-move-05.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-move-07.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-move-08.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-move-09.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-move-10.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-move-11.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-move-12.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-move-13.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-move-14.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-perm-01.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-perm-02.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-perm-03.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-perm-04.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-perm-05.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-perm-06.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-perm-07.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-perm-08.ll		patch \| blob \| history
llvm/test/CodeGen/SystemZ/vec-sub-01.ll		patch \| blob \| history