[X86] Vectorcall Calling Convention - Adding CodeGen Complete Support

author Oren Ben Simhon <oren.ben.simhon@intel.com>

Wed, 21 Dec 2016 08:31:45 +0000 (08:31 +0000)

committer Oren Ben Simhon <oren.ben.simhon@intel.com>

Wed, 21 Dec 2016 08:31:45 +0000 (08:31 +0000)
author Oren Ben Simhon <oren.ben.simhon@intel.com>
Wed, 21 Dec 2016 08:31:45 +0000 (08:31 +0000)
committer Oren Ben Simhon <oren.ben.simhon@intel.com>
Wed, 21 Dec 2016 08:31:45 +0000 (08:31 +0000)
diff --git a/llvm/include/llvm/CodeGen/CallingConvLower.h b/llvm/include/llvm/CodeGen/CallingConvLower.h

index f7983c5..bfbd228 100644 (file)
--- a/llvm/include/llvm/CodeGen/CallingConvLower.h
+++ b/llvm/include/llvm/CodeGen/CallingConvLower.h
@@ -296,6 +296,12 @@ public:
    void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
                                CCAssignFn Fn);
  
+  /// The function will invoke AnalyzeFormalArguments.
+  void AnalyzeArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+                        CCAssignFn Fn) {
+    AnalyzeFormalArguments(Ins, Fn);
+  }
+
    /// AnalyzeReturn - Analyze the returned values of a return,
    /// incorporating info about the result values into this state.
    void AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -318,11 +324,22 @@ public:
                             SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
                             CCAssignFn Fn);
  
+  /// The function will invoke AnalyzeCallOperands.
+  void AnalyzeArguments(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        CCAssignFn Fn) {
+    AnalyzeCallOperands(Outs, Fn);
+  }
+
    /// AnalyzeCallResult - Analyze the return values of a call,
    /// incorporating info about the passed values into this state.
    void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
                           CCAssignFn Fn);
  
+  /// A shadow allocated register is a register that was allocated
+  /// but wasn't added to the location list (Locs).
+  /// \returns true if the register was allocated as shadow or false otherwise.
+  bool IsShadowAllocatedReg(unsigned Reg) const;
+
    /// AnalyzeCallResult - Same as above except it's specialized for calls which
    /// produce a single value.
    void AnalyzeCallResult(MVT VT, CCAssignFn Fn);
@@ -521,6 +538,37 @@ public:
                                  const SmallVectorImpl<ISD::InputArg> &Ins,
                                  CCAssignFn CalleeFn, CCAssignFn CallerFn);
  
+  /// The function runs an additional analysis pass over function arguments.
+  /// It will mark each argument with the attribute flag SecArgPass.
+  /// After running, it will sort the locs list.
+  template <class T>
+  void AnalyzeArgumentsSecondPass(const SmallVectorImpl<T> &Args,
+                                  CCAssignFn Fn) {
+    unsigned NumFirstPassLocs = Locs.size();
+
+    /// Creates similar argument list to \p Args in which each argument is
+    /// marked using SecArgPass flag.
+    SmallVector<T, 16> SecPassArg;
+    // SmallVector<ISD::InputArg, 16> SecPassArg;
+    for (auto Arg : Args) {
+      Arg.Flags.setSecArgPass();
+      SecPassArg.push_back(Arg);
+    }
+
+    // Run the second argument pass
+    AnalyzeArguments(SecPassArg, Fn);
+
+    // Sort the locations of the arguments according to their original position.
+    SmallVector<CCValAssign, 16> TmpArgLocs;
+    std::swap(TmpArgLocs, Locs);
+    auto B = TmpArgLocs.begin(), E = TmpArgLocs.end();
+    std::merge(B, B + NumFirstPassLocs, B + NumFirstPassLocs, E,
+               std::back_inserter(Locs),
+               [](const CCValAssign &A, const CCValAssign &B) -> bool {
+                 return A.getValNo() < B.getValNo();
+               });
+  }
+
  private:
    /// MarkAllocated - Mark a register and all of its aliases as allocated.
    void MarkAllocated(unsigned Reg);
diff --git a/llvm/include/llvm/Target/TargetCallingConv.h b/llvm/include/llvm/Target/TargetCallingConv.h

index 19d8917..be09236 100644 (file)
--- a/llvm/include/llvm/Target/TargetCallingConv.h
+++ b/llvm/include/llvm/Target/TargetCallingConv.h
@@ -51,6 +51,15 @@ namespace ISD {
      static const uint64_t SwiftSelfOffs  = 14;
      static const uint64_t SwiftError     = 1ULL<<15; ///< Swift error parameter
      static const uint64_t SwiftErrorOffs = 15;
+    static const uint64_t Hva            = 1ULL << 16; ///< HVA field for
+                                                       ///< vectorcall
+    static const uint64_t HvaOffs        = 16;
+    static const uint64_t HvaStart       = 1ULL << 17; ///< HVA structure start
+                                                       ///< for vectorcall
+    static const uint64_t HvaStartOffs   = 17;
+    static const uint64_t SecArgPass     = 1ULL << 18; ///< Second argument
+                                                       ///< pass for vectorcall
+    static const uint64_t SecArgPassOffs = 18;
      static const uint64_t OrigAlign      = 0x1FULL<<27;
      static const uint64_t OrigAlignOffs  = 27;
      static const uint64_t ByValSize      = 0x3fffffffULL<<32; ///< Struct size
@@ -91,6 +100,15 @@ namespace ISD {
      bool isSwiftError() const { return Flags & SwiftError; }
      void setSwiftError() { Flags |= One << SwiftErrorOffs; }
  
+    bool isHva() const { return Flags & Hva; }
+    void setHva() { Flags |= One << HvaOffs; }
+
+    bool isHvaStart() const { return Flags & HvaStart; }
+    void setHvaStart() { Flags |= One << HvaStartOffs; }
+
+    bool isSecArgPass() const { return Flags & SecArgPass; }
+    void setSecArgPass() { Flags |= One << SecArgPassOffs; }
+
      bool isNest()      const { return Flags & Nest; }
      void setNest()     { Flags |= One << NestOffs; }
  
diff --git a/llvm/lib/CodeGen/CallingConvLower.cpp b/llvm/lib/CodeGen/CallingConvLower.cpp

index 7d67bcf..2e33f14 100644 (file)
--- a/llvm/lib/CodeGen/CallingConvLower.cpp
+++ b/llvm/lib/CodeGen/CallingConvLower.cpp
@@ -23,6 +23,8 @@
  #include "llvm/Target/TargetLowering.h"
  #include "llvm/Target/TargetRegisterInfo.h"
  #include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+
  using namespace llvm;
  
  CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf,
@@ -64,6 +66,22 @@ void CCState::MarkAllocated(unsigned Reg) {
      UsedRegs[*AI/32] |= 1 << (*AI&31);
  }
  
+bool CCState::IsShadowAllocatedReg(unsigned Reg) const {
+  if (!isAllocated(Reg))
+    return false;
+
+  for (auto const &ValAssign : Locs) {
+    if (ValAssign.isRegLoc()) {
+      for (MCRegAliasIterator AI(ValAssign.getLocReg(), &TRI, true);
+           AI.isValid(); ++AI) {
+        if (*AI == Reg)
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
  /// Analyze an array of argument values,
  /// incorporating info about the formals into this state.
  void
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

index 50ddc4b..da68fe8 100644 (file)
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7732,8 +7732,19 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
          Flags.setZExt();
        if (Args[i].isSExt)
          Flags.setSExt();
-      if (Args[i].isInReg)
+      if (Args[i].isInReg) {
+        // If we are using vectorcall calling convention, a structure that is
+        // passed InReg - is surely an HVA
+        if (CLI.CallConv == CallingConv::X86_VectorCall &&
+            isa<StructType>(FinalType)) {
+          // The first value of a structure is marked
+          if (0 == Value)
+            Flags.setHvaStart();
+          Flags.setHva();
+        }
+        // Set InReg Flag
          Flags.setInReg();
+      }
        if (Args[i].isSRet)
          Flags.setSRet();
        if (Args[i].isSwiftSelf)
@@ -8019,8 +8030,19 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
          Flags.setZExt();
        if (F.getAttributes().hasAttribute(Idx, Attribute::SExt))
          Flags.setSExt();
-      if (F.getAttributes().hasAttribute(Idx, Attribute::InReg))
+      if (F.getAttributes().hasAttribute(Idx, Attribute::InReg)) {
+        // If we are using vectorcall calling convention, a structure that is
+        // passed InReg - is surely an HVA
+        if (F.getCallingConv() == CallingConv::X86_VectorCall &&
+            isa<StructType>(I->getType())) {
+          // The first value of a structure is marked
+          if (0 == Value)
+            Flags.setHvaStart();
+          Flags.setHva();
+        }
+        // Set InReg Flag
          Flags.setInReg();
+      }
        if (F.getAttributes().hasAttribute(Idx, Attribute::StructRet))
          Flags.setSRet();
        if (F.getAttributes().hasAttribute(Idx, Attribute::SwiftSelf))
diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp

index 1bfe225..ae3f5b5 100644 (file)
--- a/llvm/lib/Target/X86/X86CallingConv.cpp
+++ b/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -13,6 +13,7 @@
  //===----------------------------------------------------------------------===//\r
  \r
  #include "MCTargetDesc/X86MCTargetDesc.h"\r
+#include "X86Subtarget.h"\r
  #include "llvm/CodeGen/CallingConvLower.h"\r
  #include "llvm/IR/CallingConv.h"\r
  \r
@@ -39,14 +40,14 @@ bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
    if (AvailableRegs.size() < RequiredGprsUponSplit)\r
      return false; // Not enough free registers - continue the search.\r
  \r
-  // Allocating the available registers\r
+  // Allocating the available registers.\r
    for (unsigned I = 0; I < RequiredGprsUponSplit; I++) {\r
  \r
-    // Marking the register as located\r
+    // Marking the register as located.\r
      unsigned Reg = State.AllocateReg(AvailableRegs[I]);\r
  \r
      // Since we previously made sure that 2 registers are available\r
-    // we expect that a real register number will be returned\r
+    // we expect that a real register number will be returned.\r
      assert(Reg && "Expecting a register will be available");\r
  \r
      // Assign the value to the allocated register\r
@@ -57,4 +58,151 @@ bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
    return true;\r
  }\r
  \r
+static ArrayRef<MCPhysReg> CC_X86_VectorCallGetSSEs(const MVT &ValVT) {\r
+  if (ValVT.is512BitVector()) {\r
+    static const MCPhysReg RegListZMM[] = {X86::ZMM0, X86::ZMM1, X86::ZMM2,\r
+                                           X86::ZMM3, X86::ZMM4, X86::ZMM5};\r
+    return RegListZMM;\r
+  }\r
+\r
+  if (ValVT.is256BitVector()) {\r
+    static const MCPhysReg RegListYMM[] = {X86::YMM0, X86::YMM1, X86::YMM2,\r
+                                           X86::YMM3, X86::YMM4, X86::YMM5};\r
+    return RegListYMM;\r
+  }\r
+\r
+  static const MCPhysReg RegListXMM[] = {X86::XMM0, X86::XMM1, X86::XMM2,\r
+                                         X86::XMM3, X86::XMM4, X86::XMM5};\r
+  return RegListXMM;\r
+}\r
+\r
+static ArrayRef<MCPhysReg> CC_X86_64_VectorCallGetGPRs() {\r
+  static const MCPhysReg RegListGPR[] = {X86::RCX, X86::RDX, X86::R8, X86::R9};\r
+  return RegListGPR;\r
+}\r
+\r
+static bool CC_X86_VectorCallAssignRegister(unsigned &ValNo, MVT &ValVT,\r
+                                            MVT &LocVT,\r
+                                            CCValAssign::LocInfo &LocInfo,\r
+                                            ISD::ArgFlagsTy &ArgFlags,\r
+                                            CCState &State) {\r
+\r
+  ArrayRef<MCPhysReg> RegList = CC_X86_VectorCallGetSSEs(ValVT);\r
+  bool Is64bit = static_cast<const X86Subtarget &>(\r
+                     State.getMachineFunction().getSubtarget())\r
+                     .is64Bit();\r
+\r
+  for (auto Reg : RegList) {\r
+    // If the register is not marked as allocated - assign to it.\r
+    if (!State.isAllocated(Reg)) {\r
+      unsigned AssigedReg = State.AllocateReg(Reg);\r
+      assert(AssigedReg == Reg && "Expecting a valid register allocation");\r
+      State.addLoc(\r
+          CCValAssign::getReg(ValNo, ValVT, AssigedReg, LocVT, LocInfo));\r
+      return true;\r
+    }\r
+    // If the register is marked as shadow allocated - assign to it.\r
+    if (Is64bit && State.IsShadowAllocatedReg(Reg)) {\r
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));\r
+      return true;\r
+    }\r
+  }\r
+\r
+  llvm_unreachable("Clang should ensure that hva marked vectors will have "\r
+                   "an available register.");\r
+  return false;\r
+}\r
+\r
+bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,\r
+                          CCValAssign::LocInfo &LocInfo,\r
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State) {\r
+  // On the second pass, go through the HVAs only.\r
+  if (ArgFlags.isSecArgPass()) {\r
+    if (ArgFlags.isHva())\r
+      return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,\r
+                                             ArgFlags, State);\r
+    return true;\r
+  }\r
+\r
+  // Process only vector types as defined by vectorcall spec:\r
+  // "A vector type is either a floating-point type, for example,\r
+  //  a float or double, or an SIMD vector type, for example, __m128 or __m256".\r
+  if (!(ValVT.isFloatingPoint() ||\r
+        (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {\r
+    // If R9 was already assigned it means that we are after the fourth element\r
+    // and because this is not an HVA / Vector type, we need to allocate\r
+    // shadow XMM register.\r
+    if (State.isAllocated(X86::R9)) {\r
+      // Assign shadow XMM register.\r
+      (void)State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT));\r
+    }\r
+\r
+    return false;\r
+  }\r
+\r
+  if (!ArgFlags.isHva() || ArgFlags.isHvaStart()) {\r
+    // Assign shadow GPR register.\r
+    (void)State.AllocateReg(CC_X86_64_VectorCallGetGPRs());\r
+\r
+    // Assign XMM register - (shadow for HVA and non-shadow for non HVA).\r
+    if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {\r
+      // In Vectorcall Calling convention, additional shadow stack can be\r
+      // created on top of the basic 32 bytes of win64.\r
+      // It can happen if the fifth or sixth argument is vector type or HVA.\r
+      // At that case for each argument a shadow stack of 8 bytes is allocated.\r
+      if (Reg == X86::XMM4 || Reg == X86::XMM5)\r
+        State.AllocateStack(8, 8);\r
+\r
+      if (!ArgFlags.isHva()) {\r
+        State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));\r
+        return true; // Allocated a register - Stop the search.\r
+      }\r
+    }\r
+  }\r
+\r
+  // If this is an HVA - Stop the search,\r
+  // otherwise continue the search.\r
+  return ArgFlags.isHva();\r
+}\r
+\r
+bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,\r
+                          CCValAssign::LocInfo &LocInfo,\r
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State) {\r
+  // On the second pass, go through the HVAs only.\r
+  if (ArgFlags.isSecArgPass()) {\r
+    if (ArgFlags.isHva())\r
+      return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,\r
+                                             ArgFlags, State);\r
+    return true;\r
+  }\r
+\r
+  // Process only vector types as defined by vectorcall spec:\r
+  // "A vector type is either a floating point type, for example,\r
+  //  a float or double, or an SIMD vector type, for example, __m128 or __m256".\r
+  if (!(ValVT.isFloatingPoint() ||\r
+        (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {\r
+    return false;\r
+  }\r
+\r
+  if (ArgFlags.isHva())\r
+    return true; // If this is an HVA - Stop the search.\r
+\r
+  // Assign XMM register.\r
+  if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {\r
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));\r
+    return true;\r
+  }\r
+\r
+  // In case we did not find an available XMM register for a vector -\r
+  // pass it indirectly.\r
+  // It is similar to CCPassIndirect, with the addition of inreg.\r
+  if (!ValVT.isFloatingPoint()) {\r
+    LocVT = MVT::i32;\r
+    LocInfo = CCValAssign::Indirect;\r
+    ArgFlags.setInReg();\r
+  }\r
+\r
+  return false; // No register was assigned - Continue the search.\r
+}\r
+\r
  } // End llvm namespace\r
diff --git a/llvm/lib/Target/X86/X86CallingConv.h b/llvm/lib/Target/X86/X86CallingConv.h

index 2e93ec9..c49a683 100644 (file)
--- a/llvm/lib/Target/X86/X86CallingConv.h
+++ b/llvm/lib/Target/X86/X86CallingConv.h
@@ -24,22 +24,29 @@ namespace llvm {
  /// When regcall calling convention compiled to 32 bit arch, special treatment
  /// is required for 64 bit masks.
  /// The value should be assigned to two GPRs.
-/// @return true if registers were allocated and false otherwise
+/// \return true if registers were allocated and false otherwise.
  bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
                                     CCValAssign::LocInfo &LocInfo,
                                     ISD::ArgFlagsTy &ArgFlags, CCState &State);
  
-inline bool CC_X86_32_VectorCallIndirect(unsigned &ValNo, MVT &ValVT,
-                                         MVT &LocVT,
-                                         CCValAssign::LocInfo &LocInfo,
-                                         ISD::ArgFlagsTy &ArgFlags,
-                                         CCState &State) {
-  // Similar to CCPassIndirect, with the addition of inreg.
-  LocVT = MVT::i32;
-  LocInfo = CCValAssign::Indirect;
-  ArgFlags.setInReg();
-  return false; // Continue the search, but now for i32.
-}
+/// Vectorcall calling convention has special handling for vector types or
+/// HVA for 64 bit arch.
+/// For HVAs shadow registers might be allocated on the first pass
+/// and actual XMM registers are allocated on the second pass.
+/// For vector types, actual XMM registers are allocated on the first pass.
+/// \return true if registers were allocated and false otherwise.
+bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State);
+
+/// Vectorcall calling convention has special handling for vector types or
+/// HVA for 32 bit arch.
+/// For HVAs actual XMM registers are allocated on the second pass.
+/// For vector types, actual XMM registers are allocated on the first pass.
+/// \return true if registers were allocated and false otherwise.
+bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State);
  
  inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
                                  CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td

index a0c822f..cf7bc98 100644 (file)
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -308,20 +308,12 @@ def RetCC_X86_32_HiPE : CallingConv<[
    CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>>
  ]>;
  
-// X86-32 HiPE return-value convention.
+// X86-32 Vectorcall return-value convention.
  def RetCC_X86_32_VectorCall : CallingConv<[
-  // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3.
-  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+  // Floating Point types are returned in XMM0,XMM1,XMMM2 and XMM3.
+  CCIfType<[f32, f64, f128],
              CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
  
-  // 256-bit FP vectors
-  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
-            CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
-
-  // 512-bit FP vectors
-  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
-            CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
-
    // Return integers in the standard way.
    CCDelegateTo<RetCC_X86Common>
  ]>;
@@ -350,6 +342,16 @@ def RetCC_X86_Win64_C : CallingConv<[
    CCDelegateTo<RetCC_X86_64_C>
  ]>;
  
+// X86-64 vectorcall return-value convention.
+def RetCC_X86_64_Vectorcall : CallingConv<[
+  // Vectorcall calling convention always returns FP values in XMMs.
+  CCIfType<[f32, f64, f128], 
+    CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+  // Otherwise, everything is the same as Windows X86-64 C CC.
+  CCDelegateTo<RetCC_X86_Win64_C>
+]>;
+
  // X86-64 HiPE return-value convention.
  def RetCC_X86_64_HiPE : CallingConv<[
    // Promote all types to i64
@@ -447,6 +449,9 @@ def RetCC_X86_64 : CallingConv<[
    CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>,
    CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>,
  
+  // Handle Vectorcall CC
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_64_Vectorcall>>,
+
    // Handle HHVM calls.
    CCIfCC<"CallingConv::HHVM", CCDelegateTo<RetCC_X86_64_HHVM>>,
  
@@ -626,18 +631,7 @@ def CC_X86_Win64_C : CallingConv<[
  ]>;
  
  def CC_X86_Win64_VectorCall : CallingConv<[
-  // The first 6 floating point and vector types of 128 bits or less use
-  // XMM0-XMM5.
-  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>,
-
-  // 256-bit vectors use YMM registers.
-  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
-           CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>,
-
-  // 512-bit vectors use ZMM registers.
-  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
-           CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>,
+  CCCustom<"CC_X86_64_VectorCall">,
  
    // Delegate to fastcall to handle integer types.
    CCDelegateTo<CC_X86_Win64_C>
@@ -847,25 +841,9 @@ def CC_X86_32_FastCall : CallingConv<[
    CCDelegateTo<CC_X86_32_Common>
  ]>;
  
-def CC_X86_32_VectorCall : CallingConv<[
-  // The first 6 floating point and vector types of 128 bits or less use
-  // XMM0-XMM5.
-  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>,
-
-  // 256-bit vectors use YMM registers.
-  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
-           CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>,
-
-  // 512-bit vectors use ZMM registers.
-  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
-           CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>,
-
-  // Otherwise, pass it indirectly.
-  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64,
-            v32i8, v16i16, v8i32, v4i64, v8f32, v4f64,
-            v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
-           CCCustom<"CC_X86_32_VectorCallIndirect">>,
+def CC_X86_Win32_VectorCall : CallingConv<[
+  // Pass floating point in XMMs
+  CCCustom<"CC_X86_32_VectorCall">,
  
    // Delegate to fastcall to handle integer types.
    CCDelegateTo<CC_X86_32_FastCall>
@@ -999,7 +977,7 @@ def CC_X86_32 : CallingConv<[
    CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>,
    CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>,
    CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
-  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>,
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>,
    CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
    CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
    CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp

index a2eb654..a816c06 100644 (file)
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17,6 +17,7 @@
  #include "X86CallingConv.h"
  #include "X86FrameLowering.h"
  #include "X86InstrBuilder.h"
+#include "X86IntrinsicsInfo.h"
  #include "X86MachineFunctionInfo.h"
  #include "X86ShuffleDecodeConstantPool.h"
  #include "X86TargetMachine.h"
@@ -53,10 +54,10 @@
  #include "llvm/Support/ErrorHandling.h"
  #include "llvm/Support/MathExtras.h"
  #include "llvm/Target/TargetOptions.h"
-#include "X86IntrinsicsInfo.h"
+#include <algorithm>
  #include <bitset>
-#include <numeric>
  #include <cctype>
+#include <numeric>
  using namespace llvm;
  
  #define DEBUG_TYPE "x86-isel"
@@ -2781,6 +2782,13 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
    return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
  }
  
+static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
+  return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
+                        [](const CCValAssign &A, const CCValAssign &B) -> bool {
+                          return A.getValNo() < B.getValNo();
+                        });
+}
+
  SDValue X86TargetLowering::LowerFormalArguments(
      SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
      const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
@@ -2815,11 +2823,22 @@ SDValue X86TargetLowering::LowerFormalArguments(
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
  
-  // Allocate shadow area for Win64
+  // Allocate shadow area for Win64.
    if (IsWin64)
      CCInfo.AllocateStack(32, 8);
  
-  CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
+  CCInfo.AnalyzeArguments(Ins, CC_X86);
+
+  // In vectorcall calling convention a second pass is required for the HVA
+  // types.
+  if (CallingConv::X86_VectorCall == CallConv) {
+    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
+  }
+
+  // The next loop assumes that the locations are in the same order of the
+  // input arguments.
+  assert(isSortedByValueNo(ArgLocs) &&
+         "Argument Location list must be sorted before lowering");
  
    SDValue ArgValue;
    for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
@@ -3263,11 +3282,17 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
  
-  // Allocate shadow area for Win64
+  // Allocate shadow area for Win64.
    if (IsWin64)
      CCInfo.AllocateStack(32, 8);
  
-  CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+  CCInfo.AnalyzeArguments(Outs, CC_X86);
+
+  // In vectorcall calling convention a second pass is required for the HVA
+  // types.
+  if (CallingConv::X86_VectorCall == CallConv) {
+    CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
+  }
  
    // Get a count of how many bytes are to be pushed on the stack.
    unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
@@ -3322,6 +3347,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
    SmallVector<SDValue, 8> MemOpChains;
    SDValue StackPtr;
  
+  // The next loop assumes that the locations are in the same order of the
+  // input arguments.
+  assert(isSortedByValueNo(ArgLocs) &&
+         "Argument Location list must be sorted before lowering");
+
    // Walk the register/memloc assignments, inserting copies/loads.  In the case
    // of tail call optimization arguments are handle later.
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
diff --git a/llvm/test/CodeGen/X86/vectorcall.ll b/llvm/test/CodeGen/X86/vectorcall.ll

index 1e52654..376e2c0 100644 (file)
--- a/llvm/test/CodeGen/X86/vectorcall.ll
+++ b/llvm/test/CodeGen/X86/vectorcall.ll
@@ -6,14 +6,12 @@
  define x86_vectorcallcc i32 @test_int_1() {
    ret i32 0
  }
-
  ; CHECK-LABEL: {{^}}test_int_1@@0:
  ; CHECK: xorl %eax, %eax
  
  define x86_vectorcallcc i32 @test_int_2(i32 inreg %a) {
    ret i32 %a
  }
-
  ; X86-LABEL: {{^}}test_int_2@@4:
  ; X64-LABEL: {{^}}test_int_2@@8:
  ; CHECK: movl %ecx, %eax
@@ -22,7 +20,6 @@ define x86_vectorcallcc i32 @test_int_3(i64 inreg %a) {
    %at = trunc i64 %a to i32
    ret i32 %at
  }
-
  ; X86-LABEL: {{^}}test_int_3@@8:
  ; X64-LABEL: {{^}}test_int_3@@8:
  ; CHECK: movl %ecx, %eax
@@ -31,10 +28,8 @@ define x86_vectorcallcc i32 @test_int_4(i32 inreg %a, i32 inreg %b) {
    %s = add i32 %a, %b
    ret i32 %s
  }
-
  ; X86-LABEL: {{^}}test_int_4@@8:
  ; X86: leal (%ecx,%edx), %eax
-
  ; X64-LABEL: {{^}}test_int_4@@16:
  ; X64: leal (%rcx,%rdx), %eax
  
@@ -90,4 +85,139 @@ define x86_vectorcallcc <16 x i8> @test_vec_2(
    ret <16 x i8> %r
  }
  ; CHECK-LABEL: {{^}}test_vec_2@@104:
-; CHECK: movaps (%{{[re]}}cx), %xmm0
+; x64:           movq    {{[0-9]*}}(%rsp), %rax
+; CHECK:         movaps (%{{rax|ecx}}), %xmm0
+
+%struct.HVA5 = type { <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float> }
+%struct.HVA4 = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }
+%struct.HVA3 = type { <4 x float>, <4 x float>, <4 x float> }
+%struct.HVA2 = type { <4 x float>, <4 x float> }
+
+define x86_vectorcallcc <4 x float> @test_mixed_1(i32 %a, %struct.HVA4 inreg %bb, i32 %c) {
+entry:
+  %b = alloca %struct.HVA4, align 16
+  store %struct.HVA4 %bb, %struct.HVA4* %b, align 16
+  %w1 = getelementptr inbounds %struct.HVA4, %struct.HVA4* %b, i32 0, i32 1
+  %0 = load <4 x float>, <4 x float>* %w1, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_1
+; CHECK:       movaps  %xmm1, 16(%{{(e|r)}}sp)
+; CHECK:       movaps  16(%{{(e|r)}}sp), %xmm0
+; CHECK:       ret{{q|l}}
+
+define x86_vectorcallcc <4 x float> @test_mixed_2(%struct.HVA4 inreg %a, %struct.HVA4* %b, <4 x float> %c) {
+entry:
+  %c.addr = alloca <4 x float>, align 16
+  store <4 x float> %c, <4 x float>* %c.addr, align 16
+  %0 = load <4 x float>, <4 x float>* %c.addr, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_2
+; X86:         movaps  %xmm0, (%esp)
+; X64:         movaps  %xmm2, %xmm0
+; CHECK:       ret{{[ql]}}
+
+define x86_vectorcallcc <4 x float> @test_mixed_3(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* %f) {
+entry:
+  %x = getelementptr inbounds %struct.HVA2, %struct.HVA2* %f, i32 0, i32 0
+  %0 = load <4 x float>, <4 x float>* %x, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_3
+; CHECK:       movaps  (%{{[re][ac]}}x), %xmm0
+; CHECK:       ret{{[ql]}}
+
+define x86_vectorcallcc <4 x float> @test_mixed_4(%struct.HVA4 inreg %a, %struct.HVA2* %bb, <4 x float> %c) {
+entry:
+  %y4 = getelementptr inbounds %struct.HVA2, %struct.HVA2* %bb, i32 0, i32 1
+  %0 = load <4 x float>, <4 x float>* %y4, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_4
+; X86:         movaps  16(%eax), %xmm0
+; X64:         movaps  16(%rdx), %xmm0
+; CHECK:       ret{{[ql]}}
+
+define x86_vectorcallcc <4 x float> @test_mixed_5(%struct.HVA3 inreg %a, %struct.HVA3* %b, <4 x float> %c, %struct.HVA2 inreg %dd) {
+entry:
+  %d = alloca %struct.HVA2, align 16
+  store %struct.HVA2 %dd, %struct.HVA2* %d, align 16
+  %y5 = getelementptr inbounds %struct.HVA2, %struct.HVA2* %d, i32 0, i32 1
+  %0 = load <4 x float>, <4 x float>* %y5, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_5
+; CHECK:       movaps  %xmm5, 16(%{{(e|r)}}sp)
+; CHECK:       movaps  16(%{{(e|r)}}sp), %xmm0
+; CHECK:       ret{{[ql]}}
+
+define x86_vectorcallcc %struct.HVA4 @test_mixed_6(%struct.HVA4 inreg %a, %struct.HVA4* %b) {
+entry:
+  %retval = alloca %struct.HVA4, align 16
+  %0 = bitcast %struct.HVA4* %retval to i8*
+  %1 = bitcast %struct.HVA4* %b to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 64, i32 16, i1 false)
+  %2 = load %struct.HVA4, %struct.HVA4* %retval, align 16
+  ret %struct.HVA4 %2
+}
+; CHECK-LABEL: test_mixed_6
+; CHECK:       movaps  (%{{[re]}}sp), %xmm0
+; CHECK:       movaps  16(%{{[re]}}sp), %xmm1
+; CHECK:       movaps  32(%{{[re]}}sp), %xmm2
+; CHECK:       movaps  48(%{{[re]}}sp), %xmm3
+; CHECK:       ret{{[ql]}}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32, i1)
+
+define x86_vectorcallcc void @test_mixed_7(%struct.HVA5* noalias sret %agg.result) {
+entry:
+  %a = alloca %struct.HVA5, align 16
+  %0 = bitcast %struct.HVA5* %a to i8*
+  call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 80, i32 16, i1 false)
+  %1 = bitcast %struct.HVA5* %agg.result to i8*
+  %2 = bitcast %struct.HVA5* %a to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 80, i32 16, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_mixed_7
+; CHECK:       movaps  %xmm{{[0-9]}}, 64(%{{rcx|eax}})
+; CHECK:       movaps  %xmm{{[0-9]}}, 48(%{{rcx|eax}})
+; CHECK:       movaps  %xmm{{[0-9]}}, 32(%{{rcx|eax}})
+; CHECK:       movaps  %xmm{{[0-9]}}, 16(%{{rcx|eax}})
+; CHECK:       movaps  %xmm{{[0-9]}}, (%{{rcx|eax}})
+; X64:         mov{{[ql]}}     %rcx, %rax
+; CHECK:       ret{{[ql]}}
+
+define x86_vectorcallcc <4 x float> @test_mixed_8(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, i32 %e, <4 x float> %f) {
+entry:
+  %f.addr = alloca <4 x float>, align 16
+  store <4 x float> %f, <4 x float>* %f.addr, align 16
+  %0 = load <4 x float>, <4 x float>* %f.addr, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_8
+; X86:         movaps  %xmm4, %xmm0
+; X64:         movaps  %xmm5, %xmm0
+; CHECK:       ret{{[ql]}}
+
+%struct.HFA4 = type { double, double, double, double }
+declare x86_vectorcallcc double @test_mixed_9_callee(%struct.HFA4 %x, double %y)
+
+define x86_vectorcallcc double @test_mixed_9_caller(%struct.HFA4 inreg %b) {
+entry:
+  %call = call x86_vectorcallcc double @test_mixed_9_callee(%struct.HFA4 inreg %b, double 3.000000e+00)
+  %add = fadd double 1.000000e+00, %call
+  ret double %add
+}
+; CHECK-LABEL: test_mixed_9_caller
+; CHECK:       movaps  %xmm3, %xmm4
+; CHECK:       movaps  %xmm2, %xmm3
+; CHECK:       movaps  %xmm1, %xmm2
+; X32:         movasd  %xmm0, %xmm1
+; X64:         movapd  %xmm5, %xmm1
+; CHECK:       call{{l|q}}   test_mixed_9_callee@@40
+; CHECK:       addsd   {{.*}}, %xmm0
+; CHECK:       ret{{l|q}}
author	Oren Ben Simhon <oren.ben.simhon@intel.com>
	Wed, 21 Dec 2016 08:31:45 +0000 (08:31 +0000)
committer	Oren Ben Simhon <oren.ben.simhon@intel.com>
	Wed, 21 Dec 2016 08:31:45 +0000 (08:31 +0000)
llvm/include/llvm/CodeGen/CallingConvLower.h		patch \| blob \| history
llvm/include/llvm/Target/TargetCallingConv.h		patch \| blob \| history
llvm/lib/CodeGen/CallingConvLower.cpp		patch \| blob \| history
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp		patch \| blob \| history
llvm/lib/Target/X86/X86CallingConv.cpp		patch \| blob \| history
llvm/lib/Target/X86/X86CallingConv.h		patch \| blob \| history
llvm/lib/Target/X86/X86CallingConv.td		patch \| blob \| history
llvm/lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
llvm/test/CodeGen/X86/vectorcall.ll		patch \| blob \| history