From b9c26a9cfe53da7ef96e13ef1aa7fe793b1b5d28 Mon Sep 17 00:00:00 2001
From: Dylan McKay
Date: Fri, 19 Jun 2020 23:26:00 +1200
Subject: [PATCH] [AVR] Rewrite the function calling convention.

Summary:
The previous version relied on the standard calling convention, using
std::reverse() to try to force the AVR ABI. That only works for simple
cases; it fails, for example, with aggregate types.

This patch rewrites the calling convention with custom C++ code that
implements the ABI defined in https://gcc.gnu.org/wiki/avr-gcc.

To do that, it adds a few 16-bit pseudo registers for unaligned argument
passing, such as R24R23. For example, this function:

define void @fun({ i8, i16 } %a)

will pass %a.0 in R22 and %a.1 in R24R23.

There are no instructions that can use these pseudo registers, so a new
register class, DREGSMOVW, is defined to set them apart.

ArgCC_AVR_BUILTIN_DIV is also no longer necessary, as its behavior is
identical to that of the new C++ code (strictly speaking, the clobber
list of the __div* functions is more restrictive, but taking advantage
of that is currently unimplemented). Illustrative sketches of the
allocation rules follow the patch.

Reviewers: dylanmckay

Subscribers: Gaelan, Sh4rK, indirect, jwagen, efriedma, dsprenkels, hiraditya, Jim, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D68524

Patch by Rodrigo Rivas Costa.
---
 llvm/lib/Target/AVR/AVRCallingConv.td              |  18 +-
 llvm/lib/Target/AVR/AVRISelLowering.cpp            | 340 +++++++++------------
 llvm/lib/Target/AVR/AVRISelLowering.h              |   6 +-
 llvm/lib/Target/AVR/AVRInstrInfo.cpp               |   2 +-
 llvm/lib/Target/AVR/AVRRegisterInfo.td             |  27 ++
 llvm/test/CodeGen/AVR/calling-conv/c/basic_aggr.ll |  84 +++++
 llvm/test/CodeGen/AVR/calling-conv/c/call.ll       |  89 ++++++
 llvm/test/CodeGen/AVR/calling-conv/c/call_aggr.ll  |  48 +++
 .../test/CodeGen/AVR/calling-conv/c/return_aggr.ll |  31 ++
 9 files changed, 435 insertions(+), 210 deletions(-)
 create mode 100644 llvm/test/CodeGen/AVR/calling-conv/c/basic_aggr.ll
 create mode 100644 llvm/test/CodeGen/AVR/calling-conv/c/call.ll
 create mode 100644 llvm/test/CodeGen/AVR/calling-conv/c/call_aggr.ll
 create mode 100644 llvm/test/CodeGen/AVR/calling-conv/c/return_aggr.ll

diff --git a/llvm/lib/Target/AVR/AVRCallingConv.td b/llvm/lib/Target/AVR/AVRCallingConv.td
index 213e35f..65545e5 100644
--- a/llvm/lib/Target/AVR/AVRCallingConv.td
+++ b/llvm/lib/Target/AVR/AVRCallingConv.td
@@ -6,21 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 // This describes the calling conventions for AVR architecture.
+// Normal functions use a special calling convention, handled in C++ code.
 //===----------------------------------------------------------------------===//

 //===----------------------------------------------------------------------===//
 // AVR Return Value Calling Convention
 //===----------------------------------------------------------------------===//

-def RetCC_AVR : CallingConv
-<[
-  // i8 is returned in R24.
-  CCIfType<[i8], CCAssignToReg<[R24]>>,
-
-  // i16 are returned in R25:R24, R23:R22, R21:R20 and R19:R18.
-  CCIfType<[i16], CCAssignToReg<[R25R24, R23R22, R21R20, R19R18]>>
-]>;
-
 // Special return value calling convention for runtime functions.
 def RetCC_AVR_BUILTIN : CallingConv
 <[
@@ -41,14 +33,6 @@ def ArgCC_AVR_Vararg : CallingConv
   CCAssignToStack<2, 1>
 ]>;

-// Special argument calling convention for
-// division runtime functions.
-def ArgCC_AVR_BUILTIN_DIV : CallingConv
-<[
-  CCIfType<[i8], CCAssignToReg<[R24,R22]>>,
-  CCIfType<[i16], CCAssignToReg<[R25R24, R23R22]>>
-]>;
-
 //===----------------------------------------------------------------------===//
 // Callee-saved register lists.
 //===----------------------------------------------------------------------===//

diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
index 15efb62..bf9b32e 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -14,6 +14,7 @@
 #include "AVRISelLowering.h"

 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -881,172 +882,145 @@ bool AVRTargetLowering::isOffsetFoldingLegal(

 #include "AVRGenCallingConv.inc"

-/// For each argument in a function store the number of pieces it is composed
-/// of.
-static void parseFunctionArgs(const SmallVectorImpl<ISD::InputArg> &Ins,
-                              SmallVectorImpl<unsigned> &Out) {
-  for (const ISD::InputArg &Arg : Ins) {
-    if(Arg.PartOffset > 0) continue;
-    unsigned Bytes = ((Arg.ArgVT.getSizeInBits()) + 7) / 8;
-
-    Out.push_back((Bytes + 1) / 2);
-  }
-}
-
-/// For external symbols there is no function prototype information so we
-/// have to rely directly on argument sizes.
-static void parseExternFuncCallArgs(const SmallVectorImpl<ISD::OutputArg> &In,
-                                    SmallVectorImpl<unsigned> &Out) {
-  for (unsigned i = 0, e = In.size(); i != e;) {
-    unsigned Size = 0;
-    unsigned Offset = 0;
-    while ((i != e) && (In[i].PartOffset == Offset)) {
-      Offset += In[i].VT.getStoreSize();
-      ++i;
-      ++Size;
-    }
-    Out.push_back(Size);
-  }
-}
-
-static StringRef getFunctionName(TargetLowering::CallLoweringInfo &CLI) {
-  SDValue Callee = CLI.Callee;
-
-  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee)) {
-    return G->getSymbol();
-  }
-
-  if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
-    return G->getGlobal()->getName();
-  }
-
-  llvm_unreachable("don't know how to get the name for this callee");
-}
+/// Registers for calling conventions, ordered in reverse as required by the
+/// ABI. Both arrays must be of the same length.
+static const MCPhysReg RegList8[] = {
+    AVR::R25, AVR::R24, AVR::R23, AVR::R22, AVR::R21, AVR::R20,
+    AVR::R19, AVR::R18, AVR::R17, AVR::R16, AVR::R15, AVR::R14,
+    AVR::R13, AVR::R12, AVR::R11, AVR::R10, AVR::R9,  AVR::R8};
+static const MCPhysReg RegList16[] = {
+    AVR::R26R25, AVR::R25R24, AVR::R24R23, AVR::R23R22,
+    AVR::R22R21, AVR::R21R20, AVR::R20R19, AVR::R19R18,
+    AVR::R18R17, AVR::R17R16, AVR::R16R15, AVR::R15R14,
+    AVR::R14R13, AVR::R13R12, AVR::R12R11, AVR::R11R10,
+    AVR::R10R9,  AVR::R9R8};
+
+static_assert(array_lengthof(RegList8) == array_lengthof(RegList16),
+              "8-bit and 16-bit register arrays must be of equal length");

 /// Analyze incoming and outgoing function arguments. We need custom C++ code
-/// to handle special constraints in the ABI like reversing the order of the
-/// pieces of splitted arguments. In addition, all pieces of a certain argument
-/// have to be passed either using registers or the stack but never mixing both.
-static void analyzeStandardArguments(TargetLowering::CallLoweringInfo *CLI,
-                                     const Function *F, const DataLayout *TD,
-                                     const SmallVectorImpl<ISD::OutputArg> *Outs,
-                                     const SmallVectorImpl<ISD::InputArg> *Ins,
-                                     CallingConv::ID CallConv,
-                                     SmallVectorImpl<CCValAssign> &ArgLocs,
-                                     CCState &CCInfo, bool IsCall, bool IsVarArg) {
-  static const MCPhysReg RegList8[] = {AVR::R24, AVR::R22, AVR::R20,
-                                       AVR::R18, AVR::R16, AVR::R14,
-                                       AVR::R12, AVR::R10, AVR::R8};
-  static const MCPhysReg RegList16[] = {AVR::R25R24, AVR::R23R22, AVR::R21R20,
-                                        AVR::R19R18, AVR::R17R16, AVR::R15R14,
-                                        AVR::R13R12, AVR::R11R10, AVR::R9R8};
-  if (IsVarArg) {
-    // Variadic functions do not need all the analysis below.
-    if (IsCall) {
-      CCInfo.AnalyzeCallOperands(*Outs, ArgCC_AVR_Vararg);
-    } else {
-      CCInfo.AnalyzeFormalArguments(*Ins, ArgCC_AVR_Vararg);
+/// to handle special constraints in the ABI.
+/// In addition, all pieces of a certain argument have to be passed either
+/// entirely in registers or entirely on the stack, never mixing both.
+template <typename ArgT>
+static void
+analyzeArguments(TargetLowering::CallLoweringInfo *CLI, const Function *F,
+                 const DataLayout *TD, const SmallVectorImpl<ArgT> &Args,
+                 SmallVectorImpl<CCValAssign> &ArgLocs, CCState &CCInfo) {
+  unsigned NumArgs = Args.size();
+  // This is the index of the last used register, in RegList*.
+  // A value of -1 means R26 (R26 is never actually used in the calling
+  // convention).
+  int RegLastIdx = -1;
+  // Once an argument is passed on the stack, all remaining arguments are
+  // passed on the stack as well.
+  bool UseStack = false;
+  for (unsigned i = 0; i != NumArgs;) {
+    MVT VT = Args[i].VT;
+    // We have to count the number of bytes in each function argument, that
+    // is, all Args entries with the same OrigArgIndex. This matters when the
+    // function takes an aggregate type.
+    // The current argument spans the range [i..j).
+    unsigned ArgIndex = Args[i].OrigArgIndex;
+    unsigned TotalBytes = VT.getStoreSize();
+    unsigned j = i + 1;
+    for (; j != NumArgs; ++j) {
+      if (Args[j].OrigArgIndex != ArgIndex)
+        break;
+      TotalBytes += Args[j].VT.getStoreSize();
     }
-    return;
-  }
-
-  // Fill in the Args array which will contain original argument sizes.
-  SmallVector<unsigned, 8> Args;
-  if (IsCall) {
-    parseExternFuncCallArgs(*Outs, Args);
-  } else {
-    assert(F != nullptr && "function should not be null");
-    parseFunctionArgs(*Ins, Args);
-  }
-
-  unsigned RegsLeft = array_lengthof(RegList8), ValNo = 0;
-  // Variadic functions always use the stack.
-  bool UsesStack = false;
-  for (unsigned i = 0, pos = 0, e = Args.size(); i != e; ++i) {
-    unsigned Size = Args[i];
-
-    // If we have a zero-sized argument, don't attempt to lower it.
-    // AVR-GCC does not support zero-sized arguments and so we need not
-    // worry about ABI compatibility.
-    if (Size == 0) continue;
-
-    MVT LocVT = (IsCall) ? (*Outs)[pos].VT : (*Ins)[pos].VT;
-
-    // If we have plenty of regs to pass the whole argument do it.
-    if (!UsesStack && (Size <= RegsLeft)) {
-      const MCPhysReg *RegList = (LocVT == MVT::i16) ? RegList16 : RegList8;
+    // Round up to an even number of bytes.
+    TotalBytes = alignTo(TotalBytes, 2);
+    // Skip zero-sized arguments.
+    if (TotalBytes == 0)
+      continue;
+    // The index of the first register to be used.
+    unsigned RegIdx = RegLastIdx + TotalBytes;
+    RegLastIdx = RegIdx;
+    // If there are not enough registers left, use the stack.
+    if (RegIdx >= array_lengthof(RegList8)) {
+      UseStack = true;
+    }
+    for (; i != j; ++i) {
+      MVT VT = Args[i].VT;

-      for (unsigned j = 0; j != Size; ++j) {
-        unsigned Reg = CCInfo.AllocateReg(
-            ArrayRef<MCPhysReg>(RegList, array_lengthof(RegList8)));
+      if (UseStack) {
+        auto evt = EVT(VT).getTypeForEVT(CCInfo.getContext());
+        unsigned Offset = CCInfo.AllocateStack(TD->getTypeAllocSize(evt),
+                                               TD->getABITypeAlign(evt));
         CCInfo.addLoc(
-            CCValAssign::getReg(ValNo++, LocVT, Reg, LocVT, CCValAssign::Full));
-        --RegsLeft;
-      }
-
-      // Reverse the order of the pieces to agree with the "big endian" format
-      // required in the calling convention ABI.
-      std::reverse(ArgLocs.begin() + pos, ArgLocs.begin() + pos + Size);
-    } else {
-      // Pass the rest of arguments using the stack.
-      UsesStack = true;
-      for (unsigned j = 0; j != Size; ++j) {
-        unsigned Offset = CCInfo.AllocateStack(
-            TD->getTypeAllocSize(EVT(LocVT).getTypeForEVT(CCInfo.getContext())),
-            TD->getABITypeAlign(EVT(LocVT).getTypeForEVT(CCInfo.getContext())));
-        CCInfo.addLoc(CCValAssign::getMem(ValNo++, LocVT, Offset, LocVT,
-                                          CCValAssign::Full));
+            CCValAssign::getMem(i, VT, Offset, VT, CCValAssign::Full));
+      } else {
+        unsigned Reg;
+        if (VT == MVT::i8) {
+          Reg = CCInfo.AllocateReg(RegList8[RegIdx]);
+        } else if (VT == MVT::i16) {
+          Reg = CCInfo.AllocateReg(RegList16[RegIdx]);
+        } else {
+          llvm_unreachable(
+              "calling convention can only manage i8 and i16 types");
+        }
+        assert(Reg && "register not available in calling convention");
+        CCInfo.addLoc(CCValAssign::getReg(i, VT, Reg, VT, CCValAssign::Full));
+        // Registers inside a particular argument are sorted in increasing
+        // order (remember the array is reversed).
+        RegIdx -= VT.getStoreSize();
+      }
     }
-    pos += Size;
   }
 }

-static void analyzeBuiltinArguments(TargetLowering::CallLoweringInfo &CLI,
-                                    const Function *F, const DataLayout *TD,
-                                    const SmallVectorImpl<ISD::OutputArg> *Outs,
-                                    const SmallVectorImpl<ISD::InputArg> *Ins,
-                                    CallingConv::ID CallConv,
-                                    SmallVectorImpl<CCValAssign> &ArgLocs,
-                                    CCState &CCInfo, bool IsCall, bool IsVarArg) {
-  StringRef FuncName = getFunctionName(CLI);
-
-  if (FuncName.startswith("__udivmod") || FuncName.startswith("__divmod")) {
-    CCInfo.AnalyzeCallOperands(*Outs, ArgCC_AVR_BUILTIN_DIV);
-  } else {
-    analyzeStandardArguments(&CLI, F, TD, Outs, Ins,
-                             CallConv, ArgLocs, CCInfo,
-                             IsCall, IsVarArg);
+/// Count the total number of bytes needed to pass or return these arguments.
+template <typename ArgT>
+static unsigned getTotalArgumentsSizeInBytes(const SmallVectorImpl<ArgT> &Args) {
+  unsigned TotalBytes = 0;
+
+  for (const ArgT &Arg : Args) {
+    TotalBytes += Arg.VT.getStoreSize();
   }
+  return TotalBytes;
 }

-static void analyzeArguments(TargetLowering::CallLoweringInfo *CLI,
-                             const Function *F, const DataLayout *TD,
-                             const SmallVectorImpl<ISD::OutputArg> *Outs,
-                             const SmallVectorImpl<ISD::InputArg> *Ins,
-                             CallingConv::ID CallConv,
-                             SmallVectorImpl<CCValAssign> &ArgLocs,
-                             CCState &CCInfo, bool IsCall, bool IsVarArg) {
-  switch (CallConv) {
-  case CallingConv::AVR_BUILTIN: {
-    analyzeBuiltinArguments(*CLI, F, TD, Outs, Ins,
-                            CallConv, ArgLocs, CCInfo,
-                            IsCall, IsVarArg);
-    return;
-  }
-  default: {
-    analyzeStandardArguments(CLI, F, TD, Outs, Ins,
-                             CallConv, ArgLocs, CCInfo,
-                             IsCall, IsVarArg);
-    return;
+/// Analyze the incoming and outgoing values of a function return.
+/// The algorithm is similar to analyzeArguments, but there can only be
+/// one value, possibly an aggregate, and it is limited to 8 bytes.
+template <typename ArgT>
+static void analyzeReturnValues(const SmallVectorImpl<ArgT> &Args,
+                                CCState &CCInfo) {
+  unsigned NumArgs = Args.size();
+  unsigned TotalBytes = getTotalArgumentsSizeInBytes(Args);
+  // CanLowerReturn() guarantees this assertion.
+  assert(TotalBytes <= 8 &&
+         "return values greater than 8 bytes cannot be lowered");
+
+  // The GCC ABI says that the size is rounded up to the next even number,
+  // but in practice anything larger than 4 bytes always rounds up to 8.
+  if (TotalBytes > 4) {
+    TotalBytes = 8;
+  } else {
+    TotalBytes = alignTo(TotalBytes, 2);
+  }
+
+  // The index of the first register to use.
+  int RegIdx = TotalBytes - 1;
+  for (unsigned i = 0; i != NumArgs; ++i) {
+    MVT VT = Args[i].VT;
+    unsigned Reg;
+    if (VT == MVT::i8) {
+      Reg = CCInfo.AllocateReg(RegList8[RegIdx]);
+    } else if (VT == MVT::i16) {
+      Reg = CCInfo.AllocateReg(RegList16[RegIdx]);
+    } else {
+      llvm_unreachable("calling convention can only manage i8 and i16 types");
+    }
+    assert(Reg && "register not available in calling convention");
+    CCInfo.addLoc(CCValAssign::getReg(i, VT, Reg, VT, CCValAssign::Full));
+    // Registers are assigned in increasing order.
+    RegIdx -= VT.getStoreSize();
   }
 }

 SDValue AVRTargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
-    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG,
-    SmallVectorImpl<SDValue> &InVals) const {
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   auto DL = DAG.getDataLayout();
@@ -1056,8 +1030,12 @@ SDValue AVRTargetLowering::LowerFormalArguments(
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                  *DAG.getContext());

-  analyzeArguments(nullptr, &MF.getFunction(), &DL, 0, &Ins, CallConv, ArgLocs, CCInfo,
-                   false, isVarArg);
+  // Variadic functions do not need all the analysis below.
+  if (isVarArg) {
+    CCInfo.AnalyzeFormalArguments(Ins, ArgCC_AVR_Vararg);
+  } else {
+    analyzeArguments(nullptr, &MF.getFunction(), &DL, Ins, ArgLocs, CCInfo);
+  }

   SDValue ArgValue;
   for (CCValAssign &VA : ArgLocs) {
@@ -1178,8 +1156,12 @@ SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                          getPointerTy(DAG.getDataLayout()));
   }

-  analyzeArguments(&CLI, F, &DAG.getDataLayout(), &Outs, 0, CallConv, ArgLocs, CCInfo,
-                   true, isVarArg);
+  // Variadic functions do not need all the analysis below.
+  if (isVarArg) {
+    CCInfo.AnalyzeCallOperands(Outs, ArgCC_AVR_Vararg);
+  } else {
+    analyzeArguments(&CLI, F, &DAG.getDataLayout(), Outs, ArgLocs, CCInfo);
+  }

   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getNextStackOffset();
@@ -1316,13 +1298,10 @@ SDValue AVRTargetLowering::LowerCallResult(
                  *DAG.getContext());

   // Handle runtime calling convs.
-  auto CCFunction = CCAssignFnForReturn(CallConv);
-  CCInfo.AnalyzeCallResult(Ins, CCFunction);
-
-  if (CallConv != CallingConv::AVR_BUILTIN && RVLocs.size() > 1) {
-    // Reverse splitted return values to get the "big endian" format required
-    // to agree with the calling convention ABI.
-    std::reverse(RVLocs.begin(), RVLocs.end());
+  if (CallConv == CallingConv::AVR_BUILTIN) {
+    CCInfo.AnalyzeCallResult(Ins, RetCC_AVR_BUILTIN);
+  } else {
+    analyzeReturnValues(Ins, CCInfo);
   }

   // Copy all of the result registers out of their specified physreg.
@@ -1341,26 +1320,17 @@ SDValue AVRTargetLowering::LowerCallResult(
 // Return Value Calling Convention Implementation
 //===----------------------------------------------------------------------===//

-CCAssignFn *AVRTargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
-  switch (CC) {
-  case CallingConv::AVR_BUILTIN:
-    return RetCC_AVR_BUILTIN;
-  default:
-    return RetCC_AVR;
+bool AVRTargetLowering::CanLowerReturn(
+    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
+  if (CallConv == CallingConv::AVR_BUILTIN) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+    return CCInfo.CheckReturn(Outs, RetCC_AVR_BUILTIN);
   }
-}
-
-bool
-AVRTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
-                                  MachineFunction &MF, bool isVarArg,
-                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
-                                  LLVMContext &Context) const
-{
-  SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
-  auto CCFunction = CCAssignFnForReturn(CallConv);
-  return CCInfo.CheckReturn(Outs, CCFunction);
+  unsigned TotalBytes = getTotalArgumentsSizeInBytes(Outs);
+  return TotalBytes <= 8;
 }

 SDValue
@@ -1376,25 +1346,19 @@ AVRTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                  *DAG.getContext());

-  // Analyze return values.
-  auto CCFunction = CCAssignFnForReturn(CallConv);
-  CCInfo.AnalyzeReturn(Outs, CCFunction);
-
-  // If this is the first return lowered for this function, add the regs to
-  // the liveout set for the function.
   MachineFunction &MF = DAG.getMachineFunction();
-  unsigned e = RVLocs.size();

-  // Reverse splitted return values to get the "big endian" format required
-  // to agree with the calling convention ABI.
-  if (e > 1) {
-    std::reverse(RVLocs.begin(), RVLocs.end());
+  // Analyze return values.
+  if (CallConv == CallingConv::AVR_BUILTIN) {
+    CCInfo.AnalyzeReturn(Outs, RetCC_AVR_BUILTIN);
+  } else {
+    analyzeReturnValues(Outs, CCInfo);
   }

   SDValue Flag;
   SmallVector<SDValue, 4> RetOps(1, Chain);
   // Copy the result values into the output registers.
-  for (unsigned i = 0; i != e; ++i) {
+  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
     CCValAssign &VA = RVLocs[i];
     assert(VA.isRegLoc() && "Can only return in registers!");

diff --git a/llvm/lib/Target/AVR/AVRISelLowering.h b/llvm/lib/Target/AVR/AVRISelLowering.h
index aca1ea1..d1eaf53 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.h
+++ b/llvm/lib/Target/AVR/AVRISelLowering.h
@@ -146,10 +146,8 @@ private:
   SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;

-  CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC) const;
-
-  bool CanLowerReturn(CallingConv::ID CallConv,
-                      MachineFunction &MF, bool isVarArg,
+  bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                      bool isVarArg,
                       const SmallVectorImpl<ISD::OutputArg> &Outs,
                       LLVMContext &Context) const override;

diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp
index ae268b1..06f0769 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp
@@ -48,7 +48,7 @@ void AVRInstrInfo::copyPhysReg(MachineBasicBlock &MBB,

   // Not all AVR devices support the 16-bit `MOVW` instruction.
if (AVR::DREGSRegClass.contains(DestReg, SrcReg)) { - if (STI.hasMOVW()) { + if (STI.hasMOVW() && AVR::DREGSMOVWRegClass.contains(DestReg, SrcReg)) { BuildMI(MBB, MI, DL, get(AVR::MOVWRdRr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); } else { diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.td b/llvm/lib/Target/AVR/AVRRegisterInfo.td index ea38fed..ab5d023 100644 --- a/llvm/lib/Target/AVR/AVRRegisterInfo.td +++ b/llvm/lib/Target/AVR/AVRRegisterInfo.td @@ -103,6 +103,17 @@ CoveredBySubRegs = 1 in def R5R4 : AVRReg<4, "r5:r4", [R4, R5]>, DwarfRegNum<[4]>; def R3R2 : AVRReg<2, "r3:r2", [R2, R3]>, DwarfRegNum<[2]>; def R1R0 : AVRReg<0, "r1:r0", [R0, R1]>, DwarfRegNum<[0]>; + + // Pseudo registers for unaligned i16 + def R26R25 : AVRReg<25, "r26:r25", [R25, R26]>, DwarfRegNum<[25]>; + def R24R23 : AVRReg<23, "r24:r23", [R23, R24]>, DwarfRegNum<[23]>; + def R22R21 : AVRReg<21, "r22:r21", [R21, R22]>, DwarfRegNum<[21]>; + def R20R19 : AVRReg<19, "r20:r19", [R19, R20]>, DwarfRegNum<[19]>; + def R18R17 : AVRReg<17, "r18:r17", [R17, R18]>, DwarfRegNum<[17]>; + def R16R15 : AVRReg<15, "r16:r15", [R15, R16]>, DwarfRegNum<[15]>; + def R14R13 : AVRReg<13, "r14:r13", [R13, R14]>, DwarfRegNum<[13]>; + def R12R11 : AVRReg<11, "r12:r11", [R11, R12]>, DwarfRegNum<[11]>; + def R10R9 : AVRReg<9, "r10:r9", [R9, R10]>, DwarfRegNum<[9]>; } //===----------------------------------------------------------------------===// @@ -153,6 +164,22 @@ def DREGS : RegisterClass<"AVR", [i16], 8, R31R30, R27R26, // Callee saved registers. R29R28, R17R16, R15R14, R13R12, R11R10, + R9R8, R7R6, R5R4, R3R2, R1R0, + // Pseudo regs for unaligned 16-bits + R26R25, R24R23, R22R21, + R20R19, R18R17, R16R15, + R14R13, R12R11, R10R9 + )>; + +// 16-bit pair register class for movw +def DREGSMOVW : RegisterClass<"AVR", [i16], 8, + ( + // Return value and arguments. + add R25R24, R19R18, R21R20, R23R22, + // Scratch registers. + R31R30, R27R26, + // Callee saved registers. 
+ R29R28, R17R16, R15R14, R13R12, R11R10, R9R8, R7R6, R5R4, R3R2, R1R0 )>; diff --git a/llvm/test/CodeGen/AVR/calling-conv/c/basic_aggr.ll b/llvm/test/CodeGen/AVR/calling-conv/c/basic_aggr.ll new file mode 100644 index 0000000..0f6cf0e --- /dev/null +++ b/llvm/test/CodeGen/AVR/calling-conv/c/basic_aggr.ll @@ -0,0 +1,84 @@ +; RUN: llc < %s -march=avr | FileCheck %s + +; CHECK-LABEL: ret_void_args_struct_i8_i32 +define void @ret_void_args_struct_i8_i32({ i8, i32 } %a) { +start: + ; CHECK: sts 4, r20 + %0 = extractvalue { i8, i32 } %a, 0 + store volatile i8 %0, i8* inttoptr (i64 4 to i8*) + + ; CHECK-NEXT: sts 8, r24 + ; CHECK-NEXT: sts 7, r23 + ; CHECK-NEXT: sts 6, r22 + ; CHECK-NEXT: sts 5, r21 + %1 = extractvalue { i8, i32 } %a, 1 + store volatile i32 %1, i32* inttoptr (i64 5 to i32*) + ret void +} + +; CHECK-LABEL: ret_void_args_struct_i8_i8_i8_i8 +define void @ret_void_args_struct_i8_i8_i8_i8({ i8, i8, i8, i8 } %a) { +start: + ; CHECK: sts 4, r22 + %0 = extractvalue { i8, i8, i8, i8 } %a, 0 + store volatile i8 %0, i8* inttoptr (i64 4 to i8*) + ; CHECK-NEXT: sts 5, r23 + %1 = extractvalue { i8, i8, i8, i8 } %a, 1 + store volatile i8 %1, i8* inttoptr (i64 5 to i8*) + ; CHECK-NEXT: sts 6, r24 + %2 = extractvalue { i8, i8, i8, i8 } %a, 2 + store volatile i8 %2, i8* inttoptr (i64 6 to i8*) + ; CHECK-NEXT: sts 7, r25 + %3 = extractvalue { i8, i8, i8, i8 } %a, 3 + store volatile i8 %3, i8* inttoptr (i64 7 to i8*) + ret void +} + +; CHECK-LABEL: ret_void_args_struct_i32_16_i8 +define void @ret_void_args_struct_i32_16_i8({ i32, i16, i8} %a) { +start: + ; CHECK: sts 7, r21 + ; CHECK-NEXT: sts 6, r20 + ; CHECK-NEXT: sts 5, r19 + ; CHECK-NEXT: sts 4, r18 + %0 = extractvalue { i32, i16, i8 } %a, 0 + store volatile i32 %0, i32* inttoptr (i64 4 to i32*) + + ; CHECK-NEXT: sts 5, r23 + ; CHECK-NEXT: sts 4, r22 + %1 = extractvalue { i32, i16, i8 } %a, 1 + store volatile i16 %1, i16* inttoptr (i64 4 to i16*) + + ; CHECK-NEXT: sts 4, r24 + %2 = extractvalue { i32, i16, i8 } %a, 2 + store volatile i8 %2, i8* inttoptr (i64 4 to i8*) + ret void +} + +; CHECK-LABEL: ret_void_args_struct_i8_i32_struct_i32_i8 +define void @ret_void_args_struct_i8_i32_struct_i32_i8({ i8, i32 } %a, { i32, i8 } %b) { +start: + ; CHECK: sts 4, r20 + %0 = extractvalue { i8, i32 } %a, 0 + store volatile i8 %0, i8* inttoptr (i64 4 to i8*) + + ; CHECK-NEXT: sts 8, r24 + ; CHECK-NEXT: sts 7, r23 + ; CHECK-NEXT: sts 6, r22 + ; CHECK-NEXT: sts 5, r21 + %1 = extractvalue { i8, i32 } %a, 1 + store volatile i32 %1, i32* inttoptr (i64 5 to i32*) + + ; CHECK-NEXT: sts 9, r17 + ; CHECK-NEXT: sts 8, r16 + ; CHECK-NEXT: sts 7, r15 + ; CHECK-NEXT: sts 6, r14 + %2 = extractvalue { i32, i8 } %b, 0 + store volatile i32 %2, i32* inttoptr (i64 6 to i32*) + + ; CHECK-NEXT: sts 7, r18 + %3 = extractvalue { i32, i8 } %b, 1 + store volatile i8 %3, i8* inttoptr (i64 7 to i8*) + ret void +} + diff --git a/llvm/test/CodeGen/AVR/calling-conv/c/call.ll b/llvm/test/CodeGen/AVR/calling-conv/c/call.ll new file mode 100644 index 0000000..e218ed3 --- /dev/null +++ b/llvm/test/CodeGen/AVR/calling-conv/c/call.ll @@ -0,0 +1,89 @@ +; RUN: llc < %s -march=avr | FileCheck %s + +declare void @ret_void_args_i8(i8 %a) +declare void @ret_void_args_i8_i32(i8 %a, i32 %b) +declare void @ret_void_args_i8_i8_i8_i8(i8 %a, i8 %b, i8 %c, i8 %d) +declare void @ret_void_args_i32_i16_i8(i32 %a, i16 %b, i8 %c) +declare void @ret_void_args_i64(i64 %a) +declare void @ret_void_args_i64_i64(i64 %a, i64 %b) +declare void @ret_void_args_i64_i64_i16(i64 %a, i64 %b, i16 %c) + +; CHECK-LABEL: 
call_void_args_i8 +define void @call_void_args_i8() { + ; CHECK: ldi r24, 64 + call void @ret_void_args_i8 (i8 64) + ret void +} + +; CHECK-LABEL: call_void_args_i8_i32 +define void @call_void_args_i8_i32() { + ; CHECK: ldi r20, 4 + ; CHECK-NEXT: ldi r21, 3 + ; CHECK-NEXT: ldi r22, 2 + ; CHECK-NEXT: ldi r23, 1 + ; CHECK-NEXT: ldi r24, 64 + call void @ret_void_args_i8_i32 (i8 64, i32 16909060) + ret void +} + +; CHECK-LABEL: call_void_args_i8_i8_i8_i8 +define void @call_void_args_i8_i8_i8_i8() { + ; CHECK: ldi r24, 1 + ; CHECK-NEXT: ldi r22, 2 + ; CHECK-NEXT: ldi r20, 3 + ; CHECK-NEXT: ldi r18, 4 + call void @ret_void_args_i8_i8_i8_i8(i8 1, i8 2, i8 3, i8 4) + ret void +} + +; CHECK-LABEL: call_void_args_i32_i16_i8 +define void @call_void_args_i32_i16_i8() { + ; CHECK: ldi r22, 4 + ; CHECK-NEXT: ldi r23, 3 + ; CHECK-NEXT: ldi r24, 2 + ; CHECK-NEXT: ldi r25, 1 + ; CHECK-NEXT: ldi r20, 1 + ; CHECK-NEXT: ldi r21, 4 + ; CHECK-NEXT: ldi r18, 64 + call void @ret_void_args_i32_i16_i8(i32 16909060, i16 1025, i8 64) + ret void +} + +; CHECK-LABEL: call_void_args_i64 +define void @call_void_args_i64() { + ; CHECK: ldi r18, 8 + ; CHECK-NEXT: ldi r19, 7 + ; CHECK-NEXT: ldi r20, 6 + ; CHECK-NEXT: ldi r21, 5 + ; CHECK-NEXT: ldi r22, 4 + ; CHECK-NEXT: ldi r23, 3 + ; CHECK-NEXT: ldi r24, 2 + ; CHECK-NEXT: ldi r25, 1 + call void @ret_void_args_i64(i64 72623859790382856) + ret void +} + +; CHECK-LABEL: call_void_args_i64_i64 +define void @call_void_args_i64_i64() { + ; CHECK: ldi r18, 8 + ; CHECK-NEXT: ldi r19, 7 + ; CHECK-NEXT: ldi r20, 6 + ; CHECK-NEXT: ldi r21, 5 + ; CHECK-NEXT: ldi r22, 4 + ; CHECK-NEXT: ldi r23, 3 + ; CHECK-NEXT: ldi r24, 2 + ; CHECK-NEXT: ldi r25, 1 + ; the second arg is in r10:r17, but unordered + ; CHECK: r17, + ; CHECK: r10, + call void @ret_void_args_i64_i64(i64 72623859790382856, i64 651345242494996224) + ret void +} + +; CHECK-LABEL: call_void_args_i64_i64_i16 +define void @call_void_args_i64_i64_i16() { + ; CHECK: r8, + ; CHECK: r9, + call void @ret_void_args_i64_i64_i16(i64 72623859790382856, i64 651345242494996224, i16 5655) + ret void +} diff --git a/llvm/test/CodeGen/AVR/calling-conv/c/call_aggr.ll b/llvm/test/CodeGen/AVR/calling-conv/c/call_aggr.ll new file mode 100644 index 0000000..aeba3c8 --- /dev/null +++ b/llvm/test/CodeGen/AVR/calling-conv/c/call_aggr.ll @@ -0,0 +1,48 @@ +; RUN: llc < %s -march=avr | FileCheck %s + +declare void @ret_void_args_struct_i8_i32({ i8, i32 } %a) +declare void @ret_void_args_struct_i8_i8_i8_i8({ i8, i8, i8, i8 } %a) +declare void @ret_void_args_struct_i32_i16_i8({ i32, i16, i8} %a) +declare void @ret_void_args_struct_i8_i32_struct_i32_i8({ i8, i32 } %a, { i32, i8 } %b) + +; CHECK-LABEL: call_void_args_struct_i8_i32 +define void @call_void_args_struct_i8_i32() { + ; CHECK: ldi r20, 64 + ; CHECK-NEXT: r21, + ; CHECK-NEXT: r22, + ; CHECK-NEXT: r23, + ; CHECK-NEXT: r24, + call void @ret_void_args_struct_i8_i32({ i8, i32 } { i8 64, i32 16909060 }) + ret void +} + +; CHECK-LABEL: @call_void_args_struct_i8_i8_i8_i8 +define void @call_void_args_struct_i8_i8_i8_i8() { + ; CHECK: ldi r22, 1 + ; CHECK-NEXT: ldi r23, 2 + ; CHECK-NEXT: ldi r24, 3 + ; CHECK-NEXT: ldi r25, 4 + call void @ret_void_args_struct_i8_i8_i8_i8({ i8, i8, i8, i8 } { i8 1, i8 2, i8 3, i8 4 }) + ret void +} + +; CHECK-LABEL: @call_void_args_struct_i32_i16_i8 +define void @call_void_args_struct_i32_i16_i8() { + ; CHECK: ldi r18, 4 + ; CHECK-NEXT: ldi r19, 3 + ; CHECK-NEXT: ldi r20, 2 + ; CHECK-NEXT: ldi r21, 1 + ; CHECK-NEXT: ldi r22, 23 + ; CHECK-NEXT: ldi r23, 22 + ; CHECK-NEXT: ldi 
r24, 64 + call void @ret_void_args_struct_i32_i16_i8({ i32, i16, i8 } { i32 16909060, i16 5655, i8 64 }) + ret void +} + +; CHECK-LABEL: @call_void_args_struct_i8_i32_struct_i32_i8 +define void @call_void_args_struct_i8_i32_struct_i32_i8() { + ; CHECK: ldi r20, 64 + ; CHECK: ldi r18, 65 + call void @ret_void_args_struct_i8_i32_struct_i32_i8({ i8, i32 } { i8 64, i32 16909060 }, { i32, i8 } { i32 287454020, i8 65 }) + ret void +} diff --git a/llvm/test/CodeGen/AVR/calling-conv/c/return_aggr.ll b/llvm/test/CodeGen/AVR/calling-conv/c/return_aggr.ll new file mode 100644 index 0000000..97668f6 --- /dev/null +++ b/llvm/test/CodeGen/AVR/calling-conv/c/return_aggr.ll @@ -0,0 +1,31 @@ +; RUN: llc < %s -march=avr | FileCheck %s + +; CHECK-LABEL: ret_struct_i8_i16_i8 +define { i8, i16, i8 } @ret_struct_i8_i16_i8() { +start: + ; for some reason the i16 is loaded to r24:r25 + ; and then moved to r23:r24 + ; CHECK: ldi r22, 64 + ; CHECK-NEXT: r23, + ; CHECK-NEXT: r24, + ; CHECK-NEXT: r25, 11 + %0 = insertvalue {i8, i16, i8} undef, i8 64, 0 + %1 = insertvalue {i8, i16, i8} %0, i16 1024, 1 + %2 = insertvalue {i8, i16, i8} %1, i8 11, 2 + ret {i8, i16, i8} %2 +} + +; CHECK-LABEL: ret_struct_i32_i16 +define { i32, i16 } @ret_struct_i32_i16() { +start: + ; CHECK: ldi r18, 4 + ; CHECK-NEXT: ldi r19, 3 + ; CHECK-NEXT: ldi r20, 2 + ; CHECK-NEXT: ldi r21, 1 + ; CHECK-NEXT: ldi r22, 0 + ; CHECK-NEXT: ldi r23, 8 + %0 = insertvalue { i32, i16 } undef, i32 16909060, 0 + %1 = insertvalue { i32, i16 } %0, i16 2048, 1 + ret { i32, i16} %1 +} + -- 2.7.4
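
To make the argument-passing rule above concrete, here is a minimal standalone
C++ sketch of the walk performed by analyzeArguments(). The Arg struct and the
driver are hypothetical, written only for this note (they are not LLVM code);
the rule itself is the one described in the summary: sizes are rounded up to an
even number of bytes, arguments are packed downwards from r25 towards r8, and
once an argument spills to the stack, every later argument stays on the stack.

#include <cstdio>
#include <vector>

struct Arg {
  const char *Name;     // label for the demo only
  unsigned SizeInBytes; // total size of the (possibly aggregate) argument
};

int main() {
  // Mirrors the commit-message example: define void @fun({ i8, i16 } %a)
  std::vector<Arg> Args = {{"%a", 3}};

  int NextFreeReg = 26; // one past r25, the highest argument register
  bool UseStack = false;

  for (const Arg &A : Args) {
    int Size = (A.SizeInBytes + 1) & ~1u; // round up to an even size
    if (Size == 0)
      continue; // zero-sized arguments are not passed at all
    if (!UseStack && NextFreeReg - Size >= 8) {
      NextFreeReg -= Size;
      // The lowest byte sits in the lowest register of the block, so
      // { i8, i16 } puts %a.0 in r22 and %a.1 in r24:r23; r25 is padding.
      std::printf("%s -> r%d..r%d\n", A.Name, NextFreeReg + Size - 1,
                  NextFreeReg);
    } else {
      UseStack = true; // all remaining arguments follow onto the stack
      std::printf("%s -> stack\n", A.Name);
    }
  }
  return 0;
}

Running the same walk over (i64, i64, i16) assigns r25..r18, r17..r10 and
r9:r8, which is exactly the placement the call.ll test checks.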
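The return-value rule in analyzeReturnValues() admits a similar sketch. The
helper name below is made up for illustration; the rounding matches the patch:
a return value of N bytes (N rounded up to 2 or 4, or forced to 8 once it
exceeds 4) comes back in r(26-N)..r25 with the least-significant byte in the
lowest register, and CanLowerReturn() rejects anything over 8 bytes so that it
is returned indirectly instead.

#include <cstdio>

// Hypothetical helper mirroring the rounding in analyzeReturnValues().
static unsigned roundedReturnSize(unsigned Bytes) {
  if (Bytes > 4)
    return 8;               // anything over 4 bytes occupies r18..r25
  return (Bytes + 1) & ~1u; // otherwise round up to an even size
}

int main() {
  for (unsigned Bytes : {1u, 2u, 3u, 4u, 5u, 8u}) {
    unsigned N = roundedReturnSize(Bytes);
    // e.g. an i8 comes back in r24 (inside the r24..r25 block), and an
    // 8-byte aggregate in r18..r25, matching the avr-gcc ABI.
    std::printf("%u-byte return -> r%u..r25\n", Bytes, 26 - N);
  }
  return 0;
}

This is why ret_struct_i8_i16_i8 in return_aggr.ll (4 bytes total) lands in
r22..r25, while ret_struct_i32_i16 (6 bytes, rounded to 8) lands in r18..r25.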
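Finally, the reason for DREGSMOVW and the copyPhysReg() change: the MOVW
instruction can only copy register pairs whose low register has an even number
(r1:r0, r3:r2, ..., r31:r30), and not every device has MOVW at all. The new
odd-aligned pseudo pairs such as r24:r23 therefore have to be copied with two
8-bit MOVs, which is what excluding them from DREGSMOVW achieves. A minimal
predicate capturing the constraint; canCopyWithMOVW is a made-up name for
illustration, not an LLVM API:

#include <cassert>

// LowRegNum is the number of the low register of a 16-bit pair, e.g. 24 for
// r25:r24 and 23 for the new pseudo pair r24:r23.
static bool canCopyWithMOVW(unsigned LowRegNum, bool SubtargetHasMOVW) {
  return SubtargetHasMOVW && LowRegNum % 2 == 0;
}

int main() {
  assert(canCopyWithMOVW(24, true));   // r25:r24 -> a single MOVW
  assert(!canCopyWithMOVW(23, true));  // r24:r23 -> two MOV instructions
  assert(!canCopyWithMOVW(24, false)); // device without MOVW -> two MOVs
  return 0;
}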