From 488be5d790020489f7f4dd7d43680f43b101dbd4 Mon Sep 17 00:00:00 2001
From: JUNG DONG-HEON
Date: Thu, 28 May 2020 11:15:47 +0900
Subject: [PATCH] Fix GenerateShuffleArray to support cyclic shuffles (dotnet/coreclr#26169)

* Fix GenerateShuffleArray to support cyclic shuffles

GenerateShuffleArray was not handling the case where there was a cycle
in the register / stack slot shuffle, which resulted in an infinite
loop in this function. The issue is specific to the Unix AMD64 ABI.

To fix that, this change reworks the algorithm completely. Besides
fixing the issue, the new algorithm also performs better in some cases.

Breaking the cycles requires an extra helper register. However, no
general purpose register was available, so xmm8 is used for this
purpose.

* Remove special handling of the hang from ABI stress
---
 src/vm/comdelegate.cpp        | 123 +++++++++++++++++++++++++++++++++++------
 src/vm/comdelegate.h          |   1 +
 src/vm/i386/stublinkerx86.cpp |  75 +++++++++++++++++++++++++-
 src/vm/i386/stublinkerx86.h   |   3 ++
 4 files changed, 182 insertions(+), 20 deletions(-)

diff --git a/src/vm/comdelegate.cpp b/src/vm/comdelegate.cpp
index 7ee85b6..9d92a7c 100644
--- a/src/vm/comdelegate.cpp
+++ b/src/vm/comdelegate.cpp
@@ -237,6 +237,22 @@ int GetNormalizedArgumentSlotIndex(UINT16 offset)
     return index;
 }
 
+
+// Node of a directed graph where nodes represent registers / stack slots
+// and edges represent moves of data.
+struct ShuffleGraphNode
+{
+    static const UINT16 NoNode = 0xffff;
+    // Previous node (the source of data for the register / stack slot of the current node)
+    UINT16 prev;
+    // Offset of the register / stack slot
+    UINT16 ofs;
+    // Set to true for nodes that are a source of data for a destination node
+    UINT8 isSource;
+    // Nodes that are marked are either already processed or don't participate in the shuffling
+    UINT8 isMarked;
+};
+
 #endif // UNIX_AMD64_ABI
 
 VOID GenerateShuffleArray(MethodDesc* pInvoke, MethodDesc *pTargetMeth, SArray<ShuffleEntry> * pShuffleEntryArray)
@@ -458,6 +474,7 @@ VOID GenerateShuffleArray(MethodDesc* pInvoke, MethodDesc *pTargetMeth, SArray<ShuffleEntry> * pShuffleEntryArray)
 [...]
     // ...->RSI, RDX->RCX, R8->RDX, R9->R8, stack[0]->R9, xmm0->xmm1, xmm1->xmm2, ... xmm6->xmm7,
     // xmm7->stack[0], stack[1]->xmm0, stack[2]->stack[1], stack[3]->stack[2]
-    // To prevent overwriting of slots before they are moved, we need to sort the move operations.
+    // To prevent overwriting of slots before they are moved, we need to perform the shuffling in the correct order.
+
+    NewArrayHolder<ShuffleGraphNode> pGraphNodes = new ShuffleGraphNode[argSlots];
+
+    // Initialize the graph array
+    for (unsigned int i = 0; i < argSlots; i++)
+    {
+        pGraphNodes[i].prev = ShuffleGraphNode::NoNode;
+        pGraphNodes[i].isMarked = true;
+        pGraphNodes[i].isSource = false;
+    }
+
+    // Build the directed graph representing register and stack slot shuffling.
+    // The links are directed from destination to source.
+    // During the build, also set the isSource flag for nodes that are sources of data.
+    // The ones that don't have the isSource flag set are beginnings of non-cyclic
+    // segments of the graph.
+    for (unsigned int i = 0; i < pShuffleEntryArray->GetCount(); i++)
+    {
+        ShuffleEntry entry = (*pShuffleEntryArray)[i];
+
+        int srcIndex = GetNormalizedArgumentSlotIndex(entry.srcofs);
+        int dstIndex = GetNormalizedArgumentSlotIndex(entry.dstofs);
+
+        // Unmark the node to indicate that it was not processed yet
+        pGraphNodes[srcIndex].isMarked = false;
+        // The node contains a register / stack slot that is a source from which we move data to a destination one
+        pGraphNodes[srcIndex].isSource = true;
+        pGraphNodes[srcIndex].ofs = entry.srcofs;
+
+        // Unmark the node to indicate that it was not processed yet
+        pGraphNodes[dstIndex].isMarked = false;
+        // Link to the previous node in the graph (the source of data for the current node)
+        pGraphNodes[dstIndex].prev = srcIndex;
+        pGraphNodes[dstIndex].ofs = entry.dstofs;
+    }
 
-    NewArrayHolder<bool> filledSlots = new bool[argSlots];
+    // Now that we've built the graph, clear the array; we will regenerate it from the graph, ensuring the correct order of shuffling
+    pShuffleEntryArray->Clear();
 
-    bool reordered;
-    do
+    // Add all non-cyclic subgraphs to the target shuffle array and mark their nodes as visited
+    for (unsigned int startIndex = 0; startIndex < argSlots; startIndex++)
     {
-        reordered = false;
+        unsigned int index = startIndex;
 
-        for (int i = 0; i < argSlots; i++)
+        if (!pGraphNodes[index].isMarked && !pGraphNodes[index].isSource)
         {
-            filledSlots[i] = false;
+            // This node is not a source, which means it is the end of a shuffle chain.
+            // Generate shuffle array entries for all nodes in the chain in the correct
+            // order.
+            UINT16 dstOfs = ShuffleEntry::SENTINEL;
+
+            do
+            {
+                pGraphNodes[index].isMarked = true;
+                if (dstOfs != ShuffleEntry::SENTINEL)
+                {
+                    entry.srcofs = pGraphNodes[index].ofs;
+                    entry.dstofs = dstOfs;
+                    pShuffleEntryArray->Append(entry);
+                }
+
+                dstOfs = pGraphNodes[index].ofs;
+                index = pGraphNodes[index].prev;
+            }
+            while (index != ShuffleGraphNode::NoNode);
         }
-        for (unsigned int i = 0; i < pShuffleEntryArray->GetCount(); i++)
+    }
+
+    // Process all cycles in the graph
+    for (unsigned int startIndex = 0; startIndex < argSlots; startIndex++)
+    {
+        unsigned int index = startIndex;
+
+        if (!pGraphNodes[index].isMarked)
         {
-            entry = (*pShuffleEntryArray)[i];
+            // This node is part of a new cycle, as all non-cyclic parts of the graph were already visited
 
-            // If the slot that we are moving the argument to was filled in already, we need to move this entry in front
-            // of the entry that filled it in.
-            if (filledSlots[GetNormalizedArgumentSlotIndex(entry.srcofs)])
+            // Move the first node register / stack slot to the helper reg
+            UINT16 dstOfs = ShuffleEntry::HELPERREG;
+
+            do
             {
-                unsigned int j;
-                for (j = i; (*pShuffleEntryArray)[j].dstofs != entry.srcofs; j--)
-                    (*pShuffleEntryArray)[j] = (*pShuffleEntryArray)[j - 1];
+                pGraphNodes[index].isMarked = true;
 
-                (*pShuffleEntryArray)[j] = entry;
-                reordered = true;
+                entry.srcofs = pGraphNodes[index].ofs;
+                entry.dstofs = dstOfs;
+                pShuffleEntryArray->Append(entry);
+
+                dstOfs = pGraphNodes[index].ofs;
+                index = pGraphNodes[index].prev;
             }
+            while (index != startIndex);
 
-            filledSlots[GetNormalizedArgumentSlotIndex(entry.dstofs)] = true;
+            // Move the helper reg to the last node register / stack slot
+            entry.srcofs = ShuffleEntry::HELPERREG;
+            entry.dstofs = dstOfs;
+            pShuffleEntryArray->Append(entry);
         }
     }
-    while (reordered);
+
 #endif // UNIX_AMD64_ABI
 
     entry.srcofs = ShuffleEntry::SENTINEL;
diff --git a/src/vm/comdelegate.h b/src/vm/comdelegate.h
index 3d5ec8e..330f1f8 100644
--- a/src/vm/comdelegate.h
+++ b/src/vm/comdelegate.h
@@ -194,6 +194,7 @@ struct ShuffleEntry
         OFSMASK    = 0x7fff, // Mask to get stack offset
         OFSREGMASK = 0x1fff, // Mask to get register index
         SENTINEL   = 0xffff, // Indicates end of shuffle array
+        HELPERREG  = 0xcfff, // Use a helper register as the source or destination (used to handle cycles in the shuffling)
     };
 
 #if defined(_TARGET_AMD64_) && !defined(UNIX_AMD64_ABI)
diff --git a/src/vm/i386/stublinkerx86.cpp b/src/vm/i386/stublinkerx86.cpp
index 7020e40..983fc3f 100644
--- a/src/vm/i386/stublinkerx86.cpp
+++ b/src/vm/i386/stublinkerx86.cpp
@@ -1739,6 +1739,49 @@ VOID StubLinkerCPU::X64EmitMovSSToMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs
     X64EmitMovXmmWorker(0xF3, 0x11, Xmmreg, baseReg, ofs);
 }
 
+VOID StubLinkerCPU::X64EmitMovqRegXmm(X86Reg reg, X86Reg Xmmreg)
+{
+    STANDARD_VM_CONTRACT;
+    X64EmitMovqWorker(0x7e, Xmmreg, reg);
+}
+
+VOID StubLinkerCPU::X64EmitMovqXmmReg(X86Reg Xmmreg, X86Reg reg)
+{
+    STANDARD_VM_CONTRACT;
+    X64EmitMovqWorker(0x6e, Xmmreg, reg);
+}
+
+//-----------------------------------------------------------------------------
+// Helper method for emitting a movq between an xmm and a general purpose register
+//-----------------------------------------------------------------------------
+VOID StubLinkerCPU::X64EmitMovqWorker(BYTE opcode, X86Reg Xmmreg, X86Reg reg)
+{
+    BYTE codeBuffer[10];
+    unsigned int nBytes = 0;
+    codeBuffer[nBytes++] = 0x66;
+    BYTE rex = REX_PREFIX_BASE | REX_OPERAND_SIZE_64BIT;
+    if (reg >= kR8)
+    {
+        rex |= REX_MODRM_RM_EXT;
+        reg = X86RegFromAMD64Reg(reg);
+    }
+    if (Xmmreg >= kXMM8)
+    {
+        rex |= REX_MODRM_REG_EXT;
+        Xmmreg = X86RegFromAMD64Reg(Xmmreg);
+    }
+    codeBuffer[nBytes++] = rex;
+    codeBuffer[nBytes++] = 0x0f;
+    codeBuffer[nBytes++] = opcode;
+    BYTE modrm = static_cast<BYTE>((Xmmreg << 3) | reg);
+    codeBuffer[nBytes++] = 0xC0 | modrm;
+
+    _ASSERTE(nBytes <= _countof(codeBuffer));
+
+    // Lastly, emit the encoded bytes
+    EmitBytes(codeBuffer, nBytes);
+}
+
 //---------------------------------------------------------------
 // Helper method for emitting of XMM from/to memory moves
 //---------------------------------------------------------------
@@ -3800,7 +3843,37 @@ VOID StubLinkerCPU::EmitShuffleThunk(ShuffleEntry *pShuffleEntryArray)
 #ifdef UNIX_AMD64_ABI
     for (ShuffleEntry* pEntry = pShuffleEntryArray; pEntry->srcofs != ShuffleEntry::SENTINEL; pEntry++)
     {
-        if (pEntry->srcofs & ShuffleEntry::REGMASK)
+        if (pEntry->srcofs == ShuffleEntry::HELPERREG)
+        {
+            if (pEntry->dstofs & ShuffleEntry::REGMASK)
+            {
+                // movq dstReg, xmm8
+                int dstRegIndex = pEntry->dstofs & ShuffleEntry::OFSREGMASK;
+                X64EmitMovqRegXmm(c_argRegs[dstRegIndex], (X86Reg)kXMM8);
+            }
+            else
+            {
+                // movsd [rax + dst], xmm8
+                int dstOffset = (pEntry->dstofs + 1) * sizeof(void*);
+                X64EmitMovSDToMem((X86Reg)kXMM8, SCRATCH_REGISTER_X86REG, dstOffset);
+            }
+        }
+        else if (pEntry->dstofs == ShuffleEntry::HELPERREG)
+        {
+            if (pEntry->srcofs & ShuffleEntry::REGMASK)
+            {
+                // movq xmm8, srcReg
+                int srcRegIndex = pEntry->srcofs & ShuffleEntry::OFSREGMASK;
+                X64EmitMovqXmmReg((X86Reg)kXMM8, c_argRegs[srcRegIndex]);
+            }
+            else
+            {
+                // movsd xmm8, [rax + src]
+                int srcOffset = (pEntry->srcofs + 1) * sizeof(void*);
+                X64EmitMovSDFromMem((X86Reg)(kXMM8), SCRATCH_REGISTER_X86REG, srcOffset);
+            }
+        }
+        else if (pEntry->srcofs & ShuffleEntry::REGMASK)
         {
             // Source in a general purpose or float register, destination in the same kind of a register or on stack
             int srcRegIndex = pEntry->srcofs & ShuffleEntry::OFSREGMASK;
diff --git a/src/vm/i386/stublinkerx86.h b/src/vm/i386/stublinkerx86.h
index 3d79b51..49731a8 100644
--- a/src/vm/i386/stublinkerx86.h
+++ b/src/vm/i386/stublinkerx86.h
@@ -200,8 +200,11 @@ class StubLinkerCPU : public StubLinker
         VOID X64EmitMovSDToMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs = 0);
         VOID X64EmitMovSSFromMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs = 0);
         VOID X64EmitMovSSToMem(X86Reg Xmmreg, X86Reg baseReg, __int32 ofs = 0);
+        VOID X64EmitMovqRegXmm(X86Reg reg, X86Reg Xmmreg);
+        VOID X64EmitMovqXmmReg(X86Reg Xmmreg, X86Reg reg);
 
         VOID X64EmitMovXmmWorker(BYTE prefix, BYTE opcode, X86Reg Xmmreg, X86Reg baseReg, __int32 ofs = 0);
+        VOID X64EmitMovqWorker(BYTE opcode, X86Reg Xmmreg, X86Reg reg);
 #endif
 
         VOID X86EmitZeroOutReg(X86Reg reg);
-- 
2.7.4
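
The reordering scheme the patch implements can be illustrated with a small
standalone C++ sketch (not CoreCLR code; the names Move, orderMoves, kNoNode
and kHelper are hypothetical, and the types are simplified -- the real code
works on ShuffleEntry offsets and uses xmm8 as the helper location). Moves
are modeled as a graph linking each destination slot back to its source;
acyclic chains are emitted from their ends, and any remaining cycle is broken
through the helper slot:

#include <cstdint>
#include <vector>

struct Move { uint16_t src, dst; };

static const uint16_t kNoNode = 0xffff;  // no predecessor link
static const uint16_t kHelper = 0xfffe;  // stands in for xmm8 in the real stub

// Returns the input moves reordered so that no slot is overwritten before it
// has been read, breaking cycles through the helper slot.
std::vector<Move> orderMoves(const std::vector<Move>& moves, uint16_t slotCount)
{
    struct Node
    {
        uint16_t prev = kNoNode; // source slot feeding this slot
        bool isSource = false;   // some other slot reads from this one
        bool marked = true;      // not participating, or already emitted
    };
    std::vector<Node> nodes(slotCount);

    // Link each destination back to its source and unmark participating nodes.
    for (const Move& m : moves)
    {
        nodes[m.src].marked = false;
        nodes[m.src].isSource = true;
        nodes[m.dst].marked = false;
        nodes[m.dst].prev = m.src;
    }

    std::vector<Move> ordered;

    // Non-cyclic chains: start from a node nothing reads from (a chain end)
    // and walk back toward the chain start, so every destination is written
    // only after its old value is no longer needed.
    for (uint16_t start = 0; start < slotCount; start++)
    {
        if (nodes[start].marked || nodes[start].isSource)
            continue;

        uint16_t dst = kNoNode;
        for (uint16_t i = start; i != kNoNode; i = nodes[i].prev)
        {
            nodes[i].marked = true;
            if (dst != kNoNode)
                ordered.push_back({ i, dst });
            dst = i;
        }
    }

    // Anything still unmarked is part of a cycle: park the first slot in the
    // helper, rotate the remaining slots, then restore the helper into the
    // last slot of the cycle.
    for (uint16_t start = 0; start < slotCount; start++)
    {
        if (nodes[start].marked)
            continue;

        uint16_t dst = kHelper;
        uint16_t i = start;
        do
        {
            nodes[i].marked = true;
            ordered.push_back({ i, dst });
            dst = i;
            i = nodes[i].prev;
        }
        while (i != start);
        ordered.push_back({ kHelper, dst });
    }

    return ordered;
}

For example, the pure swap { {0,1}, {1,0} } comes out as 0->helper, 1->0,
helper->1, which is exactly the three-move sequence the thunk emits through
xmm8, while a plain left shift like { {1,0}, {2,1}, {3,2} } is simply emitted
in that order with no helper moves. As a worked encoding detail (assuming the
usual REX bit values REX_PREFIX_BASE 0x40, REX_OPERAND_SIZE_64BIT 0x08 and
REX_MODRM_REG_EXT 0x04), X64EmitMovqXmmReg(kXMM8, kRAX) should emit the bytes
66 4C 0F 6E C0, i.e. movq xmm8, rax.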