From 13fefb339898648c5dc3cf668f7757f563cb2ce4 Mon Sep 17 00:00:00 2001 From: sivarv Date: Thu, 22 Sep 2016 16:19:08 -0700 Subject: [PATCH] Optimize codegen for SIMDIntrinsicGetItem when SIMD vector is a memory-op. --- src/jit/codegenxarch.cpp | 8 ++-- src/jit/gentree.h | 22 +++++----- src/jit/lower.cpp | 25 +++++++++++- src/jit/lowerxarch.cpp | 78 ++++++++++++++++++++++-------------- src/jit/rationalize.cpp | 2 +- src/jit/simdcodegenxarch.cpp | 53 ++++++++++++++++++++++++ 6 files changed, 141 insertions(+), 47 deletions(-) diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp index 9bd4a1e64f..c1ede9f0f1 100644 --- a/src/jit/codegenxarch.cpp +++ b/src/jit/codegenxarch.cpp @@ -5434,13 +5434,13 @@ regNumber CodeGen::genConsumeReg(GenTree* tree) // Do liveness update for an address tree: one of GT_LEA, GT_LCL_VAR, or GT_CNS_INT (for call indirect). void CodeGen::genConsumeAddress(GenTree* addr) { - if (addr->OperGet() == GT_LEA) + if (!addr->isContained()) { - genConsumeAddrMode(addr->AsAddrMode()); + genConsumeReg(addr); } - else if (!addr->isContained()) + else if (addr->OperGet() == GT_LEA) { - genConsumeReg(addr); + genConsumeAddrMode(addr->AsAddrMode()); } } diff --git a/src/jit/gentree.h b/src/jit/gentree.h index 2eb8219bee..a9e7935c21 100644 --- a/src/jit/gentree.h +++ b/src/jit/gentree.h @@ -867,16 +867,18 @@ public: #define GTF_IND_TLS_REF 0x08000000 // GT_IND -- the target is accessed via TLS #define GTF_IND_ASG_LHS 0x04000000 // GT_IND -- this GT_IND node is (the effective val) of the LHS of an // assignment; don't evaluate it independently. -#define GTF_IND_VSD_TGT GTF_IND_ASG_LHS // GT_IND -- this GT_IND node represents the target of an indirect virtual - // stub call. This is only valid in the backend, where - // GTF_IND_ASG_LHS is not necessary (all such indirections will - // be lowered to GT_STOREIND). -#define GTF_IND_UNALIGNED 0x02000000 // GT_IND -- the load or store is unaligned (we assume worst case - // alignment of 1 byte) -#define GTF_IND_INVARIANT 0x01000000 // GT_IND -- the target is invariant (a prejit indirection) -#define GTF_IND_ARR_LEN 0x80000000 // GT_IND -- the indirection represents an array length (of the REF - // contribution to its argument). -#define GTF_IND_ARR_INDEX 0x00800000 // GT_IND -- the indirection represents an (SZ) array index +#define GTF_IND_REQ_ADDR_IN_REG GTF_IND_ASG_LHS // GT_IND -- requires its addr operand to be evaluated + // into a register. This flag is useful in cases where it + // is required to generate register indirect addressing mode. + // One such case is virtual stub calls on xarch. This is only + // valid in the backend, where GTF_IND_ASG_LHS is not necessary + // (all such indirections will be lowered to GT_STOREIND). +#define GTF_IND_UNALIGNED 0x02000000 // GT_IND -- the load or store is unaligned (we assume worst case + // alignment of 1 byte) +#define GTF_IND_INVARIANT 0x01000000 // GT_IND -- the target is invariant (a prejit indirection) +#define GTF_IND_ARR_LEN 0x80000000 // GT_IND -- the indirection represents an array length (of the REF + // contribution to its argument). +#define GTF_IND_ARR_INDEX 0x00800000 // GT_IND -- the indirection represents an (SZ) array index #define GTF_IND_FLAGS \ (GTF_IND_VOLATILE | GTF_IND_REFARR_LAYOUT | GTF_IND_TGTANYWHERE | GTF_IND_NONFAULTING | GTF_IND_TLS_REF | \ diff --git a/src/jit/lower.cpp b/src/jit/lower.cpp index 1b21855c33..d3ef79fb72 100644 --- a/src/jit/lower.cpp +++ b/src/jit/lower.cpp @@ -192,6 +192,27 @@ GenTree* Lowering::LowerNode(GenTree* node) // produces a TYP_SIMD16 result node->gtType = TYP_SIMD16; } + +#ifdef _TARGET_XARCH_ + if ((node->AsSIMD()->gtSIMDIntrinsicID == SIMDIntrinsicGetItem) && (node->gtGetOp1()->OperGet() == GT_IND)) + { + // If SIMD vector is already in memory, we force its + // addr to be evaluated into a reg. This would allow + // us to generate [regBase] or [regBase+offset] or + // [regBase+sizeOf(SIMD vector baseType)*regIndex] + // to access the required SIMD vector element directly + // from memory. + // + // TODO-CQ-XARCH: If addr of GT_IND is GT_LEA, we + // might be able update GT_LEA to fold the regIndex + // or offset in some cases. Instead with this + // approach we always evaluate GT_LEA into a reg. + // Ideally, we should be able to lower GetItem intrinsic + // into GT_IND(newAddr) where newAddr combines + // the addr of SIMD vector with the given index. + node->gtOp.gtOp1->gtFlags |= GTF_IND_REQ_ADDR_IN_REG; + } +#endif break; case GT_LCL_VAR: @@ -3154,7 +3175,7 @@ GenTree* Lowering::LowerVirtualStubCall(GenTreeCall* call) // All we have to do here is add an indirection to generate the actual call target. GenTree* ind = Ind(call->gtCallAddr); - ind->gtFlags |= GTF_IND_VSD_TGT; + ind->gtFlags |= GTF_IND_REQ_ADDR_IN_REG; BlockRange().InsertAfter(call->gtCallAddr, ind); call->gtCallAddr = ind; @@ -3196,7 +3217,7 @@ GenTree* Lowering::LowerVirtualStubCall(GenTreeCall* call) addr->gtRegNum = REG_VIRTUAL_STUB_PARAM; indir->gtRegNum = REG_JUMP_THUNK_PARAM; - indir->gtFlags |= GTF_IND_VSD_TGT; + indir->gtFlags |= GTF_IND_REQ_ADDR_IN_REG; #endif result = indir; } diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index d17bc87da1..c10e952de4 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -2420,12 +2420,13 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree) info->dstCount = 1; switch (simdTree->gtSIMDIntrinsicID) { + GenTree* op1; GenTree* op2; case SIMDIntrinsicInit: { info->srcCount = 1; - GenTree* op1 = tree->gtOp.gtOp1; + op1 = tree->gtOp.gtOp1; // This sets all fields of a SIMD struct to the given value. // Mark op1 as contained if it is either zero or int constant of all 1's, @@ -2558,11 +2559,13 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree) break; case SIMDIntrinsicGetItem: + { // This implements get_Item method. The sources are: // - the source SIMD struct // - index (which element to get) // The result is baseType of SIMD struct. info->srcCount = 2; + op1 = tree->gtOp.gtOp1; op2 = tree->gtOp.gtOp2; // If the index is a constant, mark it as contained. @@ -2571,39 +2574,55 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree) info->srcCount = 1; } - // If the index is not a constant, we will use the SIMD temp location to store the vector. - // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we - // can use that in the process of extracting the element. - // - // If the index is a constant and base type is a small int we can use pextrw, but on AVX - // we will need a temp if are indexing into the upper half of the AVX register. - // In all other cases with constant index, we need a temp xmm register to extract the - // element if index is other than zero. - - if (!op2->IsCnsIntOrI()) + if (op1->isMemoryOp()) { - (void)comp->getSIMDInitTempVarNum(); + MakeSrcContained(tree, op1); + + // Although GT_IND of TYP_SIMD12 reserves an internal float + // register for reading 4 and 8 bytes from memory and + // assembling them into target XMM reg, it is not required + // in this case. + op1->gtLsraInfo.internalIntCount = 0; + op1->gtLsraInfo.internalFloatCount = 0; } - else if (!varTypeIsFloating(simdTree->gtSIMDBaseType)) + else { - bool needFloatTemp; - if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) && - (comp->getSIMDInstructionSet() == InstructionSet_AVX)) - { - int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType); - needFloatTemp = (byteShiftCnt >= 16); - } - else + // If the index is not a constant, we will use the SIMD temp location to store the vector. + // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we + // can use that in the process of extracting the element. + // + // If the index is a constant and base type is a small int we can use pextrw, but on AVX + // we will need a temp if are indexing into the upper half of the AVX register. + // In all other cases with constant index, we need a temp xmm register to extract the + // element if index is other than zero. + + if (!op2->IsCnsIntOrI()) { - needFloatTemp = !op2->IsIntegralConst(0); + (void)comp->getSIMDInitTempVarNum(); } - if (needFloatTemp) + else if (!varTypeIsFloating(simdTree->gtSIMDBaseType)) { - info->internalFloatCount = 1; - info->setInternalCandidates(lsra, lsra->allSIMDRegs()); + bool needFloatTemp; + if (varTypeIsSmallInt(simdTree->gtSIMDBaseType) && + (comp->getSIMDInstructionSet() == InstructionSet_AVX)) + { + int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->gtSIMDBaseType); + needFloatTemp = (byteShiftCnt >= 16); + } + else + { + needFloatTemp = !op2->IsIntegralConst(0); + } + + if (needFloatTemp) + { + info->internalFloatCount = 1; + info->setInternalCandidates(lsra, lsra->allSIMDRegs()); + } } } - break; + } + break; case SIMDIntrinsicSetX: case SIMDIntrinsicSetY: @@ -2854,11 +2873,10 @@ void Lowering::SetIndirAddrOpCounts(GenTreePtr indirTree) } #endif // FEATURE_SIMD - if ((indirTree->gtFlags & GTF_IND_VSD_TGT) != 0) + if ((indirTree->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0) { - // The address of an indirection that is marked as the target of a VSD call must - // be computed into a register. Skip any further processing that might otherwise - // make it contained. + // The address of an indirection that requires its address in a reg. + // Skip any further processing that might otherwise make it contained. } else if ((addr->OperGet() == GT_CLS_VAR_ADDR) || (addr->OperGet() == GT_LCL_VAR_ADDR)) { diff --git a/src/jit/rationalize.cpp b/src/jit/rationalize.cpp index 1a9c38aa5f..17fd9fcfd5 100644 --- a/src/jit/rationalize.cpp +++ b/src/jit/rationalize.cpp @@ -662,7 +662,7 @@ Compiler::fgWalkResult Rationalizer::RewriteNode(GenTree** useEdge, ArrayStackgtFlags &= ~GTF_IND_ASG_LHS; break; diff --git a/src/jit/simdcodegenxarch.cpp b/src/jit/simdcodegenxarch.cpp index 61b383c6ce..007b0d695b 100644 --- a/src/jit/simdcodegenxarch.cpp +++ b/src/jit/simdcodegenxarch.cpp @@ -1440,6 +1440,59 @@ void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode) genConsumeOperands(simdNode); regNumber srcReg = op1->gtRegNum; + // Optimize the case of op1 is in memory and trying to access ith element. + if (op1->isMemoryOp()) + { + assert(op1->isContained()); + + regNumber baseReg; + regNumber indexReg; + int offset = 0; + + if (op1->OperGet() == GT_LCL_FLD) + { + // There are three parts to the total offset here: + // {offset of local} + {offset of SIMD Vector field} + {offset of element within SIMD vector}. + bool isEBPbased; + unsigned varNum = op1->gtLclVarCommon.gtLclNum; + offset += compiler->lvaFrameAddress(varNum, &isEBPbased); + offset += op1->gtLclFld.gtLclOffs; + + baseReg = (isEBPbased) ? REG_EBP : REG_ESP; + } + else + { + // Require GT_IND addr to be not contained. + assert(op1->OperGet() == GT_IND); + + GenTree* addr = op1->AsIndir()->Addr(); + assert(!addr->isContained()); + baseReg = addr->gtRegNum; + } + + if (op2->isContainedIntOrIImmed()) + { + indexReg = REG_NA; + offset += (int)op2->AsIntConCommon()->IconValue() * genTypeSize(baseType); + } + else + { + indexReg = op2->gtRegNum; + assert(genIsValidIntReg(indexReg)); + } + + // Now, load the desired element. + getEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load + emitTypeSize(baseType), // Of the vector baseType + targetReg, // To targetReg + baseReg, // Base Reg + indexReg, // Indexed + genTypeSize(baseType), // by the size of the baseType + offset); + genProduceReg(simdNode); + return; + } + // SSE2 doesn't have an instruction to implement this intrinsic if the index is not a constant. // For the non-constant case, we will use the SIMD temp location to store the vector, and // the load the desired element. -- 2.34.1