From: Hyeongseok Oh Date: Fri, 20 Oct 2017 17:29:44 +0000 (+0900) Subject: [RyuJIT/ARM32] Fast tail call: code generation (#14445) X-Git-Tag: accepted/tizen/base/20180629.140029~836 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c87cbf1b7496562e0cb7f94685ee5ea3de72c0f0;p=platform%2Fupstream%2Fcoreclr.git [RyuJIT/ARM32] Fast tail call: code generation (#14445) * Codegen for fast tail call Codegen call and epilog for fast tail call * Implementation for GT_START_NONGC This implementation removes two NYI_ARM Code generation for GT_START_NONGC which is used to prevent GC in fast tail call * Define fast tail call target register and mask on ARMARCH Define REG_FASTTAILCALL_TARGET and RBM_FASTTAILCALL_TARGET on ARMARCH Modify lsra init and codegen to use these definition * Merge genFnEpilog Merge genFnEpilog for ARM32 and ARM64 * Fix bug in getFirstArgWithStackSlot Fix bug in getFirstArgWithStackSlot: AMD64 and X86 --- diff --git a/src/jit/codegenarmarch.cpp b/src/jit/codegenarmarch.cpp index 6d4660f..4d8cc1e 100644 --- a/src/jit/codegenarmarch.cpp +++ b/src/jit/codegenarmarch.cpp @@ -69,12 +69,11 @@ void CodeGen::genCodeForTreeNode(GenTreePtr treeNode) switch (treeNode->gtOper) { -#ifdef _TARGET_ARM64_ - case GT_START_NONGC: getEmitter()->emitDisableGC(); break; +#ifdef _TARGET_ARM64_ case GT_PROF_HOOK: // We should be seeing this only if profiler hook is needed noway_assert(compiler->compIsProfilerHookNeeded()); @@ -567,9 +566,6 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* treeNode) // All other calls - stk arg is setup in out-going arg area. if (treeNode->putInIncomingArgArea()) { - NYI_ARM("genPutArgStk: fast tail call"); - -#ifdef _TARGET_ARM64_ varNumOut = getFirstArgWithStackSlot(); argOffsetMax = compiler->compArgSize; #if FEATURE_FASTTAILCALL @@ -582,7 +578,6 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* treeNode) LclVarDsc* varDsc = &(compiler->lvaTable[varNumOut]); assert(varDsc != nullptr); #endif // FEATURE_FASTTAILCALL -#endif // _TARGET_ARM64_ } else { @@ -2266,15 +2261,11 @@ void CodeGen::genCallInstruction(GenTreeCall* call) genConsumeReg(target); - NYI_ARM("fast tail call"); - -#ifdef _TARGET_ARM64_ - // Use IP0 as the call target register. - if (target->gtRegNum != REG_IP0) + // Use IP0 on ARM64 and R12 on ARM32 as the call target register. + if (target->gtRegNum != REG_FASTTAILCALL_TARGET) { - inst_RV_RV(INS_mov, REG_IP0, target->gtRegNum); + inst_RV_RV(INS_mov, REG_FASTTAILCALL_TARGET, target->gtRegNum); } -#endif // _TARGET_ARM64_ return; } diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp index 80550b0..64f38ae 100644 --- a/src/jit/codegencommon.cpp +++ b/src/jit/codegencommon.cpp @@ -9406,14 +9406,14 @@ void CodeGen::genFnProlog() * Please consult the "debugger team notification" comment in genFnProlog(). */ -#if defined(_TARGET_ARM_) +#if defined(_TARGET_ARMARCH_) void CodeGen::genFnEpilog(BasicBlock* block) { #ifdef DEBUG if (verbose) printf("*************** In genFnEpilog()\n"); -#endif +#endif // DEBUG ScopedSetVariable _setGeneratingEpilog(&compiler->compGeneratingEpilog, true); @@ -9437,10 +9437,11 @@ void CodeGen::genFnEpilog(BasicBlock* block) getEmitter()->emitDispRegSet(gcInfo.gcRegByrefSetCur); printf("\n"); } -#endif +#endif // DEBUG bool jmpEpilog = ((block->bbFlags & BBF_HAS_JMP) != 0); +#ifdef _TARGET_ARM_ // We delay starting the unwind codes until we have an instruction which we know // needs an unwind code. In particular, for large stack frames in methods without // localloc, the sequence might look something like this: @@ -9496,148 +9497,28 @@ void CodeGen::genFnEpilog(BasicBlock* block) compiler->unwindAllocStack(preSpillRegArgSize); } - if (jmpEpilog) - { - noway_assert(block->bbJumpKind == BBJ_RETURN); - noway_assert(block->bbTreeList); - - // We better not have used a pop PC to return otherwise this will be unreachable code - noway_assert(!genUsedPopToReturn); - - /* figure out what jump we have */ - - GenTree* jmpNode = block->lastNode(); - noway_assert(jmpNode->gtOper == GT_JMP); - - CORINFO_METHOD_HANDLE methHnd = (CORINFO_METHOD_HANDLE)jmpNode->gtVal.gtVal1; - - CORINFO_CONST_LOOKUP addrInfo; - void* addr; - regNumber indCallReg; - emitter::EmitCallType callType; - - compiler->info.compCompHnd->getFunctionEntryPoint(methHnd, &addrInfo); - switch (addrInfo.accessType) - { - case IAT_VALUE: - if (arm_Valid_Imm_For_BL((ssize_t)addrInfo.addr)) - { - // Simple direct call - callType = emitter::EC_FUNC_TOKEN; - addr = addrInfo.addr; - indCallReg = REG_NA; - break; - } - - // otherwise the target address doesn't fit in an immediate - // so we have to burn a register... - __fallthrough; - - case IAT_PVALUE: - // Load the address into a register, load indirect and call through a register - // We have to use R12 since we assume the argument registers are in use - callType = emitter::EC_INDIR_R; - indCallReg = REG_R12; - addr = NULL; - instGen_Set_Reg_To_Imm(EA_HANDLE_CNS_RELOC, indCallReg, (ssize_t)addrInfo.addr); - if (addrInfo.accessType == IAT_PVALUE) - { - getEmitter()->emitIns_R_R_I(INS_ldr, EA_PTRSIZE, indCallReg, indCallReg, 0); - regTracker.rsTrackRegTrash(indCallReg); - } - break; - - case IAT_PPVALUE: - default: - NO_WAY("Unsupported JMP indirection"); - } - - /* Simply emit a jump to the methodHnd. This is similar to a call so we can use - * the same descriptor with some minor adjustments. - */ - - // clang-format off - getEmitter()->emitIns_Call(callType, - methHnd, - INDEBUG_LDISASM_COMMA(nullptr) - addr, - 0, // argSize - EA_UNKNOWN, // retSize - gcInfo.gcVarPtrSetCur, - gcInfo.gcRegGCrefSetCur, - gcInfo.gcRegByrefSetCur, - BAD_IL_OFFSET, // IL offset - indCallReg, // ireg - REG_NA, // xreg - 0, // xmul - 0, // disp - true); // isJump - // clang-format on - } - else - { - if (!genUsedPopToReturn) - { - // If we did not use a pop to return, then we did a "pop {..., lr}" instead of "pop {..., pc}", - // so we need a "bx lr" instruction to return from the function. - inst_RV(INS_bx, REG_LR, TYP_I_IMPL); - compiler->unwindBranch16(); - } - } - - compiler->unwindEndEpilog(); -} - -#elif defined(_TARGET_ARM64_) - -void CodeGen::genFnEpilog(BasicBlock* block) -{ -#ifdef DEBUG - if (verbose) - printf("*************** In genFnEpilog()\n"); -#endif - - ScopedSetVariable _setGeneratingEpilog(&compiler->compGeneratingEpilog, true); - - VarSetOps::Assign(compiler, gcInfo.gcVarPtrSetCur, getEmitter()->emitInitGCrefVars); - gcInfo.gcRegGCrefSetCur = getEmitter()->emitInitGCrefRegs; - gcInfo.gcRegByrefSetCur = getEmitter()->emitInitByrefRegs; - -#ifdef DEBUG - if (compiler->opts.dspCode) - printf("\n__epilog:\n"); - - if (verbose) - { - printf("gcVarPtrSetCur=%s ", VarSetOps::ToString(compiler, gcInfo.gcVarPtrSetCur)); - dumpConvertedVarSet(compiler, gcInfo.gcVarPtrSetCur); - printf(", gcRegGCrefSetCur="); - printRegMaskInt(gcInfo.gcRegGCrefSetCur); - getEmitter()->emitDispRegSet(gcInfo.gcRegGCrefSetCur); - printf(", gcRegByrefSetCur="); - printRegMaskInt(gcInfo.gcRegByrefSetCur); - getEmitter()->emitDispRegSet(gcInfo.gcRegByrefSetCur); - printf("\n"); - } -#endif - - bool jmpEpilog = ((block->bbFlags & BBF_HAS_JMP) != 0); - +#else // _TARGET_ARM64_ compiler->unwindBegEpilog(); genPopCalleeSavedRegistersAndFreeLclFrame(jmpEpilog); +#endif // _TARGET_ARM64_ if (jmpEpilog) { noway_assert(block->bbJumpKind == BBJ_RETURN); noway_assert(block->bbTreeList != nullptr); - // figure out what jump we have +#ifdef _TARGET_ARM_ + // We better not have used a pop PC to return otherwise this will be unreachable code + noway_assert(!genUsedPopToReturn); +#endif // _TARGET_ARM_ + + /* figure out what jump we have */ GenTree* jmpNode = block->lastNode(); #if !FEATURE_FASTTAILCALL noway_assert(jmpNode->gtOper == GT_JMP); -#else - // arm64 +#else // FEATURE_FASTTAILCALL + // armarch // If jmpNode is GT_JMP then gtNext must be null. // If jmpNode is a fast tail call, gtNext need not be null since it could have embedded stmts. noway_assert((jmpNode->gtOper != GT_JMP) || (jmpNode->gtNext == nullptr)); @@ -9648,7 +9529,7 @@ void CodeGen::genFnEpilog(BasicBlock* block) // The next block is associated with this "if" stmt if (jmpNode->gtOper == GT_JMP) -#endif +#endif // FEATURE_FASTTAILCALL { // Simply emit a jump to the methodHnd. This is similar to a call so we can use // the same descriptor with some minor adjustments. @@ -9656,6 +9537,70 @@ void CodeGen::genFnEpilog(BasicBlock* block) CORINFO_CONST_LOOKUP addrInfo; compiler->info.compCompHnd->getFunctionEntryPoint(methHnd, &addrInfo); + +#ifdef _TARGET_ARM_ + emitter::EmitCallType callType; + void* addr; + regNumber indCallReg; + switch (addrInfo.accessType) + { + case IAT_VALUE: + if (arm_Valid_Imm_For_BL((ssize_t)addrInfo.addr)) + { + // Simple direct call + callType = emitter::EC_FUNC_TOKEN; + addr = addrInfo.addr; + indCallReg = REG_NA; + break; + } + + // otherwise the target address doesn't fit in an immediate + // so we have to burn a register... + __fallthrough; + + case IAT_PVALUE: + // Load the address into a register, load indirect and call through a register + // We have to use R12 since we assume the argument registers are in use + callType = emitter::EC_INDIR_R; + indCallReg = REG_R12; + addr = NULL; + instGen_Set_Reg_To_Imm(EA_HANDLE_CNS_RELOC, indCallReg, (ssize_t)addrInfo.addr); + if (addrInfo.accessType == IAT_PVALUE) + { + getEmitter()->emitIns_R_R_I(INS_ldr, EA_PTRSIZE, indCallReg, indCallReg, 0); + regTracker.rsTrackRegTrash(indCallReg); + } + break; + + case IAT_PPVALUE: + default: + NO_WAY("Unsupported JMP indirection"); + } + + /* Simply emit a jump to the methodHnd. This is similar to a call so we can use + * the same descriptor with some minor adjustments. + */ + + // clang-format off + getEmitter()->emitIns_Call(callType, + methHnd, + INDEBUG_LDISASM_COMMA(nullptr) + addr, + 0, // argSize + EA_UNKNOWN, // retSize + gcInfo.gcVarPtrSetCur, + gcInfo.gcRegGCrefSetCur, + gcInfo.gcRegByrefSetCur, + BAD_IL_OFFSET, // IL offset + indCallReg, // ireg + REG_NA, // xreg + 0, // xmul + 0, // disp + true); // isJump + // clang-format on + CLANG_FORMAT_COMMENT_ANCHOR; + +#else // _TARGET_ARM64_ if (addrInfo.accessType != IAT_VALUE) { NYI_ARM64("Unsupported JMP indirection"); @@ -9680,22 +9625,35 @@ void CodeGen::genFnEpilog(BasicBlock* block) BAD_IL_OFFSET, REG_NA, REG_NA, 0, 0, /* iloffset, ireg, xreg, xmul, disp */ true); /* isJump */ // clang-format on + CLANG_FORMAT_COMMENT_ANCHOR; + +#endif // _TARGET_ARM64_ } #if FEATURE_FASTTAILCALL else { // Fast tail call. - // Call target = REG_IP0. + // Call target = REG_FASTTAILCALL_TARGET // https://github.com/dotnet/coreclr/issues/4827 // Do we need a special encoding for stack walker like rex.w prefix for x64? - getEmitter()->emitIns_R(INS_br, emitTypeSize(TYP_I_IMPL), REG_IP0); + getEmitter()->emitIns_R(INS_br, emitTypeSize(TYP_I_IMPL), REG_FASTTAILCALL_TARGET); } #endif // FEATURE_FASTTAILCALL } else { +#ifdef _TARGET_ARM_ + if (!genUsedPopToReturn) + { + // If we did not use a pop to return, then we did a "pop {..., lr}" instead of "pop {..., pc}", + // so we need a "bx lr" instruction to return from the function. + inst_RV(INS_bx, REG_LR, TYP_I_IMPL); + compiler->unwindBranch16(); + } +#else // _TARGET_ARM64_ inst_RV(INS_ret, REG_LR, TYP_I_IMPL); compiler->unwindReturn(REG_LR); +#endif // _TARGET_ARM64_ } compiler->unwindEndEpilog(); @@ -11373,7 +11331,7 @@ instruction CodeGen::genMapShiftInsToShiftByConstantIns(instruction ins, int shi #endif // _TARGET_XARCH_ -#if !defined(LEGACY_BACKEND) && (defined(_TARGET_XARCH_) || defined(_TARGET_ARM64_)) +#if !defined(LEGACY_BACKEND) //------------------------------------------------------------------------------------------------ // // getFirstArgWithStackSlot - returns the first argument with stack slot on the caller's frame. @@ -11385,13 +11343,13 @@ instruction CodeGen::genMapShiftInsToShiftByConstantIns(instruction ins, int shi // On x64 Windows the caller always creates slots (homing space) in its frame for the // first 4 arguments of a callee (register passed args). So, the the variable number // (lclNum) for the first argument with a stack slot is always 0. -// For System V systems or arm64, there is no such calling convention requirement, and the code needs to find +// For System V systems or armarch, there is no such calling convention requirement, and the code needs to find // the first stack passed argument from the caller. This is done by iterating over // all the lvParam variables and finding the first with lvArgReg equals to REG_STK. // unsigned CodeGen::getFirstArgWithStackSlot() { -#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) || defined(_TARGET_ARM64_) +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) || defined(_TARGET_ARMARCH_) unsigned baseVarNum = 0; #if defined(FEATURE_UNIX_AMR64_STRUCT_PASSING) baseVarNum = compiler->lvaFirstStackIncomingArgNum; @@ -11429,14 +11387,14 @@ unsigned CodeGen::getFirstArgWithStackSlot() return baseVarNum; #elif defined(_TARGET_AMD64_) return 0; -#else +#else // _TARGET_X86 // Not implemented for x86. NYI_X86("getFirstArgWithStackSlot not yet implemented for x86."); return BAD_VAR_NUM; -#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING || _TARGET_ARM64_ +#endif // _TARGET_X86_ } -#endif // !LEGACY_BACKEND && (_TARGET_XARCH_ || _TARGET_ARM64_) +#endif // !LEGACY_BACKEND //------------------------------------------------------------------------ // genSinglePush: Report a change in stack level caused by a single word-sized push instruction diff --git a/src/jit/codegenlinear.h b/src/jit/codegenlinear.h index 2cee11d..fd5b1fc 100644 --- a/src/jit/codegenlinear.h +++ b/src/jit/codegenlinear.h @@ -53,9 +53,7 @@ void genPutArgSplit(GenTreePutArgSplit* treeNode); unsigned getBaseVarForPutArgStk(GenTreePtr treeNode); #endif // _TARGET_XARCH_ -#if defined(_TARGET_XARCH_) || defined(_TARGET_ARM64_) unsigned getFirstArgWithStackSlot(); -#endif // _TARGET_XARCH_ || _TARGET_ARM64_ void genCompareFloat(GenTreePtr treeNode); void genCompareInt(GenTreePtr treeNode); diff --git a/src/jit/lsraarmarch.cpp b/src/jit/lsraarmarch.cpp index b3e9bfa..1defa6f 100644 --- a/src/jit/lsraarmarch.cpp +++ b/src/jit/lsraarmarch.cpp @@ -395,15 +395,9 @@ void LinearScan::TreeNodeInfoInitCall(GenTreeCall* call) // computed into a register. if (call->IsFastTailCall()) { -#ifdef _TARGET_ARM64_ - // Fast tail call - make sure that call target is always computed in IP0 - // so that epilog sequence can generate "br xip0" to achieve fast tail call. - ctrlExpr->gtLsraInfo.setSrcCandidates(this, genRegMask(REG_IP0)); -#else // !_TARGET_ARM64_ - // Fast tail call - make sure that call target is always computed in r12 - // so that epilog sequence can generate "br r12" to achieve fast tail call. - ctrlExpr->gtLsraInfo.setSrcCandidates(this, RBM_R12); -#endif // !_TARGET_ARM64_ + // Fast tail call - make sure that call target is always computed in R12(ARM32)/IP0(ARM64) + // so that epilog sequence can generate "br xip0/r12" to achieve fast tail call. + ctrlExpr->gtLsraInfo.setSrcCandidates(this, RBM_FASTTAILCALL_TARGET); } } #ifdef _TARGET_ARM_ diff --git a/src/jit/target.h b/src/jit/target.h index 3cd5244..4dafd54 100644 --- a/src/jit/target.h +++ b/src/jit/target.h @@ -1248,6 +1248,8 @@ typedef unsigned short regPairNoSmall; // arm: need 12 bits #define RBM_CALLEE_TRASH_NOGC RBM_CALLEE_TRASH #endif #define REG_DEFAULT_HELPER_CALL_TARGET REG_R12 + #define REG_FASTTAILCALL_TARGET REG_R12 // Target register for fast tail call + #define RBM_FASTTAILCALL_TARGET RBM_R12 #define RBM_ALLINT (RBM_INT_CALLEE_SAVED | RBM_INT_CALLEE_TRASH) #define RBM_ALLFLOAT (RBM_FLT_CALLEE_SAVED | RBM_FLT_CALLEE_TRASH) @@ -1587,6 +1589,8 @@ typedef unsigned short regPairNoSmall; // arm: need 12 bits #define RBM_CALLEE_TRASH (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH) #define RBM_CALLEE_TRASH_NOGC (RBM_R12|RBM_R13|RBM_R14|RBM_R15|RBM_IP1) #define REG_DEFAULT_HELPER_CALL_TARGET REG_R12 + #define REG_FASTTAILCALL_TARGET REG_IP0 // Target register for fast tail call + #define RBM_FASTTAILCALL_TARGET RBM_IP0 #define RBM_ALLINT (RBM_INT_CALLEE_SAVED | RBM_INT_CALLEE_TRASH) #define RBM_ALLFLOAT (RBM_FLT_CALLEE_SAVED | RBM_FLT_CALLEE_TRASH)