From 1634631ae46e460a75f9c243e8b6368a7aa78e3c Mon Sep 17 00:00:00 2001
From: "jacob.bramley@arm.com" <jacob.bramley@arm.com>
Date: Mon, 17 Feb 2014 12:08:06 +0000
Subject: [PATCH] A64: Tidy up Push and Pop TODOs.

This addresses several TODOs:

 - Push and Pop requests can be queued up so that arrays of Registers can
   be pushed efficiently, with just one PrepareForPush/Pop.
 - PushMultipleTimes can now take its count in a register as well as an
   immediate. This allows variable-length argument arrays to be
   initialized, for example.
 - A NoUseRealAbortsScope has been added to Abort so that
   AssertStackConsistency can be called from PrepareForPush without
   introducing infinite recursion.

BUG=
R=ulan@chromium.org

Review URL: https://codereview.chromium.org/169533002

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@19403 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
---
 src/a64/code-stubs-a64.cc         |   7 +-
 src/a64/full-codegen-a64.cc       |  25 +-----
 src/a64/macro-assembler-a64-inl.h |   5 +-
 src/a64/macro-assembler-a64.cc    | 148 +++++++++++++++++++++++++++----
 src/a64/macro-assembler-a64.h     |  43 +++++++++-
 src/a64/stub-cache-a64.cc         |  10 ++-
 src/objects.h                     |   1 +
 test/cctest/test-assembler-a64.cc | 169 +++++++++++++++++++++++++++++++++++++-
 8 files changed, 356 insertions(+), 52 deletions(-)

diff --git a/src/a64/code-stubs-a64.cc b/src/a64/code-stubs-a64.cc
index 9750a61..036e59a 100644
--- a/src/a64/code-stubs-a64.cc
+++ b/src/a64/code-stubs-a64.cc
@@ -523,11 +523,14 @@ void HydrogenCodeStub::GenerateLightweightMiss(MacroAssembler* masm) {
     FrameScope scope(masm, StackFrame::INTERNAL);
     ASSERT((descriptor->register_param_count_ == 0) ||
            x0.Is(descriptor->register_params_[param_count - 1]));
+
     // Push arguments
-    // TODO(jbramley): Try to push these in blocks.
+    MacroAssembler::PushPopQueue queue(masm);
     for (int i = 0; i < param_count; ++i) {
-      __ Push(descriptor->register_params_[i]);
+      queue.Queue(descriptor->register_params_[i]);
     }
+    queue.PushQueued();
+
     ExternalReference miss = descriptor->miss_handler();
     __ CallExternalReference(miss, descriptor->register_param_count_);
   }
diff --git a/src/a64/full-codegen-a64.cc b/src/a64/full-codegen-a64.cc
index eca3f06..17bad86 100644
--- a/src/a64/full-codegen-a64.cc
+++ b/src/a64/full-codegen-a64.cc
@@ -183,7 +183,7 @@ void FullCodeGenerator::Generate() {
 
     if (locals_count > 0) {
       __ LoadRoot(x10, Heap::kUndefinedValueRootIndex);
-      __ PushMultipleTimes(locals_count, x10);
+      __ PushMultipleTimes(x10, locals_count);
     }
   }
 
@@ -4625,20 +4625,11 @@ void FullCodeGenerator::EmitGeneratorResume(Expression *generator,
   __ Ldr(w10,
          FieldMemOperand(x10, SharedFunctionInfo::kFormalParameterCountOffset));
   __ LoadRoot(the_hole, Heap::kTheHoleValueRootIndex);
-
-  // TODO(jbramley): Write a variant of PushMultipleTimes which takes a register
-  // instead of a constant count, and use it to replace this loop.
-  Label push_argument_holes, push_frame;
-  __ Bind(&push_argument_holes);
-  __ Subs(w10, w10, 1);
-  __ B(mi, &push_frame);
-  __ Push(the_hole);
-  __ B(&push_argument_holes);
+  __ PushMultipleTimes(the_hole, w10);
 
   // Enter a new JavaScript frame, and initialize its slots as they were when
   // the generator was suspended.
   Label resume_frame;
-  __ Bind(&push_frame);
   __ Bl(&resume_frame);
   __ B(&done);
 
@@ -4675,16 +4666,8 @@ void FullCodeGenerator::EmitGeneratorResume(Expression *generator,
 
   // Otherwise, we push holes for the operand stack and call the runtime to fix
   // up the stack and the handlers.
-  // TODO(jbramley): Write a variant of PushMultipleTimes which takes a register
-  // instead of a constant count, and use it to replace this loop.
-  Label push_operand_holes, call_resume;
-  __ Bind(&push_operand_holes);
-  __ Subs(operand_stack_size, operand_stack_size, 1);
-  __ B(mi, &call_resume);
-  __ Push(the_hole);
-  __ B(&push_operand_holes);
-
-  __ Bind(&call_resume);
+  __ PushMultipleTimes(the_hole, operand_stack_size);
+
   __ Mov(x10, Operand(Smi::FromInt(resume_mode)));
   __ Push(generator_object, result_register(), x10);
   __ CallRuntime(Runtime::kResumeJSGeneratorObject, 3);
diff --git a/src/a64/macro-assembler-a64-inl.h b/src/a64/macro-assembler-a64-inl.h
index 53ce57f..271fd89 100644
--- a/src/a64/macro-assembler-a64-inl.h
+++ b/src/a64/macro-assembler-a64-inl.h
@@ -1286,8 +1286,9 @@ void MacroAssembler::BumpSystemStackPointer(const Operand& space) {
   ASSERT(!csp.Is(sp_));
   // TODO(jbramley): Several callers rely on this not using scratch registers,
   // so we use the assembler directly here. However, this means that large
-  // immediate values of 'space' cannot be handled. Once we merge with V8, we
-  // should try to use the new scope that controls scratch register usage.
+  // immediate values of 'space' cannot be handled cleanly. Once we implement
+  // our flexible scratch register idea, we could greatly simplify this
+  // function.
   InstructionAccurateScope scope(this);
   if ((space.IsImmediate()) && !is_uint12(space.immediate())) {
     // The subtract instruction supports a 12-bit immediate, shifted left by
diff --git a/src/a64/macro-assembler-a64.cc b/src/a64/macro-assembler-a64.cc
index b51e927..1e8da54 100644
--- a/src/a64/macro-assembler-a64.cc
+++ b/src/a64/macro-assembler-a64.cc
@@ -623,6 +623,56 @@ void MacroAssembler::Pop(const CPURegister& dst0, const CPURegister& dst1,
 }
 
 
+void MacroAssembler::PushPopQueue::PushQueued() {
+  if (queued_.empty()) return;
+
+  masm_->PrepareForPush(size_);
+
+  int count = queued_.size();
+  int index = 0;
+  while (index < count) {
+    // PushHelper can only handle registers with the same size and type, and it
+    // can handle only four at a time. Batch them up accordingly.
+    CPURegister batch[4] = {NoReg, NoReg, NoReg, NoReg};
+    int batch_index = 0;
+    do {
+      batch[batch_index++] = queued_[index++];
+    } while ((batch_index < 4) && (index < count) &&
+             batch[0].IsSameSizeAndType(queued_[index]));
+
+    masm_->PushHelper(batch_index, batch[0].SizeInBytes(),
+                      batch[0], batch[1], batch[2], batch[3]);
+  }
+
+  queued_.clear();
+}
+
+
+void MacroAssembler::PushPopQueue::PopQueued() {
+  if (queued_.empty()) return;
+
+  masm_->PrepareForPop(size_);
+
+  int count = queued_.size();
+  int index = 0;
+  while (index < count) {
+    // PopHelper can only handle registers with the same size and type, and it
+    // can handle only four at a time. Batch them up accordingly.
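+    // (The batching below mirrors PushQueued above: a batch also ends as soon
+    // as the next queued register differs in size or type from batch[0].)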
+    CPURegister batch[4] = {NoReg, NoReg, NoReg, NoReg};
+    int batch_index = 0;
+    do {
+      batch[batch_index++] = queued_[index++];
+    } while ((batch_index < 4) && (index < count) &&
+             batch[0].IsSameSizeAndType(queued_[index]));
+
+    masm_->PopHelper(batch_index, batch[0].SizeInBytes(),
+                     batch[0], batch[1], batch[2], batch[3]);
+  }
+
+  queued_.clear();
+}
+
+
 void MacroAssembler::PushCPURegList(CPURegList registers) {
   int size = registers.RegisterSizeInBytes();
 
@@ -668,7 +718,7 @@ void MacroAssembler::PopCPURegList(CPURegList registers) {
 }
 
 
-void MacroAssembler::PushMultipleTimes(int count, Register src) {
+void MacroAssembler::PushMultipleTimes(CPURegister src, int count) {
   int size = src.SizeInBytes();
 
   PrepareForPush(count, size);
@@ -703,6 +753,51 @@ void MacroAssembler::PushMultipleTimes(int count, Register src) {
 }
 
 
+void MacroAssembler::PushMultipleTimes(CPURegister src, Register count) {
+  PrepareForPush(Operand(count, UXTW, WhichPowerOf2(src.SizeInBytes())));
+
+  Register temp = AppropriateTempFor(count);
+
+  if (FLAG_optimize_for_size) {
+    Label loop, done;
+
+    Subs(temp, count, 1);
+    B(mi, &done);
+
+    // Push all registers individually, to save code size.
+    Bind(&loop);
+    Subs(temp, temp, 1);
+    PushHelper(1, src.SizeInBytes(), src, NoReg, NoReg, NoReg);
+    B(pl, &loop);
+
+    Bind(&done);
+  } else {
+    Label loop, leftover2, leftover1, done;
+
+    Subs(temp, count, 4);
+    B(mi, &leftover2);
+
+    // Push groups of four first.
+    Bind(&loop);
+    Subs(temp, temp, 4);
+    PushHelper(4, src.SizeInBytes(), src, src, src, src);
+    B(pl, &loop);
+
+    // Push groups of two.
+    Bind(&leftover2);
+    Tbz(count, 1, &leftover1);
+    PushHelper(2, src.SizeInBytes(), src, src, NoReg, NoReg);
+
+    // Push the last one (if required).
+    Bind(&leftover1);
+    Tbz(count, 0, &done);
+    PushHelper(1, src.SizeInBytes(), src, NoReg, NoReg, NoReg);
+
+    Bind(&done);
+  }
+}
+
+
 void MacroAssembler::PushHelper(int count, int size,
                                 const CPURegister& src0,
                                 const CPURegister& src1,
@@ -784,30 +879,39 @@ void MacroAssembler::PopHelper(int count, int size,
 }
 
 
-void MacroAssembler::PrepareForPush(int count, int size) {
-  // TODO(jbramley): Use AssertStackConsistency here, if possible. See the
-  // AssertStackConsistency for details of why we can't at the moment.
+void MacroAssembler::PrepareForPush(Operand total_size) {
+  AssertStackConsistency();
   if (csp.Is(StackPointer())) {
     // If the current stack pointer is csp, then it must be aligned to 16 bytes
     // on entry and the total size of the specified registers must also be a
     // multiple of 16 bytes.
-    ASSERT((count * size) % 16 == 0);
+    if (total_size.IsImmediate()) {
+      ASSERT((total_size.immediate() % 16) == 0);
+    }
+
+    // Don't check access size for non-immediate sizes. It's difficult to do
+    // well, and it will be caught by hardware (or the simulator) anyway.
   } else {
     // Even if the current stack pointer is not the system stack pointer (csp),
     // the system stack pointer will still be modified in order to comply with
     // ABI rules about accessing memory below the system stack pointer.
-    BumpSystemStackPointer(count * size);
+    BumpSystemStackPointer(total_size);
   }
 }
 
 
-void MacroAssembler::PrepareForPop(int count, int size) {
+void MacroAssembler::PrepareForPop(Operand total_size) {
   AssertStackConsistency();
   if (csp.Is(StackPointer())) {
     // If the current stack pointer is csp, then it must be aligned to 16 bytes
     // on entry and the total size of the specified registers must also be a
    // multiple of 16 bytes.
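+    // (Note that, unlike PrepareForPush, there is no BumpSystemStackPointer
+    // call here: a pop only reads memory at or above the stack pointer, so it
+    // never touches memory below csp.)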
-    ASSERT((count * size) % 16 == 0);
+    if (total_size.IsImmediate()) {
+      ASSERT((total_size.immediate() % 16) == 0);
+    }
+
+    // Don't check access size for non-immediate sizes. It's difficult to do
+    // well, and it will be caught by hardware (or the simulator) anyway.
   }
 }
 
@@ -903,15 +1007,24 @@ void MacroAssembler::PopCalleeSavedRegisters() {
 
 
 void MacroAssembler::AssertStackConsistency() {
-  if (emit_debug_code() && !csp.Is(StackPointer())) {
+  if (emit_debug_code()) {
     if (csp.Is(StackPointer())) {
-      // TODO(jbramley): Check for csp alignment if it is the stack pointer.
-    } else {
-      // TODO(jbramley): Currently we cannot use this assertion in Push because
-      // some calling code assumes that the flags are preserved. For an example,
-      // look at Builtins::Generate_ArgumentsAdaptorTrampoline.
-      Cmp(csp, StackPointer());
-      Check(ls, kTheCurrentStackPointerIsBelowCsp);
+      // We can't check the alignment of csp without using a scratch register
+      // (or clobbering the flags), but the processor (or simulator) will abort
+      // if it is not properly aligned during a load.
+      ldr(xzr, MemOperand(csp, 0));
+    } else if (FLAG_enable_slow_asserts) {
+      Label ok;
+      // Check that csp <= StackPointer(), preserving all registers and NZCV.
+      sub(StackPointer(), csp, StackPointer());
+      cbz(StackPointer(), &ok);  // Ok if csp == StackPointer().
+      tbnz(StackPointer(), kXSignBit, &ok);  // Ok if csp < StackPointer().
+
+      Abort(kTheCurrentStackPointerIsBelowCsp);
+
+      bind(&ok);
+      // Restore StackPointer().
+      sub(StackPointer(), csp, StackPointer());
     }
   }
 }
@@ -4344,6 +4457,9 @@ void MacroAssembler::Abort(BailoutReason reason) {
   Adr(x0, &msg_address);
 
   if (use_real_aborts()) {
+    // Avoid infinite recursion; Push contains some assertions that use Abort.
+    NoUseRealAbortsScope no_real_aborts(this);
+
     // Split the message pointer into two SMIs to avoid the GC
     // trying to scan the string.
     STATIC_ASSERT((kSmiShift == 32) && (kSmiTag == 0));
diff --git a/src/a64/macro-assembler-a64.h b/src/a64/macro-assembler-a64.h
index c36a2e8..b711bd1 100644
--- a/src/a64/macro-assembler-a64.h
+++ b/src/a64/macro-assembler-a64.h
@@ -28,6 +28,8 @@
 #ifndef V8_A64_MACRO_ASSEMBLER_A64_H_
 #define V8_A64_MACRO_ASSEMBLER_A64_H_
 
+#include <vector>
+
 #include "v8globals.h"
 #include "globals.h"
 
@@ -517,7 +519,8 @@ class MacroAssembler : public Assembler {
   }
 
   // Push the specified register 'count' times.
-  void PushMultipleTimes(int count, Register src);
+  void PushMultipleTimes(CPURegister src, Register count);
+  void PushMultipleTimes(CPURegister src, int count);
 
   // This is a convenience method for pushing a single Handle.
   inline void Push(Handle<Object> handle);
@@ -531,6 +534,35 @@ class MacroAssembler : public Assembler {
     Pop(dst);
   }
 
+  // Sometimes callers need to push or pop multiple registers in a way that is
+  // difficult to structure efficiently for fixed Push or Pop calls. This scope
+  // allows push requests to be queued up, then flushed at once. The
+  // MacroAssembler will try to generate the most efficient sequence required.
+  //
+  // Unlike the other Push and Pop macros, PushPopQueue can handle mixed sets of
+  // register sizes and types.
+  class PushPopQueue {
+   public:
+    explicit PushPopQueue(MacroAssembler* masm) : masm_(masm), size_(0) { }
+
+    ~PushPopQueue() {
+      ASSERT(queued_.empty());
+    }
+
+    void Queue(const CPURegister& rt) {
+      size_ += rt.SizeInBytes();
+      queued_.push_back(rt);
+    }
+
+    void PushQueued();
+    void PopQueued();
+
+   private:
+    MacroAssembler* masm_;
+    int size_;
+    std::vector<CPURegister> queued_;
+  };
+
   // Poke 'src' onto the stack. The offset is in bytes.
   //
   // If the current stack pointer (according to StackPointer()) is csp, then
@@ -2001,9 +2033,12 @@ class MacroAssembler : public Assembler {
 
   // Perform necessary maintenance operations before a push or pop.
   //
-  // Note that size is per register, and is specified in bytes.
-  void PrepareForPush(int count, int size);
-  void PrepareForPop(int count, int size);
+  // Note that size is specified in bytes.
+  void PrepareForPush(Operand total_size);
+  void PrepareForPop(Operand total_size);
+
+  void PrepareForPush(int count, int size) { PrepareForPush(count * size); }
+  void PrepareForPop(int count, int size) { PrepareForPop(count * size); }
 
   // Call Printf. On a native build, a simple call will be generated, but if the
   // simulator is being used then a suitable pseudo-instruction is used. The
diff --git a/src/a64/stub-cache-a64.cc b/src/a64/stub-cache-a64.cc
index adb8355..b1e9130 100644
--- a/src/a64/stub-cache-a64.cc
+++ b/src/a64/stub-cache-a64.cc
@@ -754,14 +754,16 @@ static void GenerateFastApiCall(MacroAssembler* masm,
                                 int argc,
                                 Register* values) {
   ASSERT(!AreAliased(receiver, scratch));
-  __ Push(receiver);
-  // Write the arguments to stack frame.
+
+  MacroAssembler::PushPopQueue queue(masm);
+  queue.Queue(receiver);
+  // Write the arguments to the stack frame.
   for (int i = 0; i < argc; i++) {
-    // TODO(jbramley): Push these in as few Push() calls as possible.
     Register arg = values[argc-1-i];
     ASSERT(!AreAliased(receiver, scratch, arg));
-    __ Push(arg);
+    queue.Queue(arg);
   }
+  queue.PushQueued();
 
   ASSERT(optimization.is_simple_api_call());
diff --git a/src/objects.h b/src/objects.h
index ddecb35..7e88273 100644
--- a/src/objects.h
+++ b/src/objects.h
@@ -1332,6 +1332,7 @@ class MaybeObject BASE_EMBEDDED {
     "The instruction to patch should be an ori")                              \
   V(kTheSourceAndDestinationAreTheSame,                                      \
     "The source and destination are the same")                               \
+  V(kTheStackPointerIsNotAligned, "The stack pointer is not aligned.")       \
   V(kTheStackWasCorruptedByMacroAssemblerCall,                               \
     "The stack was corrupted by MacroAssembler::Call()")                     \
   V(kTooManyParametersLocals, "Too many parameters/locals")                  \
diff --git a/test/cctest/test-assembler-a64.cc b/test/cctest/test-assembler-a64.cc
index 3597927..dda47cd 100644
--- a/test/cctest/test-assembler-a64.cc
+++ b/test/cctest/test-assembler-a64.cc
@@ -8110,7 +8110,8 @@ static void PushPopJsspWXOverlapHelper(int reg_count, int claim) {
   SETUP_SIZE(BUF_SIZE * 2);
 
   // Work out which registers to use, based on reg_size.
-  static RegList const allowed = ~(x8.Bit() | x9.Bit() | jssp.Bit());
+  Register tmp = x8;
+  static RegList const allowed = ~(tmp.Bit() | jssp.Bit());
   if (reg_count == kPushPopJsspMaxRegCount) {
     reg_count = CountSetBits(allowed, kNumberOfRegisters);
   }
@@ -8196,7 +8197,13 @@ static void PushPopJsspWXOverlapHelper(int reg_count, int claim) {
       int times = i % 4 + 1;
       if (i & 1) {
         // Push odd-numbered registers as W registers.
-        __ PushMultipleTimes(times, w[i]);
+        if (i & 2) {
+          __ PushMultipleTimes(w[i], times);
+        } else {
+          // Use a register to specify the count.
+          __ Mov(tmp.W(), times);
+          __ PushMultipleTimes(w[i], tmp.W());
+        }
         // Fill in the expected stack slots.
         for (int j = 0; j < times; j++) {
           if (w[i].Is(wzr)) {
@@ -8208,7 +8215,13 @@ static void PushPopJsspWXOverlapHelper(int reg_count, int claim) {
         }
       } else {
         // Push even-numbered registers as X registers.
-        __ PushMultipleTimes(times, x[i]);
+        if (i & 2) {
+          __ PushMultipleTimes(x[i], times);
+        } else {
+          // Use a register to specify the count.
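+          // (tmp is x8, which is excluded from the set of pushed registers
+          // above, so using it for the count cannot clobber a test value.)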
+          __ Mov(tmp, times);
+          __ PushMultipleTimes(x[i], tmp);
+        }
         // Fill in the expected stack slots.
         for (int j = 0; j < times; j++) {
           if (x[i].IsZero()) {
@@ -8399,6 +8412,156 @@ TEST(push_pop_csp) {
 }
 
 
+TEST(push_queued) {
+  INIT_V8();
+  SETUP();
+
+  START();
+
+  ASSERT(__ StackPointer().Is(csp));
+  __ Mov(jssp, __ StackPointer());
+  __ SetStackPointer(jssp);
+
+  MacroAssembler::PushPopQueue queue(&masm);
+
+  // Queue up registers.
+  queue.Queue(x0);
+  queue.Queue(x1);
+  queue.Queue(x2);
+  queue.Queue(x3);
+
+  queue.Queue(w4);
+  queue.Queue(w5);
+  queue.Queue(w6);
+
+  queue.Queue(d0);
+  queue.Queue(d1);
+
+  queue.Queue(s2);
+
+  __ Mov(x0, 0x1234000000000000);
+  __ Mov(x1, 0x1234000100010001);
+  __ Mov(x2, 0x1234000200020002);
+  __ Mov(x3, 0x1234000300030003);
+  __ Mov(w4, 0x12340004);
+  __ Mov(w5, 0x12340005);
+  __ Mov(w6, 0x12340006);
+  __ Fmov(d0, 123400.0);
+  __ Fmov(d1, 123401.0);
+  __ Fmov(s2, 123402.0);
+
+  // Actually push them.
+  queue.PushQueued();
+
+  Clobber(&masm, CPURegList(CPURegister::kRegister, kXRegSize, 0, 6));
+  Clobber(&masm, CPURegList(CPURegister::kFPRegister, kDRegSize, 0, 2));
+
+  // Pop them conventionally.
+  __ Pop(s2);
+  __ Pop(d1, d0);
+  __ Pop(w6, w5, w4);
+  __ Pop(x3, x2, x1, x0);
+
+  __ Mov(csp, __ StackPointer());
+  __ SetStackPointer(csp);
+
+  END();
+
+  RUN();
+
+  ASSERT_EQUAL_64(0x1234000000000000, x0);
+  ASSERT_EQUAL_64(0x1234000100010001, x1);
+  ASSERT_EQUAL_64(0x1234000200020002, x2);
+  ASSERT_EQUAL_64(0x1234000300030003, x3);
+
+  ASSERT_EQUAL_32(0x12340004, w4);
+  ASSERT_EQUAL_32(0x12340005, w5);
+  ASSERT_EQUAL_32(0x12340006, w6);
+
+  ASSERT_EQUAL_FP64(123400.0, d0);
+  ASSERT_EQUAL_FP64(123401.0, d1);
+
+  ASSERT_EQUAL_FP32(123402.0, s2);
+
+  TEARDOWN();
+}
+
+
+TEST(pop_queued) {
+  INIT_V8();
+  SETUP();
+
+  START();
+
+  ASSERT(__ StackPointer().Is(csp));
+  __ Mov(jssp, __ StackPointer());
+  __ SetStackPointer(jssp);
+
+  MacroAssembler::PushPopQueue queue(&masm);
+
+  __ Mov(x0, 0x1234000000000000);
+  __ Mov(x1, 0x1234000100010001);
+  __ Mov(x2, 0x1234000200020002);
+  __ Mov(x3, 0x1234000300030003);
+  __ Mov(w4, 0x12340004);
+  __ Mov(w5, 0x12340005);
+  __ Mov(w6, 0x12340006);
+  __ Fmov(d0, 123400.0);
+  __ Fmov(d1, 123401.0);
+  __ Fmov(s2, 123402.0);
+
+  // Push registers conventionally.
+  __ Push(x0, x1, x2, x3);
+  __ Push(w4, w5, w6);
+  __ Push(d0, d1);
+  __ Push(s2);
+
+  // Queue up a pop.
+  queue.Queue(s2);
+
+  queue.Queue(d1);
+  queue.Queue(d0);
+
+  queue.Queue(w6);
+  queue.Queue(w5);
+  queue.Queue(w4);
+
+  queue.Queue(x3);
+  queue.Queue(x2);
+  queue.Queue(x1);
+  queue.Queue(x0);
+
+  Clobber(&masm, CPURegList(CPURegister::kRegister, kXRegSize, 0, 6));
+  Clobber(&masm, CPURegList(CPURegister::kFPRegister, kDRegSize, 0, 2));
+
+  // Actually pop them.
+  queue.PopQueued();
+
+  __ Mov(csp, __ StackPointer());
+  __ SetStackPointer(csp);
+
+  END();
+
+  RUN();
+
+  ASSERT_EQUAL_64(0x1234000000000000, x0);
+  ASSERT_EQUAL_64(0x1234000100010001, x1);
+  ASSERT_EQUAL_64(0x1234000200020002, x2);
+  ASSERT_EQUAL_64(0x1234000300030003, x3);
+
+  ASSERT_EQUAL_64(0x0000000012340004, x4);
+  ASSERT_EQUAL_64(0x0000000012340005, x5);
+  ASSERT_EQUAL_64(0x0000000012340006, x6);
+
+  ASSERT_EQUAL_FP64(123400.0, d0);
+  ASSERT_EQUAL_FP64(123401.0, d1);
+
+  ASSERT_EQUAL_FP32(123402.0, s2);
+
+  TEARDOWN();
+}
+
+
 TEST(jump_both_smi) {
   INIT_V8();
   SETUP();
-- 
2.7.4
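
Usage sketch (illustrative only): the helper below shows how the two APIs
introduced by this patch fit together. PushCallFrameSketch is a hypothetical
name, not part of the patch; the real call sites are GenerateLightweightMiss
(code-stubs-a64.cc) and GenerateFastApiCall (stub-cache-a64.cc) above. It
assumes a MacroAssembler and the usual A64 register definitions are in scope.

  // Push a receiver and 'argc' argument registers with a single
  // PrepareForPush, then fill a run of stack slots, whose count is only
  // known at run time, with the-hole.
  static void PushCallFrameSketch(MacroAssembler* masm,
                                  Register receiver,
                                  Register* values, int argc,
                                  Register slot_count) {
    MacroAssembler::PushPopQueue queue(masm);
    queue.Queue(receiver);
    for (int i = 0; i < argc; i++) {
      // Queue in reverse so the first argument ends up closest to the top of
      // the stack, matching GenerateFastApiCall.
      queue.Queue(values[argc - 1 - i]);
    }
    // One PrepareForPush, then batched PushHelper calls (up to four
    // same-sized registers per call).
    queue.PushQueued();

    // The register-count variant of PushMultipleTimes: PrepareForPush is
    // given the total size as an Operand derived from 'slot_count'.
    masm->LoadRoot(x10, Heap::kTheHoleValueRootIndex);
    masm->PushMultipleTimes(x10, slot_count);
  }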