js/src/nanojit/NativeARM.cpp

   1 /* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */
   2 /* vi: set ts=4 sw=4 expandtab: (add to ~/.vimrc: set modeline modelines=5) */
   3 /* ***** BEGIN LICENSE BLOCK *****
   4  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   5  *
   6  * The contents of this file are subject to the Mozilla Public License Version
   7  * 1.1 (the "License"); you may not use this file except in compliance with
   8  * the License. You may obtain a copy of the License at
   9  * http://www.mozilla.org/MPL/
  10  *
  11  * Software distributed under the License is distributed on an "AS IS" basis,
  12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  13  * for the specific language governing rights and limitations under the
  14  * License.
  15  *
  16  * The Original Code is [Open Source Virtual Machine].
  17  *
  18  * The Initial Developer of the Original Code is
  19  * Adobe System Incorporated.
  20  * Portions created by the Initial Developer are Copyright (C) 2004-2007
  21  * the Initial Developer. All Rights Reserved.
  22  *
  23  * Contributor(s):
  24  *   Adobe AS3 Team
  25  *   Vladimir Vukicevic <vladimir@pobox.com>
  26  *   Jacob Bramley <Jacob.Bramley@arm.com>
  27  *   Tero Koskinen <tero.koskinen@digia.com>
  28  *
  29  * Alternatively, the contents of this file may be used under the terms of
  30  * either the GNU General Public License Version 2 or later (the "GPL"), or
  31  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  32  * in which case the provisions of the GPL or the LGPL are applicable instead
  33  * of those above. If you wish to allow use of your version of this file only
  34  * under the terms of either the GPL or the LGPL, and not to allow others to
  35  * use your version of this file under the terms of the MPL, indicate your
  36  * decision by deleting the provisions above and replace them with the notice
  37  * and other provisions required by the GPL or the LGPL. If you do not delete
  38  * the provisions above, a recipient may use your version of this file under
  39  * the terms of any one of the MPL, the GPL or the LGPL.
  40  *
  41  * ***** END LICENSE BLOCK ***** */
  42
  43 #include "nanojit.h"
  44
  45 #ifdef UNDER_CE
  46 #include <cmnintrin.h>
  47 #endif
  48
  49 #if defined(FEATURE_NANOJIT) && defined(NANOJIT_ARM)
  50
  51 namespace nanojit
  52 {
  53
  54 #ifdef NJ_VERBOSE
  55 const char* regNames[] = {"r0","r1","r2","r3","r4","r5","r6","r7","r8","r9","r10","fp","ip","sp","lr","pc",
  56                           "d0","d1","d2","d3","d4","d5","d6","d7","s0"};
  57 const char* condNames[] = {"eq","ne","cs","cc","mi","pl","vs","vc","hi","ls","ge","lt","gt","le",""/*al*/,"nv"};
  58 const char* shiftNames[] = { "lsl", "lsl", "lsr", "lsr", "asr", "asr", "ror", "ror" };
  59 #endif
  60
  61 const Register Assembler::argRegs[] = { R0, R1, R2, R3 };
  62 const Register Assembler::retRegs[] = { R0, R1 };
  63 const Register Assembler::savedRegs[] = { R4, R5, R6, R7, R8, R9, R10 };
  64
  65 // --------------------------------
  66 // ARM-specific utility functions.
  67 // --------------------------------
  68
  69 #ifdef DEBUG
  70 // Return true if enc is a valid Operand 2 encoding and thus can be used as-is
  71 // in an ARM arithmetic operation that accepts such encoding.
  72 //
  73 // This utility does not know (or determine) the actual value that the encoded
  74 // value represents, and thus cannot be used to ensure the correct operation of
  75 // encOp2Imm, but it does ensure that the encoded value can be used to encode a
  76 // valid ARM instruction. decOp2Imm can be used if you also need to check that
  77 // a literal is correctly encoded (and thus that encOp2Imm is working
  78 // correctly).
  79 inline bool
  80 Assembler::isOp2Imm(uint32_t enc)
  81 {
  82     return ((enc & 0xfff) == enc);
  83 }
  84
  85 // Decodes operand 2 immediate values (for debug output and assertions).
  86 inline uint32_t
  87 Assembler::decOp2Imm(uint32_t enc)
  88 {
  89     NanoAssert(isOp2Imm(enc));
  90
  91     uint32_t    imm8 = enc & 0xff;
  92     uint32_t    rot = 32 - ((enc >> 7) & 0x1e);
  93
  94     return imm8 << (rot & 0x1f);
  95 }
  96 #endif
  97
  98 // Calculate the number of leading zeroes in data.
  99 static inline uint32_t
 100 CountLeadingZeroesSlow(uint32_t data)
 101 {
 102     // Other platforms must fall back to a C routine. This won't be as
 103     // efficient as the CLZ instruction, but it is functional.
 104     uint32_t    try_shift;
 105
 106     uint32_t    leading_zeroes = 0;
 107
 108     // This loop does a bisection search rather than the obvious rotation loop.
 109     // This should be faster, though it will still be no match for CLZ.
 110     for (try_shift = 16; try_shift != 0; try_shift /= 2) {
 111         uint32_t    shift = leading_zeroes + try_shift;
 112         if (((data << shift) >> shift) == data) {
 113             leading_zeroes = shift;
 114         }
 115     }
 116
 117     return leading_zeroes;
 118 }
 119
 120 inline uint32_t
 121 Assembler::CountLeadingZeroes(uint32_t data)
 122 {
 123     uint32_t    leading_zeroes;
 124
 125 #if defined(__ARMCC__)
 126     // ARMCC can do this with an intrinsic.
 127     leading_zeroes = __clz(data);
 128 #elif defined(__GNUC__)
 129     // GCC can use inline assembler to insert a CLZ instruction.
 130     if (ARM_ARCH_AT_LEAST(5)) {
 131         __asm (
 132 #if defined(ANDROID) && (NJ_COMPILER_ARM_ARCH < 7)
 133         // On Android gcc compiler, the clz instruction is not supported with a
 134         // target smaller than armv7, despite it being legal for armv5+.
 135             "   .arch armv7-a\n"
 136 #elif (NJ_COMPILER_ARM_ARCH < 5)
 137         // Targetting armv5t allows a toolchain with armv4t target to still build
 138         // with clz, and clz to be used when appropriate at runtime.
 139             "   .arch armv5t\n"
 140 #endif
 141             "   clz     %0, %1  \n"
 142             :   "=r"    (leading_zeroes)
 143             :   "r"     (data)
 144         );
 145     } else {
 146         leading_zeroes = CountLeadingZeroesSlow(data);
 147     }
 148 #elif defined(UNDER_CE)
 149     // WinCE can do this with an intrinsic.
 150     leading_zeroes = _CountLeadingZeros(data);
 151 #else
 152     leading_zeroes = CountLeadingZeroesSlow(data);
 153 #endif
 154
 155     // Assert that the operation worked!
 156     NanoAssert(((0xffffffff >> leading_zeroes) & data) == data);
 157
 158     return leading_zeroes;
 159 }
 160
 161 // The ARM instruction set allows some flexibility to the second operand of
 162 // most arithmetic operations. When operand 2 is an immediate value, it takes
 163 // the form of an 8-bit value rotated by an even value in the range 0-30.
 164 //
 165 // Some values that can be encoded this scheme — such as 0xf000000f — are
 166 // probably fairly rare in practice and require extra code to detect, so this
 167 // function implements a fast CLZ-based heuristic to detect any value that can
 168 // be encoded using just a shift, and not a full rotation. For example,
 169 // 0xff000000 and 0x000000ff are both detected, but 0xf000000f is not.
 170 //
 171 // This function will return true to indicate that the encoding was successful,
 172 // or false to indicate that the literal could not be encoded as an operand 2
 173 // immediate. If successful, the encoded value will be written to *enc.
 174 inline bool
 175 Assembler::encOp2Imm(uint32_t literal, uint32_t * enc)
 176 {
 177     // The number of leading zeroes in the literal. This is used to calculate
 178     // the rotation component of the encoding.
 179     uint32_t    leading_zeroes;
 180
 181     // Components of the operand 2 encoding.
 182     int32_t    rot;
 183     uint32_t    imm8;
 184
 185     // Check the literal to see if it is a simple 8-bit value. I suspect that
 186     // most literals are in fact small values, so doing this check early should
 187     // give a decent speed-up.
 188     if (literal < 256)
 189     {
 190         *enc = literal;
 191         return true;
 192     }
 193
 194     // Determine the number of leading zeroes in the literal. This is used to
 195     // calculate the required rotation.
 196     leading_zeroes = CountLeadingZeroes(literal);
 197
 198     // We've already done a check to see if the literal is an 8-bit value, so
 199     // leading_zeroes must be less than (and not equal to) (32-8)=24. However,
 200     // if it is greater than 24, this algorithm will break, so debug code
 201     // should use an assertion here to check that we have a value that we
 202     // expect.
 203     NanoAssert(leading_zeroes < 24);
 204
 205     // Assuming that we have a field of no more than 8 bits for a valid
 206     // literal, we can calculate the required rotation by subtracting
 207     // leading_zeroes from (32-8):
 208     //
 209     // Example:
 210     //      0: Known to be zero.
 211     //      1: Known to be one.
 212     //      X: Either zero or one.
 213     //      .: Zero in a valid operand 2 literal.
 214     //
 215     //  Literal:     [ 1XXXXXXX ........ ........ ........ ]
 216     //  leading_zeroes = 0
 217     //  Therefore rot (left) = 24.
 218     //  Encoded 8-bit literal:                  [ 1XXXXXXX ]
 219     //
 220     //  Literal:     [ ........ ..1XXXXX XX...... ........ ]
 221     //  leading_zeroes = 10
 222     //  Therefore rot (left) = 14.
 223     //  Encoded 8-bit literal:                  [ 1XXXXXXX ]
 224     //
 225     // Note, however, that we can only encode even shifts, and so
 226     // "rot=24-leading_zeroes" is not sufficient by itself. By ignoring
 227     // zero-bits in odd bit positions, we can ensure that we get a valid
 228     // encoding.
 229     //
 230     // Example:
 231     //  Literal:     [ 01XXXXXX ........ ........ ........ ]
 232     //  leading_zeroes = 1
 233     //  Therefore rot (left) = round_up(23) = 24.
 234     //  Encoded 8-bit literal:                  [ 01XXXXXX ]
 235     rot = 24 - (leading_zeroes & ~1);
 236
 237     // The imm8 component of the operand 2 encoding can be calculated from the
 238     // rot value.
 239     imm8 = literal >> rot;
 240
 241     // The validity of the literal can be checked by reversing the
 242     // calculation. It is much easier to decode the immediate than it is to
 243     // encode it!
 244     if (literal != (imm8 << rot)) {
 245         // The encoding is not valid, so report the failure. Calling code
 246         // should use some other method of loading the value (such as LDR).
 247         return false;
 248     }
 249
 250     // The operand is valid, so encode it.
 251     // Note that the ARM encoding is actually described by a rotate to the
 252     // _right_, so rot must be negated here. Calculating a left shift (rather
 253     // than calculating a right rotation) simplifies the above code.
 254     *enc = ((-rot << 7) & 0xf00) | imm8;
 255
 256     // Assert that the operand was properly encoded.
 257     NanoAssert(decOp2Imm(*enc) == literal);
 258
 259     return true;
 260 }
 261
 262 // Encode "rd = rn + imm" using an appropriate instruction sequence.
 263 // Set stat to 1 to update the status flags. Otherwise, set it to 0 or omit it.
 264 // (The declaration in NativeARM.h defines the default value of stat as 0.)
 265 //
 266 // It is not valid to call this function if:
 267 //   (rd == IP) AND (rn == IP) AND !encOp2Imm(imm) AND !encOp2Imm(-imm)
 268 // Where: if (encOp2Imm(imm)), imm can be encoded as an ARM operand 2 using the
 269 // encOp2Imm method.
 270 void
 271 Assembler::asm_add_imm(Register rd, Register rn, int32_t imm, int stat /* =0 */)
 272 {
 273     // Operand 2 encoding of the immediate.
 274     uint32_t    op2imm;
 275
 276     NanoAssert(IsGpReg(rd));
 277     NanoAssert(IsGpReg(rn));
 278     NanoAssert((stat & 1) == stat);
 279
 280     // As a special case to simplify code elsewhere, emit nothing where we
 281     // don't want to update the flags (stat == 0), the second operand is 0 and
 282     // (rd == rn). Such instructions are effectively NOPs.
 283     if ((imm == 0) && (stat == 0) && (rd == rn)) {
 284         return;
 285     }
 286
 287     // Try to encode the value directly as an operand 2 immediate value, then
 288     // fall back to loading the value into a register.
 289     if (encOp2Imm(imm, &op2imm)) {
 290         ADDis(rd, rn, op2imm, stat);
 291     } else if (encOp2Imm(-imm, &op2imm)) {
 292         // We could not encode the value for ADD, so try to encode it for SUB.
 293         // Note that this is valid even if stat is set, _unless_ imm is 0, but
 294         // that case is caught above.
 295         NanoAssert(imm != 0);
 296         SUBis(rd, rn, op2imm, stat);
 297     } else {
 298         // We couldn't encode the value directly, so use an intermediate
 299         // register to encode the value. We will use IP to do this unless rn is
 300         // IP; in that case we can reuse rd. This allows every case other than
 301         // "ADD IP, IP, =#imm".
 302         Register    rm = (rn == IP) ? (rd) : (IP);
 303         NanoAssert(rn != rm);
 304
 305         ADDs(rd, rn, rm, stat);
 306         asm_ld_imm(rm, imm);
 307     }
 308 }
 309
 310 // Encode "rd = rn - imm" using an appropriate instruction sequence.
 311 // Set stat to 1 to update the status flags. Otherwise, set it to 0 or omit it.
 312 // (The declaration in NativeARM.h defines the default value of stat as 0.)
 313 //
 314 // It is not valid to call this function if:
 315 //   (rd == IP) AND (rn == IP) AND !encOp2Imm(imm) AND !encOp2Imm(-imm)
 316 // Where: if (encOp2Imm(imm)), imm can be encoded as an ARM operand 2 using the
 317 // encOp2Imm method.
 318 void
 319 Assembler::asm_sub_imm(Register rd, Register rn, int32_t imm, int stat /* =0 */)
 320 {
 321     // Operand 2 encoding of the immediate.
 322     uint32_t    op2imm;
 323
 324     NanoAssert(IsGpReg(rd));
 325     NanoAssert(IsGpReg(rn));
 326     NanoAssert((stat & 1) == stat);
 327
 328     // As a special case to simplify code elsewhere, emit nothing where we
 329     // don't want to update the flags (stat == 0), the second operand is 0 and
 330     // (rd == rn). Such instructions are effectively NOPs.
 331     if ((imm == 0) && (stat == 0) && (rd == rn)) {
 332         return;
 333     }
 334
 335     // Try to encode the value directly as an operand 2 immediate value, then
 336     // fall back to loading the value into a register.
 337     if (encOp2Imm(imm, &op2imm)) {
 338         SUBis(rd, rn, op2imm, stat);
 339     } else if (encOp2Imm(-imm, &op2imm)) {
 340         // We could not encode the value for SUB, so try to encode it for ADD.
 341         // Note that this is valid even if stat is set, _unless_ imm is 0, but
 342         // that case is caught above.
 343         NanoAssert(imm != 0);
 344         ADDis(rd, rn, op2imm, stat);
 345     } else {
 346         // We couldn't encode the value directly, so use an intermediate
 347         // register to encode the value. We will use IP to do this unless rn is
 348         // IP; in that case we can reuse rd. This allows every case other than
 349         // "SUB IP, IP, =#imm".
 350         Register    rm = (rn == IP) ? (rd) : (IP);
 351         NanoAssert(rn != rm);
 352
 353         SUBs(rd, rn, rm, stat);
 354         asm_ld_imm(rm, imm);
 355     }
 356 }
 357
 358 // Encode "rd = rn & imm" using an appropriate instruction sequence.
 359 // Set stat to 1 to update the status flags. Otherwise, set it to 0 or omit it.
 360 // (The declaration in NativeARM.h defines the default value of stat as 0.)
 361 //
 362 // It is not valid to call this function if:
 363 //   (rd == IP) AND (rn == IP) AND !encOp2Imm(imm) AND !encOp2Imm(~imm)
 364 // Where: if (encOp2Imm(imm)), imm can be encoded as an ARM operand 2 using the
 365 // encOp2Imm method.
 366 void
 367 Assembler::asm_and_imm(Register rd, Register rn, int32_t imm, int stat /* =0 */)
 368 {
 369     // Operand 2 encoding of the immediate.
 370     uint32_t    op2imm;
 371
 372     NanoAssert(IsGpReg(rd));
 373     NanoAssert(IsGpReg(rn));
 374     NanoAssert((stat & 1) == stat);
 375
 376     // Try to encode the value directly as an operand 2 immediate value, then
 377     // fall back to loading the value into a register.
 378     if (encOp2Imm(imm, &op2imm)) {
 379         ANDis(rd, rn, op2imm, stat);
 380     } else if (encOp2Imm(~imm, &op2imm)) {
 381         // Use BIC with the inverted immediate.
 382         BICis(rd, rn, op2imm, stat);
 383     } else {
 384         // We couldn't encode the value directly, so use an intermediate
 385         // register to encode the value. We will use IP to do this unless rn is
 386         // IP; in that case we can reuse rd. This allows every case other than
 387         // "AND IP, IP, =#imm".
 388         Register    rm = (rn == IP) ? (rd) : (IP);
 389         NanoAssert(rn != rm);
 390
 391         ANDs(rd, rn, rm, stat);
 392         asm_ld_imm(rm, imm);
 393     }
 394 }
 395
 396 // Encode "rd = rn | imm" using an appropriate instruction sequence.
 397 // Set stat to 1 to update the status flags. Otherwise, set it to 0 or omit it.
 398 // (The declaration in NativeARM.h defines the default value of stat as 0.)
 399 //
 400 // It is not valid to call this function if:
 401 //   (rd == IP) AND (rn == IP) AND !encOp2Imm(imm)
 402 // Where: if (encOp2Imm(imm)), imm can be encoded as an ARM operand 2 using the
 403 // encOp2Imm method.
 404 void
 405 Assembler::asm_orr_imm(Register rd, Register rn, int32_t imm, int stat /* =0 */)
 406 {
 407     // Operand 2 encoding of the immediate.
 408     uint32_t    op2imm;
 409
 410     NanoAssert(IsGpReg(rd));
 411     NanoAssert(IsGpReg(rn));
 412     NanoAssert((stat & 1) == stat);
 413
 414     // Try to encode the value directly as an operand 2 immediate value, then
 415     // fall back to loading the value into a register.
 416     if (encOp2Imm(imm, &op2imm)) {
 417         ORRis(rd, rn, op2imm, stat);
 418     } else {
 419         // We couldn't encode the value directly, so use an intermediate
 420         // register to encode the value. We will use IP to do this unless rn is
 421         // IP; in that case we can reuse rd. This allows every case other than
 422         // "ORR IP, IP, =#imm".
 423         Register    rm = (rn == IP) ? (rd) : (IP);
 424         NanoAssert(rn != rm);
 425
 426         ORRs(rd, rn, rm, stat);
 427         asm_ld_imm(rm, imm);
 428     }
 429 }
 430
 431 // Encode "rd = rn ^ imm" using an appropriate instruction sequence.
 432 // Set stat to 1 to update the status flags. Otherwise, set it to 0 or omit it.
 433 // (The declaration in NativeARM.h defines the default value of stat as 0.)
 434 //
 435 // It is not valid to call this function if:
 436 //   (rd == IP) AND (rn == IP) AND !encOp2Imm(imm)
 437 // Where: if (encOp2Imm(imm)), imm can be encoded as an ARM operand 2 using the
 438 // encOp2Imm method.
 439 void
 440 Assembler::asm_eor_imm(Register rd, Register rn, int32_t imm, int stat /* =0 */)
 441 {
 442     // Operand 2 encoding of the immediate.
 443     uint32_t    op2imm;
 444
 445     NanoAssert(IsGpReg(rd));
 446     NanoAssert(IsGpReg(rn));
 447     NanoAssert((stat & 1) == stat);
 448
 449     // Try to encode the value directly as an operand 2 immediate value, then
 450     // fall back to loading the value into a register.
 451     if (encOp2Imm(imm, &op2imm)) {
 452         EORis(rd, rn, op2imm, stat);
 453     } else {
 454         // We couldn't encoder the value directly, so use an intermediate
 455         // register to encode the value. We will use IP to do this unless rn is
 456         // IP; in that case we can reuse rd. This allows every case other than
 457         // "EOR IP, IP, =#imm".
 458         Register    rm = (rn == IP) ? (rd) : (IP);
 459         NanoAssert(rn != rm);
 460
 461         EORs(rd, rn, rm, stat);
 462         asm_ld_imm(rm, imm);
 463     }
 464 }
 465
 466 // --------------------------------
 467 // Assembler functions.
 468 // --------------------------------
 469
 470 void
 471 Assembler::nInit(AvmCore*)
 472 {
 473     nHints[LIR_calli]  = rmask(retRegs[0]);
 474     nHints[LIR_hcalli] = rmask(retRegs[1]);
 475     nHints[LIR_paramp] = PREFER_SPECIAL;
 476 }
 477
 478 void Assembler::nBeginAssembly()
 479 {
 480     max_out_args = 0;
 481 }
 482
 483 NIns*
 484 Assembler::genPrologue()
 485 {
 486     /**
 487      * Prologue
 488      */
 489
 490     // NJ_RESV_OFFSET is space at the top of the stack for us
 491     // to use for parameter passing (8 bytes at the moment)
 492     uint32_t stackNeeded = max_out_args + STACK_GRANULARITY * _activation.stackSlotsNeeded();
 493     uint32_t savingCount = 2;
 494
 495     uint32_t savingMask = rmask(FP) | rmask(LR);
 496
 497     // so for alignment purposes we've pushed return addr and fp
 498     uint32_t stackPushed = STACK_GRANULARITY * savingCount;
 499     uint32_t aligned = alignUp(stackNeeded + stackPushed, NJ_ALIGN_STACK);
 500     int32_t amt = aligned - stackPushed;
 501
 502     // Make room on stack for what we are doing
 503     if (amt)
 504         asm_sub_imm(SP, SP, amt);
 505
 506     verbose_only( asm_output("## %p:",(void*)_nIns); )
 507     verbose_only( asm_output("## patch entry"); )
 508     NIns *patchEntry = _nIns;
 509
 510     MOV(FP, SP);
 511     PUSH_mask(savingMask);
 512     return patchEntry;
 513 }
 514
 515 void
 516 Assembler::nFragExit(LIns* guard)
 517 {
 518     SideExit *  exit = guard->record()->exit;
 519     Fragment *  frag = exit->target;
 520
 521     bool        target_is_known = frag && frag->fragEntry;
 522
 523     if (target_is_known) {
 524         // The target exists so we can simply emit a branch to its location.
 525         JMP_far(frag->fragEntry);
 526     } else {
 527         // The target doesn't exit yet, so emit a jump to the epilogue. If the
 528         // target is created later on, the jump will be patched.
 529
 530         GuardRecord *gr = guard->record();
 531
 532         if (!_epilogue)
 533             _epilogue = genEpilogue();
 534
 535         // Jump to the epilogue. This may get patched later, but JMP_far always
 536         // emits two instructions even when only one is required, so patching
 537         // will work correctly.
 538         JMP_far(_epilogue);
 539
 540         // In the future you may want to move this further down so that we can
 541         // overwrite the r0 guard record load during a patch to a different
 542         // fragment with some assumed input-register state. Not today though.
 543         gr->jmp = _nIns;
 544
 545         // NB: this is a workaround for the fact that, by patching a
 546         // fragment-exit jump, we could be changing the *meaning* of the R0
 547         // register we're passing to the jump target. If we jump to the
 548         // epilogue, ideally R0 means "return value when exiting fragment".
 549         // If we patch this to jump to another fragment however, R0 means
 550         // "incoming 0th parameter". This is just a quirk of ARM ABI. So
 551         // we compromise by passing "return value" to the epilogue in IP,
 552         // not R0, and have the epilogue MOV(R0, IP) first thing.
 553
 554         asm_ld_imm(IP, int(gr));
 555     }
 556
 557 #ifdef NJ_VERBOSE
 558     if (_config.arm_show_stats) {
 559         // load R1 with Fragment *fromFrag, target fragment
 560         // will make use of this when calling fragenter().
 561         int fromfrag = int((Fragment*)_thisfrag);
 562         asm_ld_imm(argRegs[1], fromfrag);
 563     }
 564 #endif
 565
 566     // profiling for the exit
 567     verbose_only(
 568        if (_logc->lcbits & LC_FragProfile) {
 569            asm_inc_m32( &guard->record()->profCount );
 570        }
 571     )
 572
 573     // Pop the stack frame.
 574     MOV(SP, FP);
 575 }
 576
 577 NIns*
 578 Assembler::genEpilogue()
 579 {
 580     RegisterMask savingMask;
 581
 582     if (ARM_ARCH_AT_LEAST(5)) {
 583         // On ARMv5+, loading directly to PC correctly handles interworking.
 584         savingMask = rmask(FP) | rmask(PC);
 585
 586     } else {
 587         // On ARMv4T, interworking is not handled properly, therefore, we pop
 588         // lr and use bx lr to avoid that.
 589         savingMask = rmask(FP) | rmask(LR);
 590         BX(LR);
 591     }
 592     POP_mask(savingMask); // regs
 593
 594     // NB: this is the later half of the dual-nature patchable exit branch
 595     // workaround noted above in nFragExit. IP has the "return value"
 596     // incoming, we need to move it to R0.
 597     MOV(R0, IP);
 598
 599     return _nIns;
 600 }
 601
 602 /*
 603  * asm_arg will encode the specified argument according to the current ABI, and
 604  * will update r and stkd as appropriate so that the next argument can be
 605  * encoded.
 606  *
 607  * Linux has used ARM's EABI for some time. Windows CE uses the legacy ABI.
 608  *
 609  * Under EABI:
 610  * - doubles are 64-bit aligned both in registers and on the stack.
 611  *   If the next available argument register is R1, it is skipped
 612  *   and the double is placed in R2:R3.  If R0:R1 or R2:R3 are not
 613  *   available, the double is placed on the stack, 64-bit aligned.
 614  * - 32-bit arguments are placed in registers and 32-bit aligned
 615  *   on the stack.
 616  *
 617  * Under EABI with hardware floating-point procedure-call variant:
 618  * - Same as EABI, but doubles are passed in D0..D7 registers.
 619  *
 620  * Under legacy ABI:
 621  * - doubles are placed in subsequent arg registers; if the next
 622  *   available register is r3, the low order word goes into r3
 623  *   and the high order goes on the stack.
 624  * - 32-bit arguments are placed in the next available arg register,
 625  * - both doubles and 32-bit arguments are placed on stack with 32-bit
 626  *   alignment.
 627  */
 628 void
 629 Assembler::asm_arg(ArgType ty, LIns* arg, ParameterRegisters& params)
 630 {
 631     // The stack pointer must always be at least aligned to 4 bytes.
 632     NanoAssert((params.stkd & 3) == 0);
 633
 634     if (ty == ARGTYPE_D) {
 635         // This task is fairly complex and so is delegated to asm_arg_64.
 636         asm_arg_64(arg, params);
 637     } else {
 638         NanoAssert(ty == ARGTYPE_I || ty == ARGTYPE_UI);
 639         // pre-assign registers R0-R3 for arguments (if they fit)
 640         if (params.r < R4) {
 641             asm_regarg(ty, arg, params.r);
 642             params.r = Register(params.r + 1);
 643         } else {
 644             asm_stkarg(arg, params.stkd);
 645             params.stkd += 4;
 646         }
 647     }
 648 }
 649
 650 // Encode a 64-bit floating-point argument using the appropriate ABI.
 651 // This function operates in the same way as asm_arg, except that it will only
 652 // handle arguments where (ArgType)ty == ARGTYPE_D.
 653
 654 #ifdef NJ_ARM_EABI_HARD_FLOAT
 655 void
 656 Assembler::asm_arg_64(LIns* arg, ParameterRegisters& params)
 657 {
 658     NanoAssert(IsFpReg(params.float_r));
 659     if (params.float_r <= D7) {
 660         findSpecificRegFor(arg, params.float_r);
 661         params.float_r = Register(params.float_r + 1);
 662     } else {
 663         NanoAssertMsg(0, "Only 8 floating point arguments supported");
 664     }
 665 }
 666
 667 #else
 668 void
 669 Assembler::asm_arg_64(LIns* arg, ParameterRegisters& params)
 670 {
 671     // The stack pointer must always be at least aligned to 4 bytes.
 672     NanoAssert((params.stkd & 3) == 0);
 673     // The only use for this function when we are using soft floating-point
 674     // is for LIR_ii2d.
 675     NanoAssert(ARM_VFP || arg->isop(LIR_ii2d));
 676
 677 #ifdef NJ_ARM_EABI
 678     // EABI requires that 64-bit arguments are aligned on even-numbered
 679     // registers, as R0:R1 or R2:R3. If the register base is at an
 680     // odd-numbered register, advance it. Note that this will push r past
 681     // R3 if r is R3 to start with, and will force the argument to go on
 682     // the stack.
 683     if ((params.r == R1) || (params.r == R3)) {
 684         params.r = Register(params.r + 1);
 685     }
 686 #endif
 687
 688     if (params.r < R3) {
 689         Register    ra = params.r;
 690         Register    rb = Register(params.r + 1);
 691         params.r = Register(rb + 1);
 692
 693 #ifdef NJ_ARM_EABI
 694         // EABI requires that 64-bit arguments are aligned on even-numbered
 695         // registers, as R0:R1 or R2:R3.
 696         NanoAssert( ((ra == R0) && (rb == R1)) || ((ra == R2) && (rb == R3)) );
 697 #endif
 698
 699         // Put the argument in ra and rb. If the argument is in a VFP register,
 700         // use FMRRD to move it to ra and rb. Otherwise, let asm_regarg deal
 701         // with the argument as if it were two 32-bit arguments.
 702         if (ARM_VFP) {
 703             Register dm = findRegFor(arg, FpRegs);
 704             FMRRD(ra, rb, dm);
 705         } else {
 706             asm_regarg(ARGTYPE_I, arg->oprnd1(), ra);
 707             asm_regarg(ARGTYPE_I, arg->oprnd2(), rb);
 708         }
 709
 710 #ifndef NJ_ARM_EABI
 711     } else if (params.r == R3) {
 712         // We only have one register left, but the legacy ABI requires that we
 713         // put 32 bits of the argument in the register (R3) and the remaining
 714         // 32 bits on the stack.
 715         Register    ra = params.r; // R3
 716         params.r = R4;
 717
 718         // We're splitting the argument between registers and the stack.  This
 719         // must be the first time that the stack is used, so stkd must be at 0.
 720         NanoAssert(params.stkd == 0);
 721
 722         if (ARM_VFP) {
 723             Register dm = findRegFor(arg, FpRegs);
 724             // TODO: We could optimize the this to store directly from
 725             // the VFP register to memory using "FMRRD ra, fp_reg[31:0]" and
 726             // "STR fp_reg[63:32], [SP, #stkd]".
 727
 728             // Load from the floating-point register as usual, but use IP
 729             // as a swap register.
 730             STR(IP, SP, 0);
 731             FMRRD(ra, IP, dm);
 732         } else {
 733             // Without VFP, we can simply use asm_regarg and asm_stkarg to
 734             // encode the two 32-bit words as we don't need to load from a VFP
 735             // register.
 736             asm_regarg(ARGTYPE_I, arg->oprnd1(), ra);
 737             asm_stkarg(arg->oprnd2(), 0);
 738         }
 739         params.stkd += 4;
 740 #endif
 741     } else {
 742         // The argument won't fit in registers, so pass on to asm_stkarg.
 743 #ifdef NJ_ARM_EABI
 744         // EABI requires that 64-bit arguments are 64-bit aligned.
 745         if ((params.stkd & 7) != 0) {
 746             // stkd will always be aligned to at least 4 bytes; this was
 747             // asserted on entry to this function.
 748             params.stkd += 4;
 749         }
 750 #endif
 751         if (ARM_VFP) {
 752             asm_stkarg(arg, params.stkd);
 753         } else {
 754             asm_stkarg(arg->oprnd1(), params.stkd);
 755             asm_stkarg(arg->oprnd2(), params.stkd+4);
 756         }
 757         params.stkd += 8;
 758     }
 759 }
 760 #endif // NJ_ARM_EABI_HARD_FLOAT
 761
 762 void
 763 Assembler::asm_regarg(ArgType ty, LIns* p, Register rd)
 764 {
 765     // Note that we don't have to prepareResultReg here because it is already
 766     // done by the caller, and the target register is passed as 'rd'.
 767     // Similarly, we don't have to freeResourcesOf(p).
 768
 769     if (ty == ARGTYPE_I || ty == ARGTYPE_UI)
 770     {
 771         // Put the argument in register rd.
 772         if (p->isImmI()) {
 773             asm_ld_imm(rd, p->immI());
 774         } else {
 775             if (p->isInReg()) {
 776                 MOV(rd, p->getReg());
 777             } else {
 778                 // Re-use the target register if the source is no longer
 779                 // required. This saves a MOV instruction.
 780                 findSpecificRegForUnallocated(p, rd);
 781             }
 782         }
 783     } else {
 784         NanoAssert(ty == ARGTYPE_D);
 785         // Floating-point arguments are handled as two integer arguments.
 786         NanoAssert(false);
 787     }
 788 }
 789
 790 void
 791 Assembler::asm_stkarg(LIns* arg, int stkd)
 792 {
 793     // The ABI doesn't allow accesses below the SP.
 794     NanoAssert(stkd >= 0);
 795     // The argument resides somewhere in registers, so we simply need to
 796     // push it onto the stack.
 797     if (arg->isI()) {
 798         Register rt = findRegFor(arg, GpRegs);
 799         asm_str(rt, SP, stkd);
 800     } else {
 801         // According to the comments in asm_arg_64, LIR_ii2d
 802         // can have a 64-bit argument even if VFP is disabled. However,
 803         // asm_arg_64 will split the argument and issue two 32-bit
 804         // arguments to asm_stkarg so we can ignore that case here.
 805         NanoAssert(arg->isD());
 806         NanoAssert(ARM_VFP);
 807         Register dt = findRegFor(arg, FpRegs);
 808 #ifdef NJ_ARM_EABI
 809         // EABI requires that 64-bit arguments are 64-bit aligned.
 810         NanoAssert((stkd % 8) == 0);
 811 #endif
 812         FSTD(dt, SP, stkd);
 813     }
 814 }
 815
 816 void
 817 Assembler::asm_call(LIns* ins)
 818 {
 819     if (ARM_VFP && ins->isop(LIR_calld)) {
 820         /* Because ARM actually returns the result in (R0,R1), and not in a
 821          * floating point register, the code to move the result into a correct
 822          * register is below.  We do nothing here.
 823          *
 824          * The reason being that if we did something here, the final code
 825          * sequence we'd get would be something like:
 826          *     MOV {R0-R3},params        [from below]
 827          *     BL function               [from below]
 828          *     MOV {R0-R3},spilled data  [from evictScratchRegsExcept()]
 829          *     MOV Dx,{R0,R1}            [from here]
 830          * which is clearly broken.
 831          *
 832          * This is not a problem for non-floating point calls, because the
 833          * restoring of spilled data into R0 is done via a call to
 834          * prepareResultReg(R0) in the other branch of this if-then-else,
 835          * meaning that evictScratchRegsExcept() will not modify R0. However,
 836          * prepareResultReg is not aware of the concept of using a register
 837          * pair (R0,R1) for the result of a single operation, so it can only be
 838          * used here with the ultimate VFP register, and not R0/R1, which
 839          * potentially allows for R0/R1 to get corrupted as described.
 840          */
 841 #ifdef NJ_ARM_EABI_HARD_FLOAT
 842         /* With ARM hardware floating point ABI, D0 is used to return the double
 843          * from the function. We need to prepare it like we do for R0 in the else
 844          * branch.
 845          */
 846         prepareResultReg(ins, rmask(D0));
 847         freeResourcesOf(ins);
 848 #endif
 849     } else if (!ins->isop(LIR_callv)) {
 850         prepareResultReg(ins, rmask(retRegs[0]));
 851         // Immediately free the resources as we need to re-use the register for
 852         // the arguments.
 853         freeResourcesOf(ins);
 854     }
 855
 856     // Do this after we've handled the call result, so we don't
 857     // force the call result to be spilled unnecessarily.
 858
 859     evictScratchRegsExcept(0);
 860
 861     const CallInfo* ci = ins->callInfo();
 862     ArgType argTypes[MAXARGS];
 863     uint32_t argc = ci->getArgTypes(argTypes);
 864     bool indirect = ci->isIndirect();
 865
 866     // If we aren't using VFP, assert that the LIR operation is an integer
 867     // function call.
 868     NanoAssert(ARM_VFP || ins->isop(LIR_callv) || ins->isop(LIR_calli));
 869
 870     // If we're using VFP, but not hardware floating point ABI, and
 871     // the return type is a double, it'll come back in R0/R1.
 872     // We need to either place it in the result fp reg, or store it.
 873     // See comments above for more details as to why this is necessary here
 874     // for floating point calls, but not for integer calls.
 875     if (!ARM_EABI_HARD && ARM_VFP && ins->isExtant()) {
 876         // If the result size is a floating-point value, treat the result
 877         // specially, as described previously.
 878         if (ci->returnType() == ARGTYPE_D) {
 879             NanoAssert(ins->isop(LIR_calld));
 880
 881             if (ins->isInReg()) {
 882                 Register dd = ins->getReg();
 883                 // Copy the result to the (VFP) result register.
 884                 FMDRR(dd, R0, R1);
 885             } else {
 886                 int d = findMemFor(ins);
 887                 // Immediately free the resources so the arguments can re-use
 888                 // the slot.
 889                 freeResourcesOf(ins);
 890
 891                 // The result doesn't have a register allocated, so store the
 892                 // result (in R0,R1) directly to its stack slot.
 893                 asm_str(R0, FP, d+0);
 894                 asm_str(R1, FP, d+4);
 895             }
 896         }
 897     }
 898
 899     // Emit the branch.
 900     if (!indirect) {
 901         verbose_only(if (_logc->lcbits & LC_Native)
 902             outputf("        %p:", _nIns);
 903         )
 904
 905         BranchWithLink((NIns*)ci->_address);
 906     } else {
 907         // Indirect call: we assign the address arg to LR
 908         if (ARM_ARCH_AT_LEAST(5)) {
 909 #ifndef UNDER_CE
 910             // workaround for msft device emulator bug (blx lr emulated as no-op)
 911             underrunProtect(8);
 912             BLX(IP);
 913             MOV(IP, LR);
 914 #else
 915             BLX(LR);
 916 #endif
 917         } else {
 918             underrunProtect(12);
 919             BX(IP);
 920             MOV(LR, PC);
 921             MOV(IP, LR);
 922         }
 923         asm_regarg(ARGTYPE_I, ins->arg(--argc), LR);
 924     }
 925
 926     // Encode the arguments, starting at R0 and with an empty argument stack (0).
 927     // With hardware fp ABI, floating point arguments start from D0.
 928     ParameterRegisters params = init_params(0, R0, D0);
 929
 930     // Iterate through the argument list and encode each argument according to
 931     // the ABI.
 932     // Note that we loop through the arguments backwards as LIR specifies them
 933     // in reverse order.
 934     uint32_t    i = argc;
 935     while(i--) {
 936         asm_arg(argTypes[i], ins->arg(i), params);
 937     }
 938
 939     if (params.stkd > max_out_args) {
 940         max_out_args = params.stkd;
 941     }
 942 }
 943
 944 Register
 945 Assembler::nRegisterAllocFromSet(RegisterMask set)
 946 {
 947     NanoAssert(set != 0);
 948
 949     // The CountLeadingZeroes function will use the CLZ instruction where
 950     // available. In other cases, it will fall back to a (slower) C
 951     // implementation.
 952     Register r = (Register)(31-CountLeadingZeroes(set));
 953     _allocator.free &= ~rmask(r);
 954
 955     NanoAssert(IsGpReg(r) || IsFpReg(r));
 956     NanoAssert((rmask(r) & set) == rmask(r));
 957
 958     return r;
 959 }
 960
 961 void
 962 Assembler::nRegisterResetAll(RegAlloc& a)
 963 {
 964     // add scratch registers to our free list for the allocator
 965     a.clear();
 966     a.free =
 967         rmask(R0) | rmask(R1) | rmask(R2) | rmask(R3) | rmask(R4) |
 968         rmask(R5) | rmask(R6) | rmask(R7) | rmask(R8) | rmask(R9) |
 969         rmask(R10) | rmask(LR);
 970     if (ARM_VFP) {
 971         a.free |=
 972             rmask(D0) | rmask(D1) | rmask(D2) | rmask(D3) |
 973             rmask(D4) | rmask(D5) | rmask(D6) | rmask(D7);
 974     }
 975 }
 976
 977 static inline ConditionCode
 978 get_cc(NIns *ins)
 979 {
 980     return ConditionCode((*ins >> 28) & 0xF);
 981 }
 982
 983 static inline bool
 984 branch_is_B(NIns* branch)
 985 {
 986     return (*branch & 0x0E000000) == 0x0A000000;
 987 }
 988
 989 static inline bool
 990 branch_is_LDR_PC(NIns* branch)
 991 {
 992     return (*branch & 0x0F7FF000) == 0x051FF000;
 993 }
 994
 995 // Is this an instruction of the form  ldr/str reg, [fp, #-imm] ?
 996 static inline bool
 997 is_ldstr_reg_fp_minus_imm(/*OUT*/uint32_t* isLoad, /*OUT*/uint32_t* rX,
 998                           /*OUT*/uint32_t* immX, NIns i1)
 999 {
1000     if ((i1 & 0xFFEF0000) != 0xE50B0000)
1001         return false;
1002     *isLoad = (i1 >> 20) & 1;
1003     *rX     = (i1 >> 12) & 0xF;
1004     *immX   = i1 & 0xFFF;
1005     return true;
1006 }
1007
1008 // Is this an instruction of the form  ldmdb/stmdb fp, regset ?
1009 static inline bool
1010 is_ldstmdb_fp(/*OUT*/uint32_t* isLoad, /*OUT*/uint32_t* regSet, NIns i1)
1011 {
1012     if ((i1 & 0xFFEF0000) != 0xE90B0000)
1013         return false;
1014     *isLoad = (i1 >> 20) & 1;
1015     *regSet = i1 & 0xFFFF;
1016     return true;
1017 }
1018
1019 // Make an instruction of the form ldmdb/stmdb fp, regset
1020 static inline NIns
1021 mk_ldstmdb_fp(uint32_t isLoad, uint32_t regSet)
1022 {
1023     return 0xE90B0000 | (regSet & 0xFFFF) | ((isLoad & 1) << 20);
1024 }
1025
1026 // Compute the number of 1 bits in the lowest 16 bits of regSet
1027 static inline uint32_t
1028 size_of_regSet(uint32_t regSet)
1029 {
1030    uint32_t x = regSet;
1031    x = (x & 0x5555) + ((x >> 1) & 0x5555);
1032    x = (x & 0x3333) + ((x >> 2) & 0x3333);
1033    x = (x & 0x0F0F) + ((x >> 4) & 0x0F0F);
1034    x = (x & 0x00FF) + ((x >> 8) & 0x00FF);
1035    return x;
1036 }
1037
1038 // See if two ARM instructions, i1 and i2, can be combined into one
1039 static bool
1040 do_peep_2_1(/*OUT*/NIns* merged, NIns i1, NIns i2)
1041 {
1042     uint32_t rX, rY, immX, immY, isLoadX, isLoadY, regSet;
1043     /*   ld/str rX, [fp, #-8]
1044          ld/str rY, [fp, #-4]
1045          ==>
1046          ld/stmdb fp, {rX, rY}
1047          when
1048          X < Y and X != fp and Y != fp and X != 15 and Y != 15
1049     */
1050     if (is_ldstr_reg_fp_minus_imm(&isLoadX, &rX, &immX, i1) &&
1051         is_ldstr_reg_fp_minus_imm(&isLoadY, &rY, &immY, i2) &&
1052         immX == 8 && immY == 4 && rX < rY &&
1053         isLoadX == isLoadY &&
1054         rX != FP && rY != FP &&
1055          rX != 15 && rY != 15) {
1056         *merged = mk_ldstmdb_fp(isLoadX, (1 << rX) | (1<<rY));
1057         return true;
1058     }
1059     /*   ld/str   rX, [fp, #-N]
1060          ld/stmdb fp, regset
1061          ==>
1062          ld/stmdb fp, union(regset,{rX})
1063          when
1064          regset is nonempty
1065          X < all elements of regset
1066          N == 4 * (1 + card(regset))
1067          X != fp and X != 15
1068     */
1069     if (is_ldstr_reg_fp_minus_imm(&isLoadX, &rX, &immX, i1) &&
1070         is_ldstmdb_fp(&isLoadY, &regSet, i2) &&
1071         regSet != 0 &&
1072         (regSet & ((1 << (rX + 1)) - 1)) == 0 &&
1073         immX == 4 * (1 + size_of_regSet(regSet)) &&
1074         isLoadX == isLoadY &&
1075         rX != FP && rX != 15) {
1076         *merged = mk_ldstmdb_fp(isLoadX, regSet | (1 << rX));
1077         return true;
1078     }
1079     return false;
1080 }
1081
1082 // Determine whether or not it's safe to look at _nIns[1].
1083 // Necessary condition for safe peepholing with do_peep_2_1.
1084 static inline bool
1085 does_next_instruction_exist(NIns* _nIns, NIns* codeStart, NIns* codeEnd,
1086                             NIns* exitStart, NIns* exitEnd)
1087 {
1088     return (exitStart <= _nIns && _nIns+1 < exitEnd) ||
1089            (codeStart <= _nIns && _nIns+1 < codeEnd);
1090 }
1091
1092 void
1093 Assembler::nPatchBranch(NIns* branch, NIns* target)
1094 {
1095     // Patch the jump in a loop
1096
1097     //
1098     // There are two feasible cases here, the first of which has 2 sub-cases:
1099     //
1100     //   (1) We are patching a patchable unconditional jump emitted by
1101     //       JMP_far.  All possible encodings we may be looking at with
1102     //       involve 2 words, though we *may* have to change from 1 word to
1103     //       2 or vice verse.
1104     //
1105     //          1a:  B ±32MB ; BKPT
1106     //          1b:  LDR PC [PC, #-4] ; $imm
1107     //
1108     //   (2) We are patching a patchable conditional jump emitted by
1109     //       B_cond_chk.  Short conditional jumps are non-patchable, so we
1110     //       won't have one here; will only ever have an instruction of the
1111     //       following form:
1112     //
1113     //          LDRcc PC [PC, #lit] ...
1114     //
1115     //       We don't actually know whether the lit-address is in the
1116     //       constant pool or in-line of the instruction stream, following
1117     //       the insn (with a jump over it) and we don't need to. For our
1118     //       purposes here, cases 2, 3 and 4 all look the same.
1119     //
1120     // For purposes of handling our patching task, we group cases 1b and 2
1121     // together, and handle case 1a on its own as it might require expanding
1122     // from a short-jump to a long-jump.
1123     //
1124     // We do not handle contracting from a long-jump to a short-jump, though
1125     // this is a possible future optimisation for case 1b. For now it seems
1126     // not worth the trouble.
1127     //
1128
1129     if (branch_is_B(branch)) {
1130         // Case 1a
1131         // A short B branch, must be unconditional.
1132         NanoAssert(get_cc(branch) == AL);
1133
1134         int32_t offset = PC_OFFSET_FROM(target, branch);
1135         if (isS24(offset>>2)) {
1136             // We can preserve the existing form, just rewrite its offset.
1137             NIns cond = *branch & 0xF0000000;
1138             *branch = (NIns)( cond | (0xA<<24) | ((offset>>2) & 0xFFFFFF) );
1139         } else {
1140             // We need to expand the existing branch to a long jump.
1141             // make sure the next instruction is a dummy BKPT
1142             NanoAssert(*(branch+1) == BKPT_insn);
1143
1144             // Set the branch instruction to   LDRcc pc, [pc, #-4]
1145             NIns cond = *branch & 0xF0000000;
1146             *branch++ = (NIns)( cond | (0x51<<20) | (PC<<16) | (PC<<12) | (4));
1147             *branch++ = (NIns)target;
1148         }
1149     } else {
1150         // Case 1b & 2
1151         // Not a B branch, must be LDR, might be any kind of condition.
1152         NanoAssert(branch_is_LDR_PC(branch));
1153
1154         NIns *addr = branch+2;
1155         int offset = (*branch & 0xFFF) / sizeof(NIns);
1156         if (*branch & (1<<23)) {
1157             addr += offset;
1158         } else {
1159             addr -= offset;
1160         }
1161
1162         // Just redirect the jump target, leave the insn alone.
1163         *addr = (NIns) target;
1164     }
1165 }
1166
1167 RegisterMask
1168 Assembler::nHint(LIns* ins)
1169 {
1170     NanoAssert(ins->isop(LIR_paramp));
1171     RegisterMask prefer = 0;
1172     if (ins->paramKind() == 0)
1173         if (ins->paramArg() < 4)
1174             prefer = rmask(argRegs[ins->paramArg()]);
1175     return prefer;
1176 }
1177
1178 void
1179 Assembler::asm_qjoin(LIns *ins)
1180 {
1181     int d = findMemFor(ins);
1182     NanoAssert(d);
1183     LIns* lo = ins->oprnd1();
1184     LIns* hi = ins->oprnd2();
1185
1186     Register rlo;
1187     Register rhi;
1188
1189     findRegFor2(GpRegs, lo, rlo, GpRegs, hi, rhi);
1190
1191     asm_str(rhi, FP, d+4);
1192     asm_str(rlo, FP, d);
1193
1194     freeResourcesOf(ins);
1195 }
1196
1197 void
1198 Assembler::asm_store32(LOpcode op, LIns *value, int dr, LIns *base)
1199 {
1200     Register ra, rb;
1201     getBaseReg2(GpRegs, value, ra, GpRegs, base, rb, dr);
1202
1203     switch (op) {
1204         case LIR_sti:
1205             if (isU12(-dr) || isU12(dr)) {
1206                 STR(ra, rb, dr);
1207             } else {
1208                 STR(ra, IP, 0);
1209                 asm_add_imm(IP, rb, dr);
1210             }
1211             return;
1212         case LIR_sti2c:
1213             if (isU12(-dr) || isU12(dr)) {
1214                 STRB(ra, rb, dr);
1215             } else {
1216                 STRB(ra, IP, 0);
1217                 asm_add_imm(IP, rb, dr);
1218             }
1219             return;
1220         case LIR_sti2s:
1221             // Similar to the sti/stb case, but the max offset is smaller.
1222             if (isU8(-dr) || isU8(dr)) {
1223                 STRH(ra, rb, dr);
1224             } else {
1225                 STRH(ra, IP, 0);
1226                 asm_add_imm(IP, rb, dr);
1227             }
1228             return;
1229         default:
1230             NanoAssertMsg(0, "asm_store32 should never receive this LIR opcode");
1231             return;
1232     }
1233 }
1234
1235 bool
1236 canRematALU(LIns *ins)
1237 {
1238     // Return true if we can generate code for this instruction that neither
1239     // sets CCs, clobbers an input register, nor requires allocating a register.
1240     switch (ins->opcode()) {
1241     case LIR_addi:
1242     case LIR_subi:
1243     case LIR_andi:
1244     case LIR_ori:
1245     case LIR_xori:
1246         return ins->oprnd1()->isInReg() && ins->oprnd2()->isImmI();
1247     default:
1248         ;
1249     }
1250     return false;
1251 }
1252
1253 bool
1254 Assembler::canRemat(LIns* ins)
1255 {
1256     return ins->isImmI() || ins->isop(LIR_allocp) || canRematALU(ins);
1257 }
1258
1259 void
1260 Assembler::asm_restore(LIns* i, Register r)
1261 {
1262     // The following registers should never be restored:
1263     NanoAssert(r != PC);
1264     NanoAssert(r != IP);
1265     NanoAssert(r != SP);
1266
1267     if (i->isop(LIR_allocp)) {
1268         int d = findMemFor(i);
1269         asm_add_imm(r, FP, d);
1270     } else if (i->isImmI()) {
1271         asm_ld_imm(r, i->immI());
1272     } else if (canRematALU(i)) {
1273         Register rn = i->oprnd1()->getReg();
1274         int32_t imm = i->oprnd2()->immI();
1275         switch (i->opcode()) {
1276         case LIR_addi: asm_add_imm(r, rn, imm, /*stat=*/ 0); break;
1277         case LIR_subi: asm_sub_imm(r, rn, imm, /*stat=*/ 0); break;
1278         case LIR_andi: asm_and_imm(r, rn, imm, /*stat=*/ 0); break;
1279         case LIR_ori:  asm_orr_imm(r, rn, imm, /*stat=*/ 0); break;
1280         case LIR_xori: asm_eor_imm(r, rn, imm, /*stat=*/ 0); break;
1281         default:       NanoAssert(0);                        break;
1282         }
1283     } else {
1284         // We can't easily load immediate values directly into FP registers, so
1285         // ensure that memory is allocated for the constant and load it from
1286         // memory.
1287         int d = findMemFor(i);
1288         if (ARM_VFP && IsFpReg(r)) {
1289             if (isU8(d/4) || isU8(-d/4)) {
1290                 FLDD(r, FP, d);
1291             } else {
1292                 FLDD(r, IP, d%1024);
1293                 asm_add_imm(IP, FP, d-(d%1024));
1294             }
1295         } else {
1296             NIns merged;
1297             LDR(r, FP, d);
1298             // See if we can merge this load into an immediately following
1299             // one, by creating or extending an LDM instruction.
1300             if (/* is it safe to poke _nIns[1] ? */
1301                 does_next_instruction_exist(_nIns, codeStart, codeEnd,
1302                                                    exitStart, exitEnd)
1303                 && /* can we merge _nIns[0] into _nIns[1] ? */
1304                    do_peep_2_1(&merged, _nIns[0], _nIns[1])) {
1305                 _nIns[1] = merged;
1306                 _nIns++;
1307                 verbose_only( asm_output("merge next into LDMDB"); )
1308             }
1309         }
1310     }
1311 }
1312
1313 void
1314 Assembler::asm_spill(Register rr, int d, bool quad)
1315 {
1316     (void) quad;
1317     NanoAssert(d);
1318     // The following registers should never be spilled:
1319     NanoAssert(rr != PC);
1320     NanoAssert(rr != IP);
1321     NanoAssert(rr != SP);
1322     if (ARM_VFP && IsFpReg(rr)) {
1323         if (isU8(d/4) || isU8(-d/4)) {
1324             FSTD(rr, FP, d);
1325         } else {
1326             FSTD(rr, IP, d%1024);
1327             asm_add_imm(IP, FP, d-(d%1024));
1328         }
1329     } else {
1330         NIns merged;
1331         // asm_str always succeeds, but returns '1' to indicate that it emitted
1332         // a simple, easy-to-merge STR.
1333         if (asm_str(rr, FP, d)) {
1334             // See if we can merge this store into an immediately following one,
1335             // one, by creating or extending a STM instruction.
1336             if (/* is it safe to poke _nIns[1] ? */
1337                     does_next_instruction_exist(_nIns, codeStart, codeEnd,
1338                         exitStart, exitEnd)
1339                     && /* can we merge _nIns[0] into _nIns[1] ? */
1340                     do_peep_2_1(&merged, _nIns[0], _nIns[1])) {
1341                 _nIns[1] = merged;
1342                 _nIns++;
1343                 verbose_only( asm_output("merge next into STMDB"); )
1344             }
1345         }
1346     }
1347 }
1348
1349 void
1350 Assembler::asm_load64(LIns* ins)
1351 {
1352     NanoAssert(ins->isD());
1353
1354     if (ARM_VFP) {
1355         Register    dd;
1356         LIns*       base = ins->oprnd1();
1357         Register    rn = findRegFor(base, GpRegs);
1358         int         offset = ins->disp();
1359
1360         if (ins->isInReg()) {
1361             dd = prepareResultReg(ins, FpRegs & ~rmask(D0));
1362         } else {
1363             // If the result isn't already in a register, use the VFP scratch
1364             // register for the result and store it directly into memory.
1365             NanoAssert(ins->isInAr());
1366             int d = arDisp(ins);
1367             evictIfActive(D0);
1368             dd = D0;
1369             // VFP can only do loads and stores with a range of ±1020, so we
1370             // might need to do some arithmetic to extend its range.
1371             if (isU8(d/4) || isU8(-d/4)) {
1372                 FSTD(dd, FP, d);
1373             } else {
1374                 FSTD(dd, IP, d%1024);
1375                 asm_add_imm(IP, FP, d-(d%1024));
1376             }
1377         }
1378
1379         switch (ins->opcode()) {
1380             case LIR_ldd:
1381                 if (isU8(offset/4) || isU8(-offset/4)) {
1382                     FLDD(dd, rn, offset);
1383                 } else {
1384                     FLDD(dd, IP, offset%1024);
1385                     asm_add_imm(IP, rn, offset-(offset%1024));
1386                 }
1387                 break;
1388             case LIR_ldf2d:
1389                 evictIfActive(D0);
1390                 FCVTDS(dd, S0);
1391                 if (isU8(offset/4) || isU8(-offset/4)) {
1392                     FLDS(S0, rn, offset);
1393                 } else {
1394                     FLDS(S0, IP, offset%1024);
1395                     asm_add_imm(IP, rn, offset-(offset%1024));
1396                 }
1397                 break;
1398             default:
1399                 NanoAssertMsg(0, "LIR opcode unsupported by asm_load64.");
1400                 break;
1401         }
1402     } else {
1403         NanoAssert(ins->isInAr());
1404         int         d = arDisp(ins);
1405
1406         LIns*       base = ins->oprnd1();
1407         Register    rn = findRegFor(base, GpRegs);
1408         int         offset = ins->disp();
1409
1410         switch (ins->opcode()) {
1411             case LIR_ldd:
1412                 asm_mmq(FP, d, rn, offset);
1413                 break;
1414             case LIR_ldf2d:
1415                 NanoAssertMsg(0, "LIR_ldf2d is not yet implemented for soft-float.");
1416                 break;
1417             default:
1418                 NanoAssertMsg(0, "LIR opcode unsupported by asm_load64.");
1419                 break;
1420         }
1421     }
1422
1423     freeResourcesOf(ins);
1424 }
1425
1426 void
1427 Assembler::asm_store64(LOpcode op, LIns* value, int dr, LIns* base)
1428 {
1429     NanoAssert(value->isD());
1430
1431     if (ARM_VFP) {
1432         Register dd = findRegFor(value, FpRegs & ~rmask(D0));
1433         Register rn = findRegFor(base, GpRegs);
1434
1435         switch (op) {
1436             case LIR_std:
1437                 // VFP can only do stores with a range of ±1020, so we might
1438                 // need to do some arithmetic to extend its range.
1439                 if (isU8(dr/4) || isU8(-dr/4)) {
1440                     FSTD(dd, rn, dr);
1441                 } else {
1442                     FSTD(dd, IP, dr%1024);
1443                     asm_add_imm(IP, rn, dr-(dr%1024));
1444                 }
1445
1446                 break;
1447             case LIR_std2f:
1448                 // VFP can only do stores with a range of ±1020, so we might
1449                 // need to do some arithmetic to extend its range.
1450                 evictIfActive(D0);
1451                 if (isU8(dr/4) || isU8(-dr/4)) {
1452                     FSTS(S0, rn, dr);
1453                 } else {
1454                     FSTS(S0, IP, dr%1024);
1455                     asm_add_imm(IP, rn, dr-(dr%1024));
1456                 }
1457
1458                 FCVTSD(S0, dd);
1459
1460                 break;
1461             default:
1462                 NanoAssertMsg(0, "LIR opcode unsupported by asm_store64.");
1463                 break;
1464         }
1465     } else {
1466         int         d = findMemFor(value);
1467         Register    rn = findRegFor(base, GpRegs);
1468
1469         switch (op) {
1470             case LIR_std:
1471                 // Doubles in soft-float never get registers allocated, so this
1472                 // is always a simple two-word memcpy.
1473                 // *(uint64_t*)(rb+dr) = *(uint64_t*)(FP+da)
1474                 asm_mmq(rn, dr, FP, d);
1475                 break;
1476             case LIR_std2f:
1477                 NanoAssertMsg(0, "TODO: Soft-float implementation of LIR_std2f.");
1478                 break;
1479             default:
1480                 NanoAssertMsg(0, "LIR opcode unsupported by asm_store64.");
1481                 break;
1482         }
1483     }
1484 }
1485
1486 // Load the float64 specified by immDhi:immDlo into VFP register dd.
1487 void
1488 Assembler::asm_immd_nochk(Register dd, int32_t immDlo, int32_t immDhi)
1489 {
1490     // We're not going to use a slot, because it might be too far
1491     // away.  Instead, we're going to stick a branch in the stream to
1492     // jump over the constants, and then load from a short PC relative
1493     // offset.
1494
1495     // stream should look like:
1496     //    branch A
1497     //    immDlo
1498     //    immDhi
1499     // A: FLDD PC-16
1500
1501     FLDD(dd, PC, -16);
1502
1503     *(--_nIns) = (NIns) immDhi;
1504     *(--_nIns) = (NIns) immDlo;
1505
1506     B_nochk(_nIns+2);
1507 }
1508
1509 void
1510 Assembler::asm_immd(LIns* ins)
1511 {
1512     // If the value isn't in a register, it's simplest to use integer
1513     // instructions to put the value in its stack slot. Otherwise, use a VFP
1514     // load to get the value from a literal pool.
1515     if (ARM_VFP && ins->isInReg()) {
1516         Register dd = prepareResultReg(ins, FpRegs);
1517         underrunProtect(4*4);
1518         asm_immd_nochk(dd, ins->immDlo(), ins->immDhi());
1519     } else {
1520         NanoAssert(ins->isInAr());
1521         int d = arDisp(ins);
1522         asm_str(IP, FP, d+4);
1523         asm_ld_imm(IP, ins->immDhi());
1524         asm_str(IP, FP, d);
1525         asm_ld_imm(IP, ins->immDlo());
1526     }
1527
1528     freeResourcesOf(ins);
1529 }
1530
1531 void
1532 Assembler::asm_nongp_copy(Register r, Register s)
1533 {
1534     if (ARM_VFP && IsFpReg(r) && IsFpReg(s)) {
1535         // fp->fp
1536         FCPYD(r, s);
1537     } else {
1538         // We can't move a double-precision FP register into a 32-bit GP
1539         // register, so assert that no calling code is trying to do that.
1540         NanoAssert(0);
1541     }
1542 }
1543
1544 /**
1545  * copy 64 bits: (rd+dd) <- (rs+ds)
1546  */
1547 void
1548 Assembler::asm_mmq(Register rd, int dd, Register rs, int ds)
1549 {
1550     // The value is either a 64bit struct or maybe a float that isn't live in
1551     // an FPU reg.  Either way, don't put it in an FPU reg just to load & store
1552     // it.
1553     // This operation becomes a simple 64-bit memcpy.
1554
1555     // In order to make the operation optimal, we will require two GP
1556     // registers. We can't allocate a register here because the caller may have
1557     // called deprecated_freeRsrcOf, and allocating a register here may cause something
1558     // else to spill onto the stack which has just be conveniently freed by
1559     // deprecated_freeRsrcOf (resulting in stack corruption).
1560     //
1561     // Falling back to a single-register implementation of asm_mmq is better
1562     // than adjusting the callers' behaviour (to allow us to allocate another
1563     // register here) because spilling a register will end up being slower than
1564     // just using the same register twice anyway.
1565     //
1566     // Thus, if there is a free register which we can borrow, we will emit the
1567     // following code:
1568     //  LDR rr, [rs, #ds]
1569     //  LDR ip, [rs, #(ds+4)]
1570     //  STR rr, [rd, #dd]
1571     //  STR ip, [rd, #(dd+4)]
1572     // (Where rr is the borrowed register.)
1573     //
1574     // If there is no free register, don't spill an existing allocation. Just
1575     // do the following:
1576     //  LDR ip, [rs, #ds]
1577     //  STR ip, [rd, #dd]
1578     //  LDR ip, [rs, #(ds+4)]
1579     //  STR ip, [rd, #(dd+4)]
1580     //
1581     // Note that if rs+4 or rd+4 is outside the LDR or STR range, extra
1582     // instructions will be emitted as required to make the code work.
1583
1584     // Ensure that the PC is not used as either base register. The instruction
1585     // generation macros call underrunProtect, and a side effect of this is
1586     // that we may be pushed onto another page, so the PC is not a reliable
1587     // base register.
1588     NanoAssert(rs != PC);
1589     NanoAssert(rd != PC);
1590
1591     // We use IP as a swap register, so check that it isn't used for something
1592     // else by the caller.
1593     NanoAssert(rs != IP);
1594     NanoAssert(rd != IP);
1595
1596     // Find the list of free registers from the allocator's free list and the
1597     // GpRegs mask. This excludes any floating-point registers that may be on
1598     // the free list.
1599     RegisterMask    free = _allocator.free & AllowableFlagRegs;
1600
1601     // Ensure that ds and dd are within the +/-4095 offset range of STR and
1602     // LDR. If either is out of range, adjust and modify rd or rs so that the
1603     // load works correctly.
1604     // The modification here is performed after the LDR/STR block (because code
1605     // is emitted backwards), so this one is the reverse operation.
1606
1607     int32_t dd_adj = 0;
1608     int32_t ds_adj = 0;
1609
1610     if ((dd+4) >= 0x1000) {
1611         dd_adj = ((dd+4) & ~0xfff);
1612     } else if (dd <= -0x1000) {
1613         dd_adj = -((-dd) & ~0xfff);
1614     }
1615     if ((ds+4) >= 0x1000) {
1616         ds_adj = ((ds+4) & ~0xfff);
1617     } else if (ds <= -0x1000) {
1618         ds_adj = -((-ds) & ~0xfff);
1619     }
1620
1621     // These will emit no code if d*_adj is 0.
1622     asm_sub_imm(rd, rd, dd_adj);
1623     asm_sub_imm(rs, rs, ds_adj);
1624
1625     ds -= ds_adj;
1626     dd -= dd_adj;
1627
1628     if (free) {
1629         // There is at least one register on the free list, so grab one for
1630         // temporary use. There is no need to allocate it explicitly because
1631         // we won't need it after this function returns.
1632
1633         // The CountLeadingZeroes utility can be used to quickly find a set bit
1634         // in the free mask.
1635         Register    rr = (Register)(31-CountLeadingZeroes(free));
1636
1637         // Note: Not every register in GpRegs is usable here. However, these
1638         // registers will never appear on the free list.
1639         NanoAssert((free & rmask(PC)) == 0);
1640         NanoAssert((free & rmask(LR)) == 0);
1641         NanoAssert((free & rmask(SP)) == 0);
1642         NanoAssert((free & rmask(IP)) == 0);
1643         NanoAssert((free & rmask(FP)) == 0);
1644
1645         // Emit the actual instruction sequence.
1646         STR(IP, rd, dd+4);
1647         STR(rr, rd, dd);
1648         LDR(IP, rs, ds+4);
1649         LDR(rr, rs, ds);
1650     } else {
1651         // There are no free registers, so fall back to using IP twice.
1652         STR(IP, rd, dd+4);
1653         LDR(IP, rs, ds+4);
1654         STR(IP, rd, dd);
1655         LDR(IP, rs, ds);
1656     }
1657
1658     // Re-adjust the base registers. (These will emit no code if d*_adj is 0.
1659     asm_add_imm(rd, rd, dd_adj);
1660     asm_add_imm(rs, rs, ds_adj);
1661 }
1662
1663 // Increment the 32-bit profiling counter at pCtr, without
1664 // changing any registers.
1665 verbose_only(
1666 void Assembler::asm_inc_m32(uint32_t* pCtr)
1667 {
1668     // We need to temporarily free up two registers to do this, so
1669     // just push r0 and r1 on the stack.  This assumes that the area
1670     // at r13 - 8 .. r13 - 1 isn't being used for anything else at
1671     // this point.  This guaranteed us by the EABI; although the
1672     // situation with the legacy ABI I'm not sure of.
1673     //
1674     // Plan: emit the following bit of code.  It's not efficient, but
1675     // this is for profiling debug builds only, and is self contained,
1676     // except for above comment re stack use.
1677     //
1678     // E92D0003                 push    {r0,r1}
1679     // E59F0000                 ldr     r0, [r15]   ; pCtr
1680     // EA000000                 b       .+8         ; jump over imm
1681     // 12345678                 .word   0x12345678  ; pCtr
1682     // E5901000                 ldr     r1, [r0]
1683     // E2811001                 add     r1, r1, #1
1684     // E5801000                 str     r1, [r0]
1685     // E8BD0003                 pop     {r0,r1}
1686
1687     // We need keep the 4 words beginning at "ldr r0, [r15]"
1688     // together.  Simplest to underrunProtect the whole thing.
1689     underrunProtect(8*4);
1690     IMM32(0xE8BD0003);       //  pop     {r0,r1}
1691     IMM32(0xE5801000);       //  str     r1, [r0]
1692     IMM32(0xE2811001);       //  add     r1, r1, #1
1693     IMM32(0xE5901000);       //  ldr     r1, [r0]
1694     IMM32((uint32_t)pCtr);   //  .word   pCtr
1695     IMM32(0xEA000000);       //  b       .+8
1696     IMM32(0xE59F0000);       //  ldr     r0, [r15]
1697     IMM32(0xE92D0003);       //  push    {r0,r1}
1698 }
1699 )
1700
1701 void
1702 Assembler::nativePageReset()
1703 {
1704     _nSlot = 0;
1705     _nExitSlot = 0;
1706 }
1707
1708 void
1709 Assembler::nativePageSetup()
1710 {
1711     NanoAssert(!_inExit);
1712     if (!_nIns)
1713         codeAlloc(codeStart, codeEnd, _nIns verbose_only(, codeBytes), NJ_MAX_CPOOL_OFFSET);
1714
1715     // constpool starts at top of page and goes down,
1716     // code starts at bottom of page and moves up
1717     if (!_nSlot)
1718         _nSlot = codeStart;
1719 }
1720
1721
1722 void
1723 Assembler::underrunProtect(int bytes)
1724 {
1725     NanoAssertMsg(bytes<=LARGEST_UNDERRUN_PROT, "constant LARGEST_UNDERRUN_PROT is too small");
1726     NanoAssert(_nSlot != 0 && int(_nIns)-int(_nSlot) <= 4096);
1727     uintptr_t top = uintptr_t(_nSlot);
1728     uintptr_t pc = uintptr_t(_nIns);
1729     if (pc - bytes < top)
1730     {
1731         verbose_only(verbose_outputf("        %p:", _nIns);)
1732         NIns* target = _nIns;
1733         // This may be in a normal code chunk or an exit code chunk.
1734         codeAlloc(codeStart, codeEnd, _nIns verbose_only(, codeBytes), NJ_MAX_CPOOL_OFFSET);
1735
1736         _nSlot = codeStart;
1737
1738         // _nSlot points to the first empty position in the new code block
1739         // _nIns points just past the last empty position.
1740         // Assume B_nochk won't ever try to write to _nSlot. See B_cond_chk macro.
1741         B_nochk(target);
1742     }
1743 }
1744
1745 void
1746 Assembler::JMP_far(NIns* addr)
1747 {
1748     // Even if a simple branch is all that is required, this function must emit
1749     // two words so that the branch can be arbitrarily patched later on.
1750     underrunProtect(8);
1751
1752     intptr_t offs = PC_OFFSET_FROM(addr,_nIns-2);
1753
1754     if (isS24(offs>>2)) {
1755         // Emit a BKPT to ensure that we reserve enough space for a full 32-bit
1756         // branch patch later on. The BKPT should never be executed.
1757         BKPT_nochk();
1758
1759         asm_output("bkpt");
1760
1761         // B [PC+offs]
1762         *(--_nIns) = (NIns)( COND_AL | (0xA<<24) | ((offs>>2) & 0xFFFFFF) );
1763
1764         asm_output("b %p", (void*)addr);
1765     } else {
1766         // Insert the target address as a constant in the instruction stream.
1767         *(--_nIns) = (NIns)((addr));
1768         // ldr pc, [pc, #-4] // load the address into pc, reading it from [pc-4] (e.g.,
1769         // the next instruction)
1770         *(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | (PC<<12) | (4));
1771
1772         asm_output("ldr pc, =%p", (void*)addr);
1773     }
1774 }
1775
1776 // Perform a branch with link, and ARM/Thumb exchange if necessary. The actual
1777 // BLX instruction is only available from ARMv5 onwards, but as we don't
1778 // support anything older than that this function will not attempt to output
1779 // pre-ARMv5 sequences.
1780 //
1781 // Note: This function is not designed to be used with branches which will be
1782 // patched later, though it will work if the patcher knows how to patch the
1783 // generated instruction sequence.
1784 void
1785 Assembler::BranchWithLink(NIns* addr)
1786 {
1787     // Most branches emitted by TM are loaded through a register, so always
1788     // reserve enough space for the LDR sequence. This should give us a slight
1789     // net gain over reserving the exact amount required for shorter branches.
1790     // This _must_ be called before PC_OFFSET_FROM as it can move _nIns!
1791     underrunProtect(8+LD32_size);
1792
1793     // Calculate the offset from the instruction that is about to be
1794     // written (at _nIns-1) to the target.
1795     intptr_t offs = PC_OFFSET_FROM(addr,_nIns-1);
1796
1797     // ARMv5 and above can use BLX <imm> for branches within ±32MB of the
1798     // PC and BLX Rm for long branches.
1799     if (isS24(offs>>2)) {
1800         // the value we need to stick in the instruction; masked,
1801         // because it will be sign-extended back to 32 bits.
1802         intptr_t offs2 = (offs>>2) & 0xffffff;
1803
1804         if (((intptr_t)addr & 1) == 0) {
1805             // The target is ARM, so just emit a BL.
1806
1807             // BL target
1808             *(--_nIns) = (NIns)( (COND_AL) | (0xB<<24) | (offs2) );
1809             asm_output("bl %p", (void*)addr);
1810             return;
1811         } else if (ARM_ARCH_AT_LEAST(5)) {
1812             // The target is Thumb, so emit a BLX (ARMv5+)
1813             // The (pre-shifted) value of the "H" bit in the BLX encoding.
1814             uint32_t    H = (offs & 0x2) << 23;
1815
1816             // BLX addr
1817             *(--_nIns) = (NIns)( (0xF << 28) | (0x5<<25) | (H) | (offs2) );
1818             asm_output("blx %p", (void*)addr);
1819             return;
1820         }
1821         /* If we get here, it means we are on ARMv4T, and the target is Thumb,
1822            in which case we want to emit a branch with a register */
1823     }
1824     if (ARM_ARCH_AT_LEAST(5)) {
1825         // Load the target address into IP and branch to that. We've already
1826         // done underrunProtect, so we can skip that here.
1827         BLX(IP, false);
1828     } else {
1829         BX(IP);
1830         MOV(LR, PC);
1831     }
1832     // LDR IP, =addr
1833     asm_ld_imm(IP, (int32_t)addr, false);
1834 }
1835
1836 // This is identical to BranchWithLink(NIns*) but emits a branch to an address
1837 // held in a register rather than a literal address.
1838 inline void
1839 Assembler::BLX(Register addr, bool chk /* = true */)
1840 {
1841     // We need to emit an ARMv5+ instruction, so assert that we have a suitable
1842     // processor. Note that we don't support ARMv4(T), but this serves as a
1843     // useful sanity check.
1844     NanoAssert(ARM_ARCH_AT_LEAST(5));
1845
1846     NanoAssert(IsGpReg(addr));
1847 #ifdef UNDER_CE
1848     // There is a bug in the WinCE device emulator which stops "BLX LR" from
1849     // working as expected. Assert that we never do that!
1850     NanoAssert(addr != LR);
1851 #endif
1852
1853     if (chk) {
1854         underrunProtect(4);
1855     }
1856
1857     // BLX reg
1858     *(--_nIns) = (NIns)( (COND_AL) | (0x12<<20) | (0xFFF<<8) | (0x3<<4) | (addr) );
1859     asm_output("blx %s", gpn(addr));
1860 }
1861
1862 // Emit the code required to load a memory address into a register as follows:
1863 // d = *(b+off)
1864 // underrunProtect calls from this function can be disabled by setting chk to
1865 // false. However, this function can use more than LD32_size bytes of space if
1866 // the offset is out of the range of a LDR instruction; the maximum space this
1867 // function requires for underrunProtect is 4+LD32_size.
1868 void
1869 Assembler::asm_ldr_chk(Register d, Register b, int32_t off, bool chk)
1870 {
1871     if (ARM_VFP && IsFpReg(d)) {
1872         FLDD_chk(d,b,off,chk);
1873         return;
1874     }
1875
1876     NanoAssert(IsGpReg(d));
1877     NanoAssert(IsGpReg(b));
1878
1879     // We can't use underrunProtect if the base register is the PC because
1880     // underrunProtect might move the PC if there isn't enough space on the
1881     // current page.
1882     NanoAssert((b != PC) || (!chk));
1883
1884     if (isU12(off)) {
1885         // LDR d, b, #+off
1886         if (chk) underrunProtect(4);
1887         *(--_nIns) = (NIns)( COND_AL | (0x59<<20) | (b<<16) | (d<<12) | off );
1888     } else if (isU12(-off)) {
1889         // LDR d, b, #-off
1890         if (chk) underrunProtect(4);
1891         *(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (b<<16) | (d<<12) | -off );
1892     } else {
1893         // The offset is over 4096 (and outside the range of LDR), so we need
1894         // to add a level of indirection to get the address into IP.
1895
1896         // Because of that, we can't do a PC-relative load unless it fits within
1897         // the single-instruction forms above.
1898
1899         NanoAssert(b != PC);
1900         NanoAssert(b != IP);
1901
1902         if (chk) underrunProtect(4+LD32_size);
1903
1904         *(--_nIns) = (NIns)( COND_AL | (0x79<<20) | (b<<16) | (d<<12) | IP );
1905         asm_ld_imm(IP, off, false);
1906     }
1907
1908     asm_output("ldr %s, [%s, #%d]",gpn(d),gpn(b),(off));
1909 }
1910
1911 // Emit a store, using a register base and an arbitrary immediate offset. This
1912 // behaves like a STR instruction, but doesn't care about the offset range, and
1913 // emits one of the following instruction sequences:
1914 //
1915 // ----
1916 // STR  rt, [rr, #offset]
1917 // ----
1918 // asm_add_imm  ip, rr, #(offset & ~0xfff)
1919 // STR  rt, [ip, #(offset & 0xfff)]
1920 // ----
1921 // # This one's fairly horrible, but should be rare.
1922 // asm_add_imm  rr, rr, #(offset & ~0xfff)
1923 // STR  rt, [ip, #(offset & 0xfff)]
1924 // asm_sub_imm  rr, rr, #(offset & ~0xfff)
1925 // ----
1926 // SUB-based variants (for negative offsets) are also supported.
1927 // ----
1928 //
1929 // The return value is 1 if a simple STR could be emitted, or 0 if the required
1930 // sequence was more complex.
1931 int32_t
1932 Assembler::asm_str(Register rt, Register rr, int32_t offset)
1933 {
1934     // We can't do PC-relative stores, and we can't store the PC value, because
1935     // we use macros (such as STR) which call underrunProtect, and this can
1936     // push _nIns to a new page, thus making any PC value impractical to
1937     // predict.
1938     NanoAssert(rr != PC);
1939     NanoAssert(rt != PC);
1940     if (offset >= 0) {
1941         // The offset is positive, so use ADD (and variants).
1942         if (isU12(offset)) {
1943             STR(rt, rr, offset);
1944             return 1;
1945         }
1946
1947         if (rt != IP) {
1948             STR(rt, IP, offset & 0xfff);
1949             asm_add_imm(IP, rr, offset & ~0xfff);
1950         } else {
1951             int32_t adj = offset & ~0xfff;
1952             asm_sub_imm(rr, rr, adj);
1953             STR(rt, rr, offset-adj);
1954             asm_add_imm(rr, rr, adj);
1955         }
1956     } else {
1957         // The offset is negative, so use SUB (and variants).
1958         if (isU12(-offset)) {
1959             STR(rt, rr, offset);
1960             return 1;
1961         }
1962
1963         if (rt != IP) {
1964             STR(rt, IP, -((-offset) & 0xfff));
1965             asm_sub_imm(IP, rr, (-offset) & ~0xfff);
1966         } else {
1967             int32_t adj = ((-offset) & ~0xfff);
1968             asm_add_imm(rr, rr, adj);
1969             STR(rt, rr, offset+adj);
1970             asm_sub_imm(rr, rr, adj);
1971         }
1972     }
1973
1974     return 0;
1975 }
1976
1977 // Emit the code required to load an immediate value (imm) into general-purpose
1978 // register d. Optimal (MOV-based) mechanisms are used if the immediate can be
1979 // encoded using ARM's operand 2 encoding. Otherwise, a slot is used on the
1980 // literal pool and LDR is used to load the value.
1981 //
1982 // chk can be explicitly set to false in order to disable underrunProtect calls
1983 // from this function; this allows the caller to perform the check manually.
1984 // This function guarantees not to use more than LD32_size bytes of space.
1985 void
1986 Assembler::asm_ld_imm(Register d, int32_t imm, bool chk /* = true */)
1987 {
1988     uint32_t    op2imm;
1989
1990     NanoAssert(IsGpReg(d));
1991
1992     // Attempt to encode the immediate using the second operand of MOV or MVN.
1993     // This is the simplest solution and generates the shortest and fastest
1994     // code, but can only encode a limited set of values.
1995
1996     if (encOp2Imm(imm, &op2imm)) {
1997         // Use MOV to encode the literal.
1998         MOVis(d, op2imm, 0);
1999         return;
2000     }
2001
2002     if (encOp2Imm(~imm, &op2imm)) {
2003         // Use MVN to encode the inverted literal.
2004         MVNis(d, op2imm, 0);
2005         return;
2006     }
2007
2008     // Try to use simple MOV, MVN or MOV(W|T) instructions to load the
2009     // immediate. If this isn't possible, load it from memory.
2010     //  - We cannot use MOV(W|T) on cores older than the introduction of
2011     //    Thumb-2 or if the target register is the PC.
2012     //
2013     // (Note that we use Thumb-2 if arm_arch is ARMv7 or later; the only earlier
2014     // ARM core that provided Thumb-2 is ARMv6T2/ARM1156, which is a real-time
2015     // core that nanojit is unlikely to ever target.)
2016     if (ARM_ARCH_AT_LEAST(7) && (d != PC)) {
2017         // ARMv6T2 and above have MOVW and MOVT.
2018         uint32_t    high_h = (uint32_t)imm >> 16;
2019         uint32_t    low_h = imm & 0xffff;
2020
2021         if (high_h != 0) {
2022             // Load the high half-word (if necessary).
2023             MOVTi_chk(d, high_h, chk);
2024         }
2025         // Load the low half-word. This also zeroes the high half-word, and
2026         // thus must execute _before_ MOVT, and is necessary even if low_h is 0
2027         // because MOVT will not change the existing low half-word.
2028         MOVWi_chk(d, low_h, chk);
2029
2030         return;
2031     }
2032
2033     // We couldn't encode the literal in the instruction stream, so load it
2034     // from memory.
2035
2036     // Because the literal pool is on the same page as the generated code, it
2037     // will almost always be within the ±4096 range of a LDR. However, this may
2038     // not be the case if _nSlot is at the start of the page and _nIns is at
2039     // the end because the PC is 8 bytes ahead of _nIns. This is unlikely to
2040     // happen, but if it does occur we can simply waste a word or two of
2041     // literal space.
2042
2043     // We must do the underrunProtect before PC_OFFSET_FROM as underrunProtect
2044     // can move the PC if there isn't enough space on the current page!
2045     if (chk) {
2046         underrunProtect(LD32_size);
2047     }
2048
2049     int offset = PC_OFFSET_FROM(_nSlot, _nIns-1);
2050     // If the offset is out of range, waste literal space until it is in range.
2051     while (offset <= -4096) {
2052         ++_nSlot;
2053         offset += sizeof(_nSlot);
2054     }
2055     NanoAssert((isU12(-offset) || isU12(offset)) && (offset <= -8));
2056
2057     // Write the literal.
2058     *(_nSlot++) = imm;
2059     asm_output("## imm= 0x%x", imm);
2060
2061     // Load the literal.
2062     LDR_nochk(d,PC,offset);
2063     NanoAssert(uintptr_t(_nIns) + 8 + offset == uintptr_t(_nSlot-1));
2064     NanoAssert(*((int32_t*)_nSlot-1) == imm);
2065 }
2066
2067 // Branch to target address _t with condition _c, doing underrun
2068 // checks (_chk == 1) or skipping them (_chk == 0).
2069 //
2070 // Set the target address (_t) to 0 if the target is not yet known and the
2071 // branch will be patched up later.
2072 //
2073 // If the jump is to a known address (with _t != 0) and it fits in a relative
2074 // jump (±32MB), emit that.
2075 // If the jump is unconditional, emit the dest address inline in
2076 // the instruction stream and load it into pc.
2077 // If the jump has a condition, but noone's mucked with _nIns and our _nSlot
2078 // pointer is valid, stick the constant in the slot and emit a conditional
2079 // load into pc.
2080 // Otherwise, emit the conditional load into pc from a nearby constant,
2081 // and emit a jump to jump over it it in case the condition fails.
2082 //
2083 // NB: B_nochk depends on this not calling samepage() when _c == AL
2084 void
2085 Assembler::B_cond_chk(ConditionCode _c, NIns* _t, bool _chk)
2086 {
2087     int32_t offs = PC_OFFSET_FROM(_t,_nIns-1);
2088     //nj_dprintf("B_cond_chk target: 0x%08x offset: %d @0x%08x\n", _t, offs, _nIns-1);
2089
2090     // optimistically check if this will fit in 24 bits
2091     if (_chk && isS24(offs>>2) && (_t != 0)) {
2092         underrunProtect(4);
2093         // recalculate the offset, because underrunProtect may have
2094         // moved _nIns to a new page
2095         offs = PC_OFFSET_FROM(_t,_nIns-1);
2096     }
2097
2098     // Emit one of the following patterns:
2099     //
2100     //  --- Short branch. This can never be emitted if the branch target is not
2101     //      known.
2102     //          B(cc)   ±32MB
2103     //
2104     //  --- Long unconditional branch.
2105     //          LDR     PC, #lit
2106     //  lit:    #target
2107     //
2108     //  --- Long conditional branch. Note that conditional branches will never
2109     //      be patched, so the nPatchBranch function doesn't need to know where
2110     //      the literal pool is located.
2111     //          LDRcc   PC, #lit
2112     //          ; #lit is in the literal pool at _nSlot
2113     //
2114     //  --- Long conditional branch (if the slot isn't on the same page as the instruction).
2115     //          LDRcc   PC, #lit
2116     //          B       skip        ; Jump over the literal data.
2117     //  lit:    #target
2118     //  skip:   [...]
2119
2120     if (isS24(offs>>2) && (_t != 0)) {
2121         // The underrunProtect for this was done above (if required by _chk).
2122         *(--_nIns) = (NIns)( ((_c)<<28) | (0xA<<24) | (((offs)>>2) & 0xFFFFFF) );
2123         asm_output("b%s %p", _c == AL ? "" : condNames[_c], (void*)(_t));
2124     } else if (_c == AL) {
2125         if(_chk) underrunProtect(8);
2126         *(--_nIns) = (NIns)(_t);
2127         *(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | (PC<<12) | 0x4 );
2128         asm_output("b%s %p", _c == AL ? "" : condNames[_c], (void*)(_t));
2129     } else if (PC_OFFSET_FROM(_nSlot, _nIns-1) > -0x1000) {
2130         if(_chk) underrunProtect(8);
2131         *(_nSlot++) = (NIns)(_t);
2132         offs = PC_OFFSET_FROM(_nSlot-1,_nIns-1);
2133         NanoAssert(offs < 0);
2134         *(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | ((-offs) & 0xFFF) );
2135         asm_output("ldr%s %s, [%s, #-%d]", condNames[_c], gpn(PC), gpn(PC), -offs);
2136         NanoAssert(uintptr_t(_nIns)+8+offs == uintptr_t(_nSlot-1));
2137     } else {
2138         if(_chk) underrunProtect(12);
2139         // Emit a pointer to the target as a literal in the instruction stream.
2140         *(--_nIns) = (NIns)(_t);
2141         // Emit a branch to skip over the literal. The PC value is 8 bytes
2142         // ahead of the executing instruction, so to branch two instructions
2143         // forward this must branch 8-8=0 bytes.
2144         *(--_nIns) = (NIns)( COND_AL | (0xA<<24) | 0x0 );
2145         // Emit the conditional branch.
2146         *(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | 0x0 );
2147         asm_output("b%s %p", _c == AL ? "" : condNames[_c], (void*)(_t));
2148     }
2149 }
2150
2151 /*
2152  * VFP
2153  */
2154
2155 void
2156 Assembler::asm_i2d(LIns* ins)
2157 {
2158     Register dd = prepareResultReg(ins, FpRegs & ~rmask(D0));
2159     Register rt = findRegFor(ins->oprnd1(), GpRegs);
2160
2161     evictIfActive(D0);
2162     FSITOD(dd, S0);
2163     FMSR(S0, rt);
2164
2165     freeResourcesOf(ins);
2166 }
2167
2168 void
2169 Assembler::asm_ui2d(LIns* ins)
2170 {
2171     Register dd = prepareResultReg(ins, FpRegs & ~rmask(D0));
2172     Register rt = findRegFor(ins->oprnd1(), GpRegs);
2173
2174     evictIfActive(D0);
2175     FUITOD(dd, S0);
2176     FMSR(S0, rt);
2177
2178     freeResourcesOf(ins);
2179 }
2180
2181 void Assembler::asm_d2i(LIns* ins)
2182 {
2183     evictIfActive(D0);
2184     if (ins->isInReg()) {
2185         Register rt = ins->getReg();
2186         FMRS(rt, S0);
2187     } else {
2188         // There's no active result register, so store the result directly into
2189         // memory to avoid the FP->GP transfer cost on Cortex-A8.
2190         int32_t d = arDisp(ins);
2191         // VFP can only do stores with a range of ±1020, so we might need to do
2192         // some arithmetic to extend its range.
2193         if (isU8(d/4) || isU8(-d/4)) {
2194             FSTS(S0, FP, d);
2195         } else {
2196             FSTS(S0, IP, d%1024);
2197             asm_add_imm(IP, FP, d-(d%1024));
2198         }
2199     }
2200
2201     Register dm = findRegFor(ins->oprnd1(), FpRegs & ~rmask(D0));
2202
2203     FTOSID(S0, dm);
2204
2205     freeResourcesOf(ins);
2206 }
2207
2208 void
2209 Assembler::asm_fneg(LIns* ins)
2210 {
2211     LIns* lhs = ins->oprnd1();
2212
2213     Register dd = prepareResultReg(ins, FpRegs);
2214     // If the argument doesn't have a register assigned, re-use dd.
2215     Register dm = lhs->isInReg() ? lhs->getReg() : dd;
2216
2217     FNEGD(dd, dm);
2218
2219     freeResourcesOf(ins);
2220     if (dd == dm) {
2221         NanoAssert(!lhs->isInReg());
2222         findSpecificRegForUnallocated(lhs, dd);
2223     }
2224 }
2225
2226 void
2227 Assembler::asm_fop(LIns* ins)
2228 {
2229     LIns*   lhs = ins->oprnd1();
2230     LIns*   rhs = ins->oprnd2();
2231
2232     Register    dd = prepareResultReg(ins, FpRegs);
2233     // Try to re-use the result register for one of the arguments.
2234     Register    dn = lhs->isInReg() ? lhs->getReg() : dd;
2235     Register    dm = rhs->isInReg() ? rhs->getReg() : dd;
2236     if ((dn == dm) && (lhs != rhs)) {
2237         // We can't re-use the result register for both arguments, so force one
2238         // into its own register.
2239         dm = findRegFor(rhs, FpRegs & ~rmask(dd));
2240         NanoAssert(rhs->isInReg());
2241     }
2242
2243     // TODO: Special cases for simple constants.
2244
2245     switch(ins->opcode()) {
2246         case LIR_addd:      FADDD(dd,dn,dm);        break;
2247         case LIR_subd:      FSUBD(dd,dn,dm);        break;
2248         case LIR_muld:      FMULD(dd,dn,dm);        break;
2249         case LIR_divd:      FDIVD(dd,dn,dm);        break;
2250         default:            NanoAssert(0);          break;
2251     }
2252
2253     freeResourcesOf(ins);
2254
2255     // If we re-used the result register, mark it as active.
2256     if (dn == dd) {
2257         NanoAssert(!lhs->isInReg());
2258         findSpecificRegForUnallocated(lhs, dd);
2259     } else if (dm == dd) {
2260         NanoAssert(!rhs->isInReg());
2261         findSpecificRegForUnallocated(rhs, dd);
2262     } else {
2263         NanoAssert(lhs->isInReg());
2264         NanoAssert(rhs->isInReg());
2265     }
2266 }
2267
2268 void
2269 Assembler::asm_cmpd(LIns* ins)
2270 {
2271     LIns* lhs = ins->oprnd1();
2272     LIns* rhs = ins->oprnd2();
2273     LOpcode op = ins->opcode();
2274
2275     NanoAssert(ARM_VFP);
2276     NanoAssert(isCmpDOpcode(op));
2277     NanoAssert(lhs->isD() && rhs->isD());
2278
2279     Register ra, rb;
2280     findRegFor2(FpRegs, lhs, ra, FpRegs, rhs, rb);
2281
2282     int e_bit = (op != LIR_eqd);
2283
2284     // Do the comparison and get results loaded in ARM status register.
2285     // TODO: For asm_condd, we should put the results directly into an ARM
2286     // machine register, then use bit operations to get the result.
2287     FMSTAT();
2288     FCMPD(ra, rb, e_bit);
2289 }
2290
2291 /* Call this with targ set to 0 if the target is not yet known and the branch
2292  * will be patched up later.
2293  */
2294 NIns*
2295 Assembler::asm_branch(bool branchOnFalse, LIns* cond, NIns* targ)
2296 {
2297     LOpcode condop = cond->opcode();
2298     NanoAssert(cond->isCmp());
2299     NanoAssert(ARM_VFP || !isCmpDOpcode(condop));
2300
2301     // The old "never" condition code has special meaning on newer ARM cores,
2302     // so use "always" as a sensible default code.
2303     ConditionCode cc = AL;
2304
2305     // Detect whether or not this is a floating-point comparison.
2306     bool    fp_cond;
2307
2308     // Select the appropriate ARM condition code to match the LIR instruction.
2309     switch (condop)
2310     {
2311         // Floating-point conditions. Note that the VFP LT/LE conditions
2312         // require use of the unsigned condition codes, even though
2313         // float-point comparisons are always signed.
2314         case LIR_eqd:   cc = EQ;    fp_cond = true;     break;
2315         case LIR_ltd:   cc = LO;    fp_cond = true;     break;
2316         case LIR_led:   cc = LS;    fp_cond = true;     break;
2317         case LIR_ged:   cc = GE;    fp_cond = true;     break;
2318         case LIR_gtd:   cc = GT;    fp_cond = true;     break;
2319
2320         // Standard signed and unsigned integer comparisons.
2321         case LIR_eqi:   cc = EQ;    fp_cond = false;    break;
2322         case LIR_lti:   cc = LT;    fp_cond = false;    break;
2323         case LIR_lei:   cc = LE;    fp_cond = false;    break;
2324         case LIR_gti:   cc = GT;    fp_cond = false;    break;
2325         case LIR_gei:   cc = GE;    fp_cond = false;    break;
2326         case LIR_ltui:  cc = LO;    fp_cond = false;    break;
2327         case LIR_leui:  cc = LS;    fp_cond = false;    break;
2328         case LIR_gtui:  cc = HI;    fp_cond = false;    break;
2329         case LIR_geui:  cc = HS;    fp_cond = false;    break;
2330
2331         // Default case for invalid or unexpected LIR instructions.
2332         default:        cc = AL;    fp_cond = false;    break;
2333     }
2334
2335     // Invert the condition if required.
2336     if (branchOnFalse)
2337         cc = OppositeCond(cc);
2338
2339     // Ensure that we got a sensible condition code.
2340     NanoAssert((cc != AL) && (cc != NV));
2341
2342     // Ensure that we don't hit floating-point LIR codes if VFP is disabled.
2343     NanoAssert(ARM_VFP || !fp_cond);
2344
2345     // Emit a suitable branch instruction.
2346     B_cond(cc, targ);
2347
2348     // Store the address of the branch instruction so that we can return it.
2349     // asm_[f]cmp will move _nIns so we must do this now.
2350     NIns *at = _nIns;
2351
2352     asm_cmp(cond);
2353
2354     return at;
2355 }
2356
2357 NIns* Assembler::asm_branch_ov(LOpcode op, NIns* target)
2358 {
2359     // Because MUL can't set the V flag, we use SMULL and CMP to set the Z flag
2360     // to detect overflow on multiply. Thus, if we have a LIR_mulxovi, we must
2361     // be conditional on !Z, not V.
2362     ConditionCode cc = ( (op == LIR_mulxovi) || (op == LIR_muljovi) ? NE : VS );
2363
2364     // Emit a suitable branch instruction.
2365     B_cond(cc, target);
2366     return _nIns;
2367 }
2368
2369 void
2370 Assembler::asm_cmp(LIns *cond)
2371 {
2372     LIns* lhs = cond->oprnd1();
2373     LIns* rhs = cond->oprnd2();
2374
2375     // Forward floating-point comparisons directly to asm_cmpd to simplify
2376     // logic in other methods which need to issue an implicit comparison, but
2377     // don't care about the details of comparison itself.
2378     if (lhs->isD()) {
2379         NanoAssert(rhs->isD());
2380         asm_cmpd(cond);
2381         return;
2382     }
2383
2384     NanoAssert(lhs->isI() && rhs->isI());
2385
2386     // ready to issue the compare
2387     if (rhs->isImmI()) {
2388         int c = rhs->immI();
2389         Register r = findRegFor(lhs, GpRegs);
2390         asm_cmpi(r, c);
2391     } else {
2392         Register ra, rb;
2393         findRegFor2(GpRegs, lhs, ra, GpRegs, rhs, rb);
2394         CMP(ra, rb);
2395     }
2396 }
2397
2398 void
2399 Assembler::asm_cmpi(Register r, int32_t imm)
2400 {
2401     if (imm < 0) {
2402         if (imm > -256) {
2403             ALUi(AL, cmn, 1, 0, r, -imm);
2404         } else {
2405             underrunProtect(4 + LD32_size);
2406             CMP(r, IP);
2407             asm_ld_imm(IP, imm);
2408         }
2409     } else {
2410         if (imm < 256) {
2411             ALUi(AL, cmp, 1, 0, r, imm);
2412         } else {
2413             underrunProtect(4 + LD32_size);
2414             CMP(r, IP);
2415             asm_ld_imm(IP, imm);
2416         }
2417     }
2418 }
2419
2420 void
2421 Assembler::asm_condd(LIns* ins)
2422 {
2423     Register rd = prepareResultReg(ins, GpRegs);
2424
2425     // TODO: Modify cmpd to allow the FP flags to move directly to an ARM
2426     // machine register, then use simple bit operations here rather than
2427     // conditional moves.
2428
2429     switch (ins->opcode()) {
2430         case LIR_eqd:   SETEQ(rd);      break;
2431         case LIR_ltd:   SETLO(rd);      break; // Note: VFP LT/LE operations require
2432         case LIR_led:   SETLS(rd);      break; // unsigned LO/LS condition codes!
2433         case LIR_ged:   SETGE(rd);      break;
2434         case LIR_gtd:   SETGT(rd);      break;
2435         default:        NanoAssert(0);  break;
2436     }
2437
2438     freeResourcesOf(ins);
2439
2440     asm_cmpd(ins);
2441 }
2442
2443 void
2444 Assembler::asm_cond(LIns* ins)
2445 {
2446     Register rd = prepareResultReg(ins, GpRegs);
2447     LOpcode op = ins->opcode();
2448
2449     switch(op)
2450     {
2451         case LIR_eqi:   SETEQ(rd);      break;
2452         case LIR_lti:   SETLT(rd);      break;
2453         case LIR_lei:   SETLE(rd);      break;
2454         case LIR_gti:   SETGT(rd);      break;
2455         case LIR_gei:   SETGE(rd);      break;
2456         case LIR_ltui:  SETLO(rd);      break;
2457         case LIR_leui:  SETLS(rd);      break;
2458         case LIR_gtui:  SETHI(rd);      break;
2459         case LIR_geui:  SETHS(rd);      break;
2460         default:        NanoAssert(0);  break;
2461     }
2462
2463     freeResourcesOf(ins);
2464
2465     asm_cmp(ins);
2466 }
2467
2468 void
2469 Assembler::asm_arith(LIns* ins)
2470 {
2471     LOpcode     op = ins->opcode();
2472     LIns*       lhs = ins->oprnd1();
2473     LIns*       rhs = ins->oprnd2();
2474
2475     // We always need the result register and the first operand register, so
2476     // find them up-front. (If the second operand is constant it is encoded
2477     // differently.)
2478     Register    rd = prepareResultReg(ins, GpRegs);
2479
2480     // Try to re-use the result register for operand 1.
2481     Register    rn = lhs->isInReg() ? lhs->getReg() : rd;
2482
2483     // If the rhs is constant, we can use the instruction-specific code to
2484     // determine if the value can be encoded in an ARM instruction. If the
2485     // value cannot be encoded, it will be loaded into a register.
2486     //
2487     // Note that the MUL instruction can never take an immediate argument so
2488     // even if the argument is constant, we must allocate a register for it.
2489     if (rhs->isImmI() && (op != LIR_muli) && (op != LIR_mulxovi) && (op != LIR_muljovi))
2490     {
2491         int32_t immI = rhs->immI();
2492
2493         switch (op)
2494         {
2495             case LIR_addi:       asm_add_imm(rd, rn, immI);     break;
2496             case LIR_addjovi:
2497             case LIR_addxovi:    asm_add_imm(rd, rn, immI, 1);  break;
2498             case LIR_subi:       asm_sub_imm(rd, rn, immI);     break;
2499             case LIR_subjovi:
2500             case LIR_subxovi:    asm_sub_imm(rd, rn, immI, 1);  break;
2501             case LIR_andi:       asm_and_imm(rd, rn, immI);     break;
2502             case LIR_ori:        asm_orr_imm(rd, rn, immI);     break;
2503             case LIR_xori:       asm_eor_imm(rd, rn, immI);     break;
2504             case LIR_lshi:       LSLi(rd, rn, immI);            break;
2505             case LIR_rshi:       ASRi(rd, rn, immI);            break;
2506             case LIR_rshui:      LSRi(rd, rn, immI);            break;
2507
2508             default:
2509                 NanoAssertMsg(0, "Unsupported");
2510                 break;
2511         }
2512
2513         freeResourcesOf(ins);
2514         if (rd == rn) {
2515             // Mark the re-used register as active.
2516             NanoAssert(!lhs->isInReg());
2517             findSpecificRegForUnallocated(lhs, rd);
2518         }
2519         return;
2520     }
2521
2522     // The rhs is either already in a register or cannot be encoded as an
2523     // Operand 2 constant for this operation.
2524
2525     Register    rm = rhs->isInReg() ? rhs->getReg() : rd;
2526     if ((rm == rn) && (lhs != rhs)) {
2527         // We can't re-use the result register for both arguments, so force one
2528         // into its own register. We favour re-use for operand 2 (rm) here as
2529         // it is more likely to take a fast path for LIR_mul on ARMv5.
2530         rn = findRegFor(lhs, GpRegs & ~rmask(rd));
2531         NanoAssert(lhs->isInReg());
2532     }
2533
2534     switch (op)
2535     {
2536         case LIR_addi:       ADDs(rd, rn, rm, 0);    break;
2537         case LIR_addjovi:
2538         case LIR_addxovi:    ADDs(rd, rn, rm, 1);    break;
2539         case LIR_subi:       SUBs(rd, rn, rm, 0);    break;
2540         case LIR_subjovi:
2541         case LIR_subxovi:    SUBs(rd, rn, rm, 1);    break;
2542         case LIR_andi:       ANDs(rd, rn, rm, 0);    break;
2543         case LIR_ori:        ORRs(rd, rn, rm, 0);    break;
2544         case LIR_xori:       EORs(rd, rn, rm, 0);    break;
2545
2546         case LIR_muli:
2547             if (!ARM_ARCH_AT_LEAST(6) && (rd == rn)) {
2548                 // ARMv4 and ARMv5 cannot handle a MUL where rd == rn, so
2549                 // explicitly assign a new register to rn.
2550                 NanoAssert(!lhs->isInReg());
2551                 rn = findRegFor(lhs, GpRegs & ~rmask(rd) & ~rmask(rm));
2552                 if (lhs == rhs) {
2553                     rm = rn;
2554                 }
2555             }
2556             MUL(rd, rn, rm);
2557             break;
2558         case LIR_muljovi:
2559         case LIR_mulxovi:
2560             if (!ARM_ARCH_AT_LEAST(6) && (rd == rn)) {
2561                 // ARMv5 (and earlier) cannot handle a MUL where rd == rn, so
2562                 // if that is the case, explicitly assign a new register to rn.
2563                 NanoAssert(!lhs->isInReg());
2564                 rn = findRegFor(lhs, GpRegs & ~rmask(rd) & ~rmask(rm));
2565                 if (lhs == rhs) {
2566                     rm = rn;
2567                 }
2568             }
2569             // ARM cannot automatically detect overflow from a MUL operation,
2570             // so we have to perform some other arithmetic:
2571             //   SMULL  rr, ip, ra, rb
2572             //   CMP    ip, rr, ASR #31
2573             // An explanation can be found in bug 521161. This sets Z if we did
2574             // _not_ overflow, and clears it if we did.
2575             ALUr_shi(AL, cmp, 1, SBZ, IP, rd, ASR_imm, 31);
2576             SMULL(rd, IP, rn, rm);
2577             break;
2578
2579         // The shift operations need a mask to match the JavaScript
2580         // specification because the ARM architecture allows a greater shift
2581         // range than JavaScript.
2582         case LIR_lshi:
2583             LSL(rd, rn, IP);
2584             ANDi(IP, rm, 0x1f);
2585             break;
2586         case LIR_rshi:
2587             ASR(rd, rn, IP);
2588             ANDi(IP, rm, 0x1f);
2589             break;
2590         case LIR_rshui:
2591             LSR(rd, rn, IP);
2592             ANDi(IP, rm, 0x1f);
2593             break;
2594         default:
2595             NanoAssertMsg(0, "Unsupported");
2596             break;
2597     }
2598
2599     freeResourcesOf(ins);
2600     // If we re-used the result register, mark it as active.
2601     if (rn == rd) {
2602         NanoAssert(!lhs->isInReg());
2603         findSpecificRegForUnallocated(lhs, rd);
2604     } else if (rm == rd) {
2605         NanoAssert(!rhs->isInReg());
2606         findSpecificRegForUnallocated(rhs, rd);
2607     } else {
2608         NanoAssert(lhs->isInReg());
2609         NanoAssert(rhs->isInReg());
2610     }
2611 }
2612
2613 void
2614 Assembler::asm_neg_not(LIns* ins)
2615 {
2616     LIns* lhs = ins->oprnd1();
2617     Register rr = prepareResultReg(ins, GpRegs);
2618
2619     // If 'lhs' isn't in a register, we can give it the result register.
2620     Register ra = lhs->isInReg() ? lhs->getReg() : rr;
2621
2622     if (ins->isop(LIR_noti)) {
2623         MVN(rr, ra);
2624     } else {
2625         NanoAssert(ins->isop(LIR_negi));
2626         RSBS(rr, ra);
2627     }
2628
2629     freeResourcesOf(ins);
2630     if (!lhs->isInReg()) {
2631         NanoAssert(ra == rr);
2632         // Update the register state to indicate that we've claimed ra for lhs.
2633         findSpecificRegForUnallocated(lhs, ra);
2634     }
2635 }
2636
2637 void
2638 Assembler::asm_load32(LIns* ins)
2639 {
2640     LOpcode op = ins->opcode();
2641     LIns*   base = ins->oprnd1();
2642     int     d = ins->disp();
2643
2644     Register rt = prepareResultReg(ins, GpRegs);
2645     // Try to re-use the result register for the base pointer.
2646     Register rn = base->isInReg() ? base->getReg() : rt;
2647
2648     // TODO: The x86 back-end has a special case where the base address is
2649     // given by LIR_addp. The same technique may be useful here to take
2650     // advantage of ARM's register+register addressing mode.
2651
2652     switch (op) {
2653         case LIR_lduc2ui:
2654             if (isU12(-d) || isU12(d)) {
2655                 LDRB(rt, rn, d);
2656             } else {
2657                 LDRB(rt, IP, d%4096);
2658                 asm_add_imm(IP, rn, d-(d%4096));
2659             }
2660             break;
2661         case LIR_ldus2ui:
2662             // Some ARM machines require 2-byte alignment here.
2663             // Similar to the lduc2ui case, but the max offset is smaller.
2664             if (isU8(-d) || isU8(d)) {
2665                 LDRH(rt, rn, d);
2666             } else {
2667                 LDRH(rt, IP, d%256);
2668                 asm_add_imm(IP, rn, d-(d%256));
2669             }
2670             break;
2671         case LIR_ldi:
2672             // Some ARM machines require 4-byte alignment here.
2673             if (isU12(-d) || isU12(d)) {
2674                 LDR(rt, rn, d);
2675             } else {
2676                 LDR(rt, IP, d%4096);
2677                 asm_add_imm(IP, rn, d-(d%4096));
2678             }
2679             break;
2680         case LIR_ldc2i:
2681             // Like LIR_lduc2ui, but sign-extend.
2682             // Some ARM machines require 2-byte alignment here.
2683             if (isU8(-d) || isU8(d)) {
2684                 LDRSB(rt, rn, d);
2685             } else {
2686                 LDRSB(rn, IP, d%256);
2687                 asm_add_imm(IP, rn, d-(d%256));
2688             }
2689             break;
2690         case LIR_lds2i:
2691             // Like LIR_ldus2ui, but sign-extend.
2692             if (isU8(-d) || isU8(d)) {
2693                 LDRSH(rt, rn, d);
2694             } else {
2695                 LDRSH(rt, IP, d%256);
2696                 asm_add_imm(IP, rn, d-(d%256));
2697             }
2698             break;
2699         default:
2700             NanoAssertMsg(0, "asm_load32 should never receive this LIR opcode");
2701             break;
2702     }
2703
2704     freeResourcesOf(ins);
2705
2706     if (rn == rt) {
2707         NanoAssert(!base->isInReg());
2708         findSpecificRegForUnallocated(base, rn);
2709     }
2710 }
2711
2712 void
2713 Assembler::asm_cmov(LIns* ins)
2714 {
2715     LIns*           condval = ins->oprnd1();
2716     LIns*           iftrue  = ins->oprnd2();
2717     LIns*           iffalse = ins->oprnd3();
2718     RegisterMask    allow = ins->isD() ? FpRegs : GpRegs;
2719     ConditionCode   cc;
2720
2721     NanoAssert(condval->isCmp());
2722     NanoAssert((ins->isop(LIR_cmovi) && iftrue->isI() && iffalse->isI()) ||
2723                (ins->isop(LIR_cmovd) && iftrue->isD() && iffalse->isD()));
2724
2725     Register rd = prepareResultReg(ins, allow);
2726
2727     // Try to re-use the result register for one of the arguments.
2728     Register rt = iftrue->isInReg() ? iftrue->getReg() : rd;
2729     Register rf = iffalse->isInReg() ? iffalse->getReg() : rd;
2730     // Note that iftrue and iffalse may actually be the same, though it
2731     // shouldn't happen with the LIR optimizers turned on.
2732     if ((rt == rf) && (iftrue != iffalse)) {
2733         // We can't re-use the result register for both arguments, so force one
2734         // into its own register.
2735         rf = findRegFor(iffalse, allow & ~rmask(rd));
2736         NanoAssert(iffalse->isInReg());
2737     }
2738
2739     switch(condval->opcode()) {
2740         default:        NanoAssert(0);
2741         // Integer comparisons.
2742         case LIR_eqi:   cc = EQ;        break;
2743         case LIR_lti:   cc = LT;        break;
2744         case LIR_lei:   cc = LE;        break;
2745         case LIR_gti:   cc = GT;        break;
2746         case LIR_gei:   cc = GE;        break;
2747         case LIR_ltui:  cc = LO;        break;
2748         case LIR_leui:  cc = LS;        break;
2749         case LIR_gtui:  cc = HI;        break;
2750         case LIR_geui:  cc = HS;        break;
2751         // VFP comparisons.
2752         case LIR_eqd:   cc = EQ;        break;
2753         case LIR_ltd:   cc = LO;        break;
2754         case LIR_led:   cc = LS;        break;
2755         case LIR_ged:   cc = GE;        break;
2756         case LIR_gtd:   cc = GT;        break;
2757     }
2758
2759     // Emit something like this:
2760     //      CMP         [...]
2761     //      MOV(CC)     rd, rf
2762     //      MOV(!CC)    rd, rt
2763     // If the destination was re-used for an input, the corresponding MOV will
2764     // be omitted as it will be redundant.
2765     if (ins->isI()) {
2766         if (rd != rf) {
2767             MOV_cond(OppositeCond(cc), rd, rf);
2768         }
2769         if (rd != rt) {
2770             MOV_cond(cc, rd, rt);
2771         }
2772     } else if (ins->isD()) {
2773         // The VFP sequence is similar to the integer sequence, but uses a
2774         // VFP instruction in place of MOV.
2775         NanoAssert(ARM_VFP);
2776         if (rd != rf) {
2777             FCPYD_cond(OppositeCond(cc), rd, rf);
2778         }
2779         if (rd != rt) {
2780             FCPYD_cond(cc, rd, rt);
2781         }
2782     } else {
2783         NanoAssert(0);
2784     }
2785
2786     freeResourcesOf(ins);
2787
2788     // If we re-used the result register, mark it as active for either iftrue
2789     // or iffalse (or both in the corner-case where they're the same).
2790     if (rt == rd) {
2791         NanoAssert(!iftrue->isInReg());
2792         findSpecificRegForUnallocated(iftrue, rd);
2793     } else if (rf == rd) {
2794         NanoAssert(!iffalse->isInReg());
2795         findSpecificRegForUnallocated(iffalse, rd);
2796     } else {
2797         NanoAssert(iffalse->isInReg());
2798         NanoAssert(iftrue->isInReg());
2799     }
2800
2801     asm_cmp(condval);
2802 }
2803
2804 void
2805 Assembler::asm_qhi(LIns* ins)
2806 {
2807     Register rd = prepareResultReg(ins, GpRegs);
2808     LIns *lhs = ins->oprnd1();
2809     int d = findMemFor(lhs);
2810
2811     LDR(rd, FP, d+4);
2812
2813     freeResourcesOf(ins);
2814 }
2815
2816 void
2817 Assembler::asm_qlo(LIns* ins)
2818 {
2819     Register rd = prepareResultReg(ins, GpRegs);
2820     LIns *lhs = ins->oprnd1();
2821     int d = findMemFor(lhs);
2822
2823     LDR(rd, FP, d);
2824
2825     freeResourcesOf(ins);
2826 }
2827
2828 void
2829 Assembler::asm_param(LIns* ins)
2830 {
2831     uint32_t a = ins->paramArg();
2832     uint32_t kind = ins->paramKind();
2833     if (kind == 0) {
2834         // Ordinary parameter. These are always (32-bit-)word-sized, and will
2835         // be in the first four registers (argRegs) and then on the stack.
2836         if (a < 4) {
2837             // Register argument.
2838             prepareResultReg(ins, rmask(argRegs[a]));
2839         } else {
2840             // Stack argument.
2841             Register r = prepareResultReg(ins, GpRegs);
2842             int d = (a - 4) * sizeof(intptr_t) + 8;
2843             LDR(r, FP, d);
2844         }
2845     } else {
2846         // Saved parameter.
2847         NanoAssert(a < (sizeof(savedRegs)/sizeof(savedRegs[0])));
2848         prepareResultReg(ins, rmask(savedRegs[a]));
2849     }
2850     freeResourcesOf(ins);
2851 }
2852
2853 void
2854 Assembler::asm_immi(LIns* ins)
2855 {
2856     Register rd = prepareResultReg(ins, GpRegs);
2857     asm_ld_imm(rd, ins->immI());
2858     freeResourcesOf(ins);
2859 }
2860
2861 void
2862 Assembler::asm_ret(LIns *ins)
2863 {
2864     genEpilogue();
2865
2866     // NB: our contract with genEpilogue is actually that the return value
2867     // we are intending for R0 is currently IP, not R0. This has to do with
2868     // the strange dual-nature of the patchable jump in a side-exit. See
2869     // nPatchBranch.
2870     //
2871     // With hardware floating point ABI we can skip this for retd.
2872     if (!(ARM_EABI_HARD && ins->isop(LIR_retd))) {
2873         MOV(IP, R0);
2874     }
2875
2876     // Pop the stack frame.
2877     MOV(SP,FP);
2878
2879     releaseRegisters();
2880     assignSavedRegs();
2881     LIns *value = ins->oprnd1();
2882     if (ins->isop(LIR_reti)) {
2883         findSpecificRegFor(value, R0);
2884     }
2885     else {
2886         NanoAssert(ins->isop(LIR_retd));
2887         if (ARM_VFP) {
2888 #ifdef NJ_ARM_EABI_HARD_FLOAT
2889             findSpecificRegFor(value, D0);
2890 #else
2891             Register reg = findRegFor(value, FpRegs);
2892             FMRRD(R0, R1, reg);
2893 #endif
2894         } else {
2895             NanoAssert(value->isop(LIR_ii2d));
2896             findSpecificRegFor(value->oprnd1(), R0); // lo
2897             findSpecificRegFor(value->oprnd2(), R1); // hi
2898         }
2899     }
2900 }
2901
2902 void
2903 Assembler::asm_jtbl(LIns* ins, NIns** table)
2904 {
2905     Register indexreg = findRegFor(ins->oprnd1(), GpRegs);
2906     Register tmp = registerAllocTmp(GpRegs & ~rmask(indexreg));
2907     LDR_scaled(PC, tmp, indexreg, 2);      // LDR PC, [tmp + index*4]
2908     asm_ld_imm(tmp, (int32_t)table);       // tmp = #table
2909 }
2910
2911 void Assembler::swapCodeChunks() {
2912     if (!_nExitIns)
2913         codeAlloc(exitStart, exitEnd, _nExitIns verbose_only(, exitBytes), NJ_MAX_CPOOL_OFFSET);
2914     if (!_nExitSlot)
2915         _nExitSlot = exitStart;
2916     SWAP(NIns*, _nIns, _nExitIns);
2917     SWAP(NIns*, _nSlot, _nExitSlot);        // this one is ARM-specific
2918     SWAP(NIns*, codeStart, exitStart);
2919     SWAP(NIns*, codeEnd, exitEnd);
2920     verbose_only( SWAP(size_t, codeBytes, exitBytes); )
2921 }
2922
2923 void Assembler::asm_insert_random_nop() {
2924     NanoAssert(0); // not supported
2925 }
2926
2927 }
2928 #endif /* FEATURE_NANOJIT */