sim/aarch64/simulator.c

   1 /* simulator.c -- Interface for the AArch64 simulator.
   2
   3    Copyright (C) 2015-2017 Free Software Foundation, Inc.
   4
   5    Contributed by Red Hat.
   6
   7    This file is part of GDB.
   8
   9    This program is free software; you can redistribute it and/or modify
  10    it under the terms of the GNU General Public License as published by
  11    the Free Software Foundation; either version 3 of the License, or
  12    (at your option) any later version.
  13
  14    This program is distributed in the hope that it will be useful,
  15    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17    GNU General Public License for more details.
  18
  19    You should have received a copy of the GNU General Public License
  20    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include <stdlib.h>
  24 #include <stdio.h>
  25 #include <string.h>
  26 #include <sys/types.h>
  27 #include <math.h>
  28 #include <time.h>
  29 #include <limits.h>
  30
  31 #include "simulator.h"
  32 #include "cpustate.h"
  33 #include "memory.h"
  34
  35 #define NO_SP 0
  36 #define SP_OK 1
  37
  38 #define TST(_flag)   (aarch64_test_CPSR_bit (cpu, _flag))
  39 #define IS_SET(_X)   (TST (( _X )) ? 1 : 0)
  40 #define IS_CLEAR(_X) (TST (( _X )) ? 0 : 1)
  41
  42 /* Space saver macro.  */
  43 #define INSTR(HIGH, LOW) uimm (aarch64_get_instr (cpu), (HIGH), (LOW))
  44
  45 #define HALT_UNALLOC                                                    \
  46   do                                                                    \
  47     {                                                                   \
  48       TRACE_DISASM (cpu, aarch64_get_PC (cpu));                         \
  49       TRACE_INSN (cpu,                                                  \
  50                   "Unallocated instruction detected at sim line %d,"    \
  51                   " exe addr %" PRIx64,                                 \
  52                   __LINE__, aarch64_get_PC (cpu));                      \
  53       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
  54                        sim_stopped, SIM_SIGILL);                        \
  55     }                                                                   \
  56   while (0)
  57
  58 #define HALT_NYI                                                        \
  59   do                                                                    \
  60     {                                                                   \
  61       TRACE_DISASM (cpu, aarch64_get_PC (cpu));                         \
  62       TRACE_INSN (cpu,                                                  \
  63                   "Unimplemented instruction detected at sim line %d,"  \
  64                   " exe addr %" PRIx64,                                 \
  65                   __LINE__, aarch64_get_PC (cpu));                      \
  66       if (! TRACE_ANY_P (cpu))                                          \
  67         sim_io_eprintf (CPU_STATE (cpu), "SIM Error: Unimplemented instruction: %#08x\n", \
  68                         aarch64_get_instr (cpu));                       \
  69       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
  70                        sim_stopped, SIM_SIGABRT);                       \
  71     }                                                                   \
  72   while (0)
  73
  74 #define NYI_assert(HI, LO, EXPECTED)                                    \
  75   do                                                                    \
  76     {                                                                   \
  77       if (INSTR ((HI), (LO)) != (EXPECTED))                             \
  78         HALT_NYI;                                                       \
  79     }                                                                   \
  80   while (0)
  81
  82 /* Helper functions used by expandLogicalImmediate.  */
  83
  84 /* for i = 1, ... N result<i-1> = 1 other bits are zero  */
  85 static inline uint64_t
  86 ones (int N)
  87 {
  88   return (N == 64 ? (uint64_t)-1UL : ((1UL << N) - 1));
  89 }
  90
  91 /* result<0> to val<N>  */
  92 static inline uint64_t
  93 pickbit (uint64_t val, int N)
  94 {
  95   return pickbits64 (val, N, N);
  96 }
  97
  98 static uint64_t
  99 expand_logical_immediate (uint32_t S, uint32_t R, uint32_t N)
 100 {
 101   uint64_t mask;
 102   uint64_t imm;
 103   unsigned simd_size;
 104
 105   /* The immediate value is S+1 bits to 1, left rotated by SIMDsize - R
 106      (in other words, right rotated by R), then replicated. */
 107   if (N != 0)
 108     {
 109       simd_size = 64;
 110       mask = 0xffffffffffffffffull;
 111     }
 112   else
 113     {
 114       switch (S)
 115         {
 116         case 0x00 ... 0x1f: /* 0xxxxx */ simd_size = 32;           break;
 117         case 0x20 ... 0x2f: /* 10xxxx */ simd_size = 16; S &= 0xf; break;
 118         case 0x30 ... 0x37: /* 110xxx */ simd_size =  8; S &= 0x7; break;
 119         case 0x38 ... 0x3b: /* 1110xx */ simd_size =  4; S &= 0x3; break;
 120         case 0x3c ... 0x3d: /* 11110x */ simd_size =  2; S &= 0x1; break;
 121         default: return 0;
 122         }
 123       mask = (1ull << simd_size) - 1;
 124       /* Top bits are IGNORED.  */
 125       R &= simd_size - 1;
 126     }
 127
 128   /* NOTE: if S = simd_size - 1 we get 0xf..f which is rejected.  */
 129   if (S == simd_size - 1)
 130     return 0;
 131
 132   /* S+1 consecutive bits to 1.  */
 133   /* NOTE: S can't be 63 due to detection above.  */
 134   imm = (1ull << (S + 1)) - 1;
 135
 136   /* Rotate to the left by simd_size - R.  */
 137   if (R != 0)
 138     imm = ((imm << (simd_size - R)) & mask) | (imm >> R);
 139
 140   /* Replicate the value according to SIMD size.  */
 141   switch (simd_size)
 142     {
 143     case  2: imm = (imm <<  2) | imm;
 144     case  4: imm = (imm <<  4) | imm;
 145     case  8: imm = (imm <<  8) | imm;
 146     case 16: imm = (imm << 16) | imm;
 147     case 32: imm = (imm << 32) | imm;
 148     case 64: break;
 149     default: return 0;
 150     }
 151
 152   return imm;
 153 }
 154
 155 /* Instr[22,10] encodes N immr and imms. we want a lookup table
 156    for each possible combination i.e. 13 bits worth of int entries.  */
 157 #define  LI_TABLE_SIZE  (1 << 13)
 158 static uint64_t LITable[LI_TABLE_SIZE];
 159
 160 void
 161 aarch64_init_LIT_table (void)
 162 {
 163   unsigned index;
 164
 165   for (index = 0; index < LI_TABLE_SIZE; index++)
 166     {
 167       uint32_t N    = uimm (index, 12, 12);
 168       uint32_t immr = uimm (index, 11, 6);
 169       uint32_t imms = uimm (index, 5, 0);
 170
 171       LITable [index] = expand_logical_immediate (imms, immr, N);
 172     }
 173 }
 174
 175 static void
 176 dexNotify (sim_cpu *cpu)
 177 {
 178   /* instr[14,0] == type : 0 ==> method entry, 1 ==> method reentry
 179                            2 ==> exit Java, 3 ==> start next bytecode.  */
 180   uint32_t type = INSTR (14, 0);
 181
 182   TRACE_EVENTS (cpu, "Notify Insn encountered, type = 0x%x", type);
 183
 184   switch (type)
 185     {
 186     case 0:
 187       /* aarch64_notifyMethodEntry (aarch64_get_reg_u64 (cpu, R23, 0),
 188          aarch64_get_reg_u64 (cpu, R22, 0));  */
 189       break;
 190     case 1:
 191       /* aarch64_notifyMethodReentry (aarch64_get_reg_u64 (cpu, R23, 0),
 192          aarch64_get_reg_u64 (cpu, R22, 0));  */
 193       break;
 194     case 2:
 195       /* aarch64_notifyMethodExit ();  */
 196       break;
 197     case 3:
 198       /* aarch64_notifyBCStart (aarch64_get_reg_u64 (cpu, R23, 0),
 199          aarch64_get_reg_u64 (cpu, R22, 0));  */
 200       break;
 201     }
 202 }
 203
 204 /* secondary decode within top level groups  */
 205
 206 static void
 207 dexPseudo (sim_cpu *cpu)
 208 {
 209   /* assert instr[28,27] = 00
 210
 211      We provide 2 pseudo instructions:
 212
 213      HALT stops execution of the simulator causing an immediate
 214      return to the x86 code which entered it.
 215
 216      CALLOUT initiates recursive entry into x86 code.  A register
 217      argument holds the address of the x86 routine.  Immediate
 218      values in the instruction identify the number of general
 219      purpose and floating point register arguments to be passed
 220      and the type of any value to be returned.  */
 221
 222   uint32_t PSEUDO_HALT      =  0xE0000000U;
 223   uint32_t PSEUDO_CALLOUT   =  0x00018000U;
 224   uint32_t PSEUDO_CALLOUTR  =  0x00018001U;
 225   uint32_t PSEUDO_NOTIFY    =  0x00014000U;
 226   uint32_t dispatch;
 227
 228   if (aarch64_get_instr (cpu) == PSEUDO_HALT)
 229     {
 230       TRACE_EVENTS (cpu, " Pseudo Halt Instruction");
 231       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
 232                        sim_stopped, SIM_SIGTRAP);
 233     }
 234
 235   dispatch = INSTR (31, 15);
 236
 237   /* We do not handle callouts at the moment.  */
 238   if (dispatch == PSEUDO_CALLOUT || dispatch == PSEUDO_CALLOUTR)
 239     {
 240       TRACE_EVENTS (cpu, " Callout");
 241       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
 242                        sim_stopped, SIM_SIGABRT);
 243     }
 244
 245   else if (dispatch == PSEUDO_NOTIFY)
 246     dexNotify (cpu);
 247
 248   else
 249     HALT_UNALLOC;
 250 }
 251
 252 /* Load-store single register (unscaled offset)
 253    These instructions employ a base register plus an unscaled signed
 254    9 bit offset.
 255
 256    N.B. the base register (source) can be Xn or SP. all other
 257    registers may not be SP.  */
 258
 259 /* 32 bit load 32 bit unscaled signed 9 bit.  */
 260 static void
 261 ldur32 (sim_cpu *cpu, int32_t offset)
 262 {
 263   unsigned rn = INSTR (9, 5);
 264   unsigned rt = INSTR (4, 0);
 265
 266   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 267   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32
 268                        (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
 269                         + offset));
 270 }
 271
 272 /* 64 bit load 64 bit unscaled signed 9 bit.  */
 273 static void
 274 ldur64 (sim_cpu *cpu, int32_t offset)
 275 {
 276   unsigned rn = INSTR (9, 5);
 277   unsigned rt = INSTR (4, 0);
 278
 279   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 280   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64
 281                        (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
 282                         + offset));
 283 }
 284
 285 /* 32 bit load zero-extended byte unscaled signed 9 bit.  */
 286 static void
 287 ldurb32 (sim_cpu *cpu, int32_t offset)
 288 {
 289   unsigned rn = INSTR (9, 5);
 290   unsigned rt = INSTR (4, 0);
 291
 292   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 293   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8
 294                        (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
 295                         + offset));
 296 }
 297
 298 /* 32 bit load sign-extended byte unscaled signed 9 bit.  */
 299 static void
 300 ldursb32 (sim_cpu *cpu, int32_t offset)
 301 {
 302   unsigned rn = INSTR (9, 5);
 303   unsigned rt = INSTR (4, 0);
 304
 305   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 306   aarch64_set_reg_u64 (cpu, rt, NO_SP, (uint32_t) aarch64_get_mem_s8
 307                        (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
 308                         + offset));
 309 }
 310
 311 /* 64 bit load sign-extended byte unscaled signed 9 bit.  */
 312 static void
 313 ldursb64 (sim_cpu *cpu, int32_t offset)
 314 {
 315   unsigned rn = INSTR (9, 5);
 316   unsigned rt = INSTR (4, 0);
 317
 318   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 319   aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s8
 320                        (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
 321                         + offset));
 322 }
 323
 324 /* 32 bit load zero-extended short unscaled signed 9 bit  */
 325 static void
 326 ldurh32 (sim_cpu *cpu, int32_t offset)
 327 {
 328   unsigned rn = INSTR (9, 5);
 329   unsigned rd = INSTR (4, 0);
 330
 331   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 332   aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_mem_u16
 333                        (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
 334                         + offset));
 335 }
 336
 337 /* 32 bit load sign-extended short unscaled signed 9 bit  */
 338 static void
 339 ldursh32 (sim_cpu *cpu, int32_t offset)
 340 {
 341   unsigned rn = INSTR (9, 5);
 342   unsigned rd = INSTR (4, 0);
 343
 344   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 345   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) aarch64_get_mem_s16
 346                        (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
 347                         + offset));
 348 }
 349
 350 /* 64 bit load sign-extended short unscaled signed 9 bit  */
 351 static void
 352 ldursh64 (sim_cpu *cpu, int32_t offset)
 353 {
 354   unsigned rn = INSTR (9, 5);
 355   unsigned rt = INSTR (4, 0);
 356
 357   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 358   aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s16
 359                        (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
 360                         + offset));
 361 }
 362
 363 /* 64 bit load sign-extended word unscaled signed 9 bit  */
 364 static void
 365 ldursw (sim_cpu *cpu, int32_t offset)
 366 {
 367   unsigned rn = INSTR (9, 5);
 368   unsigned rd = INSTR (4, 0);
 369
 370   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 371   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) aarch64_get_mem_s32
 372                        (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
 373                         + offset));
 374 }
 375
 376 /* N.B. with stores the value in source is written to the address
 377    identified by source2 modified by offset.  */
 378
 379 /* 32 bit store 32 bit unscaled signed 9 bit.  */
 380 static void
 381 stur32 (sim_cpu *cpu, int32_t offset)
 382 {
 383   unsigned rn = INSTR (9, 5);
 384   unsigned rd = INSTR (4, 0);
 385
 386   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 387   aarch64_set_mem_u32 (cpu,
 388                        aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
 389                        aarch64_get_reg_u32 (cpu, rd, NO_SP));
 390 }
 391
 392 /* 64 bit store 64 bit unscaled signed 9 bit  */
 393 static void
 394 stur64 (sim_cpu *cpu, int32_t offset)
 395 {
 396   unsigned rn = INSTR (9, 5);
 397   unsigned rd = INSTR (4, 0);
 398
 399   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 400   aarch64_set_mem_u64 (cpu,
 401                        aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
 402                        aarch64_get_reg_u64 (cpu, rd, NO_SP));
 403 }
 404
 405 /* 32 bit store byte unscaled signed 9 bit  */
 406 static void
 407 sturb (sim_cpu *cpu, int32_t offset)
 408 {
 409   unsigned rn = INSTR (9, 5);
 410   unsigned rd = INSTR (4, 0);
 411
 412   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 413   aarch64_set_mem_u8 (cpu,
 414                       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
 415                       aarch64_get_reg_u8 (cpu, rd, NO_SP));
 416 }
 417
 418 /* 32 bit store short unscaled signed 9 bit  */
 419 static void
 420 sturh (sim_cpu *cpu, int32_t offset)
 421 {
 422   unsigned rn = INSTR (9, 5);
 423   unsigned rd = INSTR (4, 0);
 424
 425   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 426   aarch64_set_mem_u16 (cpu,
 427                        aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
 428                        aarch64_get_reg_u16 (cpu, rd, NO_SP));
 429 }
 430
 431 /* Load single register pc-relative label
 432    Offset is a signed 19 bit immediate count in words
 433    rt may not be SP.  */
 434
 435 /* 32 bit pc-relative load  */
 436 static void
 437 ldr32_pcrel (sim_cpu *cpu, int32_t offset)
 438 {
 439   unsigned rd = INSTR (4, 0);
 440
 441   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 442   aarch64_set_reg_u64 (cpu, rd, NO_SP,
 443                        aarch64_get_mem_u32
 444                        (cpu, aarch64_get_PC (cpu) + offset * 4));
 445 }
 446
 447 /* 64 bit pc-relative load  */
 448 static void
 449 ldr_pcrel (sim_cpu *cpu, int32_t offset)
 450 {
 451   unsigned rd = INSTR (4, 0);
 452
 453   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 454   aarch64_set_reg_u64 (cpu, rd, NO_SP,
 455                        aarch64_get_mem_u64
 456                        (cpu, aarch64_get_PC (cpu) + offset * 4));
 457 }
 458
 459 /* sign extended 32 bit pc-relative load  */
 460 static void
 461 ldrsw_pcrel (sim_cpu *cpu, int32_t offset)
 462 {
 463   unsigned rd = INSTR (4, 0);
 464
 465   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 466   aarch64_set_reg_u64 (cpu, rd, NO_SP,
 467                        aarch64_get_mem_s32
 468                        (cpu, aarch64_get_PC (cpu) + offset * 4));
 469 }
 470
 471 /* float pc-relative load  */
 472 static void
 473 fldrs_pcrel (sim_cpu *cpu, int32_t offset)
 474 {
 475   unsigned int rd = INSTR (4, 0);
 476
 477   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 478   aarch64_set_vec_u32 (cpu, rd, 0,
 479                        aarch64_get_mem_u32
 480                        (cpu, aarch64_get_PC (cpu) + offset * 4));
 481 }
 482
 483 /* double pc-relative load  */
 484 static void
 485 fldrd_pcrel (sim_cpu *cpu, int32_t offset)
 486 {
 487   unsigned int st = INSTR (4, 0);
 488
 489   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 490   aarch64_set_vec_u64 (cpu, st, 0,
 491                        aarch64_get_mem_u64
 492                        (cpu, aarch64_get_PC (cpu) + offset * 4));
 493 }
 494
 495 /* long double pc-relative load.  */
 496 static void
 497 fldrq_pcrel (sim_cpu *cpu, int32_t offset)
 498 {
 499   unsigned int st = INSTR (4, 0);
 500   uint64_t addr = aarch64_get_PC (cpu) + offset * 4;
 501   FRegister a;
 502
 503   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 504   aarch64_get_mem_long_double (cpu, addr, & a);
 505   aarch64_set_FP_long_double (cpu, st, a);
 506 }
 507
 508 /* This can be used to scale an offset by applying
 509    the requisite shift. the second argument is either
 510    16, 32 or 64.  */
 511
 512 #define SCALE(_offset, _elementSize) \
 513     ((_offset) << ScaleShift ## _elementSize)
 514
 515 /* This can be used to optionally scale a register derived offset
 516    by applying the requisite shift as indicated by the Scaling
 517    argument.  The second argument is either Byte, Short, Word
 518    or Long. The third argument is either Scaled or Unscaled.
 519    N.B. when _Scaling is Scaled the shift gets ANDed with
 520    all 1s while when it is Unscaled it gets ANDed with 0.  */
 521
 522 #define OPT_SCALE(_offset, _elementType, _Scaling) \
 523   ((_offset) << (_Scaling ? ScaleShift ## _elementType : 0))
 524
 525 /* This can be used to zero or sign extend a 32 bit register derived
 526    value to a 64 bit value.  the first argument must be the value as
 527    a uint32_t and the second must be either UXTW or SXTW. The result
 528    is returned as an int64_t.  */
 529
 530 static inline int64_t
 531 extend (uint32_t value, Extension extension)
 532 {
 533   union
 534   {
 535     uint32_t u;
 536     int32_t   n;
 537   } x;
 538
 539   /* A branchless variant of this ought to be possible.  */
 540   if (extension == UXTW || extension == NoExtension)
 541     return value;
 542
 543   x.u = value;
 544   return x.n;
 545 }
 546
 547 /* Scalar Floating Point
 548
 549    FP load/store single register (4 addressing modes)
 550
 551    N.B. the base register (source) can be the stack pointer.
 552    The secondary source register (source2) can only be an Xn register.  */
 553
 554 /* Load 32 bit unscaled signed 9 bit with pre- or post-writeback.  */
 555 static void
 556 fldrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
 557 {
 558   unsigned rn = INSTR (9, 5);
 559   unsigned st = INSTR (4, 0);
 560   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
 561
 562   if (wb != Post)
 563     address += offset;
 564
 565   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 566   aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32 (cpu, address));
 567   if (wb == Post)
 568     address += offset;
 569
 570   if (wb != NoWriteBack)
 571     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
 572 }
 573
 574 /* Load 8 bit with unsigned 12 bit offset.  */
 575 static void
 576 fldrb_abs (sim_cpu *cpu, uint32_t offset)
 577 {
 578   unsigned rd = INSTR (4, 0);
 579   unsigned rn = INSTR (9, 5);
 580   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
 581
 582   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 583   aarch64_set_vec_u8 (cpu, rd, 0, aarch64_get_mem_u32 (cpu, addr));
 584 }
 585
 586 /* Load 16 bit scaled unsigned 12 bit.  */
 587 static void
 588 fldrh_abs (sim_cpu *cpu, uint32_t offset)
 589 {
 590   unsigned rd = INSTR (4, 0);
 591   unsigned rn = INSTR (9, 5);
 592   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 16);
 593
 594   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 595   aarch64_set_vec_u16 (cpu, rd, 0, aarch64_get_mem_u16 (cpu, addr));
 596 }
 597
 598 /* Load 32 bit scaled unsigned 12 bit.  */
 599 static void
 600 fldrs_abs (sim_cpu *cpu, uint32_t offset)
 601 {
 602   unsigned rd = INSTR (4, 0);
 603   unsigned rn = INSTR (9, 5);
 604   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32);
 605
 606   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 607   aarch64_set_vec_u32 (cpu, rd, 0, aarch64_get_mem_u32 (cpu, addr));
 608 }
 609
 610 /* Load 64 bit scaled unsigned 12 bit.  */
 611 static void
 612 fldrd_abs (sim_cpu *cpu, uint32_t offset)
 613 {
 614   unsigned rd = INSTR (4, 0);
 615   unsigned rn = INSTR (9, 5);
 616   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64);
 617
 618   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 619   aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_mem_u64 (cpu, addr));
 620 }
 621
 622 /* Load 128 bit scaled unsigned 12 bit.  */
 623 static void
 624 fldrq_abs (sim_cpu *cpu, uint32_t offset)
 625 {
 626   unsigned rd = INSTR (4, 0);
 627   unsigned rn = INSTR (9, 5);
 628   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 128);
 629
 630   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 631   aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_mem_u64 (cpu, addr));
 632   aarch64_set_vec_u64 (cpu, rd, 1, aarch64_get_mem_u64 (cpu, addr + 8));
 633 }
 634
 635 /* Load 32 bit scaled or unscaled zero- or sign-extended
 636    32-bit register offset.  */
 637 static void
 638 fldrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
 639 {
 640   unsigned rm = INSTR (20, 16);
 641   unsigned rn = INSTR (9, 5);
 642   unsigned st = INSTR (4, 0);
 643   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
 644   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
 645   uint64_t displacement = OPT_SCALE (extended, 32, scaling);
 646
 647   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 648   aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
 649                        (cpu, address + displacement));
 650 }
 651
 652 /* Load 64 bit unscaled signed 9 bit with pre- or post-writeback.  */
 653 static void
 654 fldrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
 655 {
 656   unsigned rn = INSTR (9, 5);
 657   unsigned st = INSTR (4, 0);
 658   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
 659
 660   if (wb != Post)
 661     address += offset;
 662
 663   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 664   aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64 (cpu, address));
 665
 666   if (wb == Post)
 667     address += offset;
 668
 669   if (wb != NoWriteBack)
 670     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
 671 }
 672
 673 /* Load 64 bit scaled or unscaled zero- or sign-extended 32-bit register offset.  */
 674 static void
 675 fldrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
 676 {
 677   unsigned rm = INSTR (20, 16);
 678   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
 679   uint64_t displacement = OPT_SCALE (extended, 64, scaling);
 680
 681   fldrd_wb (cpu, displacement, NoWriteBack);
 682 }
 683
 684 /* Load 128 bit unscaled signed 9 bit with pre- or post-writeback.  */
 685 static void
 686 fldrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
 687 {
 688   FRegister a;
 689   unsigned rn = INSTR (9, 5);
 690   unsigned st = INSTR (4, 0);
 691   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
 692
 693   if (wb != Post)
 694     address += offset;
 695
 696   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 697   aarch64_get_mem_long_double (cpu, address, & a);
 698   aarch64_set_FP_long_double (cpu, st, a);
 699
 700   if (wb == Post)
 701     address += offset;
 702
 703   if (wb != NoWriteBack)
 704     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
 705 }
 706
 707 /* Load 128 bit scaled or unscaled zero- or sign-extended 32-bit register offset  */
 708 static void
 709 fldrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
 710 {
 711   unsigned rm = INSTR (20, 16);
 712   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
 713   uint64_t displacement = OPT_SCALE (extended, 128, scaling);
 714
 715   fldrq_wb (cpu, displacement, NoWriteBack);
 716 }
 717
 718 /* Memory Access
 719
 720    load-store single register
 721    There are four addressing modes available here which all employ a
 722    64 bit source (base) register.
 723
 724    N.B. the base register (source) can be the stack pointer.
 725    The secondary source register (source2)can only be an Xn register.
 726
 727    Scaled, 12-bit, unsigned immediate offset, without pre- and
 728    post-index options.
 729    Unscaled, 9-bit, signed immediate offset with pre- or post-index
 730    writeback.
 731    scaled or unscaled 64-bit register offset.
 732    scaled or unscaled 32-bit extended register offset.
 733
 734    All offsets are assumed to be raw from the decode i.e. the
 735    simulator is expected to adjust scaled offsets based on the
 736    accessed data size with register or extended register offset
 737    versions the same applies except that in the latter case the
 738    operation may also require a sign extend.
 739
 740    A separate method is provided for each possible addressing mode.  */
 741
 742 /* 32 bit load 32 bit scaled unsigned 12 bit  */
 743 static void
 744 ldr32_abs (sim_cpu *cpu, uint32_t offset)
 745 {
 746   unsigned rn = INSTR (9, 5);
 747   unsigned rt = INSTR (4, 0);
 748
 749   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 750   /* The target register may not be SP but the source may be.  */
 751   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32
 752                        (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
 753                         + SCALE (offset, 32)));
 754 }
 755
 756 /* 32 bit load 32 bit unscaled signed 9 bit with pre- or post-writeback.  */
 757 static void
 758 ldr32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
 759 {
 760   unsigned rn = INSTR (9, 5);
 761   unsigned rt = INSTR (4, 0);
 762   uint64_t address;
 763
 764   if (rn == rt && wb != NoWriteBack)
 765     HALT_UNALLOC;
 766
 767   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
 768
 769   if (wb != Post)
 770     address += offset;
 771
 772   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 773   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, address));
 774
 775   if (wb == Post)
 776     address += offset;
 777
 778   if (wb != NoWriteBack)
 779     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
 780 }
 781
 782 /* 32 bit load 32 bit scaled or unscaled
 783    zero- or sign-extended 32-bit register offset  */
 784 static void
 785 ldr32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
 786 {
 787   unsigned rm = INSTR (20, 16);
 788   unsigned rn = INSTR (9, 5);
 789   unsigned rt = INSTR (4, 0);
 790   /* rn may reference SP, rm and rt must reference ZR  */
 791
 792   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
 793   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
 794   uint64_t displacement =  OPT_SCALE (extended, 32, scaling);
 795
 796   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 797   aarch64_set_reg_u64 (cpu, rt, NO_SP,
 798                        aarch64_get_mem_u32 (cpu, address + displacement));
 799 }
 800
 801 /* 64 bit load 64 bit scaled unsigned 12 bit  */
 802 static void
 803 ldr_abs (sim_cpu *cpu, uint32_t offset)
 804 {
 805   unsigned rn = INSTR (9, 5);
 806   unsigned rt = INSTR (4, 0);
 807
 808   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 809   /* The target register may not be SP but the source may be.  */
 810   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64
 811                        (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
 812                         + SCALE (offset, 64)));
 813 }
 814
 815 /* 64 bit load 64 bit unscaled signed 9 bit with pre- or post-writeback.  */
 816 static void
 817 ldr_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
 818 {
 819   unsigned rn = INSTR (9, 5);
 820   unsigned rt = INSTR (4, 0);
 821   uint64_t address;
 822
 823   if (rn == rt && wb != NoWriteBack)
 824     HALT_UNALLOC;
 825
 826   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
 827
 828   if (wb != Post)
 829     address += offset;
 830
 831   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 832   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address));
 833
 834   if (wb == Post)
 835     address += offset;
 836
 837   if (wb != NoWriteBack)
 838     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
 839 }
 840
 841 /* 64 bit load 64 bit scaled or unscaled zero-
 842    or sign-extended 32-bit register offset.  */
 843 static void
 844 ldr_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
 845 {
 846   unsigned rm = INSTR (20, 16);
 847   unsigned rn = INSTR (9, 5);
 848   unsigned rt = INSTR (4, 0);
 849   /* rn may reference SP, rm and rt must reference ZR  */
 850
 851   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
 852   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
 853   uint64_t displacement =  OPT_SCALE (extended, 64, scaling);
 854
 855   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 856   aarch64_set_reg_u64 (cpu, rt, NO_SP,
 857                        aarch64_get_mem_u64 (cpu, address + displacement));
 858 }
 859
 860 /* 32 bit load zero-extended byte scaled unsigned 12 bit.  */
 861 static void
 862 ldrb32_abs (sim_cpu *cpu, uint32_t offset)
 863 {
 864   unsigned rn = INSTR (9, 5);
 865   unsigned rt = INSTR (4, 0);
 866
 867   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 868   /* The target register may not be SP but the source may be
 869      there is no scaling required for a byte load.  */
 870   aarch64_set_reg_u64 (cpu, rt, NO_SP,
 871                        aarch64_get_mem_u8
 872                        (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
 873 }
 874
 875 /* 32 bit load zero-extended byte unscaled signed 9 bit with pre- or post-writeback.  */
 876 static void
 877 ldrb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
 878 {
 879   unsigned rn = INSTR (9, 5);
 880   unsigned rt = INSTR (4, 0);
 881   uint64_t address;
 882
 883   if (rn == rt && wb != NoWriteBack)
 884     HALT_UNALLOC;
 885
 886   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
 887
 888   if (wb != Post)
 889     address += offset;
 890
 891   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 892   aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, address));
 893
 894   if (wb == Post)
 895     address += offset;
 896
 897   if (wb != NoWriteBack)
 898     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
 899 }
 900
 901 /* 32 bit load zero-extended byte scaled or unscaled zero-
 902    or sign-extended 32-bit register offset.  */
 903 static void
 904 ldrb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
 905 {
 906   unsigned rm = INSTR (20, 16);
 907   unsigned rn = INSTR (9, 5);
 908   unsigned rt = INSTR (4, 0);
 909   /* rn may reference SP, rm and rt must reference ZR  */
 910
 911   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
 912   int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
 913                                  extension);
 914
 915   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 916   /* There is no scaling required for a byte load.  */
 917   aarch64_set_reg_u64 (cpu, rt, NO_SP,
 918                        aarch64_get_mem_u8 (cpu, address + displacement));
 919 }
 920
 921 /* 64 bit load sign-extended byte unscaled signed 9 bit
 922    with pre- or post-writeback.  */
 923 static void
 924 ldrsb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
 925 {
 926   unsigned rn = INSTR (9, 5);
 927   unsigned rt = INSTR (4, 0);
 928   uint64_t address;
 929   int64_t val;
 930
 931   if (rn == rt && wb != NoWriteBack)
 932     HALT_UNALLOC;
 933
 934   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
 935
 936   if (wb != Post)
 937     address += offset;
 938
 939   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 940   val = aarch64_get_mem_s8 (cpu, address);
 941   aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
 942
 943   if (wb == Post)
 944     address += offset;
 945
 946   if (wb != NoWriteBack)
 947     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
 948 }
 949
 950 /* 64 bit load sign-extended byte scaled unsigned 12 bit.  */
 951 static void
 952 ldrsb_abs (sim_cpu *cpu, uint32_t offset)
 953 {
 954   ldrsb_wb (cpu, offset, NoWriteBack);
 955 }
 956
 957 /* 64 bit load sign-extended byte scaled or unscaled zero-
 958    or sign-extended 32-bit register offset.  */
 959 static void
 960 ldrsb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
 961 {
 962   unsigned rm = INSTR (20, 16);
 963   unsigned rn = INSTR (9, 5);
 964   unsigned rt = INSTR (4, 0);
 965   /* rn may reference SP, rm and rt must reference ZR  */
 966
 967   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
 968   int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
 969                                  extension);
 970   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 971   /* There is no scaling required for a byte load.  */
 972   aarch64_set_reg_s64 (cpu, rt, NO_SP,
 973                        aarch64_get_mem_s8 (cpu, address + displacement));
 974 }
 975
 976 /* 32 bit load zero-extended short scaled unsigned 12 bit.  */
 977 static void
 978 ldrh32_abs (sim_cpu *cpu, uint32_t offset)
 979 {
 980   unsigned rn = INSTR (9, 5);
 981   unsigned rt = INSTR (4, 0);
 982   uint32_t val;
 983
 984   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 985   /* The target register may not be SP but the source may be.  */
 986   val = aarch64_get_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
 987                              + SCALE (offset, 16));
 988   aarch64_set_reg_u32 (cpu, rt, NO_SP, val);
 989 }
 990
 991 /* 32 bit load zero-extended short unscaled signed 9 bit
 992    with pre- or post-writeback.  */
 993 static void
 994 ldrh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
 995 {
 996   unsigned rn = INSTR (9, 5);
 997   unsigned rt = INSTR (4, 0);
 998   uint64_t address;
 999
1000   if (rn == rt && wb != NoWriteBack)
1001     HALT_UNALLOC;
1002
1003   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1004
1005   if (wb != Post)
1006     address += offset;
1007
1008   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1009   aarch64_set_reg_u32 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address));
1010
1011   if (wb == Post)
1012     address += offset;
1013
1014   if (wb != NoWriteBack)
1015     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1016 }
1017
1018 /* 32 bit load zero-extended short scaled or unscaled zero-
1019    or sign-extended 32-bit register offset.  */
1020 static void
1021 ldrh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1022 {
1023   unsigned rm = INSTR (20, 16);
1024   unsigned rn = INSTR (9, 5);
1025   unsigned rt = INSTR (4, 0);
1026   /* rn may reference SP, rm and rt must reference ZR  */
1027
1028   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1029   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1030   uint64_t displacement =  OPT_SCALE (extended, 16, scaling);
1031
1032   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1033   aarch64_set_reg_u32 (cpu, rt, NO_SP,
1034                        aarch64_get_mem_u16 (cpu, address + displacement));
1035 }
1036
1037 /* 32 bit load sign-extended short scaled unsigned 12 bit.  */
1038 static void
1039 ldrsh32_abs (sim_cpu *cpu, uint32_t offset)
1040 {
1041   unsigned rn = INSTR (9, 5);
1042   unsigned rt = INSTR (4, 0);
1043   int32_t val;
1044
1045   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1046   /* The target register may not be SP but the source may be.  */
1047   val = aarch64_get_mem_s16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
1048                              + SCALE (offset, 16));
1049   aarch64_set_reg_s32 (cpu, rt, NO_SP, val);
1050 }
1051
1052 /* 32 bit load sign-extended short unscaled signed 9 bit
1053    with pre- or post-writeback.  */
1054 static void
1055 ldrsh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1056 {
1057   unsigned rn = INSTR (9, 5);
1058   unsigned rt = INSTR (4, 0);
1059   uint64_t address;
1060
1061   if (rn == rt && wb != NoWriteBack)
1062     HALT_UNALLOC;
1063
1064   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1065
1066   if (wb != Post)
1067     address += offset;
1068
1069   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1070   aarch64_set_reg_s32 (cpu, rt, NO_SP,
1071                        (int32_t) aarch64_get_mem_s16 (cpu, address));
1072
1073   if (wb == Post)
1074     address += offset;
1075
1076   if (wb != NoWriteBack)
1077     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1078 }
1079
1080 /* 32 bit load sign-extended short scaled or unscaled zero-
1081    or sign-extended 32-bit register offset.  */
1082 static void
1083 ldrsh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1084 {
1085   unsigned rm = INSTR (20, 16);
1086   unsigned rn = INSTR (9, 5);
1087   unsigned rt = INSTR (4, 0);
1088   /* rn may reference SP, rm and rt must reference ZR  */
1089
1090   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1091   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1092   uint64_t displacement =  OPT_SCALE (extended, 16, scaling);
1093
1094   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1095   aarch64_set_reg_s32 (cpu, rt, NO_SP,
1096                        (int32_t) aarch64_get_mem_s16
1097                        (cpu, address + displacement));
1098 }
1099
1100 /* 64 bit load sign-extended short scaled unsigned 12 bit.  */
1101 static void
1102 ldrsh_abs (sim_cpu *cpu, uint32_t offset)
1103 {
1104   unsigned rn = INSTR (9, 5);
1105   unsigned rt = INSTR (4, 0);
1106   int64_t val;
1107
1108   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1109   /* The target register may not be SP but the source may be.  */
1110   val = aarch64_get_mem_s16  (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
1111                               + SCALE (offset, 16));
1112   aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
1113 }
1114
1115 /* 64 bit load sign-extended short unscaled signed 9 bit
1116    with pre- or post-writeback.  */
1117 static void
1118 ldrsh64_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1119 {
1120   unsigned rn = INSTR (9, 5);
1121   unsigned rt = INSTR (4, 0);
1122   uint64_t address;
1123   int64_t val;
1124
1125   if (rn == rt && wb != NoWriteBack)
1126     HALT_UNALLOC;
1127
1128   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1129   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1130
1131   if (wb != Post)
1132     address += offset;
1133
1134   val = aarch64_get_mem_s16 (cpu, address);
1135   aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
1136
1137   if (wb == Post)
1138     address += offset;
1139
1140   if (wb != NoWriteBack)
1141     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1142 }
1143
1144 /* 64 bit load sign-extended short scaled or unscaled zero-
1145    or sign-extended 32-bit register offset.  */
1146 static void
1147 ldrsh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1148 {
1149   unsigned rm = INSTR (20, 16);
1150   unsigned rn = INSTR (9, 5);
1151   unsigned rt = INSTR (4, 0);
1152
1153   /* rn may reference SP, rm and rt must reference ZR  */
1154
1155   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1156   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1157   uint64_t displacement = OPT_SCALE (extended, 16, scaling);
1158   int64_t val;
1159
1160   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1161   val = aarch64_get_mem_s16 (cpu, address + displacement);
1162   aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
1163 }
1164
1165 /* 64 bit load sign-extended 32 bit scaled unsigned 12 bit.  */
1166 static void
1167 ldrsw_abs (sim_cpu *cpu, uint32_t offset)
1168 {
1169   unsigned rn = INSTR (9, 5);
1170   unsigned rt = INSTR (4, 0);
1171   int64_t val;
1172
1173   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1174   val = aarch64_get_mem_s32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
1175                              + SCALE (offset, 32));
1176   /* The target register may not be SP but the source may be.  */
1177   return aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
1178 }
1179
1180 /* 64 bit load sign-extended 32 bit unscaled signed 9 bit
1181    with pre- or post-writeback.  */
1182 static void
1183 ldrsw_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1184 {
1185   unsigned rn = INSTR (9, 5);
1186   unsigned rt = INSTR (4, 0);
1187   uint64_t address;
1188
1189   if (rn == rt && wb != NoWriteBack)
1190     HALT_UNALLOC;
1191
1192   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1193
1194   if (wb != Post)
1195     address += offset;
1196
1197   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1198   aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s32 (cpu, address));
1199
1200   if (wb == Post)
1201     address += offset;
1202
1203   if (wb != NoWriteBack)
1204     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1205 }
1206
1207 /* 64 bit load sign-extended 32 bit scaled or unscaled zero-
1208    or sign-extended 32-bit register offset.  */
1209 static void
1210 ldrsw_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1211 {
1212   unsigned rm = INSTR (20, 16);
1213   unsigned rn = INSTR (9, 5);
1214   unsigned rt = INSTR (4, 0);
1215   /* rn may reference SP, rm and rt must reference ZR  */
1216
1217   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1218   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1219   uint64_t displacement =  OPT_SCALE (extended, 32, scaling);
1220
1221   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1222   aarch64_set_reg_s64 (cpu, rt, NO_SP,
1223                        aarch64_get_mem_s32 (cpu, address + displacement));
1224 }
1225
/* N.B. for stores, the value held in the transfer register (rt) is
   written to the address formed from the base register (rn), modified
   by either the index register (rm) or the immediate offset.  */
1228
1229 /* 32 bit store scaled unsigned 12 bit.  */
1230 static void
1231 str32_abs (sim_cpu *cpu, uint32_t offset)
1232 {
1233   unsigned rn = INSTR (9, 5);
1234   unsigned rt = INSTR (4, 0);
1235
1236   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1237   /* The target register may not be SP but the source may be.  */
1238   aarch64_set_mem_u32 (cpu, (aarch64_get_reg_u64 (cpu, rn, SP_OK)
1239                              + SCALE (offset, 32)),
1240                        aarch64_get_reg_u32 (cpu, rt, NO_SP));
1241 }
1242
1243 /* 32 bit store unscaled signed 9 bit with pre- or post-writeback.  */
1244 static void
1245 str32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1246 {
1247   unsigned rn = INSTR (9, 5);
1248   unsigned rt = INSTR (4, 0);
1249   uint64_t address;
1250
1251   if (rn == rt && wb != NoWriteBack)
1252     HALT_UNALLOC;
1253
1254   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1255   if (wb != Post)
1256     address += offset;
1257
1258   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1259   aarch64_set_mem_u32 (cpu, address, aarch64_get_reg_u32 (cpu, rt, NO_SP));
1260
1261   if (wb == Post)
1262     address += offset;
1263
1264   if (wb != NoWriteBack)
1265     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1266 }
1267
1268 /* 32 bit store scaled or unscaled zero- or
1269    sign-extended 32-bit register offset.  */
1270 static void
1271 str32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1272 {
1273   unsigned rm = INSTR (20, 16);
1274   unsigned rn = INSTR (9, 5);
1275   unsigned rt = INSTR (4, 0);
1276
1277   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1278   int64_t  extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1279   uint64_t displacement = OPT_SCALE (extended, 32, scaling);
1280
1281   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1282   aarch64_set_mem_u32 (cpu, address + displacement,
1283                        aarch64_get_reg_u64 (cpu, rt, NO_SP));
1284 }
1285
1286 /* 64 bit store scaled unsigned 12 bit.  */
1287 static void
1288 str_abs (sim_cpu *cpu, uint32_t offset)
1289 {
1290   unsigned rn = INSTR (9, 5);
1291   unsigned rt = INSTR (4, 0);
1292
1293   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1294   aarch64_set_mem_u64 (cpu,
1295                        aarch64_get_reg_u64 (cpu, rn, SP_OK)
1296                        + SCALE (offset, 64),
1297                        aarch64_get_reg_u64 (cpu, rt, NO_SP));
1298 }
1299
1300 /* 64 bit store unscaled signed 9 bit with pre- or post-writeback.  */
1301 static void
1302 str_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1303 {
1304   unsigned rn = INSTR (9, 5);
1305   unsigned rt = INSTR (4, 0);
1306   uint64_t address;
1307
1308   if (rn == rt && wb != NoWriteBack)
1309     HALT_UNALLOC;
1310
1311   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1312
1313   if (wb != Post)
1314     address += offset;
1315
1316   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1317   aarch64_set_mem_u64 (cpu, address, aarch64_get_reg_u64 (cpu, rt, NO_SP));
1318
1319   if (wb == Post)
1320     address += offset;
1321
1322   if (wb != NoWriteBack)
1323     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1324 }
1325
1326 /* 64 bit store scaled or unscaled zero-
1327    or sign-extended 32-bit register offset.  */
1328 static void
1329 str_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1330 {
1331   unsigned rm = INSTR (20, 16);
1332   unsigned rn = INSTR (9, 5);
1333   unsigned rt = INSTR (4, 0);
1334   /* rn may reference SP, rm and rt must reference ZR  */
1335
1336   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1337   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1338                                extension);
1339   uint64_t displacement = OPT_SCALE (extended, 64, scaling);
1340
1341   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1342   aarch64_set_mem_u64 (cpu, address + displacement,
1343                        aarch64_get_reg_u64 (cpu, rt, NO_SP));
1344 }
1345
1346 /* 32 bit store byte scaled unsigned 12 bit.  */
1347 static void
1348 strb_abs (sim_cpu *cpu, uint32_t offset)
1349 {
1350   unsigned rn = INSTR (9, 5);
1351   unsigned rt = INSTR (4, 0);
1352
1353   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1354   /* The target register may not be SP but the source may be.
1355      There is no scaling required for a byte load.  */
1356   aarch64_set_mem_u8 (cpu,
1357                       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
1358                       aarch64_get_reg_u8 (cpu, rt, NO_SP));
1359 }
1360
1361 /* 32 bit store byte unscaled signed 9 bit with pre- or post-writeback.  */
1362 static void
1363 strb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1364 {
1365   unsigned rn = INSTR (9, 5);
1366   unsigned rt = INSTR (4, 0);
1367   uint64_t address;
1368
1369   if (rn == rt && wb != NoWriteBack)
1370     HALT_UNALLOC;
1371
1372   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1373
1374   if (wb != Post)
1375     address += offset;
1376
1377   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1378   aarch64_set_mem_u8 (cpu, address, aarch64_get_reg_u8 (cpu, rt, NO_SP));
1379
1380   if (wb == Post)
1381     address += offset;
1382
1383   if (wb != NoWriteBack)
1384     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1385 }
1386
1387 /* 32 bit store byte scaled or unscaled zero-
1388    or sign-extended 32-bit register offset.  */
1389 static void
1390 strb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1391 {
1392   unsigned rm = INSTR (20, 16);
1393   unsigned rn = INSTR (9, 5);
1394   unsigned rt = INSTR (4, 0);
1395   /* rn may reference SP, rm and rt must reference ZR  */
1396
1397   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1398   int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1399                                  extension);
1400
1401   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1402   /* There is no scaling required for a byte load.  */
1403   aarch64_set_mem_u8 (cpu, address + displacement,
1404                       aarch64_get_reg_u8 (cpu, rt, NO_SP));
1405 }
1406
1407 /* 32 bit store short scaled unsigned 12 bit.  */
1408 static void
1409 strh_abs (sim_cpu *cpu, uint32_t offset)
1410 {
1411   unsigned rn = INSTR (9, 5);
1412   unsigned rt = INSTR (4, 0);
1413
1414   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1415   /* The target register may not be SP but the source may be.  */
1416   aarch64_set_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
1417                        + SCALE (offset, 16),
1418                        aarch64_get_reg_u16 (cpu, rt, NO_SP));
1419 }
1420
1421 /* 32 bit store short unscaled signed 9 bit with pre- or post-writeback.  */
1422 static void
1423 strh_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1424 {
1425   unsigned rn = INSTR (9, 5);
1426   unsigned rt = INSTR (4, 0);
1427   uint64_t address;
1428
1429   if (rn == rt && wb != NoWriteBack)
1430     HALT_UNALLOC;
1431
1432   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1433
1434   if (wb != Post)
1435     address += offset;
1436
1437   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1438   aarch64_set_mem_u16 (cpu, address, aarch64_get_reg_u16 (cpu, rt, NO_SP));
1439
1440   if (wb == Post)
1441     address += offset;
1442
1443   if (wb != NoWriteBack)
1444     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1445 }
1446
1447 /* 32 bit store short scaled or unscaled zero-
1448    or sign-extended 32-bit register offset.  */
1449 static void
1450 strh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1451 {
1452   unsigned rm = INSTR (20, 16);
1453   unsigned rn = INSTR (9, 5);
1454   unsigned rt = INSTR (4, 0);
1455   /* rn may reference SP, rm and rt must reference ZR  */
1456
1457   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1458   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1459   uint64_t displacement =  OPT_SCALE (extended, 16, scaling);
1460
1461   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1462   aarch64_set_mem_u16 (cpu, address + displacement,
1463                        aarch64_get_reg_u16 (cpu, rt, NO_SP));
1464 }
1465
/* Prefetch unsigned 12 bit.
   This is currently a stub: the function body is empty, so the
   instruction is accepted and nothing is prefetched.  */
static void
prfm_abs (sim_cpu *cpu, uint32_t offset)
{
  /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
                          00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
                          00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
                          10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
                          10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
                          10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
                          ow ==> UNALLOC
     Sketch of the intended decode (not compiled):
     PrfOp prfop = prfop (instr, 4, 0);
     uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK)
     + SCALE (offset, 64).  */

  /* TODO : implement prefetch of address.  */
}
1483
/* Prefetch scaled or unscaled zero- or sign-extended 32-bit register offset.
   This is currently a stub: the function body is empty, so the
   instruction is accepted and nothing is prefetched.  */
static void
prfm_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
{
  /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
                          00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
                          00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
                          10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
                          10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
                          10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
                          ow ==> UNALLOC
     rn may reference SP, rm may only reference ZR
     Sketch of the intended decode (not compiled):
     PrfOp prfop = prfop (instr, 4, 0);
     uint64_t base = aarch64_get_reg_u64 (cpu, rn, SP_OK);
     int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
                                extension);
     uint64_t displacement =  OPT_SCALE (extended, 64, scaling);
     uint64_t address = base + displacement.  */

  /* TODO : implement prefetch of address  */
}
1505
/* 64 bit pc-relative prefetch.
   This is currently a stub: the function body is empty, so the
   instruction is accepted and nothing is prefetched.  */
static void
prfm_pcrel (sim_cpu *cpu, int32_t offset)
{
  /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
                          00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
                          00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
                          10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
                          10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
                          10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
                          ow ==> UNALLOC
     Sketch of the intended decode (not compiled):
     PrfOp prfop = prfop (instr, 4, 0);
     uint64_t address = aarch64_get_PC (cpu) + offset.  */

  /* TODO : implement this  */
}
1522
1523 /* Load-store exclusive.  */
1524
1525 static void
1526 ldxr (sim_cpu *cpu)
1527 {
1528   unsigned rn = INSTR (9, 5);
1529   unsigned rt = INSTR (4, 0);
1530   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1531   int size = INSTR (31, 30);
1532   /* int ordered = INSTR (15, 15);  */
1533   /* int exclusive = ! INSTR (23, 23);  */
1534
1535   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1536   switch (size)
1537     {
1538     case 0:
1539       aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, address));
1540       break;
1541     case 1:
1542       aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address));
1543       break;
1544     case 2:
1545       aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, address));
1546       break;
1547     case 3:
1548       aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address));
1549       break;
1550     }
1551 }
1552
1553 static void
1554 stxr (sim_cpu *cpu)
1555 {
1556   unsigned rn = INSTR (9, 5);
1557   unsigned rt = INSTR (4, 0);
1558   unsigned rs = INSTR (20, 16);
1559   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1560   int      size = INSTR (31, 30);
1561   uint64_t data = aarch64_get_reg_u64 (cpu, rt, NO_SP);
1562
1563   switch (size)
1564     {
1565     case 0: aarch64_set_mem_u8 (cpu, address, data); break;
1566     case 1: aarch64_set_mem_u16 (cpu, address, data); break;
1567     case 2: aarch64_set_mem_u32 (cpu, address, data); break;
1568     case 3: aarch64_set_mem_u64 (cpu, address, data); break;
1569     }
1570
1571   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1572   aarch64_set_reg_u64 (cpu, rs, NO_SP, 0); /* Always exclusive...  */
1573 }
1574
1575 static void
1576 dexLoadLiteral (sim_cpu *cpu)
1577 {
1578   /* instr[29,27] == 011
1579      instr[25,24] == 00
1580      instr[31,30:26] = opc: 000 ==> LDRW,  001 ==> FLDRS
1581                             010 ==> LDRX,  011 ==> FLDRD
1582                             100 ==> LDRSW, 101 ==> FLDRQ
1583                             110 ==> PRFM, 111 ==> UNALLOC
1584      instr[26] ==> V : 0 ==> GReg, 1 ==> FReg
1585      instr[23, 5] == simm19  */
1586
1587   /* unsigned rt = INSTR (4, 0);  */
1588   uint32_t dispatch = (INSTR (31, 30) << 1) | INSTR (26, 26);
1589   int32_t imm = simm32 (aarch64_get_instr (cpu), 23, 5);
1590
1591   switch (dispatch)
1592     {
1593     case 0: ldr32_pcrel (cpu, imm); break;
1594     case 1: fldrs_pcrel (cpu, imm); break;
1595     case 2: ldr_pcrel   (cpu, imm); break;
1596     case 3: fldrd_pcrel (cpu, imm); break;
1597     case 4: ldrsw_pcrel (cpu, imm); break;
1598     case 5: fldrq_pcrel (cpu, imm); break;
1599     case 6: prfm_pcrel  (cpu, imm); break;
1600     case 7:
1601     default:
1602       HALT_UNALLOC;
1603     }
1604 }
1605
/* Immediate arithmetic.
   The aimm argument is a 12 bit unsigned value, or a 12 bit unsigned
   value left shifted by 12 bits (the shift is applied at decode).

   N.B. the register args (dest, source) can normally be Xn or SP.
   The exception is the flag-setting instructions, which may only
   use Xn for the output (dest).  */
1613
1614 /* 32 bit add immediate.  */
1615 static void
1616 add32 (sim_cpu *cpu, uint32_t aimm)
1617 {
1618   unsigned rn = INSTR (9, 5);
1619   unsigned rd = INSTR (4, 0);
1620
1621   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1622   aarch64_set_reg_u64 (cpu, rd, SP_OK,
1623                        aarch64_get_reg_u32 (cpu, rn, SP_OK) + aimm);
1624 }
1625
1626 /* 64 bit add immediate.  */
1627 static void
1628 add64 (sim_cpu *cpu, uint32_t aimm)
1629 {
1630   unsigned rn = INSTR (9, 5);
1631   unsigned rd = INSTR (4, 0);
1632
1633   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1634   aarch64_set_reg_u64 (cpu, rd, SP_OK,
1635                        aarch64_get_reg_u64 (cpu, rn, SP_OK) + aimm);
1636 }
1637
1638 static void
1639 set_flags_for_add32 (sim_cpu *cpu, int32_t value1, int32_t value2)
1640 {
1641   int32_t   result = value1 + value2;
1642   int64_t   sresult = (int64_t) value1 + (int64_t) value2;
1643   uint64_t  uresult = (uint64_t)(uint32_t) value1
1644     + (uint64_t)(uint32_t) value2;
1645   uint32_t  flags = 0;
1646
1647   if (result == 0)
1648     flags |= Z;
1649
1650   if (result & (1 << 31))
1651     flags |= N;
1652
1653   if (uresult != result)
1654     flags |= C;
1655
1656   if (sresult != result)
1657     flags |= V;
1658
1659   aarch64_set_CPSR (cpu, flags);
1660 }
1661
/* Sign tests for the flag-setting helpers.  Both macros expand to an
   expression that uses a variable named `signbit', which must be in
   scope wherever they are used: NEG is true when every bit of
   signbit is set in A, POS when none of them is.  */
#define NEG(a) (((a) & signbit) == signbit)
#define POS(a) (((a) & signbit) == 0)
1664
1665 static void
1666 set_flags_for_add64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
1667 {
1668   uint64_t result = value1 + value2;
1669   uint32_t flags = 0;
1670   uint64_t signbit = 1ULL << 63;
1671
1672   if (result == 0)
1673     flags |= Z;
1674
1675   if (NEG (result))
1676     flags |= N;
1677
1678   if (   (NEG (value1) && NEG (value2))
1679       || (NEG (value1) && POS (result))
1680       || (NEG (value2) && POS (result)))
1681     flags |= C;
1682
1683   if (   (NEG (value1) && NEG (value2) && POS (result))
1684       || (POS (value1) && POS (value2) && NEG (result)))
1685     flags |= V;
1686
1687   aarch64_set_CPSR (cpu, flags);
1688 }
1689
1690 static void
1691 set_flags_for_sub32 (sim_cpu *cpu, uint32_t value1, uint32_t value2)
1692 {
1693   uint32_t result = value1 - value2;
1694   uint32_t flags = 0;
1695   uint32_t signbit = 1U << 31;
1696
1697   if (result == 0)
1698     flags |= Z;
1699
1700   if (NEG (result))
1701     flags |= N;
1702
1703   if (   (NEG (value1) && POS (value2))
1704       || (NEG (value1) && POS (result))
1705       || (POS (value2) && POS (result)))
1706     flags |= C;
1707
1708   if (   (NEG (value1) && POS (value2) && POS (result))
1709       || (POS (value1) && NEG (value2) && NEG (result)))
1710     flags |= V;
1711
1712   aarch64_set_CPSR (cpu, flags);
1713 }
1714
1715 static void
1716 set_flags_for_sub64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
1717 {
1718   uint64_t result = value1 - value2;
1719   uint32_t flags = 0;
1720   uint64_t signbit = 1ULL << 63;
1721
1722   if (result == 0)
1723     flags |= Z;
1724
1725   if (NEG (result))
1726     flags |= N;
1727
1728   if (   (NEG (value1) && POS (value2))
1729       || (NEG (value1) && POS (result))
1730       || (POS (value2) && POS (result)))
1731     flags |= C;
1732
1733   if (   (NEG (value1) && POS (value2) && POS (result))
1734       || (POS (value1) && NEG (value2) && NEG (result)))
1735     flags |= V;
1736
1737   aarch64_set_CPSR (cpu, flags);
1738 }
1739
1740 static void
1741 set_flags_for_binop32 (sim_cpu *cpu, uint32_t result)
1742 {
1743   uint32_t flags = 0;
1744
1745   if (result == 0)
1746     flags |= Z;
1747   else
1748     flags &= ~ Z;
1749
1750   if (result & (1 << 31))
1751     flags |= N;
1752   else
1753     flags &= ~ N;
1754
1755   aarch64_set_CPSR (cpu, flags);
1756 }
1757
1758 static void
1759 set_flags_for_binop64 (sim_cpu *cpu, uint64_t result)
1760 {
1761   uint32_t flags = 0;
1762
1763   if (result == 0)
1764     flags |= Z;
1765   else
1766     flags &= ~ Z;
1767
1768   if (result & (1ULL << 63))
1769     flags |= N;
1770   else
1771     flags &= ~ N;
1772
1773   aarch64_set_CPSR (cpu, flags);
1774 }
1775
1776 /* 32 bit add immediate set flags.  */
1777 static void
1778 adds32 (sim_cpu *cpu, uint32_t aimm)
1779 {
1780   unsigned rn = INSTR (9, 5);
1781   unsigned rd = INSTR (4, 0);
1782   /* TODO : do we need to worry about signs here?  */
1783   int32_t value1 = aarch64_get_reg_s32 (cpu, rn, SP_OK);
1784
1785   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1786   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + aimm);
1787   set_flags_for_add32 (cpu, value1, aimm);
1788 }
1789
1790 /* 64 bit add immediate set flags.  */
1791 static void
1792 adds64 (sim_cpu *cpu, uint32_t aimm)
1793 {
1794   unsigned rn = INSTR (9, 5);
1795   unsigned rd = INSTR (4, 0);
1796   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1797   uint64_t value2 = aimm;
1798
1799   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1800   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1801   set_flags_for_add64 (cpu, value1, value2);
1802 }
1803
1804 /* 32 bit sub immediate.  */
1805 static void
1806 sub32 (sim_cpu *cpu, uint32_t aimm)
1807 {
1808   unsigned rn = INSTR (9, 5);
1809   unsigned rd = INSTR (4, 0);
1810
1811   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1812   aarch64_set_reg_u64 (cpu, rd, SP_OK,
1813                        aarch64_get_reg_u32 (cpu, rn, SP_OK) - aimm);
1814 }
1815
1816 /* 64 bit sub immediate.  */
1817 static void
1818 sub64 (sim_cpu *cpu, uint32_t aimm)
1819 {
1820   unsigned rn = INSTR (9, 5);
1821   unsigned rd = INSTR (4, 0);
1822
1823   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1824   aarch64_set_reg_u64 (cpu, rd, SP_OK,
1825                        aarch64_get_reg_u64 (cpu, rn, SP_OK) - aimm);
1826 }
1827
1828 /* 32 bit sub immediate set flags.  */
1829 static void
1830 subs32 (sim_cpu *cpu, uint32_t aimm)
1831 {
1832   unsigned rn = INSTR (9, 5);
1833   unsigned rd = INSTR (4, 0);
1834   uint32_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1835   uint32_t value2 = aimm;
1836
1837   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1838   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
1839   set_flags_for_sub32 (cpu, value1, value2);
1840 }
1841
1842 /* 64 bit sub immediate set flags.  */
1843 static void
1844 subs64 (sim_cpu *cpu, uint32_t aimm)
1845 {
1846   unsigned rn = INSTR (9, 5);
1847   unsigned rd = INSTR (4, 0);
1848   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1849   uint32_t value2 = aimm;
1850
1851   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1852   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
1853   set_flags_for_sub64 (cpu, value1, value2);
1854 }
1855
1856 /* Data Processing Register.  */
1857
1858 /* First two helpers to perform the shift operations.  */
1859
1860 static inline uint32_t
1861 shifted32 (uint32_t value, Shift shift, uint32_t count)
1862 {
1863   switch (shift)
1864     {
1865     default:
1866     case LSL:
1867       return (value << count);
1868     case LSR:
1869       return (value >> count);
1870     case ASR:
1871       {
1872         int32_t svalue = value;
1873         return (svalue >> count);
1874       }
1875     case ROR:
1876       {
1877         uint32_t top = value >> count;
1878         uint32_t bottom = value << (32 - count);
1879         return (bottom | top);
1880       }
1881     }
1882 }
1883
1884 static inline uint64_t
1885 shifted64 (uint64_t value, Shift shift, uint32_t count)
1886 {
1887   switch (shift)
1888     {
1889     default:
1890     case LSL:
1891       return (value << count);
1892     case LSR:
1893       return (value >> count);
1894     case ASR:
1895       {
1896         int64_t svalue = value;
1897         return (svalue >> count);
1898       }
1899     case ROR:
1900       {
1901         uint64_t top = value >> count;
1902         uint64_t bottom = value << (64 - count);
1903         return (bottom | top);
1904       }
1905     }
1906 }
1907
1908 /* Arithmetic shifted register.
1909    These allow an optional LSL, ASR or LSR to the second source
1910    register with a count up to the register bit count.
1911
1912    N.B register args may not be SP.  */
1913
1914 /* 32 bit ADD shifted register.  */
1915 static void
1916 add32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1917 {
1918   unsigned rm = INSTR (20, 16);
1919   unsigned rn = INSTR (9, 5);
1920   unsigned rd = INSTR (4, 0);
1921
1922   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1923   aarch64_set_reg_u64 (cpu, rd, NO_SP,
1924                        aarch64_get_reg_u32 (cpu, rn, NO_SP)
1925                        + shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1926                                     shift, count));
1927 }
1928
1929 /* 64 bit ADD shifted register.  */
1930 static void
1931 add64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1932 {
1933   unsigned rm = INSTR (20, 16);
1934   unsigned rn = INSTR (9, 5);
1935   unsigned rd = INSTR (4, 0);
1936
1937   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1938   aarch64_set_reg_u64 (cpu, rd, NO_SP,
1939                        aarch64_get_reg_u64 (cpu, rn, NO_SP)
1940                        + shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
1941                                     shift, count));
1942 }
1943
1944 /* 32 bit ADD shifted register setting flags.  */
1945 static void
1946 adds32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1947 {
1948   unsigned rm = INSTR (20, 16);
1949   unsigned rn = INSTR (9, 5);
1950   unsigned rd = INSTR (4, 0);
1951
1952   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
1953   uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1954                                shift, count);
1955
1956   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1957   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1958   set_flags_for_add32 (cpu, value1, value2);
1959 }
1960
1961 /* 64 bit ADD shifted register setting flags.  */
1962 static void
1963 adds64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1964 {
1965   unsigned rm = INSTR (20, 16);
1966   unsigned rn = INSTR (9, 5);
1967   unsigned rd = INSTR (4, 0);
1968
1969   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
1970   uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
1971                                shift, count);
1972
1973   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1974   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1975   set_flags_for_add64 (cpu, value1, value2);
1976 }
1977
1978 /* 32 bit SUB shifted register.  */
1979 static void
1980 sub32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1981 {
1982   unsigned rm = INSTR (20, 16);
1983   unsigned rn = INSTR (9, 5);
1984   unsigned rd = INSTR (4, 0);
1985
1986   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1987   aarch64_set_reg_u64 (cpu, rd, NO_SP,
1988                        aarch64_get_reg_u32 (cpu, rn, NO_SP)
1989                        - shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1990                                     shift, count));
1991 }
1992
1993 /* 64 bit SUB shifted register.  */
1994 static void
1995 sub64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1996 {
1997   unsigned rm = INSTR (20, 16);
1998   unsigned rn = INSTR (9, 5);
1999   unsigned rd = INSTR (4, 0);
2000
2001   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2002   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2003                        aarch64_get_reg_u64 (cpu, rn, NO_SP)
2004                        - shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
2005                                     shift, count));
2006 }
2007
2008 /* 32 bit SUB shifted register setting flags.  */
2009 static void
2010 subs32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
2011 {
2012   unsigned rm = INSTR (20, 16);
2013   unsigned rn = INSTR (9, 5);
2014   unsigned rd = INSTR (4, 0);
2015
2016   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2017   uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
2018                               shift, count);
2019
2020   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2021   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2022   set_flags_for_sub32 (cpu, value1, value2);
2023 }
2024
2025 /* 64 bit SUB shifted register setting flags.  */
2026 static void
2027 subs64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
2028 {
2029   unsigned rm = INSTR (20, 16);
2030   unsigned rn = INSTR (9, 5);
2031   unsigned rd = INSTR (4, 0);
2032
2033   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2034   uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
2035                                shift, count);
2036
2037   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2038   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2039   set_flags_for_sub64 (cpu, value1, value2);
2040 }
2041
2042 /* First a couple more helpers to fetch the
2043    relevant source register element either
2044    sign or zero extended as required by the
2045    extension value.  */
2046
2047 static uint32_t
2048 extreg32 (sim_cpu *cpu, unsigned int lo, Extension extension)
2049 {
2050   switch (extension)
2051     {
2052     case UXTB: return aarch64_get_reg_u8  (cpu, lo, NO_SP);
2053     case UXTH: return aarch64_get_reg_u16 (cpu, lo, NO_SP);
2054     case UXTW: /* Fall through.  */
2055     case UXTX: return aarch64_get_reg_u32 (cpu, lo, NO_SP);
2056     case SXTB: return aarch64_get_reg_s8  (cpu, lo, NO_SP);
2057     case SXTH: return aarch64_get_reg_s16 (cpu, lo, NO_SP);
2058     case SXTW: /* Fall through.  */
2059     case SXTX: /* Fall through.  */
2060     default:   return aarch64_get_reg_s32 (cpu, lo, NO_SP);
2061   }
2062 }
2063
2064 static uint64_t
2065 extreg64 (sim_cpu *cpu, unsigned int lo, Extension extension)
2066 {
2067   switch (extension)
2068     {
2069     case UXTB: return aarch64_get_reg_u8  (cpu, lo, NO_SP);
2070     case UXTH: return aarch64_get_reg_u16 (cpu, lo, NO_SP);
2071     case UXTW: return aarch64_get_reg_u32 (cpu, lo, NO_SP);
2072     case UXTX: return aarch64_get_reg_u64 (cpu, lo, NO_SP);
2073     case SXTB: return aarch64_get_reg_s8  (cpu, lo, NO_SP);
2074     case SXTH: return aarch64_get_reg_s16 (cpu, lo, NO_SP);
2075     case SXTW: return aarch64_get_reg_s32 (cpu, lo, NO_SP);
2076     case SXTX:
2077     default:   return aarch64_get_reg_s64 (cpu, lo, NO_SP);
2078     }
2079 }
2080
/* Arithmetic extending register
   These allow an optional sign or zero extension of some portion of
   the second source register followed by an optional left shift of
   between 0 and 4 bits.

   N.B. output (dest) and first input arg (source) may normally be Xn
   or SP.  However, for flag setting operations dest can only be
   Xn.  Second input registers are always Xn.  */
2089
2090 /* 32 bit ADD extending register.  */
2091 static void
2092 add32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2093 {
2094   unsigned rm = INSTR (20, 16);
2095   unsigned rn = INSTR (9, 5);
2096   unsigned rd = INSTR (4, 0);
2097
2098   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2099   aarch64_set_reg_u64 (cpu, rd, SP_OK,
2100                        aarch64_get_reg_u32 (cpu, rn, SP_OK)
2101                        + (extreg32 (cpu, rm, extension) << shift));
2102 }
2103
2104 /* 64 bit ADD extending register.
2105    N.B. This subsumes the case with 64 bit source2 and UXTX #n or LSL #0.  */
2106 static void
2107 add64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2108 {
2109   unsigned rm = INSTR (20, 16);
2110   unsigned rn = INSTR (9, 5);
2111   unsigned rd = INSTR (4, 0);
2112
2113   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2114   aarch64_set_reg_u64 (cpu, rd, SP_OK,
2115                        aarch64_get_reg_u64 (cpu, rn, SP_OK)
2116                        + (extreg64 (cpu, rm, extension) << shift));
2117 }
2118
2119 /* 32 bit ADD extending register setting flags.  */
2120 static void
2121 adds32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2122 {
2123   unsigned rm = INSTR (20, 16);
2124   unsigned rn = INSTR (9, 5);
2125   unsigned rd = INSTR (4, 0);
2126
2127   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
2128   uint32_t value2 = extreg32 (cpu, rm, extension) << shift;
2129
2130   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2131   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
2132   set_flags_for_add32 (cpu, value1, value2);
2133 }
2134
2135 /* 64 bit ADD extending register setting flags  */
2136 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0  */
2137 static void
2138 adds64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2139 {
2140   unsigned rm = INSTR (20, 16);
2141   unsigned rn = INSTR (9, 5);
2142   unsigned rd = INSTR (4, 0);
2143
2144   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
2145   uint64_t value2 = extreg64 (cpu, rm, extension) << shift;
2146
2147   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2148   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
2149   set_flags_for_add64 (cpu, value1, value2);
2150 }
2151
2152 /* 32 bit SUB extending register.  */
2153 static void
2154 sub32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2155 {
2156   unsigned rm = INSTR (20, 16);
2157   unsigned rn = INSTR (9, 5);
2158   unsigned rd = INSTR (4, 0);
2159
2160   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2161   aarch64_set_reg_u64 (cpu, rd, SP_OK,
2162                        aarch64_get_reg_u32 (cpu, rn, SP_OK)
2163                        - (extreg32 (cpu, rm, extension) << shift));
2164 }
2165
2166 /* 64 bit SUB extending register.  */
2167 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0.  */
2168 static void
2169 sub64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2170 {
2171   unsigned rm = INSTR (20, 16);
2172   unsigned rn = INSTR (9, 5);
2173   unsigned rd = INSTR (4, 0);
2174
2175   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2176   aarch64_set_reg_u64 (cpu, rd, SP_OK,
2177                        aarch64_get_reg_u64 (cpu, rn, SP_OK)
2178                        - (extreg64 (cpu, rm, extension) << shift));
2179 }
2180
2181 /* 32 bit SUB extending register setting flags.  */
2182 static void
2183 subs32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2184 {
2185   unsigned rm = INSTR (20, 16);
2186   unsigned rn = INSTR (9, 5);
2187   unsigned rd = INSTR (4, 0);
2188
2189   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
2190   uint32_t value2 = extreg32 (cpu, rm, extension) << shift;
2191
2192   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2193   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2194   set_flags_for_sub32 (cpu, value1, value2);
2195 }
2196
2197 /* 64 bit SUB extending register setting flags  */
2198 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0  */
2199 static void
2200 subs64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2201 {
2202   unsigned rm = INSTR (20, 16);
2203   unsigned rn = INSTR (9, 5);
2204   unsigned rd = INSTR (4, 0);
2205
2206   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
2207   uint64_t value2 = extreg64 (cpu, rm, extension) << shift;
2208
2209   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2210   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2211   set_flags_for_sub64 (cpu, value1, value2);
2212 }
2213
2214 static void
2215 dexAddSubtractImmediate (sim_cpu *cpu)
2216 {
2217   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2218      instr[30]    = op : 0 ==> ADD, 1 ==> SUB
2219      instr[29]    = set : 0 ==> no flags, 1 ==> set flags
2220      instr[28,24] = 10001
2221      instr[23,22] = shift : 00 == LSL#0, 01 = LSL#12 1x = UNALLOC
2222      instr[21,10] = uimm12
2223      instr[9,5]   = Rn
2224      instr[4,0]   = Rd  */
2225
2226   /* N.B. the shift is applied at decode before calling the add/sub routine.  */
2227   uint32_t shift = INSTR (23, 22);
2228   uint32_t imm = INSTR (21, 10);
2229   uint32_t dispatch = INSTR (31, 29);
2230
2231   NYI_assert (28, 24, 0x11);
2232
2233   if (shift > 1)
2234     HALT_UNALLOC;
2235
2236   if (shift)
2237     imm <<= 12;
2238
2239   switch (dispatch)
2240     {
2241     case 0: add32 (cpu, imm); break;
2242     case 1: adds32 (cpu, imm); break;
2243     case 2: sub32 (cpu, imm); break;
2244     case 3: subs32 (cpu, imm); break;
2245     case 4: add64 (cpu, imm); break;
2246     case 5: adds64 (cpu, imm); break;
2247     case 6: sub64 (cpu, imm); break;
2248     case 7: subs64 (cpu, imm); break;
2249     }
2250 }
2251
2252 static void
2253 dexAddSubtractShiftedRegister (sim_cpu *cpu)
2254 {
2255   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2256      instr[30,29] = op : 00 ==> ADD, 01 ==> ADDS, 10 ==> SUB, 11 ==> SUBS
2257      instr[28,24] = 01011
2258      instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> UNALLOC
2259      instr[21]    = 0
2260      instr[20,16] = Rm
2261      instr[15,10] = count : must be 0xxxxx for 32 bit
2262      instr[9,5]   = Rn
2263      instr[4,0]   = Rd  */
2264
2265   uint32_t size = INSTR (31, 31);
2266   uint32_t count = INSTR (15, 10);
2267   Shift shiftType = INSTR (23, 22);
2268
2269   NYI_assert (28, 24, 0x0B);
2270   NYI_assert (21, 21, 0);
2271
2272   /* Shift encoded as ROR is unallocated.  */
2273   if (shiftType == ROR)
2274     HALT_UNALLOC;
2275
2276   /* 32 bit operations must have count[5] = 0
2277      or else we have an UNALLOC.  */
2278   if (size == 0 && uimm (count, 5, 5))
2279     HALT_UNALLOC;
2280
2281   /* Dispatch on size:op i.e instr [31,29].  */
2282   switch (INSTR (31, 29))
2283     {
2284     case 0: add32_shift  (cpu, shiftType, count); break;
2285     case 1: adds32_shift (cpu, shiftType, count); break;
2286     case 2: sub32_shift  (cpu, shiftType, count); break;
2287     case 3: subs32_shift (cpu, shiftType, count); break;
2288     case 4: add64_shift  (cpu, shiftType, count); break;
2289     case 5: adds64_shift (cpu, shiftType, count); break;
2290     case 6: sub64_shift  (cpu, shiftType, count); break;
2291     case 7: subs64_shift (cpu, shiftType, count); break;
2292     }
2293 }
2294
2295 static void
2296 dexAddSubtractExtendedRegister (sim_cpu *cpu)
2297 {
2298   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2299      instr[30]    = op : 0 ==> ADD, 1 ==> SUB
2300      instr[29]    = set? : 0 ==> no flags, 1 ==> set flags
2301      instr[28,24] = 01011
2302      instr[23,22] = opt : 0 ==> ok, 1,2,3 ==> UNALLOC
2303      instr[21]    = 1
2304      instr[20,16] = Rm
2305      instr[15,13] = option : 000 ==> UXTB, 001 ==> UXTH,
2306                              000 ==> LSL|UXTW, 001 ==> UXTZ,
2307                              000 ==> SXTB, 001 ==> SXTH,
2308                              000 ==> SXTW, 001 ==> SXTX,
2309      instr[12,10] = shift : 0,1,2,3,4 ==> ok, 5,6,7 ==> UNALLOC
2310      instr[9,5]   = Rn
2311      instr[4,0]   = Rd  */
2312
2313   Extension extensionType = INSTR (15, 13);
2314   uint32_t shift = INSTR (12, 10);
2315
2316   NYI_assert (28, 24, 0x0B);
2317   NYI_assert (21, 21, 1);
2318
2319   /* Shift may not exceed 4.  */
2320   if (shift > 4)
2321     HALT_UNALLOC;
2322
2323   /* Dispatch on size:op:set?.  */
2324   switch (INSTR (31, 29))
2325     {
2326     case 0: add32_ext  (cpu, extensionType, shift); break;
2327     case 1: adds32_ext (cpu, extensionType, shift); break;
2328     case 2: sub32_ext  (cpu, extensionType, shift); break;
2329     case 3: subs32_ext (cpu, extensionType, shift); break;
2330     case 4: add64_ext  (cpu, extensionType, shift); break;
2331     case 5: adds64_ext (cpu, extensionType, shift); break;
2332     case 6: sub64_ext  (cpu, extensionType, shift); break;
2333     case 7: subs64_ext (cpu, extensionType, shift); break;
2334     }
2335 }
2336
2337 /* Conditional data processing
2338    Condition register is implicit 3rd source.  */
2339
2340 /* 32 bit add with carry.  */
2341 /* N.B register args may not be SP.  */
2342
2343 static void
2344 adc32 (sim_cpu *cpu)
2345 {
2346   unsigned rm = INSTR (20, 16);
2347   unsigned rn = INSTR (9, 5);
2348   unsigned rd = INSTR (4, 0);
2349
2350   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2351   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2352                        aarch64_get_reg_u32 (cpu, rn, NO_SP)
2353                        + aarch64_get_reg_u32 (cpu, rm, NO_SP)
2354                        + IS_SET (C));
2355 }
2356
2357 /* 64 bit add with carry  */
2358 static void
2359 adc64 (sim_cpu *cpu)
2360 {
2361   unsigned rm = INSTR (20, 16);
2362   unsigned rn = INSTR (9, 5);
2363   unsigned rd = INSTR (4, 0);
2364
2365   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2366   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2367                        aarch64_get_reg_u64 (cpu, rn, NO_SP)
2368                        + aarch64_get_reg_u64 (cpu, rm, NO_SP)
2369                        + IS_SET (C));
2370 }
2371
2372 /* 32 bit add with carry setting flags.  */
2373 static void
2374 adcs32 (sim_cpu *cpu)
2375 {
2376   unsigned rm = INSTR (20, 16);
2377   unsigned rn = INSTR (9, 5);
2378   unsigned rd = INSTR (4, 0);
2379
2380   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2381   uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
2382   uint32_t carry = IS_SET (C);
2383
2384   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2385   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2 + carry);
2386   set_flags_for_add32 (cpu, value1, value2 + carry);
2387 }
2388
2389 /* 64 bit add with carry setting flags.  */
2390 static void
2391 adcs64 (sim_cpu *cpu)
2392 {
2393   unsigned rm = INSTR (20, 16);
2394   unsigned rn = INSTR (9, 5);
2395   unsigned rd = INSTR (4, 0);
2396
2397   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2398   uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
2399   uint64_t carry = IS_SET (C);
2400
2401   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2402   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2 + carry);
2403   set_flags_for_add64 (cpu, value1, value2 + carry);
2404 }
2405
2406 /* 32 bit sub with carry.  */
2407 static void
2408 sbc32 (sim_cpu *cpu)
2409 {
2410   unsigned rm = INSTR (20, 16);
2411   unsigned rn = INSTR (9, 5); /* ngc iff rn == 31.  */
2412   unsigned rd = INSTR (4, 0);
2413
2414   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2415   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2416                        aarch64_get_reg_u32 (cpu, rn, NO_SP)
2417                        - aarch64_get_reg_u32 (cpu, rm, NO_SP)
2418                        - 1 + IS_SET (C));
2419 }
2420
2421 /* 64 bit sub with carry  */
2422 static void
2423 sbc64 (sim_cpu *cpu)
2424 {
2425   unsigned rm = INSTR (20, 16);
2426   unsigned rn = INSTR (9, 5);
2427   unsigned rd = INSTR (4, 0);
2428
2429   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2430   aarch64_set_reg_u64 (cpu, rd, NO_SP,
2431                        aarch64_get_reg_u64 (cpu, rn, NO_SP)
2432                        - aarch64_get_reg_u64 (cpu, rm, NO_SP)
2433                        - 1 + IS_SET (C));
2434 }
2435
2436 /* 32 bit sub with carry setting flags  */
2437 static void
2438 sbcs32 (sim_cpu *cpu)
2439 {
2440   unsigned rm = INSTR (20, 16);
2441   unsigned rn = INSTR (9, 5);
2442   unsigned rd = INSTR (4, 0);
2443
2444   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2445   uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
2446   uint32_t carry  = IS_SET (C);
2447   uint32_t result = value1 - value2 + 1 - carry;
2448
2449   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2450   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
2451   set_flags_for_sub32 (cpu, value1, value2 + 1 - carry);
2452 }
2453
2454 /* 64 bit sub with carry setting flags  */
2455 static void
2456 sbcs64 (sim_cpu *cpu)
2457 {
2458   unsigned rm = INSTR (20, 16);
2459   unsigned rn = INSTR (9, 5);
2460   unsigned rd = INSTR (4, 0);
2461
2462   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2463   uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
2464   uint64_t carry  = IS_SET (C);
2465   uint64_t result = value1 - value2 + 1 - carry;
2466
2467   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2468   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
2469   set_flags_for_sub64 (cpu, value1, value2 + 1 - carry);
2470 }
2471
2472 static void
2473 dexAddSubtractWithCarry (sim_cpu *cpu)
2474 {
2475   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2476      instr[30]    = op : 0 ==> ADC, 1 ==> SBC
2477      instr[29]    = set? : 0 ==> no flags, 1 ==> set flags
2478      instr[28,21] = 1 1010 000
2479      instr[20,16] = Rm
2480      instr[15,10] = op2 : 00000 ==> ok, ow ==> UNALLOC
2481      instr[9,5]   = Rn
2482      instr[4,0]   = Rd  */
2483
2484   uint32_t op2 = INSTR (15, 10);
2485
2486   NYI_assert (28, 21, 0xD0);
2487
2488   if (op2 != 0)
2489     HALT_UNALLOC;
2490
2491   /* Dispatch on size:op:set?.  */
2492   switch (INSTR (31, 29))
2493     {
2494     case 0: adc32 (cpu); break;
2495     case 1: adcs32 (cpu); break;
2496     case 2: sbc32 (cpu); break;
2497     case 3: sbcs32 (cpu); break;
2498     case 4: adc64 (cpu); break;
2499     case 5: adcs64 (cpu); break;
2500     case 6: sbc64 (cpu); break;
2501     case 7: sbcs64 (cpu); break;
2502     }
2503 }
2504
2505 static uint32_t
2506 testConditionCode (sim_cpu *cpu, CondCode cc)
2507 {
2508   /* This should be reduceable to branchless logic
2509      by some careful testing of bits in CC followed
2510      by the requisite masking and combining of bits
2511      from the flag register.
2512
2513      For now we do it with a switch.  */
2514   int res;
2515
2516   switch (cc)
2517     {
2518     case EQ:  res = IS_SET (Z);    break;
2519     case NE:  res = IS_CLEAR (Z);  break;
2520     case CS:  res = IS_SET (C);    break;
2521     case CC:  res = IS_CLEAR (C);  break;
2522     case MI:  res = IS_SET (N);    break;
2523     case PL:  res = IS_CLEAR (N);  break;
2524     case VS:  res = IS_SET (V);    break;
2525     case VC:  res = IS_CLEAR (V);  break;
2526     case HI:  res = IS_SET (C) && IS_CLEAR (Z);  break;
2527     case LS:  res = IS_CLEAR (C) || IS_SET (Z);  break;
2528     case GE:  res = IS_SET (N) == IS_SET (V);    break;
2529     case LT:  res = IS_SET (N) != IS_SET (V);    break;
2530     case GT:  res = IS_CLEAR (Z) && (IS_SET (N) == IS_SET (V));  break;
2531     case LE:  res = IS_SET (Z) || (IS_SET (N) != IS_SET (V));    break;
2532     case AL:
2533     case NV:
2534     default:
2535       res = 1;
2536       break;
2537     }
2538   return res;
2539 }
2540
2541 static void
2542 CondCompare (sim_cpu *cpu) /* aka: ccmp and ccmn  */
2543 {
2544   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
2545      instr[30]    = compare with positive (1) or negative value (0)
2546      instr[29,21] = 1 1101 0010
2547      instr[20,16] = Rm or const
2548      instr[15,12] = cond
2549      instr[11]    = compare reg (0) or const (1)
2550      instr[10]    = 0
2551      instr[9,5]   = Rn
2552      instr[4]     = 0
2553      instr[3,0]   = value for CPSR bits if the comparison does not take place.  */
2554   signed int negate;
2555   unsigned rm;
2556   unsigned rn;
2557
2558   NYI_assert (29, 21, 0x1d2);
2559   NYI_assert (10, 10, 0);
2560   NYI_assert (4, 4, 0);
2561
2562   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2563   if (! testConditionCode (cpu, INSTR (15, 12)))
2564     {
2565       aarch64_set_CPSR (cpu, INSTR (3, 0));
2566       return;
2567     }
2568
2569   negate = INSTR (30, 30) ? 1 : -1;
2570   rm = INSTR (20, 16);
2571   rn = INSTR ( 9,  5);
2572
2573   if (INSTR (31, 31))
2574     {
2575       if (INSTR (11, 11))
2576         set_flags_for_sub64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK),
2577                              negate * (uint64_t) rm);
2578       else
2579         set_flags_for_sub64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK),
2580                              negate * aarch64_get_reg_u64 (cpu, rm, SP_OK));
2581     }
2582   else
2583     {
2584       if (INSTR (11, 11))
2585         set_flags_for_sub32 (cpu, aarch64_get_reg_u32 (cpu, rn, SP_OK),
2586                              negate * rm);
2587       else
2588         set_flags_for_sub32 (cpu, aarch64_get_reg_u32 (cpu, rn, SP_OK),
2589                              negate * aarch64_get_reg_u32 (cpu, rm, SP_OK));
2590     }
2591 }
2592
2593 static void
2594 do_vec_MOV_whole_vector (sim_cpu *cpu)
2595 {
2596   /* MOV Vd.T, Vs.T  (alias for ORR Vd.T, Vn.T, Vm.T where Vn == Vm)
2597
2598      instr[31]    = 0
2599      instr[30]    = half(0)/full(1)
2600      instr[29,21] = 001110101
2601      instr[20,16] = Vs
2602      instr[15,10] = 000111
2603      instr[9,5]   = Vs
2604      instr[4,0]   = Vd  */
2605
2606   unsigned vs = INSTR (9, 5);
2607   unsigned vd = INSTR (4, 0);
2608
2609   NYI_assert (29, 21, 0x075);
2610   NYI_assert (15, 10, 0x07);
2611
2612   if (INSTR (20, 16) != vs)
2613     HALT_NYI;
2614
2615   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2616   if (INSTR (30, 30))
2617     aarch64_set_vec_u64 (cpu, vd, 1, aarch64_get_vec_u64 (cpu, vs, 1));
2618
2619   aarch64_set_vec_u64 (cpu, vd, 0, aarch64_get_vec_u64 (cpu, vs, 0));
2620 }
2621
2622 static void
2623 do_vec_MOV_into_scalar (sim_cpu *cpu)
2624 {
2625   /* instr[31]    = 0
2626      instr[30]    = word(0)/long(1)
2627      instr[29,21] = 00 1110 000
2628      instr[20,18] = element size and index
2629      instr[17,10] = 00 0011 11
2630      instr[9,5]   = V source
2631      instr[4,0]   = R dest  */
2632
2633   unsigned vs = INSTR (9, 5);
2634   unsigned rd = INSTR (4, 0);
2635
2636   NYI_assert (29, 21, 0x070);
2637   NYI_assert (17, 10, 0x0F);
2638
2639   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2640   switch (INSTR (20, 18))
2641     {
2642     case 0x2:
2643       aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, vs, 0));
2644       break;
2645
2646     case 0x6:
2647       aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, vs, 1));
2648       break;
2649
2650     case 0x1:
2651     case 0x3:
2652     case 0x5:
2653     case 0x7:
2654       aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u32
2655                            (cpu, vs, INSTR (20, 19)));
2656       break;
2657
2658     default:
2659       HALT_NYI;
2660     }
2661 }
2662
2663 static void
2664 do_vec_INS (sim_cpu *cpu)
2665 {
2666   /* instr[31,21] = 01001110000
2667      instr[20,16] = element size and index
2668      instr[15,10] = 000111
2669      instr[9,5]   = W source
2670      instr[4,0]   = V dest  */
2671
2672   int index;
2673   unsigned rs = INSTR (9, 5);
2674   unsigned vd = INSTR (4, 0);
2675
2676   NYI_assert (31, 21, 0x270);
2677   NYI_assert (15, 10, 0x07);
2678
2679   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2680   if (INSTR (16, 16))
2681     {
2682       index = INSTR (20, 17);
2683       aarch64_set_vec_u8 (cpu, vd, index,
2684                           aarch64_get_reg_u8 (cpu, rs, NO_SP));
2685     }
2686   else if (INSTR (17, 17))
2687     {
2688       index = INSTR (20, 18);
2689       aarch64_set_vec_u16 (cpu, vd, index,
2690                            aarch64_get_reg_u16 (cpu, rs, NO_SP));
2691     }
2692   else if (INSTR (18, 18))
2693     {
2694       index = INSTR (20, 19);
2695       aarch64_set_vec_u32 (cpu, vd, index,
2696                            aarch64_get_reg_u32 (cpu, rs, NO_SP));
2697     }
2698   else if (INSTR (19, 19))
2699     {
2700       index = INSTR (20, 20);
2701       aarch64_set_vec_u64 (cpu, vd, index,
2702                            aarch64_get_reg_u64 (cpu, rs, NO_SP));
2703     }
2704   else
2705     HALT_NYI;
2706 }
2707
2708 static void
2709 do_vec_DUP_vector_into_vector (sim_cpu *cpu)
2710 {
2711   /* instr[31]    = 0
2712      instr[30]    = half(0)/full(1)
2713      instr[29,21] = 00 1110 000
2714      instr[20,16] = element size and index
2715      instr[15,10] = 0000 01
2716      instr[9,5]   = V source
2717      instr[4,0]   = V dest.  */
2718
2719   unsigned full = INSTR (30, 30);
2720   unsigned vs = INSTR (9, 5);
2721   unsigned vd = INSTR (4, 0);
2722   int i, index;
2723
2724   NYI_assert (29, 21, 0x070);
2725   NYI_assert (15, 10, 0x01);
2726
2727   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2728   if (INSTR (16, 16))
2729     {
2730       index = INSTR (20, 17);
2731
2732       for (i = 0; i < (full ? 16 : 8); i++)
2733         aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vs, index));
2734     }
2735   else if (INSTR (17, 17))
2736     {
2737       index = INSTR (20, 18);
2738
2739       for (i = 0; i < (full ? 8 : 4); i++)
2740         aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vs, index));
2741     }
2742   else if (INSTR (18, 18))
2743     {
2744       index = INSTR (20, 19);
2745
2746       for (i = 0; i < (full ? 4 : 2); i++)
2747         aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vs, index));
2748     }
2749   else
2750     {
2751       if (INSTR (19, 19) == 0)
2752         HALT_UNALLOC;
2753
2754       if (! full)
2755         HALT_UNALLOC;
2756
2757       index = INSTR (20, 20);
2758
2759       for (i = 0; i < 2; i++)
2760         aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vs, index));
2761     }
2762 }
2763
2764 static void
2765 do_vec_TBL (sim_cpu *cpu)
2766 {
2767   /* instr[31]    = 0
2768      instr[30]    = half(0)/full(1)
2769      instr[29,21] = 00 1110 000
2770      instr[20,16] = Vm
2771      instr[15]    = 0
2772      instr[14,13] = vec length
2773      instr[12,10] = 000
2774      instr[9,5]   = V start
2775      instr[4,0]   = V dest  */
2776
2777   int full    = INSTR (30, 30);
2778   int len     = INSTR (14, 13) + 1;
2779   unsigned vm = INSTR (20, 16);
2780   unsigned vn = INSTR (9, 5);
2781   unsigned vd = INSTR (4, 0);
2782   unsigned i;
2783
2784   NYI_assert (29, 21, 0x070);
2785   NYI_assert (12, 10, 0);
2786
2787   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2788   for (i = 0; i < (full ? 16 : 8); i++)
2789     {
2790       unsigned int selector = aarch64_get_vec_u8 (cpu, vm, i);
2791       uint8_t val;
2792
2793       if (selector < 16)
2794         val = aarch64_get_vec_u8 (cpu, vn, selector);
2795       else if (selector < 32)
2796         val = len < 2 ? 0 : aarch64_get_vec_u8 (cpu, vn + 1, selector - 16);
2797       else if (selector < 48)
2798         val = len < 3 ? 0 : aarch64_get_vec_u8 (cpu, vn + 2, selector - 32);
2799       else if (selector < 64)
2800         val = len < 4 ? 0 : aarch64_get_vec_u8 (cpu, vn + 3, selector - 48);
2801       else
2802         val = 0;
2803
2804       aarch64_set_vec_u8 (cpu, vd, i, val);
2805     }
2806 }
2807
2808 static void
2809 do_vec_TRN (sim_cpu *cpu)
2810 {
2811   /* instr[31]    = 0
2812      instr[30]    = half(0)/full(1)
2813      instr[29,24] = 00 1110
2814      instr[23,22] = size
2815      instr[21]    = 0
2816      instr[20,16] = Vm
2817      instr[15]    = 0
2818      instr[14]    = TRN1 (0) / TRN2 (1)
2819      instr[13,10] = 1010
2820      instr[9,5]   = V source
2821      instr[4,0]   = V dest.  */
2822
2823   int full    = INSTR (30, 30);
2824   int second  = INSTR (14, 14);
2825   unsigned vm = INSTR (20, 16);
2826   unsigned vn = INSTR (9, 5);
2827   unsigned vd = INSTR (4, 0);
2828   unsigned i;
2829
2830   NYI_assert (29, 24, 0x0E);
2831   NYI_assert (13, 10, 0xA);
2832
2833   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2834   switch (INSTR (23, 22))
2835     {
2836     case 0:
2837       for (i = 0; i < (full ? 8 : 4); i++)
2838         {
2839           aarch64_set_vec_u8
2840             (cpu, vd, i * 2,
2841              aarch64_get_vec_u8 (cpu, second ? vm : vn, i * 2));
2842           aarch64_set_vec_u8
2843             (cpu, vd, 1 * 2 + 1,
2844              aarch64_get_vec_u8 (cpu, second ? vn : vm, i * 2 + 1));
2845         }
2846       break;
2847
2848     case 1:
2849       for (i = 0; i < (full ? 4 : 2); i++)
2850         {
2851           aarch64_set_vec_u16
2852             (cpu, vd, i * 2,
2853              aarch64_get_vec_u16 (cpu, second ? vm : vn, i * 2));
2854           aarch64_set_vec_u16
2855             (cpu, vd, 1 * 2 + 1,
2856              aarch64_get_vec_u16 (cpu, second ? vn : vm, i * 2 + 1));
2857         }
2858       break;
2859
2860     case 2:
2861       aarch64_set_vec_u32
2862         (cpu, vd, 0, aarch64_get_vec_u32 (cpu, second ? vm : vn, 0));
2863       aarch64_set_vec_u32
2864         (cpu, vd, 1, aarch64_get_vec_u32 (cpu, second ? vn : vm, 1));
2865       aarch64_set_vec_u32
2866         (cpu, vd, 2, aarch64_get_vec_u32 (cpu, second ? vm : vn, 2));
2867       aarch64_set_vec_u32
2868         (cpu, vd, 3, aarch64_get_vec_u32 (cpu, second ? vn : vm, 3));
2869       break;
2870
2871     case 3:
2872       if (! full)
2873         HALT_UNALLOC;
2874
2875       aarch64_set_vec_u64 (cpu, vd, 0,
2876                            aarch64_get_vec_u64 (cpu, second ? vm : vn, 0));
2877       aarch64_set_vec_u64 (cpu, vd, 1,
2878                            aarch64_get_vec_u64 (cpu, second ? vn : vm, 1));
2879       break;
2880     }
2881 }
2882
2883 static void
2884 do_vec_DUP_scalar_into_vector (sim_cpu *cpu)
2885 {
2886   /* instr[31]    = 0
2887      instr[30]    = 0=> zero top 64-bits, 1=> duplicate into top 64-bits
2888                     [must be 1 for 64-bit xfer]
2889      instr[29,20] = 00 1110 0000
2890      instr[19,16] = element size: 0001=> 8-bits, 0010=> 16-bits,
2891                                   0100=> 32-bits. 1000=>64-bits
2892      instr[15,10] = 0000 11
2893      instr[9,5]   = W source
2894      instr[4,0]   = V dest.  */
2895
2896   unsigned i;
2897   unsigned Vd = INSTR (4, 0);
2898   unsigned Rs = INSTR (9, 5);
2899   int both    = INSTR (30, 30);
2900
2901   NYI_assert (29, 20, 0x0E0);
2902   NYI_assert (15, 10, 0x03);
2903
2904   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2905   switch (INSTR (19, 16))
2906     {
2907     case 1:
2908       for (i = 0; i < (both ? 16 : 8); i++)
2909         aarch64_set_vec_u8 (cpu, Vd, i, aarch64_get_reg_u8 (cpu, Rs, NO_SP));
2910       break;
2911
2912     case 2:
2913       for (i = 0; i < (both ? 8 : 4); i++)
2914         aarch64_set_vec_u16 (cpu, Vd, i, aarch64_get_reg_u16 (cpu, Rs, NO_SP));
2915       break;
2916
2917     case 4:
2918       for (i = 0; i < (both ? 4 : 2); i++)
2919         aarch64_set_vec_u32 (cpu, Vd, i, aarch64_get_reg_u32 (cpu, Rs, NO_SP));
2920       break;
2921
2922     case 8:
2923       if (!both)
2924         HALT_NYI;
2925       aarch64_set_vec_u64 (cpu, Vd, 0, aarch64_get_reg_u64 (cpu, Rs, NO_SP));
2926       aarch64_set_vec_u64 (cpu, Vd, 1, aarch64_get_reg_u64 (cpu, Rs, NO_SP));
2927       break;
2928
2929     default:
2930       HALT_NYI;
2931     }
2932 }
2933
2934 static void
2935 do_vec_UZP (sim_cpu *cpu)
2936 {
2937   /* instr[31]    = 0
2938      instr[30]    = half(0)/full(1)
2939      instr[29,24] = 00 1110
2940      instr[23,22] = size: byte(00), half(01), word (10), long (11)
2941      instr[21]    = 0
2942      instr[20,16] = Vm
2943      instr[15]    = 0
2944      instr[14]    = lower (0) / upper (1)
2945      instr[13,10] = 0110
2946      instr[9,5]   = Vn
2947      instr[4,0]   = Vd.  */
2948
2949   int full = INSTR (30, 30);
2950   int upper = INSTR (14, 14);
2951
2952   unsigned vm = INSTR (20, 16);
2953   unsigned vn = INSTR (9, 5);
2954   unsigned vd = INSTR (4, 0);
2955
2956   uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0);
2957   uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1);
2958   uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0);
2959   uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1);
2960
2961   uint64_t val1 = 0;
2962   uint64_t val2 = 0;
2963
2964   uint64_t input1 = upper ? val_n1 : val_m1;
2965   uint64_t input2 = upper ? val_n2 : val_m2;
2966   unsigned i;
2967
2968   NYI_assert (29, 24, 0x0E);
2969   NYI_assert (21, 21, 0);
2970   NYI_assert (15, 15, 0);
2971   NYI_assert (13, 10, 6);
2972
2973   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2974   switch (INSTR (23, 23))
2975     {
2976     case 0:
2977       for (i = 0; i < 8; i++)
2978         {
2979           val1 |= (input1 >> (i * 8)) & (0xFFULL << (i * 8));
2980           val2 |= (input2 >> (i * 8)) & (0xFFULL << (i * 8));
2981         }
2982       break;
2983
2984     case 1:
2985       for (i = 0; i < 4; i++)
2986         {
2987           val1 |= (input1 >> (i * 16)) & (0xFFFFULL << (i * 16));
2988           val2 |= (input2 >> (i * 16)) & (0xFFFFULL << (i * 16));
2989         }
2990       break;
2991
2992     case 2:
2993       val1 = ((input1 & 0xFFFFFFFF) | ((input1 >> 32) & 0xFFFFFFFF00000000ULL));
2994       val2 = ((input2 & 0xFFFFFFFF) | ((input2 >> 32) & 0xFFFFFFFF00000000ULL));
2995
2996     case 3:
2997       val1 = input1;
2998       val2 = input2;
2999            break;
3000     }
3001
3002   aarch64_set_vec_u64 (cpu, vd, 0, val1);
3003   if (full)
3004     aarch64_set_vec_u64 (cpu, vd, 1, val2);
3005 }
3006
3007 static void
3008 do_vec_ZIP (sim_cpu *cpu)
3009 {
3010   /* instr[31]    = 0
3011      instr[30]    = half(0)/full(1)
3012      instr[29,24] = 00 1110
3013      instr[23,22] = size: byte(00), hald(01), word (10), long (11)
3014      instr[21]    = 0
3015      instr[20,16] = Vm
3016      instr[15]    = 0
3017      instr[14]    = lower (0) / upper (1)
3018      instr[13,10] = 1110
3019      instr[9,5]   = Vn
3020      instr[4,0]   = Vd.  */
3021
3022   int full = INSTR (30, 30);
3023   int upper = INSTR (14, 14);
3024
3025   unsigned vm = INSTR (20, 16);
3026   unsigned vn = INSTR (9, 5);
3027   unsigned vd = INSTR (4, 0);
3028
3029   uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0);
3030   uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1);
3031   uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0);
3032   uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1);
3033
3034   uint64_t val1 = 0;
3035   uint64_t val2 = 0;
3036
3037   uint64_t input1 = upper ? val_n1 : val_m1;
3038   uint64_t input2 = upper ? val_n2 : val_m2;
3039
3040   NYI_assert (29, 24, 0x0E);
3041   NYI_assert (21, 21, 0);
3042   NYI_assert (15, 15, 0);
3043   NYI_assert (13, 10, 0xE);
3044
3045   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3046   switch (INSTR (23, 23))
3047     {
3048     case 0:
3049       val1 =
3050           ((input1 <<  0) & (0xFF    <<  0))
3051         | ((input2 <<  8) & (0xFF    <<  8))
3052         | ((input1 <<  8) & (0xFF    << 16))
3053         | ((input2 << 16) & (0xFF    << 24))
3054         | ((input1 << 16) & (0xFFULL << 32))
3055         | ((input2 << 24) & (0xFFULL << 40))
3056         | ((input1 << 24) & (0xFFULL << 48))
3057         | ((input2 << 32) & (0xFFULL << 56));
3058
3059       val2 =
3060           ((input1 >> 32) & (0xFF    <<  0))
3061         | ((input2 >> 24) & (0xFF    <<  8))
3062         | ((input1 >> 24) & (0xFF    << 16))
3063         | ((input2 >> 16) & (0xFF    << 24))
3064         | ((input1 >> 16) & (0xFFULL << 32))
3065         | ((input2 >>  8) & (0xFFULL << 40))
3066         | ((input1 >>  8) & (0xFFULL << 48))
3067         | ((input2 >>  0) & (0xFFULL << 56));
3068       break;
3069
3070     case 1:
3071       val1 =
3072           ((input1 <<  0) & (0xFFFF    <<  0))
3073         | ((input2 << 16) & (0xFFFF    << 16))
3074         | ((input1 << 16) & (0xFFFFULL << 32))
3075         | ((input2 << 32) & (0xFFFFULL << 48));
3076
3077       val2 =
3078           ((input1 >> 32) & (0xFFFF    <<  0))
3079         | ((input2 >> 16) & (0xFFFF    << 16))
3080         | ((input1 >> 16) & (0xFFFFULL << 32))
3081         | ((input2 >>  0) & (0xFFFFULL << 48));
3082       break;
3083
3084     case 2:
3085       val1 = (input1 & 0xFFFFFFFFULL) | (input2 << 32);
3086       val2 = (input2 & 0xFFFFFFFFULL) | (input1 << 32);
3087       break;
3088
3089     case 3:
3090       val1 = input1;
3091       val2 = input2;
3092       break;
3093     }
3094
3095   aarch64_set_vec_u64 (cpu, vd, 0, val1);
3096   if (full)
3097     aarch64_set_vec_u64 (cpu, vd, 1, val2);
3098 }
3099
3100 /* Floating point immediates are encoded in 8 bits.
3101    fpimm[7] = sign bit.
3102    fpimm[6:4] = signed exponent.
3103    fpimm[3:0] = fraction (assuming leading 1).
3104    i.e. F = s * 1.f * 2^(e - b).  */
3105
static float
fp_immediate_for_encoding_32 (uint32_t imm8)
{
  /* Split the 8-bit encoding into sign, 3-bit exponent and 4-bit
     fraction.  */
  uint32_t sign     = (imm8 >> 7) & 0x1;
  uint32_t exponent = (imm8 >> 4) & 0x7;
  uint32_t fraction = imm8 & 0xf;
  uint32_t steps;

  /* Mantissa with the implicit leading one: (16 + fraction) / 16,
     i.e. a value in [1.0, 2.0).  */
  float value = (16.0 + fraction) / 16.0;

  /* Exponent fields 0..3 scale up by 2^(exponent + 1); fields 4..7
     scale down by 2^(7 - exponent).  */
  if (exponent < 4)
    {
      for (steps = exponent + 1; steps > 0; steps--)
        value *= 2.0;
    }
  else
    {
      for (steps = 7 - exponent; steps > 0; steps--)
        value /= 2.0;
    }

  return sign ? - value : value;
}
3140
static double
fp_immediate_for_encoding_64 (uint32_t imm8)
{
  /* Split the 8-bit encoding into sign, 3-bit exponent and 4-bit
     fraction.  */
  uint32_t sign     = (imm8 >> 7) & 0x1;
  uint32_t exponent = (imm8 >> 4) & 0x7;
  uint32_t fraction = imm8 & 0xf;
  uint32_t steps;

  /* Mantissa with the implicit leading one: (16 + fraction) / 16,
     i.e. a value in [1.0, 2.0).  */
  double value = (16.0 + fraction) / 16.0;

  /* Exponent fields 0..3 scale up by 2^(exponent + 1); fields 4..7
     scale down by 2^(7 - exponent).  */
  if (exponent < 4)
    {
      for (steps = exponent + 1; steps > 0; steps--)
        value *= 2.0;
    }
  else
    {
      for (steps = 7 - exponent; steps > 0; steps--)
        value /= 2.0;
    }

  return sign ? - value : value;
}
3175
3176 static void
3177 do_vec_MOV_immediate (sim_cpu *cpu)
3178 {
3179   /* instr[31]    = 0
3180      instr[30]    = full/half selector
3181      instr[29,19] = 00111100000
3182      instr[18,16] = high 3 bits of uimm8
3183      instr[15,12] = size & shift:
3184                                   0000 => 32-bit
3185                                   0010 => 32-bit + LSL#8
3186                                   0100 => 32-bit + LSL#16
3187                                   0110 => 32-bit + LSL#24
3188                                   1010 => 16-bit + LSL#8
3189                                   1000 => 16-bit
3190                                   1101 => 32-bit + MSL#16
3191                                   1100 => 32-bit + MSL#8
3192                                   1110 => 8-bit
3193                                   1111 => double
3194      instr[11,10] = 01
3195      instr[9,5]   = low 5-bits of uimm8
3196      instr[4,0]   = Vd.  */
3197
3198   int full     = INSTR (30, 30);
3199   unsigned vd  = INSTR (4, 0);
3200   unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5);
3201   unsigned i;
3202
3203   NYI_assert (29, 19, 0x1E0);
3204   NYI_assert (11, 10, 1);
3205
3206   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3207   switch (INSTR (15, 12))
3208     {
3209     case 0x0: /* 32-bit, no shift.  */
3210     case 0x2: /* 32-bit, shift by 8.  */
3211     case 0x4: /* 32-bit, shift by 16.  */
3212     case 0x6: /* 32-bit, shift by 24.  */
3213       val <<= (8 * INSTR (14, 13));
3214       for (i = 0; i < (full ? 4 : 2); i++)
3215         aarch64_set_vec_u32 (cpu, vd, i, val);
3216       break;
3217
3218     case 0xa: /* 16-bit, shift by 8.  */
3219       val <<= 8;
3220       /* Fall through.  */
3221     case 0x8: /* 16-bit, no shift.  */
3222       for (i = 0; i < (full ? 8 : 4); i++)
3223         aarch64_set_vec_u16 (cpu, vd, i, val);
3224       break;
3225
3226     case 0xd: /* 32-bit, mask shift by 16.  */
3227       val <<= 8;
3228       val |= 0xFF;
3229       /* Fall through.  */
3230     case 0xc: /* 32-bit, mask shift by 8. */
3231       val <<= 8;
3232       val |= 0xFF;
3233       for (i = 0; i < (full ? 4 : 2); i++)
3234         aarch64_set_vec_u32 (cpu, vd, i, val);
3235       break;
3236
3237     case 0xe: /* 8-bit, no shift.  */
3238       for (i = 0; i < (full ? 16 : 8); i++)
3239         aarch64_set_vec_u8 (cpu, vd, i, val);
3240       break;
3241
3242     case 0xf: /* FMOV Vs.{2|4}S, #fpimm.  */
3243       {
3244         float u = fp_immediate_for_encoding_32 (val);
3245         for (i = 0; i < (full ? 4 : 2); i++)
3246           aarch64_set_vec_float (cpu, vd, i, u);
3247         break;
3248       }
3249
3250     default:
3251       HALT_NYI;
3252     }
3253 }
3254
3255 static void
3256 do_vec_MVNI (sim_cpu *cpu)
3257 {
3258   /* instr[31]    = 0
3259      instr[30]    = full/half selector
3260      instr[29,19] = 10111100000
3261      instr[18,16] = high 3 bits of uimm8
3262      instr[15,12] = selector
3263      instr[11,10] = 01
3264      instr[9,5]   = low 5-bits of uimm8
3265      instr[4,0]   = Vd.  */
3266
3267   int full     = INSTR (30, 30);
3268   unsigned vd  = INSTR (4, 0);
3269   unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5);
3270   unsigned i;
3271
3272   NYI_assert (29, 19, 0x5E0);
3273   NYI_assert (11, 10, 1);
3274
3275   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3276   switch (INSTR (15, 12))
3277     {
3278     case 0x0: /* 32-bit, no shift.  */
3279     case 0x2: /* 32-bit, shift by 8.  */
3280     case 0x4: /* 32-bit, shift by 16.  */
3281     case 0x6: /* 32-bit, shift by 24.  */
3282       val <<= (8 * INSTR (14, 13));
3283       val = ~ val;
3284       for (i = 0; i < (full ? 4 : 2); i++)
3285         aarch64_set_vec_u32 (cpu, vd, i, val);
3286       return;
3287
3288     case 0xa: /* 16-bit, 8 bit shift. */
3289       val <<= 8;
3290     case 0x8: /* 16-bit, no shift. */
3291       val = ~ val;
3292       for (i = 0; i < (full ? 8 : 4); i++)
3293         aarch64_set_vec_u16 (cpu, vd, i, val);
3294       return;
3295
3296     case 0xd: /* 32-bit, mask shift by 16.  */
3297       val <<= 8;
3298       val |= 0xFF;
3299     case 0xc: /* 32-bit, mask shift by 8. */
3300       val <<= 8;
3301       val |= 0xFF;
3302       val = ~ val;
3303       for (i = 0; i < (full ? 4 : 2); i++)
3304         aarch64_set_vec_u32 (cpu, vd, i, val);
3305       return;
3306
3307     case 0xE: /* MOVI Dn, #mask64 */
3308       {
3309         uint64_t mask = 0;
3310
3311         for (i = 0; i < 8; i++)
3312           if (val & (1 << i))
3313             mask |= (0xFFUL << (i * 8));
3314         aarch64_set_vec_u64 (cpu, vd, 0, mask);
3315         aarch64_set_vec_u64 (cpu, vd, 1, mask);
3316         return;
3317       }
3318
3319     case 0xf: /* FMOV Vd.2D, #fpimm.  */
3320       {
3321         double u = fp_immediate_for_encoding_64 (val);
3322
3323         if (! full)
3324           HALT_UNALLOC;
3325
3326         aarch64_set_vec_double (cpu, vd, 0, u);
3327         aarch64_set_vec_double (cpu, vd, 1, u);
3328         return;
3329       }
3330
3331     default:
3332       HALT_NYI;
3333     }
3334 }
3335
3336 #define ABS(A) ((A) < 0 ? - (A) : (A))
3337
3338 static void
3339 do_vec_ABS (sim_cpu *cpu)
3340 {
3341   /* instr[31]    = 0
3342      instr[30]    = half(0)/full(1)
3343      instr[29,24] = 00 1110
3344      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3345      instr[21,10] = 10 0000 1011 10
3346      instr[9,5]   = Vn
3347      instr[4.0]   = Vd.  */
3348
3349   unsigned vn = INSTR (9, 5);
3350   unsigned vd = INSTR (4, 0);
3351   unsigned full = INSTR (30, 30);
3352   unsigned i;
3353
3354   NYI_assert (29, 24, 0x0E);
3355   NYI_assert (21, 10, 0x82E);
3356
3357   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3358   switch (INSTR (23, 22))
3359     {
3360     case 0:
3361       for (i = 0; i < (full ? 16 : 8); i++)
3362         aarch64_set_vec_s8 (cpu, vd, i,
3363                             ABS (aarch64_get_vec_s8 (cpu, vn, i)));
3364       break;
3365
3366     case 1:
3367       for (i = 0; i < (full ? 8 : 4); i++)
3368         aarch64_set_vec_s16 (cpu, vd, i,
3369                              ABS (aarch64_get_vec_s16 (cpu, vn, i)));
3370       break;
3371
3372     case 2:
3373       for (i = 0; i < (full ? 4 : 2); i++)
3374         aarch64_set_vec_s32 (cpu, vd, i,
3375                              ABS (aarch64_get_vec_s32 (cpu, vn, i)));
3376       break;
3377
3378     case 3:
3379       if (! full)
3380         HALT_NYI;
3381       for (i = 0; i < 2; i++)
3382         aarch64_set_vec_s64 (cpu, vd, i,
3383                              ABS (aarch64_get_vec_s64 (cpu, vn, i)));
3384       break;
3385     }
3386 }
3387
3388 static void
3389 do_vec_ADDV (sim_cpu *cpu)
3390 {
3391   /* instr[31]    = 0
3392      instr[30]    = full/half selector
3393      instr[29,24] = 00 1110
3394      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3395      instr[21,10] = 11 0001 1011 10
3396      instr[9,5]   = Vm
3397      instr[4.0]   = Rd.  */
3398
3399   unsigned vm = INSTR (9, 5);
3400   unsigned rd = INSTR (4, 0);
3401   unsigned i;
3402   uint64_t val = 0;
3403   int      full = INSTR (30, 30);
3404
3405   NYI_assert (29, 24, 0x0E);
3406   NYI_assert (21, 10, 0xC6E);
3407
3408   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3409   switch (INSTR (23, 22))
3410     {
3411     case 0:
3412       for (i = 0; i < (full ? 16 : 8); i++)
3413         val += aarch64_get_vec_u8 (cpu, vm, i);
3414       aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
3415       return;
3416
3417     case 1:
3418       for (i = 0; i < (full ? 8 : 4); i++)
3419         val += aarch64_get_vec_u16 (cpu, vm, i);
3420       aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
3421       return;
3422
3423     case 2:
3424       for (i = 0; i < (full ? 4 : 2); i++)
3425         val += aarch64_get_vec_u32 (cpu, vm, i);
3426       aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
3427       return;
3428
3429     case 3:
3430       if (! full)
3431         HALT_UNALLOC;
3432       val = aarch64_get_vec_u64 (cpu, vm, 0);
3433       val += aarch64_get_vec_u64 (cpu, vm, 1);
3434       aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
3435       return;
3436     }
3437 }
3438
3439 static void
3440 do_vec_ins_2 (sim_cpu *cpu)
3441 {
3442   /* instr[31,21] = 01001110000
3443      instr[20,18] = size & element selector
3444      instr[17,14] = 0000
3445      instr[13]    = direction: to vec(0), from vec (1)
3446      instr[12,10] = 111
3447      instr[9,5]   = Vm
3448      instr[4,0]   = Vd.  */
3449
3450   unsigned elem;
3451   unsigned vm = INSTR (9, 5);
3452   unsigned vd = INSTR (4, 0);
3453
3454   NYI_assert (31, 21, 0x270);
3455   NYI_assert (17, 14, 0);
3456   NYI_assert (12, 10, 7);
3457
3458   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3459   if (INSTR (13, 13) == 1)
3460     {
3461       if (INSTR (18, 18) == 1)
3462         {
3463           /* 32-bit moves.  */
3464           elem = INSTR (20, 19);
3465           aarch64_set_reg_u64 (cpu, vd, NO_SP,
3466                                aarch64_get_vec_u32 (cpu, vm, elem));
3467         }
3468       else
3469         {
3470           /* 64-bit moves.  */
3471           if (INSTR (19, 19) != 1)
3472             HALT_NYI;
3473
3474           elem = INSTR (20, 20);
3475           aarch64_set_reg_u64 (cpu, vd, NO_SP,
3476                                aarch64_get_vec_u64 (cpu, vm, elem));
3477         }
3478     }
3479   else
3480     {
3481       if (INSTR (18, 18) == 1)
3482         {
3483           /* 32-bit moves.  */
3484           elem = INSTR (20, 19);
3485           aarch64_set_vec_u32 (cpu, vd, elem,
3486                                aarch64_get_reg_u32 (cpu, vm, NO_SP));
3487         }
3488       else
3489         {
3490           /* 64-bit moves.  */
3491           if (INSTR (19, 19) != 1)
3492             HALT_NYI;
3493
3494           elem = INSTR (20, 20);
3495           aarch64_set_vec_u64 (cpu, vd, elem,
3496                                aarch64_get_reg_u64 (cpu, vm, NO_SP));
3497         }
3498     }
3499 }
3500
/* Multiply N lanes of vn and vm (starting at lane BIAS) and store the
   products in lanes 0..N-1 of vd.  Sources are read with the READ_TYPE
   accessor into DST_TYPE temporaries and results are stored with the
   WRITE_TYPE accessor.  Every source lane is read before any result is
   written, so vd may be the same register as vn or vm.

   Expects cpu, vn, vm, vd, bias and the loop counter i to be in scope
   at the point of use; i equals N afterwards.  */
#define DO_VEC_WIDENING_MUL(N, DST_TYPE, READ_TYPE, WRITE_TYPE)            \
  do                                                                       \
    {                                                                      \
      DST_TYPE lhs_[N];                                                    \
      DST_TYPE rhs_[N];                                                    \
                                                                           \
      i = 0;                                                               \
      while (i < (N))                                                      \
        {                                                                  \
          lhs_[i] = aarch64_get_vec_##READ_TYPE (cpu, vn, i + bias);       \
          rhs_[i] = aarch64_get_vec_##READ_TYPE (cpu, vm, i + bias);       \
          i++;                                                             \
        }                                                                  \
                                                                           \
      i = 0;                                                               \
      while (i < (N))                                                      \
        {                                                                  \
          aarch64_set_vec_##WRITE_TYPE (cpu, vd, i, lhs_[i] * rhs_[i]);    \
          i++;                                                             \
        }                                                                  \
    }                                                                      \
  while (0)
3515
3516 static void
3517 do_vec_mull (sim_cpu *cpu)
3518 {
3519   /* instr[31]    = 0
3520      instr[30]    = lower(0)/upper(1) selector
3521      instr[29]    = signed(0)/unsigned(1)
3522      instr[28,24] = 0 1110
3523      instr[23,22] = size: 8-bit (00), 16-bit (01), 32-bit (10)
3524      instr[21]    = 1
3525      instr[20,16] = Vm
3526      instr[15,10] = 11 0000
3527      instr[9,5]   = Vn
3528      instr[4.0]   = Vd.  */
3529
3530   int    unsign = INSTR (29, 29);
3531   int    bias = INSTR (30, 30);
3532   unsigned vm = INSTR (20, 16);
3533   unsigned vn = INSTR ( 9,  5);
3534   unsigned vd = INSTR ( 4,  0);
3535   unsigned i;
3536
3537   NYI_assert (28, 24, 0x0E);
3538   NYI_assert (15, 10, 0x30);
3539
3540   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3541   /* NB: Read source values before writing results, in case
3542      the source and destination vectors are the same.  */
3543   switch (INSTR (23, 22))
3544     {
3545     case 0:
3546       if (bias)
3547         bias = 8;
3548       if (unsign)
3549         DO_VEC_WIDENING_MUL (8, uint16_t, u8, u16);
3550       else
3551         DO_VEC_WIDENING_MUL (8, int16_t, s8, s16);
3552       return;
3553
3554     case 1:
3555       if (bias)
3556         bias = 4;
3557       if (unsign)
3558         DO_VEC_WIDENING_MUL (4, uint32_t, u16, u32);
3559       else
3560         DO_VEC_WIDENING_MUL (4, int32_t, s16, s32);
3561       return;
3562
3563     case 2:
3564       if (bias)
3565         bias = 2;
3566       if (unsign)
3567         DO_VEC_WIDENING_MUL (2, uint64_t, u32, u64);
3568       else
3569         DO_VEC_WIDENING_MUL (2, int64_t, s32, s64);
3570       return;
3571
3572     case 3:
3573       HALT_NYI;
3574     }
3575 }
3576
3577 static void
3578 do_vec_fadd (sim_cpu *cpu)
3579 {
3580   /* instr[31]    = 0
3581      instr[30]    = half(0)/full(1)
3582      instr[29,24] = 001110
3583      instr[23]    = FADD(0)/FSUB(1)
3584      instr[22]    = float (0)/double(1)
3585      instr[21]    = 1
3586      instr[20,16] = Vm
3587      instr[15,10] = 110101
3588      instr[9,5]   = Vn
3589      instr[4.0]   = Vd.  */
3590
3591   unsigned vm = INSTR (20, 16);
3592   unsigned vn = INSTR (9, 5);
3593   unsigned vd = INSTR (4, 0);
3594   unsigned i;
3595   int      full = INSTR (30, 30);
3596
3597   NYI_assert (29, 24, 0x0E);
3598   NYI_assert (21, 21, 1);
3599   NYI_assert (15, 10, 0x35);
3600
3601   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3602   if (INSTR (23, 23))
3603     {
3604       if (INSTR (22, 22))
3605         {
3606           if (! full)
3607             HALT_NYI;
3608
3609           for (i = 0; i < 2; i++)
3610             aarch64_set_vec_double (cpu, vd, i,
3611                                     aarch64_get_vec_double (cpu, vn, i)
3612                                     - aarch64_get_vec_double (cpu, vm, i));
3613         }
3614       else
3615         {
3616           for (i = 0; i < (full ? 4 : 2); i++)
3617             aarch64_set_vec_float (cpu, vd, i,
3618                                    aarch64_get_vec_float (cpu, vn, i)
3619                                    - aarch64_get_vec_float (cpu, vm, i));
3620         }
3621     }
3622   else
3623     {
3624       if (INSTR (22, 22))
3625         {
3626           if (! full)
3627             HALT_NYI;
3628
3629           for (i = 0; i < 2; i++)
3630             aarch64_set_vec_double (cpu, vd, i,
3631                                     aarch64_get_vec_double (cpu, vm, i)
3632                                     + aarch64_get_vec_double (cpu, vn, i));
3633         }
3634       else
3635         {
3636           for (i = 0; i < (full ? 4 : 2); i++)
3637             aarch64_set_vec_float (cpu, vd, i,
3638                                    aarch64_get_vec_float (cpu, vm, i)
3639                                    + aarch64_get_vec_float (cpu, vn, i));
3640         }
3641     }
3642 }
3643
3644 static void
3645 do_vec_add (sim_cpu *cpu)
3646 {
3647   /* instr[31]    = 0
3648      instr[30]    = full/half selector
3649      instr[29,24] = 001110
3650      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3651      instr[21]    = 1
3652      instr[20,16] = Vn
3653      instr[15,10] = 100001
3654      instr[9,5]   = Vm
3655      instr[4.0]   = Vd.  */
3656
3657   unsigned vm = INSTR (20, 16);
3658   unsigned vn = INSTR (9, 5);
3659   unsigned vd = INSTR (4, 0);
3660   unsigned i;
3661   int      full = INSTR (30, 30);
3662
3663   NYI_assert (29, 24, 0x0E);
3664   NYI_assert (21, 21, 1);
3665   NYI_assert (15, 10, 0x21);
3666
3667   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3668   switch (INSTR (23, 22))
3669     {
3670     case 0:
3671       for (i = 0; i < (full ? 16 : 8); i++)
3672         aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
3673                             + aarch64_get_vec_u8 (cpu, vm, i));
3674       return;
3675
3676     case 1:
3677       for (i = 0; i < (full ? 8 : 4); i++)
3678         aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
3679                              + aarch64_get_vec_u16 (cpu, vm, i));
3680       return;
3681
3682     case 2:
3683       for (i = 0; i < (full ? 4 : 2); i++)
3684         aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
3685                              + aarch64_get_vec_u32 (cpu, vm, i));
3686       return;
3687
3688     case 3:
3689       if (! full)
3690         HALT_UNALLOC;
3691       aarch64_set_vec_u64 (cpu, vd, 0, aarch64_get_vec_u64 (cpu, vn, 0)
3692                            + aarch64_get_vec_u64 (cpu, vm, 0));
3693       aarch64_set_vec_u64 (cpu, vd, 1,
3694                            aarch64_get_vec_u64 (cpu, vn, 1)
3695                            + aarch64_get_vec_u64 (cpu, vm, 1));
3696       return;
3697     }
3698 }
3699
3700 static void
3701 do_vec_mul (sim_cpu *cpu)
3702 {
3703   /* instr[31]    = 0
3704      instr[30]    = full/half selector
3705      instr[29,24] = 00 1110
3706      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
3707      instr[21]    = 1
3708      instr[20,16] = Vn
3709      instr[15,10] = 10 0111
3710      instr[9,5]   = Vm
3711      instr[4.0]   = Vd.  */
3712
3713   unsigned vm = INSTR (20, 16);
3714   unsigned vn = INSTR (9, 5);
3715   unsigned vd = INSTR (4, 0);
3716   unsigned i;
3717   int      full = INSTR (30, 30);
3718   int      bias = 0;
3719
3720   NYI_assert (29, 24, 0x0E);
3721   NYI_assert (21, 21, 1);
3722   NYI_assert (15, 10, 0x27);
3723
3724   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3725   switch (INSTR (23, 22))
3726     {
3727     case 0:
3728       DO_VEC_WIDENING_MUL (full ? 16 : 8, uint8_t, u8, u8);
3729       return;
3730
3731     case 1:
3732       DO_VEC_WIDENING_MUL (full ? 8 : 4, uint16_t, u16, u16);
3733       return;
3734
3735     case 2:
3736       DO_VEC_WIDENING_MUL (full ? 4 : 2, uint32_t, u32, u32);
3737       return;
3738
3739     case 3:
3740       HALT_UNALLOC;
3741     }
3742 }
3743
3744 static void
3745 do_vec_MLA (sim_cpu *cpu)
3746 {
3747   /* instr[31]    = 0
3748      instr[30]    = full/half selector
3749      instr[29,24] = 00 1110
3750      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
3751      instr[21]    = 1
3752      instr[20,16] = Vn
3753      instr[15,10] = 1001 01
3754      instr[9,5]   = Vm
3755      instr[4.0]   = Vd.  */
3756
3757   unsigned vm = INSTR (20, 16);
3758   unsigned vn = INSTR (9, 5);
3759   unsigned vd = INSTR (4, 0);
3760   unsigned i;
3761   int      full = INSTR (30, 30);
3762
3763   NYI_assert (29, 24, 0x0E);
3764   NYI_assert (21, 21, 1);
3765   NYI_assert (15, 10, 0x25);
3766
3767   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3768   switch (INSTR (23, 22))
3769     {
3770     case 0:
3771       {
3772         uint16_t a[16], b[16];
3773
3774         for (i = 0; i < (full ? 16 : 8); i++)
3775           {
3776             a[i] = aarch64_get_vec_u8 (cpu, vn, i);
3777             b[i] = aarch64_get_vec_u8 (cpu, vm, i);
3778           }
3779
3780         for (i = 0; i < (full ? 16 : 8); i++)
3781           {
3782             uint16_t v = aarch64_get_vec_u8 (cpu, vd, i);
3783
3784             aarch64_set_vec_u16 (cpu, vd, i, v + (a[i] * b[i]));
3785           }
3786       }
3787       return;
3788
3789     case 1:
3790       {
3791         uint32_t a[8], b[8];
3792
3793         for (i = 0; i < (full ? 8 : 4); i++)
3794           {
3795             a[i] = aarch64_get_vec_u16 (cpu, vn, i);
3796             b[i] = aarch64_get_vec_u16 (cpu, vm, i);
3797           }
3798
3799         for (i = 0; i < (full ? 8 : 4); i++)
3800           {
3801             uint32_t v = aarch64_get_vec_u16 (cpu, vd, i);
3802
3803             aarch64_set_vec_u32 (cpu, vd, i, v + (a[i] * b[i]));
3804           }
3805       }
3806       return;
3807
3808     case 2:
3809       {
3810         uint64_t a[4], b[4];
3811
3812         for (i = 0; i < (full ? 4 : 2); i++)
3813           {
3814             a[i] = aarch64_get_vec_u32 (cpu, vn, i);
3815             b[i] = aarch64_get_vec_u32 (cpu, vm, i);
3816           }
3817
3818         for (i = 0; i < (full ? 4 : 2); i++)
3819           {
3820             uint64_t v = aarch64_get_vec_u32 (cpu, vd, i);
3821
3822             aarch64_set_vec_u64 (cpu, vd, i, v + (a[i] * b[i]));
3823           }
3824       }
3825       return;
3826
3827     case 3:
3828       HALT_UNALLOC;
3829     }
3830 }
3831
/* Single-precision "maximum number": return the larger of A and B,
   treating a lone NaN operand as missing so that the numeric operand
   wins.  When both operands are NaN, A is returned.

   +0 and -0 compare equal in C, so that case is resolved explicitly in
   favour of the operand whose sign bit is clear.  */

static float
fmaxnm (float a, float b)
{
  if (isnan (a))
    return isnan (b) ? a : b;

  if (isnan (b))
    return a;

  if (a == b)
    /* Only matters for zeros of opposite sign: pick the positive one.  */
    return signbit (a) ? b : a;

  return a > b ? a : b;
}
3845
/* Single-precision "minimum number": return the smaller of A and B,
   treating a lone NaN operand as missing so that the numeric operand
   wins.  When both operands are NaN, A is returned.

   +0 and -0 compare equal in C, so that case is resolved explicitly in
   favour of the operand whose sign bit is set.  */

static float
fminnm (float a, float b)
{
  if (isnan (a))
    return isnan (b) ? a : b;

  if (isnan (b))
    return a;

  if (a == b)
    /* Only matters for zeros of opposite sign: pick the negative one.  */
    return signbit (a) ? a : b;

  return a < b ? a : b;
}
3859
/* Double-precision "maximum number": return the larger of A and B,
   treating a lone NaN operand as missing so that the numeric operand
   wins.  When both operands are NaN, A is returned.

   +0 and -0 compare equal in C, so that case is resolved explicitly in
   favour of the operand whose sign bit is clear.  */

static double
dmaxnm (double a, double b)
{
  if (isnan (a))
    return isnan (b) ? a : b;

  if (isnan (b))
    return a;

  if (a == b)
    /* Only matters for zeros of opposite sign: pick the positive one.  */
    return signbit (a) ? b : a;

  return a > b ? a : b;
}
3873
/* Double-precision "minimum number": return the smaller of A and B,
   treating a lone NaN operand as missing so that the numeric operand
   wins.  When both operands are NaN, A is returned.

   +0 and -0 compare equal in C, so that case is resolved explicitly in
   favour of the operand whose sign bit is set.  */

static double
dminnm (double a, double b)
{
  if (isnan (a))
    return isnan (b) ? a : b;

  if (isnan (b))
    return a;

  if (a == b)
    /* Only matters for zeros of opposite sign: pick the negative one.  */
    return signbit (a) ? a : b;

  return a < b ? a : b;
}
3887
3888 static void
3889 do_vec_FminmaxNMP (sim_cpu *cpu)
3890 {
3891   /* instr [31]    = 0
3892      instr [30]    = half (0)/full (1)
3893      instr [29,24] = 10 1110
3894      instr [23]    = max(0)/min(1)
3895      instr [22]    = float (0)/double (1)
3896      instr [21]    = 1
3897      instr [20,16] = Vn
3898      instr [15,10] = 1100 01
3899      instr [9,5]   = Vm
3900      instr [4.0]   = Vd.  */
3901
3902   unsigned vm = INSTR (20, 16);
3903   unsigned vn = INSTR (9, 5);
3904   unsigned vd = INSTR (4, 0);
3905   int      full = INSTR (30, 30);
3906
3907   NYI_assert (29, 24, 0x2E);
3908   NYI_assert (21, 21, 1);
3909   NYI_assert (15, 10, 0x31);
3910
3911   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3912   if (INSTR (22, 22))
3913     {
3914       double (* fn)(double, double) = INSTR (23, 23)
3915         ? dminnm : dmaxnm;
3916
3917       if (! full)
3918         HALT_NYI;
3919       aarch64_set_vec_double (cpu, vd, 0,
3920                               fn (aarch64_get_vec_double (cpu, vn, 0),
3921                                   aarch64_get_vec_double (cpu, vn, 1)));
3922       aarch64_set_vec_double (cpu, vd, 0,
3923                               fn (aarch64_get_vec_double (cpu, vm, 0),
3924                                   aarch64_get_vec_double (cpu, vm, 1)));
3925     }
3926   else
3927     {
3928       float (* fn)(float, float) = INSTR (23, 23)
3929         ? fminnm : fmaxnm;
3930
3931       aarch64_set_vec_float (cpu, vd, 0,
3932                              fn (aarch64_get_vec_float (cpu, vn, 0),
3933                                  aarch64_get_vec_float (cpu, vn, 1)));
3934       if (full)
3935         aarch64_set_vec_float (cpu, vd, 1,
3936                                fn (aarch64_get_vec_float (cpu, vn, 2),
3937                                    aarch64_get_vec_float (cpu, vn, 3)));
3938
3939       aarch64_set_vec_float (cpu, vd, (full ? 2 : 1),
3940                              fn (aarch64_get_vec_float (cpu, vm, 0),
3941                                  aarch64_get_vec_float (cpu, vm, 1)));
3942       if (full)
3943         aarch64_set_vec_float (cpu, vd, 3,
3944                                fn (aarch64_get_vec_float (cpu, vm, 2),
3945                                    aarch64_get_vec_float (cpu, vm, 3)));
3946     }
3947 }
3948
3949 static void
3950 do_vec_AND (sim_cpu *cpu)
3951 {
3952   /* instr[31]    = 0
3953      instr[30]    = half (0)/full (1)
3954      instr[29,21] = 001110001
3955      instr[20,16] = Vm
3956      instr[15,10] = 000111
3957      instr[9,5]   = Vn
3958      instr[4.0]   = Vd.  */
3959
3960   unsigned vm = INSTR (20, 16);
3961   unsigned vn = INSTR (9, 5);
3962   unsigned vd = INSTR (4, 0);
3963   unsigned i;
3964   int      full = INSTR (30, 30);
3965
3966   NYI_assert (29, 21, 0x071);
3967   NYI_assert (15, 10, 0x07);
3968
3969   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3970   for (i = 0; i < (full ? 4 : 2); i++)
3971     aarch64_set_vec_u32 (cpu, vd, i,
3972                          aarch64_get_vec_u32 (cpu, vn, i)
3973                          & aarch64_get_vec_u32 (cpu, vm, i));
3974 }
3975
3976 static void
3977 do_vec_BSL (sim_cpu *cpu)
3978 {
3979   /* instr[31]    = 0
3980      instr[30]    = half (0)/full (1)
3981      instr[29,21] = 101110011
3982      instr[20,16] = Vm
3983      instr[15,10] = 000111
3984      instr[9,5]   = Vn
3985      instr[4.0]   = Vd.  */
3986
3987   unsigned vm = INSTR (20, 16);
3988   unsigned vn = INSTR (9, 5);
3989   unsigned vd = INSTR (4, 0);
3990   unsigned i;
3991   int      full = INSTR (30, 30);
3992
3993   NYI_assert (29, 21, 0x173);
3994   NYI_assert (15, 10, 0x07);
3995
3996   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3997   for (i = 0; i < (full ? 16 : 8); i++)
3998     aarch64_set_vec_u8 (cpu, vd, i,
3999                         (    aarch64_get_vec_u8 (cpu, vd, i)
4000                            & aarch64_get_vec_u8 (cpu, vn, i))
4001                         | ((~ aarch64_get_vec_u8 (cpu, vd, i))
4002                            & aarch64_get_vec_u8 (cpu, vm, i)));
4003 }
4004
4005 static void
4006 do_vec_EOR (sim_cpu *cpu)
4007 {
4008   /* instr[31]    = 0
4009      instr[30]    = half (0)/full (1)
4010      instr[29,21] = 10 1110 001
4011      instr[20,16] = Vm
4012      instr[15,10] = 000111
4013      instr[9,5]   = Vn
4014      instr[4.0]   = Vd.  */
4015
4016   unsigned vm = INSTR (20, 16);
4017   unsigned vn = INSTR (9, 5);
4018   unsigned vd = INSTR (4, 0);
4019   unsigned i;
4020   int      full = INSTR (30, 30);
4021
4022   NYI_assert (29, 21, 0x171);
4023   NYI_assert (15, 10, 0x07);
4024
4025   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4026   for (i = 0; i < (full ? 4 : 2); i++)
4027     aarch64_set_vec_u32 (cpu, vd, i,
4028                          aarch64_get_vec_u32 (cpu, vn, i)
4029                          ^ aarch64_get_vec_u32 (cpu, vm, i));
4030 }
4031
4032 static void
4033 do_vec_bit (sim_cpu *cpu)
4034 {
4035   /* instr[31]    = 0
4036      instr[30]    = half (0)/full (1)
4037      instr[29,23] = 10 1110 1
4038      instr[22]    = BIT (0) / BIF (1)
4039      instr[21]    = 1
4040      instr[20,16] = Vm
4041      instr[15,10] = 0001 11
4042      instr[9,5]   = Vn
4043      instr[4.0]   = Vd.  */
4044
4045   unsigned vm = INSTR (20, 16);
4046   unsigned vn = INSTR (9, 5);
4047   unsigned vd = INSTR (4, 0);
4048   unsigned full = INSTR (30, 30);
4049   unsigned test_false = INSTR (22, 22);
4050   unsigned i;
4051
4052   NYI_assert (29, 23, 0x5D);
4053   NYI_assert (21, 21, 1);
4054   NYI_assert (15, 10, 0x07);
4055
4056   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4057   if (test_false)
4058     {
4059       for (i = 0; i < (full ? 16 : 8); i++)
4060         if (aarch64_get_vec_u32 (cpu, vn, i) == 0)
4061           aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vm, i));
4062     }
4063   else
4064     {
4065       for (i = 0; i < (full ? 16 : 8); i++)
4066         if (aarch64_get_vec_u32 (cpu, vn, i) != 0)
4067           aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vm, i));
4068     }
4069 }
4070
4071 static void
4072 do_vec_ORN (sim_cpu *cpu)
4073 {
4074   /* instr[31]    = 0
4075      instr[30]    = half (0)/full (1)
4076      instr[29,21] = 00 1110 111
4077      instr[20,16] = Vm
4078      instr[15,10] = 00 0111
4079      instr[9,5]   = Vn
4080      instr[4.0]   = Vd.  */
4081
4082   unsigned vm = INSTR (20, 16);
4083   unsigned vn = INSTR (9, 5);
4084   unsigned vd = INSTR (4, 0);
4085   unsigned i;
4086   int      full = INSTR (30, 30);
4087
4088   NYI_assert (29, 21, 0x077);
4089   NYI_assert (15, 10, 0x07);
4090
4091   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4092   for (i = 0; i < (full ? 16 : 8); i++)
4093     aarch64_set_vec_u8 (cpu, vd, i,
4094                         aarch64_get_vec_u8 (cpu, vn, i)
4095                         | ~ aarch64_get_vec_u8 (cpu, vm, i));
4096 }
4097
4098 static void
4099 do_vec_ORR (sim_cpu *cpu)
4100 {
4101   /* instr[31]    = 0
4102      instr[30]    = half (0)/full (1)
4103      instr[29,21] = 00 1110 101
4104      instr[20,16] = Vm
4105      instr[15,10] = 0001 11
4106      instr[9,5]   = Vn
4107      instr[4.0]   = Vd.  */
4108
4109   unsigned vm = INSTR (20, 16);
4110   unsigned vn = INSTR (9, 5);
4111   unsigned vd = INSTR (4, 0);
4112   unsigned i;
4113   int      full = INSTR (30, 30);
4114
4115   NYI_assert (29, 21, 0x075);
4116   NYI_assert (15, 10, 0x07);
4117
4118   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4119   for (i = 0; i < (full ? 16 : 8); i++)
4120     aarch64_set_vec_u8 (cpu, vd, i,
4121                         aarch64_get_vec_u8 (cpu, vn, i)
4122                         | aarch64_get_vec_u8 (cpu, vm, i));
4123 }
4124
4125 static void
4126 do_vec_BIC (sim_cpu *cpu)
4127 {
4128   /* instr[31]    = 0
4129      instr[30]    = half (0)/full (1)
4130      instr[29,21] = 00 1110 011
4131      instr[20,16] = Vm
4132      instr[15,10] = 00 0111
4133      instr[9,5]   = Vn
4134      instr[4.0]   = Vd.  */
4135
4136   unsigned vm = INSTR (20, 16);
4137   unsigned vn = INSTR (9, 5);
4138   unsigned vd = INSTR (4, 0);
4139   unsigned i;
4140   int      full = INSTR (30, 30);
4141
4142   NYI_assert (29, 21, 0x073);
4143   NYI_assert (15, 10, 0x07);
4144
4145   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4146   for (i = 0; i < (full ? 16 : 8); i++)
4147     aarch64_set_vec_u8 (cpu, vd, i,
4148                         aarch64_get_vec_u8 (cpu, vn, i)
4149                         & ~ aarch64_get_vec_u8 (cpu, vm, i));
4150 }
4151
4152 static void
4153 do_vec_XTN (sim_cpu *cpu)
4154 {
4155   /* instr[31]    = 0
4156      instr[30]    = first part (0)/ second part (1)
4157      instr[29,24] = 00 1110
4158      instr[23,22] = size: byte(00), half(01), word (10)
4159      instr[21,10] = 1000 0100 1010
4160      instr[9,5]   = Vs
4161      instr[4,0]   = Vd.  */
4162
4163   unsigned vs = INSTR (9, 5);
4164   unsigned vd = INSTR (4, 0);
4165   unsigned bias = INSTR (30, 30);
4166   unsigned i;
4167
4168   NYI_assert (29, 24, 0x0E);
4169   NYI_assert (21, 10, 0x84A);
4170
4171   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4172   switch (INSTR (23, 22))
4173     {
4174     case 0:
4175       if (bias)
4176         for (i = 0; i < 8; i++)
4177           aarch64_set_vec_u8 (cpu, vd, i + 8,
4178                               aarch64_get_vec_u16 (cpu, vs, i) >> 8);
4179       else
4180         for (i = 0; i < 8; i++)
4181           aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vs, i));
4182       return;
4183
4184     case 1:
4185       if (bias)
4186         for (i = 0; i < 4; i++)
4187           aarch64_set_vec_u16 (cpu, vd, i + 4,
4188                                aarch64_get_vec_u32 (cpu, vs, i) >> 16);
4189       else
4190         for (i = 0; i < 4; i++)
4191           aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vs, i));
4192       return;
4193
4194     case 2:
4195       if (bias)
4196         for (i = 0; i < 2; i++)
4197           aarch64_set_vec_u32 (cpu, vd, i + 4,
4198                                aarch64_get_vec_u64 (cpu, vs, i) >> 32);
4199       else
4200         for (i = 0; i < 2; i++)
4201           aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vs, i));
4202       return;
4203     }
4204 }
4205
4206 static void
4207 do_vec_maxv (sim_cpu *cpu)
4208 {
4209   /* instr[31]    = 0
4210      instr[30]    = half(0)/full(1)
4211      instr[29]    = signed (0)/unsigned(1)
4212      instr[28,24] = 0 1110
4213      instr[23,22] = size: byte(00), half(01), word (10)
4214      instr[21]    = 1
4215      instr[20,17] = 1 000
4216      instr[16]    = max(0)/min(1)
4217      instr[15,10] = 1010 10
4218      instr[9,5]   = V source
4219      instr[4.0]   = R dest.  */
4220
4221   unsigned vs = INSTR (9, 5);
4222   unsigned rd = INSTR (4, 0);
4223   unsigned full = INSTR (30, 30);
4224   unsigned i;
4225
4226   NYI_assert (28, 24, 0x0E);
4227   NYI_assert (21, 21, 1);
4228   NYI_assert (20, 17, 8);
4229   NYI_assert (15, 10, 0x2A);
4230
4231   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4232   switch ((INSTR (29, 29) << 1) | INSTR (16, 16))
4233     {
4234     case 0: /* SMAXV.  */
4235        {
4236         int64_t smax;
4237         switch (INSTR (23, 22))
4238           {
4239           case 0:
4240             smax = aarch64_get_vec_s8 (cpu, vs, 0);
4241             for (i = 1; i < (full ? 16 : 8); i++)
4242               smax = max (smax, aarch64_get_vec_s8 (cpu, vs, i));
4243             break;
4244           case 1:
4245             smax = aarch64_get_vec_s16 (cpu, vs, 0);
4246             for (i = 1; i < (full ? 8 : 4); i++)
4247               smax = max (smax, aarch64_get_vec_s16 (cpu, vs, i));
4248             break;
4249           case 2:
4250             smax = aarch64_get_vec_s32 (cpu, vs, 0);
4251             for (i = 1; i < (full ? 4 : 2); i++)
4252               smax = max (smax, aarch64_get_vec_s32 (cpu, vs, i));
4253             break;
4254           case 3:
4255             HALT_UNALLOC;
4256           }
4257         aarch64_set_reg_s64 (cpu, rd, NO_SP, smax);
4258         return;
4259       }
4260
4261     case 1: /* SMINV.  */
4262       {
4263         int64_t smin;
4264         switch (INSTR (23, 22))
4265           {
4266           case 0:
4267             smin = aarch64_get_vec_s8 (cpu, vs, 0);
4268             for (i = 1; i < (full ? 16 : 8); i++)
4269               smin = min (smin, aarch64_get_vec_s8 (cpu, vs, i));
4270             break;
4271           case 1:
4272             smin = aarch64_get_vec_s16 (cpu, vs, 0);
4273             for (i = 1; i < (full ? 8 : 4); i++)
4274               smin = min (smin, aarch64_get_vec_s16 (cpu, vs, i));
4275             break;
4276           case 2:
4277             smin = aarch64_get_vec_s32 (cpu, vs, 0);
4278             for (i = 1; i < (full ? 4 : 2); i++)
4279               smin = min (smin, aarch64_get_vec_s32 (cpu, vs, i));
4280             break;
4281
4282           case 3:
4283             HALT_UNALLOC;
4284           }
4285         aarch64_set_reg_s64 (cpu, rd, NO_SP, smin);
4286         return;
4287       }
4288
4289     case 2: /* UMAXV.  */
4290       {
4291         uint64_t umax;
4292         switch (INSTR (23, 22))
4293           {
4294           case 0:
4295             umax = aarch64_get_vec_u8 (cpu, vs, 0);
4296             for (i = 1; i < (full ? 16 : 8); i++)
4297               umax = max (umax, aarch64_get_vec_u8 (cpu, vs, i));
4298             break;
4299           case 1:
4300             umax = aarch64_get_vec_u16 (cpu, vs, 0);
4301             for (i = 1; i < (full ? 8 : 4); i++)
4302               umax = max (umax, aarch64_get_vec_u16 (cpu, vs, i));
4303             break;
4304           case 2:
4305             umax = aarch64_get_vec_u32 (cpu, vs, 0);
4306             for (i = 1; i < (full ? 4 : 2); i++)
4307               umax = max (umax, aarch64_get_vec_u32 (cpu, vs, i));
4308             break;
4309
4310           case 3:
4311             HALT_UNALLOC;
4312           }
4313         aarch64_set_reg_u64 (cpu, rd, NO_SP, umax);
4314         return;
4315       }
4316
4317     case 3: /* UMINV.  */
4318       {
4319         uint64_t umin;
4320         switch (INSTR (23, 22))
4321           {
4322           case 0:
4323             umin = aarch64_get_vec_u8 (cpu, vs, 0);
4324             for (i = 1; i < (full ? 16 : 8); i++)
4325               umin = min (umin, aarch64_get_vec_u8 (cpu, vs, i));
4326             break;
4327           case 1:
4328             umin = aarch64_get_vec_u16 (cpu, vs, 0);
4329             for (i = 1; i < (full ? 8 : 4); i++)
4330               umin = min (umin, aarch64_get_vec_u16 (cpu, vs, i));
4331             break;
4332           case 2:
4333             umin = aarch64_get_vec_u32 (cpu, vs, 0);
4334             for (i = 1; i < (full ? 4 : 2); i++)
4335               umin = min (umin, aarch64_get_vec_u32 (cpu, vs, i));
4336             break;
4337
4338           case 3:
4339             HALT_UNALLOC;
4340           }
4341         aarch64_set_reg_u64 (cpu, rd, NO_SP, umin);
4342         return;
4343       }
4344     }
4345 }
4346
4347 static void
4348 do_vec_fminmaxV (sim_cpu *cpu)
4349 {
4350   /* instr[31,24] = 0110 1110
4351      instr[23]    = max(0)/min(1)
4352      instr[22,14] = 011 0000 11
4353      instr[13,12] = nm(00)/normal(11)
4354      instr[11,10] = 10
4355      instr[9,5]   = V source
4356      instr[4.0]   = R dest.  */
4357
4358   unsigned vs = INSTR (9, 5);
4359   unsigned rd = INSTR (4, 0);
4360   unsigned i;
4361   float res   = aarch64_get_vec_float (cpu, vs, 0);
4362
4363   NYI_assert (31, 24, 0x6E);
4364   NYI_assert (22, 14, 0x0C3);
4365   NYI_assert (11, 10, 2);
4366
4367   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4368   if (INSTR (23, 23))
4369     {
4370       switch (INSTR (13, 12))
4371         {
4372         case 0: /* FMNINNMV.  */
4373           for (i = 1; i < 4; i++)
4374             res = fminnm (res, aarch64_get_vec_float (cpu, vs, i));
4375           break;
4376
4377         case 3: /* FMINV.  */
4378           for (i = 1; i < 4; i++)
4379             res = min (res, aarch64_get_vec_float (cpu, vs, i));
4380           break;
4381
4382         default:
4383           HALT_NYI;
4384         }
4385     }
4386   else
4387     {
4388       switch (INSTR (13, 12))
4389         {
4390         case 0: /* FMNAXNMV.  */
4391           for (i = 1; i < 4; i++)
4392             res = fmaxnm (res, aarch64_get_vec_float (cpu, vs, i));
4393           break;
4394
4395         case 3: /* FMAXV.  */
4396           for (i = 1; i < 4; i++)
4397             res = max (res, aarch64_get_vec_float (cpu, vs, i));
4398           break;
4399
4400         default:
4401           HALT_NYI;
4402         }
4403     }
4404
4405   aarch64_set_FP_float (cpu, rd, res);
4406 }
4407
4408 static void
4409 do_vec_Fminmax (sim_cpu *cpu)
4410 {
4411   /* instr[31]    = 0
4412      instr[30]    = half(0)/full(1)
4413      instr[29,24] = 00 1110
4414      instr[23]    = max(0)/min(1)
4415      instr[22]    = float(0)/double(1)
4416      instr[21]    = 1
4417      instr[20,16] = Vm
4418      instr[15,14] = 11
4419      instr[13,12] = nm(00)/normal(11)
4420      instr[11,10] = 01
4421      instr[9,5]   = Vn
4422      instr[4,0]   = Vd.  */
4423
4424   unsigned vm = INSTR (20, 16);
4425   unsigned vn = INSTR (9, 5);
4426   unsigned vd = INSTR (4, 0);
4427   unsigned full = INSTR (30, 30);
4428   unsigned min = INSTR (23, 23);
4429   unsigned i;
4430
4431   NYI_assert (29, 24, 0x0E);
4432   NYI_assert (21, 21, 1);
4433   NYI_assert (15, 14, 3);
4434   NYI_assert (11, 10, 1);
4435
4436   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4437   if (INSTR (22, 22))
4438     {
4439       double (* func)(double, double);
4440
4441       if (! full)
4442         HALT_NYI;
4443
4444       if (INSTR (13, 12) == 0)
4445         func = min ? dminnm : dmaxnm;
4446       else if (INSTR (13, 12) == 3)
4447         func = min ? fmin : fmax;
4448       else
4449         HALT_NYI;
4450
4451       for (i = 0; i < 2; i++)
4452         aarch64_set_vec_double (cpu, vd, i,
4453                                 func (aarch64_get_vec_double (cpu, vn, i),
4454                                       aarch64_get_vec_double (cpu, vm, i)));
4455     }
4456   else
4457     {
4458       float (* func)(float, float);
4459
4460       if (INSTR (13, 12) == 0)
4461         func = min ? fminnm : fmaxnm;
4462       else if (INSTR (13, 12) == 3)
4463         func = min ? fminf : fmaxf;
4464       else
4465         HALT_NYI;
4466
4467       for (i = 0; i < (full ? 4 : 2); i++)
4468         aarch64_set_vec_float (cpu, vd, i,
4469                                func (aarch64_get_vec_float (cpu, vn, i),
4470                                      aarch64_get_vec_float (cpu, vm, i)));
4471     }
4472 }
4473
4474 static void
4475 do_vec_SCVTF (sim_cpu *cpu)
4476 {
4477   /* instr[31]    = 0
4478      instr[30]    = Q
4479      instr[29,23] = 00 1110 0
4480      instr[22]    = float(0)/double(1)
4481      instr[21,10] = 10 0001 1101 10
4482      instr[9,5]   = Vn
4483      instr[4,0]   = Vd.  */
4484
4485   unsigned vn = INSTR (9, 5);
4486   unsigned vd = INSTR (4, 0);
4487   unsigned full = INSTR (30, 30);
4488   unsigned size = INSTR (22, 22);
4489   unsigned i;
4490
4491   NYI_assert (29, 23, 0x1C);
4492   NYI_assert (21, 10, 0x876);
4493
4494   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4495   if (size)
4496     {
4497       if (! full)
4498         HALT_UNALLOC;
4499
4500       for (i = 0; i < 2; i++)
4501         {
4502           double val = (double) aarch64_get_vec_u64 (cpu, vn, i);
4503           aarch64_set_vec_double (cpu, vd, i, val);
4504         }
4505     }
4506   else
4507     {
4508       for (i = 0; i < (full ? 4 : 2); i++)
4509         {
4510           float val = (float) aarch64_get_vec_u32 (cpu, vn, i);
4511           aarch64_set_vec_float (cpu, vd, i, val);
4512         }
4513     }
4514 }
4515
/* Lane-wise integer compare of Vn against Vm.  SOURCE is pasted into
   the accessor name (aarch64_get_vec_<SOURCE><bits>) and CMP is the
   relational operator.  Each Vd lane becomes all ones when the test
   holds and zero otherwise.  Expands inside a function that provides
   cpu, size, full, vn, vm, vd and i, and returns from that function
   once the lanes are written.  64-bit lanes need the full-width
   form.  */
#define VEC_CMP(SOURCE, CMP)                                            \
  do                                                                    \
    {                                                                   \
      if (size == 0)                                                    \
        {                                                               \
          for (i = 0; i < (full ? 16 : 8); i++)                         \
            aarch64_set_vec_u8 (cpu, vd, i,                             \
                                (aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
                                 CMP                                    \
                                 aarch64_get_vec_##SOURCE##8 (cpu, vm, i)) \
                                ? -1 : 0);                              \
          return;                                                       \
        }                                                               \
      else if (size == 1)                                               \
        {                                                               \
          for (i = 0; i < (full ? 8 : 4); i++)                          \
            aarch64_set_vec_u16 (cpu, vd, i,                            \
                                 (aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
                                  CMP                                   \
                                  aarch64_get_vec_##SOURCE##16 (cpu, vm, i)) \
                                 ? -1 : 0);                             \
          return;                                                       \
        }                                                               \
      else if (size == 2)                                               \
        {                                                               \
          for (i = 0; i < (full ? 4 : 2); i++)                          \
            aarch64_set_vec_u32 (cpu, vd, i,                            \
                                 (aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
                                  CMP                                   \
                                  aarch64_get_vec_##SOURCE##32 (cpu, vm, i)) \
                                 ? -1 : 0);                             \
          return;                                                       \
        }                                                               \
      else if (size == 3)                                               \
        {                                                               \
          if (! full)                                                   \
            HALT_UNALLOC;                                               \
          for (i = 0; i < 2; i++)                                       \
            aarch64_set_vec_u64 (cpu, vd, i,                            \
                                 (aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
                                  CMP                                   \
                                  aarch64_get_vec_##SOURCE##64 (cpu, vm, i)) \
                                 ? -1ULL : 0);                          \
          return;                                                       \
        }                                                               \
    }                                                                   \
  while (0)
4558
/* Lane-wise integer compare of Vn against zero.  SOURCE is pasted into
   the accessor name (aarch64_get_vec_<SOURCE><bits>) and CMP is the
   relational operator.  Each Vd lane becomes all ones when the test
   holds and zero otherwise.  Expands inside a function that provides
   cpu, size, full, vn, vd and i, and returns from that function once
   the lanes are written.  64-bit lanes need the full-width form.  */
#define VEC_CMP0(SOURCE, CMP)                                           \
  do                                                                    \
    {                                                                   \
      if (size == 0)                                                    \
        {                                                               \
          for (i = 0; i < (full ? 16 : 8); i++)                         \
            aarch64_set_vec_u8 (cpu, vd, i,                             \
                                (aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
                                 CMP 0) ? -1 : 0);                      \
          return;                                                       \
        }                                                               \
      else if (size == 1)                                               \
        {                                                               \
          for (i = 0; i < (full ? 8 : 4); i++)                          \
            aarch64_set_vec_u16 (cpu, vd, i,                            \
                                 (aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
                                  CMP 0) ? -1 : 0);                     \
          return;                                                       \
        }                                                               \
      else if (size == 2)                                               \
        {                                                               \
          for (i = 0; i < (full ? 4 : 2); i++)                          \
            aarch64_set_vec_u32 (cpu, vd, i,                            \
                                 (aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
                                  CMP 0) ? -1 : 0);                     \
          return;                                                       \
        }                                                               \
      else if (size == 3)                                               \
        {                                                               \
          if (! full)                                                   \
            HALT_UNALLOC;                                               \
          for (i = 0; i < 2; i++)                                       \
            aarch64_set_vec_u64 (cpu, vd, i,                            \
                                 (aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
                                  CMP 0) ? -1ULL : 0);                  \
          return;                                                       \
        }                                                               \
    }                                                                   \
  while (0)
4593
/* Lane-wise floating-point compare of Vn against 0.0 using operator
   CMP.  instr[22] selects double (two lanes, full width only) or
   single precision.  Each Vd lane becomes all ones when the test holds
   and zero otherwise.  A non-zero vm halts as not yet implemented.
   Expands inside a function that provides cpu, full, vm, vn, vd and i,
   and returns from that function when done.  */
#define VEC_FCMP0(CMP)                                                  \
  do                                                                    \
    {                                                                   \
      if (vm != 0)                                                      \
        HALT_NYI;                                                       \
      if (! INSTR (22, 22))                                             \
        {                                                               \
          for (i = 0; i < (full ? 4 : 2); i++)                          \
            aarch64_set_vec_u32 (cpu, vd, i,                            \
                                 (aarch64_get_vec_float (cpu, vn, i)    \
                                  CMP 0.0) ? -1 : 0);                   \
        }                                                               \
      else                                                              \
        {                                                               \
          if (! full)                                                   \
            HALT_NYI;                                                   \
          for (i = 0; i < 2; i++)                                       \
            aarch64_set_vec_u64 (cpu, vd, i,                            \
                                 (aarch64_get_vec_double (cpu, vn, i)   \
                                  CMP 0.0) ? -1 : 0);                   \
        }                                                               \
      return;                                                           \
    }                                                                   \
  while (0)
4618
/* Lane-wise floating-point compare of Vn against Vm using operator
   CMP.  instr[22] selects double (two lanes, full width only) or
   single precision.  Each Vd lane becomes all ones when the test holds
   and zero otherwise.  Expands inside a function that provides cpu,
   full, vm, vn, vd and i, and returns from that function when done.  */
#define VEC_FCMP(CMP)                                                   \
  do                                                                    \
    {                                                                   \
      if (! INSTR (22, 22))                                             \
        {                                                               \
          for (i = 0; i < (full ? 4 : 2); i++)                          \
            aarch64_set_vec_u32 (cpu, vd, i,                            \
                                 (aarch64_get_vec_float (cpu, vn, i)    \
                                  CMP                                   \
                                  aarch64_get_vec_float (cpu, vm, i))   \
                                 ? -1 : 0);                             \
        }                                                               \
      else                                                              \
        {                                                               \
          if (! full)                                                   \
            HALT_NYI;                                                   \
          for (i = 0; i < 2; i++)                                       \
            aarch64_set_vec_u64 (cpu, vd, i,                            \
                                 (aarch64_get_vec_double (cpu, vn, i)   \
                                  CMP                                   \
                                  aarch64_get_vec_double (cpu, vm, i))  \
                                 ? -1 : 0);                             \
        }                                                               \
      return;                                                           \
    }                                                                   \
  while (0)
4645
/* Decode and emulate the vector compare instructions: integer and
   floating point, register against register and register against zero.
   Encodings that share this opcode space but are not compares are handed
   on to do_vec_maxv, do_vec_fminmaxV or do_vec_SCVTF.  */
static void
do_vec_compare (sim_cpu *cpu)
{
  /* instr[31]    = 0
     instr[30]    = half(0)/full(1)
     instr[29]    = part-of-comparison-type
     instr[28,24] = 0 1110
     instr[23,22] = size of integer compares: byte(00), half(01), word (10), long (11)
                    type of float compares: single (-0) / double (-1)
     instr[21]    = 1
     instr[20,16] = Vm or 00000 (compare vs 0)
     instr[15,10] = part-of-comparison-type
     instr[9,5]   = Vn
     instr[4,0]   = Vd.  */

  /* NOTE(review): size is not referenced directly in this function;
     presumably the VEC_* macros expand to code that uses it (along with
     full, vm, vn, vd and i) - confirm against the macro definitions.  */
  int full = INSTR (30, 30);
  int size = INSTR (23, 22);
  unsigned vm = INSTR (20, 16);
  unsigned vn = INSTR (9, 5);
  unsigned vd = INSTR (4, 0);
  unsigned i;

  NYI_assert (28, 24, 0x0E);
  NYI_assert (21, 21, 1);

  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  /* This bit pattern is what a compare against zero looks like, and such
     a compare has 00000 in the Vm field.  */
  if ((INSTR (11, 11)
       && INSTR (14, 14))
      || ((INSTR (11, 11) == 0
           && INSTR (10, 10) == 0)))
    {
      /* A compare vs 0.  */
      if (vm != 0)
        {
          /* A non-zero Vm field means this is not a compare after all:
             forward the instruction to the handler that owns the
             encoding, or halt if none is known.  */
          if (INSTR (15, 10) == 0x2A)
            do_vec_maxv (cpu);
          else if (INSTR (15, 10) == 0x32
                   || INSTR (15, 10) == 0x3E)
            do_vec_fminmaxV (cpu);
          else if (INSTR (29, 23) == 0x1C
                   && INSTR (21, 10) == 0x876)
            do_vec_SCVTF (cpu);
          else
            HALT_NYI;
          return;
        }
    }

  if (INSTR (14, 14))
    {
      /* A floating point compare.  */
      /* The selector is built from bit 29, bit 23 and bits 13-10.  */
      unsigned decode = (INSTR (29, 29) << 5) | (INSTR (23, 23) << 4)
        | INSTR (13, 10);

      NYI_assert (15, 15, 1);

      /* VEC_FCMP ends in a return, so its cases do not fall through.
         NOTE(review): VEC_FCMP0 is presumably the same - confirm.  */
      switch (decode)
        {
        case /* 0b010010: GT#0 */ 0x12: VEC_FCMP0 (>);
        case /* 0b110010: GE#0 */ 0x32: VEC_FCMP0 (>=);
        case /* 0b010110: EQ#0 */ 0x16: VEC_FCMP0 (==);
        case /* 0b110110: LE#0 */ 0x36: VEC_FCMP0 (<=);
        case /* 0b011010: LT#0 */ 0x1A: VEC_FCMP0 (<);
        case /* 0b111001: GT */   0x39: VEC_FCMP  (>);
        case /* 0b101001: GE */   0x29: VEC_FCMP  (>=);
        case /* 0b001001: EQ */   0x09: VEC_FCMP  (==);

        default:
          HALT_NYI;
        }
    }
  else
    {
      /* An integer compare: the selector is bit 29 followed by
         bits 15-10.  */
      unsigned decode = (INSTR (29, 29) << 6) | INSTR (15, 10);

      /* NOTE(review): VEC_CMP and VEC_CMP0 presumably end in a return
         like VEC_FCMP does, otherwise these cases would fall through -
         confirm against the macro definitions.  */
      switch (decode)
        {
        case 0x0D: /* 0001101 GT */     VEC_CMP  (s, > );
        case 0x0F: /* 0001111 GE */     VEC_CMP  (s, >= );
        case 0x22: /* 0100010 GT #0 */  VEC_CMP0 (s, > );
        case 0x26: /* 0100110 EQ #0 */  VEC_CMP0 (s, == );
        case 0x2A: /* 0101010 LT #0 */  VEC_CMP0 (s, < );
        case 0x4D: /* 1001101 HI */     VEC_CMP  (u, > );
        case 0x4F: /* 1001111 HS */     VEC_CMP  (u, >= );
        case 0x62: /* 1100010 GE #0 */  VEC_CMP0 (s, >= );
        case 0x63: /* 1100011 EQ */     VEC_CMP  (u, == );
        case 0x66: /* 1100110 LE #0 */  VEC_CMP0 (s, <= );
        default:
          /* Anything else with a non-zero Vm field is passed to
             do_vec_maxv; with a zero Vm field it is not implemented.  */
          if (vm == 0)
            HALT_NYI;
          do_vec_maxv (cpu);
        }
    }
}
4740
4741 static void
4742 do_vec_SSHL (sim_cpu *cpu)
4743 {
4744   /* instr[31]    = 0
4745      instr[30]    = first part (0)/ second part (1)
4746      instr[29,24] = 00 1110
4747      instr[23,22] = size: byte(00), half(01), word (10), long (11)
4748      instr[21]    = 1
4749      instr[20,16] = Vm
4750      instr[15,10] = 0100 01
4751      instr[9,5]   = Vn
4752      instr[4,0]   = Vd.  */
4753
4754   unsigned full = INSTR (30, 30);
4755   unsigned vm = INSTR (20, 16);
4756   unsigned vn = INSTR (9, 5);
4757   unsigned vd = INSTR (4, 0);
4758   unsigned i;
4759   signed int shift;
4760
4761   NYI_assert (29, 24, 0x0E);
4762   NYI_assert (21, 21, 1);
4763   NYI_assert (15, 10, 0x11);
4764
4765   /* FIXME: What is a signed shift left in this context ?.  */
4766
4767   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4768   switch (INSTR (23, 22))
4769     {
4770     case 0:
4771       for (i = 0; i < (full ? 16 : 8); i++)
4772         {
4773           shift = aarch64_get_vec_s8 (cpu, vm, i);
4774           if (shift >= 0)
4775             aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i)
4776                                 << shift);
4777           else
4778             aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i)
4779                                 >> - shift);
4780         }
4781       return;
4782
4783     case 1:
4784       for (i = 0; i < (full ? 8 : 4); i++)
4785         {
4786           shift = aarch64_get_vec_s8 (cpu, vm, i * 2);
4787           if (shift >= 0)
4788             aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i)
4789                                  << shift);
4790           else
4791             aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i)
4792                                  >> - shift);
4793         }
4794       return;
4795
4796     case 2:
4797       for (i = 0; i < (full ? 4 : 2); i++)
4798         {
4799           shift = aarch64_get_vec_s8 (cpu, vm, i * 4);
4800           if (shift >= 0)
4801             aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i)
4802                                  << shift);
4803           else
4804             aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i)
4805                                  >> - shift);
4806         }
4807       return;
4808
4809     case 3:
4810       if (! full)
4811         HALT_UNALLOC;
4812       for (i = 0; i < 2; i++)
4813         {
4814           shift = aarch64_get_vec_s8 (cpu, vm, i * 8);
4815           if (shift >= 0)
4816             aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i)
4817                                  << shift);
4818           else
4819             aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i)
4820                                  >> - shift);
4821         }
4822       return;
4823     }
4824 }
4825
4826 static void
4827 do_vec_USHL (sim_cpu *cpu)
4828 {
4829   /* instr[31]    = 0
4830      instr[30]    = first part (0)/ second part (1)
4831      instr[29,24] = 10 1110
4832      instr[23,22] = size: byte(00), half(01), word (10), long (11)
4833      instr[21]    = 1
4834      instr[20,16] = Vm
4835      instr[15,10] = 0100 01
4836      instr[9,5]   = Vn
4837      instr[4,0]   = Vd  */
4838
4839   unsigned full = INSTR (30, 30);
4840   unsigned vm = INSTR (20, 16);
4841   unsigned vn = INSTR (9, 5);
4842   unsigned vd = INSTR (4, 0);
4843   unsigned i;
4844   signed int shift;
4845
4846   NYI_assert (29, 24, 0x2E);
4847   NYI_assert (15, 10, 0x11);
4848
4849   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4850   switch (INSTR (23, 22))
4851     {
4852     case 0:
4853         for (i = 0; i < (full ? 16 : 8); i++)
4854           {
4855             shift = aarch64_get_vec_s8 (cpu, vm, i);
4856             if (shift >= 0)
4857               aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
4858                                   << shift);
4859             else
4860               aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
4861                                   >> - shift);
4862           }
4863       return;
4864
4865     case 1:
4866       for (i = 0; i < (full ? 8 : 4); i++)
4867         {
4868           shift = aarch64_get_vec_s8 (cpu, vm, i * 2);
4869           if (shift >= 0)
4870             aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
4871                                  << shift);
4872           else
4873             aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
4874                                  >> - shift);
4875         }
4876       return;
4877
4878     case 2:
4879       for (i = 0; i < (full ? 4 : 2); i++)
4880         {
4881           shift = aarch64_get_vec_s8 (cpu, vm, i * 4);
4882           if (shift >= 0)
4883             aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
4884                                  << shift);
4885           else
4886             aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
4887                                  >> - shift);
4888         }
4889       return;
4890
4891     case 3:
4892       if (! full)
4893         HALT_UNALLOC;
4894       for (i = 0; i < 2; i++)
4895         {
4896           shift = aarch64_get_vec_s8 (cpu, vm, i * 8);
4897           if (shift >= 0)
4898             aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i)
4899                                  << shift);
4900           else
4901             aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i)
4902                                  >> - shift);
4903         }
4904       return;
4905     }
4906 }
4907
4908 static void
4909 do_vec_FMLA (sim_cpu *cpu)
4910 {
4911   /* instr[31]    = 0
4912      instr[30]    = full/half selector
4913      instr[29,23] = 0011100
4914      instr[22]    = size: 0=>float, 1=>double
4915      instr[21]    = 1
4916      instr[20,16] = Vn
4917      instr[15,10] = 1100 11
4918      instr[9,5]   = Vm
4919      instr[4.0]   = Vd.  */
4920
4921   unsigned vm = INSTR (20, 16);
4922   unsigned vn = INSTR (9, 5);
4923   unsigned vd = INSTR (4, 0);
4924   unsigned i;
4925   int      full = INSTR (30, 30);
4926
4927   NYI_assert (29, 23, 0x1C);
4928   NYI_assert (21, 21, 1);
4929   NYI_assert (15, 10, 0x33);
4930
4931   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4932   if (INSTR (22, 22))
4933     {
4934       if (! full)
4935         HALT_UNALLOC;
4936       for (i = 0; i < 2; i++)
4937         aarch64_set_vec_double (cpu, vd, i,
4938                                 aarch64_get_vec_double (cpu, vn, i) *
4939                                 aarch64_get_vec_double (cpu, vm, i) +
4940                                 aarch64_get_vec_double (cpu, vd, i));
4941     }
4942   else
4943     {
4944       for (i = 0; i < (full ? 4 : 2); i++)
4945         aarch64_set_vec_float (cpu, vd, i,
4946                                aarch64_get_vec_float (cpu, vn, i) *
4947                                aarch64_get_vec_float (cpu, vm, i) +
4948                                aarch64_get_vec_float (cpu, vd, i));
4949     }
4950 }
4951
4952 static void
4953 do_vec_max (sim_cpu *cpu)
4954 {
4955   /* instr[31]    = 0
4956      instr[30]    = full/half selector
4957      instr[29]    = SMAX (0) / UMAX (1)
4958      instr[28,24] = 0 1110
4959      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
4960      instr[21]    = 1
4961      instr[20,16] = Vn
4962      instr[15,10] = 0110 01
4963      instr[9,5]   = Vm
4964      instr[4.0]   = Vd.  */
4965
4966   unsigned vm = INSTR (20, 16);
4967   unsigned vn = INSTR (9, 5);
4968   unsigned vd = INSTR (4, 0);
4969   unsigned i;
4970   int      full = INSTR (30, 30);
4971
4972   NYI_assert (28, 24, 0x0E);
4973   NYI_assert (21, 21, 1);
4974   NYI_assert (15, 10, 0x19);
4975
4976   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4977   if (INSTR (29, 29))
4978     {
4979       switch (INSTR (23, 22))
4980         {
4981         case 0:
4982           for (i = 0; i < (full ? 16 : 8); i++)
4983             aarch64_set_vec_u8 (cpu, vd, i,
4984                                 aarch64_get_vec_u8 (cpu, vn, i)
4985                                 > aarch64_get_vec_u8 (cpu, vm, i)
4986                                 ? aarch64_get_vec_u8 (cpu, vn, i)
4987                                 : aarch64_get_vec_u8 (cpu, vm, i));
4988           return;
4989
4990         case 1:
4991           for (i = 0; i < (full ? 8 : 4); i++)
4992             aarch64_set_vec_u16 (cpu, vd, i,
4993                                  aarch64_get_vec_u16 (cpu, vn, i)
4994                                  > aarch64_get_vec_u16 (cpu, vm, i)
4995                                  ? aarch64_get_vec_u16 (cpu, vn, i)
4996                                  : aarch64_get_vec_u16 (cpu, vm, i));
4997           return;
4998
4999         case 2:
5000           for (i = 0; i < (full ? 4 : 2); i++)
5001             aarch64_set_vec_u32 (cpu, vd, i,
5002                                  aarch64_get_vec_u32 (cpu, vn, i)
5003                                  > aarch64_get_vec_u32 (cpu, vm, i)
5004                                  ? aarch64_get_vec_u32 (cpu, vn, i)
5005                                  : aarch64_get_vec_u32 (cpu, vm, i));
5006           return;
5007
5008         case 3:
5009           HALT_UNALLOC;
5010         }
5011     }
5012   else
5013     {
5014       switch (INSTR (23, 22))
5015         {
5016         case 0:
5017           for (i = 0; i < (full ? 16 : 8); i++)
5018             aarch64_set_vec_s8 (cpu, vd, i,
5019                                 aarch64_get_vec_s8 (cpu, vn, i)
5020                                 > aarch64_get_vec_s8 (cpu, vm, i)
5021                                 ? aarch64_get_vec_s8 (cpu, vn, i)
5022                                 : aarch64_get_vec_s8 (cpu, vm, i));
5023           return;
5024
5025         case 1:
5026           for (i = 0; i < (full ? 8 : 4); i++)
5027             aarch64_set_vec_s16 (cpu, vd, i,
5028                                  aarch64_get_vec_s16 (cpu, vn, i)
5029                                  > aarch64_get_vec_s16 (cpu, vm, i)
5030                                  ? aarch64_get_vec_s16 (cpu, vn, i)
5031                                  : aarch64_get_vec_s16 (cpu, vm, i));
5032           return;
5033
5034         case 2:
5035           for (i = 0; i < (full ? 4 : 2); i++)
5036             aarch64_set_vec_s32 (cpu, vd, i,
5037                                  aarch64_get_vec_s32 (cpu, vn, i)
5038                                  > aarch64_get_vec_s32 (cpu, vm, i)
5039                                  ? aarch64_get_vec_s32 (cpu, vn, i)
5040                                  : aarch64_get_vec_s32 (cpu, vm, i));
5041           return;
5042
5043         case 3:
5044           HALT_UNALLOC;
5045         }
5046     }
5047 }
5048
5049 static void
5050 do_vec_min (sim_cpu *cpu)
5051 {
5052   /* instr[31]    = 0
5053      instr[30]    = full/half selector
5054      instr[29]    = SMIN (0) / UMIN (1)
5055      instr[28,24] = 0 1110
5056      instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
5057      instr[21]    = 1
5058      instr[20,16] = Vn
5059      instr[15,10] = 0110 11
5060      instr[9,5]   = Vm
5061      instr[4.0]   = Vd.  */
5062
5063   unsigned vm = INSTR (20, 16);
5064   unsigned vn = INSTR (9, 5);
5065   unsigned vd = INSTR (4, 0);
5066   unsigned i;
5067   int      full = INSTR (30, 30);
5068
5069   NYI_assert (28, 24, 0x0E);
5070   NYI_assert (21, 21, 1);
5071   NYI_assert (15, 10, 0x1B);
5072
5073   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5074   if (INSTR (29, 29))
5075     {
5076       switch (INSTR (23, 22))
5077         {
5078         case 0:
5079           for (i = 0; i < (full ? 16 : 8); i++)
5080             aarch64_set_vec_u8 (cpu, vd, i,
5081                                 aarch64_get_vec_u8 (cpu, vn, i)
5082                                 < aarch64_get_vec_u8 (cpu, vm, i)
5083                                 ? aarch64_get_vec_u8 (cpu, vn, i)
5084                                 : aarch64_get_vec_u8 (cpu, vm, i));
5085           return;
5086
5087         case 1:
5088           for (i = 0; i < (full ? 8 : 4); i++)
5089             aarch64_set_vec_u16 (cpu, vd, i,
5090                                  aarch64_get_vec_u16 (cpu, vn, i)
5091                                  < aarch64_get_vec_u16 (cpu, vm, i)
5092                                  ? aarch64_get_vec_u16 (cpu, vn, i)
5093                                  : aarch64_get_vec_u16 (cpu, vm, i));
5094           return;
5095
5096         case 2:
5097           for (i = 0; i < (full ? 4 : 2); i++)
5098             aarch64_set_vec_u32 (cpu, vd, i,
5099                                  aarch64_get_vec_u32 (cpu, vn, i)
5100                                  < aarch64_get_vec_u32 (cpu, vm, i)
5101                                  ? aarch64_get_vec_u32 (cpu, vn, i)
5102                                  : aarch64_get_vec_u32 (cpu, vm, i));
5103           return;
5104
5105         case 3:
5106           HALT_UNALLOC;
5107         }
5108     }
5109   else
5110     {
5111       switch (INSTR (23, 22))
5112         {
5113         case 0:
5114           for (i = 0; i < (full ? 16 : 8); i++)
5115             aarch64_set_vec_s8 (cpu, vd, i,
5116                                 aarch64_get_vec_s8 (cpu, vn, i)
5117                                 < aarch64_get_vec_s8 (cpu, vm, i)
5118                                 ? aarch64_get_vec_s8 (cpu, vn, i)
5119                                 : aarch64_get_vec_s8 (cpu, vm, i));
5120           return;
5121
5122         case 1:
5123           for (i = 0; i < (full ? 8 : 4); i++)
5124             aarch64_set_vec_s16 (cpu, vd, i,
5125                                  aarch64_get_vec_s16 (cpu, vn, i)
5126                                  < aarch64_get_vec_s16 (cpu, vm, i)
5127                                  ? aarch64_get_vec_s16 (cpu, vn, i)
5128                                  : aarch64_get_vec_s16 (cpu, vm, i));
5129           return;
5130
5131         case 2:
5132           for (i = 0; i < (full ? 4 : 2); i++)
5133             aarch64_set_vec_s32 (cpu, vd, i,
5134                                  aarch64_get_vec_s32 (cpu, vn, i)
5135                                  < aarch64_get_vec_s32 (cpu, vm, i)
5136                                  ? aarch64_get_vec_s32 (cpu, vn, i)
5137                                  : aarch64_get_vec_s32 (cpu, vm, i));
5138           return;
5139
5140         case 3:
5141           HALT_UNALLOC;
5142         }
5143     }
5144 }
5145
5146 static void
5147 do_vec_sub_long (sim_cpu *cpu)
5148 {
5149   /* instr[31]    = 0
5150      instr[30]    = lower (0) / upper (1)
5151      instr[29]    = signed (0) / unsigned (1)
5152      instr[28,24] = 0 1110
5153      instr[23,22] = size: bytes (00), half (01), word (10)
5154      instr[21]    = 1
5155      insrt[20,16] = Vm
5156      instr[15,10] = 0010 00
5157      instr[9,5]   = Vn
5158      instr[4,0]   = V dest.  */
5159
5160   unsigned size = INSTR (23, 22);
5161   unsigned vm = INSTR (20, 16);
5162   unsigned vn = INSTR (9, 5);
5163   unsigned vd = INSTR (4, 0);
5164   unsigned bias = 0;
5165   unsigned i;
5166
5167   NYI_assert (28, 24, 0x0E);
5168   NYI_assert (21, 21, 1);
5169   NYI_assert (15, 10, 0x08);
5170
5171   if (size == 3)
5172     HALT_UNALLOC;
5173
5174   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5175   switch (INSTR (30, 29))
5176     {
5177     case 2: /* SSUBL2.  */
5178       bias = 2;
5179     case 0: /* SSUBL.  */
5180       switch (size)
5181         {
5182         case 0:
5183           bias *= 3;
5184           for (i = 0; i < 8; i++)
5185             aarch64_set_vec_s16 (cpu, vd, i,
5186                                  aarch64_get_vec_s8 (cpu, vn, i + bias)
5187                                  - aarch64_get_vec_s8 (cpu, vm, i + bias));
5188           break;
5189
5190         case 1:
5191           bias *= 2;
5192           for (i = 0; i < 4; i++)
5193             aarch64_set_vec_s32 (cpu, vd, i,
5194                                  aarch64_get_vec_s16 (cpu, vn, i + bias)
5195                                  - aarch64_get_vec_s16 (cpu, vm, i + bias));
5196           break;
5197
5198         case 2:
5199           for (i = 0; i < 2; i++)
5200             aarch64_set_vec_s64 (cpu, vd, i,
5201                                  aarch64_get_vec_s32 (cpu, vn, i + bias)
5202                                  - aarch64_get_vec_s32 (cpu, vm, i + bias));
5203           break;
5204
5205         default:
5206           HALT_UNALLOC;
5207         }
5208       break;
5209
5210     case 3: /* USUBL2.  */
5211       bias = 2;
5212     case 1: /* USUBL.  */
5213       switch (size)
5214         {
5215         case 0:
5216           bias *= 3;
5217           for (i = 0; i < 8; i++)
5218             aarch64_set_vec_u16 (cpu, vd, i,
5219                                  aarch64_get_vec_u8 (cpu, vn, i + bias)
5220                                  - aarch64_get_vec_u8 (cpu, vm, i + bias));
5221           break;
5222
5223         case 1:
5224           bias *= 2;
5225           for (i = 0; i < 4; i++)
5226             aarch64_set_vec_u32 (cpu, vd, i,
5227                                  aarch64_get_vec_u16 (cpu, vn, i + bias)
5228                                  - aarch64_get_vec_u16 (cpu, vm, i + bias));
5229           break;
5230
5231         case 2:
5232           for (i = 0; i < 2; i++)
5233             aarch64_set_vec_u64 (cpu, vd, i,
5234                                  aarch64_get_vec_u32 (cpu, vn, i + bias)
5235                                  - aarch64_get_vec_u32 (cpu, vm, i + bias));
5236           break;
5237
5238         default:
5239           HALT_UNALLOC;
5240         }
5241       break;
5242     }
5243 }
5244
5245 static void
5246 do_vec_ADDP (sim_cpu *cpu)
5247 {
5248   /* instr[31]    = 0
5249      instr[30]    = half(0)/full(1)
5250      instr[29,24] = 00 1110
5251      instr[23,22] = size: bytes (00), half (01), word (10), long (11)
5252      instr[21]    = 1
5253      insrt[20,16] = Vm
5254      instr[15,10] = 1011 11
5255      instr[9,5]   = Vn
5256      instr[4,0]   = V dest.  */
5257
5258   FRegister copy_vn;
5259   FRegister copy_vm;
5260   unsigned full = INSTR (30, 30);
5261   unsigned size = INSTR (23, 22);
5262   unsigned vm = INSTR (20, 16);
5263   unsigned vn = INSTR (9, 5);
5264   unsigned vd = INSTR (4, 0);
5265   unsigned i, range;
5266
5267   NYI_assert (29, 24, 0x0E);
5268   NYI_assert (21, 21, 1);
5269   NYI_assert (15, 10, 0x2F);
5270
5271   /* Make copies of the source registers in case vd == vn/vm.  */
5272   copy_vn = cpu->fr[vn];
5273   copy_vm = cpu->fr[vm];
5274
5275   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5276   switch (size)
5277     {
5278     case 0:
5279       range = full ? 8 : 4;
5280       for (i = 0; i < range; i++)
5281         {
5282           aarch64_set_vec_u8 (cpu, vd, i,
5283                               copy_vn.b[i * 2] + copy_vn.b[i * 2 + 1]);
5284           aarch64_set_vec_u8 (cpu, vd, i + range,
5285                               copy_vm.b[i * 2] + copy_vm.b[i * 2 + 1]);
5286         }
5287       return;
5288
5289     case 1:
5290       range = full ? 4 : 2;
5291       for (i = 0; i < range; i++)
5292         {
5293           aarch64_set_vec_u16 (cpu, vd, i,
5294                                copy_vn.h[i * 2] + copy_vn.h[i * 2 + 1]);
5295           aarch64_set_vec_u16 (cpu, vd, i + range,
5296                                copy_vm.h[i * 2] + copy_vm.h[i * 2 + 1]);
5297         }
5298       return;
5299
5300     case 2:
5301       range = full ? 2 : 1;
5302       for (i = 0; i < range; i++)
5303         {
5304           aarch64_set_vec_u32 (cpu, vd, i,
5305                                copy_vn.w[i * 2] + copy_vn.w[i * 2 + 1]);
5306           aarch64_set_vec_u32 (cpu, vd, i + range,
5307                                copy_vm.w[i * 2] + copy_vm.w[i * 2 + 1]);
5308         }
5309       return;
5310
5311     case 3:
5312       if (! full)
5313         HALT_UNALLOC;
5314       aarch64_set_vec_u64 (cpu, vd, 0, copy_vn.v[0] + copy_vn.v[1]);
5315       aarch64_set_vec_u64 (cpu, vd, 1, copy_vm.v[0] + copy_vm.v[1]);
5316       return;
5317     }
5318 }
5319
5320 static void
5321 do_vec_UMOV (sim_cpu *cpu)
5322 {
5323   /* instr[31]    = 0
5324      instr[30]    = 32-bit(0)/64-bit(1)
5325      instr[29,21] = 00 1110 000
5326      insrt[20,16] = size & index
5327      instr[15,10] = 0011 11
5328      instr[9,5]   = V source
5329      instr[4,0]   = R dest.  */
5330
5331   unsigned vs = INSTR (9, 5);
5332   unsigned rd = INSTR (4, 0);
5333   unsigned index;
5334
5335   NYI_assert (29, 21, 0x070);
5336   NYI_assert (15, 10, 0x0F);
5337
5338   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5339   if (INSTR (16, 16))
5340     {
5341       /* Byte transfer.  */
5342       index = INSTR (20, 17);
5343       aarch64_set_reg_u64 (cpu, rd, NO_SP,
5344                            aarch64_get_vec_u8 (cpu, vs, index));
5345     }
5346   else if (INSTR (17, 17))
5347     {
5348       index = INSTR (20, 18);
5349       aarch64_set_reg_u64 (cpu, rd, NO_SP,
5350                            aarch64_get_vec_u16 (cpu, vs, index));
5351     }
5352   else if (INSTR (18, 18))
5353     {
5354       index = INSTR (20, 19);
5355       aarch64_set_reg_u64 (cpu, rd, NO_SP,
5356                            aarch64_get_vec_u32 (cpu, vs, index));
5357     }
5358   else
5359     {
5360       if (INSTR (30, 30) != 1)
5361         HALT_UNALLOC;
5362
5363       index = INSTR (20, 20);
5364       aarch64_set_reg_u64 (cpu, rd, NO_SP,
5365                            aarch64_get_vec_u64 (cpu, vs, index));
5366     }
5367 }
5368
5369 static void
5370 do_vec_FABS (sim_cpu *cpu)
5371 {
5372   /* instr[31]    = 0
5373      instr[30]    = half(0)/full(1)
5374      instr[29,23] = 00 1110 1
5375      instr[22]    = float(0)/double(1)
5376      instr[21,16] = 10 0000
5377      instr[15,10] = 1111 10
5378      instr[9,5]   = Vn
5379      instr[4,0]   = Vd.  */
5380
5381   unsigned vn = INSTR (9, 5);
5382   unsigned vd = INSTR (4, 0);
5383   unsigned full = INSTR (30, 30);
5384   unsigned i;
5385
5386   NYI_assert (29, 23, 0x1D);
5387   NYI_assert (21, 10, 0x83E);
5388
5389   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5390   if (INSTR (22, 22))
5391     {
5392       if (! full)
5393         HALT_NYI;
5394
5395       for (i = 0; i < 2; i++)
5396         aarch64_set_vec_double (cpu, vd, i,
5397                                 fabs (aarch64_get_vec_double (cpu, vn, i)));
5398     }
5399   else
5400     {
5401       for (i = 0; i < (full ? 4 : 2); i++)
5402         aarch64_set_vec_float (cpu, vd, i,
5403                                fabsf (aarch64_get_vec_float (cpu, vn, i)));
5404     }
5405 }
5406
5407 static void
5408 do_vec_FCVTZS (sim_cpu *cpu)
5409 {
5410   /* instr[31]    = 0
5411      instr[30]    = half (0) / all (1)
5412      instr[29,23] = 00 1110 1
5413      instr[22]    = single (0) / double (1)
5414      instr[21,10] = 10 0001 1011 10
5415      instr[9,5]   = Rn
5416      instr[4,0]   = Rd.  */
5417
5418   unsigned rn = INSTR (9, 5);
5419   unsigned rd = INSTR (4, 0);
5420   unsigned full = INSTR (30, 30);
5421   unsigned i;
5422
5423   NYI_assert (31, 31, 0);
5424   NYI_assert (29, 23, 0x1D);
5425   NYI_assert (21, 10, 0x86E);
5426
5427   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5428   if (INSTR (22, 22))
5429     {
5430       if (! full)
5431         HALT_UNALLOC;
5432
5433       for (i = 0; i < 2; i++)
5434         aarch64_set_vec_s64 (cpu, rd, i,
5435                              (int64_t) aarch64_get_vec_double (cpu, rn, i));
5436     }
5437   else
5438     for (i = 0; i < (full ? 4 : 2); i++)
5439       aarch64_set_vec_s32 (cpu, rd, i,
5440                            (int32_t) aarch64_get_vec_float (cpu, rn, i));
5441 }
5442
5443 static void
5444 do_vec_REV64 (sim_cpu *cpu)
5445 {
5446   /* instr[31]    = 0
5447      instr[30]    = full/half
5448      instr[29,24] = 00 1110
5449      instr[23,22] = size
5450      instr[21,10] = 10 0000 0000 10
5451      instr[9,5]   = Rn
5452      instr[4,0]   = Rd.  */
5453
5454   unsigned rn = INSTR (9, 5);
5455   unsigned rd = INSTR (4, 0);
5456   unsigned size = INSTR (23, 22);
5457   unsigned full = INSTR (30, 30);
5458   unsigned i;
5459   FRegister val;
5460
5461   NYI_assert (29, 24, 0x0E);
5462   NYI_assert (21, 10, 0x802);
5463
5464   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5465   switch (size)
5466     {
5467     case 0:
5468       for (i = 0; i < (full ? 16 : 8); i++)
5469         val.b[i ^ 0x7] = aarch64_get_vec_u8 (cpu, rn, i);
5470       break;
5471
5472     case 1:
5473       for (i = 0; i < (full ? 8 : 4); i++)
5474         val.h[i ^ 0x3] = aarch64_get_vec_u16 (cpu, rn, i);
5475       break;
5476
5477     case 2:
5478       for (i = 0; i < (full ? 4 : 2); i++)
5479         val.w[i ^ 0x1] = aarch64_get_vec_u32 (cpu, rn, i);
5480       break;
5481
5482     case 3:
5483       HALT_UNALLOC;
5484     }
5485
5486   aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
5487   if (full)
5488     aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
5489 }
5490
5491 static void
5492 do_vec_REV16 (sim_cpu *cpu)
5493 {
5494   /* instr[31]    = 0
5495      instr[30]    = full/half
5496      instr[29,24] = 00 1110
5497      instr[23,22] = size
5498      instr[21,10] = 10 0000 0001 10
5499      instr[9,5]   = Rn
5500      instr[4,0]   = Rd.  */
5501
5502   unsigned rn = INSTR (9, 5);
5503   unsigned rd = INSTR (4, 0);
5504   unsigned size = INSTR (23, 22);
5505   unsigned full = INSTR (30, 30);
5506   unsigned i;
5507   FRegister val;
5508
5509   NYI_assert (29, 24, 0x0E);
5510   NYI_assert (21, 10, 0x806);
5511
5512   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5513   switch (size)
5514     {
5515     case 0:
5516       for (i = 0; i < (full ? 16 : 8); i++)
5517         val.b[i ^ 0x1] = aarch64_get_vec_u8 (cpu, rn, i);
5518       break;
5519
5520     default:
5521       HALT_UNALLOC;
5522     }
5523
5524   aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
5525   if (full)
5526     aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
5527 }
5528
5529 static void
5530 do_vec_op1 (sim_cpu *cpu)
5531 {
5532   /* instr[31]    = 0
5533      instr[30]    = half/full
5534      instr[29,24] = 00 1110
5535      instr[23,21] = ???
5536      instr[20,16] = Vm
5537      instr[15,10] = sub-opcode
5538      instr[9,5]   = Vn
5539      instr[4,0]   = Vd  */
5540   NYI_assert (29, 24, 0x0E);
5541
5542   if (INSTR (21, 21) == 0)
5543     {
5544       if (INSTR (23, 22) == 0)
5545         {
5546           if (INSTR (30, 30) == 1
5547               && INSTR (17, 14) == 0
5548               && INSTR (12, 10) == 7)
5549             return do_vec_ins_2 (cpu);
5550
5551           switch (INSTR (15, 10))
5552             {
5553             case 0x01: do_vec_DUP_vector_into_vector (cpu); return;
5554             case 0x03: do_vec_DUP_scalar_into_vector (cpu); return;
5555             case 0x07: do_vec_INS (cpu); return;
5556             case 0x0A: do_vec_TRN (cpu); return;
5557
5558             case 0x0F:
5559               if (INSTR (17, 16) == 0)
5560                 {
5561                   do_vec_MOV_into_scalar (cpu);
5562                   return;
5563                 }
5564               break;
5565
5566             case 0x00:
5567             case 0x08:
5568             case 0x10:
5569             case 0x18:
5570               do_vec_TBL (cpu); return;
5571
5572             case 0x06:
5573             case 0x16:
5574               do_vec_UZP (cpu); return;
5575
5576             case 0x0E:
5577             case 0x1E:
5578               do_vec_ZIP (cpu); return;
5579
5580             default:
5581               HALT_NYI;
5582             }
5583         }
5584
5585       switch (INSTR (13, 10))
5586         {
5587         case 0x6: do_vec_UZP (cpu); return;
5588         case 0xE: do_vec_ZIP (cpu); return;
5589         case 0xA: do_vec_TRN (cpu); return;
5590         case 0xF: do_vec_UMOV (cpu); return;
5591         default:  HALT_NYI;
5592         }
5593     }
5594
5595   switch (INSTR (15, 10))
5596     {
5597     case 0x02: do_vec_REV64 (cpu); return;
5598     case 0x06: do_vec_REV16 (cpu); return;
5599
5600     case 0x07:
5601       switch (INSTR (23, 21))
5602         {
5603         case 1: do_vec_AND (cpu); return;
5604         case 3: do_vec_BIC (cpu); return;
5605         case 5: do_vec_ORR (cpu); return;
5606         case 7: do_vec_ORN (cpu); return;
5607         default: HALT_NYI;
5608         }
5609
5610     case 0x08: do_vec_sub_long (cpu); return;
5611     case 0x0a: do_vec_XTN (cpu); return;
5612     case 0x11: do_vec_SSHL (cpu); return;
5613     case 0x19: do_vec_max (cpu); return;
5614     case 0x1B: do_vec_min (cpu); return;
5615     case 0x21: do_vec_add (cpu); return;
5616     case 0x25: do_vec_MLA (cpu); return;
5617     case 0x27: do_vec_mul (cpu); return;
5618     case 0x2F: do_vec_ADDP (cpu); return;
5619     case 0x30: do_vec_mull (cpu); return;
5620     case 0x33: do_vec_FMLA (cpu); return;
5621     case 0x35: do_vec_fadd (cpu); return;
5622
5623     case 0x2E:
5624       switch (INSTR (20, 16))
5625         {
5626         case 0x00: do_vec_ABS (cpu); return;
5627         case 0x01: do_vec_FCVTZS (cpu); return;
5628         case 0x11: do_vec_ADDV (cpu); return;
5629         default: HALT_NYI;
5630         }
5631
5632     case 0x31:
5633     case 0x3B:
5634       do_vec_Fminmax (cpu); return;
5635
5636     case 0x0D:
5637     case 0x0F:
5638     case 0x22:
5639     case 0x23:
5640     case 0x26:
5641     case 0x2A:
5642     case 0x32:
5643     case 0x36:
5644     case 0x39:
5645     case 0x3A:
5646       do_vec_compare (cpu); return;
5647
5648     case 0x3E:
5649       do_vec_FABS (cpu); return;
5650
5651     default:
5652       HALT_NYI;
5653     }
5654 }
5655
5656 static void
5657 do_vec_xtl (sim_cpu *cpu)
5658 {
5659   /* instr[31]    = 0
5660      instr[30,29] = SXTL (00), UXTL (01), SXTL2 (10), UXTL2 (11)
5661      instr[28,22] = 0 1111 00
5662      instr[21,16] = size & shift (USHLL, SSHLL, USHLL2, SSHLL2)
5663      instr[15,10] = 1010 01
5664      instr[9,5]   = V source
5665      instr[4,0]   = V dest.  */
5666
5667   unsigned vs = INSTR (9, 5);
5668   unsigned vd = INSTR (4, 0);
5669   unsigned i, shift, bias = 0;
5670
5671   NYI_assert (28, 22, 0x3C);
5672   NYI_assert (15, 10, 0x29);
5673
5674   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5675   switch (INSTR (30, 29))
5676     {
5677     case 2: /* SXTL2, SSHLL2.  */
5678       bias = 2;
5679     case 0: /* SXTL, SSHLL.  */
5680       if (INSTR (21, 21))
5681         {
5682           int64_t val1, val2;
5683
5684           shift = INSTR (20, 16);
5685           /* Get the source values before setting the destination values
5686              in case the source and destination are the same.  */
5687           val1 = aarch64_get_vec_s32 (cpu, vs, bias) << shift;
5688           val2 = aarch64_get_vec_s32 (cpu, vs, bias + 1) << shift;
5689           aarch64_set_vec_s64 (cpu, vd, 0, val1);
5690           aarch64_set_vec_s64 (cpu, vd, 1, val2);
5691         }
5692       else if (INSTR (20, 20))
5693         {
5694           int32_t v[4];
5695           int32_t v1,v2,v3,v4;
5696
5697           shift = INSTR (19, 16);
5698           bias *= 2;
5699           for (i = 0; i < 4; i++)
5700             v[i] = aarch64_get_vec_s16 (cpu, vs, bias + i) << shift;
5701           for (i = 0; i < 4; i++)
5702             aarch64_set_vec_s32 (cpu, vd, i, v[i]);
5703         }
5704       else
5705         {
5706           int16_t v[8];
5707           NYI_assert (19, 19, 1);
5708
5709           shift = INSTR (18, 16);
5710           bias *= 3;
5711           for (i = 0; i < 8; i++)
5712             v[i] = aarch64_get_vec_s8 (cpu, vs, i + bias) << shift;
5713           for (i = 0; i < 8; i++)
5714             aarch64_set_vec_s16 (cpu, vd, i, v[i]);
5715         }
5716       return;
5717
5718     case 3: /* UXTL2, USHLL2.  */
5719       bias = 2;
5720     case 1: /* UXTL, USHLL.  */
5721       if (INSTR (21, 21))
5722         {
5723           uint64_t v1, v2;
5724           shift = INSTR (20, 16);
5725           v1 = aarch64_get_vec_u32 (cpu, vs, bias) << shift;
5726           v2 = aarch64_get_vec_u32 (cpu, vs, bias + 1) << shift;
5727           aarch64_set_vec_u64 (cpu, vd, 0, v1);
5728           aarch64_set_vec_u64 (cpu, vd, 1, v2);
5729         }
5730       else if (INSTR (20, 20))
5731         {
5732           uint32_t v[4];
5733           shift = INSTR (19, 16);
5734           bias *= 2;
5735           for (i = 0; i < 4; i++)
5736             v[i] = aarch64_get_vec_u16 (cpu, vs, i + bias) << shift;
5737           for (i = 0; i < 4; i++)
5738             aarch64_set_vec_u32 (cpu, vd, i, v[i]);
5739         }
5740       else
5741         {
5742           uint16_t v[8];
5743           NYI_assert (19, 19, 1);
5744
5745           shift = INSTR (18, 16);
5746           bias *= 3;
5747           for (i = 0; i < 8; i++)
5748             v[i] = aarch64_get_vec_u8 (cpu, vs, i + bias) << shift;
5749           for (i = 0; i < 8; i++)
5750             aarch64_set_vec_u16 (cpu, vd, i, v[i]);
5751         }
5752       return;
5753     }
5754 }
5755
5756 static void
5757 do_vec_SHL (sim_cpu *cpu)
5758 {
5759   /* instr [31]    = 0
5760      instr [30]    = half(0)/full(1)
5761      instr [29,23] = 001 1110
5762      instr [22,16] = size and shift amount
5763      instr [15,10] = 01 0101
5764      instr [9, 5]  = Vs
5765      instr [4, 0]  = Vd.  */
5766
5767   int shift;
5768   int full    = INSTR (30, 30);
5769   unsigned vs = INSTR (9, 5);
5770   unsigned vd = INSTR (4, 0);
5771   unsigned i;
5772
5773   NYI_assert (29, 23, 0x1E);
5774   NYI_assert (15, 10, 0x15);
5775
5776   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5777   if (INSTR (22, 22))
5778     {
5779       shift = INSTR (21, 16);
5780
5781       if (full == 0)
5782         HALT_UNALLOC;
5783
5784       for (i = 0; i < 2; i++)
5785         {
5786           uint64_t val = aarch64_get_vec_u64 (cpu, vs, i);
5787           aarch64_set_vec_u64 (cpu, vd, i, val << shift);
5788         }
5789
5790       return;
5791     }
5792
5793   if (INSTR (21, 21))
5794     {
5795       shift = INSTR (20, 16);
5796
5797       for (i = 0; i < (full ? 4 : 2); i++)
5798         {
5799           uint32_t val = aarch64_get_vec_u32 (cpu, vs, i);
5800           aarch64_set_vec_u32 (cpu, vd, i, val << shift);
5801         }
5802
5803       return;
5804     }
5805
5806   if (INSTR (20, 20))
5807     {
5808       shift = INSTR (19, 16);
5809
5810       for (i = 0; i < (full ? 8 : 4); i++)
5811         {
5812           uint16_t val = aarch64_get_vec_u16 (cpu, vs, i);
5813           aarch64_set_vec_u16 (cpu, vd, i, val << shift);
5814         }
5815
5816       return;
5817     }
5818
5819   if (INSTR (19, 19) == 0)
5820     HALT_UNALLOC;
5821
5822   shift = INSTR (18, 16);
5823
5824   for (i = 0; i < (full ? 16 : 8); i++)
5825     {
5826       uint8_t val = aarch64_get_vec_u8 (cpu, vs, i);
5827       aarch64_set_vec_u8 (cpu, vd, i, val << shift);
5828     }
5829 }
5830
5831 static void
5832 do_vec_SSHR_USHR (sim_cpu *cpu)
5833 {
5834   /* instr [31]    = 0
5835      instr [30]    = half(0)/full(1)
5836      instr [29]    = signed(0)/unsigned(1)
5837      instr [28,23] = 0 1111 0
5838      instr [22,16] = size and shift amount
5839      instr [15,10] = 0000 01
5840      instr [9, 5]  = Vs
5841      instr [4, 0]  = Vd.  */
5842
5843   int full       = INSTR (30, 30);
5844   int sign       = ! INSTR (29, 29);
5845   unsigned shift = INSTR (22, 16);
5846   unsigned vs    = INSTR (9, 5);
5847   unsigned vd    = INSTR (4, 0);
5848   unsigned i;
5849
5850   NYI_assert (28, 23, 0x1E);
5851   NYI_assert (15, 10, 0x01);
5852
5853   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5854   if (INSTR (22, 22))
5855     {
5856       shift = 128 - shift;
5857
5858       if (full == 0)
5859         HALT_UNALLOC;
5860
5861       if (sign)
5862         for (i = 0; i < 2; i++)
5863           {
5864             int64_t val = aarch64_get_vec_s64 (cpu, vs, i);
5865             aarch64_set_vec_s64 (cpu, vd, i, val >> shift);
5866           }
5867       else
5868         for (i = 0; i < 2; i++)
5869           {
5870             uint64_t val = aarch64_get_vec_u64 (cpu, vs, i);
5871             aarch64_set_vec_u64 (cpu, vd, i, val >> shift);
5872           }
5873
5874       return;
5875     }
5876
5877   if (INSTR (21, 21))
5878     {
5879       shift = 64 - shift;
5880
5881       if (sign)
5882         for (i = 0; i < (full ? 4 : 2); i++)
5883           {
5884             int32_t val = aarch64_get_vec_s32 (cpu, vs, i);
5885             aarch64_set_vec_s32 (cpu, vd, i, val >> shift);
5886           }
5887       else
5888         for (i = 0; i < (full ? 4 : 2); i++)
5889           {
5890             uint32_t val = aarch64_get_vec_u32 (cpu, vs, i);
5891             aarch64_set_vec_u32 (cpu, vd, i, val >> shift);
5892           }
5893
5894       return;
5895     }
5896
5897   if (INSTR (20, 20))
5898     {
5899       shift = 32 - shift;
5900
5901       if (sign)
5902         for (i = 0; i < (full ? 8 : 4); i++)
5903           {
5904             int16_t val = aarch64_get_vec_s16 (cpu, vs, i);
5905             aarch64_set_vec_s16 (cpu, vd, i, val >> shift);
5906           }
5907       else
5908         for (i = 0; i < (full ? 8 : 4); i++)
5909           {
5910             uint16_t val = aarch64_get_vec_u16 (cpu, vs, i);
5911             aarch64_set_vec_u16 (cpu, vd, i, val >> shift);
5912           }
5913
5914       return;
5915     }
5916
5917   if (INSTR (19, 19) == 0)
5918     HALT_UNALLOC;
5919
5920   shift = 16 - shift;
5921
5922   if (sign)
5923     for (i = 0; i < (full ? 16 : 8); i++)
5924       {
5925         int8_t val = aarch64_get_vec_s8 (cpu, vs, i);
5926         aarch64_set_vec_s8 (cpu, vd, i, val >> shift);
5927       }
5928   else
5929     for (i = 0; i < (full ? 16 : 8); i++)
5930       {
5931         uint8_t val = aarch64_get_vec_u8 (cpu, vs, i);
5932         aarch64_set_vec_u8 (cpu, vd, i, val >> shift);
5933       }
5934 }
5935
5936 static void
5937 do_vec_MUL_by_element (sim_cpu *cpu)
5938 {
5939   /* instr[31]    = 0
5940      instr[30]    = half/full
5941      instr[29,24] = 00 1111
5942      instr[23,22] = size
5943      instr[21]    = L
5944      instr[20]    = M
5945      instr[19,16] = m
5946      instr[15,12] = 1000
5947      instr[11]    = H
5948      instr[10]    = 0
5949      instr[9,5]   = Vn
5950      instr[4,0]   = Vd  */
5951
5952   unsigned full     = INSTR (30, 30);
5953   unsigned L        = INSTR (21, 21);
5954   unsigned H        = INSTR (11, 11);
5955   unsigned vn       = INSTR (9, 5);
5956   unsigned vd       = INSTR (4, 0);
5957   unsigned size     = INSTR (23, 22);
5958   unsigned index;
5959   unsigned vm;
5960   unsigned e;
5961
5962   NYI_assert (29, 24, 0x0F);
5963   NYI_assert (15, 12, 0x8);
5964   NYI_assert (10, 10, 0);
5965
5966   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5967   switch (size)
5968     {
5969     case 1:
5970       {
5971         /* 16 bit products.  */
5972         uint16_t product;
5973         uint16_t element1;
5974         uint16_t element2;
5975
5976         index = (H << 2) | (L << 1) | INSTR (20, 20);
5977         vm = INSTR (19, 16);
5978         element2 = aarch64_get_vec_u16 (cpu, vm, index);
5979
5980         for (e = 0; e < (full ? 8 : 4); e ++)
5981           {
5982             element1 = aarch64_get_vec_u16 (cpu, vn, e);
5983             product  = element1 * element2;
5984             aarch64_set_vec_u16 (cpu, vd, e, product);
5985           }
5986       }
5987       break;
5988
5989     case 2:
5990       {
5991         /* 32 bit products.  */
5992         uint32_t product;
5993         uint32_t element1;
5994         uint32_t element2;
5995
5996         index = (H << 1) | L;
5997         vm = INSTR (20, 16);
5998         element2 = aarch64_get_vec_u32 (cpu, vm, index);
5999
6000         for (e = 0; e < (full ? 4 : 2); e ++)
6001           {
6002             element1 = aarch64_get_vec_u32 (cpu, vn, e);
6003             product  = element1 * element2;
6004             aarch64_set_vec_u32 (cpu, vd, e, product);
6005           }
6006       }
6007       break;
6008
6009     default:
6010       HALT_UNALLOC;
6011     }
6012 }
6013
6014 static void
6015 do_FMLA_by_element (sim_cpu *cpu)
6016 {
6017   /* instr[31]    = 0
6018      instr[30]    = half/full
6019      instr[29,23] = 00 1111 1
6020      instr[22]    = size
6021      instr[21]    = L
6022      instr[20,16] = m
6023      instr[15,12] = 0001
6024      instr[11]    = H
6025      instr[10]    = 0
6026      instr[9,5]   = Vn
6027      instr[4,0]   = Vd  */
6028
6029   unsigned full     = INSTR (30, 30);
6030   unsigned size     = INSTR (22, 22);
6031   unsigned L        = INSTR (21, 21);
6032   unsigned vm       = INSTR (20, 16);
6033   unsigned H        = INSTR (11, 11);
6034   unsigned vn       = INSTR (9, 5);
6035   unsigned vd       = INSTR (4, 0);
6036   unsigned e;
6037
6038   NYI_assert (29, 23, 0x1F);
6039   NYI_assert (15, 12, 0x1);
6040   NYI_assert (10, 10, 0);
6041
6042   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6043   if (size)
6044     {
6045       double element1, element2;
6046
6047       if (! full || L)
6048         HALT_UNALLOC;
6049
6050       element2 = aarch64_get_vec_double (cpu, vm, H);
6051
6052       for (e = 0; e < 2; e++)
6053         {
6054           element1 = aarch64_get_vec_double (cpu, vn, e);
6055           element1 *= element2;
6056           element1 += aarch64_get_vec_double (cpu, vd, e);
6057           aarch64_set_vec_double (cpu, vd, e, element1);
6058         }
6059     }
6060   else
6061     {
6062       float element1;
6063       float element2 = aarch64_get_vec_float (cpu, vm, (H << 1) | L);
6064
6065       for (e = 0; e < (full ? 4 : 2); e++)
6066         {
6067           element1 = aarch64_get_vec_float (cpu, vn, e);
6068           element1 *= element2;
6069           element1 += aarch64_get_vec_float (cpu, vd, e);
6070           aarch64_set_vec_float (cpu, vd, e, element1);
6071         }
6072     }
6073 }
6074
6075 static void
6076 do_vec_op2 (sim_cpu *cpu)
6077 {
6078   /* instr[31]    = 0
6079      instr[30]    = half/full
6080      instr[29,24] = 00 1111
6081      instr[23]    = ?
6082      instr[22,16] = element size & index
6083      instr[15,10] = sub-opcode
6084      instr[9,5]   = Vm
6085      instr[4,0]   = Vd  */
6086
6087   NYI_assert (29, 24, 0x0F);
6088
6089   if (INSTR (23, 23) != 0)
6090     {
6091       switch (INSTR (15, 10))
6092         {
6093         case 0x04:
6094         case 0x06:
6095           do_FMLA_by_element (cpu);
6096           return;
6097
6098         case 0x20:
6099         case 0x22:
6100           do_vec_MUL_by_element (cpu);
6101           return;
6102
6103         default:
6104           HALT_NYI;
6105         }
6106     }
6107   else
6108     {
6109       switch (INSTR (15, 10))
6110         {
6111         case 0x01: do_vec_SSHR_USHR (cpu); return;
6112         case 0x15: do_vec_SHL (cpu); return;
6113         case 0x20:
6114         case 0x22: do_vec_MUL_by_element (cpu); return;
6115         case 0x29: do_vec_xtl (cpu); return;
6116         default:   HALT_NYI;
6117         }
6118     }
6119 }
6120
6121 static void
6122 do_vec_neg (sim_cpu *cpu)
6123 {
6124   /* instr[31]    = 0
6125      instr[30]    = full(1)/half(0)
6126      instr[29,24] = 10 1110
6127      instr[23,22] = size: byte(00), half (01), word (10), long (11)
6128      instr[21,10] = 1000 0010 1110
6129      instr[9,5]   = Vs
6130      instr[4,0]   = Vd  */
6131
6132   int    full = INSTR (30, 30);
6133   unsigned vs = INSTR (9, 5);
6134   unsigned vd = INSTR (4, 0);
6135   unsigned i;
6136
6137   NYI_assert (29, 24, 0x2E);
6138   NYI_assert (21, 10, 0x82E);
6139
6140   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6141   switch (INSTR (23, 22))
6142     {
6143     case 0:
6144       for (i = 0; i < (full ? 16 : 8); i++)
6145         aarch64_set_vec_s8 (cpu, vd, i, - aarch64_get_vec_s8 (cpu, vs, i));
6146       return;
6147
6148     case 1:
6149       for (i = 0; i < (full ? 8 : 4); i++)
6150         aarch64_set_vec_s16 (cpu, vd, i, - aarch64_get_vec_s16 (cpu, vs, i));
6151       return;
6152
6153     case 2:
6154       for (i = 0; i < (full ? 4 : 2); i++)
6155         aarch64_set_vec_s32 (cpu, vd, i, - aarch64_get_vec_s32 (cpu, vs, i));
6156       return;
6157
6158     case 3:
6159       if (! full)
6160         HALT_NYI;
6161       for (i = 0; i < 2; i++)
6162         aarch64_set_vec_s64 (cpu, vd, i, - aarch64_get_vec_s64 (cpu, vs, i));
6163       return;
6164     }
6165 }
6166
6167 static void
6168 do_vec_sqrt (sim_cpu *cpu)
6169 {
6170   /* instr[31]    = 0
6171      instr[30]    = full(1)/half(0)
6172      instr[29,23] = 101 1101
6173      instr[22]    = single(0)/double(1)
6174      instr[21,10] = 1000 0111 1110
6175      instr[9,5]   = Vs
6176      instr[4,0]   = Vd.  */
6177
6178   int    full = INSTR (30, 30);
6179   unsigned vs = INSTR (9, 5);
6180   unsigned vd = INSTR (4, 0);
6181   unsigned i;
6182
6183   NYI_assert (29, 23, 0x5B);
6184   NYI_assert (21, 10, 0x87E);
6185
6186   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6187   if (INSTR (22, 22) == 0)
6188     for (i = 0; i < (full ? 4 : 2); i++)
6189       aarch64_set_vec_float (cpu, vd, i,
6190                              sqrtf (aarch64_get_vec_float (cpu, vs, i)));
6191   else
6192     for (i = 0; i < 2; i++)
6193       aarch64_set_vec_double (cpu, vd, i,
6194                               sqrt (aarch64_get_vec_double (cpu, vs, i)));
6195 }
6196
6197 static void
6198 do_vec_mls_indexed (sim_cpu *cpu)
6199 {
6200   /* instr[31]       = 0
6201      instr[30]       = half(0)/full(1)
6202      instr[29,24]    = 10 1111
6203      instr[23,22]    = 16-bit(01)/32-bit(10)
6204      instr[21,20+11] = index (if 16-bit)
6205      instr[21+11]    = index (if 32-bit)
6206      instr[20,16]    = Vm
6207      instr[15,12]    = 0100
6208      instr[11]       = part of index
6209      instr[10]       = 0
6210      instr[9,5]      = Vs
6211      instr[4,0]      = Vd.  */
6212
6213   int    full = INSTR (30, 30);
6214   unsigned vs = INSTR (9, 5);
6215   unsigned vd = INSTR (4, 0);
6216   unsigned vm = INSTR (20, 16);
6217   unsigned i;
6218
6219   NYI_assert (15, 12, 4);
6220   NYI_assert (10, 10, 0);
6221
6222   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6223   switch (INSTR (23, 22))
6224     {
6225     case 1:
6226       {
6227         unsigned elem;
6228         uint32_t val;
6229
6230         if (vm > 15)
6231           HALT_NYI;
6232
6233         elem = (INSTR (21, 20) << 1) | INSTR (11, 11);
6234         val = aarch64_get_vec_u16 (cpu, vm, elem);
6235
6236         for (i = 0; i < (full ? 8 : 4); i++)
6237           aarch64_set_vec_u32 (cpu, vd, i,
6238                                aarch64_get_vec_u32 (cpu, vd, i) -
6239                                (aarch64_get_vec_u32 (cpu, vs, i) * val));
6240         return;
6241       }
6242
6243     case 2:
6244       {
6245         unsigned elem = (INSTR (21, 21) << 1) | INSTR (11, 11);
6246         uint64_t val = aarch64_get_vec_u32 (cpu, vm, elem);
6247
6248         for (i = 0; i < (full ? 4 : 2); i++)
6249           aarch64_set_vec_u64 (cpu, vd, i,
6250                                aarch64_get_vec_u64 (cpu, vd, i) -
6251                                (aarch64_get_vec_u64 (cpu, vs, i) * val));
6252         return;
6253       }
6254
6255     case 0:
6256     case 3:
6257     default:
6258       HALT_NYI;
6259     }
6260 }
6261
6262 static void
6263 do_vec_SUB (sim_cpu *cpu)
6264 {
6265   /* instr [31]    = 0
6266      instr [30]    = half(0)/full(1)
6267      instr [29,24] = 10 1110
6268      instr [23,22] = size: byte(00, half(01), word (10), long (11)
6269      instr [21]    = 1
6270      instr [20,16] = Vm
6271      instr [15,10] = 10 0001
6272      instr [9, 5]  = Vn
6273      instr [4, 0]  = Vd.  */
6274
6275   unsigned full = INSTR (30, 30);
6276   unsigned vm = INSTR (20, 16);
6277   unsigned vn = INSTR (9, 5);
6278   unsigned vd = INSTR (4, 0);
6279   unsigned i;
6280
6281   NYI_assert (29, 24, 0x2E);
6282   NYI_assert (21, 21, 1);
6283   NYI_assert (15, 10, 0x21);
6284
6285   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6286   switch (INSTR (23, 22))
6287     {
6288     case 0:
6289       for (i = 0; i < (full ? 16 : 8); i++)
6290         aarch64_set_vec_s8 (cpu, vd, i,
6291                             aarch64_get_vec_s8 (cpu, vn, i)
6292                             - aarch64_get_vec_s8 (cpu, vm, i));
6293       return;
6294
6295     case 1:
6296       for (i = 0; i < (full ? 8 : 4); i++)
6297         aarch64_set_vec_s16 (cpu, vd, i,
6298                              aarch64_get_vec_s16 (cpu, vn, i)
6299                              - aarch64_get_vec_s16 (cpu, vm, i));
6300       return;
6301
6302     case 2:
6303       for (i = 0; i < (full ? 4 : 2); i++)
6304         aarch64_set_vec_s32 (cpu, vd, i,
6305                              aarch64_get_vec_s32 (cpu, vn, i)
6306                              - aarch64_get_vec_s32 (cpu, vm, i));
6307       return;
6308
6309     case 3:
6310       if (full == 0)
6311         HALT_UNALLOC;
6312
6313       for (i = 0; i < 2; i++)
6314         aarch64_set_vec_s64 (cpu, vd, i,
6315                              aarch64_get_vec_s64 (cpu, vn, i)
6316                              - aarch64_get_vec_s64 (cpu, vm, i));
6317       return;
6318     }
6319 }
6320
6321 static void
6322 do_vec_MLS (sim_cpu *cpu)
6323 {
6324   /* instr [31]    = 0
6325      instr [30]    = half(0)/full(1)
6326      instr [29,24] = 10 1110
6327      instr [23,22] = size: byte(00, half(01), word (10)
6328      instr [21]    = 1
6329      instr [20,16] = Vm
6330      instr [15,10] = 10 0101
6331      instr [9, 5]  = Vn
6332      instr [4, 0]  = Vd.  */
6333
6334   unsigned full = INSTR (30, 30);
6335   unsigned vm = INSTR (20, 16);
6336   unsigned vn = INSTR (9, 5);
6337   unsigned vd = INSTR (4, 0);
6338   unsigned i;
6339
6340   NYI_assert (29, 24, 0x2E);
6341   NYI_assert (21, 21, 1);
6342   NYI_assert (15, 10, 0x25);
6343
6344   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6345   switch (INSTR (23, 22))
6346     {
6347     case 0:
6348       for (i = 0; i < (full ? 16 : 8); i++)
6349         aarch64_set_vec_u8 (cpu, vd, i,
6350                             aarch64_get_vec_u8 (cpu, vd, i)
6351                             - (aarch64_get_vec_u8 (cpu, vn, i)
6352                                * aarch64_get_vec_u8 (cpu, vm, i)));
6353       return;
6354
6355     case 1:
6356       for (i = 0; i < (full ? 8 : 4); i++)
6357         aarch64_set_vec_u16 (cpu, vd, i,
6358                              aarch64_get_vec_u16 (cpu, vd, i)
6359                              - (aarch64_get_vec_u16 (cpu, vn, i)
6360                                 * aarch64_get_vec_u16 (cpu, vm, i)));
6361       return;
6362
6363     case 2:
6364       for (i = 0; i < (full ? 4 : 2); i++)
6365         aarch64_set_vec_u32 (cpu, vd, i,
6366                              aarch64_get_vec_u32 (cpu, vd, i)
6367                              - (aarch64_get_vec_u32 (cpu, vn, i)
6368                                 * aarch64_get_vec_u32 (cpu, vm, i)));
6369       return;
6370
6371     default:
6372       HALT_UNALLOC;
6373     }
6374 }
6375
6376 static void
6377 do_vec_FDIV (sim_cpu *cpu)
6378 {
6379   /* instr [31]    = 0
6380      instr [30]    = half(0)/full(1)
6381      instr [29,23] = 10 1110 0
6382      instr [22]    = float()/double(1)
6383      instr [21]    = 1
6384      instr [20,16] = Vm
6385      instr [15,10] = 1111 11
6386      instr [9, 5]  = Vn
6387      instr [4, 0]  = Vd.  */
6388
6389   unsigned full = INSTR (30, 30);
6390   unsigned vm = INSTR (20, 16);
6391   unsigned vn = INSTR (9, 5);
6392   unsigned vd = INSTR (4, 0);
6393   unsigned i;
6394
6395   NYI_assert (29, 23, 0x5C);
6396   NYI_assert (21, 21, 1);
6397   NYI_assert (15, 10, 0x3F);
6398
6399   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6400   if (INSTR (22, 22))
6401     {
6402       if (! full)
6403         HALT_UNALLOC;
6404
6405       for (i = 0; i < 2; i++)
6406         aarch64_set_vec_double (cpu, vd, i,
6407                                 aarch64_get_vec_double (cpu, vn, i)
6408                                 / aarch64_get_vec_double (cpu, vm, i));
6409     }
6410   else
6411     for (i = 0; i < (full ? 4 : 2); i++)
6412       aarch64_set_vec_float (cpu, vd, i,
6413                              aarch64_get_vec_float (cpu, vn, i)
6414                              / aarch64_get_vec_float (cpu, vm, i));
6415 }
6416
6417 static void
6418 do_vec_FMUL (sim_cpu *cpu)
6419 {
6420   /* instr [31]    = 0
6421      instr [30]    = half(0)/full(1)
6422      instr [29,23] = 10 1110 0
6423      instr [22]    = float(0)/double(1)
6424      instr [21]    = 1
6425      instr [20,16] = Vm
6426      instr [15,10] = 1101 11
6427      instr [9, 5]  = Vn
6428      instr [4, 0]  = Vd.  */
6429
6430   unsigned full = INSTR (30, 30);
6431   unsigned vm = INSTR (20, 16);
6432   unsigned vn = INSTR (9, 5);
6433   unsigned vd = INSTR (4, 0);
6434   unsigned i;
6435
6436   NYI_assert (29, 23, 0x5C);
6437   NYI_assert (21, 21, 1);
6438   NYI_assert (15, 10, 0x37);
6439
6440   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6441   if (INSTR (22, 22))
6442     {
6443       if (! full)
6444         HALT_UNALLOC;
6445
6446       for (i = 0; i < 2; i++)
6447         aarch64_set_vec_double (cpu, vd, i,
6448                                 aarch64_get_vec_double (cpu, vn, i)
6449                                 * aarch64_get_vec_double (cpu, vm, i));
6450     }
6451   else
6452     for (i = 0; i < (full ? 4 : 2); i++)
6453       aarch64_set_vec_float (cpu, vd, i,
6454                              aarch64_get_vec_float (cpu, vn, i)
6455                              * aarch64_get_vec_float (cpu, vm, i));
6456 }
6457
6458 static void
6459 do_vec_FADDP (sim_cpu *cpu)
6460 {
6461   /* instr [31]    = 0
6462      instr [30]    = half(0)/full(1)
6463      instr [29,23] = 10 1110 0
6464      instr [22]    = float(0)/double(1)
6465      instr [21]    = 1
6466      instr [20,16] = Vm
6467      instr [15,10] = 1101 01
6468      instr [9, 5]  = Vn
6469      instr [4, 0]  = Vd.  */
6470
6471   unsigned full = INSTR (30, 30);
6472   unsigned vm = INSTR (20, 16);
6473   unsigned vn = INSTR (9, 5);
6474   unsigned vd = INSTR (4, 0);
6475
6476   NYI_assert (29, 23, 0x5C);
6477   NYI_assert (21, 21, 1);
6478   NYI_assert (15, 10, 0x35);
6479
6480   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6481   if (INSTR (22, 22))
6482     {
6483       /* Extract values before adding them incase vd == vn/vm.  */
6484       double tmp1 = aarch64_get_vec_double (cpu, vn, 0);
6485       double tmp2 = aarch64_get_vec_double (cpu, vn, 1);
6486       double tmp3 = aarch64_get_vec_double (cpu, vm, 0);
6487       double tmp4 = aarch64_get_vec_double (cpu, vm, 1);
6488
6489       if (! full)
6490         HALT_UNALLOC;
6491
6492       aarch64_set_vec_double (cpu, vd, 0, tmp1 + tmp2);
6493       aarch64_set_vec_double (cpu, vd, 1, tmp3 + tmp4);
6494     }
6495   else
6496     {
6497       /* Extract values before adding them incase vd == vn/vm.  */
6498       float tmp1 = aarch64_get_vec_float (cpu, vn, 0);
6499       float tmp2 = aarch64_get_vec_float (cpu, vn, 1);
6500       float tmp5 = aarch64_get_vec_float (cpu, vm, 0);
6501       float tmp6 = aarch64_get_vec_float (cpu, vm, 1);
6502
6503       if (full)
6504         {
6505           float tmp3 = aarch64_get_vec_float (cpu, vn, 2);
6506           float tmp4 = aarch64_get_vec_float (cpu, vn, 3);
6507           float tmp7 = aarch64_get_vec_float (cpu, vm, 2);
6508           float tmp8 = aarch64_get_vec_float (cpu, vm, 3);
6509
6510           aarch64_set_vec_float (cpu, vd, 0, tmp1 + tmp2);
6511           aarch64_set_vec_float (cpu, vd, 1, tmp3 + tmp4);
6512           aarch64_set_vec_float (cpu, vd, 2, tmp5 + tmp6);
6513           aarch64_set_vec_float (cpu, vd, 3, tmp7 + tmp8);
6514         }
6515       else
6516         {
6517           aarch64_set_vec_float (cpu, vd, 0, tmp1 + tmp2);
6518           aarch64_set_vec_float (cpu, vd, 1, tmp5 + tmp6);
6519         }
6520     }
6521 }
6522
6523 static void
6524 do_vec_FSQRT (sim_cpu *cpu)
6525 {
6526   /* instr[31]    = 0
6527      instr[30]    = half(0)/full(1)
6528      instr[29,23] = 10 1110 1
6529      instr[22]    = single(0)/double(1)
6530      instr[21,10] = 10 0001 1111 10
6531      instr[9,5]   = Vsrc
6532      instr[4,0]   = Vdest.  */
6533
6534   unsigned vn = INSTR (9, 5);
6535   unsigned vd = INSTR (4, 0);
6536   unsigned full = INSTR (30, 30);
6537   int i;
6538
6539   NYI_assert (29, 23, 0x5D);
6540   NYI_assert (21, 10, 0x87E);
6541
6542   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6543   if (INSTR (22, 22))
6544     {
6545       if (! full)
6546         HALT_UNALLOC;
6547
6548       for (i = 0; i < 2; i++)
6549         aarch64_set_vec_double (cpu, vd, i,
6550                                 sqrt (aarch64_get_vec_double (cpu, vn, i)));
6551     }
6552   else
6553     {
6554       for (i = 0; i < (full ? 4 : 2); i++)
6555         aarch64_set_vec_float (cpu, vd, i,
6556                                sqrtf (aarch64_get_vec_float (cpu, vn, i)));
6557     }
6558 }
6559
6560 static void
6561 do_vec_FNEG (sim_cpu *cpu)
6562 {
6563   /* instr[31]    = 0
6564      instr[30]    = half (0)/full (1)
6565      instr[29,23] = 10 1110 1
6566      instr[22]    = single (0)/double (1)
6567      instr[21,10] = 10 0000 1111 10
6568      instr[9,5]   = Vsrc
6569      instr[4,0]   = Vdest.  */
6570
6571   unsigned vn = INSTR (9, 5);
6572   unsigned vd = INSTR (4, 0);
6573   unsigned full = INSTR (30, 30);
6574   int i;
6575
6576   NYI_assert (29, 23, 0x5D);
6577   NYI_assert (21, 10, 0x83E);
6578
6579   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6580   if (INSTR (22, 22))
6581     {
6582       if (! full)
6583         HALT_UNALLOC;
6584
6585       for (i = 0; i < 2; i++)
6586         aarch64_set_vec_double (cpu, vd, i,
6587                                 - aarch64_get_vec_double (cpu, vn, i));
6588     }
6589   else
6590     {
6591       for (i = 0; i < (full ? 4 : 2); i++)
6592         aarch64_set_vec_float (cpu, vd, i,
6593                                - aarch64_get_vec_float (cpu, vn, i));
6594     }
6595 }
6596
6597 static void
6598 do_vec_NOT (sim_cpu *cpu)
6599 {
6600   /* instr[31]    = 0
6601      instr[30]    = half (0)/full (1)
6602      instr[29,10] = 10 1110 0010 0000 0101 10
6603      instr[9,5]   = Vn
6604      instr[4.0]   = Vd.  */
6605
6606   unsigned vn = INSTR (9, 5);
6607   unsigned vd = INSTR (4, 0);
6608   unsigned i;
6609   int      full = INSTR (30, 30);
6610
6611   NYI_assert (29, 10, 0xB8816);
6612
6613   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6614   for (i = 0; i < (full ? 16 : 8); i++)
6615     aarch64_set_vec_u8 (cpu, vd, i, ~ aarch64_get_vec_u8 (cpu, vn, i));
6616 }
6617
/* Count the leading zero bits of VAL when it is viewed as a SIZE-bit
   quantity.  Bits of VAL at or above position SIZE are ignored, and the
   result is SIZE when none of the low SIZE bits is set.  SIZE must lie
   in the range 1..64.  */

static unsigned int
clz (uint64_t val, unsigned size)
{
  /* Start at the most significant bit of the SIZE-bit field and walk
     down until a set bit is found or the probe falls off the bottom.  */
  uint64_t probe = (uint64_t) 1 << (size - 1);
  int      zeros = 0;

  while (probe != 0 && (val & probe) == 0)
    {
      probe >>= 1;
      zeros++;
    }

  return zeros;
}
6637
6638 static void
6639 do_vec_CLZ (sim_cpu *cpu)
6640 {
6641   /* instr[31]    = 0
6642      instr[30]    = half (0)/full (1)
6643      instr[29,24] = 10 1110
6644      instr[23,22] = size
6645      instr[21,10] = 10 0000 0100 10
6646      instr[9,5]   = Vn
6647      instr[4.0]   = Vd.  */
6648
6649   unsigned vn = INSTR (9, 5);
6650   unsigned vd = INSTR (4, 0);
6651   unsigned i;
6652   int      full = INSTR (30,30);
6653
6654   NYI_assert (29, 24, 0x2E);
6655   NYI_assert (21, 10, 0x812);
6656
6657   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6658   switch (INSTR (23, 22))
6659     {
6660     case 0:
6661       for (i = 0; i < (full ? 16 : 8); i++)
6662         aarch64_set_vec_u8 (cpu, vd, i, clz (aarch64_get_vec_u8 (cpu, vn, i), 8));
6663       break;
6664     case 1:
6665       for (i = 0; i < (full ? 8 : 4); i++)
6666         aarch64_set_vec_u16 (cpu, vd, i, clz (aarch64_get_vec_u16 (cpu, vn, i), 16));
6667       break;
6668     case 2:
6669       for (i = 0; i < (full ? 4 : 2); i++)
6670         aarch64_set_vec_u32 (cpu, vd, i, clz (aarch64_get_vec_u32 (cpu, vn, i), 32));
6671       break;
6672     case 3:
6673       if (! full)
6674         HALT_UNALLOC;
6675       aarch64_set_vec_u64 (cpu, vd, 0, clz (aarch64_get_vec_u64 (cpu, vn, 0), 64));
6676       aarch64_set_vec_u64 (cpu, vd, 1, clz (aarch64_get_vec_u64 (cpu, vn, 1), 64));
6677       break;
6678     }
6679 }
6680
6681 static void
6682 do_vec_MOV_element (sim_cpu *cpu)
6683 {
6684   /* instr[31,21] = 0110 1110 000
6685      instr[20,16] = size & dest index
6686      instr[15]    = 0
6687      instr[14,11] = source index
6688      instr[10]    = 1
6689      instr[9,5]   = Vs
6690      instr[4.0]   = Vd.  */
6691
6692   unsigned vs = INSTR (9, 5);
6693   unsigned vd = INSTR (4, 0);
6694   unsigned src_index;
6695   unsigned dst_index;
6696
6697   NYI_assert (31, 21, 0x370);
6698   NYI_assert (15, 15, 0);
6699   NYI_assert (10, 10, 1);
6700
6701   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6702   if (INSTR (16, 16))
6703     {
6704       /* Move a byte.  */
6705       src_index = INSTR (14, 11);
6706       dst_index = INSTR (20, 17);
6707       aarch64_set_vec_u8 (cpu, vd, dst_index,
6708                           aarch64_get_vec_u8 (cpu, vs, src_index));
6709     }
6710   else if (INSTR (17, 17))
6711     {
6712       /* Move 16-bits.  */
6713       NYI_assert (11, 11, 0);
6714       src_index = INSTR (14, 12);
6715       dst_index = INSTR (20, 18);
6716       aarch64_set_vec_u16 (cpu, vd, dst_index,
6717                            aarch64_get_vec_u16 (cpu, vs, src_index));
6718     }
6719   else if (INSTR (18, 18))
6720     {
6721       /* Move 32-bits.  */
6722       NYI_assert (12, 11, 0);
6723       src_index = INSTR (14, 13);
6724       dst_index = INSTR (20, 19);
6725       aarch64_set_vec_u32 (cpu, vd, dst_index,
6726                            aarch64_get_vec_u32 (cpu, vs, src_index));
6727     }
6728   else
6729     {
6730       NYI_assert (19, 19, 1);
6731       NYI_assert (13, 11, 0);
6732       src_index = INSTR (14, 14);
6733       dst_index = INSTR (20, 20);
6734       aarch64_set_vec_u64 (cpu, vd, dst_index,
6735                            aarch64_get_vec_u64 (cpu, vs, src_index));
6736     }
6737 }
6738
6739 static void
6740 do_vec_REV32 (sim_cpu *cpu)
6741 {
6742   /* instr[31]    = 0
6743      instr[30]    = full/half
6744      instr[29,24] = 10 1110
6745      instr[23,22] = size
6746      instr[21,10] = 10 0000 0000 10
6747      instr[9,5]   = Rn
6748      instr[4,0]   = Rd.  */
6749
6750   unsigned rn = INSTR (9, 5);
6751   unsigned rd = INSTR (4, 0);
6752   unsigned size = INSTR (23, 22);
6753   unsigned full = INSTR (30, 30);
6754   unsigned i;
6755   FRegister val;
6756
6757   NYI_assert (29, 24, 0x2E);
6758   NYI_assert (21, 10, 0x802);
6759
6760   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6761   switch (size)
6762     {
6763     case 0:
6764       for (i = 0; i < (full ? 16 : 8); i++)
6765         val.b[i ^ 0x3] = aarch64_get_vec_u8 (cpu, rn, i);
6766       break;
6767
6768     case 1:
6769       for (i = 0; i < (full ? 8 : 4); i++)
6770         val.h[i ^ 0x1] = aarch64_get_vec_u16 (cpu, rn, i);
6771       break;
6772
6773     default:
6774       HALT_UNALLOC;
6775     }
6776
6777   aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
6778   if (full)
6779     aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
6780 }
6781
6782 static void
6783 do_vec_EXT (sim_cpu *cpu)
6784 {
6785   /* instr[31]    = 0
6786      instr[30]    = full/half
6787      instr[29,21] = 10 1110 000
6788      instr[20,16] = Vm
6789      instr[15]    = 0
6790      instr[14,11] = source index
6791      instr[10]    = 0
6792      instr[9,5]   = Vn
6793      instr[4.0]   = Vd.  */
6794
6795   unsigned vm = INSTR (20, 16);
6796   unsigned vn = INSTR (9, 5);
6797   unsigned vd = INSTR (4, 0);
6798   unsigned src_index = INSTR (14, 11);
6799   unsigned full = INSTR (30, 30);
6800   unsigned i;
6801   unsigned j;
6802   FRegister val;
6803
6804   NYI_assert (31, 21, 0x370);
6805   NYI_assert (15, 15, 0);
6806   NYI_assert (10, 10, 0);
6807
6808   if (!full && (src_index & 0x8))
6809     HALT_UNALLOC;
6810
6811   j = 0;
6812
6813   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6814   for (i = src_index; i < (full ? 16 : 8); i++)
6815     val.b[j ++] = aarch64_get_vec_u8 (cpu, vn, i);
6816   for (i = 0; i < src_index; i++)
6817     val.b[j ++] = aarch64_get_vec_u8 (cpu, vm, i);
6818
6819   aarch64_set_vec_u64 (cpu, vd, 0, val.v[0]);
6820   if (full)
6821     aarch64_set_vec_u64 (cpu, vd, 1, val.v[1]);
6822 }
6823
6824 static void
6825 dexAdvSIMD0 (sim_cpu *cpu)
6826 {
6827   /* instr [28,25] = 0 111.  */
6828   if (    INSTR (15, 10) == 0x07
6829       && (INSTR (9, 5) ==
6830           INSTR (20, 16)))
6831     {
6832       if (INSTR (31, 21) == 0x075
6833           || INSTR (31, 21) == 0x275)
6834         {
6835           do_vec_MOV_whole_vector (cpu);
6836           return;
6837         }
6838     }
6839
6840   if (INSTR (29, 19) == 0x1E0)
6841     {
6842       do_vec_MOV_immediate (cpu);
6843       return;
6844     }
6845
6846   if (INSTR (29, 19) == 0x5E0)
6847     {
6848       do_vec_MVNI (cpu);
6849       return;
6850     }
6851
6852   if (INSTR (29, 19) == 0x1C0
6853       || INSTR (29, 19) == 0x1C1)
6854     {
6855       if (INSTR (15, 10) == 0x03)
6856         {
6857           do_vec_DUP_scalar_into_vector (cpu);
6858           return;
6859         }
6860     }
6861
6862   switch (INSTR (29, 24))
6863     {
6864     case 0x0E: do_vec_op1 (cpu); return;
6865     case 0x0F: do_vec_op2 (cpu); return;
6866
6867     case 0x2E:
6868       if (INSTR (21, 21) == 1)
6869         {
6870           switch (INSTR (15, 10))
6871             {
6872             case 0x02:
6873               do_vec_REV32 (cpu);
6874               return;
6875
6876             case 0x07:
6877               switch (INSTR (23, 22))
6878                 {
6879                 case 0: do_vec_EOR (cpu); return;
6880                 case 1: do_vec_BSL (cpu); return;
6881                 case 2:
6882                 case 3: do_vec_bit (cpu); return;
6883                 }
6884               break;
6885
6886             case 0x08: do_vec_sub_long (cpu); return;
6887             case 0x11: do_vec_USHL (cpu); return;
6888             case 0x12: do_vec_CLZ (cpu); return;
6889             case 0x16: do_vec_NOT (cpu); return;
6890             case 0x19: do_vec_max (cpu); return;
6891             case 0x1B: do_vec_min (cpu); return;
6892             case 0x21: do_vec_SUB (cpu); return;
6893             case 0x25: do_vec_MLS (cpu); return;
6894             case 0x31: do_vec_FminmaxNMP (cpu); return;
6895             case 0x35: do_vec_FADDP (cpu); return;
6896             case 0x37: do_vec_FMUL (cpu); return;
6897             case 0x3F: do_vec_FDIV (cpu); return;
6898
6899             case 0x3E:
6900               switch (INSTR (20, 16))
6901                 {
6902                 case 0x00: do_vec_FNEG (cpu); return;
6903                 case 0x01: do_vec_FSQRT (cpu); return;
6904                 default:   HALT_NYI;
6905                 }
6906
6907             case 0x0D:
6908             case 0x0F:
6909             case 0x22:
6910             case 0x23:
6911             case 0x26:
6912             case 0x2A:
6913             case 0x32:
6914             case 0x36:
6915             case 0x39:
6916             case 0x3A:
6917               do_vec_compare (cpu); return;
6918
6919             default:
6920               break;
6921             }
6922         }
6923
6924       if (INSTR (31, 21) == 0x370)
6925         {
6926           if (INSTR (10, 10))
6927             do_vec_MOV_element (cpu);
6928           else
6929             do_vec_EXT (cpu);
6930           return;
6931         }
6932
6933       switch (INSTR (21, 10))
6934         {
6935         case 0x82E: do_vec_neg (cpu); return;
6936         case 0x87E: do_vec_sqrt (cpu); return;
6937         default:
6938           if (INSTR (15, 10) == 0x30)
6939             {
6940               do_vec_mull (cpu);
6941               return;
6942             }
6943           break;
6944         }
6945       break;
6946
6947     case 0x2f:
6948       switch (INSTR (15, 10))
6949         {
6950         case 0x01: do_vec_SSHR_USHR (cpu); return;
6951         case 0x10:
6952         case 0x12: do_vec_mls_indexed (cpu); return;
6953         case 0x29: do_vec_xtl (cpu); return;
6954         default:
6955           HALT_NYI;
6956         }
6957
6958     default:
6959       break;
6960     }
6961
6962   HALT_NYI;
6963 }
6964
6965 /* 3 sources.  */
6966
6967 /* Float multiply add.  */
6968 static void
6969 fmadds (sim_cpu *cpu)
6970 {
6971   unsigned sa = INSTR (14, 10);
6972   unsigned sm = INSTR (20, 16);
6973   unsigned sn = INSTR ( 9,  5);
6974   unsigned sd = INSTR ( 4,  0);
6975
6976   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6977   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sa)
6978                         + aarch64_get_FP_float (cpu, sn)
6979                         * aarch64_get_FP_float (cpu, sm));
6980 }
6981
6982 /* Double multiply add.  */
6983 static void
6984 fmaddd (sim_cpu *cpu)
6985 {
6986   unsigned sa = INSTR (14, 10);
6987   unsigned sm = INSTR (20, 16);
6988   unsigned sn = INSTR ( 9,  5);
6989   unsigned sd = INSTR ( 4,  0);
6990
6991   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6992   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sa)
6993                          + aarch64_get_FP_double (cpu, sn)
6994                          * aarch64_get_FP_double (cpu, sm));
6995 }
6996
6997 /* Float multiply subtract.  */
6998 static void
6999 fmsubs (sim_cpu *cpu)
7000 {
7001   unsigned sa = INSTR (14, 10);
7002   unsigned sm = INSTR (20, 16);
7003   unsigned sn = INSTR ( 9,  5);
7004   unsigned sd = INSTR ( 4,  0);
7005
7006   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7007   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sa)
7008                         - aarch64_get_FP_float (cpu, sn)
7009                         * aarch64_get_FP_float (cpu, sm));
7010 }
7011
7012 /* Double multiply subtract.  */
7013 static void
7014 fmsubd (sim_cpu *cpu)
7015 {
7016   unsigned sa = INSTR (14, 10);
7017   unsigned sm = INSTR (20, 16);
7018   unsigned sn = INSTR ( 9,  5);
7019   unsigned sd = INSTR ( 4,  0);
7020
7021   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7022   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sa)
7023                          - aarch64_get_FP_double (cpu, sn)
7024                          * aarch64_get_FP_double (cpu, sm));
7025 }
7026
7027 /* Float negative multiply add.  */
7028 static void
7029 fnmadds (sim_cpu *cpu)
7030 {
7031   unsigned sa = INSTR (14, 10);
7032   unsigned sm = INSTR (20, 16);
7033   unsigned sn = INSTR ( 9,  5);
7034   unsigned sd = INSTR ( 4,  0);
7035
7036   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7037   aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sa)
7038                         + (- aarch64_get_FP_float (cpu, sn))
7039                         * aarch64_get_FP_float (cpu, sm));
7040 }
7041
7042 /* Double negative multiply add.  */
7043 static void
7044 fnmaddd (sim_cpu *cpu)
7045 {
7046   unsigned sa = INSTR (14, 10);
7047   unsigned sm = INSTR (20, 16);
7048   unsigned sn = INSTR ( 9,  5);
7049   unsigned sd = INSTR ( 4,  0);
7050
7051   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7052   aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sa)
7053                          + (- aarch64_get_FP_double (cpu, sn))
7054                          * aarch64_get_FP_double (cpu, sm));
7055 }
7056
7057 /* Float negative multiply subtract.  */
7058 static void
7059 fnmsubs (sim_cpu *cpu)
7060 {
7061   unsigned sa = INSTR (14, 10);
7062   unsigned sm = INSTR (20, 16);
7063   unsigned sn = INSTR ( 9,  5);
7064   unsigned sd = INSTR ( 4,  0);
7065
7066   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7067   aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sa)
7068                         + aarch64_get_FP_float (cpu, sn)
7069                         * aarch64_get_FP_float (cpu, sm));
7070 }
7071
7072 /* Double negative multiply subtract.  */
7073 static void
7074 fnmsubd (sim_cpu *cpu)
7075 {
7076   unsigned sa = INSTR (14, 10);
7077   unsigned sm = INSTR (20, 16);
7078   unsigned sn = INSTR ( 9,  5);
7079   unsigned sd = INSTR ( 4,  0);
7080
7081   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7082   aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sa)
7083                          + aarch64_get_FP_double (cpu, sn)
7084                          * aarch64_get_FP_double (cpu, sm));
7085 }
7086
7087 static void
7088 dexSimpleFPDataProc3Source (sim_cpu *cpu)
7089 {
7090   /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
7091      instr[30]    = 0
7092      instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
7093      instr[28,25] = 1111
7094      instr[24]    = 1
7095      instr[23,22] ==> type : 0 ==> single, 01 ==> double, 1x ==> UNALLOC
7096      instr[21]    ==> o1 : 0 ==> unnegated, 1 ==> negated
7097      instr[15]    ==> o2 : 0 ==> ADD, 1 ==> SUB  */
7098
7099   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
7100   /* dispatch on combined type:o1:o2.  */
7101   uint32_t dispatch = (INSTR (23, 21) << 1) | INSTR (15, 15);
7102
7103   if (M_S != 0)
7104     HALT_UNALLOC;
7105
7106   switch (dispatch)
7107     {
7108     case 0: fmadds (cpu); return;
7109     case 1: fmsubs (cpu); return;
7110     case 2: fnmadds (cpu); return;
7111     case 3: fnmsubs (cpu); return;
7112     case 4: fmaddd (cpu); return;
7113     case 5: fmsubd (cpu); return;
7114     case 6: fnmaddd (cpu); return;
7115     case 7: fnmsubd (cpu); return;
7116     default:
7117       /* type > 1 is currently unallocated.  */
7118       HALT_UNALLOC;
7119     }
7120 }
7121
7122 static void
7123 dexSimpleFPFixedConvert (sim_cpu *cpu)
7124 {
7125   HALT_NYI;
7126 }
7127
7128 static void
7129 dexSimpleFPCondCompare (sim_cpu *cpu)
7130 {
7131   /* instr [31,23] = 0001 1110 0
7132      instr [22]    = type
7133      instr [21]    = 1
7134      instr [20,16] = Rm
7135      instr [15,12] = condition
7136      instr [11,10] = 01
7137      instr [9,5]   = Rn
7138      instr [4]     = 0
7139      instr [3,0]   = nzcv  */
7140
7141   unsigned rm = INSTR (20, 16);
7142   unsigned rn = INSTR (9, 5);
7143
7144   NYI_assert (31, 23, 0x3C);
7145   NYI_assert (11, 10, 0x1);
7146   NYI_assert (4,  4,  0);
7147
7148   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7149   if (! testConditionCode (cpu, INSTR (15, 12)))
7150     {
7151       aarch64_set_CPSR (cpu, INSTR (3, 0));
7152       return;
7153     }
7154
7155   if (INSTR (22, 22))
7156     {
7157       /* Double precision.  */
7158       double val1 = aarch64_get_vec_double (cpu, rn, 0);
7159       double val2 = aarch64_get_vec_double (cpu, rm, 0);
7160
7161       /* FIXME: Check for NaNs.  */
7162       if (val1 == val2)
7163         aarch64_set_CPSR (cpu, (Z | C));
7164       else if (val1 < val2)
7165         aarch64_set_CPSR (cpu, N);
7166       else /* val1 > val2 */
7167         aarch64_set_CPSR (cpu, C);
7168     }
7169   else
7170     {
7171       /* Single precision.  */
7172       float val1 = aarch64_get_vec_float (cpu, rn, 0);
7173       float val2 = aarch64_get_vec_float (cpu, rm, 0);
7174
7175       /* FIXME: Check for NaNs.  */
7176       if (val1 == val2)
7177         aarch64_set_CPSR (cpu, (Z | C));
7178       else if (val1 < val2)
7179         aarch64_set_CPSR (cpu, N);
7180       else /* val1 > val2 */
7181         aarch64_set_CPSR (cpu, C);
7182     }
7183 }
7184
7185 /* 2 sources.  */
7186
7187 /* Float add.  */
7188 static void
7189 fadds (sim_cpu *cpu)
7190 {
7191   unsigned sm = INSTR (20, 16);
7192   unsigned sn = INSTR ( 9,  5);
7193   unsigned sd = INSTR ( 4,  0);
7194
7195   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7196   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7197                         + aarch64_get_FP_float (cpu, sm));
7198 }
7199
7200 /* Double add.  */
7201 static void
7202 faddd (sim_cpu *cpu)
7203 {
7204   unsigned sm = INSTR (20, 16);
7205   unsigned sn = INSTR ( 9,  5);
7206   unsigned sd = INSTR ( 4,  0);
7207
7208   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7209   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7210                          + aarch64_get_FP_double (cpu, sm));
7211 }
7212
7213 /* Float divide.  */
7214 static void
7215 fdivs (sim_cpu *cpu)
7216 {
7217   unsigned sm = INSTR (20, 16);
7218   unsigned sn = INSTR ( 9,  5);
7219   unsigned sd = INSTR ( 4,  0);
7220
7221   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7222   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7223                         / aarch64_get_FP_float (cpu, sm));
7224 }
7225
7226 /* Double divide.  */
7227 static void
7228 fdivd (sim_cpu *cpu)
7229 {
7230   unsigned sm = INSTR (20, 16);
7231   unsigned sn = INSTR ( 9,  5);
7232   unsigned sd = INSTR ( 4,  0);
7233
7234   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7235   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7236                          / aarch64_get_FP_double (cpu, sm));
7237 }
7238
7239 /* Float multiply.  */
7240 static void
7241 fmuls (sim_cpu *cpu)
7242 {
7243   unsigned sm = INSTR (20, 16);
7244   unsigned sn = INSTR ( 9,  5);
7245   unsigned sd = INSTR ( 4,  0);
7246
7247   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7248   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7249                         * aarch64_get_FP_float (cpu, sm));
7250 }
7251
7252 /* Double multiply.  */
7253 static void
7254 fmuld (sim_cpu *cpu)
7255 {
7256   unsigned sm = INSTR (20, 16);
7257   unsigned sn = INSTR ( 9,  5);
7258   unsigned sd = INSTR ( 4,  0);
7259
7260   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7261   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7262                          * aarch64_get_FP_double (cpu, sm));
7263 }
7264
7265 /* Float negate and multiply.  */
7266 static void
7267 fnmuls (sim_cpu *cpu)
7268 {
7269   unsigned sm = INSTR (20, 16);
7270   unsigned sn = INSTR ( 9,  5);
7271   unsigned sd = INSTR ( 4,  0);
7272
7273   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7274   aarch64_set_FP_float (cpu, sd, - (aarch64_get_FP_float (cpu, sn)
7275                                     * aarch64_get_FP_float (cpu, sm)));
7276 }
7277
7278 /* Double negate and multiply.  */
7279 static void
7280 fnmuld (sim_cpu *cpu)
7281 {
7282   unsigned sm = INSTR (20, 16);
7283   unsigned sn = INSTR ( 9,  5);
7284   unsigned sd = INSTR ( 4,  0);
7285
7286   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7287   aarch64_set_FP_double (cpu, sd, - (aarch64_get_FP_double (cpu, sn)
7288                                      * aarch64_get_FP_double (cpu, sm)));
7289 }
7290
7291 /* Float subtract.  */
7292 static void
7293 fsubs (sim_cpu *cpu)
7294 {
7295   unsigned sm = INSTR (20, 16);
7296   unsigned sn = INSTR ( 9,  5);
7297   unsigned sd = INSTR ( 4,  0);
7298
7299   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7300   aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7301                         - aarch64_get_FP_float (cpu, sm));
7302 }
7303
7304 /* Double subtract.  */
7305 static void
7306 fsubd (sim_cpu *cpu)
7307 {
7308   unsigned sm = INSTR (20, 16);
7309   unsigned sn = INSTR ( 9,  5);
7310   unsigned sd = INSTR ( 4,  0);
7311
7312   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7313   aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7314                          - aarch64_get_FP_double (cpu, sm));
7315 }
7316
7317 static void
7318 do_FMINNM (sim_cpu *cpu)
7319 {
7320   /* instr[31,23] = 0 0011 1100
7321      instr[22]    = float(0)/double(1)
7322      instr[21]    = 1
7323      instr[20,16] = Sm
7324      instr[15,10] = 01 1110
7325      instr[9,5]   = Sn
7326      instr[4,0]   = Cpu  */
7327
7328   unsigned sm = INSTR (20, 16);
7329   unsigned sn = INSTR ( 9,  5);
7330   unsigned sd = INSTR ( 4,  0);
7331
7332   NYI_assert (31, 23, 0x03C);
7333   NYI_assert (15, 10, 0x1E);
7334
7335   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7336   if (INSTR (22, 22))
7337     aarch64_set_FP_double (cpu, sd,
7338                            dminnm (aarch64_get_FP_double (cpu, sn),
7339                                    aarch64_get_FP_double (cpu, sm)));
7340   else
7341     aarch64_set_FP_float (cpu, sd,
7342                           fminnm (aarch64_get_FP_float (cpu, sn),
7343                                   aarch64_get_FP_float (cpu, sm)));
7344 }
7345
7346 static void
7347 do_FMAXNM (sim_cpu *cpu)
7348 {
7349   /* instr[31,23] = 0 0011 1100
7350      instr[22]    = float(0)/double(1)
7351      instr[21]    = 1
7352      instr[20,16] = Sm
7353      instr[15,10] = 01 1010
7354      instr[9,5]   = Sn
7355      instr[4,0]   = Cpu  */
7356
7357   unsigned sm = INSTR (20, 16);
7358   unsigned sn = INSTR ( 9,  5);
7359   unsigned sd = INSTR ( 4,  0);
7360
7361   NYI_assert (31, 23, 0x03C);
7362   NYI_assert (15, 10, 0x1A);
7363
7364   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7365   if (INSTR (22, 22))
7366     aarch64_set_FP_double (cpu, sd,
7367                            dmaxnm (aarch64_get_FP_double (cpu, sn),
7368                                    aarch64_get_FP_double (cpu, sm)));
7369   else
7370     aarch64_set_FP_float (cpu, sd,
7371                           fmaxnm (aarch64_get_FP_float (cpu, sn),
7372                                   aarch64_get_FP_float (cpu, sm)));
7373 }
7374
7375 static void
7376 dexSimpleFPDataProc2Source (sim_cpu *cpu)
7377 {
7378   /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
7379      instr[30]    = 0
7380      instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
7381      instr[28,25] = 1111
7382      instr[24]    = 0
7383      instr[23,22] ==> type : 0 ==> single, 01 ==> double, 1x ==> UNALLOC
7384      instr[21]    = 1
7385      instr[20,16] = Vm
7386      instr[15,12] ==> opcode : 0000 ==> FMUL, 0001 ==> FDIV
7387                                0010 ==> FADD, 0011 ==> FSUB,
7388                                0100 ==> FMAX, 0101 ==> FMIN
7389                                0110 ==> FMAXNM, 0111 ==> FMINNM
7390                                1000 ==> FNMUL, ow ==> UNALLOC
7391      instr[11,10] = 10
7392      instr[9,5]   = Vn
7393      instr[4,0]   = Vd  */
7394
7395   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
7396   uint32_t type = INSTR (23, 22);
7397   /* Dispatch on opcode.  */
7398   uint32_t dispatch = INSTR (15, 12);
7399
7400   if (type > 1)
7401     HALT_UNALLOC;
7402
7403   if (M_S != 0)
7404     HALT_UNALLOC;
7405
7406   if (type)
7407     switch (dispatch)
7408       {
7409       case 0: fmuld (cpu); return;
7410       case 1: fdivd (cpu); return;
7411       case 2: faddd (cpu); return;
7412       case 3: fsubd (cpu); return;
7413       case 6: do_FMAXNM (cpu); return;
7414       case 7: do_FMINNM (cpu); return;
7415       case 8: fnmuld (cpu); return;
7416
7417         /* Have not yet implemented fmax and fmin.  */
7418       case 4:
7419       case 5:
7420         HALT_NYI;
7421
7422       default:
7423         HALT_UNALLOC;
7424       }
7425   else /* type == 0 => floats.  */
7426     switch (dispatch)
7427       {
7428       case 0: fmuls (cpu); return;
7429       case 1: fdivs (cpu); return;
7430       case 2: fadds (cpu); return;
7431       case 3: fsubs (cpu); return;
7432       case 6: do_FMAXNM (cpu); return;
7433       case 7: do_FMINNM (cpu); return;
7434       case 8: fnmuls (cpu); return;
7435
7436       case 4:
7437       case 5:
7438         HALT_NYI;
7439
7440       default:
7441         HALT_UNALLOC;
7442       }
7443 }
7444
7445 static void
7446 dexSimpleFPCondSelect (sim_cpu *cpu)
7447 {
7448   /* FCSEL
7449      instr[31,23] = 0 0011 1100
7450      instr[22]    = 0=>single 1=>double
7451      instr[21]    = 1
7452      instr[20,16] = Sm
7453      instr[15,12] = cond
7454      instr[11,10] = 11
7455      instr[9,5]   = Sn
7456      instr[4,0]   = Cpu  */
7457   unsigned sm = INSTR (20, 16);
7458   unsigned sn = INSTR ( 9, 5);
7459   unsigned sd = INSTR ( 4, 0);
7460   uint32_t set = testConditionCode (cpu, INSTR (15, 12));
7461
7462   NYI_assert (31, 23, 0x03C);
7463   NYI_assert (11, 10, 0x3);
7464
7465   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7466   if (INSTR (22, 22))
7467     aarch64_set_FP_double (cpu, sd, (set ? aarch64_get_FP_double (cpu, sn)
7468                                      : aarch64_get_FP_double (cpu, sm)));
7469   else
7470     aarch64_set_FP_float (cpu, sd, (set ? aarch64_get_FP_float (cpu, sn)
7471                                     : aarch64_get_FP_float (cpu, sm)));
7472 }
7473
7474 /* Store 32 bit unscaled signed 9 bit.  */
7475 static void
7476 fsturs (sim_cpu *cpu, int32_t offset)
7477 {
7478   unsigned int rn = INSTR (9, 5);
7479   unsigned int st = INSTR (4, 0);
7480
7481   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7482   aarch64_set_mem_u32 (cpu, aarch64_get_reg_u64 (cpu, rn, 1) + offset,
7483                        aarch64_get_vec_u32 (cpu, st, 0));
7484 }
7485
7486 /* Store 64 bit unscaled signed 9 bit.  */
7487 static void
7488 fsturd (sim_cpu *cpu, int32_t offset)
7489 {
7490   unsigned int rn = INSTR (9, 5);
7491   unsigned int st = INSTR (4, 0);
7492
7493   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7494   aarch64_set_mem_u64 (cpu, aarch64_get_reg_u64 (cpu, rn, 1) + offset,
7495                        aarch64_get_vec_u64 (cpu, st, 0));
7496 }
7497
7498 /* Store 128 bit unscaled signed 9 bit.  */
7499 static void
7500 fsturq (sim_cpu *cpu, int32_t offset)
7501 {
7502   unsigned int rn = INSTR (9, 5);
7503   unsigned int st = INSTR (4, 0);
7504   FRegister a;
7505
7506   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7507   aarch64_get_FP_long_double (cpu, st, & a);
7508   aarch64_set_mem_long_double (cpu,
7509                                aarch64_get_reg_u64 (cpu, rn, 1)
7510                                + offset, a);
7511 }
7512
7513 /* TODO FP move register.  */
7514
7515 /* 32 bit fp to fp move register.  */
7516 static void
7517 ffmovs (sim_cpu *cpu)
7518 {
7519   unsigned int rn = INSTR (9, 5);
7520   unsigned int st = INSTR (4, 0);
7521
7522   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7523   aarch64_set_FP_float (cpu, st, aarch64_get_FP_float (cpu, rn));
7524 }
7525
7526 /* 64 bit fp to fp move register.  */
7527 static void
7528 ffmovd (sim_cpu *cpu)
7529 {
7530   unsigned int rn = INSTR (9, 5);
7531   unsigned int st = INSTR (4, 0);
7532
7533   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7534   aarch64_set_FP_double (cpu, st, aarch64_get_FP_double (cpu, rn));
7535 }
7536
7537 /* 32 bit GReg to Vec move register.  */
7538 static void
7539 fgmovs (sim_cpu *cpu)
7540 {
7541   unsigned int rn = INSTR (9, 5);
7542   unsigned int st = INSTR (4, 0);
7543
7544   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7545   aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_reg_u32 (cpu, rn, NO_SP));
7546 }
7547
7548 /* 64 bit g to fp move register.  */
7549 static void
7550 fgmovd (sim_cpu *cpu)
7551 {
7552   unsigned int rn = INSTR (9, 5);
7553   unsigned int st = INSTR (4, 0);
7554
7555   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7556   aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_reg_u64 (cpu, rn, NO_SP));
7557 }
7558
7559 /* 32 bit fp to g move register.  */
7560 static void
7561 gfmovs (sim_cpu *cpu)
7562 {
7563   unsigned int rn = INSTR (9, 5);
7564   unsigned int st = INSTR (4, 0);
7565
7566   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7567   aarch64_set_reg_u64 (cpu, st, NO_SP, aarch64_get_vec_u32 (cpu, rn, 0));
7568 }
7569
7570 /* 64 bit fp to g move register.  */
7571 static void
7572 gfmovd (sim_cpu *cpu)
7573 {
7574   unsigned int rn = INSTR (9, 5);
7575   unsigned int st = INSTR (4, 0);
7576
7577   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7578   aarch64_set_reg_u64 (cpu, st, NO_SP, aarch64_get_vec_u64 (cpu, rn, 0));
7579 }
7580
7581 /* FP move immediate
7582
7583    These install an immediate 8 bit value in the target register
7584    where the 8 bits comprise 1 sign bit, 4 bits of fraction and a 3
7585    bit exponent.  */
7586
7587 static void
7588 fmovs (sim_cpu *cpu)
7589 {
7590   unsigned int sd = INSTR (4, 0);
7591   uint32_t imm = INSTR (20, 13);
7592   float f = fp_immediate_for_encoding_32 (imm);
7593
7594   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7595   aarch64_set_FP_float (cpu, sd, f);
7596 }
7597
7598 static void
7599 fmovd (sim_cpu *cpu)
7600 {
7601   unsigned int sd = INSTR (4, 0);
7602   uint32_t imm = INSTR (20, 13);
7603   double d = fp_immediate_for_encoding_64 (imm);
7604
7605   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7606   aarch64_set_FP_double (cpu, sd, d);
7607 }
7608
7609 static void
7610 dexSimpleFPImmediate (sim_cpu *cpu)
7611 {
7612   /* instr[31,23] == 00111100
7613      instr[22]    == type : single(0)/double(1)
7614      instr[21]    == 1
7615      instr[20,13] == imm8
7616      instr[12,10] == 100
7617      instr[9,5]   == imm5 : 00000 ==> PK, ow ==> UNALLOC
7618      instr[4,0]   == Rd  */
7619   uint32_t imm5 = INSTR (9, 5);
7620
7621   NYI_assert (31, 23, 0x3C);
7622
7623   if (imm5 != 0)
7624     HALT_UNALLOC;
7625
7626   if (INSTR (22, 22))
7627     fmovd (cpu);
7628   else
7629     fmovs (cpu);
7630 }
7631
7632 /* TODO specific decode and execute for group Load Store.  */
7633
7634 /* TODO FP load/store single register (unscaled offset).  */
7635
7636 /* TODO load 8 bit unscaled signed 9 bit.  */
7637 /* TODO load 16 bit unscaled signed 9 bit.  */
7638
7639 /* Load 32 bit unscaled signed 9 bit.  */
7640 static void
7641 fldurs (sim_cpu *cpu, int32_t offset)
7642 {
7643   unsigned int rn = INSTR (9, 5);
7644   unsigned int st = INSTR (4, 0);
7645
7646   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7647   aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
7648                        (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
7649 }
7650
7651 /* Load 64 bit unscaled signed 9 bit.  */
7652 static void
7653 fldurd (sim_cpu *cpu, int32_t offset)
7654 {
7655   unsigned int rn = INSTR (9, 5);
7656   unsigned int st = INSTR (4, 0);
7657
7658   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7659   aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64
7660                        (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
7661 }
7662
7663 /* Load 128 bit unscaled signed 9 bit.  */
7664 static void
7665 fldurq (sim_cpu *cpu, int32_t offset)
7666 {
7667   unsigned int rn = INSTR (9, 5);
7668   unsigned int st = INSTR (4, 0);
7669   FRegister a;
7670   uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
7671
7672   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7673   aarch64_get_mem_long_double (cpu, addr, & a);
7674   aarch64_set_FP_long_double (cpu, st, a);
7675 }
7676
7677 /* TODO store 8 bit unscaled signed 9 bit.  */
7678 /* TODO store 16 bit unscaled signed 9 bit.  */
7679
7680
7681 /* 1 source.  */
7682
7683 /* Float absolute value.  */
7684 static void
7685 fabss (sim_cpu *cpu)
7686 {
7687   unsigned sn = INSTR (9, 5);
7688   unsigned sd = INSTR (4, 0);
7689   float value = aarch64_get_FP_float (cpu, sn);
7690
7691   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7692   aarch64_set_FP_float (cpu, sd, fabsf (value));
7693 }
7694
7695 /* Double absolute value.  */
7696 static void
7697 fabcpu (sim_cpu *cpu)
7698 {
7699   unsigned sn = INSTR (9, 5);
7700   unsigned sd = INSTR (4, 0);
7701   double value = aarch64_get_FP_double (cpu, sn);
7702
7703   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7704   aarch64_set_FP_double (cpu, sd, fabs (value));
7705 }
7706
7707 /* Float negative value.  */
7708 static void
7709 fnegs (sim_cpu *cpu)
7710 {
7711   unsigned sn = INSTR (9, 5);
7712   unsigned sd = INSTR (4, 0);
7713
7714   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7715   aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sn));
7716 }
7717
7718 /* Double negative value.  */
7719 static void
7720 fnegd (sim_cpu *cpu)
7721 {
7722   unsigned sn = INSTR (9, 5);
7723   unsigned sd = INSTR (4, 0);
7724
7725   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7726   aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sn));
7727 }
7728
7729 /* Float square root.  */
7730 static void
7731 fsqrts (sim_cpu *cpu)
7732 {
7733   unsigned sn = INSTR (9, 5);
7734   unsigned sd = INSTR (4, 0);
7735
7736   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7737   aarch64_set_FP_float (cpu, sd, sqrtf (aarch64_get_FP_float (cpu, sn)));
7738 }
7739
7740 /* Double square root.  */
7741 static void
7742 fsqrtd (sim_cpu *cpu)
7743 {
7744   unsigned sn = INSTR (9, 5);
7745   unsigned sd = INSTR (4, 0);
7746
7747   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7748   aarch64_set_FP_double (cpu, sd,
7749                          sqrt (aarch64_get_FP_double (cpu, sn)));
7750 }
7751
7752 /* Convert double to float.  */
7753 static void
7754 fcvtds (sim_cpu *cpu)
7755 {
7756   unsigned sn = INSTR (9, 5);
7757   unsigned sd = INSTR (4, 0);
7758
7759   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7760   aarch64_set_FP_float (cpu, sd, (float) aarch64_get_FP_double (cpu, sn));
7761 }
7762
7763 /* Convert float to double.  */
7764 static void
7765 fcvtcpu (sim_cpu *cpu)
7766 {
7767   unsigned sn = INSTR (9, 5);
7768   unsigned sd = INSTR (4, 0);
7769
7770   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7771   aarch64_set_FP_double (cpu, sd, (double) aarch64_get_FP_float (cpu, sn));
7772 }
7773
7774 static void
7775 do_FRINT (sim_cpu *cpu)
7776 {
7777   /* instr[31,23] = 0001 1110 0
7778      instr[22]    = single(0)/double(1)
7779      instr[21,18] = 1001
7780      instr[17,15] = rounding mode
7781      instr[14,10] = 10000
7782      instr[9,5]   = source
7783      instr[4,0]   = dest  */
7784
7785   float val;
7786   unsigned rs = INSTR (9, 5);
7787   unsigned rd = INSTR (4, 0);
7788   unsigned int rmode = INSTR (17, 15);
7789
7790   NYI_assert (31, 23, 0x03C);
7791   NYI_assert (21, 18, 0x9);
7792   NYI_assert (14, 10, 0x10);
7793
7794   if (rmode == 6 || rmode == 7)
7795     /* FIXME: Add support for rmode == 6 exactness check.  */
7796     rmode = uimm (aarch64_get_FPSR (cpu), 23, 22);
7797
7798   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7799   if (INSTR (22, 22))
7800     {
7801       double val = aarch64_get_FP_double (cpu, rs);
7802
7803       switch (rmode)
7804         {
7805         case 0: /* mode N: nearest or even.  */
7806           {
7807             double rval = round (val);
7808
7809             if (val - rval == 0.5)
7810               {
7811                 if (((rval / 2.0) * 2.0) != rval)
7812                   rval += 1.0;
7813               }
7814
7815             aarch64_set_FP_double (cpu, rd, round (val));
7816             return;
7817           }
7818
7819         case 1: /* mode P: towards +inf.  */
7820           if (val < 0.0)
7821             aarch64_set_FP_double (cpu, rd, trunc (val));
7822           else
7823             aarch64_set_FP_double (cpu, rd, round (val));
7824           return;
7825
7826         case 2: /* mode M: towards -inf.  */
7827           if (val < 0.0)
7828             aarch64_set_FP_double (cpu, rd, round (val));
7829           else
7830             aarch64_set_FP_double (cpu, rd, trunc (val));
7831           return;
7832
7833         case 3: /* mode Z: towards 0.  */
7834           aarch64_set_FP_double (cpu, rd, trunc (val));
7835           return;
7836
7837         case 4: /* mode A: away from 0.  */
7838           aarch64_set_FP_double (cpu, rd, round (val));
7839           return;
7840
7841         case 6: /* mode X: use FPCR with exactness check.  */
7842         case 7: /* mode I: use FPCR mode.  */
7843           HALT_NYI;
7844
7845         default:
7846           HALT_UNALLOC;
7847         }
7848     }
7849
7850   val = aarch64_get_FP_float (cpu, rs);
7851
7852   switch (rmode)
7853     {
7854     case 0: /* mode N: nearest or even.  */
7855       {
7856         float rval = roundf (val);
7857
7858         if (val - rval == 0.5)
7859           {
7860             if (((rval / 2.0) * 2.0) != rval)
7861               rval += 1.0;
7862           }
7863
7864         aarch64_set_FP_float (cpu, rd, rval);
7865         return;
7866       }
7867
7868     case 1: /* mode P: towards +inf.  */
7869       if (val < 0.0)
7870         aarch64_set_FP_float (cpu, rd, truncf (val));
7871       else
7872         aarch64_set_FP_float (cpu, rd, roundf (val));
7873       return;
7874
7875     case 2: /* mode M: towards -inf.  */
7876       if (val < 0.0)
7877         aarch64_set_FP_float (cpu, rd, truncf (val));
7878       else
7879         aarch64_set_FP_float (cpu, rd, roundf (val));
7880       return;
7881
7882     case 3: /* mode Z: towards 0.  */
7883       aarch64_set_FP_float (cpu, rd, truncf (val));
7884       return;
7885
7886     case 4: /* mode A: away from 0.  */
7887       aarch64_set_FP_float (cpu, rd, roundf (val));
7888       return;
7889
7890     case 6: /* mode X: use FPCR with exactness check.  */
7891     case 7: /* mode I: use FPCR mode.  */
7892       HALT_NYI;
7893
7894     default:
7895       HALT_UNALLOC;
7896     }
7897 }
7898
7899 /* Convert half to float.  */
7900 static void
7901 do_FCVT_half_to_single (sim_cpu *cpu)
7902 {
7903   unsigned rn = INSTR (9, 5);
7904   unsigned rd = INSTR (4, 0);
7905
7906   NYI_assert (31, 10, 0x7B890);
7907
7908   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7909   aarch64_set_FP_float (cpu, rd, (float) aarch64_get_FP_half  (cpu, rn));
7910 }
7911
7912 /* Convert half to double.  */
7913 static void
7914 do_FCVT_half_to_double (sim_cpu *cpu)
7915 {
7916   unsigned rn = INSTR (9, 5);
7917   unsigned rd = INSTR (4, 0);
7918
7919   NYI_assert (31, 10, 0x7B8B0);
7920
7921   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7922   aarch64_set_FP_double (cpu, rd, (double) aarch64_get_FP_half  (cpu, rn));
7923 }
7924
7925 static void
7926 do_FCVT_single_to_half (sim_cpu *cpu)
7927 {
7928   unsigned rn = INSTR (9, 5);
7929   unsigned rd = INSTR (4, 0);
7930
7931   NYI_assert (31, 10, 0x788F0);
7932
7933   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7934   aarch64_set_FP_half (cpu, rd, aarch64_get_FP_float  (cpu, rn));
7935 }
7936
7937 /* Convert double to half.  */
7938 static void
7939 do_FCVT_double_to_half (sim_cpu *cpu)
7940 {
7941   unsigned rn = INSTR (9, 5);
7942   unsigned rd = INSTR (4, 0);
7943
7944   NYI_assert (31, 10, 0x798F0);
7945
7946   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7947   aarch64_set_FP_half (cpu, rd, (float) aarch64_get_FP_double  (cpu, rn));
7948 }
7949
7950 static void
7951 dexSimpleFPDataProc1Source (sim_cpu *cpu)
7952 {
7953   /* instr[31]    ==> M : 0 ==> OK, 1 ==> UNALLOC
7954      instr[30]    = 0
7955      instr[29]    ==> S :  0 ==> OK, 1 ==> UNALLOC
7956      instr[28,25] = 1111
7957      instr[24]    = 0
7958      instr[23,22] ==> type : 00 ==> source is single,
7959                              01 ==> source is double
7960                              10 ==> UNALLOC
7961                              11 ==> UNALLOC or source is half
7962      instr[21]    = 1
7963      instr[20,15] ==> opcode : with type 00 or 01
7964                                000000 ==> FMOV, 000001 ==> FABS,
7965                                000010 ==> FNEG, 000011 ==> FSQRT,
7966                                000100 ==> UNALLOC, 000101 ==> FCVT,(to single/double)
7967                                000110 ==> UNALLOC, 000111 ==> FCVT (to half)
7968                                001000 ==> FRINTN, 001001 ==> FRINTP,
7969                                001010 ==> FRINTM, 001011 ==> FRINTZ,
7970                                001100 ==> FRINTA, 001101 ==> UNALLOC
7971                                001110 ==> FRINTX, 001111 ==> FRINTI
7972                                with type 11
7973                                000100 ==> FCVT (half-to-single)
7974                                000101 ==> FCVT (half-to-double)
7975                                instr[14,10] = 10000.  */
7976
7977   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
7978   uint32_t type   = INSTR (23, 22);
7979   uint32_t opcode = INSTR (20, 15);
7980
7981   if (M_S != 0)
7982     HALT_UNALLOC;
7983
7984   if (type == 3)
7985     {
7986       if (opcode == 4)
7987         do_FCVT_half_to_single (cpu);
7988       else if (opcode == 5)
7989         do_FCVT_half_to_double (cpu);
7990       else
7991         HALT_UNALLOC;
7992       return;
7993     }
7994
7995   if (type == 2)
7996     HALT_UNALLOC;
7997
7998   switch (opcode)
7999     {
8000     case 0:
8001       if (type)
8002         ffmovd (cpu);
8003       else
8004         ffmovs (cpu);
8005       return;
8006
8007     case 1:
8008       if (type)
8009         fabcpu (cpu);
8010       else
8011         fabss (cpu);
8012       return;
8013
8014     case 2:
8015       if (type)
8016         fnegd (cpu);
8017       else
8018         fnegs (cpu);
8019       return;
8020
8021     case 3:
8022       if (type)
8023         fsqrtd (cpu);
8024       else
8025         fsqrts (cpu);
8026       return;
8027
8028     case 4:
8029       if (type)
8030         fcvtds (cpu);
8031       else
8032         HALT_UNALLOC;
8033       return;
8034
8035     case 5:
8036       if (type)
8037         HALT_UNALLOC;
8038       fcvtcpu (cpu);
8039       return;
8040
8041     case 8:             /* FRINTN etc.  */
8042     case 9:
8043     case 10:
8044     case 11:
8045     case 12:
8046     case 14:
8047     case 15:
8048        do_FRINT (cpu);
8049        return;
8050
8051     case 7:
8052       if (INSTR (22, 22))
8053         do_FCVT_double_to_half (cpu);
8054       else
8055         do_FCVT_single_to_half (cpu);
8056       return;
8057
8058     case 13:
8059       HALT_NYI;
8060
8061     default:
8062       HALT_UNALLOC;
8063     }
8064 }
8065
8066 /* 32 bit signed int to float.  */
8067 static void
8068 scvtf32 (sim_cpu *cpu)
8069 {
8070   unsigned rn = INSTR (9, 5);
8071   unsigned sd = INSTR (4, 0);
8072
8073   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8074   aarch64_set_FP_float
8075     (cpu, sd, (float) aarch64_get_reg_s32 (cpu, rn, NO_SP));
8076 }
8077
8078 /* signed int to float.  */
8079 static void
8080 scvtf (sim_cpu *cpu)
8081 {
8082   unsigned rn = INSTR (9, 5);
8083   unsigned sd = INSTR (4, 0);
8084
8085   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8086   aarch64_set_FP_float
8087     (cpu, sd, (float) aarch64_get_reg_s64 (cpu, rn, NO_SP));
8088 }
8089
8090 /* 32 bit signed int to double.  */
8091 static void
8092 scvtd32 (sim_cpu *cpu)
8093 {
8094   unsigned rn = INSTR (9, 5);
8095   unsigned sd = INSTR (4, 0);
8096
8097   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8098   aarch64_set_FP_double
8099     (cpu, sd, (double) aarch64_get_reg_s32 (cpu, rn, NO_SP));
8100 }
8101
8102 /* signed int to double.  */
8103 static void
8104 scvtd (sim_cpu *cpu)
8105 {
8106   unsigned rn = INSTR (9, 5);
8107   unsigned sd = INSTR (4, 0);
8108
8109   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8110   aarch64_set_FP_double
8111     (cpu, sd, (double) aarch64_get_reg_s64 (cpu, rn, NO_SP));
8112 }
8113
/* Host integer limits converted to each floating point type.  The names
   follow the pattern <FTYPE>_<ITYPE>_{MAX,MIN} so that RAISE_EXCEPTIONS
   can build them by token pasting.  <limits.h> has no lower bounds for
   the unsigned types, so zero-valued ones are supplied here.  */
#define UINT_MIN 0
#define ULONG_MIN 0

/* Single precision bounds.  */
static const float  FLOAT_INT_MIN    = (float)  INT_MIN;
static const float  FLOAT_INT_MAX    = (float)  INT_MAX;
static const float  FLOAT_LONG_MIN   = (float)  LONG_MIN;
static const float  FLOAT_LONG_MAX   = (float)  LONG_MAX;
static const float  FLOAT_UINT_MIN   = (float)  UINT_MIN;
static const float  FLOAT_UINT_MAX   = (float)  UINT_MAX;
static const float  FLOAT_ULONG_MIN  = (float)  ULONG_MIN;
static const float  FLOAT_ULONG_MAX  = (float)  ULONG_MAX;

/* Double precision bounds.  */
static const double DOUBLE_INT_MIN   = (double) INT_MIN;
static const double DOUBLE_INT_MAX   = (double) INT_MAX;
static const double DOUBLE_LONG_MIN  = (double) LONG_MIN;
static const double DOUBLE_LONG_MAX  = (double) LONG_MAX;
static const double DOUBLE_UINT_MIN  = (double) UINT_MIN;
static const double DOUBLE_UINT_MAX  = (double) UINT_MAX;
static const double DOUBLE_ULONG_MIN = (double) ULONG_MIN;
static const double DOUBLE_ULONG_MAX = (double) ULONG_MAX;
8133
/* Check for FP exception conditions when converting the floating point
   value F to an integer, and fix up the already converted VALUE:
     NaN raises IO and yields zero
     Infinity raises IO and saturates value (+Inf to the maximum,
       -Inf to the minimum)
     Out of Range raises IO and IX and saturates value
     Denormal raises ID and IX and sets to zero
     Zero sets to zero
   An in-range normal F leaves VALUE untouched.

   F is the source value and is evaluated more than once, so it must be
   free of side effects.  VALUE is the lvalue holding the converted
   result.  FTYPE (FLOAT or DOUBLE) and ITYPE (INT, LONG, UINT or ULONG)
   are pasted together to select the <FTYPE>_<ITYPE>_{MAX,MIN} bounds and
   the <ITYPE>_{MAX,MIN} saturation values.  A variable named `cpu' must
   be in scope at the point of use.  */
#define RAISE_EXCEPTIONS(F, VALUE, FTYPE, ITYPE)                \
  do                                                            \
    {                                                           \
      switch (fpclassify (F))                                   \
        {                                                       \
        case FP_NAN:                                            \
          aarch64_set_FPSR (cpu, IO);                           \
          (VALUE) = 0;                                          \
          break;                                                \
                                                                \
        case FP_INFINITE:                                       \
          aarch64_set_FPSR (cpu, IO);                           \
          if (signbit (F))                                      \
            (VALUE) = ITYPE##_MIN;                              \
          else                                                  \
            (VALUE) = ITYPE##_MAX;                              \
          break;                                                \
                                                                \
        case FP_NORMAL:                                         \
          if ((F) >= FTYPE##_##ITYPE##_MAX)                     \
            {                                                   \
              aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX);    \
              (VALUE) = ITYPE##_MAX;                            \
            }                                                   \
          else if ((F) <= FTYPE##_##ITYPE##_MIN)                \
            {                                                   \
              aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX);    \
              (VALUE) = ITYPE##_MIN;                            \
            }                                                   \
          break;                                                \
                                                                \
        case FP_SUBNORMAL:                                      \
          aarch64_set_FPSR_bits (cpu, IO | IX | ID, IX | ID);   \
          (VALUE) = 0;                                          \
          break;                                                \
                                                                \
        default:                                                \
        case FP_ZERO:                                           \
          (VALUE) = 0;                                          \
          break;                                                \
        }                                                       \
    }                                                           \
  while (0)
8178
8179 /* 32 bit convert float to signed int truncate towards zero.  */
8180 static void
8181 fcvtszs32 (sim_cpu *cpu)
8182 {
8183   unsigned sn = INSTR (9, 5);
8184   unsigned rd = INSTR (4, 0);
8185   /* TODO : check that this rounds toward zero.  */
8186   float   f = aarch64_get_FP_float (cpu, sn);
8187   int32_t value = (int32_t) f;
8188
8189   RAISE_EXCEPTIONS (f, value, FLOAT, INT);
8190
8191   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8192   /* Avoid sign extension to 64 bit.  */
8193   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
8194 }
8195
8196 /* 64 bit convert float to signed int truncate towards zero.  */
8197 static void
8198 fcvtszs (sim_cpu *cpu)
8199 {
8200   unsigned sn = INSTR (9, 5);
8201   unsigned rd = INSTR (4, 0);
8202   float f = aarch64_get_FP_float (cpu, sn);
8203   int64_t value = (int64_t) f;
8204
8205   RAISE_EXCEPTIONS (f, value, FLOAT, LONG);
8206
8207   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8208   aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
8209 }
8210
8211 /* 32 bit convert double to signed int truncate towards zero.  */
8212 static void
8213 fcvtszd32 (sim_cpu *cpu)
8214 {
8215   unsigned sn = INSTR (9, 5);
8216   unsigned rd = INSTR (4, 0);
8217   /* TODO : check that this rounds toward zero.  */
8218   double   d = aarch64_get_FP_double (cpu, sn);
8219   int32_t  value = (int32_t) d;
8220
8221   RAISE_EXCEPTIONS (d, value, DOUBLE, INT);
8222
8223   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8224   /* Avoid sign extension to 64 bit.  */
8225   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
8226 }
8227
8228 /* 64 bit convert double to signed int truncate towards zero.  */
8229 static void
8230 fcvtszd (sim_cpu *cpu)
8231 {
8232   unsigned sn = INSTR (9, 5);
8233   unsigned rd = INSTR (4, 0);
8234   /* TODO : check that this rounds toward zero.  */
8235   double  d = aarch64_get_FP_double (cpu, sn);
8236   int64_t value;
8237
8238   value = (int64_t) d;
8239
8240   RAISE_EXCEPTIONS (d, value, DOUBLE, LONG);
8241
8242   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8243   aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
8244 }
8245
8246 static void
8247 do_fcvtzu (sim_cpu *cpu)
8248 {
8249   /* instr[31]    = size: 32-bit (0), 64-bit (1)
8250      instr[30,23] = 00111100
8251      instr[22]    = type: single (0)/ double (1)
8252      instr[21]    = enable (0)/disable(1) precision
8253      instr[20,16] = 11001
8254      instr[15,10] = precision
8255      instr[9,5]   = Rs
8256      instr[4,0]   = Rd.  */
8257
8258   unsigned rs = INSTR (9, 5);
8259   unsigned rd = INSTR (4, 0);
8260
8261   NYI_assert (30, 23, 0x3C);
8262   NYI_assert (20, 16, 0x19);
8263
8264   if (INSTR (21, 21) != 1)
8265     /* Convert to fixed point.  */
8266     HALT_NYI;
8267
8268   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8269   if (INSTR (31, 31))
8270     {
8271       /* Convert to unsigned 64-bit integer.  */
8272       if (INSTR (22, 22))
8273         {
8274           double  d = aarch64_get_FP_double (cpu, rs);
8275           uint64_t value = (uint64_t) d;
8276
8277           /* Do not raise an exception if we have reached ULONG_MAX.  */
8278           if (value != (1UL << 63))
8279             RAISE_EXCEPTIONS (d, value, DOUBLE, ULONG);
8280
8281           aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8282         }
8283       else
8284         {
8285           float  f = aarch64_get_FP_float (cpu, rs);
8286           uint64_t value = (uint64_t) f;
8287
8288           /* Do not raise an exception if we have reached ULONG_MAX.  */
8289           if (value != (1UL << 63))
8290             RAISE_EXCEPTIONS (f, value, FLOAT, ULONG);
8291
8292           aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8293         }
8294     }
8295   else
8296     {
8297       uint32_t value;
8298
8299       /* Convert to unsigned 32-bit integer.  */
8300       if (INSTR (22, 22))
8301         {
8302           double  d = aarch64_get_FP_double (cpu, rs);
8303
8304           value = (uint32_t) d;
8305           /* Do not raise an exception if we have reached UINT_MAX.  */
8306           if (value != (1UL << 31))
8307             RAISE_EXCEPTIONS (d, value, DOUBLE, UINT);
8308         }
8309       else
8310         {
8311           float  f = aarch64_get_FP_float (cpu, rs);
8312
8313           value = (uint32_t) f;
8314           /* Do not raise an exception if we have reached UINT_MAX.  */
8315           if (value != (1UL << 31))
8316             RAISE_EXCEPTIONS (f, value, FLOAT, UINT);
8317         }
8318
8319       aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8320     }
8321 }
8322
8323 static void
8324 do_UCVTF (sim_cpu *cpu)
8325 {
8326   /* instr[31]    = size: 32-bit (0), 64-bit (1)
8327      instr[30,23] = 001 1110 0
8328      instr[22]    = type: single (0)/ double (1)
8329      instr[21]    = enable (0)/disable(1) precision
8330      instr[20,16] = 0 0011
8331      instr[15,10] = precision
8332      instr[9,5]   = Rs
8333      instr[4,0]   = Rd.  */
8334
8335   unsigned rs = INSTR (9, 5);
8336   unsigned rd = INSTR (4, 0);
8337
8338   NYI_assert (30, 23, 0x3C);
8339   NYI_assert (20, 16, 0x03);
8340
8341   if (INSTR (21, 21) != 1)
8342     HALT_NYI;
8343
8344   /* FIXME: Add exception raising.  */
8345   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8346   if (INSTR (31, 31))
8347     {
8348       uint64_t value = aarch64_get_reg_u64 (cpu, rs, NO_SP);
8349
8350       if (INSTR (22, 22))
8351         aarch64_set_FP_double (cpu, rd, (double) value);
8352       else
8353         aarch64_set_FP_float (cpu, rd, (float) value);
8354     }
8355   else
8356     {
8357       uint32_t value =  aarch64_get_reg_u32 (cpu, rs, NO_SP);
8358
8359       if (INSTR (22, 22))
8360         aarch64_set_FP_double (cpu, rd, (double) value);
8361       else
8362         aarch64_set_FP_float (cpu, rd, (float) value);
8363     }
8364 }
8365
8366 static void
8367 float_vector_move (sim_cpu *cpu)
8368 {
8369   /* instr[31,17] == 100 1111 0101 0111
8370      instr[16]    ==> direction 0=> to GR, 1=> from GR
8371      instr[15,10] => ???
8372      instr[9,5]   ==> source
8373      instr[4,0]   ==> dest.  */
8374
8375   unsigned rn = INSTR (9, 5);
8376   unsigned rd = INSTR (4, 0);
8377
8378   NYI_assert (31, 17, 0x4F57);
8379
8380   if (INSTR (15, 10) != 0)
8381     HALT_UNALLOC;
8382
8383   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8384   if (INSTR (16, 16))
8385     aarch64_set_vec_u64 (cpu, rd, 1, aarch64_get_reg_u64 (cpu, rn, NO_SP));
8386   else
8387     aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, rn, 1));
8388 }
8389
8390 static void
8391 dexSimpleFPIntegerConvert (sim_cpu *cpu)
8392 {
8393   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
8394      instr[30     = 0
8395      instr[29]    = S :  0 ==> OK, 1 ==> UNALLOC
8396      instr[28,25] = 1111
8397      instr[24]    = 0
8398      instr[23,22] = type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
8399      instr[21]    = 1
8400      instr[20,19] = rmode
8401      instr[18,16] = opcode
8402      instr[15,10] = 10 0000  */
8403
8404   uint32_t rmode_opcode;
8405   uint32_t size_type;
8406   uint32_t type;
8407   uint32_t size;
8408   uint32_t S;
8409
8410   if (INSTR (31, 17) == 0x4F57)
8411     {
8412       float_vector_move (cpu);
8413       return;
8414     }
8415
8416   size = INSTR (31, 31);
8417   S = INSTR (29, 29);
8418   if (S != 0)
8419     HALT_UNALLOC;
8420
8421   type = INSTR (23, 22);
8422   if (type > 1)
8423     HALT_UNALLOC;
8424
8425   rmode_opcode = INSTR (20, 16);
8426   size_type = (size << 1) | type; /* 0==32f, 1==32d, 2==64f, 3==64d.  */
8427
8428   switch (rmode_opcode)
8429     {
8430     case 2:                     /* SCVTF.  */
8431       switch (size_type)
8432         {
8433         case 0: scvtf32 (cpu); return;
8434         case 1: scvtd32 (cpu); return;
8435         case 2: scvtf (cpu); return;
8436         case 3: scvtd (cpu); return;
8437         }
8438
8439     case 6:                     /* FMOV GR, Vec.  */
8440       switch (size_type)
8441         {
8442         case 0:  gfmovs (cpu); return;
8443         case 3:  gfmovd (cpu); return;
8444         default: HALT_UNALLOC;
8445         }
8446
8447     case 7:                     /* FMOV vec, GR.  */
8448       switch (size_type)
8449         {
8450         case 0:  fgmovs (cpu); return;
8451         case 3:  fgmovd (cpu); return;
8452         default: HALT_UNALLOC;
8453         }
8454
8455     case 24:                    /* FCVTZS.  */
8456       switch (size_type)
8457         {
8458         case 0: fcvtszs32 (cpu); return;
8459         case 1: fcvtszd32 (cpu); return;
8460         case 2: fcvtszs (cpu); return;
8461         case 3: fcvtszd (cpu); return;
8462         }
8463
8464     case 25: do_fcvtzu (cpu); return;
8465     case 3:  do_UCVTF (cpu); return;
8466
8467     case 0:     /* FCVTNS.  */
8468     case 1:     /* FCVTNU.  */
8469     case 4:     /* FCVTAS.  */
8470     case 5:     /* FCVTAU.  */
8471     case 8:     /* FCVPTS.  */
8472     case 9:     /* FCVTPU.  */
8473     case 16:    /* FCVTMS.  */
8474     case 17:    /* FCVTMU.  */
8475     default:
8476       HALT_NYI;
8477     }
8478 }
8479
8480 static void
8481 set_flags_for_float_compare (sim_cpu *cpu, float fvalue1, float fvalue2)
8482 {
8483   uint32_t flags;
8484
8485   /* FIXME: Add exception raising.  */
8486   if (isnan (fvalue1) || isnan (fvalue2))
8487     flags = C|V;
8488   else if (isinf (fvalue1) && isinf (fvalue2))
8489     {
8490       /* Subtracting two infinities may give a NaN.  We only need to compare
8491          the signs, which we can get from isinf.  */
8492       int result = isinf (fvalue1) - isinf (fvalue2);
8493
8494       if (result == 0)
8495         flags = Z|C;
8496       else if (result < 0)
8497         flags = N;
8498       else /* (result > 0).  */
8499         flags = C;
8500     }
8501   else
8502     {
8503       float result = fvalue1 - fvalue2;
8504
8505       if (result == 0.0)
8506         flags = Z|C;
8507       else if (result < 0)
8508         flags = N;
8509       else /* (result > 0).  */
8510         flags = C;
8511     }
8512
8513   aarch64_set_CPSR (cpu, flags);
8514 }
8515
8516 static void
8517 fcmps (sim_cpu *cpu)
8518 {
8519   unsigned sm = INSTR (20, 16);
8520   unsigned sn = INSTR ( 9,  5);
8521
8522   float fvalue1 = aarch64_get_FP_float (cpu, sn);
8523   float fvalue2 = aarch64_get_FP_float (cpu, sm);
8524
8525   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8526   set_flags_for_float_compare (cpu, fvalue1, fvalue2);
8527 }
8528
8529 /* Float compare to zero -- Invalid Operation exception
8530    only on signaling NaNs.  */
8531 static void
8532 fcmpzs (sim_cpu *cpu)
8533 {
8534   unsigned sn = INSTR ( 9,  5);
8535   float fvalue1 = aarch64_get_FP_float (cpu, sn);
8536
8537   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8538   set_flags_for_float_compare (cpu, fvalue1, 0.0f);
8539 }
8540
8541 /* Float compare -- Invalid Operation exception on all NaNs.  */
8542 static void
8543 fcmpes (sim_cpu *cpu)
8544 {
8545   unsigned sm = INSTR (20, 16);
8546   unsigned sn = INSTR ( 9,  5);
8547
8548   float fvalue1 = aarch64_get_FP_float (cpu, sn);
8549   float fvalue2 = aarch64_get_FP_float (cpu, sm);
8550
8551   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8552   set_flags_for_float_compare (cpu, fvalue1, fvalue2);
8553 }
8554
8555 /* Float compare to zero -- Invalid Operation exception on all NaNs.  */
8556 static void
8557 fcmpzes (sim_cpu *cpu)
8558 {
8559   unsigned sn = INSTR ( 9,  5);
8560   float fvalue1 = aarch64_get_FP_float (cpu, sn);
8561
8562   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8563   set_flags_for_float_compare (cpu, fvalue1, 0.0f);
8564 }
8565
8566 static void
8567 set_flags_for_double_compare (sim_cpu *cpu, double dval1, double dval2)
8568 {
8569   uint32_t flags;
8570
8571   /* FIXME: Add exception raising.  */
8572   if (isnan (dval1) || isnan (dval2))
8573     flags = C|V;
8574   else if (isinf (dval1) && isinf (dval2))
8575     {
8576       /* Subtracting two infinities may give a NaN.  We only need to compare
8577          the signs, which we can get from isinf.  */
8578       int result = isinf (dval1) - isinf (dval2);
8579
8580       if (result == 0)
8581         flags = Z|C;
8582       else if (result < 0)
8583         flags = N;
8584       else /* (result > 0).  */
8585         flags = C;
8586     }
8587   else
8588     {
8589       double result = dval1 - dval2;
8590
8591       if (result == 0.0)
8592         flags = Z|C;
8593       else if (result < 0)
8594         flags = N;
8595       else /* (result > 0).  */
8596         flags = C;
8597     }
8598
8599   aarch64_set_CPSR (cpu, flags);
8600 }
8601
8602 /* Double compare -- Invalid Operation exception only on signaling NaNs.  */
8603 static void
8604 fcmpd (sim_cpu *cpu)
8605 {
8606   unsigned sm = INSTR (20, 16);
8607   unsigned sn = INSTR ( 9,  5);
8608
8609   double dvalue1 = aarch64_get_FP_double (cpu, sn);
8610   double dvalue2 = aarch64_get_FP_double (cpu, sm);
8611
8612   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8613   set_flags_for_double_compare (cpu, dvalue1, dvalue2);
8614 }
8615
8616 /* Double compare to zero -- Invalid Operation exception
8617    only on signaling NaNs.  */
8618 static void
8619 fcmpzd (sim_cpu *cpu)
8620 {
8621   unsigned sn = INSTR ( 9,  5);
8622   double dvalue1 = aarch64_get_FP_double (cpu, sn);
8623
8624   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8625   set_flags_for_double_compare (cpu, dvalue1, 0.0);
8626 }
8627
8628 /* Double compare -- Invalid Operation exception on all NaNs.  */
8629 static void
8630 fcmped (sim_cpu *cpu)
8631 {
8632   unsigned sm = INSTR (20, 16);
8633   unsigned sn = INSTR ( 9,  5);
8634
8635   double dvalue1 = aarch64_get_FP_double (cpu, sn);
8636   double dvalue2 = aarch64_get_FP_double (cpu, sm);
8637
8638   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8639   set_flags_for_double_compare (cpu, dvalue1, dvalue2);
8640 }
8641
8642 /* Double compare to zero -- Invalid Operation exception on all NaNs.  */
8643 static void
8644 fcmpzed (sim_cpu *cpu)
8645 {
8646   unsigned sn = INSTR ( 9,  5);
8647   double dvalue1 = aarch64_get_FP_double (cpu, sn);
8648
8649   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8650   set_flags_for_double_compare (cpu, dvalue1, 0.0);
8651 }
8652
8653 static void
8654 dexSimpleFPCompare (sim_cpu *cpu)
8655 {
8656   /* assert instr[28,25] == 1111
8657      instr[30:24:21:13,10] = 0011000
8658      instr[31] = M : 0 ==> OK, 1 ==> UNALLOC
8659      instr[29] ==> S :  0 ==> OK, 1 ==> UNALLOC
8660      instr[23,22] ==> type : 0 ==> single, 01 ==> double, 1x ==> UNALLOC
8661      instr[15,14] ==> op : 00 ==> OK, ow ==> UNALLOC
8662      instr[4,0] ==> opcode2 : 00000 ==> FCMP, 10000 ==> FCMPE,
8663                               01000 ==> FCMPZ, 11000 ==> FCMPEZ,
8664                               ow ==> UNALLOC  */
8665   uint32_t dispatch;
8666   uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
8667   uint32_t type = INSTR (23, 22);
8668   uint32_t op = INSTR (15, 14);
8669   uint32_t op2_2_0 = INSTR (2, 0);
8670
8671   if (op2_2_0 != 0)
8672     HALT_UNALLOC;
8673
8674   if (M_S != 0)
8675     HALT_UNALLOC;
8676
8677   if (type > 1)
8678     HALT_UNALLOC;
8679
8680   if (op != 0)
8681     HALT_UNALLOC;
8682
8683   /* dispatch on type and top 2 bits of opcode.  */
8684   dispatch = (type << 2) | INSTR (4, 3);
8685
8686   switch (dispatch)
8687     {
8688     case 0: fcmps (cpu); return;
8689     case 1: fcmpzs (cpu); return;
8690     case 2: fcmpes (cpu); return;
8691     case 3: fcmpzes (cpu); return;
8692     case 4: fcmpd (cpu); return;
8693     case 5: fcmpzd (cpu); return;
8694     case 6: fcmped (cpu); return;
8695     case 7: fcmpzed (cpu); return;
8696     }
8697 }
8698
8699 static void
8700 do_scalar_FADDP (sim_cpu *cpu)
8701 {
8702   /* instr [31,23] = 0111 1110 0
8703      instr [22]    = single(0)/double(1)
8704      instr [21,10] = 11 0000 1101 10
8705      instr [9,5]   = Fn
8706      instr [4,0]   = Fd.  */
8707
8708   unsigned Fn = INSTR (9, 5);
8709   unsigned Fd = INSTR (4, 0);
8710
8711   NYI_assert (31, 23, 0x0FC);
8712   NYI_assert (21, 10, 0xC36);
8713
8714   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8715   if (INSTR (22, 22))
8716     {
8717       double val1 = aarch64_get_vec_double (cpu, Fn, 0);
8718       double val2 = aarch64_get_vec_double (cpu, Fn, 1);
8719
8720       aarch64_set_FP_double (cpu, Fd, val1 + val2);
8721     }
8722   else
8723     {
8724       float val1 = aarch64_get_vec_float (cpu, Fn, 0);
8725       float val2 = aarch64_get_vec_float (cpu, Fn, 1);
8726
8727       aarch64_set_FP_float (cpu, Fd, val1 + val2);
8728     }
8729 }
8730
8731 /* Floating point absolute difference.  */
8732
8733 static void
8734 do_scalar_FABD (sim_cpu *cpu)
8735 {
8736   /* instr [31,23] = 0111 1110 1
8737      instr [22]    = float(0)/double(1)
8738      instr [21]    = 1
8739      instr [20,16] = Rm
8740      instr [15,10] = 1101 01
8741      instr [9, 5]  = Rn
8742      instr [4, 0]  = Rd.  */
8743
8744   unsigned rm = INSTR (20, 16);
8745   unsigned rn = INSTR (9, 5);
8746   unsigned rd = INSTR (4, 0);
8747
8748   NYI_assert (31, 23, 0x0FD);
8749   NYI_assert (21, 21, 1);
8750   NYI_assert (15, 10, 0x35);
8751
8752   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8753   if (INSTR (22, 22))
8754     aarch64_set_FP_double (cpu, rd,
8755                            fabs (aarch64_get_FP_double (cpu, rn)
8756                                  - aarch64_get_FP_double (cpu, rm)));
8757   else
8758     aarch64_set_FP_float (cpu, rd,
8759                           fabsf (aarch64_get_FP_float (cpu, rn)
8760                                  - aarch64_get_FP_float (cpu, rm)));
8761 }
8762
8763 static void
8764 do_scalar_CMGT (sim_cpu *cpu)
8765 {
8766   /* instr [31,21] = 0101 1110 111
8767      instr [20,16] = Rm
8768      instr [15,10] = 00 1101
8769      instr [9, 5]  = Rn
8770      instr [4, 0]  = Rd.  */
8771
8772   unsigned rm = INSTR (20, 16);
8773   unsigned rn = INSTR (9, 5);
8774   unsigned rd = INSTR (4, 0);
8775
8776   NYI_assert (31, 21, 0x2F7);
8777   NYI_assert (15, 10, 0x0D);
8778
8779   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8780   aarch64_set_vec_u64 (cpu, rd, 0,
8781                        aarch64_get_vec_u64 (cpu, rn, 0) >
8782                        aarch64_get_vec_u64 (cpu, rm, 0) ? -1L : 0L);
8783 }
8784
8785 static void
8786 do_scalar_USHR (sim_cpu *cpu)
8787 {
8788   /* instr [31,23] = 0111 1111 0
8789      instr [22,16] = shift amount
8790      instr [15,10] = 0000 01
8791      instr [9, 5]  = Rn
8792      instr [4, 0]  = Rd.  */
8793
8794   unsigned amount = 128 - INSTR (22, 16);
8795   unsigned rn = INSTR (9, 5);
8796   unsigned rd = INSTR (4, 0);
8797
8798   NYI_assert (31, 23, 0x0FE);
8799   NYI_assert (15, 10, 0x01);
8800
8801   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8802   aarch64_set_vec_u64 (cpu, rd, 0,
8803                        aarch64_get_vec_u64 (cpu, rn, 0) >> amount);
8804 }
8805
8806 static void
8807 do_scalar_SSHL (sim_cpu *cpu)
8808 {
8809   /* instr [31,21] = 0101 1110 111
8810      instr [20,16] = Rm
8811      instr [15,10] = 0100 01
8812      instr [9, 5]  = Rn
8813      instr [4, 0]  = Rd.  */
8814
8815   unsigned rm = INSTR (20, 16);
8816   unsigned rn = INSTR (9, 5);
8817   unsigned rd = INSTR (4, 0);
8818   signed int shift = aarch64_get_vec_s8 (cpu, rm, 0);
8819
8820   NYI_assert (31, 21, 0x2F7);
8821   NYI_assert (15, 10, 0x11);
8822
8823   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8824   if (shift >= 0)
8825     aarch64_set_vec_s64 (cpu, rd, 0,
8826                          aarch64_get_vec_s64 (cpu, rn, 0) << shift);
8827   else
8828     aarch64_set_vec_s64 (cpu, rd, 0,
8829                          aarch64_get_vec_s64 (cpu, rn, 0) >> - shift);
8830 }
8831
8832 static void
8833 do_scalar_shift (sim_cpu *cpu)
8834 {
8835   /* instr [31,23] = 0101 1111 0
8836      instr [22,16] = shift amount
8837      instr [15,10] = 0101 01   [SHL]
8838      instr [15,10] = 0000 01   [SSHR]
8839      instr [9, 5]  = Rn
8840      instr [4, 0]  = Rd.  */
8841
8842   unsigned rn = INSTR (9, 5);
8843   unsigned rd = INSTR (4, 0);
8844   unsigned amount;
8845
8846   NYI_assert (31, 23, 0x0BE);
8847
8848   if (INSTR (22, 22) == 0)
8849     HALT_UNALLOC;
8850
8851   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8852   switch (INSTR (15, 10))
8853     {
8854     case 0x01: /* SSHR */
8855       amount = 128 - INSTR (22, 16);
8856       aarch64_set_vec_s64 (cpu, rd, 0,
8857                            aarch64_get_vec_s64 (cpu, rn, 0) >> amount);
8858       return;
8859     case 0x15: /* SHL */
8860       amount = INSTR (22, 16) - 64;
8861       aarch64_set_vec_u64 (cpu, rd, 0,
8862                            aarch64_get_vec_u64 (cpu, rn, 0) << amount);
8863       return;
8864     default:
8865       HALT_NYI;
8866     }
8867 }
8868
8869 /* FCMEQ FCMGT FCMGE.  */
8870 static void
8871 do_scalar_FCM (sim_cpu *cpu)
8872 {
8873   /* instr [31,30] = 01
8874      instr [29]    = U
8875      instr [28,24] = 1 1110
8876      instr [23]    = E
8877      instr [22]    = size
8878      instr [21]    = 1
8879      instr [20,16] = Rm
8880      instr [15,12] = 1110
8881      instr [11]    = AC
8882      instr [10]    = 1
8883      instr [9, 5]  = Rn
8884      instr [4, 0]  = Rd.  */
8885
8886   unsigned rm = INSTR (20, 16);
8887   unsigned rn = INSTR (9, 5);
8888   unsigned rd = INSTR (4, 0);
8889   unsigned EUac = (INSTR (23, 23) << 2) | (INSTR (29, 29) << 1) | INSTR (11, 11);
8890   unsigned result;
8891   float val1;
8892   float val2;
8893
8894   NYI_assert (31, 30, 1);
8895   NYI_assert (28, 24, 0x1E);
8896   NYI_assert (21, 21, 1);
8897   NYI_assert (15, 12, 0xE);
8898   NYI_assert (10, 10, 1);
8899
8900   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8901   if (INSTR (22, 22))
8902     {
8903       double val1 = aarch64_get_FP_double (cpu, rn);
8904       double val2 = aarch64_get_FP_double (cpu, rm);
8905
8906       switch (EUac)
8907         {
8908         case 0: /* 000 */
8909           result = val1 == val2;
8910           break;
8911
8912         case 3: /* 011 */
8913           val1 = fabs (val1);
8914           val2 = fabs (val2);
8915           /* Fall through. */
8916         case 2: /* 010 */
8917           result = val1 >= val2;
8918           break;
8919
8920         case 7: /* 111 */
8921           val1 = fabs (val1);
8922           val2 = fabs (val2);
8923           /* Fall through. */
8924         case 6: /* 110 */
8925           result = val1 > val2;
8926           break;
8927
8928         default:
8929           HALT_UNALLOC;
8930         }
8931
8932       aarch64_set_vec_u32 (cpu, rd, 0, result ? -1 : 0);
8933       return;
8934     }
8935
8936   val1 = aarch64_get_FP_float (cpu, rn);
8937   val2 = aarch64_get_FP_float (cpu, rm);
8938
8939   switch (EUac)
8940     {
8941     case 0: /* 000 */
8942       result = val1 == val2;
8943       break;
8944
8945     case 3: /* 011 */
8946       val1 = fabsf (val1);
8947       val2 = fabsf (val2);
8948       /* Fall through. */
8949     case 2: /* 010 */
8950       result = val1 >= val2;
8951       break;
8952
8953     case 7: /* 111 */
8954       val1 = fabsf (val1);
8955       val2 = fabsf (val2);
8956       /* Fall through. */
8957     case 6: /* 110 */
8958       result = val1 > val2;
8959       break;
8960
8961     default:
8962       HALT_UNALLOC;
8963     }
8964
8965   aarch64_set_vec_u32 (cpu, rd, 0, result ? -1 : 0);
8966 }
8967
8968 /* An alias of DUP.  */
8969 static void
8970 do_scalar_MOV (sim_cpu *cpu)
8971 {
8972   /* instr [31,21] = 0101 1110 000
8973      instr [20,16] = imm5
8974      instr [15,10] = 0000 01
8975      instr [9, 5]  = Rn
8976      instr [4, 0]  = Rd.  */
8977
8978   unsigned rn = INSTR (9, 5);
8979   unsigned rd = INSTR (4, 0);
8980   unsigned index;
8981
8982   NYI_assert (31, 21, 0x2F0);
8983   NYI_assert (15, 10, 0x01);
8984
8985   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8986   if (INSTR (16, 16))
8987     {
8988       /* 8-bit.  */
8989       index = INSTR (20, 17);
8990       aarch64_set_vec_u8
8991         (cpu, rd, 0, aarch64_get_vec_u8 (cpu, rn, index));
8992     }
8993   else if (INSTR (17, 17))
8994     {
8995       /* 16-bit.  */
8996       index = INSTR (20, 18);
8997       aarch64_set_vec_u16
8998         (cpu, rd, 0, aarch64_get_vec_u16 (cpu, rn, index));
8999     }
9000   else if (INSTR (18, 18))
9001     {
9002       /* 32-bit.  */
9003       index = INSTR (20, 19);
9004       aarch64_set_vec_u32
9005         (cpu, rd, 0, aarch64_get_vec_u32 (cpu, rn, index));
9006     }
9007   else if (INSTR (19, 19))
9008     {
9009       /* 64-bit.  */
9010       index = INSTR (20, 20);
9011       aarch64_set_vec_u64
9012         (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, index));
9013     }
9014   else
9015     HALT_UNALLOC;
9016 }
9017
9018 static void
9019 do_scalar_NEG (sim_cpu *cpu)
9020 {
9021   /* instr [31,10] = 0111 1110 1110 0000 1011 10
9022      instr [9, 5]  = Rn
9023      instr [4, 0]  = Rd.  */
9024
9025   unsigned rn = INSTR (9, 5);
9026   unsigned rd = INSTR (4, 0);
9027
9028   NYI_assert (31, 10, 0x1FB82E);
9029
9030   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9031   aarch64_set_vec_u64 (cpu, rd, 0, - aarch64_get_vec_u64 (cpu, rn, 0));
9032 }
9033
9034 static void
9035 do_scalar_USHL (sim_cpu *cpu)
9036 {
9037   /* instr [31,21] = 0111 1110 111
9038      instr [20,16] = Rm
9039      instr [15,10] = 0100 01
9040      instr [9, 5]  = Rn
9041      instr [4, 0]  = Rd.  */
9042
9043   unsigned rm = INSTR (20, 16);
9044   unsigned rn = INSTR (9, 5);
9045   unsigned rd = INSTR (4, 0);
9046   signed int shift = aarch64_get_vec_s8 (cpu, rm, 0);
9047
9048   NYI_assert (31, 21, 0x3F7);
9049   NYI_assert (15, 10, 0x11);
9050
9051   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9052   if (shift >= 0)
9053     aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) << shift);
9054   else
9055     aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) >> - shift);
9056 }
9057
9058 static void
9059 do_double_add (sim_cpu *cpu)
9060 {
9061   /* instr [31,21] = 0101 1110 111
9062      instr [20,16] = Fn
9063      instr [15,10] = 1000 01
9064      instr [9,5]   = Fm
9065      instr [4,0]   = Fd.  */
9066   unsigned Fd;
9067   unsigned Fm;
9068   unsigned Fn;
9069   double val1;
9070   double val2;
9071
9072   NYI_assert (31, 21, 0x2F7);
9073   NYI_assert (15, 10, 0x21);
9074
9075   Fd = INSTR (4, 0);
9076   Fm = INSTR (9, 5);
9077   Fn = INSTR (20, 16);
9078
9079   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9080   val1 = aarch64_get_FP_double (cpu, Fm);
9081   val2 = aarch64_get_FP_double (cpu, Fn);
9082
9083   aarch64_set_FP_double (cpu, Fd, val1 + val2);
9084 }
9085
9086 static void
9087 do_scalar_UCVTF (sim_cpu *cpu)
9088 {
9089   /* instr [31,23] = 0111 1110 0
9090      instr [22]    = single(0)/double(1)
9091      instr [21,10] = 10 0001 1101 10
9092      instr [9,5]   = rn
9093      instr [4,0]   = rd.  */
9094
9095   unsigned rn = INSTR (9, 5);
9096   unsigned rd = INSTR (4, 0);
9097
9098   NYI_assert (31, 23, 0x0FC);
9099   NYI_assert (21, 10, 0x876);
9100
9101   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9102   if (INSTR (22, 22))
9103     {
9104       uint64_t val = aarch64_get_vec_u64 (cpu, rn, 0);
9105
9106       aarch64_set_vec_double (cpu, rd, 0, (double) val);
9107     }
9108   else
9109     {
9110       uint32_t val = aarch64_get_vec_u32 (cpu, rn, 0);
9111
9112       aarch64_set_vec_float (cpu, rd, 0, (float) val);
9113     }
9114 }
9115
9116 static void
9117 do_scalar_vec (sim_cpu *cpu)
9118 {
9119   /* instr [30] = 1.  */
9120   /* instr [28,25] = 1111.  */
9121   switch (INSTR (31, 23))
9122     {
9123     case 0xBC:
9124       switch (INSTR (15, 10))
9125         {
9126         case 0x01: do_scalar_MOV (cpu); return;
9127         case 0x39: do_scalar_FCM (cpu); return;
9128         case 0x3B: do_scalar_FCM (cpu); return;
9129         }
9130       break;
9131
9132     case 0xBE: do_scalar_shift (cpu); return;
9133
9134     case 0xFC:
9135       switch (INSTR (15, 10))
9136         {
9137         case 0x36:
9138           switch (INSTR (21, 16))
9139             {
9140             case 0x30: do_scalar_FADDP (cpu); return;
9141             case 0x21: do_scalar_UCVTF (cpu); return;
9142             }
9143           HALT_NYI;
9144         case 0x39: do_scalar_FCM (cpu); return;
9145         case 0x3B: do_scalar_FCM (cpu); return;
9146         }
9147       break;
9148
9149     case 0xFD:
9150       switch (INSTR (15, 10))
9151         {
9152         case 0x0D: do_scalar_CMGT (cpu); return;
9153         case 0x11: do_scalar_USHL (cpu); return;
9154         case 0x2E: do_scalar_NEG (cpu); return;
9155         case 0x35: do_scalar_FABD (cpu); return;
9156         case 0x39: do_scalar_FCM (cpu); return;
9157         case 0x3B: do_scalar_FCM (cpu); return;
9158         default:
9159           HALT_NYI;
9160         }
9161
9162     case 0xFE: do_scalar_USHR (cpu); return;
9163
9164     case 0xBD:
9165       switch (INSTR (15, 10))
9166         {
9167         case 0x21: do_double_add (cpu); return;
9168         case 0x11: do_scalar_SSHL (cpu); return;
9169         default:
9170           HALT_NYI;
9171         }
9172
9173     default:
9174       HALT_NYI;
9175     }
9176 }
9177
9178 static void
9179 dexAdvSIMD1 (sim_cpu *cpu)
9180 {
9181   /* instr [28,25] = 1 111.  */
9182
9183   /* We are currently only interested in the basic
9184      scalar fp routines which all have bit 30 = 0.  */
9185   if (INSTR (30, 30))
9186     do_scalar_vec (cpu);
9187
9188   /* instr[24] is set for FP data processing 3-source and clear for
9189      all other basic scalar fp instruction groups.  */
9190   else if (INSTR (24, 24))
9191     dexSimpleFPDataProc3Source (cpu);
9192
9193   /* instr[21] is clear for floating <-> fixed conversions and set for
9194      all other basic scalar fp instruction groups.  */
9195   else if (!INSTR (21, 21))
9196     dexSimpleFPFixedConvert (cpu);
9197
9198   /* instr[11,10] : 01 ==> cond compare, 10 ==> Data Proc 2 Source
9199      11 ==> cond select,  00 ==> other.  */
9200   else
9201     switch (INSTR (11, 10))
9202       {
9203       case 1: dexSimpleFPCondCompare (cpu); return;
9204       case 2: dexSimpleFPDataProc2Source (cpu); return;
9205       case 3: dexSimpleFPCondSelect (cpu); return;
9206
9207       default:
9208         /* Now an ordered cascade of tests.
9209            FP immediate has instr [12] == 1.
9210            FP compare has   instr [13] == 1.
9211            FP Data Proc 1 Source has instr [14] == 1.
9212            FP floating <--> integer conversions has instr [15] == 0.  */
9213         if (INSTR (12, 12))
9214           dexSimpleFPImmediate (cpu);
9215
9216         else if (INSTR (13, 13))
9217           dexSimpleFPCompare (cpu);
9218
9219         else if (INSTR (14, 14))
9220           dexSimpleFPDataProc1Source (cpu);
9221
9222         else if (!INSTR (15, 15))
9223           dexSimpleFPIntegerConvert (cpu);
9224
9225         else
9226           /* If we get here then instr[15] == 1 which means UNALLOC.  */
9227           HALT_UNALLOC;
9228       }
9229 }
9230
9231 /* PC relative addressing.  */
9232
9233 static void
9234 pcadr (sim_cpu *cpu)
9235 {
9236   /* instr[31] = op : 0 ==> ADR, 1 ==> ADRP
9237      instr[30,29] = immlo
9238      instr[23,5] = immhi.  */
9239   uint64_t address;
9240   unsigned rd = INSTR (4, 0);
9241   uint32_t isPage = INSTR (31, 31);
9242   union { int64_t u64; uint64_t s64; } imm;
9243   uint64_t offset;
9244
9245   imm.s64 = simm64 (aarch64_get_instr (cpu), 23, 5);
9246   offset = imm.u64;
9247   offset = (offset << 2) | INSTR (30, 29);
9248
9249   address = aarch64_get_PC (cpu);
9250
9251   if (isPage)
9252     {
9253       offset <<= 12;
9254       address &= ~0xfff;
9255     }
9256
9257   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9258   aarch64_set_reg_u64 (cpu, rd, NO_SP, address + offset);
9259 }
9260
9261 /* Specific decode and execute for group Data Processing Immediate.  */
9262
9263 static void
9264 dexPCRelAddressing (sim_cpu *cpu)
9265 {
9266   /* assert instr[28,24] = 10000.  */
9267   pcadr (cpu);
9268 }
9269
9270 /* Immediate logical.
9271    The bimm32/64 argument is constructed by replicating a 2, 4, 8,
9272    16, 32 or 64 bit sequence pulled out at decode and possibly
   inverting it.
9274
9275    N.B. the output register (dest) can normally be Xn or SP
9276    the exception occurs for flag setting instructions which may
9277    only use Xn for the output (dest).  The input register can
9278    never be SP.  */
9279
9280 /* 32 bit and immediate.  */
9281 static void
9282 and32 (sim_cpu *cpu, uint32_t bimm)
9283 {
9284   unsigned rn = INSTR (9, 5);
9285   unsigned rd = INSTR (4, 0);
9286
9287   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9288   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9289                        aarch64_get_reg_u32 (cpu, rn, NO_SP) & bimm);
9290 }
9291
9292 /* 64 bit and immediate.  */
9293 static void
9294 and64 (sim_cpu *cpu, uint64_t bimm)
9295 {
9296   unsigned rn = INSTR (9, 5);
9297   unsigned rd = INSTR (4, 0);
9298
9299   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9300   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9301                        aarch64_get_reg_u64 (cpu, rn, NO_SP) & bimm);
9302 }
9303
9304 /* 32 bit and immediate set flags.  */
9305 static void
9306 ands32 (sim_cpu *cpu, uint32_t bimm)
9307 {
9308   unsigned rn = INSTR (9, 5);
9309   unsigned rd = INSTR (4, 0);
9310
9311   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9312   uint32_t value2 = bimm;
9313
9314   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9315   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9316   set_flags_for_binop32 (cpu, value1 & value2);
9317 }
9318
9319 /* 64 bit and immediate set flags.  */
9320 static void
9321 ands64 (sim_cpu *cpu, uint64_t bimm)
9322 {
9323   unsigned rn = INSTR (9, 5);
9324   unsigned rd = INSTR (4, 0);
9325
9326   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9327   uint64_t value2 = bimm;
9328
9329   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9330   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9331   set_flags_for_binop64 (cpu, value1 & value2);
9332 }
9333
9334 /* 32 bit exclusive or immediate.  */
9335 static void
9336 eor32 (sim_cpu *cpu, uint32_t bimm)
9337 {
9338   unsigned rn = INSTR (9, 5);
9339   unsigned rd = INSTR (4, 0);
9340
9341   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9342   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9343                        aarch64_get_reg_u32 (cpu, rn, NO_SP) ^ bimm);
9344 }
9345
9346 /* 64 bit exclusive or immediate.  */
9347 static void
9348 eor64 (sim_cpu *cpu, uint64_t bimm)
9349 {
9350   unsigned rn = INSTR (9, 5);
9351   unsigned rd = INSTR (4, 0);
9352
9353   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9354   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9355                        aarch64_get_reg_u64 (cpu, rn, NO_SP) ^ bimm);
9356 }
9357
9358 /* 32 bit or immediate.  */
9359 static void
9360 orr32 (sim_cpu *cpu, uint32_t bimm)
9361 {
9362   unsigned rn = INSTR (9, 5);
9363   unsigned rd = INSTR (4, 0);
9364
9365   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9366   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9367                        aarch64_get_reg_u32 (cpu, rn, NO_SP) | bimm);
9368 }
9369
9370 /* 64 bit or immediate.  */
9371 static void
9372 orr64 (sim_cpu *cpu, uint64_t bimm)
9373 {
9374   unsigned rn = INSTR (9, 5);
9375   unsigned rd = INSTR (4, 0);
9376
9377   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9378   aarch64_set_reg_u64 (cpu, rd, SP_OK,
9379                        aarch64_get_reg_u64 (cpu, rn, NO_SP) | bimm);
9380 }
9381
9382 /* Logical shifted register.
9383    These allow an optional LSL, ASR, LSR or ROR to the second source
9384    register with a count up to the register bit count.
9385    N.B register args may not be SP.  */
9386
9387 /* 32 bit AND shifted register.  */
9388 static void
9389 and32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9390 {
9391   unsigned rm = INSTR (20, 16);
9392   unsigned rn = INSTR (9, 5);
9393   unsigned rd = INSTR (4, 0);
9394
9395   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9396   aarch64_set_reg_u64
9397     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9398      & shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9399 }
9400
9401 /* 64 bit AND shifted register.  */
9402 static void
9403 and64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9404 {
9405   unsigned rm = INSTR (20, 16);
9406   unsigned rn = INSTR (9, 5);
9407   unsigned rd = INSTR (4, 0);
9408
9409   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9410   aarch64_set_reg_u64
9411     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9412      & shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9413 }
9414
9415 /* 32 bit AND shifted register setting flags.  */
9416 static void
9417 ands32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9418 {
9419   unsigned rm = INSTR (20, 16);
9420   unsigned rn = INSTR (9, 5);
9421   unsigned rd = INSTR (4, 0);
9422
9423   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9424   uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
9425                                shift, count);
9426
9427   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9428   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9429   set_flags_for_binop32 (cpu, value1 & value2);
9430 }
9431
9432 /* 64 bit AND shifted register setting flags.  */
9433 static void
9434 ands64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9435 {
9436   unsigned rm = INSTR (20, 16);
9437   unsigned rn = INSTR (9, 5);
9438   unsigned rd = INSTR (4, 0);
9439
9440   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9441   uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
9442                                shift, count);
9443
9444   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9445   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9446   set_flags_for_binop64 (cpu, value1 & value2);
9447 }
9448
9449 /* 32 bit BIC shifted register.  */
9450 static void
9451 bic32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9452 {
9453   unsigned rm = INSTR (20, 16);
9454   unsigned rn = INSTR (9, 5);
9455   unsigned rd = INSTR (4, 0);
9456
9457   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9458   aarch64_set_reg_u64
9459     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9460      & ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9461 }
9462
9463 /* 64 bit BIC shifted register.  */
9464 static void
9465 bic64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9466 {
9467   unsigned rm = INSTR (20, 16);
9468   unsigned rn = INSTR (9, 5);
9469   unsigned rd = INSTR (4, 0);
9470
9471   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9472   aarch64_set_reg_u64
9473     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9474      & ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9475 }
9476
9477 /* 32 bit BIC shifted register setting flags.  */
9478 static void
9479 bics32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9480 {
9481   unsigned rm = INSTR (20, 16);
9482   unsigned rn = INSTR (9, 5);
9483   unsigned rd = INSTR (4, 0);
9484
9485   uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9486   uint32_t value2 = ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
9487                                  shift, count);
9488
9489   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9490   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9491   set_flags_for_binop32 (cpu, value1 & value2);
9492 }
9493
9494 /* 64 bit BIC shifted register setting flags.  */
9495 static void
9496 bics64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9497 {
9498   unsigned rm = INSTR (20, 16);
9499   unsigned rn = INSTR (9, 5);
9500   unsigned rd = INSTR (4, 0);
9501
9502   uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9503   uint64_t value2 = ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
9504                                  shift, count);
9505
9506   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9507   aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9508   set_flags_for_binop64 (cpu, value1 & value2);
9509 }
9510
9511 /* 32 bit EON shifted register.  */
9512 static void
9513 eon32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9514 {
9515   unsigned rm = INSTR (20, 16);
9516   unsigned rn = INSTR (9, 5);
9517   unsigned rd = INSTR (4, 0);
9518
9519   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9520   aarch64_set_reg_u64
9521     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9522      ^ ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9523 }
9524
9525 /* 64 bit EON shifted register.  */
9526 static void
9527 eon64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9528 {
9529   unsigned rm = INSTR (20, 16);
9530   unsigned rn = INSTR (9, 5);
9531   unsigned rd = INSTR (4, 0);
9532
9533   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9534   aarch64_set_reg_u64
9535     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9536      ^ ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9537 }
9538
9539 /* 32 bit EOR shifted register.  */
9540 static void
9541 eor32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9542 {
9543   unsigned rm = INSTR (20, 16);
9544   unsigned rn = INSTR (9, 5);
9545   unsigned rd = INSTR (4, 0);
9546
9547   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9548   aarch64_set_reg_u64
9549     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9550      ^ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9551 }
9552
9553 /* 64 bit EOR shifted register.  */
9554 static void
9555 eor64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9556 {
9557   unsigned rm = INSTR (20, 16);
9558   unsigned rn = INSTR (9, 5);
9559   unsigned rd = INSTR (4, 0);
9560
9561   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9562   aarch64_set_reg_u64
9563     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9564      ^ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9565 }
9566
9567 /* 32 bit ORR shifted register.  */
9568 static void
9569 orr32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9570 {
9571   unsigned rm = INSTR (20, 16);
9572   unsigned rn = INSTR (9, 5);
9573   unsigned rd = INSTR (4, 0);
9574
9575   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9576   aarch64_set_reg_u64
9577     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9578      | shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9579 }
9580
9581 /* 64 bit ORR shifted register.  */
9582 static void
9583 orr64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9584 {
9585   unsigned rm = INSTR (20, 16);
9586   unsigned rn = INSTR (9, 5);
9587   unsigned rd = INSTR (4, 0);
9588
9589   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9590   aarch64_set_reg_u64
9591     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9592      | shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9593 }
9594
9595 /* 32 bit ORN shifted register.  */
9596 static void
9597 orn32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9598 {
9599   unsigned rm = INSTR (20, 16);
9600   unsigned rn = INSTR (9, 5);
9601   unsigned rd = INSTR (4, 0);
9602
9603   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9604   aarch64_set_reg_u64
9605     (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9606      | ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9607 }
9608
9609 /* 64 bit ORN shifted register.  */
9610 static void
9611 orn64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9612 {
9613   unsigned rm = INSTR (20, 16);
9614   unsigned rn = INSTR (9, 5);
9615   unsigned rd = INSTR (4, 0);
9616
9617   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9618   aarch64_set_reg_u64
9619     (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9620      | ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9621 }
9622
9623 static void
9624 dexLogicalImmediate (sim_cpu *cpu)
9625 {
9626   /* assert instr[28,23] = 1001000
9627      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
9628      instr[30,29] = op : 0 ==> AND, 1 ==> ORR, 2 ==> EOR, 3 ==> ANDS
9629      instr[22] = N : used to construct immediate mask
9630      instr[21,16] = immr
9631      instr[15,10] = imms
9632      instr[9,5] = Rn
9633      instr[4,0] = Rd  */
9634
9635   /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
9636   uint32_t size = INSTR (31, 31);
9637   uint32_t N = INSTR (22, 22);
9638   /* uint32_t immr = INSTR (21, 16);.  */
9639   /* uint32_t imms = INSTR (15, 10);.  */
9640   uint32_t index = INSTR (22, 10);
9641   uint64_t bimm64 = LITable [index];
9642   uint32_t dispatch = INSTR (30, 29);
9643
9644   if (~size & N)
9645     HALT_UNALLOC;
9646
9647   if (!bimm64)
9648     HALT_UNALLOC;
9649
9650   if (size == 0)
9651     {
9652       uint32_t bimm = (uint32_t) bimm64;
9653
9654       switch (dispatch)
9655         {
9656         case 0: and32 (cpu, bimm); return;
9657         case 1: orr32 (cpu, bimm); return;
9658         case 2: eor32 (cpu, bimm); return;
9659         case 3: ands32 (cpu, bimm); return;
9660         }
9661     }
9662   else
9663     {
9664       switch (dispatch)
9665         {
9666         case 0: and64 (cpu, bimm64); return;
9667         case 1: orr64 (cpu, bimm64); return;
9668         case 2: eor64 (cpu, bimm64); return;
9669         case 3: ands64 (cpu, bimm64); return;
9670         }
9671     }
9672   HALT_UNALLOC;
9673 }
9674
9675 /* Immediate move.
9676    The uimm argument is a 16 bit value to be inserted into the
9677    target register the pos argument locates the 16 bit word in the
9678    dest register i.e. it is in {0, 1} for 32 bit and {0, 1, 2,
9679    3} for 64 bit.
   N.B. register arg may not be SP so it should be
   accessed using the setGZRegisterXXX accessors.  */
9682
9683 /* 32 bit move 16 bit immediate zero remaining shorts.  */
9684 static void
9685 movz32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9686 {
9687   unsigned rd = INSTR (4, 0);
9688
9689   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9690   aarch64_set_reg_u64 (cpu, rd, NO_SP, val << (pos * 16));
9691 }
9692
9693 /* 64 bit move 16 bit immediate zero remaining shorts.  */
9694 static void
9695 movz64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9696 {
9697   unsigned rd = INSTR (4, 0);
9698
9699   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9700   aarch64_set_reg_u64 (cpu, rd, NO_SP, ((uint64_t) val) << (pos * 16));
9701 }
9702
9703 /* 32 bit move 16 bit immediate negated.  */
9704 static void
9705 movn32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9706 {
9707   unsigned rd = INSTR (4, 0);
9708
9709   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9710   aarch64_set_reg_u64 (cpu, rd, NO_SP, ((val << (pos * 16)) ^ 0xffffffffU));
9711 }
9712
9713 /* 64 bit move 16 bit immediate negated.  */
9714 static void
9715 movn64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9716 {
9717   unsigned rd = INSTR (4, 0);
9718
9719   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9720   aarch64_set_reg_u64
9721     (cpu, rd, NO_SP, ((((uint64_t) val) << (pos * 16))
9722                       ^ 0xffffffffffffffffULL));
9723 }
9724
9725 /* 32 bit move 16 bit immediate keep remaining shorts.  */
9726 static void
9727 movk32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9728 {
9729   unsigned rd = INSTR (4, 0);
9730   uint32_t current = aarch64_get_reg_u32 (cpu, rd, NO_SP);
9731   uint32_t value = val << (pos * 16);
9732   uint32_t mask = ~(0xffffU << (pos * 16));
9733
9734   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9735   aarch64_set_reg_u64 (cpu, rd, NO_SP, (value | (current & mask)));
9736 }
9737
9738 /* 64 bit move 16 it immediate keep remaining shorts.  */
9739 static void
9740 movk64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9741 {
9742   unsigned rd = INSTR (4, 0);
9743   uint64_t current = aarch64_get_reg_u64 (cpu, rd, NO_SP);
9744   uint64_t value = (uint64_t) val << (pos * 16);
9745   uint64_t mask = ~(0xffffULL << (pos * 16));
9746
9747   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9748   aarch64_set_reg_u64 (cpu, rd, NO_SP, (value | (current & mask)));
9749 }
9750
9751 static void
9752 dexMoveWideImmediate (sim_cpu *cpu)
9753 {
9754   /* assert instr[28:23] = 100101
9755      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
9756      instr[30,29] = op : 0 ==> MOVN, 1 ==> UNALLOC, 2 ==> MOVZ, 3 ==> MOVK
9757      instr[22,21] = shift : 00 == LSL#0, 01 = LSL#16, 10 = LSL#32, 11 = LSL#48
9758      instr[20,5] = uimm16
9759      instr[4,0] = Rd  */
9760
9761   /* N.B. the (multiple of 16) shift is applied by the called routine,
9762      we just pass the multiplier.  */
9763
9764   uint32_t imm;
9765   uint32_t size = INSTR (31, 31);
9766   uint32_t op = INSTR (30, 29);
9767   uint32_t shift = INSTR (22, 21);
9768
9769   /* 32 bit can only shift 0 or 1 lot of 16.
9770      anything else is an unallocated instruction.  */
9771   if (size == 0 && (shift > 1))
9772     HALT_UNALLOC;
9773
9774   if (op == 1)
9775     HALT_UNALLOC;
9776
9777   imm = INSTR (20, 5);
9778
9779   if (size == 0)
9780     {
9781       if (op == 0)
9782         movn32 (cpu, imm, shift);
9783       else if (op == 2)
9784         movz32 (cpu, imm, shift);
9785       else
9786         movk32 (cpu, imm, shift);
9787     }
9788   else
9789     {
9790       if (op == 0)
9791         movn64 (cpu, imm, shift);
9792       else if (op == 2)
9793         movz64 (cpu, imm, shift);
9794       else
9795         movk64 (cpu, imm, shift);
9796     }
9797 }
9798
9799 /* Bitfield operations.
9800    These take a pair of bit positions r and s which are in {0..31}
9801    or {0..63} depending on the instruction word size.
9802    N.B register args may not be SP.  */
9803
9804 /* OK, we start with ubfm which just needs to pick
9805    some bits out of source zero the rest and write
9806    the result to dest.  Just need two logical shifts.  */
9807
9808 /* 32 bit bitfield move, left and right of affected zeroed
9809    if r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.  */
9810 static void
9811 ubfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
9812 {
9813   unsigned rd;
9814   unsigned rn = INSTR (9, 5);
9815   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9816
9817   /* Pick either s+1-r or s+1 consecutive bits out of the original word.  */
9818   if (r <= s)
9819     {
9820       /* 31:...:s:xxx:r:...:0 ==> 31:...:s-r:xxx:0.
9821          We want only bits s:xxx:r at the bottom of the word
9822          so we LSL bit s up to bit 31 i.e. by 31 - s
9823          and then we LSR to bring bit 31 down to bit s - r
9824          i.e. by 31 + r - s.  */
9825       value <<= 31 - s;
9826       value >>= 31 + r - s;
9827     }
9828   else
9829     {
9830       /* 31:...:s:xxx:0 ==> 31:...:31-(r-1)+s:xxx:31-(r-1):...:0
9831          We want only bits s:xxx:0 starting at it 31-(r-1)
9832          so we LSL bit s up to bit 31 i.e. by 31 - s
9833          and then we LSL to bring bit 31 down to 31-(r-1)+s
9834          i.e. by r - (s + 1).  */
9835       value <<= 31 - s;
9836       value >>= r - (s + 1);
9837     }
9838
9839   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9840   rd = INSTR (4, 0);
9841   aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
9842 }
9843
9844 /* 64 bit bitfield move, left and right of affected zeroed
9845    if r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>.  */
9846 static void
9847 ubfm (sim_cpu *cpu, uint32_t r, uint32_t s)
9848 {
9849   unsigned rd;
9850   unsigned rn = INSTR (9, 5);
9851   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9852
9853   if (r <= s)
9854     {
9855       /* 63:...:s:xxx:r:...:0 ==> 63:...:s-r:xxx:0.
9856          We want only bits s:xxx:r at the bottom of the word.
9857          So we LSL bit s up to bit 63 i.e. by 63 - s
9858          and then we LSR to bring bit 63 down to bit s - r
9859          i.e. by 63 + r - s.  */
9860       value <<= 63 - s;
9861       value >>= 63 + r - s;
9862     }
9863   else
9864     {
9865       /* 63:...:s:xxx:0 ==> 63:...:63-(r-1)+s:xxx:63-(r-1):...:0.
9866          We want only bits s:xxx:0 starting at it 63-(r-1).
9867          So we LSL bit s up to bit 63 i.e. by 63 - s
9868          and then we LSL to bring bit 63 down to 63-(r-1)+s
9869          i.e. by r - (s + 1).  */
9870       value <<= 63 - s;
9871       value >>= r - (s + 1);
9872     }
9873
9874   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9875   rd = INSTR (4, 0);
9876   aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
9877 }
9878
9879 /* The signed versions need to insert sign bits
9880    on the left of the inserted bit field. so we do
9881    much the same as the unsigned version except we
9882    use an arithmetic shift right -- this just means
9883    we need to operate on signed values.  */
9884
9885 /* 32 bit bitfield move, left of affected sign-extended, right zeroed.  */
9886 /* If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.  */
9887 static void
9888 sbfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
9889 {
9890   unsigned rd;
9891   unsigned rn = INSTR (9, 5);
9892   /* as per ubfm32 but use an ASR instead of an LSR.  */
9893   int32_t value = aarch64_get_reg_s32 (cpu, rn, NO_SP);
9894
9895   if (r <= s)
9896     {
9897       value <<= 31 - s;
9898       value >>= 31 + r - s;
9899     }
9900   else
9901     {
9902       value <<= 31 - s;
9903       value >>= r - (s + 1);
9904     }
9905
9906   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9907   rd = INSTR (4, 0);
9908   aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
9909 }
9910
9911 /* 64 bit bitfield move, left of affected sign-extended, right zeroed.  */
9912 /* If r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>.  */
9913 static void
9914 sbfm (sim_cpu *cpu, uint32_t r, uint32_t s)
9915 {
9916   unsigned rd;
9917   unsigned rn = INSTR (9, 5);
9918   /* acpu per ubfm but use an ASR instead of an LSR.  */
9919   int64_t value = aarch64_get_reg_s64 (cpu, rn, NO_SP);
9920
9921   if (r <= s)
9922     {
9923       value <<= 63 - s;
9924       value >>= 63 + r - s;
9925     }
9926   else
9927     {
9928       value <<= 63 - s;
9929       value >>= r - (s + 1);
9930     }
9931
9932   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9933   rd = INSTR (4, 0);
9934   aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
9935 }
9936
9937 /* Finally, these versions leave non-affected bits
9938    as is. so we need to generate the bits as per
9939    ubfm and also generate a mask to pick the
9940    bits from the original and computed values.  */
9941
9942 /* 32 bit bitfield move, non-affected bits left as is.
9943    If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>.  */
9944 static void
9945 bfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
9946 {
9947   unsigned rn = INSTR (9, 5);
9948   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9949   uint32_t mask = -1;
9950   unsigned rd;
9951   uint32_t value2;
9952
9953   /* Pick either s+1-r or s+1 consecutive bits out of the original word.  */
9954   if (r <= s)
9955     {
9956       /* 31:...:s:xxx:r:...:0 ==> 31:...:s-r:xxx:0.
9957          We want only bits s:xxx:r at the bottom of the word
9958          so we LSL bit s up to bit 31 i.e. by 31 - s
9959          and then we LSR to bring bit 31 down to bit s - r
9960          i.e. by 31 + r - s.  */
9961       value <<= 31 - s;
9962       value >>= 31 + r - s;
9963       /* the mask must include the same bits.  */
9964       mask <<= 31 - s;
9965       mask >>= 31 + r - s;
9966     }
9967   else
9968     {
9969       /* 31:...:s:xxx:0 ==> 31:...:31-(r-1)+s:xxx:31-(r-1):...:0.
9970          We want only bits s:xxx:0 starting at it 31-(r-1)
9971          so we LSL bit s up to bit 31 i.e. by 31 - s
9972          and then we LSL to bring bit 31 down to 31-(r-1)+s
9973          i.e. by r - (s + 1).  */
9974       value <<= 31 - s;
9975       value >>= r - (s + 1);
9976       /* The mask must include the same bits.  */
9977       mask <<= 31 - s;
9978       mask >>= r - (s + 1);
9979     }
9980
9981   rd = INSTR (4, 0);
9982   value2 = aarch64_get_reg_u32 (cpu, rd, NO_SP);
9983
9984   value2 &= ~mask;
9985   value2 |= value;
9986
9987   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9988   aarch64_set_reg_u64
9989     (cpu, rd, NO_SP, (aarch64_get_reg_u32 (cpu, rd, NO_SP) & ~mask) | value);
9990 }
9991
9992 /* 64 bit bitfield move, non-affected bits left as is.
9993    If r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>.  */
9994 static void
9995 bfm (sim_cpu *cpu, uint32_t r, uint32_t s)
9996 {
9997   unsigned rd;
9998   unsigned rn = INSTR (9, 5);
9999   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
10000   uint64_t mask = 0xffffffffffffffffULL;
10001
10002   if (r <= s)
10003     {
10004       /* 63:...:s:xxx:r:...:0 ==> 63:...:s-r:xxx:0.
10005          We want only bits s:xxx:r at the bottom of the word
10006          so we LSL bit s up to bit 63 i.e. by 63 - s
10007          and then we LSR to bring bit 63 down to bit s - r
10008          i.e. by 63 + r - s.  */
10009       value <<= 63 - s;
10010       value >>= 63 + r - s;
10011       /* The mask must include the same bits.  */
10012       mask <<= 63 - s;
10013       mask >>= 63 + r - s;
10014     }
10015   else
10016     {
10017       /* 63:...:s:xxx:0 ==> 63:...:63-(r-1)+s:xxx:63-(r-1):...:0
10018          We want only bits s:xxx:0 starting at it 63-(r-1)
10019          so we LSL bit s up to bit 63 i.e. by 63 - s
10020          and then we LSL to bring bit 63 down to 63-(r-1)+s
10021          i.e. by r - (s + 1).  */
10022       value <<= 63 - s;
10023       value >>= r - (s + 1);
10024       /* The mask must include the same bits.  */
10025       mask <<= 63 - s;
10026       mask >>= r - (s + 1);
10027     }
10028
10029   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10030   rd = INSTR (4, 0);
10031   aarch64_set_reg_u64
10032     (cpu, rd, NO_SP, (aarch64_get_reg_u64 (cpu, rd, NO_SP) & ~mask) | value);
10033 }
10034
10035 static void
10036 dexBitfieldImmediate (sim_cpu *cpu)
10037 {
10038   /* assert instr[28:23] = 100110
10039      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
10040      instr[30,29] = op : 0 ==> SBFM, 1 ==> BFM, 2 ==> UBFM, 3 ==> UNALLOC
10041      instr[22] = N : must be 0 for 32 bit, 1 for 64 bit ow UNALLOC
10042      instr[21,16] = immr : 0xxxxx for 32 bit, xxxxxx for 64 bit
10043      instr[15,10] = imms :  0xxxxx for 32 bit, xxxxxx for 64 bit
10044      instr[9,5] = Rn
10045      instr[4,0] = Rd  */
10046
10047   /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
10048   uint32_t dispatch;
10049   uint32_t imms;
10050   uint32_t size = INSTR (31, 31);
10051   uint32_t N = INSTR (22, 22);
10052   /* 32 bit operations must have immr[5] = 0 and imms[5] = 0.  */
10053   /* or else we have an UNALLOC.  */
10054   uint32_t immr = INSTR (21, 16);
10055
10056   if (~size & N)
10057     HALT_UNALLOC;
10058
10059   if (!size && uimm (immr, 5, 5))
10060     HALT_UNALLOC;
10061
10062   imms = INSTR (15, 10);
10063   if (!size && uimm (imms, 5, 5))
10064     HALT_UNALLOC;
10065
10066   /* Switch on combined size and op.  */
10067   dispatch = INSTR (31, 29);
10068   switch (dispatch)
10069     {
10070     case 0: sbfm32 (cpu, immr, imms); return;
10071     case 1: bfm32 (cpu, immr, imms); return;
10072     case 2: ubfm32 (cpu, immr, imms); return;
10073     case 4: sbfm (cpu, immr, imms); return;
10074     case 5: bfm (cpu, immr, imms); return;
10075     case 6: ubfm (cpu, immr, imms); return;
10076     default: HALT_UNALLOC;
10077     }
10078 }
10079
10080 static void
10081 do_EXTR_32 (sim_cpu *cpu)
10082 {
10083   /* instr[31:21] = 00010011100
10084      instr[20,16] = Rm
10085      instr[15,10] = imms :  0xxxxx for 32 bit
10086      instr[9,5]   = Rn
10087      instr[4,0]   = Rd  */
10088   unsigned rm   = INSTR (20, 16);
10089   unsigned imms = INSTR (15, 10) & 31;
10090   unsigned rn   = INSTR ( 9,  5);
10091   unsigned rd   = INSTR ( 4,  0);
10092   uint64_t val1;
10093   uint64_t val2;
10094
10095   val1 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
10096   val1 >>= imms;
10097   val2 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
10098   val2 <<= (32 - imms);
10099
10100   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10101   aarch64_set_reg_u64 (cpu, rd, NO_SP, val1 | val2);
10102 }
10103
10104 static void
10105 do_EXTR_64 (sim_cpu *cpu)
10106 {
10107   /* instr[31:21] = 10010011100
10108      instr[20,16] = Rm
10109      instr[15,10] = imms
10110      instr[9,5]   = Rn
10111      instr[4,0]   = Rd  */
10112   unsigned rm   = INSTR (20, 16);
10113   unsigned imms = INSTR (15, 10) & 63;
10114   unsigned rn   = INSTR ( 9,  5);
10115   unsigned rd   = INSTR ( 4,  0);
10116   uint64_t val;
10117
10118   val = aarch64_get_reg_u64 (cpu, rm, NO_SP);
10119   val >>= imms;
10120   val |= (aarch64_get_reg_u64 (cpu, rn, NO_SP) << (64 - imms));
10121
10122   aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
10123 }
10124
10125 static void
10126 dexExtractImmediate (sim_cpu *cpu)
10127 {
10128   /* assert instr[28:23] = 100111
10129      instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
10130      instr[30,29] = op21 : 0 ==> EXTR, 1,2,3 ==> UNALLOC
10131      instr[22]    = N : must be 0 for 32 bit, 1 for 64 bit or UNALLOC
10132      instr[21]    = op0 : must be 0 or UNALLOC
10133      instr[20,16] = Rm
10134      instr[15,10] = imms :  0xxxxx for 32 bit, xxxxxx for 64 bit
10135      instr[9,5]   = Rn
10136      instr[4,0]   = Rd  */
10137
10138   /* 32 bit operations must have N = 0 or else we have an UNALLOC.  */
10139   /* 64 bit operations must have N = 1 or else we have an UNALLOC.  */
10140   uint32_t dispatch;
10141   uint32_t size = INSTR (31, 31);
10142   uint32_t N = INSTR (22, 22);
10143   /* 32 bit operations must have imms[5] = 0
10144      or else we have an UNALLOC.  */
10145   uint32_t imms = INSTR (15, 10);
10146
10147   if (size ^ N)
10148     HALT_UNALLOC;
10149
10150   if (!size && uimm (imms, 5, 5))
10151     HALT_UNALLOC;
10152
10153   /* Switch on combined size and op.  */
10154   dispatch = INSTR (31, 29);
10155
10156   if (dispatch == 0)
10157     do_EXTR_32 (cpu);
10158
10159   else if (dispatch == 4)
10160     do_EXTR_64 (cpu);
10161
10162   else if (dispatch == 1)
10163     HALT_NYI;
10164   else
10165     HALT_UNALLOC;
10166 }
10167
10168 static void
10169 dexDPImm (sim_cpu *cpu)
10170 {
10171   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
10172      assert  group == GROUP_DPIMM_1000 || grpoup == GROUP_DPIMM_1001
10173      bits [25,23] of a DPImm are the secondary dispatch vector.  */
10174   uint32_t group2 = dispatchDPImm (aarch64_get_instr (cpu));
10175
10176   switch (group2)
10177     {
10178     case DPIMM_PCADR_000:
10179     case DPIMM_PCADR_001:
10180       dexPCRelAddressing (cpu);
10181       return;
10182
10183     case DPIMM_ADDSUB_010:
10184     case DPIMM_ADDSUB_011:
10185       dexAddSubtractImmediate (cpu);
10186       return;
10187
10188     case DPIMM_LOG_100:
10189       dexLogicalImmediate (cpu);
10190       return;
10191
10192     case DPIMM_MOV_101:
10193       dexMoveWideImmediate (cpu);
10194       return;
10195
10196     case DPIMM_BITF_110:
10197       dexBitfieldImmediate (cpu);
10198       return;
10199
10200     case DPIMM_EXTR_111:
10201       dexExtractImmediate (cpu);
10202       return;
10203
10204     default:
10205       /* Should never reach here.  */
10206       HALT_NYI;
10207     }
10208 }
10209
10210 static void
10211 dexLoadUnscaledImmediate (sim_cpu *cpu)
10212 {
10213   /* instr[29,24] == 111_00
10214      instr[21] == 0
10215      instr[11,10] == 00
10216      instr[31,30] = size
10217      instr[26] = V
10218      instr[23,22] = opc
10219      instr[20,12] = simm9
10220      instr[9,5] = rn may be SP.  */
10221   /* unsigned rt = INSTR (4, 0);  */
10222   uint32_t V = INSTR (26, 26);
10223   uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10224   int32_t imm = simm32 (aarch64_get_instr (cpu), 20, 12);
10225
10226   if (!V)
10227     {
10228       /* GReg operations.  */
10229       switch (dispatch)
10230         {
10231         case 0:  sturb (cpu, imm); return;
10232         case 1:  ldurb32 (cpu, imm); return;
10233         case 2:  ldursb64 (cpu, imm); return;
10234         case 3:  ldursb32 (cpu, imm); return;
10235         case 4:  sturh (cpu, imm); return;
10236         case 5:  ldurh32 (cpu, imm); return;
10237         case 6:  ldursh64 (cpu, imm); return;
10238         case 7:  ldursh32 (cpu, imm); return;
10239         case 8:  stur32 (cpu, imm); return;
10240         case 9:  ldur32 (cpu, imm); return;
10241         case 10: ldursw (cpu, imm); return;
10242         case 12: stur64 (cpu, imm); return;
10243         case 13: ldur64 (cpu, imm); return;
10244
10245         case 14:
10246           /* PRFUM NYI.  */
10247           HALT_NYI;
10248
10249         default:
10250         case 11:
10251         case 15:
10252           HALT_UNALLOC;
10253         }
10254     }
10255
10256   /* FReg operations.  */
10257   switch (dispatch)
10258     {
10259     case 2:  fsturq (cpu, imm); return;
10260     case 3:  fldurq (cpu, imm); return;
10261     case 8:  fsturs (cpu, imm); return;
10262     case 9:  fldurs (cpu, imm); return;
10263     case 12: fsturd (cpu, imm); return;
10264     case 13: fldurd (cpu, imm); return;
10265
10266     case 0: /* STUR 8 bit FP.  */
10267     case 1: /* LDUR 8 bit FP.  */
10268     case 4: /* STUR 16 bit FP.  */
10269     case 5: /* LDUR 8 bit FP.  */
10270       HALT_NYI;
10271
10272     default:
10273     case 6:
10274     case 7:
10275     case 10:
10276     case 11:
10277     case 14:
10278     case 15:
10279       HALT_UNALLOC;
10280     }
10281 }
10282
10283 /*  N.B. A preliminary note regarding all the ldrs<x>32
10284     instructions
10285
10286    The signed value loaded by these instructions is cast to unsigned
10287    before being assigned to aarch64_get_reg_u64 (cpu, N) i.e. to the
10288    64 bit element of the GReg union. this performs a 32 bit sign extension
10289    (as required) but avoids 64 bit sign extension, thus ensuring that the
10290    top half of the register word is zero. this is what the spec demands
10291    when a 32 bit load occurs.  */
10292
10293 /* 32 bit load sign-extended byte scaled unsigned 12 bit.  */
10294 static void
10295 ldrsb32_abs (sim_cpu *cpu, uint32_t offset)
10296 {
10297   unsigned int rn = INSTR (9, 5);
10298   unsigned int rt = INSTR (4, 0);
10299
10300   /* The target register may not be SP but the source may be
10301      there is no scaling required for a byte load.  */
10302   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
10303   aarch64_set_reg_u64 (cpu, rt, NO_SP,
10304                        (int64_t) aarch64_get_mem_s8 (cpu, address));
10305 }
10306
10307 /* 32 bit load sign-extended byte scaled or unscaled zero-
10308    or sign-extended 32-bit register offset.  */
10309 static void
10310 ldrsb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10311 {
10312   unsigned int rm = INSTR (20, 16);
10313   unsigned int rn = INSTR (9, 5);
10314   unsigned int rt = INSTR (4, 0);
10315
10316   /* rn may reference SP, rm and rt must reference ZR.  */
10317
10318   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10319   int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10320                                  extension);
10321
10322   /* There is no scaling required for a byte load.  */
10323   aarch64_set_reg_u64
10324     (cpu, rt, NO_SP, (int64_t) aarch64_get_mem_s8 (cpu, address
10325                                                    + displacement));
10326 }
10327
10328 /* 32 bit load sign-extended byte unscaled signed 9 bit with
10329    pre- or post-writeback.  */
10330 static void
10331 ldrsb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10332 {
10333   uint64_t address;
10334   unsigned int rn = INSTR (9, 5);
10335   unsigned int rt = INSTR (4, 0);
10336
10337   if (rn == rt && wb != NoWriteBack)
10338     HALT_UNALLOC;
10339
10340   address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10341
10342   if (wb == Pre)
10343       address += offset;
10344
10345   aarch64_set_reg_u64 (cpu, rt, NO_SP,
10346                        (int64_t) aarch64_get_mem_s8 (cpu, address));
10347
10348   if (wb == Post)
10349     address += offset;
10350
10351   if (wb != NoWriteBack)
10352     aarch64_set_reg_u64 (cpu, rn, NO_SP, address);
10353 }
10354
10355 /* 8 bit store scaled.  */
10356 static void
10357 fstrb_abs (sim_cpu *cpu, uint32_t offset)
10358 {
10359   unsigned st = INSTR (4, 0);
10360   unsigned rn = INSTR (9, 5);
10361
10362   aarch64_set_mem_u8 (cpu,
10363                       aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
10364                       aarch64_get_vec_u8 (cpu, st, 0));
10365 }
10366
10367 /* 8 bit store scaled or unscaled zero- or
10368    sign-extended 8-bit register offset.  */
10369 static void
10370 fstrb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10371 {
10372   unsigned rm = INSTR (20, 16);
10373   unsigned rn = INSTR (9, 5);
10374   unsigned st = INSTR (4, 0);
10375
10376   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10377   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10378                                extension);
10379   uint64_t  displacement = scaling == Scaled ? extended : 0;
10380
10381   aarch64_set_mem_u8
10382     (cpu, address + displacement, aarch64_get_vec_u8 (cpu, st, 0));
10383 }
10384
10385 /* 16 bit store scaled.  */
10386 static void
10387 fstrh_abs (sim_cpu *cpu, uint32_t offset)
10388 {
10389   unsigned st = INSTR (4, 0);
10390   unsigned rn = INSTR (9, 5);
10391
10392   aarch64_set_mem_u16
10393     (cpu,
10394      aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 16),
10395      aarch64_get_vec_u16 (cpu, st, 0));
10396 }
10397
10398 /* 16 bit store scaled or unscaled zero-
10399    or sign-extended 16-bit register offset.  */
10400 static void
10401 fstrh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10402 {
10403   unsigned rm = INSTR (20, 16);
10404   unsigned rn = INSTR (9, 5);
10405   unsigned st = INSTR (4, 0);
10406
10407   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10408   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10409                                extension);
10410   uint64_t  displacement = OPT_SCALE (extended, 16, scaling);
10411
10412   aarch64_set_mem_u16
10413     (cpu, address + displacement, aarch64_get_vec_u16 (cpu, st, 0));
10414 }
10415
10416 /* 32 bit store scaled unsigned 12 bit.  */
10417 static void
10418 fstrs_abs (sim_cpu *cpu, uint32_t offset)
10419 {
10420   unsigned st = INSTR (4, 0);
10421   unsigned rn = INSTR (9, 5);
10422
10423   aarch64_set_mem_u32
10424     (cpu,
10425      aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32),
10426      aarch64_get_vec_u32 (cpu, st, 0));
10427 }
10428
10429 /* 32 bit store unscaled signed 9 bit with pre- or post-writeback.  */
10430 static void
10431 fstrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10432 {
10433   unsigned rn = INSTR (9, 5);
10434   unsigned st = INSTR (4, 0);
10435
10436   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10437
10438   if (wb != Post)
10439     address += offset;
10440
10441   aarch64_set_mem_u32 (cpu, address, aarch64_get_vec_u32 (cpu, st, 0));
10442
10443   if (wb == Post)
10444     address += offset;
10445
10446   if (wb != NoWriteBack)
10447     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10448 }
10449
10450 /* 32 bit store scaled or unscaled zero-
10451    or sign-extended 32-bit register offset.  */
10452 static void
10453 fstrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10454 {
10455   unsigned rm = INSTR (20, 16);
10456   unsigned rn = INSTR (9, 5);
10457   unsigned st = INSTR (4, 0);
10458
10459   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10460   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10461                                extension);
10462   uint64_t  displacement = OPT_SCALE (extended, 32, scaling);
10463
10464   aarch64_set_mem_u32
10465     (cpu, address + displacement, aarch64_get_vec_u32 (cpu, st, 0));
10466 }
10467
10468 /* 64 bit store scaled unsigned 12 bit.  */
10469 static void
10470 fstrd_abs (sim_cpu *cpu, uint32_t offset)
10471 {
10472   unsigned st = INSTR (4, 0);
10473   unsigned rn = INSTR (9, 5);
10474
10475   aarch64_set_mem_u64
10476     (cpu,
10477      aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64),
10478      aarch64_get_vec_u64 (cpu, st, 0));
10479 }
10480
10481 /* 64 bit store unscaled signed 9 bit with pre- or post-writeback.  */
10482 static void
10483 fstrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10484 {
10485   unsigned rn = INSTR (9, 5);
10486   unsigned st = INSTR (4, 0);
10487
10488   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10489
10490   if (wb != Post)
10491     address += offset;
10492
10493   aarch64_set_mem_u64 (cpu, address, aarch64_get_vec_u64 (cpu, st, 0));
10494
10495   if (wb == Post)
10496     address += offset;
10497
10498   if (wb != NoWriteBack)
10499     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10500 }
10501
10502 /* 64 bit store scaled or unscaled zero-
10503    or sign-extended 32-bit register offset.  */
10504 static void
10505 fstrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10506 {
10507   unsigned rm = INSTR (20, 16);
10508   unsigned rn = INSTR (9, 5);
10509   unsigned st = INSTR (4, 0);
10510
10511   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10512   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10513                                extension);
10514   uint64_t  displacement = OPT_SCALE (extended, 64, scaling);
10515
10516   aarch64_set_mem_u64
10517     (cpu, address + displacement, aarch64_get_vec_u64 (cpu, st, 0));
10518 }
10519
10520 /* 128 bit store scaled unsigned 12 bit.  */
10521 static void
10522 fstrq_abs (sim_cpu *cpu, uint32_t offset)
10523 {
10524   FRegister a;
10525   unsigned st = INSTR (4, 0);
10526   unsigned rn = INSTR (9, 5);
10527   uint64_t addr;
10528
10529   aarch64_get_FP_long_double (cpu, st, & a);
10530
10531   addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 128);
10532   aarch64_set_mem_long_double (cpu, addr, a);
10533 }
10534
10535 /* 128 bit store unscaled signed 9 bit with pre- or post-writeback.  */
10536 static void
10537 fstrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10538 {
10539   FRegister a;
10540   unsigned rn = INSTR (9, 5);
10541   unsigned st = INSTR (4, 0);
10542   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10543
10544   if (wb != Post)
10545     address += offset;
10546
10547   aarch64_get_FP_long_double (cpu, st, & a);
10548   aarch64_set_mem_long_double (cpu, address, a);
10549
10550   if (wb == Post)
10551     address += offset;
10552
10553   if (wb != NoWriteBack)
10554     aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10555 }
10556
10557 /* 128 bit store scaled or unscaled zero-
10558    or sign-extended 32-bit register offset.  */
10559 static void
10560 fstrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10561 {
10562   unsigned rm = INSTR (20, 16);
10563   unsigned rn = INSTR (9, 5);
10564   unsigned st = INSTR (4, 0);
10565
10566   uint64_t  address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10567   int64_t   extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10568                                extension);
10569   uint64_t  displacement = OPT_SCALE (extended, 128, scaling);
10570
10571   FRegister a;
10572
10573   aarch64_get_FP_long_double (cpu, st, & a);
10574   aarch64_set_mem_long_double (cpu, address + displacement, a);
10575 }
10576
10577 static void
10578 dexLoadImmediatePrePost (sim_cpu *cpu)
10579 {
10580   /* instr[31,30] = size
10581      instr[29,27] = 111
10582      instr[26]    = V
10583      instr[25,24] = 00
10584      instr[23,22] = opc
10585      instr[21]    = 0
10586      instr[20,12] = simm9
10587      instr[11]    = wb : 0 ==> Post, 1 ==> Pre
10588      instr[10]    = 0
10589      instr[9,5]   = Rn may be SP.
10590      instr[4,0]   = Rt */
10591
10592   uint32_t  V        = INSTR (26, 26);
10593   uint32_t  dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10594   int32_t   imm      = simm32 (aarch64_get_instr (cpu), 20, 12);
10595   WriteBack wb       = INSTR (11, 11);
10596
10597   if (!V)
10598     {
10599       /* GReg operations.  */
10600       switch (dispatch)
10601         {
10602         case 0:  strb_wb (cpu, imm, wb); return;
10603         case 1:  ldrb32_wb (cpu, imm, wb); return;
10604         case 2:  ldrsb_wb (cpu, imm, wb); return;
10605         case 3:  ldrsb32_wb (cpu, imm, wb); return;
10606         case 4:  strh_wb (cpu, imm, wb); return;
10607         case 5:  ldrh32_wb (cpu, imm, wb); return;
10608         case 6:  ldrsh64_wb (cpu, imm, wb); return;
10609         case 7:  ldrsh32_wb (cpu, imm, wb); return;
10610         case 8:  str32_wb (cpu, imm, wb); return;
10611         case 9:  ldr32_wb (cpu, imm, wb); return;
10612         case 10: ldrsw_wb (cpu, imm, wb); return;
10613         case 12: str_wb (cpu, imm, wb); return;
10614         case 13: ldr_wb (cpu, imm, wb); return;
10615
10616         default:
10617         case 11:
10618         case 14:
10619         case 15:
10620           HALT_UNALLOC;
10621         }
10622     }
10623
10624   /* FReg operations.  */
10625   switch (dispatch)
10626     {
10627     case 2:  fstrq_wb (cpu, imm, wb); return;
10628     case 3:  fldrq_wb (cpu, imm, wb); return;
10629     case 8:  fstrs_wb (cpu, imm, wb); return;
10630     case 9:  fldrs_wb (cpu, imm, wb); return;
10631     case 12: fstrd_wb (cpu, imm, wb); return;
10632     case 13: fldrd_wb (cpu, imm, wb); return;
10633
10634     case 0:       /* STUR 8 bit FP.  */
10635     case 1:       /* LDUR 8 bit FP.  */
10636     case 4:       /* STUR 16 bit FP.  */
10637     case 5:       /* LDUR 8 bit FP.  */
10638       HALT_NYI;
10639
10640     default:
10641     case 6:
10642     case 7:
10643     case 10:
10644     case 11:
10645     case 14:
10646     case 15:
10647       HALT_UNALLOC;
10648     }
10649 }
10650
10651 static void
10652 dexLoadRegisterOffset (sim_cpu *cpu)
10653 {
10654   /* instr[31,30] = size
10655      instr[29,27] = 111
10656      instr[26]    = V
10657      instr[25,24] = 00
10658      instr[23,22] = opc
10659      instr[21]    = 1
10660      instr[20,16] = rm
10661      instr[15,13] = option : 010 ==> UXTW, 011 ==> UXTX/LSL,
10662                              110 ==> SXTW, 111 ==> SXTX,
10663                              ow ==> RESERVED
10664      instr[12]    = scaled
10665      instr[11,10] = 10
10666      instr[9,5]   = rn
10667      instr[4,0]   = rt.  */
10668
10669   uint32_t  V = INSTR (26, 26);
10670   uint32_t  dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10671   Scaling   scale = INSTR (12, 12);
10672   Extension extensionType = INSTR (15, 13);
10673
10674   /* Check for illegal extension types.  */
10675   if (uimm (extensionType, 1, 1) == 0)
10676     HALT_UNALLOC;
10677
10678   if (extensionType == UXTX || extensionType == SXTX)
10679     extensionType = NoExtension;
10680
10681   if (!V)
10682     {
10683       /* GReg operations.  */
10684       switch (dispatch)
10685         {
10686         case 0:  strb_scale_ext (cpu, scale, extensionType); return;
10687         case 1:  ldrb32_scale_ext (cpu, scale, extensionType); return;
10688         case 2:  ldrsb_scale_ext (cpu, scale, extensionType); return;
10689         case 3:  ldrsb32_scale_ext (cpu, scale, extensionType); return;
10690         case 4:  strh_scale_ext (cpu, scale, extensionType); return;
10691         case 5:  ldrh32_scale_ext (cpu, scale, extensionType); return;
10692         case 6:  ldrsh_scale_ext (cpu, scale, extensionType); return;
10693         case 7:  ldrsh32_scale_ext (cpu, scale, extensionType); return;
10694         case 8:  str32_scale_ext (cpu, scale, extensionType); return;
10695         case 9:  ldr32_scale_ext (cpu, scale, extensionType); return;
10696         case 10: ldrsw_scale_ext (cpu, scale, extensionType); return;
10697         case 12: str_scale_ext (cpu, scale, extensionType); return;
10698         case 13: ldr_scale_ext (cpu, scale, extensionType); return;
10699         case 14: prfm_scale_ext (cpu, scale, extensionType); return;
10700
10701         default:
10702         case 11:
10703         case 15:
10704           HALT_UNALLOC;
10705         }
10706     }
10707
10708   /* FReg operations.  */
10709   switch (dispatch)
10710     {
10711     case 1: /* LDUR 8 bit FP.  */
10712       HALT_NYI;
10713     case 3:  fldrq_scale_ext (cpu, scale, extensionType); return;
10714     case 5: /* LDUR 8 bit FP.  */
10715       HALT_NYI;
10716     case 9:  fldrs_scale_ext (cpu, scale, extensionType); return;
10717     case 13: fldrd_scale_ext (cpu, scale, extensionType); return;
10718
10719     case 0:  fstrb_scale_ext (cpu, scale, extensionType); return;
10720     case 2:  fstrq_scale_ext (cpu, scale, extensionType); return;
10721     case 4:  fstrh_scale_ext (cpu, scale, extensionType); return;
10722     case 8:  fstrs_scale_ext (cpu, scale, extensionType); return;
10723     case 12: fstrd_scale_ext (cpu, scale, extensionType); return;
10724
10725     default:
10726     case 6:
10727     case 7:
10728     case 10:
10729     case 11:
10730     case 14:
10731     case 15:
10732       HALT_UNALLOC;
10733     }
10734 }
10735
10736 static void
10737 dexLoadUnsignedImmediate (sim_cpu *cpu)
10738 {
10739   /* instr[29,24] == 111_01
10740      instr[31,30] = size
10741      instr[26]    = V
10742      instr[23,22] = opc
10743      instr[21,10] = uimm12 : unsigned immediate offset
10744      instr[9,5]   = rn may be SP.
10745      instr[4,0]   = rt.  */
10746
10747   uint32_t V = INSTR (26,26);
10748   uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10749   uint32_t imm = INSTR (21, 10);
10750
10751   if (!V)
10752     {
10753       /* GReg operations.  */
10754       switch (dispatch)
10755         {
10756         case 0:  strb_abs (cpu, imm); return;
10757         case 1:  ldrb32_abs (cpu, imm); return;
10758         case 2:  ldrsb_abs (cpu, imm); return;
10759         case 3:  ldrsb32_abs (cpu, imm); return;
10760         case 4:  strh_abs (cpu, imm); return;
10761         case 5:  ldrh32_abs (cpu, imm); return;
10762         case 6:  ldrsh_abs (cpu, imm); return;
10763         case 7:  ldrsh32_abs (cpu, imm); return;
10764         case 8:  str32_abs (cpu, imm); return;
10765         case 9:  ldr32_abs (cpu, imm); return;
10766         case 10: ldrsw_abs (cpu, imm); return;
10767         case 12: str_abs (cpu, imm); return;
10768         case 13: ldr_abs (cpu, imm); return;
10769         case 14: prfm_abs (cpu, imm); return;
10770
10771         default:
10772         case 11:
10773         case 15:
10774           HALT_UNALLOC;
10775         }
10776     }
10777
10778   /* FReg operations.  */
10779   switch (dispatch)
10780     {
10781     case 0:  fstrb_abs (cpu, imm); return;
10782     case 4:  fstrh_abs (cpu, imm); return;
10783     case 8:  fstrs_abs (cpu, imm); return;
10784     case 12: fstrd_abs (cpu, imm); return;
10785     case 2:  fstrq_abs (cpu, imm); return;
10786
10787     case 1:  fldrb_abs (cpu, imm); return;
10788     case 5:  fldrh_abs (cpu, imm); return;
10789     case 9:  fldrs_abs (cpu, imm); return;
10790     case 13: fldrd_abs (cpu, imm); return;
10791     case 3:  fldrq_abs (cpu, imm); return;
10792
10793     default:
10794     case 6:
10795     case 7:
10796     case 10:
10797     case 11:
10798     case 14:
10799     case 15:
10800       HALT_UNALLOC;
10801     }
10802 }
10803
10804 static void
10805 dexLoadExclusive (sim_cpu *cpu)
10806 {
10807   /* assert instr[29:24] = 001000;
10808      instr[31,30] = size
10809      instr[23] = 0 if exclusive
10810      instr[22] = L : 1 if load, 0 if store
10811      instr[21] = 1 if pair
10812      instr[20,16] = Rs
10813      instr[15] = o0 : 1 if ordered
10814      instr[14,10] = Rt2
10815      instr[9,5] = Rn
10816      instr[4.0] = Rt.  */
10817
10818   switch (INSTR (22, 21))
10819     {
10820     case 2:   ldxr (cpu); return;
10821     case 0:   stxr (cpu); return;
10822     default:  HALT_NYI;
10823     }
10824 }
10825
10826 static void
10827 dexLoadOther (sim_cpu *cpu)
10828 {
10829   uint32_t dispatch;
10830
10831   /* instr[29,25] = 111_0
10832      instr[24] == 0 ==> dispatch, 1 ==> ldst reg unsigned immediate
10833      instr[21:11,10] is the secondary dispatch.  */
10834   if (INSTR (24, 24))
10835     {
10836       dexLoadUnsignedImmediate (cpu);
10837       return;
10838     }
10839
10840   dispatch = ((INSTR (21, 21) << 2) | INSTR (11, 10));
10841   switch (dispatch)
10842     {
10843     case 0: dexLoadUnscaledImmediate (cpu); return;
10844     case 1: dexLoadImmediatePrePost (cpu); return;
10845     case 3: dexLoadImmediatePrePost (cpu); return;
10846     case 6: dexLoadRegisterOffset (cpu); return;
10847
10848     default:
10849     case 2:
10850     case 4:
10851     case 5:
10852     case 7:
10853       HALT_NYI;
10854     }
10855 }
10856
10857 static void
10858 store_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10859 {
10860   unsigned rn = INSTR (14, 10);
10861   unsigned rd = INSTR (9, 5);
10862   unsigned rm = INSTR (4, 0);
10863   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10864
10865   if ((rn == rd || rm == rd) && wb != NoWriteBack)
10866     HALT_UNALLOC; /* ??? */
10867
10868   offset <<= 2;
10869
10870   if (wb != Post)
10871     address += offset;
10872
10873   aarch64_set_mem_u32 (cpu, address,
10874                        aarch64_get_reg_u32 (cpu, rm, NO_SP));
10875   aarch64_set_mem_u32 (cpu, address + 4,
10876                        aarch64_get_reg_u32 (cpu, rn, NO_SP));
10877
10878   if (wb == Post)
10879     address += offset;
10880
10881   if (wb != NoWriteBack)
10882     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10883 }
10884
10885 static void
10886 store_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10887 {
10888   unsigned rn = INSTR (14, 10);
10889   unsigned rd = INSTR (9, 5);
10890   unsigned rm = INSTR (4, 0);
10891   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10892
10893   if ((rn == rd || rm == rd) && wb != NoWriteBack)
10894     HALT_UNALLOC; /* ??? */
10895
10896   offset <<= 3;
10897
10898   if (wb != Post)
10899     address += offset;
10900
10901   aarch64_set_mem_u64 (cpu, address,
10902                        aarch64_get_reg_u64 (cpu, rm, NO_SP));
10903   aarch64_set_mem_u64 (cpu, address + 8,
10904                        aarch64_get_reg_u64 (cpu, rn, NO_SP));
10905
10906   if (wb == Post)
10907     address += offset;
10908
10909   if (wb != NoWriteBack)
10910     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10911 }
10912
10913 static void
10914 load_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10915 {
10916   unsigned rn = INSTR (14, 10);
10917   unsigned rd = INSTR (9, 5);
10918   unsigned rm = INSTR (4, 0);
10919   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10920
10921   /* Treat this as unalloc to make sure we don't do it.  */
10922   if (rn == rm)
10923     HALT_UNALLOC;
10924
10925   offset <<= 2;
10926
10927   if (wb != Post)
10928     address += offset;
10929
10930   aarch64_set_reg_u64 (cpu, rm, SP_OK, aarch64_get_mem_u32 (cpu, address));
10931   aarch64_set_reg_u64 (cpu, rn, SP_OK, aarch64_get_mem_u32 (cpu, address + 4));
10932
10933   if (wb == Post)
10934     address += offset;
10935
10936   if (wb != NoWriteBack)
10937     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10938 }
10939
10940 static void
10941 load_pair_s32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10942 {
10943   unsigned rn = INSTR (14, 10);
10944   unsigned rd = INSTR (9, 5);
10945   unsigned rm = INSTR (4, 0);
10946   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10947
10948   /* Treat this as unalloc to make sure we don't do it.  */
10949   if (rn == rm)
10950     HALT_UNALLOC;
10951
10952   offset <<= 2;
10953
10954   if (wb != Post)
10955     address += offset;
10956
10957   aarch64_set_reg_s64 (cpu, rm, SP_OK, aarch64_get_mem_s32 (cpu, address));
10958   aarch64_set_reg_s64 (cpu, rn, SP_OK, aarch64_get_mem_s32 (cpu, address + 4));
10959
10960   if (wb == Post)
10961     address += offset;
10962
10963   if (wb != NoWriteBack)
10964     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10965 }
10966
10967 static void
10968 load_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10969 {
10970   unsigned rn = INSTR (14, 10);
10971   unsigned rd = INSTR (9, 5);
10972   unsigned rm = INSTR (4, 0);
10973   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10974
10975   /* Treat this as unalloc to make sure we don't do it.  */
10976   if (rn == rm)
10977     HALT_UNALLOC;
10978
10979   offset <<= 3;
10980
10981   if (wb != Post)
10982     address += offset;
10983
10984   aarch64_set_reg_u64 (cpu, rm, SP_OK, aarch64_get_mem_u64 (cpu, address));
10985   aarch64_set_reg_u64 (cpu, rn, SP_OK, aarch64_get_mem_u64 (cpu, address + 8));
10986
10987   if (wb == Post)
10988     address += offset;
10989
10990   if (wb != NoWriteBack)
10991     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10992 }
10993
10994 static void
10995 dex_load_store_pair_gr (sim_cpu *cpu)
10996 {
10997   /* instr[31,30] = size (10=> 64-bit, 01=> signed 32-bit, 00=> 32-bit)
10998      instr[29,25] = instruction encoding: 101_0
10999      instr[26]    = V : 1 if fp 0 if gp
11000      instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
11001      instr[22]    = load/store (1=> load)
11002      instr[21,15] = signed, scaled, offset
11003      instr[14,10] = Rn
11004      instr[ 9, 5] = Rd
11005      instr[ 4, 0] = Rm.  */
11006
11007   uint32_t dispatch = ((INSTR (31, 30) << 3) | INSTR (24, 22));
11008   int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15);
11009
11010   switch (dispatch)
11011     {
11012     case 2: store_pair_u32 (cpu, offset, Post); return;
11013     case 3: load_pair_u32  (cpu, offset, Post); return;
11014     case 4: store_pair_u32 (cpu, offset, NoWriteBack); return;
11015     case 5: load_pair_u32  (cpu, offset, NoWriteBack); return;
11016     case 6: store_pair_u32 (cpu, offset, Pre); return;
11017     case 7: load_pair_u32  (cpu, offset, Pre); return;
11018
11019     case 11: load_pair_s32  (cpu, offset, Post); return;
11020     case 13: load_pair_s32  (cpu, offset, NoWriteBack); return;
11021     case 15: load_pair_s32  (cpu, offset, Pre); return;
11022
11023     case 18: store_pair_u64 (cpu, offset, Post); return;
11024     case 19: load_pair_u64  (cpu, offset, Post); return;
11025     case 20: store_pair_u64 (cpu, offset, NoWriteBack); return;
11026     case 21: load_pair_u64  (cpu, offset, NoWriteBack); return;
11027     case 22: store_pair_u64 (cpu, offset, Pre); return;
11028     case 23: load_pair_u64  (cpu, offset, Pre); return;
11029
11030     default:
11031       HALT_UNALLOC;
11032     }
11033 }
11034
11035 static void
11036 store_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
11037 {
11038   unsigned rn = INSTR (14, 10);
11039   unsigned rd = INSTR (9, 5);
11040   unsigned rm = INSTR (4, 0);
11041   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11042
11043   offset <<= 2;
11044
11045   if (wb != Post)
11046     address += offset;
11047
11048   aarch64_set_mem_u32 (cpu, address,     aarch64_get_vec_u32 (cpu, rm, 0));
11049   aarch64_set_mem_u32 (cpu, address + 4, aarch64_get_vec_u32 (cpu, rn, 0));
11050
11051   if (wb == Post)
11052     address += offset;
11053
11054   if (wb != NoWriteBack)
11055     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11056 }
11057
11058 static void
11059 store_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11060 {
11061   unsigned rn = INSTR (14, 10);
11062   unsigned rd = INSTR (9, 5);
11063   unsigned rm = INSTR (4, 0);
11064   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11065
11066   offset <<= 3;
11067
11068   if (wb != Post)
11069     address += offset;
11070
11071   aarch64_set_mem_u64 (cpu, address,     aarch64_get_vec_u64 (cpu, rm, 0));
11072   aarch64_set_mem_u64 (cpu, address + 8, aarch64_get_vec_u64 (cpu, rn, 0));
11073
11074   if (wb == Post)
11075     address += offset;
11076
11077   if (wb != NoWriteBack)
11078     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11079 }
11080
11081 static void
11082 store_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11083 {
11084   FRegister a;
11085   unsigned rn = INSTR (14, 10);
11086   unsigned rd = INSTR (9, 5);
11087   unsigned rm = INSTR (4, 0);
11088   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11089
11090   offset <<= 4;
11091
11092   if (wb != Post)
11093     address += offset;
11094
11095   aarch64_get_FP_long_double (cpu, rm, & a);
11096   aarch64_set_mem_long_double (cpu, address, a);
11097   aarch64_get_FP_long_double (cpu, rn, & a);
11098   aarch64_set_mem_long_double (cpu, address + 16, a);
11099
11100   if (wb == Post)
11101     address += offset;
11102
11103   if (wb != NoWriteBack)
11104     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11105 }
11106
11107 static void
11108 load_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
11109 {
11110   unsigned rn = INSTR (14, 10);
11111   unsigned rd = INSTR (9, 5);
11112   unsigned rm = INSTR (4, 0);
11113   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11114
11115   if (rm == rn)
11116     HALT_UNALLOC;
11117
11118   offset <<= 2;
11119
11120   if (wb != Post)
11121     address += offset;
11122
11123   aarch64_set_vec_u32 (cpu, rm, 0, aarch64_get_mem_u32 (cpu, address));
11124   aarch64_set_vec_u32 (cpu, rn, 0, aarch64_get_mem_u32 (cpu, address + 4));
11125
11126   if (wb == Post)
11127     address += offset;
11128
11129   if (wb != NoWriteBack)
11130     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11131 }
11132
11133 static void
11134 load_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11135 {
11136   unsigned rn = INSTR (14, 10);
11137   unsigned rd = INSTR (9, 5);
11138   unsigned rm = INSTR (4, 0);
11139   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11140
11141   if (rm == rn)
11142     HALT_UNALLOC;
11143
11144   offset <<= 3;
11145
11146   if (wb != Post)
11147     address += offset;
11148
11149   aarch64_set_vec_u64 (cpu, rm, 0, aarch64_get_mem_u64 (cpu, address));
11150   aarch64_set_vec_u64 (cpu, rn, 0, aarch64_get_mem_u64 (cpu, address + 8));
11151
11152   if (wb == Post)
11153     address += offset;
11154
11155   if (wb != NoWriteBack)
11156     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11157 }
11158
11159 static void
11160 load_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11161 {
11162   FRegister a;
11163   unsigned rn = INSTR (14, 10);
11164   unsigned rd = INSTR (9, 5);
11165   unsigned rm = INSTR (4, 0);
11166   uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11167
11168   if (rm == rn)
11169     HALT_UNALLOC;
11170
11171   offset <<= 4;
11172
11173   if (wb != Post)
11174     address += offset;
11175
11176   aarch64_get_mem_long_double (cpu, address, & a);
11177   aarch64_set_FP_long_double (cpu, rm, a);
11178   aarch64_get_mem_long_double (cpu, address + 16, & a);
11179   aarch64_set_FP_long_double (cpu, rn, a);
11180
11181   if (wb == Post)
11182     address += offset;
11183
11184   if (wb != NoWriteBack)
11185     aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11186 }
11187
11188 static void
11189 dex_load_store_pair_fp (sim_cpu *cpu)
11190 {
11191   /* instr[31,30] = size (10=> 128-bit, 01=> 64-bit, 00=> 32-bit)
11192      instr[29,25] = instruction encoding
11193      instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
11194      instr[22]    = load/store (1=> load)
11195      instr[21,15] = signed, scaled, offset
11196      instr[14,10] = Rn
11197      instr[ 9, 5] = Rd
11198      instr[ 4, 0] = Rm  */
11199
11200   uint32_t dispatch = ((INSTR (31, 30) << 3) | INSTR (24, 22));
11201   int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15);
11202
11203   switch (dispatch)
11204     {
11205     case 2: store_pair_float (cpu, offset, Post); return;
11206     case 3: load_pair_float  (cpu, offset, Post); return;
11207     case 4: store_pair_float (cpu, offset, NoWriteBack); return;
11208     case 5: load_pair_float  (cpu, offset, NoWriteBack); return;
11209     case 6: store_pair_float (cpu, offset, Pre); return;
11210     case 7: load_pair_float  (cpu, offset, Pre); return;
11211
11212     case 10: store_pair_double (cpu, offset, Post); return;
11213     case 11: load_pair_double  (cpu, offset, Post); return;
11214     case 12: store_pair_double (cpu, offset, NoWriteBack); return;
11215     case 13: load_pair_double  (cpu, offset, NoWriteBack); return;
11216     case 14: store_pair_double (cpu, offset, Pre); return;
11217     case 15: load_pair_double  (cpu, offset, Pre); return;
11218
11219     case 18: store_pair_long_double (cpu, offset, Post); return;
11220     case 19: load_pair_long_double  (cpu, offset, Post); return;
11221     case 20: store_pair_long_double (cpu, offset, NoWriteBack); return;
11222     case 21: load_pair_long_double  (cpu, offset, NoWriteBack); return;
11223     case 22: store_pair_long_double (cpu, offset, Pre); return;
11224     case 23: load_pair_long_double  (cpu, offset, Pre); return;
11225
11226     default:
11227       HALT_UNALLOC;
11228     }
11229 }
11230
/* Return the number of the vector register O places after register V.
   There are 32 vector registers and the sequence wraps, so the result
   is reduced modulo 32 (V31 is followed by V0).  */
static inline unsigned
vec_reg (unsigned v, unsigned o)
{
  /* 0x1F keeps the result in 0..31; a wider mask would let the sum
     name registers that do not exist.  */
  return (v + o) & 0x1F;
}
11236
11237 /* Load multiple N-element structures to N consecutive registers.  */
11238 static void
11239 vec_load (sim_cpu *cpu, uint64_t address, unsigned N)
11240 {
11241   int      all  = INSTR (30, 30);
11242   unsigned size = INSTR (11, 10);
11243   unsigned vd   = INSTR (4, 0);
11244   unsigned i;
11245
11246   switch (size)
11247     {
11248     case 0: /* 8-bit operations.  */
11249       if (all)
11250         for (i = 0; i < (16 * N); i++)
11251           aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15,
11252                               aarch64_get_mem_u8 (cpu, address + i));
11253       else
11254         for (i = 0; i < (8 * N); i++)
11255           aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7,
11256                               aarch64_get_mem_u8 (cpu, address + i));
11257       return;
11258
11259     case 1: /* 16-bit operations.  */
11260       if (all)
11261         for (i = 0; i < (8 * N); i++)
11262           aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7,
11263                                aarch64_get_mem_u16 (cpu, address + i * 2));
11264       else
11265         for (i = 0; i < (4 * N); i++)
11266           aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3,
11267                                aarch64_get_mem_u16 (cpu, address + i * 2));
11268       return;
11269
11270     case 2: /* 32-bit operations.  */
11271       if (all)
11272         for (i = 0; i < (4 * N); i++)
11273           aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3,
11274                                aarch64_get_mem_u32 (cpu, address + i * 4));
11275       else
11276         for (i = 0; i < (2 * N); i++)
11277           aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1,
11278                                aarch64_get_mem_u32 (cpu, address + i * 4));
11279       return;
11280
11281     case 3: /* 64-bit operations.  */
11282       if (all)
11283         for (i = 0; i < (2 * N); i++)
11284           aarch64_set_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1,
11285                                aarch64_get_mem_u64 (cpu, address + i * 8));
11286       else
11287         for (i = 0; i < N; i++)
11288           aarch64_set_vec_u64 (cpu, vec_reg (vd, i), 0,
11289                                aarch64_get_mem_u64 (cpu, address + i * 8));
11290       return;
11291     }
11292 }
11293
11294 /* LD4: load multiple 4-element to four consecutive registers.  */
11295 static void
11296 LD4 (sim_cpu *cpu, uint64_t address)
11297 {
11298   vec_load (cpu, address, 4);
11299 }
11300
11301 /* LD3: load multiple 3-element structures to three consecutive registers.  */
11302 static void
11303 LD3 (sim_cpu *cpu, uint64_t address)
11304 {
11305   vec_load (cpu, address, 3);
11306 }
11307
11308 /* LD2: load multiple 2-element structures to two consecutive registers.  */
11309 static void
11310 LD2 (sim_cpu *cpu, uint64_t address)
11311 {
11312   vec_load (cpu, address, 2);
11313 }
11314
11315 /* Load multiple 1-element structures into one register.  */
11316 static void
11317 LD1_1 (sim_cpu *cpu, uint64_t address)
11318 {
11319   int      all  = INSTR (30, 30);
11320   unsigned size = INSTR (11, 10);
11321   unsigned vd   = INSTR (4, 0);
11322   unsigned i;
11323
11324   switch (size)
11325     {
11326     case 0:
11327       /* LD1 {Vd.16b}, addr, #16 */
11328       /* LD1 {Vd.8b}, addr, #8 */
11329       for (i = 0; i < (all ? 16 : 8); i++)
11330         aarch64_set_vec_u8 (cpu, vd, i,
11331                             aarch64_get_mem_u8 (cpu, address + i));
11332       return;
11333
11334     case 1:
11335       /* LD1 {Vd.8h}, addr, #16 */
11336       /* LD1 {Vd.4h}, addr, #8 */
11337       for (i = 0; i < (all ? 8 : 4); i++)
11338         aarch64_set_vec_u16 (cpu, vd, i,
11339                              aarch64_get_mem_u16 (cpu, address + i * 2));
11340       return;
11341
11342     case 2:
11343       /* LD1 {Vd.4s}, addr, #16 */
11344       /* LD1 {Vd.2s}, addr, #8 */
11345       for (i = 0; i < (all ? 4 : 2); i++)
11346         aarch64_set_vec_u32 (cpu, vd, i,
11347                              aarch64_get_mem_u32 (cpu, address + i * 4));
11348       return;
11349
11350     case 3:
11351       /* LD1 {Vd.2d}, addr, #16 */
11352       /* LD1 {Vd.1d}, addr, #8 */
11353       for (i = 0; i < (all ? 2 : 1); i++)
11354         aarch64_set_vec_u64 (cpu, vd, i,
11355                              aarch64_get_mem_u64 (cpu, address + i * 8));
11356       return;
11357     }
11358 }
11359
11360 /* Load multiple 1-element structures into two registers.  */
11361 static void
11362 LD1_2 (sim_cpu *cpu, uint64_t address)
11363 {
11364   /* FIXME: This algorithm is *exactly* the same as the LD2 version.
11365      So why have two different instructions ?  There must be something
11366      wrong somewhere.  */
11367   vec_load (cpu, address, 2);
11368 }
11369
11370 /* Load multiple 1-element structures into three registers.  */
11371 static void
11372 LD1_3 (sim_cpu *cpu, uint64_t address)
11373 {
11374   /* FIXME: This algorithm is *exactly* the same as the LD3 version.
11375      So why have two different instructions ?  There must be something
11376      wrong somewhere.  */
11377   vec_load (cpu, address, 3);
11378 }
11379
11380 /* Load multiple 1-element structures into four registers.  */
11381 static void
11382 LD1_4 (sim_cpu *cpu, uint64_t address)
11383 {
11384   /* FIXME: This algorithm is *exactly* the same as the LD4 version.
11385      So why have two different instructions ?  There must be something
11386      wrong somewhere.  */
11387   vec_load (cpu, address, 4);
11388 }
11389
11390 /* Store multiple N-element structures to N consecutive registers.  */
11391 static void
11392 vec_store (sim_cpu *cpu, uint64_t address, unsigned N)
11393 {
11394   int      all  = INSTR (30, 30);
11395   unsigned size = INSTR (11, 10);
11396   unsigned vd   = INSTR (4, 0);
11397   unsigned i;
11398
11399   switch (size)
11400     {
11401     case 0: /* 8-bit operations.  */
11402       if (all)
11403         for (i = 0; i < (16 * N); i++)
11404           aarch64_set_mem_u8
11405             (cpu, address + i,
11406              aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15));
11407       else
11408         for (i = 0; i < (8 * N); i++)
11409           aarch64_set_mem_u8
11410             (cpu, address + i,
11411              aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7));
11412       return;
11413
11414     case 1: /* 16-bit operations.  */
11415       if (all)
11416         for (i = 0; i < (8 * N); i++)
11417           aarch64_set_mem_u16
11418             (cpu, address + i * 2,
11419              aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7));
11420       else
11421         for (i = 0; i < (4 * N); i++)
11422           aarch64_set_mem_u16
11423             (cpu, address + i * 2,
11424              aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3));
11425       return;
11426
11427     case 2: /* 32-bit operations.  */
11428       if (all)
11429         for (i = 0; i < (4 * N); i++)
11430           aarch64_set_mem_u32
11431             (cpu, address + i * 4,
11432              aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3));
11433       else
11434         for (i = 0; i < (2 * N); i++)
11435           aarch64_set_mem_u32
11436             (cpu, address + i * 4,
11437              aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1));
11438       return;
11439
11440     case 3: /* 64-bit operations.  */
11441       if (all)
11442         for (i = 0; i < (2 * N); i++)
11443           aarch64_set_mem_u64
11444             (cpu, address + i * 8,
11445              aarch64_get_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1));
11446       else
11447         for (i = 0; i < N; i++)
11448           aarch64_set_mem_u64
11449             (cpu, address + i * 8,
11450              aarch64_get_vec_u64 (cpu, vec_reg (vd, i), 0));
11451       return;
11452     }
11453 }
11454
11455 /* Store multiple 4-element structure to four consecutive registers.  */
11456 static void
11457 ST4 (sim_cpu *cpu, uint64_t address)
11458 {
11459   vec_store (cpu, address, 4);
11460 }
11461
11462 /* Store multiple 3-element structures to three consecutive registers.  */
11463 static void
11464 ST3 (sim_cpu *cpu, uint64_t address)
11465 {
11466   vec_store (cpu, address, 3);
11467 }
11468
11469 /* Store multiple 2-element structures to two consecutive registers.  */
11470 static void
11471 ST2 (sim_cpu *cpu, uint64_t address)
11472 {
11473   vec_store (cpu, address, 2);
11474 }
11475
11476 /* Store multiple 1-element structures into one register.  */
11477 static void
11478 ST1_1 (sim_cpu *cpu, uint64_t address)
11479 {
11480   int      all  = INSTR (30, 30);
11481   unsigned size = INSTR (11, 10);
11482   unsigned vd   = INSTR (4, 0);
11483   unsigned i;
11484
11485   switch (size)
11486     {
11487     case 0:
11488       for (i = 0; i < (all ? 16 : 8); i++)
11489         aarch64_set_mem_u8 (cpu, address + i,
11490                             aarch64_get_vec_u8 (cpu, vd, i));
11491       return;
11492
11493     case 1:
11494       for (i = 0; i < (all ? 8 : 4); i++)
11495         aarch64_set_mem_u16 (cpu, address + i * 2,
11496                              aarch64_get_vec_u16 (cpu, vd, i));
11497       return;
11498
11499     case 2:
11500       for (i = 0; i < (all ? 4 : 2); i++)
11501         aarch64_set_mem_u32 (cpu, address + i * 4,
11502                              aarch64_get_vec_u32 (cpu, vd, i));
11503       return;
11504
11505     case 3:
11506       for (i = 0; i < (all ? 2 : 1); i++)
11507         aarch64_set_mem_u64 (cpu, address + i * 8,
11508                              aarch64_get_vec_u64 (cpu, vd, i));
11509       return;
11510     }
11511 }
11512
11513 /* Store multiple 1-element structures into two registers.  */
11514 static void
11515 ST1_2 (sim_cpu *cpu, uint64_t address)
11516 {
11517   /* FIXME: This algorithm is *exactly* the same as the ST2 version.
11518      So why have two different instructions ?  There must be
11519      something wrong somewhere.  */
11520   vec_store (cpu, address, 2);
11521 }
11522
11523 /* Store multiple 1-element structures into three registers.  */
11524 static void
11525 ST1_3 (sim_cpu *cpu, uint64_t address)
11526 {
11527   /* FIXME: This algorithm is *exactly* the same as the ST3 version.
11528      So why have two different instructions ?  There must be
11529      something wrong somewhere.  */
11530   vec_store (cpu, address, 3);
11531 }
11532
11533 /* Store multiple 1-element structures into four registers.  */
11534 static void
11535 ST1_4 (sim_cpu *cpu, uint64_t address)
11536 {
11537   /* FIXME: This algorithm is *exactly* the same as the ST4 version.
11538      So why have two different instructions ?  There must be
11539      something wrong somewhere.  */
11540   vec_store (cpu, address, 4);
11541 }
11542
11543 static void
11544 do_vec_LDnR (sim_cpu *cpu, uint64_t address)
11545 {
11546   /* instr[31]    = 0
11547      instr[30]    = element selector 0=>half, 1=>all elements
11548      instr[29,24] = 00 1101
11549      instr[23]    = 0=>simple, 1=>post
11550      instr[22]    = 1
11551      instr[21]    = width: LD1R-or-LD3R (0) / LD2R-or-LD4R (1)
11552      instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11553                       11111 (immediate post inc)
11554      instr[15,14] = 11
11555      instr[13]    = width: LD1R-or-LD2R (0) / LD3R-or-LD4R (1)
11556      instr[12]    = 0
11557      instr[11,10] = element size 00=> byte(b), 01=> half(h),
11558                                  10=> word(s), 11=> double(d)
11559      instr[9,5]   = address
11560      instr[4,0]   = Vd  */
11561
11562   unsigned full = INSTR (30, 30);
11563   unsigned vd = INSTR (4, 0);
11564   unsigned size = INSTR (11, 10);
11565   int i;
11566
11567   NYI_assert (29, 24, 0x0D);
11568   NYI_assert (22, 22, 1);
11569   NYI_assert (15, 14, 3);
11570   NYI_assert (12, 12, 0);
11571
11572   switch ((INSTR (13, 13) << 1) | INSTR (21, 21))
11573     {
11574     case 0: /* LD1R.  */
11575       switch (size)
11576         {
11577         case 0:
11578           {
11579             uint8_t val = aarch64_get_mem_u8 (cpu, address);
11580             for (i = 0; i < (full ? 16 : 8); i++)
11581               aarch64_set_vec_u8 (cpu, vd, i, val);
11582             break;
11583           }
11584
11585         case 1:
11586           {
11587             uint16_t val = aarch64_get_mem_u16 (cpu, address);
11588             for (i = 0; i < (full ? 8 : 4); i++)
11589               aarch64_set_vec_u16 (cpu, vd, i, val);
11590             break;
11591           }
11592
11593         case 2:
11594           {
11595             uint32_t val = aarch64_get_mem_u32 (cpu, address);
11596             for (i = 0; i < (full ? 4 : 2); i++)
11597               aarch64_set_vec_u32 (cpu, vd, i, val);
11598             break;
11599           }
11600
11601         case 3:
11602           {
11603             uint64_t val = aarch64_get_mem_u64 (cpu, address);
11604             for (i = 0; i < (full ? 2 : 1); i++)
11605               aarch64_set_vec_u64 (cpu, vd, i, val);
11606             break;
11607           }
11608
11609         default:
11610           HALT_UNALLOC;
11611         }
11612       break;
11613
11614     case 1: /* LD2R.  */
11615       switch (size)
11616         {
11617         case 0:
11618           {
11619             uint8_t val1 = aarch64_get_mem_u8 (cpu, address);
11620             uint8_t val2 = aarch64_get_mem_u8 (cpu, address + 1);
11621
11622             for (i = 0; i < (full ? 16 : 8); i++)
11623               {
11624                 aarch64_set_vec_u8 (cpu, vd, 0, val1);
11625                 aarch64_set_vec_u8 (cpu, vd + 1, 0, val2);
11626               }
11627             break;
11628           }
11629
11630         case 1:
11631           {
11632             uint16_t val1 = aarch64_get_mem_u16 (cpu, address);
11633             uint16_t val2 = aarch64_get_mem_u16 (cpu, address + 2);
11634
11635             for (i = 0; i < (full ? 8 : 4); i++)
11636               {
11637                 aarch64_set_vec_u16 (cpu, vd, 0, val1);
11638                 aarch64_set_vec_u16 (cpu, vd + 1, 0, val2);
11639               }
11640             break;
11641           }
11642
11643         case 2:
11644           {
11645             uint32_t val1 = aarch64_get_mem_u32 (cpu, address);
11646             uint32_t val2 = aarch64_get_mem_u32 (cpu, address + 4);
11647
11648             for (i = 0; i < (full ? 4 : 2); i++)
11649               {
11650                 aarch64_set_vec_u32 (cpu, vd, 0, val1);
11651                 aarch64_set_vec_u32 (cpu, vd + 1, 0, val2);
11652               }
11653             break;
11654           }
11655
11656         case 3:
11657           {
11658             uint64_t val1 = aarch64_get_mem_u64 (cpu, address);
11659             uint64_t val2 = aarch64_get_mem_u64 (cpu, address + 8);
11660
11661             for (i = 0; i < (full ? 2 : 1); i++)
11662               {
11663                 aarch64_set_vec_u64 (cpu, vd, 0, val1);
11664                 aarch64_set_vec_u64 (cpu, vd + 1, 0, val2);
11665               }
11666             break;
11667           }
11668
11669         default:
11670           HALT_UNALLOC;
11671         }
11672       break;
11673
11674     case 2: /* LD3R.  */
11675       switch (size)
11676         {
11677         case 0:
11678           {
11679             uint8_t val1 = aarch64_get_mem_u8 (cpu, address);
11680             uint8_t val2 = aarch64_get_mem_u8 (cpu, address + 1);
11681             uint8_t val3 = aarch64_get_mem_u8 (cpu, address + 2);
11682
11683             for (i = 0; i < (full ? 16 : 8); i++)
11684               {
11685                 aarch64_set_vec_u8 (cpu, vd, 0, val1);
11686                 aarch64_set_vec_u8 (cpu, vd + 1, 0, val2);
11687                 aarch64_set_vec_u8 (cpu, vd + 2, 0, val3);
11688               }
11689           }
11690           break;
11691
11692         case 1:
11693           {
11694             uint32_t val1 = aarch64_get_mem_u16 (cpu, address);
11695             uint32_t val2 = aarch64_get_mem_u16 (cpu, address + 2);
11696             uint32_t val3 = aarch64_get_mem_u16 (cpu, address + 4);
11697
11698             for (i = 0; i < (full ? 8 : 4); i++)
11699               {
11700                 aarch64_set_vec_u16 (cpu, vd, 0, val1);
11701                 aarch64_set_vec_u16 (cpu, vd + 1, 0, val2);
11702                 aarch64_set_vec_u16 (cpu, vd + 2, 0, val3);
11703               }
11704           }
11705           break;
11706
11707         case 2:
11708           {
11709             uint32_t val1 = aarch64_get_mem_u32 (cpu, address);
11710             uint32_t val2 = aarch64_get_mem_u32 (cpu, address + 4);
11711             uint32_t val3 = aarch64_get_mem_u32 (cpu, address + 8);
11712
11713             for (i = 0; i < (full ? 4 : 2); i++)
11714               {
11715                 aarch64_set_vec_u32 (cpu, vd, 0, val1);
11716                 aarch64_set_vec_u32 (cpu, vd + 1, 0, val2);
11717                 aarch64_set_vec_u32 (cpu, vd + 2, 0, val3);
11718               }
11719           }
11720           break;
11721
11722         case 3:
11723           {
11724             uint64_t val1 = aarch64_get_mem_u64 (cpu, address);
11725             uint64_t val2 = aarch64_get_mem_u64 (cpu, address + 8);
11726             uint64_t val3 = aarch64_get_mem_u64 (cpu, address + 16);
11727
11728             for (i = 0; i < (full ? 2 : 1); i++)
11729               {
11730                 aarch64_set_vec_u64 (cpu, vd, 0, val1);
11731                 aarch64_set_vec_u64 (cpu, vd + 1, 0, val2);
11732                 aarch64_set_vec_u64 (cpu, vd + 2, 0, val3);
11733               }
11734           }
11735           break;
11736
11737         default:
11738           HALT_UNALLOC;
11739         }
11740       break;
11741
11742     case 3: /* LD4R.  */
11743       switch (size)
11744         {
11745         case 0:
11746           {
11747             uint8_t val1 = aarch64_get_mem_u8 (cpu, address);
11748             uint8_t val2 = aarch64_get_mem_u8 (cpu, address + 1);
11749             uint8_t val3 = aarch64_get_mem_u8 (cpu, address + 2);
11750             uint8_t val4 = aarch64_get_mem_u8 (cpu, address + 3);
11751
11752             for (i = 0; i < (full ? 16 : 8); i++)
11753               {
11754                 aarch64_set_vec_u8 (cpu, vd, 0, val1);
11755                 aarch64_set_vec_u8 (cpu, vd + 1, 0, val2);
11756                 aarch64_set_vec_u8 (cpu, vd + 2, 0, val3);
11757                 aarch64_set_vec_u8 (cpu, vd + 3, 0, val4);
11758               }
11759           }
11760           break;
11761
11762         case 1:
11763           {
11764             uint32_t val1 = aarch64_get_mem_u16 (cpu, address);
11765             uint32_t val2 = aarch64_get_mem_u16 (cpu, address + 2);
11766             uint32_t val3 = aarch64_get_mem_u16 (cpu, address + 4);
11767             uint32_t val4 = aarch64_get_mem_u16 (cpu, address + 6);
11768
11769             for (i = 0; i < (full ? 8 : 4); i++)
11770               {
11771                 aarch64_set_vec_u16 (cpu, vd, 0, val1);
11772                 aarch64_set_vec_u16 (cpu, vd + 1, 0, val2);
11773                 aarch64_set_vec_u16 (cpu, vd + 2, 0, val3);
11774                 aarch64_set_vec_u16 (cpu, vd + 3, 0, val4);
11775               }
11776           }
11777           break;
11778
11779         case 2:
11780           {
11781             uint32_t val1 = aarch64_get_mem_u32 (cpu, address);
11782             uint32_t val2 = aarch64_get_mem_u32 (cpu, address + 4);
11783             uint32_t val3 = aarch64_get_mem_u32 (cpu, address + 8);
11784             uint32_t val4 = aarch64_get_mem_u32 (cpu, address + 12);
11785
11786             for (i = 0; i < (full ? 4 : 2); i++)
11787               {
11788                 aarch64_set_vec_u32 (cpu, vd, 0, val1);
11789                 aarch64_set_vec_u32 (cpu, vd + 1, 0, val2);
11790                 aarch64_set_vec_u32 (cpu, vd + 2, 0, val3);
11791                 aarch64_set_vec_u32 (cpu, vd + 3, 0, val4);
11792               }
11793           }
11794           break;
11795
11796         case 3:
11797           {
11798             uint64_t val1 = aarch64_get_mem_u64 (cpu, address);
11799             uint64_t val2 = aarch64_get_mem_u64 (cpu, address + 8);
11800             uint64_t val3 = aarch64_get_mem_u64 (cpu, address + 16);
11801             uint64_t val4 = aarch64_get_mem_u64 (cpu, address + 24);
11802
11803             for (i = 0; i < (full ? 2 : 1); i++)
11804               {
11805                 aarch64_set_vec_u64 (cpu, vd, 0, val1);
11806                 aarch64_set_vec_u64 (cpu, vd + 1, 0, val2);
11807                 aarch64_set_vec_u64 (cpu, vd + 2, 0, val3);
11808                 aarch64_set_vec_u64 (cpu, vd + 3, 0, val4);
11809               }
11810           }
11811           break;
11812
11813         default:
11814           HALT_UNALLOC;
11815         }
11816       break;
11817
11818     default:
11819       HALT_UNALLOC;
11820     }
11821 }
11822
11823 static void
11824 do_vec_load_store (sim_cpu *cpu)
11825 {
11826   /* {LD|ST}<N>   {Vd..Vd+N}, vaddr
11827
11828      instr[31]    = 0
11829      instr[30]    = element selector 0=>half, 1=>all elements
11830      instr[29,25] = 00110
11831      instr[24]    = ?
11832      instr[23]    = 0=>simple, 1=>post
11833      instr[22]    = 0=>store, 1=>load
11834      instr[21]    = 0 (LDn) / small(0)-large(1) selector (LDnR)
11835      instr[20,16] = 00000 (simple), Vinc (reg-post-inc, no SP),
11836                     11111 (immediate post inc)
11837      instr[15,12] = elements and destinations.  eg for load:
11838                      0000=>LD4 => load multiple 4-element to
11839                      four consecutive registers
11840                      0100=>LD3 => load multiple 3-element to
11841                      three consecutive registers
11842                      1000=>LD2 => load multiple 2-element to
11843                      two consecutive registers
11844                      0010=>LD1 => load multiple 1-element to
11845                      four consecutive registers
11846                      0110=>LD1 => load multiple 1-element to
11847                      three consecutive registers
11848                      1010=>LD1 => load multiple 1-element to
11849                      two consecutive registers
11850                      0111=>LD1 => load multiple 1-element to
11851                      one register
11852                      1100=>LDR1,LDR2
11853                      1110=>LDR3,LDR4
11854      instr[11,10] = element size 00=> byte(b), 01=> half(h),
11855                                  10=> word(s), 11=> double(d)
11856      instr[9,5]   = Vn, can be SP
11857      instr[4,0]   = Vd  */
11858
11859   int post;
11860   int load;
11861   unsigned vn;
11862   uint64_t address;
11863   int type;
11864
11865   if (INSTR (31, 31) != 0 || INSTR (29, 25) != 0x06)
11866     HALT_NYI;
11867
11868   type = INSTR (15, 12);
11869   if (type != 0xE && type != 0xE && INSTR (21, 21) != 0)
11870     HALT_NYI;
11871
11872   post = INSTR (23, 23);
11873   load = INSTR (22, 22);
11874   vn = INSTR (9, 5);
11875   address = aarch64_get_reg_u64 (cpu, vn, SP_OK);
11876
11877   if (post)
11878     {
11879       unsigned vm = INSTR (20, 16);
11880
11881       if (vm == R31)
11882         {
11883           unsigned sizeof_operation;
11884
11885           switch (type)
11886             {
11887             case 0: sizeof_operation = 32; break;
11888             case 4: sizeof_operation = 24; break;
11889             case 8: sizeof_operation = 16; break;
11890
11891             case 0xC:
11892               sizeof_operation = INSTR (21, 21) ? 2 : 1;
11893               sizeof_operation <<= INSTR (11, 10);
11894               break;
11895
11896             case 0xE:
11897               sizeof_operation = INSTR (21, 21) ? 8 : 4;
11898               sizeof_operation <<= INSTR (11, 10);
11899               break;
11900
11901             case 7:
11902               /* One register, immediate offset variant.  */
11903               sizeof_operation = 8;
11904               break;
11905
11906             case 10:
11907               /* Two registers, immediate offset variant.  */
11908               sizeof_operation = 16;
11909               break;
11910
11911             case 6:
11912               /* Three registers, immediate offset variant.  */
11913               sizeof_operation = 24;
11914               break;
11915
11916             case 2:
11917               /* Four registers, immediate offset variant.  */
11918               sizeof_operation = 32;
11919               break;
11920
11921             default:
11922               HALT_UNALLOC;
11923             }
11924
11925           if (INSTR (30, 30))
11926             sizeof_operation *= 2;
11927
11928           aarch64_set_reg_u64 (cpu, vn, SP_OK, address + sizeof_operation);
11929         }
11930       else
11931         aarch64_set_reg_u64 (cpu, vn, SP_OK,
11932                              address + aarch64_get_reg_u64 (cpu, vm, NO_SP));
11933     }
11934   else
11935     {
11936       NYI_assert (20, 16, 0);
11937     }
11938
11939   if (load)
11940     {
11941       switch (type)
11942         {
11943         case 0:  LD4 (cpu, address); return;
11944         case 4:  LD3 (cpu, address); return;
11945         case 8:  LD2 (cpu, address); return;
11946         case 2:  LD1_4 (cpu, address); return;
11947         case 6:  LD1_3 (cpu, address); return;
11948         case 10: LD1_2 (cpu, address); return;
11949         case 7:  LD1_1 (cpu, address); return;
11950
11951         case 0xE:
11952         case 0xC: do_vec_LDnR (cpu, address); return;
11953
11954         default:
11955           HALT_NYI;
11956         }
11957     }
11958
11959   /* Stores.  */
11960   switch (type)
11961     {
11962     case 0:  ST4 (cpu, address); return;
11963     case 4:  ST3 (cpu, address); return;
11964     case 8:  ST2 (cpu, address); return;
11965     case 2:  ST1_4 (cpu, address); return;
11966     case 6:  ST1_3 (cpu, address); return;
11967     case 10: ST1_2 (cpu, address); return;
11968     case 7:  ST1_1 (cpu, address); return;
11969     default:
11970       HALT_NYI;
11971     }
11972 }
11973
11974 static void
11975 dexLdSt (sim_cpu *cpu)
11976 {
11977   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
11978      assert  group == GROUP_LDST_0100 || group == GROUP_LDST_0110 ||
11979              group == GROUP_LDST_1100 || group == GROUP_LDST_1110
11980      bits [29,28:26] of a LS are the secondary dispatch vector.  */
11981   uint32_t group2 = dispatchLS (aarch64_get_instr (cpu));
11982
11983   switch (group2)
11984     {
11985     case LS_EXCL_000:
11986       dexLoadExclusive (cpu); return;
11987
11988     case LS_LIT_010:
11989     case LS_LIT_011:
11990       dexLoadLiteral (cpu); return;
11991
11992     case LS_OTHER_110:
11993     case LS_OTHER_111:
11994       dexLoadOther (cpu); return;
11995
11996     case LS_ADVSIMD_001:
11997       do_vec_load_store (cpu); return;
11998
11999     case LS_PAIR_100:
12000       dex_load_store_pair_gr (cpu); return;
12001
12002     case LS_PAIR_101:
12003       dex_load_store_pair_fp (cpu); return;
12004
12005     default:
12006       /* Should never reach here.  */
12007       HALT_NYI;
12008     }
12009 }
12010
12011 /* Specific decode and execute for group Data Processing Register.  */
12012
12013 static void
12014 dexLogicalShiftedRegister (sim_cpu *cpu)
12015 {
12016   /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
12017      instr[30,29] = op
12018      instr[28:24] = 01010
12019      instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> ROR
12020      instr[21]    = N
12021      instr[20,16] = Rm
12022      instr[15,10] = count : must be 0xxxxx for 32 bit
12023      instr[9,5]   = Rn
12024      instr[4,0]   = Rd  */
12025
12026   uint32_t size      = INSTR (31, 31);
12027   Shift    shiftType = INSTR (23, 22);
12028   uint32_t count     = INSTR (15, 10);
12029
12030   /* 32 bit operations must have count[5] = 0.
12031      or else we have an UNALLOC.  */
12032   if (size == 0 && uimm (count, 5, 5))
12033     HALT_UNALLOC;
12034
12035   /* Dispatch on size:op:N.  */
12036   switch ((INSTR (31, 29) << 1) | INSTR (21, 21))
12037     {
12038     case 0: and32_shift  (cpu, shiftType, count); return;
12039     case 1: bic32_shift  (cpu, shiftType, count); return;
12040     case 2: orr32_shift  (cpu, shiftType, count); return;
12041     case 3: orn32_shift  (cpu, shiftType, count); return;
12042     case 4: eor32_shift  (cpu, shiftType, count); return;
12043     case 5: eon32_shift  (cpu, shiftType, count); return;
12044     case 6: ands32_shift (cpu, shiftType, count); return;
12045     case 7: bics32_shift (cpu, shiftType, count); return;
12046     case 8: and64_shift  (cpu, shiftType, count); return;
12047     case 9: bic64_shift  (cpu, shiftType, count); return;
12048     case 10:orr64_shift  (cpu, shiftType, count); return;
12049     case 11:orn64_shift  (cpu, shiftType, count); return;
12050     case 12:eor64_shift  (cpu, shiftType, count); return;
12051     case 13:eon64_shift  (cpu, shiftType, count); return;
12052     case 14:ands64_shift (cpu, shiftType, count); return;
12053     case 15:bics64_shift (cpu, shiftType, count); return;
12054     }
12055 }
12056
12057 /* 32 bit conditional select.  */
12058 static void
12059 csel32 (sim_cpu *cpu, CondCode cc)
12060 {
12061   unsigned rm = INSTR (20, 16);
12062   unsigned rn = INSTR (9, 5);
12063   unsigned rd = INSTR (4, 0);
12064
12065   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12066                        testConditionCode (cpu, cc)
12067                        ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12068                        : aarch64_get_reg_u32 (cpu, rm, NO_SP));
12069 }
12070
12071 /* 64 bit conditional select.  */
12072 static void
12073 csel64 (sim_cpu *cpu, CondCode cc)
12074 {
12075   unsigned rm = INSTR (20, 16);
12076   unsigned rn = INSTR (9, 5);
12077   unsigned rd = INSTR (4, 0);
12078
12079   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12080                        testConditionCode (cpu, cc)
12081                        ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12082                        : aarch64_get_reg_u64 (cpu, rm, NO_SP));
12083 }
12084
12085 /* 32 bit conditional increment.  */
12086 static void
12087 csinc32 (sim_cpu *cpu, CondCode cc)
12088 {
12089   unsigned rm = INSTR (20, 16);
12090   unsigned rn = INSTR (9, 5);
12091   unsigned rd = INSTR (4, 0);
12092
12093   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12094                        testConditionCode (cpu, cc)
12095                        ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12096                        : aarch64_get_reg_u32 (cpu, rm, NO_SP) + 1);
12097 }
12098
12099 /* 64 bit conditional increment.  */
12100 static void
12101 csinc64 (sim_cpu *cpu, CondCode cc)
12102 {
12103   unsigned rm = INSTR (20, 16);
12104   unsigned rn = INSTR (9, 5);
12105   unsigned rd = INSTR (4, 0);
12106
12107   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12108                        testConditionCode (cpu, cc)
12109                        ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12110                        : aarch64_get_reg_u64 (cpu, rm, NO_SP) + 1);
12111 }
12112
12113 /* 32 bit conditional invert.  */
12114 static void
12115 csinv32 (sim_cpu *cpu, CondCode cc)
12116 {
12117   unsigned rm = INSTR (20, 16);
12118   unsigned rn = INSTR (9, 5);
12119   unsigned rd = INSTR (4, 0);
12120
12121   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12122                        testConditionCode (cpu, cc)
12123                        ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12124                        : ~ aarch64_get_reg_u32 (cpu, rm, NO_SP));
12125 }
12126
12127 /* 64 bit conditional invert.  */
12128 static void
12129 csinv64 (sim_cpu *cpu, CondCode cc)
12130 {
12131   unsigned rm = INSTR (20, 16);
12132   unsigned rn = INSTR (9, 5);
12133   unsigned rd = INSTR (4, 0);
12134
12135   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12136                        testConditionCode (cpu, cc)
12137                        ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12138                        : ~ aarch64_get_reg_u64 (cpu, rm, NO_SP));
12139 }
12140
12141 /* 32 bit conditional negate.  */
12142 static void
12143 csneg32 (sim_cpu *cpu, CondCode cc)
12144 {
12145   unsigned rm = INSTR (20, 16);
12146   unsigned rn = INSTR (9, 5);
12147   unsigned rd = INSTR (4, 0);
12148
12149   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12150                        testConditionCode (cpu, cc)
12151                        ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12152                        : - aarch64_get_reg_u32 (cpu, rm, NO_SP));
12153 }
12154
12155 /* 64 bit conditional negate.  */
12156 static void
12157 csneg64 (sim_cpu *cpu, CondCode cc)
12158 {
12159   unsigned rm = INSTR (20, 16);
12160   unsigned rn = INSTR (9, 5);
12161   unsigned rd = INSTR (4, 0);
12162
12163   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12164                        testConditionCode (cpu, cc)
12165                        ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12166                        : - aarch64_get_reg_u64 (cpu, rm, NO_SP));
12167 }
12168
12169 static void
12170 dexCondSelect (sim_cpu *cpu)
12171 {
12172   /* instr[28,21] = 11011011
12173      instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
12174      instr[30:11,10] = op : 000 ==> CSEL, 001 ==> CSINC,
12175                             100 ==> CSINV, 101 ==> CSNEG,
12176                             _1_ ==> UNALLOC
12177      instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
12178      instr[15,12] = cond
12179      instr[29] = S : 0 ==> ok, 1 ==> UNALLOC  */
12180
12181   CondCode cc = INSTR (15, 12);
12182   uint32_t S = INSTR (29, 29);
12183   uint32_t op2 = INSTR (11, 10);
12184
12185   if (S == 1)
12186     HALT_UNALLOC;
12187
12188   if (op2 & 0x2)
12189     HALT_UNALLOC;
12190
12191   switch ((INSTR (31, 30) << 1) | op2)
12192     {
12193     case 0: csel32  (cpu, cc); return;
12194     case 1: csinc32 (cpu, cc); return;
12195     case 2: csinv32 (cpu, cc); return;
12196     case 3: csneg32 (cpu, cc); return;
12197     case 4: csel64  (cpu, cc); return;
12198     case 5: csinc64 (cpu, cc); return;
12199     case 6: csinv64 (cpu, cc); return;
12200     case 7: csneg64 (cpu, cc); return;
12201     }
12202 }
12203
12204 /* Some helpers for counting leading 1 or 0 bits.  */
12205
/* Counts the number of leading bits which are the same
   in a 32 bit value in the range 1 to 32.

   VALUE is the word to examine.  The result is the length of the run
   of identical bits starting at bit 31: the number of leading zeros
   when bit 31 is clear, the number of leading ones when it is set.

   Only unsigned arithmetic is used, so no signed shift (undefined or
   implementation-defined behaviour) is involved.  */
static uint32_t
leading32 (uint32_t value)
{
  /* Complement a value whose top bit is set so that in both cases we
     are left counting leading zeros.  */
  uint32_t bits = (value >> 31) ? (uint32_t) ~value : value;
  uint32_t count = 32;
  unsigned step;

  /* Binary search for the highest set bit: whenever the upper part is
     non-zero, discard the lower STEP bits.  */
  for (step = 16; step != 0; step >>= 1)
    {
      uint32_t upper = bits >> step;

      if (upper != 0)
        {
          count -= step;
          bits = upper;
        }
    }

  /* BITS is now 1 if any bit was set and 0 otherwise.  */
  return count - bits;
}
12249
/* Counts the number of leading bits which are the same
   in a 64 bit value in the range 1 to 64.

   VALUE is the doubleword to examine.  The result is the length of
   the run of identical bits starting at bit 63: the number of leading
   zeros when bit 63 is clear, the number of leading ones when it is
   set.

   Only unsigned arithmetic is used, so no signed shift (undefined or
   implementation-defined behaviour) is involved.  */
static uint64_t
leading64 (uint64_t value)
{
  /* Complement a value whose top bit is set so that in both cases we
     are left counting leading zeros.  */
  uint64_t bits = (value >> 63) ? (uint64_t) ~value : value;
  uint64_t count = 64;
  unsigned step;

  /* Binary search for the highest set bit: whenever the upper part is
     non-zero, discard the lower STEP bits.  */
  for (step = 32; step != 0; step >>= 1)
    {
      uint64_t upper = bits >> step;

      if (upper != 0)
        {
          count -= step;
          bits = upper;
        }
    }

  /* BITS is now 1 if any bit was set and 0 otherwise.  */
  return count - bits;
}
12293
12294 /* Bit operations.  */
12295 /* N.B register args may not be SP.  */
12296
12297 /* 32 bit count leading sign bits.  */
12298 static void
12299 cls32 (sim_cpu *cpu)
12300 {
12301   unsigned rn = INSTR (9, 5);
12302   unsigned rd = INSTR (4, 0);
12303
12304   /* N.B. the result needs to exclude the leading bit.  */
12305   aarch64_set_reg_u64
12306     (cpu, rd, NO_SP, leading32 (aarch64_get_reg_u32 (cpu, rn, NO_SP)) - 1);
12307 }
12308
12309 /* 64 bit count leading sign bits.  */
12310 static void
12311 cls64 (sim_cpu *cpu)
12312 {
12313   unsigned rn = INSTR (9, 5);
12314   unsigned rd = INSTR (4, 0);
12315
12316   /* N.B. the result needs to exclude the leading bit.  */
12317   aarch64_set_reg_u64
12318     (cpu, rd, NO_SP, leading64 (aarch64_get_reg_u64 (cpu, rn, NO_SP)) - 1);
12319 }
12320
12321 /* 32 bit count leading zero bits.  */
12322 static void
12323 clz32 (sim_cpu *cpu)
12324 {
12325   unsigned rn = INSTR (9, 5);
12326   unsigned rd = INSTR (4, 0);
12327   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12328
12329   /* if the sign (top) bit is set then the count is 0.  */
12330   if (pick32 (value, 31, 31))
12331     aarch64_set_reg_u64 (cpu, rd, NO_SP, 0L);
12332   else
12333     aarch64_set_reg_u64 (cpu, rd, NO_SP, leading32 (value));
12334 }
12335
12336 /* 64 bit count leading zero bits.  */
12337 static void
12338 clz64 (sim_cpu *cpu)
12339 {
12340   unsigned rn = INSTR (9, 5);
12341   unsigned rd = INSTR (4, 0);
12342   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12343
12344   /* if the sign (top) bit is set then the count is 0.  */
12345   if (pick64 (value, 63, 63))
12346     aarch64_set_reg_u64 (cpu, rd, NO_SP, 0L);
12347   else
12348     aarch64_set_reg_u64 (cpu, rd, NO_SP, leading64 (value));
12349 }
12350
12351 /* 32 bit reverse bits.  */
12352 static void
12353 rbit32 (sim_cpu *cpu)
12354 {
12355   unsigned rn = INSTR (9, 5);
12356   unsigned rd = INSTR (4, 0);
12357   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12358   uint32_t result = 0;
12359   int i;
12360
12361   for (i = 0; i < 32; i++)
12362     {
12363       result <<= 1;
12364       result |= (value & 1);
12365       value >>= 1;
12366     }
12367   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12368 }
12369
12370 /* 64 bit reverse bits.  */
12371 static void
12372 rbit64 (sim_cpu *cpu)
12373 {
12374   unsigned rn = INSTR (9, 5);
12375   unsigned rd = INSTR (4, 0);
12376   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12377   uint64_t result = 0;
12378   int i;
12379
12380   for (i = 0; i < 64; i++)
12381     {
12382       result <<= 1;
12383       result |= (value & 1UL);
12384       value >>= 1;
12385     }
12386   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12387 }
12388
12389 /* 32 bit reverse bytes.  */
12390 static void
12391 rev32 (sim_cpu *cpu)
12392 {
12393   unsigned rn = INSTR (9, 5);
12394   unsigned rd = INSTR (4, 0);
12395   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12396   uint32_t result = 0;
12397   int i;
12398
12399   for (i = 0; i < 4; i++)
12400     {
12401       result <<= 8;
12402       result |= (value & 0xff);
12403       value >>= 8;
12404     }
12405   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12406 }
12407
12408 /* 64 bit reverse bytes.  */
12409 static void
12410 rev64 (sim_cpu *cpu)
12411 {
12412   unsigned rn = INSTR (9, 5);
12413   unsigned rd = INSTR (4, 0);
12414   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12415   uint64_t result = 0;
12416   int i;
12417
12418   for (i = 0; i < 8; i++)
12419     {
12420       result <<= 8;
12421       result |= (value & 0xffULL);
12422       value >>= 8;
12423     }
12424   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12425 }
12426
12427 /* 32 bit reverse shorts.  */
12428 /* N.B.this reverses the order of the bytes in each half word.  */
12429 static void
12430 revh32 (sim_cpu *cpu)
12431 {
12432   unsigned rn = INSTR (9, 5);
12433   unsigned rd = INSTR (4, 0);
12434   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12435   uint32_t result = 0;
12436   int i;
12437
12438   for (i = 0; i < 2; i++)
12439     {
12440       result <<= 8;
12441       result |= (value & 0x00ff00ff);
12442       value >>= 8;
12443     }
12444   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12445 }
12446
12447 /* 64 bit reverse shorts.  */
12448 /* N.B.this reverses the order of the bytes in each half word.  */
12449 static void
12450 revh64 (sim_cpu *cpu)
12451 {
12452   unsigned rn = INSTR (9, 5);
12453   unsigned rd = INSTR (4, 0);
12454   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12455   uint64_t result = 0;
12456   int i;
12457
12458   for (i = 0; i < 2; i++)
12459     {
12460       result <<= 8;
12461       result |= (value & 0x00ff00ff00ff00ffULL);
12462       value >>= 8;
12463     }
12464   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12465 }
12466
12467 static void
12468 dexDataProc1Source (sim_cpu *cpu)
12469 {
12470   /* instr[30]    = 1
12471      instr[28,21] = 111010110
12472      instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
12473      instr[29]    = S : 0 ==> ok, 1 ==> UNALLOC
12474      instr[20,16] = opcode2 : 00000 ==> ok, ow ==> UNALLOC
12475      instr[15,10] = opcode : 000000 ==> RBIT, 000001 ==> REV16,
12476                              000010 ==> REV, 000011 ==> UNALLOC
12477                              000100 ==> CLZ, 000101 ==> CLS
12478                              ow ==> UNALLOC
12479      instr[9,5]   = rn : may not be SP
12480      instr[4,0]   = rd : may not be SP.  */
12481
12482   uint32_t S = INSTR (29, 29);
12483   uint32_t opcode2 = INSTR (20, 16);
12484   uint32_t opcode = INSTR (15, 10);
12485   uint32_t dispatch = ((INSTR (31, 31) << 3) | opcode);
12486
12487   if (S == 1)
12488     HALT_UNALLOC;
12489
12490   if (opcode2 != 0)
12491     HALT_UNALLOC;
12492
12493   if (opcode & 0x38)
12494     HALT_UNALLOC;
12495
12496   switch (dispatch)
12497     {
12498     case 0: rbit32 (cpu); return;
12499     case 1: revh32 (cpu); return;
12500     case 2: rev32 (cpu); return;
12501     case 4: clz32 (cpu); return;
12502     case 5: cls32 (cpu); return;
12503     case 8: rbit64 (cpu); return;
12504     case 9: revh64 (cpu); return;
12505     case 10:rev32 (cpu); return;
12506     case 11:rev64 (cpu); return;
12507     case 12:clz64 (cpu); return;
12508     case 13:cls64 (cpu); return;
12509     default: HALT_UNALLOC;
12510     }
12511 }
12512
12513 /* Variable shift.
12514    Shifts by count supplied in register.
12515    N.B register args may not be SP.
12516    These all use the shifted auxiliary function for
12517    simplicity and clarity.  Writing the actual shift
12518    inline would avoid a branch and so be faster but
12519    would also necessitate getting signs right.  */
12520
12521 /* 32 bit arithmetic shift right.  */
12522 static void
12523 asrv32 (sim_cpu *cpu)
12524 {
12525   unsigned rm = INSTR (20, 16);
12526   unsigned rn = INSTR (9, 5);
12527   unsigned rd = INSTR (4, 0);
12528
12529   aarch64_set_reg_u64
12530     (cpu, rd, NO_SP,
12531      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), ASR,
12532                 (aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12533 }
12534
12535 /* 64 bit arithmetic shift right.  */
12536 static void
12537 asrv64 (sim_cpu *cpu)
12538 {
12539   unsigned rm = INSTR (20, 16);
12540   unsigned rn = INSTR (9, 5);
12541   unsigned rd = INSTR (4, 0);
12542
12543   aarch64_set_reg_u64
12544     (cpu, rd, NO_SP,
12545      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), ASR,
12546                 (aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12547 }
12548
12549 /* 32 bit logical shift left.  */
12550 static void
12551 lslv32 (sim_cpu *cpu)
12552 {
12553   unsigned rm = INSTR (20, 16);
12554   unsigned rn = INSTR (9, 5);
12555   unsigned rd = INSTR (4, 0);
12556
12557   aarch64_set_reg_u64
12558     (cpu, rd, NO_SP,
12559      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), LSL,
12560                 (aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12561 }
12562
12563 /* 64 bit arithmetic shift left.  */
12564 static void
12565 lslv64 (sim_cpu *cpu)
12566 {
12567   unsigned rm = INSTR (20, 16);
12568   unsigned rn = INSTR (9, 5);
12569   unsigned rd = INSTR (4, 0);
12570
12571   aarch64_set_reg_u64
12572     (cpu, rd, NO_SP,
12573      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), LSL,
12574                 (aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12575 }
12576
12577 /* 32 bit logical shift right.  */
12578 static void
12579 lsrv32 (sim_cpu *cpu)
12580 {
12581   unsigned rm = INSTR (20, 16);
12582   unsigned rn = INSTR (9, 5);
12583   unsigned rd = INSTR (4, 0);
12584
12585   aarch64_set_reg_u64
12586     (cpu, rd, NO_SP,
12587      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), LSR,
12588                 (aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12589 }
12590
12591 /* 64 bit logical shift right.  */
12592 static void
12593 lsrv64 (sim_cpu *cpu)
12594 {
12595   unsigned rm = INSTR (20, 16);
12596   unsigned rn = INSTR (9, 5);
12597   unsigned rd = INSTR (4, 0);
12598
12599   aarch64_set_reg_u64
12600     (cpu, rd, NO_SP,
12601      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), LSR,
12602                 (aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12603 }
12604
12605 /* 32 bit rotate right.  */
12606 static void
12607 rorv32 (sim_cpu *cpu)
12608 {
12609   unsigned rm = INSTR (20, 16);
12610   unsigned rn = INSTR (9, 5);
12611   unsigned rd = INSTR (4, 0);
12612
12613   aarch64_set_reg_u64
12614     (cpu, rd, NO_SP,
12615      shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), ROR,
12616                 (aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12617 }
12618
12619 /* 64 bit rotate right.  */
12620 static void
12621 rorv64 (sim_cpu *cpu)
12622 {
12623   unsigned rm = INSTR (20, 16);
12624   unsigned rn = INSTR (9, 5);
12625   unsigned rd = INSTR (4, 0);
12626
12627   aarch64_set_reg_u64
12628     (cpu, rd, NO_SP,
12629      shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), ROR,
12630                 (aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12631 }
12632
12633
12634 /* divide.  */
12635
12636 /* 32 bit signed divide.  */
12637 static void
12638 cpuiv32 (sim_cpu *cpu)
12639 {
12640   unsigned rm = INSTR (20, 16);
12641   unsigned rn = INSTR (9, 5);
12642   unsigned rd = INSTR (4, 0);
12643   /* N.B. the pseudo-code does the divide using 64 bit data.  */
12644   /* TODO : check that this rounds towards zero as required.  */
12645   int64_t dividend = aarch64_get_reg_s32 (cpu, rn, NO_SP);
12646   int64_t divisor = aarch64_get_reg_s32 (cpu, rm, NO_SP);
12647
12648   aarch64_set_reg_s64 (cpu, rd, NO_SP,
12649                        divisor ? ((int32_t) (dividend / divisor)) : 0);
12650 }
12651
12652 /* 64 bit signed divide.  */
12653 static void
12654 cpuiv64 (sim_cpu *cpu)
12655 {
12656   unsigned rm = INSTR (20, 16);
12657   unsigned rn = INSTR (9, 5);
12658   unsigned rd = INSTR (4, 0);
12659
12660   /* TODO : check that this rounds towards zero as required.  */
12661   int64_t divisor = aarch64_get_reg_s64 (cpu, rm, NO_SP);
12662
12663   aarch64_set_reg_s64
12664     (cpu, rd, NO_SP,
12665      divisor ? (aarch64_get_reg_s64 (cpu, rn, NO_SP) / divisor) : 0);
12666 }
12667
12668 /* 32 bit unsigned divide.  */
12669 static void
12670 udiv32 (sim_cpu *cpu)
12671 {
12672   unsigned rm = INSTR (20, 16);
12673   unsigned rn = INSTR (9, 5);
12674   unsigned rd = INSTR (4, 0);
12675
12676   /* N.B. the pseudo-code does the divide using 64 bit data.  */
12677   uint64_t dividend = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12678   uint64_t divisor  = aarch64_get_reg_u32 (cpu, rm, NO_SP);
12679
12680   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12681                        divisor ? (uint32_t) (dividend / divisor) : 0);
12682 }
12683
12684 /* 64 bit unsigned divide.  */
12685 static void
12686 udiv64 (sim_cpu *cpu)
12687 {
12688   unsigned rm = INSTR (20, 16);
12689   unsigned rn = INSTR (9, 5);
12690   unsigned rd = INSTR (4, 0);
12691
12692   /* TODO : check that this rounds towards zero as required.  */
12693   uint64_t divisor = aarch64_get_reg_u64 (cpu, rm, NO_SP);
12694
12695   aarch64_set_reg_u64
12696     (cpu, rd, NO_SP,
12697      divisor ? (aarch64_get_reg_u64 (cpu, rn, NO_SP) / divisor) : 0);
12698 }
12699
12700 static void
12701 dexDataProc2Source (sim_cpu *cpu)
12702 {
12703   /* assert instr[30] == 0
12704      instr[28,21] == 11010110
12705      instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
12706      instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
12707      instr[15,10] = opcode : 000010 ==> UDIV, 000011 ==> CPUIV,
12708                              001000 ==> LSLV, 001001 ==> LSRV
12709                              001010 ==> ASRV, 001011 ==> RORV
12710                              ow ==> UNALLOC.  */
12711
12712   uint32_t dispatch;
12713   uint32_t S = INSTR (29, 29);
12714   uint32_t opcode = INSTR (15, 10);
12715
12716   if (S == 1)
12717     HALT_UNALLOC;
12718
12719   if (opcode & 0x34)
12720     HALT_UNALLOC;
12721
12722   dispatch = (  (INSTR (31, 31) << 3)
12723               | (uimm (opcode, 3, 3) << 2)
12724               |  uimm (opcode, 1, 0));
12725   switch (dispatch)
12726     {
12727     case 2:  udiv32 (cpu); return;
12728     case 3:  cpuiv32 (cpu); return;
12729     case 4:  lslv32 (cpu); return;
12730     case 5:  lsrv32 (cpu); return;
12731     case 6:  asrv32 (cpu); return;
12732     case 7:  rorv32 (cpu); return;
12733     case 10: udiv64 (cpu); return;
12734     case 11: cpuiv64 (cpu); return;
12735     case 12: lslv64 (cpu); return;
12736     case 13: lsrv64 (cpu); return;
12737     case 14: asrv64 (cpu); return;
12738     case 15: rorv64 (cpu); return;
12739     default: HALT_UNALLOC;
12740     }
12741 }
12742
12743
12744 /* Multiply.  */
12745
12746 /* 32 bit multiply and add.  */
12747 static void
12748 madd32 (sim_cpu *cpu)
12749 {
12750   unsigned rm = INSTR (20, 16);
12751   unsigned ra = INSTR (14, 10);
12752   unsigned rn = INSTR (9, 5);
12753   unsigned rd = INSTR (4, 0);
12754
12755   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12756   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12757                        aarch64_get_reg_u32 (cpu, ra, NO_SP)
12758                        + aarch64_get_reg_u32 (cpu, rn, NO_SP)
12759                        * aarch64_get_reg_u32 (cpu, rm, NO_SP));
12760 }
12761
12762 /* 64 bit multiply and add.  */
12763 static void
12764 madd64 (sim_cpu *cpu)
12765 {
12766   unsigned rm = INSTR (20, 16);
12767   unsigned ra = INSTR (14, 10);
12768   unsigned rn = INSTR (9, 5);
12769   unsigned rd = INSTR (4, 0);
12770
12771   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12772   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12773                        aarch64_get_reg_u64 (cpu, ra, NO_SP)
12774                        + (aarch64_get_reg_u64 (cpu, rn, NO_SP)
12775                           * aarch64_get_reg_u64 (cpu, rm, NO_SP)));
12776 }
12777
12778 /* 32 bit multiply and sub.  */
12779 static void
12780 msub32 (sim_cpu *cpu)
12781 {
12782   unsigned rm = INSTR (20, 16);
12783   unsigned ra = INSTR (14, 10);
12784   unsigned rn = INSTR (9, 5);
12785   unsigned rd = INSTR (4, 0);
12786
12787   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12788   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12789                        aarch64_get_reg_u32 (cpu, ra, NO_SP)
12790                        - aarch64_get_reg_u32 (cpu, rn, NO_SP)
12791                        * aarch64_get_reg_u32 (cpu, rm, NO_SP));
12792 }
12793
12794 /* 64 bit multiply and sub.  */
12795 static void
12796 msub64 (sim_cpu *cpu)
12797 {
12798   unsigned rm = INSTR (20, 16);
12799   unsigned ra = INSTR (14, 10);
12800   unsigned rn = INSTR (9, 5);
12801   unsigned rd = INSTR (4, 0);
12802
12803   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12804   aarch64_set_reg_u64 (cpu, rd, NO_SP,
12805                        aarch64_get_reg_u64 (cpu, ra, NO_SP)
12806                        - aarch64_get_reg_u64 (cpu, rn, NO_SP)
12807                        * aarch64_get_reg_u64 (cpu, rm, NO_SP));
12808 }
12809
12810 /* Signed multiply add long -- source, source2 : 32 bit, source3 : 64 bit.  */
12811 static void
12812 smaddl (sim_cpu *cpu)
12813 {
12814   unsigned rm = INSTR (20, 16);
12815   unsigned ra = INSTR (14, 10);
12816   unsigned rn = INSTR (9, 5);
12817   unsigned rd = INSTR (4, 0);
12818
12819   /* N.B. we need to multiply the signed 32 bit values in rn, rm to
12820      obtain a 64 bit product.  */
12821   aarch64_set_reg_s64
12822     (cpu, rd, NO_SP,
12823      aarch64_get_reg_s64 (cpu, ra, NO_SP)
12824      + ((int64_t) aarch64_get_reg_s32 (cpu, rn, NO_SP))
12825      * ((int64_t) aarch64_get_reg_s32 (cpu, rm, NO_SP)));
12826 }
12827
12828 /* Signed multiply sub long -- source, source2 : 32 bit, source3 : 64 bit.  */
12829 static void
12830 smsubl (sim_cpu *cpu)
12831 {
12832   unsigned rm = INSTR (20, 16);
12833   unsigned ra = INSTR (14, 10);
12834   unsigned rn = INSTR (9, 5);
12835   unsigned rd = INSTR (4, 0);
12836
12837   /* N.B. we need to multiply the signed 32 bit values in rn, rm to
12838      obtain a 64 bit product.  */
12839   aarch64_set_reg_s64
12840     (cpu, rd, NO_SP,
12841      aarch64_get_reg_s64 (cpu, ra, NO_SP)
12842      - ((int64_t) aarch64_get_reg_s32 (cpu, rn, NO_SP))
12843      * ((int64_t) aarch64_get_reg_s32 (cpu, rm, NO_SP)));
12844 }
12845
12846 /* Integer Multiply/Divide.  */
12847
12848 /* First some macros and a helper function.  */
12849 /* Macros to test or access elements of 64 bit words.  */
12850
/* Mask used to access lo 32 bits of 64 bit unsigned int.  */
#define LOW_WORD_MASK ((1ULL << 32) - 1)
/* Return the lo 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int.  */
#define lowWordToU64(_value_u64) ((_value_u64) & LOW_WORD_MASK)
/* Return the hi 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int.  */
#define highWordToU64(_value_u64) ((_value_u64) >> 32)

/* Offset of sign bit in 64 bit signed integer.  */
#define SIGN_SHIFT_U64 63
/* The sign bit itself -- also identifies the minimum negative int value.
   Built from 1ULL so that the shift is valid on hosts where long is
   only 32 bits wide.  */
#define SIGN_BIT_U64 (1ULL << SIGN_SHIFT_U64)
/* Return true if a 64 bit signed int presented as an unsigned int is the
   most negative value.  */
#define isMinimumU64(_value_u64) ((_value_u64) == SIGN_BIT_U64)
/* Return true (non-zero) if a 64 bit signed int presented as an unsigned
   int has its sign bit set.  */
#define isSignSetU64(_value_u64) ((_value_u64) & SIGN_BIT_U64)
/* Return -1 or 1 (as an int64_t) according to whether a 64 bit signed
   int presented as an unsigned int has its sign bit set or not.  */
#define signOfU64(_value_u64) \
  (1 - 2 * (int64_t) (((uint64_t) (_value_u64)) >> SIGN_SHIFT_U64))
/* Clear the sign bit of a 64 bit signed int presented as an unsigned int.  */
#define clearSignU64(_value_u64) ((_value_u64) &= ~SIGN_BIT_U64)

/* Multiply two unsigned 64 bit ints and return the hi 64 bits of the
   128 bit product.

   Each operand is split into 32 bit halves and the four partial
   products are summed with the usual schoolbook alignment:

     value1_lo * value2_lo               contributes at bit 0
     value1_lo * value2_hi  (mid1)       contributes at bit 32
     value1_hi * value2_lo  (mid2)       contributes at bit 32
     value1_hi * value2_hi               contributes at bit 64.  */

static uint64_t
mul64hi (uint64_t value1, uint64_t value2)
{
  uint64_t value1_lo = lowWordToU64 (value1);
  uint64_t value1_hi = highWordToU64 (value1);
  uint64_t value2_lo = lowWordToU64 (value2);
  uint64_t value2_hi = highWordToU64 (value2);

  /* Cross-multiply and collect results.  */
  uint64_t xproductlo = value1_lo * value2_lo;
  uint64_t xproductmid1 = value1_lo * value2_hi;
  uint64_t xproductmid2 = value1_hi * value2_lo;
  uint64_t xproducthi = value1_hi * value2_hi;
  uint64_t carry = 0;
  uint64_t resultmid;
  uint64_t result;

  /* Start accumulating 64 bit results.  Drop the bottom half of the
     lowest cross-product and add in the first middle product.  */
  resultmid = (xproductlo >> 32) + xproductmid1;

  /* Check for overflow.  */
  if (resultmid < xproductmid1)
    carry++;

  /* Add in the second middle product.  */
  resultmid += xproductmid2;

  /* Check for overflow.  */
  if (resultmid < xproductmid2)
    carry++;

  /* Drop lowest 32 bits of middle cross-product.  */
  result = resultmid >> 32;

  /* The middle sum sits 32 bits above the bottom of the product, so a
     carry out of its 64 bits lands on bit 96 of the full product, i.e.
     bit 32 of the high word -- not bit 0.  */
  result += xproducthi + (carry << 32);

  return result;
}
12919
12920 /* Signed multiply high, source, source2 :
12921    64 bit, dest <-- high 64-bit of result.  */
12922 static void
12923 smulh (sim_cpu *cpu)
12924 {
12925   uint64_t uresult;
12926   int64_t  result;
12927   unsigned rm = INSTR (20, 16);
12928   unsigned rn = INSTR (9, 5);
12929   unsigned rd = INSTR (4, 0);
12930   GReg     ra = INSTR (14, 10);
12931   int64_t  value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12932   int64_t  value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
12933   uint64_t uvalue1;
12934   uint64_t uvalue2;
12935   int64_t  signum = 1;
12936
12937   if (ra != R31)
12938     HALT_UNALLOC;
12939
12940   /* Convert to unsigned and use the unsigned mul64hi routine
12941      the fix the sign up afterwards.  */
12942   if (value1 < 0)
12943     {
12944       signum *= -1L;
12945       uvalue1 = -value1;
12946     }
12947   else
12948     {
12949       uvalue1 = value1;
12950     }
12951
12952   if (value2 < 0)
12953     {
12954       signum *= -1L;
12955       uvalue2 = -value2;
12956     }
12957   else
12958     {
12959       uvalue2 = value2;
12960     }
12961
12962   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12963   uresult = mul64hi (uvalue1, uvalue2);
12964   result = uresult;
12965   result *= signum;
12966
12967   aarch64_set_reg_s64 (cpu, rd, NO_SP, result);
12968 }
12969
12970 /* Unsigned multiply add long -- source, source2 :
12971    32 bit, source3 : 64 bit.  */
12972 static void
12973 umaddl (sim_cpu *cpu)
12974 {
12975   unsigned rm = INSTR (20, 16);
12976   unsigned ra = INSTR (14, 10);
12977   unsigned rn = INSTR (9, 5);
12978   unsigned rd = INSTR (4, 0);
12979
12980   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12981   /* N.B. we need to multiply the signed 32 bit values in rn, rm to
12982      obtain a 64 bit product.  */
12983   aarch64_set_reg_u64
12984     (cpu, rd, NO_SP,
12985      aarch64_get_reg_u64 (cpu, ra, NO_SP)
12986      + ((uint64_t) aarch64_get_reg_u32 (cpu, rn, NO_SP))
12987      * ((uint64_t) aarch64_get_reg_u32 (cpu, rm, NO_SP)));
12988 }
12989
12990 /* Unsigned multiply sub long -- source, source2 : 32 bit, source3 : 64 bit.  */
12991 static void
12992 umsubl (sim_cpu *cpu)
12993 {
12994   unsigned rm = INSTR (20, 16);
12995   unsigned ra = INSTR (14, 10);
12996   unsigned rn = INSTR (9, 5);
12997   unsigned rd = INSTR (4, 0);
12998
12999   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13000   /* N.B. we need to multiply the signed 32 bit values in rn, rm to
13001      obtain a 64 bit product.  */
13002   aarch64_set_reg_u64
13003     (cpu, rd, NO_SP,
13004      aarch64_get_reg_u64 (cpu, ra, NO_SP)
13005      - ((uint64_t) aarch64_get_reg_u32 (cpu, rn, NO_SP))
13006      * ((uint64_t) aarch64_get_reg_u32 (cpu, rm, NO_SP)));
13007 }
13008
13009 /* Unsigned multiply high, source, source2 :
13010    64 bit, dest <-- high 64-bit of result.  */
13011 static void
13012 umulh (sim_cpu *cpu)
13013 {
13014   unsigned rm = INSTR (20, 16);
13015   unsigned rn = INSTR (9, 5);
13016   unsigned rd = INSTR (4, 0);
13017   GReg     ra = INSTR (14, 10);
13018
13019   if (ra != R31)
13020     HALT_UNALLOC;
13021
13022   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13023   aarch64_set_reg_u64 (cpu, rd, NO_SP,
13024                        mul64hi (aarch64_get_reg_u64 (cpu, rn, NO_SP),
13025                                 aarch64_get_reg_u64 (cpu, rm, NO_SP)));
13026 }
13027
13028 static void
13029 dexDataProc3Source (sim_cpu *cpu)
13030 {
13031   /* assert instr[28,24] == 11011.  */
13032   /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit (for rd at least)
13033      instr[30,29] = op54 : 00 ==> ok, ow ==> UNALLOC
13034      instr[23,21] = op31 : 111 ==> UNALLOC, o2 ==> ok
13035      instr[15] = o0 : 0/1 ==> ok
13036      instr[23,21:15] ==> op : 0000 ==> MADD, 0001 ==> MSUB,     (32/64 bit)
13037                               0010 ==> SMADDL, 0011 ==> SMSUBL, (64 bit only)
13038                               0100 ==> SMULH,                   (64 bit only)
13039                               1010 ==> UMADDL, 1011 ==> UNSUBL, (64 bit only)
13040                               1100 ==> UMULH                    (64 bit only)
13041                               ow ==> UNALLOC.  */
13042
13043   uint32_t dispatch;
13044   uint32_t size = INSTR (31, 31);
13045   uint32_t op54 = INSTR (30, 29);
13046   uint32_t op31 = INSTR (23, 21);
13047   uint32_t o0 = INSTR (15, 15);
13048
13049   if (op54 != 0)
13050     HALT_UNALLOC;
13051
13052   if (size == 0)
13053     {
13054       if (op31 != 0)
13055         HALT_UNALLOC;
13056
13057       if (o0 == 0)
13058         madd32 (cpu);
13059       else
13060         msub32 (cpu);
13061       return;
13062     }
13063
13064   dispatch = (op31 << 1) | o0;
13065
13066   switch (dispatch)
13067     {
13068     case 0:  madd64 (cpu); return;
13069     case 1:  msub64 (cpu); return;
13070     case 2:  smaddl (cpu); return;
13071     case 3:  smsubl (cpu); return;
13072     case 4:  smulh (cpu); return;
13073     case 10: umaddl (cpu); return;
13074     case 11: umsubl (cpu); return;
13075     case 12: umulh (cpu); return;
13076     default: HALT_UNALLOC;
13077     }
13078 }
13079
13080 static void
13081 dexDPReg (sim_cpu *cpu)
13082 {
13083   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
13084      assert  group == GROUP_DPREG_0101 || group == GROUP_DPREG_1101
13085      bits [28:24:21] of a DPReg are the secondary dispatch vector.  */
13086   uint32_t group2 = dispatchDPReg (aarch64_get_instr (cpu));
13087
13088   switch (group2)
13089     {
13090     case DPREG_LOG_000:
13091     case DPREG_LOG_001:
13092       dexLogicalShiftedRegister (cpu); return;
13093
13094     case DPREG_ADDSHF_010:
13095       dexAddSubtractShiftedRegister (cpu); return;
13096
13097     case DPREG_ADDEXT_011:
13098       dexAddSubtractExtendedRegister (cpu); return;
13099
13100     case DPREG_ADDCOND_100:
13101       {
13102         /* This set bundles a variety of different operations.  */
13103         /* Check for.  */
13104         /* 1) add/sub w carry.  */
13105         uint32_t mask1 = 0x1FE00000U;
13106         uint32_t val1  = 0x1A000000U;
13107         /* 2) cond compare register/immediate.  */
13108         uint32_t mask2 = 0x1FE00000U;
13109         uint32_t val2  = 0x1A400000U;
13110         /* 3) cond select.  */
13111         uint32_t mask3 = 0x1FE00000U;
13112         uint32_t val3  = 0x1A800000U;
13113         /* 4) data proc 1/2 source.  */
13114         uint32_t mask4 = 0x1FE00000U;
13115         uint32_t val4  = 0x1AC00000U;
13116
13117         if ((aarch64_get_instr (cpu) & mask1) == val1)
13118           dexAddSubtractWithCarry (cpu);
13119
13120         else if ((aarch64_get_instr (cpu) & mask2) == val2)
13121           CondCompare (cpu);
13122
13123         else if ((aarch64_get_instr (cpu) & mask3) == val3)
13124           dexCondSelect (cpu);
13125
13126         else if ((aarch64_get_instr (cpu) & mask4) == val4)
13127           {
13128             /* Bit 30 is clear for data proc 2 source
13129                and set for data proc 1 source.  */
13130             if (aarch64_get_instr (cpu)  & (1U << 30))
13131               dexDataProc1Source (cpu);
13132             else
13133               dexDataProc2Source (cpu);
13134           }
13135
13136         else
13137           /* Should not reach here.  */
13138           HALT_NYI;
13139
13140         return;
13141       }
13142
13143     case DPREG_3SRC_110:
13144       dexDataProc3Source (cpu); return;
13145
13146     case DPREG_UNALLOC_101:
13147       HALT_UNALLOC;
13148
13149     case DPREG_3SRC_111:
13150       dexDataProc3Source (cpu); return;
13151
13152     default:
13153       /* Should never reach here.  */
13154       HALT_NYI;
13155     }
13156 }
13157
13158 /* Unconditional Branch immediate.
13159    Offset is a PC-relative byte offset in the range +/- 128MiB.
13160    The offset is assumed to be raw from the decode i.e. the
13161    simulator is expected to scale them from word offsets to byte.  */
13162
/* Unconditional branch.
   OFFSET is handed unchanged to aarch64_set_next_PC_by_offset; no
   other state is touched here.  */
static void
buc (sim_cpu *cpu, int32_t offset)
{
  aarch64_set_next_PC_by_offset (cpu, offset);
}
13169
13170 static unsigned stack_depth = 0;
13171
13172 /* Unconditional branch and link -- writes return PC to LR.  */
13173 static void
13174 bl (sim_cpu *cpu, int32_t offset)
13175 {
13176   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13177   aarch64_save_LR (cpu);
13178   aarch64_set_next_PC_by_offset (cpu, offset);
13179
13180   if (TRACE_BRANCH_P (cpu))
13181     {
13182       ++ stack_depth;
13183       TRACE_BRANCH (cpu,
13184                     " %*scall %" PRIx64 " [%s]"
13185                     " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
13186                     stack_depth, " ", aarch64_get_next_PC (cpu),
13187                     aarch64_get_func (CPU_STATE (cpu),
13188                                       aarch64_get_next_PC (cpu)),
13189                     aarch64_get_reg_u64 (cpu, 0, NO_SP),
13190                     aarch64_get_reg_u64 (cpu, 1, NO_SP),
13191                     aarch64_get_reg_u64 (cpu, 2, NO_SP)
13192                     );
13193     }
13194 }
13195
13196 /* Unconditional Branch register.
13197    Branch/return address is in source register.  */
13198
13199 /* Unconditional branch.  */
13200 static void
13201 br (sim_cpu *cpu)
13202 {
13203   unsigned rn = INSTR (9, 5);
13204   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13205   aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13206 }
13207
13208 /* Unconditional branch and link -- writes return PC to LR.  */
13209 static void
13210 blr (sim_cpu *cpu)
13211 {
13212   unsigned rn = INSTR (9, 5);
13213
13214   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13215   /* The pseudo code in the spec says we update LR before fetching.
13216      the value from the rn.  */
13217   aarch64_save_LR (cpu);
13218   aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13219
13220   if (TRACE_BRANCH_P (cpu))
13221     {
13222       ++ stack_depth;
13223       TRACE_BRANCH (cpu,
13224                     " %*scall %" PRIx64 " [%s]"
13225                     " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
13226                     stack_depth, " ", aarch64_get_next_PC (cpu),
13227                     aarch64_get_func (CPU_STATE (cpu),
13228                                       aarch64_get_next_PC (cpu)),
13229                     aarch64_get_reg_u64 (cpu, 0, NO_SP),
13230                     aarch64_get_reg_u64 (cpu, 1, NO_SP),
13231                     aarch64_get_reg_u64 (cpu, 2, NO_SP)
13232                     );
13233     }
13234 }
13235
13236 /* Return -- assembler will default source to LR this is functionally
13237    equivalent to br but, presumably, unlike br it side effects the
13238    branch predictor.  */
13239 static void
13240 ret (sim_cpu *cpu)
13241 {
13242   unsigned rn = INSTR (9, 5);
13243   aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13244
13245   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13246   if (TRACE_BRANCH_P (cpu))
13247     {
13248       TRACE_BRANCH (cpu,
13249                     " %*sreturn [result: %" PRIx64 "]",
13250                     stack_depth, " ", aarch64_get_reg_u64 (cpu, 0, NO_SP));
13251       -- stack_depth;
13252     }
13253 }
13254
/* NOP -- we implement this and call it from the decode in case we
   want to intercept it later.  The only effect is the decode trace
   record.  */

static void
nop (sim_cpu *cpu)
{
  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
}
13263
/* Data synchronization barrier.  Nothing is done here beyond
   emitting the decode trace record.  */

static void
dsb (sim_cpu *cpu)
{
  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
}
13271
/* Data memory barrier.  Nothing is done here beyond emitting the
   decode trace record.  */

static void
dmb (sim_cpu *cpu)
{
  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
}
13279
/* Instruction synchronization barrier.  Nothing is done here beyond
   emitting the decode trace record.  */

static void
isb (sim_cpu *cpu)
{
  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
}
13287
13288 static void
13289 dexBranchImmediate (sim_cpu *cpu)
13290 {
13291   /* assert instr[30,26] == 00101
13292      instr[31] ==> 0 == B, 1 == BL
13293      instr[25,0] == imm26 branch offset counted in words.  */
13294
13295   uint32_t top = INSTR (31, 31);
13296   /* We have a 26 byte signed word offset which we need to pass to the
13297      execute routine as a signed byte offset.  */
13298   int32_t offset = simm32 (aarch64_get_instr (cpu), 25, 0) << 2;
13299
13300   if (top)
13301     bl (cpu, offset);
13302   else
13303     buc (cpu, offset);
13304 }
13305
13306 /* Control Flow.  */
13307
13308 /* Conditional branch
13309
13310    Offset is a PC-relative byte offset in the range +/- 1MiB pos is
13311    a bit position in the range 0 .. 63
13312
13313    cc is a CondCode enum value as pulled out of the decode
13314
13315    N.B. any offset register (source) can only be Xn or Wn.  */
13316
13317 static void
13318 bcc (sim_cpu *cpu, int32_t offset, CondCode cc)
13319 {
13320   /* The test returns TRUE if CC is met.  */
13321   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13322   if (testConditionCode (cpu, cc))
13323     aarch64_set_next_PC_by_offset (cpu, offset);
13324 }
13325
13326 /* 32 bit branch on register non-zero.  */
13327 static void
13328 cbnz32 (sim_cpu *cpu, int32_t offset)
13329 {
13330   unsigned rt = INSTR (4, 0);
13331
13332   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13333   if (aarch64_get_reg_u32 (cpu, rt, NO_SP) != 0)
13334     aarch64_set_next_PC_by_offset (cpu, offset);
13335 }
13336
13337 /* 64 bit branch on register zero.  */
13338 static void
13339 cbnz (sim_cpu *cpu, int32_t offset)
13340 {
13341   unsigned rt = INSTR (4, 0);
13342
13343   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13344   if (aarch64_get_reg_u64 (cpu, rt, NO_SP) != 0)
13345     aarch64_set_next_PC_by_offset (cpu, offset);
13346 }
13347
13348 /* 32 bit branch on register non-zero.  */
13349 static void
13350 cbz32 (sim_cpu *cpu, int32_t offset)
13351 {
13352   unsigned rt = INSTR (4, 0);
13353
13354   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13355   if (aarch64_get_reg_u32 (cpu, rt, NO_SP) == 0)
13356     aarch64_set_next_PC_by_offset (cpu, offset);
13357 }
13358
13359 /* 64 bit branch on register zero.  */
13360 static void
13361 cbz (sim_cpu *cpu, int32_t offset)
13362 {
13363   unsigned rt = INSTR (4, 0);
13364
13365   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13366   if (aarch64_get_reg_u64 (cpu, rt, NO_SP) == 0)
13367     aarch64_set_next_PC_by_offset (cpu, offset);
13368 }
13369
13370 /* Branch on register bit test non-zero -- one size fits all.  */
13371 static void
13372 tbnz (sim_cpu *cpu, uint32_t  pos, int32_t offset)
13373 {
13374   unsigned rt = INSTR (4, 0);
13375
13376   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13377   if (aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos))
13378     aarch64_set_next_PC_by_offset (cpu, offset);
13379 }
13380
13381 /* Branch on register bit test zero -- one size fits all.  */
13382 static void
13383 tbz (sim_cpu *cpu, uint32_t  pos, int32_t offset)
13384 {
13385   unsigned rt = INSTR (4, 0);
13386
13387   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13388   if (!(aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos)))
13389     aarch64_set_next_PC_by_offset (cpu, offset);
13390 }
13391
13392 static void
13393 dexCompareBranchImmediate (sim_cpu *cpu)
13394 {
13395   /* instr[30,25] = 01 1010
13396      instr[31]    = size : 0 ==> 32, 1 ==> 64
13397      instr[24]    = op : 0 ==> CBZ, 1 ==> CBNZ
13398      instr[23,5]  = simm19 branch offset counted in words
13399      instr[4,0]   = rt  */
13400
13401   uint32_t size = INSTR (31, 31);
13402   uint32_t op   = INSTR (24, 24);
13403   int32_t offset = simm32 (aarch64_get_instr (cpu), 23, 5) << 2;
13404
13405   if (size == 0)
13406     {
13407       if (op == 0)
13408         cbz32 (cpu, offset);
13409       else
13410         cbnz32 (cpu, offset);
13411     }
13412   else
13413     {
13414       if (op == 0)
13415         cbz (cpu, offset);
13416       else
13417         cbnz (cpu, offset);
13418     }
13419 }
13420
13421 static void
13422 dexTestBranchImmediate (sim_cpu *cpu)
13423 {
13424   /* instr[31]    = b5 : bit 5 of test bit idx
13425      instr[30,25] = 01 1011
13426      instr[24]    = op : 0 ==> TBZ, 1 == TBNZ
13427      instr[23,19] = b40 : bits 4 to 0 of test bit idx
13428      instr[18,5]  = simm14 : signed offset counted in words
13429      instr[4,0]   = uimm5  */
13430
13431   uint32_t pos = ((INSTR (31, 31) << 5) | INSTR (23, 19));
13432   int32_t offset = simm32 (aarch64_get_instr (cpu), 18, 5) << 2;
13433
13434   NYI_assert (30, 25, 0x1b);
13435
13436   if (INSTR (24, 24) == 0)
13437     tbz (cpu, pos, offset);
13438   else
13439     tbnz (cpu, pos, offset);
13440 }
13441
13442 static void
13443 dexCondBranchImmediate (sim_cpu *cpu)
13444 {
13445   /* instr[31,25] = 010 1010
13446      instr[24]    = op1; op => 00 ==> B.cond
13447      instr[23,5]  = simm19 : signed offset counted in words
13448      instr[4]     = op0
13449      instr[3,0]   = cond  */
13450
13451   int32_t offset;
13452   uint32_t op = ((INSTR (24, 24) << 1) | INSTR (4, 4));
13453
13454   NYI_assert (31, 25, 0x2a);
13455
13456   if (op != 0)
13457     HALT_UNALLOC;
13458
13459   offset = simm32 (aarch64_get_instr (cpu), 23, 5) << 2;
13460
13461   bcc (cpu, offset, INSTR (3, 0));
13462 }
13463
13464 static void
13465 dexBranchRegister (sim_cpu *cpu)
13466 {
13467   /* instr[31,25] = 110 1011
13468      instr[24,21] = op : 0 ==> BR, 1 => BLR, 2 => RET, 3 => ERET, 4 => DRPS
13469      instr[20,16] = op2 : must be 11111
13470      instr[15,10] = op3 : must be 000000
13471      instr[4,0]   = op2 : must be 11111.  */
13472
13473   uint32_t op = INSTR (24, 21);
13474   uint32_t op2 = INSTR (20, 16);
13475   uint32_t op3 = INSTR (15, 10);
13476   uint32_t op4 = INSTR (4, 0);
13477
13478   NYI_assert (31, 25, 0x6b);
13479
13480   if (op2 != 0x1F || op3 != 0 || op4 != 0)
13481     HALT_UNALLOC;
13482
13483   if (op == 0)
13484     br (cpu);
13485
13486   else if (op == 1)
13487     blr (cpu);
13488
13489   else if (op == 2)
13490     ret (cpu);
13491
13492   else
13493     {
13494       /* ERET and DRPS accept 0b11111 for rn = instr [4,0].  */
13495       /* anything else is unallocated.  */
13496       uint32_t rn = INSTR (4, 0);
13497
13498       if (rn != 0x1f)
13499         HALT_UNALLOC;
13500
13501       if (op == 4 || op == 5)
13502         HALT_NYI;
13503
13504       HALT_UNALLOC;
13505     }
13506 }
13507
/* FIXME: We should get the Angel SWI values from ../../libgloss/aarch64/svc.h
   but this may not be available.  So instead we define the values we need
   here.

   These are the request codes that handle_halt compares against the
   32 bit value of register 0 when it is invoked with the value 0xf000.
   Each name identifies the operation being requested.  */
#define AngelSVC_Reason_Open            0x01
#define AngelSVC_Reason_Close           0x02
#define AngelSVC_Reason_Write           0x05
#define AngelSVC_Reason_Read            0x06
#define AngelSVC_Reason_IsTTY           0x09
#define AngelSVC_Reason_Seek            0x0A
#define AngelSVC_Reason_FLen            0x0C
#define AngelSVC_Reason_Remove          0x0E
#define AngelSVC_Reason_Rename          0x0F
#define AngelSVC_Reason_Clock           0x10
#define AngelSVC_Reason_Time            0x11
#define AngelSVC_Reason_System          0x12
#define AngelSVC_Reason_Errno           0x13
#define AngelSVC_Reason_GetCmdLine      0x15
#define AngelSVC_Reason_HeapInfo        0x16
#define AngelSVC_Reason_ReportException 0x18
#define AngelSVC_Reason_Elapsed         0x30
13528
13529
13530 static void
13531 handle_halt (sim_cpu *cpu, uint32_t val)
13532 {
13533   uint64_t result = 0;
13534
13535   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13536   if (val != 0xf000)
13537     {
13538       TRACE_SYSCALL (cpu, " HLT [0x%x]", val);
13539       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13540                        sim_stopped, SIM_SIGTRAP);
13541     }
13542
13543   /* We have encountered an Angel SVC call.  See if we can process it.  */
13544   switch (aarch64_get_reg_u32 (cpu, 0, NO_SP))
13545     {
13546     case AngelSVC_Reason_HeapInfo:
13547       {
13548         /* Get the values.  */
13549         uint64_t stack_top = aarch64_get_stack_start (cpu);
13550         uint64_t heap_base = aarch64_get_heap_start (cpu);
13551
13552         /* Get the pointer  */
13553         uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13554         ptr = aarch64_get_mem_u64 (cpu, ptr);
13555
13556         /* Fill in the memory block.  */
13557         /* Start addr of heap.  */
13558         aarch64_set_mem_u64 (cpu, ptr +  0, heap_base);
13559         /* End addr of heap.  */
13560         aarch64_set_mem_u64 (cpu, ptr +  8, stack_top);
13561         /* Lowest stack addr.  */
13562         aarch64_set_mem_u64 (cpu, ptr + 16, heap_base);
13563         /* Initial stack addr.  */
13564         aarch64_set_mem_u64 (cpu, ptr + 24, stack_top);
13565
13566         TRACE_SYSCALL (cpu, " AngelSVC: Get Heap Info");
13567       }
13568       break;
13569
13570     case AngelSVC_Reason_Open:
13571       {
13572         /* Get the pointer  */
13573         /* uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);.  */
13574         /* FIXME: For now we just assume that we will only be asked
13575            to open the standard file descriptors.  */
13576         static int fd = 0;
13577         result = fd ++;
13578
13579         TRACE_SYSCALL (cpu, " AngelSVC: Open file %d", fd - 1);
13580       }
13581       break;
13582
13583     case AngelSVC_Reason_Close:
13584       {
13585         uint64_t fh = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13586         TRACE_SYSCALL (cpu, " AngelSVC: Close file %d", (int) fh);
13587         result = 0;
13588       }
13589       break;
13590
13591     case AngelSVC_Reason_Errno:
13592       result = 0;
13593       TRACE_SYSCALL (cpu, " AngelSVC: Get Errno");
13594       break;
13595
13596     case AngelSVC_Reason_Clock:
13597       result =
13598 #ifdef CLOCKS_PER_SEC
13599         (CLOCKS_PER_SEC >= 100)
13600         ? (clock () / (CLOCKS_PER_SEC / 100))
13601         : ((clock () * 100) / CLOCKS_PER_SEC)
13602 #else
13603         /* Presume unix... clock() returns microseconds.  */
13604         (clock () / 10000)
13605 #endif
13606         ;
13607         TRACE_SYSCALL (cpu, " AngelSVC: Get Clock");
13608       break;
13609
13610     case AngelSVC_Reason_GetCmdLine:
13611       {
13612         /* Get the pointer  */
13613         uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13614         ptr = aarch64_get_mem_u64 (cpu, ptr);
13615
13616         /* FIXME: No command line for now.  */
13617         aarch64_set_mem_u64 (cpu, ptr, 0);
13618         TRACE_SYSCALL (cpu, " AngelSVC: Get Command Line");
13619       }
13620       break;
13621
13622     case AngelSVC_Reason_IsTTY:
13623       result = 1;
13624         TRACE_SYSCALL (cpu, " AngelSVC: IsTTY ?");
13625       break;
13626
13627     case AngelSVC_Reason_Write:
13628       {
13629         /* Get the pointer  */
13630         uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13631         /* Get the write control block.  */
13632         uint64_t fd  = aarch64_get_mem_u64 (cpu, ptr);
13633         uint64_t buf = aarch64_get_mem_u64 (cpu, ptr + 8);
13634         uint64_t len = aarch64_get_mem_u64 (cpu, ptr + 16);
13635
13636         TRACE_SYSCALL (cpu, "write of %" PRIx64 " bytes from %"
13637                        PRIx64 " on descriptor %" PRIx64,
13638                        len, buf, fd);
13639
13640         if (len > 1280)
13641           {
13642             TRACE_SYSCALL (cpu,
13643                            " AngelSVC: Write: Suspiciously long write: %ld",
13644                            (long) len);
13645             sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13646                              sim_stopped, SIM_SIGBUS);
13647           }
13648         else if (fd == 1)
13649           {
13650             printf ("%.*s", (int) len, aarch64_get_mem_ptr (cpu, buf));
13651           }
13652         else if (fd == 2)
13653           {
13654             TRACE (cpu, 0, "\n");
13655             sim_io_eprintf (CPU_STATE (cpu), "%.*s",
13656                             (int) len, aarch64_get_mem_ptr (cpu, buf));
13657             TRACE (cpu, 0, "\n");
13658           }
13659         else
13660           {
13661             TRACE_SYSCALL (cpu,
13662                            " AngelSVC: Write: Unexpected file handle: %d",
13663                            (int) fd);
13664             sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13665                              sim_stopped, SIM_SIGABRT);
13666           }
13667       }
13668       break;
13669
13670     case AngelSVC_Reason_ReportException:
13671       {
13672         /* Get the pointer  */
13673         uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13674         /*ptr = aarch64_get_mem_u64 (cpu, ptr);.  */
13675         uint64_t type = aarch64_get_mem_u64 (cpu, ptr);
13676         uint64_t state = aarch64_get_mem_u64 (cpu, ptr + 8);
13677
13678         TRACE_SYSCALL (cpu,
13679                        "Angel Exception: type 0x%" PRIx64 " state %" PRIx64,
13680                        type, state);
13681
13682         if (type == 0x20026)
13683           sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13684                            sim_exited, state);
13685         else
13686           sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13687                            sim_stopped, SIM_SIGINT);
13688       }
13689       break;
13690
13691     case AngelSVC_Reason_Read:
13692     case AngelSVC_Reason_FLen:
13693     case AngelSVC_Reason_Seek:
13694     case AngelSVC_Reason_Remove:
13695     case AngelSVC_Reason_Time:
13696     case AngelSVC_Reason_System:
13697     case AngelSVC_Reason_Rename:
13698     case AngelSVC_Reason_Elapsed:
13699     default:
13700       TRACE_SYSCALL (cpu, " HLT [Unknown angel %x]",
13701                      aarch64_get_reg_u32 (cpu, 0, NO_SP));
13702       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13703                        sim_stopped, SIM_SIGTRAP);
13704     }
13705
13706   aarch64_set_reg_u64 (cpu, 0, NO_SP, result);
13707 }
13708
13709 static void
13710 dexExcpnGen (sim_cpu *cpu)
13711 {
13712   /* instr[31:24] = 11010100
13713      instr[23,21] = opc : 000 ==> GEN EXCPN, 001 ==> BRK
13714                           010 ==> HLT,       101 ==> DBG GEN EXCPN
13715      instr[20,5]  = imm16
13716      instr[4,2]   = opc2 000 ==> OK, ow ==> UNALLOC
13717      instr[1,0]   = LL : discriminates opc  */
13718
13719   uint32_t opc = INSTR (23, 21);
13720   uint32_t imm16 = INSTR (20, 5);
13721   uint32_t opc2 = INSTR (4, 2);
13722   uint32_t LL;
13723
13724   NYI_assert (31, 24, 0xd4);
13725
13726   if (opc2 != 0)
13727     HALT_UNALLOC;
13728
13729   LL = INSTR (1, 0);
13730
13731   /* We only implement HLT and BRK for now.  */
13732   if (opc == 1 && LL == 0)
13733     {
13734       TRACE_EVENTS (cpu, " BRK [0x%x]", imm16);
13735       sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13736                        sim_exited, aarch64_get_reg_s32 (cpu, R0, SP_OK));
13737     }
13738
13739   if (opc == 2 && LL == 0)
13740     handle_halt (cpu, imm16);
13741
13742   else if (opc == 0 || opc == 5)
13743     HALT_NYI;
13744
13745   else
13746     HALT_UNALLOC;
13747 }
13748
13749 /* Stub for accessing system registers.  */
13750
13751 static uint64_t
13752 system_get (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
13753             unsigned crm, unsigned op2)
13754 {
13755   if (crn == 0 && op1 == 3 && crm == 0 && op2 == 7)
13756     /* DCZID_EL0 - the Data Cache Zero ID register.
13757        We do not support DC ZVA at the moment, so
13758        we return a value with the disable bit set.
13759        We implement support for the DCZID register since
13760        it is used by the C library's memset function.  */
13761     return ((uint64_t) 1) << 4;
13762
13763   if (crn == 0 && op1 == 3 && crm == 0 && op2 == 1)
13764     /* Cache Type Register.  */
13765     return 0x80008000UL;
13766
13767   if (crn == 13 && op1 == 3 && crm == 0 && op2 == 2)
13768     /* TPIDR_EL0 - thread pointer id.  */
13769     return aarch64_get_thread_id (cpu);
13770
13771   if (op1 == 3 && crm == 4 && op2 == 0)
13772     return aarch64_get_FPCR (cpu);
13773
13774   if (op1 == 3 && crm == 4 && op2 == 1)
13775     return aarch64_get_FPSR (cpu);
13776
13777   else if (op1 == 3 && crm == 2 && op2 == 0)
13778     return aarch64_get_CPSR (cpu);
13779
13780   HALT_NYI;
13781 }
13782
13783 static void
13784 system_set (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
13785             unsigned crm, unsigned op2, uint64_t val)
13786 {
13787   if (op1 == 3 && crm == 4 && op2 == 0)
13788     aarch64_set_FPCR (cpu, val);
13789
13790   else if (op1 == 3 && crm == 4 && op2 == 1)
13791     aarch64_set_FPSR (cpu, val);
13792
13793   else if (op1 == 3 && crm == 2 && op2 == 0)
13794     aarch64_set_CPSR (cpu, val);
13795
13796   else
13797     HALT_NYI;
13798 }
13799
13800 static void
13801 do_mrs (sim_cpu *cpu)
13802 {
13803   /* instr[31:20] = 1101 0101 0001 1
13804      instr[19]    = op0
13805      instr[18,16] = op1
13806      instr[15,12] = CRn
13807      instr[11,8]  = CRm
13808      instr[7,5]   = op2
13809      instr[4,0]   = Rt  */
13810   unsigned sys_op0 = INSTR (19, 19) + 2;
13811   unsigned sys_op1 = INSTR (18, 16);
13812   unsigned sys_crn = INSTR (15, 12);
13813   unsigned sys_crm = INSTR (11, 8);
13814   unsigned sys_op2 = INSTR (7, 5);
13815   unsigned rt = INSTR (4, 0);
13816
13817   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13818   aarch64_set_reg_u64 (cpu, rt, NO_SP,
13819                        system_get (cpu, sys_op0, sys_op1, sys_crn, sys_crm, sys_op2));
13820 }
13821
13822 static void
13823 do_MSR_immediate (sim_cpu *cpu)
13824 {
13825   /* instr[31:19] = 1101 0101 0000 0
13826      instr[18,16] = op1
13827      instr[15,12] = 0100
13828      instr[11,8]  = CRm
13829      instr[7,5]   = op2
13830      instr[4,0]   = 1 1111  */
13831
13832   unsigned op1 = INSTR (18, 16);
13833   /*unsigned crm = INSTR (11, 8);*/
13834   unsigned op2 = INSTR (7, 5);
13835
13836   NYI_assert (31, 19, 0x1AA0);
13837   NYI_assert (15, 12, 0x4);
13838   NYI_assert (4,  0,  0x1F);
13839
13840   if (op1 == 0)
13841     {
13842       if (op2 == 5)
13843         HALT_NYI; /* set SPSel.  */
13844       else
13845         HALT_UNALLOC;
13846     }
13847   else if (op1 == 3)
13848     {
13849       if (op2 == 6)
13850         HALT_NYI; /* set DAIFset.  */
13851       else if (op2 == 7)
13852         HALT_NYI; /* set DAIFclr.  */
13853       else
13854         HALT_UNALLOC;
13855     }
13856   else
13857     HALT_UNALLOC;
13858 }
13859
13860 static void
13861 do_MSR_reg (sim_cpu *cpu)
13862 {
13863   /* instr[31:20] = 1101 0101 0001
13864      instr[19]    = op0
13865      instr[18,16] = op1
13866      instr[15,12] = CRn
13867      instr[11,8]  = CRm
13868      instr[7,5]   = op2
13869      instr[4,0]   = Rt  */
13870
13871   unsigned sys_op0 = INSTR (19, 19) + 2;
13872   unsigned sys_op1 = INSTR (18, 16);
13873   unsigned sys_crn = INSTR (15, 12);
13874   unsigned sys_crm = INSTR (11, 8);
13875   unsigned sys_op2 = INSTR (7, 5);
13876   unsigned rt = INSTR (4, 0);
13877
13878   NYI_assert (31, 20, 0xD51);
13879
13880   TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13881   system_set (cpu, sys_op0, sys_op1, sys_crn, sys_crm, sys_op2,
13882               aarch64_get_reg_u64 (cpu, rt, NO_SP));
13883 }
13884
static void
do_SYS (sim_cpu *cpu)
{
  /* Handler for the SYS instruction.  Apart from checking the fixed
     opcode bits it does nothing: the requested operation (for example
     DC, which the system decoder routes here) is ignored.

     instr[31,19] = 1101 0101 0000 1
     instr[18,16] = op1
     instr[15,12] = CRn
     instr[11,8]  = CRm
     instr[7,5]   = op2
     instr[4,0]   = Rt  */
  NYI_assert (31, 19, 0x1AA1);

  /* FIXME: For now we just silently accept system ops.  */
}
13898
13899 static void
13900 dexSystem (sim_cpu *cpu)
13901 {
13902   /* instr[31:22] = 1101 01010 0
13903      instr[21]    = L
13904      instr[20,19] = op0
13905      instr[18,16] = op1
13906      instr[15,12] = CRn
13907      instr[11,8]  = CRm
13908      instr[7,5]   = op2
13909      instr[4,0]   = uimm5  */
13910
13911   /* We are interested in HINT, DSB, DMB and ISB
13912
13913      Hint #0 encodes NOOP (this is the only hint we care about)
13914      L == 0, op0 == 0, op1 = 011, CRn = 0010, Rt = 11111,
13915      CRm op2  != 0000 000 OR CRm op2 == 0000 000 || CRm op > 0000 101
13916
13917      DSB, DMB, ISB are data store barrier, data memory barrier and
13918      instruction store barrier, respectively, where
13919
13920      L == 0, op0 == 0, op1 = 011, CRn = 0011, Rt = 11111,
13921      op2 : DSB ==> 100, DMB ==> 101, ISB ==> 110
13922      CRm<3:2> ==> domain, CRm<1:0> ==> types,
13923      domain : 00 ==> OuterShareable, 01 ==> Nonshareable,
13924               10 ==> InerShareable, 11 ==> FullSystem
13925      types :  01 ==> Reads, 10 ==> Writes,
13926               11 ==> All, 00 ==> All (domain == FullSystem).  */
13927
13928   unsigned rt = INSTR (4, 0);
13929
13930   NYI_assert (31, 22, 0x354);
13931
13932   switch (INSTR (21, 12))
13933     {
13934     case 0x032:
13935       if (rt == 0x1F)
13936         {
13937           /* NOP has CRm != 0000 OR.  */
13938           /*         (CRm == 0000 AND (op2 == 000 OR op2 > 101)).  */
13939           uint32_t crm = INSTR (11, 8);
13940           uint32_t op2 = INSTR (7, 5);
13941
13942           if (crm != 0 || (op2 == 0 || op2 > 5))
13943             {
13944               /* Actually call nop method so we can reimplement it later.  */
13945               nop (cpu);
13946               return;
13947             }
13948         }
13949       HALT_NYI;
13950
13951     case 0x033:
13952       {
13953         uint32_t op2 =  INSTR (7, 5);
13954
13955         switch (op2)
13956           {
13957           case 2: HALT_NYI;
13958           case 4: dsb (cpu); return;
13959           case 5: dmb (cpu); return;
13960           case 6: isb (cpu); return;
13961           default: HALT_UNALLOC;
13962         }
13963       }
13964
13965     case 0x3B0:
13966     case 0x3B4:
13967     case 0x3BD:
13968       do_mrs (cpu);
13969       return;
13970
13971     case 0x0B7:
13972       do_SYS (cpu); /* DC is an alias of SYS.  */
13973       return;
13974
13975     default:
13976       if (INSTR (21, 20) == 0x1)
13977         do_MSR_reg (cpu);
13978       else if (INSTR (21, 19) == 0 && INSTR (15, 12) == 0x4)
13979         do_MSR_immediate (cpu);
13980       else
13981         HALT_NYI;
13982       return;
13983     }
13984 }
13985
13986 static void
13987 dexBr (sim_cpu *cpu)
13988 {
13989   /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
13990      assert  group == GROUP_BREXSYS_1010 || group == GROUP_BREXSYS_1011
13991      bits [31,29] of a BrExSys are the secondary dispatch vector.  */
13992   uint32_t group2 = dispatchBrExSys (aarch64_get_instr (cpu));
13993
13994   switch (group2)
13995     {
13996     case BR_IMM_000:
13997       return dexBranchImmediate (cpu);
13998
13999     case BR_IMMCMP_001:
14000       /* Compare has bit 25 clear while test has it set.  */
14001       if (!INSTR (25, 25))
14002         dexCompareBranchImmediate (cpu);
14003       else
14004         dexTestBranchImmediate (cpu);
14005       return;
14006
14007     case BR_IMMCOND_010:
14008       /* This is a conditional branch if bit 25 is clear otherwise
14009          unallocated.  */
14010       if (!INSTR (25, 25))
14011         dexCondBranchImmediate (cpu);
14012       else
14013         HALT_UNALLOC;
14014       return;
14015
14016     case BR_UNALLOC_011:
14017       HALT_UNALLOC;
14018
14019     case BR_IMM_100:
14020       dexBranchImmediate (cpu);
14021       return;
14022
14023     case BR_IMMCMP_101:
14024       /* Compare has bit 25 clear while test has it set.  */
14025       if (!INSTR (25, 25))
14026         dexCompareBranchImmediate (cpu);
14027       else
14028         dexTestBranchImmediate (cpu);
14029       return;
14030
14031     case BR_REG_110:
14032       /* Unconditional branch reg has bit 25 set.  */
14033       if (INSTR (25, 25))
14034         dexBranchRegister (cpu);
14035
14036       /* This includes both Excpn Gen, System and unalloc operations.
14037          We need to decode the Excpn Gen operation BRK so we can plant
14038          debugger entry points.
14039          Excpn Gen operations have instr [24] = 0.
14040          we need to decode at least one of the System operations NOP
14041          which is an alias for HINT #0.
14042          System operations have instr [24,22] = 100.  */
14043       else if (INSTR (24, 24) == 0)
14044         dexExcpnGen (cpu);
14045
14046       else if (INSTR (24, 22) == 4)
14047         dexSystem (cpu);
14048
14049       else
14050         HALT_UNALLOC;
14051
14052       return;
14053
14054     case BR_UNALLOC_111:
14055       HALT_UNALLOC;
14056
14057     default:
14058       /* Should never reach here.  */
14059       HALT_NYI;
14060     }
14061 }
14062
14063 static void
14064 aarch64_decode_and_execute (sim_cpu *cpu, uint64_t pc)
14065 {
14066   /* We need to check if gdb wants an in here.  */
14067   /* checkBreak (cpu);.  */
14068
14069   uint64_t group = dispatchGroup (aarch64_get_instr (cpu));
14070
14071   switch (group)
14072     {
14073     case GROUP_PSEUDO_0000:   dexPseudo (cpu); break;
14074     case GROUP_LDST_0100:     dexLdSt (cpu); break;
14075     case GROUP_DPREG_0101:    dexDPReg (cpu); break;
14076     case GROUP_LDST_0110:     dexLdSt (cpu); break;
14077     case GROUP_ADVSIMD_0111:  dexAdvSIMD0 (cpu); break;
14078     case GROUP_DPIMM_1000:    dexDPImm (cpu); break;
14079     case GROUP_DPIMM_1001:    dexDPImm (cpu); break;
14080     case GROUP_BREXSYS_1010:  dexBr (cpu); break;
14081     case GROUP_BREXSYS_1011:  dexBr (cpu); break;
14082     case GROUP_LDST_1100:     dexLdSt (cpu); break;
14083     case GROUP_DPREG_1101:    dexDPReg (cpu); break;
14084     case GROUP_LDST_1110:     dexLdSt (cpu); break;
14085     case GROUP_ADVSIMD_1111:  dexAdvSIMD1 (cpu); break;
14086
14087     case GROUP_UNALLOC_0001:
14088     case GROUP_UNALLOC_0010:
14089     case GROUP_UNALLOC_0011:
14090       HALT_UNALLOC;
14091
14092     default:
14093       /* Should never reach here.  */
14094       HALT_NYI;
14095     }
14096 }
14097
14098 static bfd_boolean
14099 aarch64_step (sim_cpu *cpu)
14100 {
14101   uint64_t pc = aarch64_get_PC (cpu);
14102
14103   if (pc == TOP_LEVEL_RETURN_PC)
14104     return FALSE;
14105
14106   aarch64_set_next_PC (cpu, pc + 4);
14107
14108   /* Code is always little-endian.  */
14109   sim_core_read_buffer (CPU_STATE (cpu), cpu, read_map,
14110                         & aarch64_get_instr (cpu), pc, 4);
14111   aarch64_get_instr (cpu) = endian_le2h_4 (aarch64_get_instr (cpu));
14112
14113   TRACE_INSN (cpu, " pc = %" PRIx64 " instr = %08x", pc,
14114               aarch64_get_instr (cpu));
14115   TRACE_DISASM (cpu, pc);
14116
14117   aarch64_decode_and_execute (cpu, pc);
14118
14119   return TRUE;
14120 }
14121
14122 void
14123 aarch64_run (SIM_DESC sd)
14124 {
14125   sim_cpu *cpu = STATE_CPU (sd, 0);
14126
14127   while (aarch64_step (cpu))
14128     {
14129       aarch64_update_PC (cpu);
14130
14131       if (sim_events_tick (sd))
14132         sim_events_process (sd);
14133     }
14134
14135   sim_engine_halt (sd, cpu, NULL, aarch64_get_PC (cpu),
14136                    sim_exited, aarch64_get_reg_s32 (cpu, R0, NO_SP));
14137 }
14138
14139 void
14140 aarch64_init (sim_cpu *cpu, uint64_t pc)
14141 {
14142   uint64_t sp = aarch64_get_stack_start (cpu);
14143
14144   /* Install SP, FP and PC and set LR to -20
14145      so we can detect a top-level return.  */
14146   aarch64_set_reg_u64 (cpu, SP, SP_OK, sp);
14147   aarch64_set_reg_u64 (cpu, FP, SP_OK, sp);
14148   aarch64_set_reg_u64 (cpu, LR, SP_OK, TOP_LEVEL_RETURN_PC);
14149   aarch64_set_next_PC (cpu, pc);
14150   aarch64_update_PC (cpu);
14151   aarch64_init_LIT_table ();
14152 }