1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2016 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_STRING
24 #include "coretypes.h"
35 #include "stringpool.h"
40 #include "diagnostic.h"
41 #include "insn-attr.h"
43 #include "fold-const.h"
44 #include "stor-layout.h"
52 #include "langhooks.h"
57 #include "gimple-iterator.h"
58 #include "tree-vectorizer.h"
59 #include "aarch64-cost-tables.h"
63 #include "tm-constrs.h"
64 #include "sched-int.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
68 /* This file should be included last. */
69 #include "target-def.h"
71 /* Defined for convenience. */
72 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
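/* For example, this evaluates to 8 for LP64 and to 4 for ILP32.  */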
74 /* Classifies an address.
77 A simple base register plus immediate offset.
80 A base register indexed by immediate offset with writeback.
83 A base register indexed by (optionally scaled) register.
86 A base register indexed by (optionally scaled) zero-extended register.
89 A base register indexed by (optionally scaled) sign-extended register.
92 A LO_SUM rtx with a base register and "LO12" symbol relocation.
95 A constant symbolic address, in pc-relative literal pool. */
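/* Illustrative assembly forms for these classes, in the same order (these
   examples are an addition, not part of the original description):
       ldr x0, [x1, #16]               base plus immediate offset
       ldr x0, [x1, #16]!              base plus offset with writeback
       ldr x0, [x1, x2, lsl #3]        (optionally scaled) register index
       ldr x0, [x1, w2, uxtw #3]       zero-extended register index
       ldr x0, [x1, w2, sxtw #3]       sign-extended register index
       ldr x0, [x1, #:lo12:sym]        LO_SUM following an adrp
       ldr x0, .Lpool                  pc-relative literal load  */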
97 enum aarch64_address_type {
107 struct aarch64_address_info {
108 enum aarch64_address_type type;
112 enum aarch64_symbol_type symbol_type;
115 struct simd_immediate_info
124 /* The current code model. */
125 enum aarch64_code_model aarch64_cmodel;
128 #undef TARGET_HAVE_TLS
129 #define TARGET_HAVE_TLS 1
132 static bool aarch64_composite_type_p (const_tree, machine_mode);
133 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
135 machine_mode *, int *,
137 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
138 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
139 static void aarch64_override_options_after_change (void);
140 static bool aarch64_vector_mode_supported_p (machine_mode);
141 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
142 const unsigned char *sel);
143 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
145 /* Major revision number of the ARM Architecture implemented by the target. */
146 unsigned aarch64_architecture_version;
148 /* The processor for which instructions should be scheduled. */
149 enum aarch64_processor aarch64_tune = cortexa53;
151 /* Mask to specify which instruction scheduling options should be used. */
152 unsigned long aarch64_tune_flags = 0;
154 /* Global flag for PC relative loads. */
155 bool aarch64_pcrelative_literal_loads;
157 /* Support for command line parsing of boolean flags in the tuning
159 struct aarch64_flag_desc
165 #define AARCH64_FUSION_PAIR(name, internal_name) \
166 { name, AARCH64_FUSE_##internal_name },
167 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
169 { "none", AARCH64_FUSE_NOTHING },
170 #include "aarch64-fusion-pairs.def"
171 { "all", AARCH64_FUSE_ALL },
172 { NULL, AARCH64_FUSE_NOTHING }
174 #undef AARCH64_FUSION_PAIR
176 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
177 { name, AARCH64_EXTRA_TUNE_##internal_name },
178 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
180 { "none", AARCH64_EXTRA_TUNE_NONE },
181 #include "aarch64-tuning-flags.def"
182 { "all", AARCH64_EXTRA_TUNE_ALL },
183 { NULL, AARCH64_EXTRA_TUNE_NONE }
185 #undef AARCH64_EXTRA_TUNING_OPTION
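/* These two tables back the fine-grained tuning override support below:
   the "fuse" and "tune" components of the -moverride option accept exactly
   the strings listed here ("none", "all", and the names pulled in from the
   .def files).  For instance, something along the lines of
   -moverride=fuse=<name> selects a fusion pair by name (this is only a
   sketch; see the parsing code further down for the exact syntax).  */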
187 /* Tuning parameters. */
189 static const struct cpu_addrcost_table generic_addrcost_table =
199 0, /* register_offset */
200 0, /* register_sextend */
201 0, /* register_zextend */
205 static const struct cpu_addrcost_table cortexa57_addrcost_table =
215 0, /* register_offset */
216 0, /* register_sextend */
217 0, /* register_zextend */
221 static const struct cpu_addrcost_table exynosm1_addrcost_table =
231 1, /* register_offset */
232 1, /* register_sextend */
233 2, /* register_zextend */
237 static const struct cpu_addrcost_table xgene1_addrcost_table =
247 0, /* register_offset */
248 1, /* register_sextend */
249 1, /* register_zextend */
253 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
263 0, /* register_offset */
264 0, /* register_sextend */
265 0, /* register_zextend */
269 static const struct cpu_addrcost_table vulcan_addrcost_table =
279 2, /* register_offset */
280 3, /* register_sextend */
281 3, /* register_zextend */
285 static const struct cpu_regmove_cost generic_regmove_cost =
288 /* Avoid the use of slow int<->fp moves for spilling by setting
289 their cost higher than memmov_cost. */
295 static const struct cpu_regmove_cost cortexa57_regmove_cost =
298 /* Avoid the use of slow int<->fp moves for spilling by setting
299 their cost higher than memmov_cost. */
305 static const struct cpu_regmove_cost cortexa53_regmove_cost =
308 /* Avoid the use of slow int<->fp moves for spilling by setting
309 their cost higher than memmov_cost. */
315 static const struct cpu_regmove_cost exynosm1_regmove_cost =
318 /* Avoid the use of slow int<->fp moves for spilling by setting
319 their cost higher than memmov_cost (actually 4 and 9). */
325 static const struct cpu_regmove_cost thunderx_regmove_cost =
333 static const struct cpu_regmove_cost xgene1_regmove_cost =
336 /* Avoid the use of slow int<->fp moves for spilling by setting
337 their cost higher than memmov_cost. */
343 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
346 /* Avoid the use of int<->fp moves for spilling. */
352 static const struct cpu_regmove_cost vulcan_regmove_cost =
355 /* Avoid the use of int<->fp moves for spilling. */
361 /* Generic costs for vector insn classes. */
362 static const struct cpu_vector_cost generic_vector_cost =
364 1, /* scalar_stmt_cost */
365 1, /* scalar_load_cost */
366 1, /* scalar_store_cost */
367 1, /* vec_stmt_cost */
368 2, /* vec_permute_cost */
369 1, /* vec_to_scalar_cost */
370 1, /* scalar_to_vec_cost */
371 1, /* vec_align_load_cost */
372 1, /* vec_unalign_load_cost */
373 1, /* vec_unalign_store_cost */
374 1, /* vec_store_cost */
375 3, /* cond_taken_branch_cost */
376 1 /* cond_not_taken_branch_cost */
379 /* ThunderX costs for vector insn classes. */
380 static const struct cpu_vector_cost thunderx_vector_cost =
382 1, /* scalar_stmt_cost */
383 3, /* scalar_load_cost */
384 1, /* scalar_store_cost */
385 4, /* vec_stmt_cost */
386 4, /* vec_permute_cost */
387 2, /* vec_to_scalar_cost */
388 2, /* scalar_to_vec_cost */
389 3, /* vec_align_load_cost */
390 10, /* vec_unalign_load_cost */
391 10, /* vec_unalign_store_cost */
392 1, /* vec_store_cost */
393 3, /* cond_taken_branch_cost */
394 3 /* cond_not_taken_branch_cost */
397 /* Generic costs for vector insn classes. */
398 static const struct cpu_vector_cost cortexa57_vector_cost =
400 1, /* scalar_stmt_cost */
401 4, /* scalar_load_cost */
402 1, /* scalar_store_cost */
403 2, /* vec_stmt_cost */
404 3, /* vec_permute_cost */
405 8, /* vec_to_scalar_cost */
406 8, /* scalar_to_vec_cost */
407 4, /* vec_align_load_cost */
408 4, /* vec_unalign_load_cost */
409 1, /* vec_unalign_store_cost */
410 1, /* vec_store_cost */
411 1, /* cond_taken_branch_cost */
412 1 /* cond_not_taken_branch_cost */
415 static const struct cpu_vector_cost exynosm1_vector_cost =
417 1, /* scalar_stmt_cost */
418 5, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 3, /* vec_stmt_cost */
421 3, /* vec_permute_cost */
422 3, /* vec_to_scalar_cost */
423 3, /* scalar_to_vec_cost */
424 5, /* vec_align_load_cost */
425 5, /* vec_unalign_load_cost */
426 1, /* vec_unalign_store_cost */
427 1, /* vec_store_cost */
428 1, /* cond_taken_branch_cost */
429 1 /* cond_not_taken_branch_cost */
432 /* Generic costs for vector insn classes. */
433 static const struct cpu_vector_cost xgene1_vector_cost =
435 1, /* scalar_stmt_cost */
436 5, /* scalar_load_cost */
437 1, /* scalar_store_cost */
438 2, /* vec_stmt_cost */
439 2, /* vec_permute_cost */
440 4, /* vec_to_scalar_cost */
441 4, /* scalar_to_vec_cost */
442 10, /* vec_align_load_cost */
443 10, /* vec_unalign_load_cost */
444 2, /* vec_unalign_store_cost */
445 2, /* vec_store_cost */
446 2, /* cond_taken_branch_cost */
447 1 /* cond_not_taken_branch_cost */
450 /* Costs for vector insn classes for Vulcan. */
451 static const struct cpu_vector_cost vulcan_vector_cost =
453 6, /* scalar_stmt_cost */
454 4, /* scalar_load_cost */
455 1, /* scalar_store_cost */
456 6, /* vec_stmt_cost */
457 3, /* vec_permute_cost */
458 6, /* vec_to_scalar_cost */
459 5, /* scalar_to_vec_cost */
460 8, /* vec_align_load_cost */
461 8, /* vec_unalign_load_cost */
462 4, /* vec_unalign_store_cost */
463 4, /* vec_store_cost */
464 2, /* cond_taken_branch_cost */
465 1 /* cond_not_taken_branch_cost */
468 /* Generic costs for branch instructions. */
469 static const struct cpu_branch_cost generic_branch_cost =
471 2, /* Predictable. */
472 2 /* Unpredictable. */
475 /* Branch costs for Cortex-A57. */
476 static const struct cpu_branch_cost cortexa57_branch_cost =
478 1, /* Predictable. */
479 3 /* Unpredictable. */
482 /* Branch costs for Vulcan. */
483 static const struct cpu_branch_cost vulcan_branch_cost =
485 1, /* Predictable. */
486 3 /* Unpredictable. */
489 /* Generic approximation modes. */
490 static const cpu_approx_modes generic_approx_modes =
492 AARCH64_APPROX_NONE, /* division */
493 AARCH64_APPROX_NONE, /* sqrt */
494 AARCH64_APPROX_NONE /* recip_sqrt */
497 /* Approximation modes for Exynos M1. */
498 static const cpu_approx_modes exynosm1_approx_modes =
500 AARCH64_APPROX_NONE, /* division */
501 AARCH64_APPROX_ALL, /* sqrt */
502 AARCH64_APPROX_ALL /* recip_sqrt */
505 /* Approximation modes for X-Gene 1. */
506 static const cpu_approx_modes xgene1_approx_modes =
508 AARCH64_APPROX_NONE, /* division */
509 AARCH64_APPROX_NONE, /* sqrt */
510 AARCH64_APPROX_ALL /* recip_sqrt */
513 static const struct tune_params generic_tunings =
515 &cortexa57_extra_costs,
516 &generic_addrcost_table,
517 &generic_regmove_cost,
518 &generic_vector_cost,
519 &generic_branch_cost,
520 &generic_approx_modes,
523 AARCH64_FUSE_NOTHING, /* fusible_ops */
524 8, /* function_align. */
527 2, /* int_reassoc_width. */
528 4, /* fp_reassoc_width. */
529 1, /* vec_reassoc_width. */
530 2, /* min_div_recip_mul_sf. */
531 2, /* min_div_recip_mul_df. */
532 0, /* max_case_values. */
533 0, /* cache_line_size. */
534 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
535 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
538 static const struct tune_params cortexa35_tunings =
540 &cortexa53_extra_costs,
541 &generic_addrcost_table,
542 &cortexa53_regmove_cost,
543 &generic_vector_cost,
544 &cortexa57_branch_cost,
545 &generic_approx_modes,
548 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
549 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
550 16, /* function_align. */
553 2, /* int_reassoc_width. */
554 4, /* fp_reassoc_width. */
555 1, /* vec_reassoc_width. */
556 2, /* min_div_recip_mul_sf. */
557 2, /* min_div_recip_mul_df. */
558 0, /* max_case_values. */
559 0, /* cache_line_size. */
560 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
561 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
564 static const struct tune_params cortexa53_tunings =
566 &cortexa53_extra_costs,
567 &generic_addrcost_table,
568 &cortexa53_regmove_cost,
569 &generic_vector_cost,
570 &cortexa57_branch_cost,
571 &generic_approx_modes,
574 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
575 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
576 16, /* function_align. */
579 2, /* int_reassoc_width. */
580 4, /* fp_reassoc_width. */
581 1, /* vec_reassoc_width. */
582 2, /* min_div_recip_mul_sf. */
583 2, /* min_div_recip_mul_df. */
584 0, /* max_case_values. */
585 0, /* cache_line_size. */
586 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
587 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
590 static const struct tune_params cortexa57_tunings =
592 &cortexa57_extra_costs,
593 &cortexa57_addrcost_table,
594 &cortexa57_regmove_cost,
595 &cortexa57_vector_cost,
596 &cortexa57_branch_cost,
597 &generic_approx_modes,
600 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
601 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
602 16, /* function_align. */
605 2, /* int_reassoc_width. */
606 4, /* fp_reassoc_width. */
607 1, /* vec_reassoc_width. */
608 2, /* min_div_recip_mul_sf. */
609 2, /* min_div_recip_mul_df. */
610 0, /* max_case_values. */
611 0, /* cache_line_size. */
612 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
613 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
616 static const struct tune_params cortexa72_tunings =
618 &cortexa57_extra_costs,
619 &cortexa57_addrcost_table,
620 &cortexa57_regmove_cost,
621 &cortexa57_vector_cost,
622 &cortexa57_branch_cost,
623 &generic_approx_modes,
626 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
627 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
628 16, /* function_align. */
631 2, /* int_reassoc_width. */
632 4, /* fp_reassoc_width. */
633 1, /* vec_reassoc_width. */
634 2, /* min_div_recip_mul_sf. */
635 2, /* min_div_recip_mul_df. */
636 0, /* max_case_values. */
637 0, /* cache_line_size. */
638 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
639 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
642 static const struct tune_params cortexa73_tunings =
644 &cortexa57_extra_costs,
645 &cortexa57_addrcost_table,
646 &cortexa57_regmove_cost,
647 &cortexa57_vector_cost,
648 &cortexa57_branch_cost,
649 &generic_approx_modes,
650 4, /* memmov_cost. */
652 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
653 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
654 16, /* function_align. */
657 2, /* int_reassoc_width. */
658 4, /* fp_reassoc_width. */
659 1, /* vec_reassoc_width. */
660 2, /* min_div_recip_mul_sf. */
661 2, /* min_div_recip_mul_df. */
662 0, /* max_case_values. */
663 0, /* cache_line_size. */
664 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
665 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
668 static const struct tune_params exynosm1_tunings =
670 &exynosm1_extra_costs,
671 &exynosm1_addrcost_table,
672 &exynosm1_regmove_cost,
673 &exynosm1_vector_cost,
674 &generic_branch_cost,
675 &exynosm1_approx_modes,
678 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
679 4, /* function_align. */
682 2, /* int_reassoc_width. */
683 4, /* fp_reassoc_width. */
684 1, /* vec_reassoc_width. */
685 2, /* min_div_recip_mul_sf. */
686 2, /* min_div_recip_mul_df. */
687 48, /* max_case_values. */
688 64, /* cache_line_size. */
689 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
690 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
693 static const struct tune_params thunderx_tunings =
695 &thunderx_extra_costs,
696 &generic_addrcost_table,
697 &thunderx_regmove_cost,
698 &thunderx_vector_cost,
699 &generic_branch_cost,
700 &generic_approx_modes,
703 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
704 8, /* function_align. */
707 2, /* int_reassoc_width. */
708 4, /* fp_reassoc_width. */
709 1, /* vec_reassoc_width. */
710 2, /* min_div_recip_mul_sf. */
711 2, /* min_div_recip_mul_df. */
712 0, /* max_case_values. */
713 0, /* cache_line_size. */
714 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
715 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags. */
718 static const struct tune_params xgene1_tunings =
721 &xgene1_addrcost_table,
722 &xgene1_regmove_cost,
724 &generic_branch_cost,
725 &xgene1_approx_modes,
728 AARCH64_FUSE_NOTHING, /* fusible_ops */
729 16, /* function_align. */
731 16, /* loop_align. */
732 2, /* int_reassoc_width. */
733 4, /* fp_reassoc_width. */
734 1, /* vec_reassoc_width. */
735 2, /* min_div_recip_mul_sf. */
736 2, /* min_div_recip_mul_df. */
737 0, /* max_case_values. */
738 0, /* cache_line_size. */
739 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
743 static const struct tune_params qdf24xx_tunings =
745 &qdf24xx_extra_costs,
746 &qdf24xx_addrcost_table,
747 &qdf24xx_regmove_cost,
748 &generic_vector_cost,
749 &generic_branch_cost,
750 &generic_approx_modes,
753 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
754 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
755 16, /* function_align. */
757 16, /* loop_align. */
758 2, /* int_reassoc_width. */
759 4, /* fp_reassoc_width. */
760 1, /* vec_reassoc_width. */
761 2, /* min_div_recip_mul_sf. */
762 2, /* min_div_recip_mul_df. */
763 0, /* max_case_values. */
764 64, /* cache_line_size. */
765 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
766 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
769 static const struct tune_params vulcan_tunings =
772 &vulcan_addrcost_table,
773 &vulcan_regmove_cost,
776 &generic_approx_modes,
777 4, /* memmov_cost. */
779 AARCH64_FUSE_NOTHING, /* fusible_ops. */
780 16, /* function_align. */
782 16, /* loop_align. */
783 3, /* int_reassoc_width. */
784 2, /* fp_reassoc_width. */
785 2, /* vec_reassoc_width. */
786 2, /* min_div_recip_mul_sf. */
787 2, /* min_div_recip_mul_df. */
788 0, /* max_case_values. */
789 64, /* cache_line_size. */
790 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
791 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
794 /* Support for fine-grained override of the tuning structures. */
795 struct aarch64_tuning_override_function
798 void (*parse_override)(const char*, struct tune_params*);
801 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
802 static void aarch64_parse_tune_string (const char*, struct tune_params*);
804 static const struct aarch64_tuning_override_function
805 aarch64_tuning_override_functions[] =
807 { "fuse", aarch64_parse_fuse_string },
808 { "tune", aarch64_parse_tune_string },
812 /* A processor implementing AArch64. */
815 const char *const name;
816 enum aarch64_processor ident;
817 enum aarch64_processor sched_core;
818 enum aarch64_arch arch;
819 unsigned architecture_version;
820 const unsigned long flags;
821 const struct tune_params *const tune;
824 /* Architectures implementing AArch64. */
825 static const struct processor all_architectures[] =
827 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
828 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
829 #include "aarch64-arches.def"
831 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
834 /* Processor cores implementing AArch64. */
835 static const struct processor all_cores[] =
837 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
838 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
839 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
840 FLAGS, &COSTS##_tunings},
841 #include "aarch64-cores.def"
843 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
844 AARCH64_FL_FOR_ARCH8, &generic_tunings},
845 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
849 /* Target specification. These are populated by the -march, -mtune, -mcpu
850 handling code or by target attributes. */
851 static const struct processor *selected_arch;
852 static const struct processor *selected_cpu;
853 static const struct processor *selected_tune;
855 /* The current tuning set. */
856 struct tune_params aarch64_tune_params = generic_tunings;
858 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
860 /* An ISA extension in the co-processor and main instruction set space. */
861 struct aarch64_option_extension
863 const char *const name;
864 const unsigned long flags_on;
865 const unsigned long flags_off;
868 typedef enum aarch64_cond_code
870 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
871 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
872 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
876 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
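/* For example, AARCH64_EQ (0) inverts to AARCH64_NE (1) and AARCH64_GE (10)
   to AARCH64_LT (11): flipping the low bit pairs each condition with its
   logical inverse.  */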
878 /* The condition codes of the processor, and the inverse function. */
879 static const char * const aarch64_condition_codes[] =
881 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
882 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
885 /* Generate code to enable conditional branches in functions over 1 MiB. */
887 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
888 const char * branch_format)
890 rtx_code_label * tmp_label = gen_label_rtx ();
893 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
894 CODE_LABEL_NUMBER (tmp_label));
895 const char *label_ptr = targetm.strip_name_encoding (label_buf);
896 rtx dest_label = operands[pos_label];
897 operands[pos_label] = tmp_label;
899 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
900 output_asm_insn (buffer, operands);
902 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
903 operands[pos_label] = dest_label;
904 output_asm_insn (buffer, operands);
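/* An illustrative sketch of the sequence this produces (the caller is
   expected to pass an already-inverted conditional branch as BRANCH_FORMAT):

       <inverted conditional branch>   <local label>
       b       <original destination>
   <local label>:                                  */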
909 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
911 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
912 if (TARGET_GENERAL_REGS_ONLY)
913 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
915 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
918 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
919 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
920 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
921 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
922 cost (in this case the best class is the lowest cost one). Using ALL_REGS
923 irrespective of its cost results in bad allocations with many redundant
924 int<->FP moves which are expensive on various cores.
925 To avoid this we don't allow ALL_REGS as the allocno class, but force a
926 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
927 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
928 Otherwise set the allocno class depending on the mode.
929 The result of this is that it is no longer inefficient to have a higher
930 memory move cost than the register move cost.
934 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
935 reg_class_t best_class)
937 enum machine_mode mode;
939 if (allocno_class != ALL_REGS)
940 return allocno_class;
942 if (best_class != ALL_REGS)
945 mode = PSEUDO_REGNO_MODE (regno);
946 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
950 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
952 if (GET_MODE_UNIT_SIZE (mode) == 4)
953 return aarch64_tune_params.min_div_recip_mul_sf;
954 return aarch64_tune_params.min_div_recip_mul_df;
958 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
959 enum machine_mode mode)
961 if (VECTOR_MODE_P (mode))
962 return aarch64_tune_params.vec_reassoc_width;
963 if (INTEGRAL_MODE_P (mode))
964 return aarch64_tune_params.int_reassoc_width;
965 if (FLOAT_MODE_P (mode))
966 return aarch64_tune_params.fp_reassoc_width;
970 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
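/* Concretely, x0-x30 map to DWARF numbers 0-30, the stack pointer to 31 and
   v0-v31 to 64-95, following the AArch64 DWARF register numbering.  */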
972 aarch64_dbx_register_number (unsigned regno)
974 if (GP_REGNUM_P (regno))
975 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
976 else if (regno == SP_REGNUM)
977 return AARCH64_DWARF_SP;
978 else if (FP_REGNUM_P (regno))
979 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
981 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
982 equivalent DWARF register. */
983 return DWARF_FRAME_REGISTERS;
986 /* Return TRUE if MODE is any of the large INT modes. */
988 aarch64_vect_struct_mode_p (machine_mode mode)
990 return mode == OImode || mode == CImode || mode == XImode;
993 /* Return TRUE if MODE is any of the vector modes. */
995 aarch64_vector_mode_p (machine_mode mode)
997 return aarch64_vector_mode_supported_p (mode)
998 || aarch64_vect_struct_mode_p (mode);
1001 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1003 aarch64_array_mode_supported_p (machine_mode mode,
1004 unsigned HOST_WIDE_INT nelems)
1007 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1008 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1009 && (nelems >= 2 && nelems <= 4))
1015 /* Implement HARD_REGNO_NREGS. */
1018 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1020 switch (aarch64_regno_regclass (regno))
1024 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1026 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1031 /* Implement HARD_REGNO_MODE_OK. */
1034 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1036 if (GET_MODE_CLASS (mode) == MODE_CC)
1037 return regno == CC_REGNUM;
1039 if (regno == SP_REGNUM)
1040 /* The purpose of comparing with ptr_mode is to support the
1041 global register variable associated with the stack pointer
1042 register via the syntax of asm ("wsp") in ILP32. */
1043 return mode == Pmode || mode == ptr_mode;
1045 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1046 return mode == Pmode;
1048 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1051 if (FP_REGNUM_P (regno))
1053 if (aarch64_vect_struct_mode_p (mode))
1055 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1063 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1065 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1068 /* Handle modes that fit within single registers. */
1069 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1071 if (GET_MODE_SIZE (mode) >= 4)
1076 /* Fall back to generic for multi-reg and very large modes. */
1078 return choose_hard_reg_mode (regno, nregs, false);
1081 /* Return true if calls to DECL should be treated as
1082 long-calls (i.e. called via a register). */
1084 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1089 /* Return true if calls to symbol-ref SYM should be treated as
1090 long-calls (i.e. called via a register). */
1092 aarch64_is_long_call_p (rtx sym)
1094 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1097 /* Return true if calls to symbol-ref SYM should not go through
1101 aarch64_is_noplt_call_p (rtx sym)
1103 const_tree decl = SYMBOL_REF_DECL (sym);
1108 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1109 && !targetm.binds_local_p (decl))
1115 /* Return true if the offsets to a zero/sign-extract operation
1116 represent an expression that matches an extend operation. The
1117 operands represent the parameters from
1119 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1121 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1124 HOST_WIDE_INT mult_val, extract_val;
1126 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1129 mult_val = INTVAL (mult_imm);
1130 extract_val = INTVAL (extract_imm);
1133 && extract_val < GET_MODE_BITSIZE (mode)
1134 && exact_log2 (extract_val & ~7) > 0
1135 && (extract_val & 7) <= 4
1136 && mult_val == (1 << (extract_val & 7)))
1142 /* Emit an insn that's a simple single-set. Both the operands must be
1143 known to be valid. */
1145 emit_set_insn (rtx x, rtx y)
1147 return emit_insn (gen_rtx_SET (x, y));
1150 /* X and Y are two things to compare using CODE. Emit the compare insn and
1151 return the rtx for register 0 in the proper mode. */
1153 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1155 machine_mode mode = SELECT_CC_MODE (code, x, y);
1156 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1158 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1162 /* Build the SYMBOL_REF for __tls_get_addr. */
1164 static GTY(()) rtx tls_get_addr_libfunc;
1167 aarch64_tls_get_addr (void)
1169 if (!tls_get_addr_libfunc)
1170 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1171 return tls_get_addr_libfunc;
1174 /* Return the TLS model to use for ADDR. */
1176 static enum tls_model
1177 tls_symbolic_operand_type (rtx addr)
1179 enum tls_model tls_kind = TLS_MODEL_NONE;
1182 if (GET_CODE (addr) == CONST)
1184 split_const (addr, &sym, &addend);
1185 if (GET_CODE (sym) == SYMBOL_REF)
1186 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1188 else if (GET_CODE (addr) == SYMBOL_REF)
1189 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1194 /* We'll allow lo_sum's in addresses in our legitimate addresses
1195 so that combine would take care of combining addresses where
1196 necessary, but for generation purposes, we'll generate the address
1199 tmp = hi (symbol_ref); adrp x1, foo
1200 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1204 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1205 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1209 Load TLS symbol, depending on TLS mechanism and TLS access model.
1211 Global Dynamic - Traditional TLS:
1212 adrp tmp, :tlsgd:imm
1213 add dest, tmp, #:tlsgd_lo12:imm
1216 Global Dynamic - TLS Descriptors:
1217 adrp dest, :tlsdesc:imm
1218 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1219 add dest, dest, #:tlsdesc_lo12:imm
1226 adrp tmp, :gottprel:imm
1227 ldr dest, [tmp, #:gottprel_lo12:imm]
1232 add t0, tp, #:tprel_hi12:imm, lsl #12
1233 add t0, t0, #:tprel_lo12_nc:imm
1237 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1238 enum aarch64_symbol_type type)
1242 case SYMBOL_SMALL_ABSOLUTE:
1244 /* In ILP32, the mode of dest can be either SImode or DImode. */
1246 machine_mode mode = GET_MODE (dest);
1248 gcc_assert (mode == Pmode || mode == ptr_mode);
1250 if (can_create_pseudo_p ())
1251 tmp_reg = gen_reg_rtx (mode);
1253 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1254 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1258 case SYMBOL_TINY_ABSOLUTE:
1259 emit_insn (gen_rtx_SET (dest, imm));
1262 case SYMBOL_SMALL_GOT_28K:
1264 machine_mode mode = GET_MODE (dest);
1265 rtx gp_rtx = pic_offset_table_rtx;
1269 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1270 here before rtl expansion. Tree IVOPT will generate rtl patterns to
1271 decide rtx costs, in which case pic_offset_table_rtx is not
1272 initialized. In that case there is no need to generate the first adrp
1273 instruction as the final cost for global variable access is
1277 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we are
1278 using the page base as the GOT base, the first page may be wasted;
1279 in the worst scenario there is only 28K of space for the GOT).
1281 The generated instruction sequence for accessing a global variable
1284 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1286 Only one instruction is needed. But we must initialize
1287 pic_offset_table_rtx properly. We generate an initialization insn for
1288 every global access, and allow CSE to remove the redundant ones.
1290 The final instruction sequence will look like the following
1291 for multiple global variable accesses.
1293 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1295 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1296 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1297 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1300 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1301 crtl->uses_pic_offset_table = 1;
1302 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1304 if (mode != GET_MODE (gp_rtx))
1305 gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
1308 if (mode == ptr_mode)
1311 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1313 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1315 mem = XVECEXP (SET_SRC (insn), 0, 0);
1319 gcc_assert (mode == Pmode);
1321 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1322 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1325 /* The operand is expected to be a MEM. Whenever the related insn
1326 pattern changes, the above code which calculates mem should be
1328 gcc_assert (GET_CODE (mem) == MEM);
1329 MEM_READONLY_P (mem) = 1;
1330 MEM_NOTRAP_P (mem) = 1;
1335 case SYMBOL_SMALL_GOT_4G:
1337 /* In ILP32, the mode of dest can be either SImode or DImode,
1338 while the got entry is always of SImode size. The mode of
1339 dest depends on how dest is used: if dest is assigned to a
1340 pointer (e.g. in the memory), it has SImode; it may have
1341 DImode if dest is dereferenced to access the memory.
1342 This is why we have to handle three different ldr_got_small
1343 patterns here (two patterns for ILP32). */
1348 machine_mode mode = GET_MODE (dest);
1350 if (can_create_pseudo_p ())
1351 tmp_reg = gen_reg_rtx (mode);
1353 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1354 if (mode == ptr_mode)
1357 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1359 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1361 mem = XVECEXP (SET_SRC (insn), 0, 0);
1365 gcc_assert (mode == Pmode);
1367 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1368 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1371 gcc_assert (GET_CODE (mem) == MEM);
1372 MEM_READONLY_P (mem) = 1;
1373 MEM_NOTRAP_P (mem) = 1;
1378 case SYMBOL_SMALL_TLSGD:
1381 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
1384 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
1385 insns = get_insns ();
1388 RTL_CONST_CALL_P (insns) = 1;
1389 emit_libcall_block (insns, dest, result, imm);
1393 case SYMBOL_SMALL_TLSDESC:
1395 machine_mode mode = GET_MODE (dest);
1396 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1399 gcc_assert (mode == Pmode || mode == ptr_mode);
1401 /* In ILP32, the got entry is always of SImode size. Unlike
1402 small GOT, the dest is fixed at reg 0. */
1404 emit_insn (gen_tlsdesc_small_si (imm));
1406 emit_insn (gen_tlsdesc_small_di (imm));
1407 tp = aarch64_load_tp (NULL);
1410 tp = gen_lowpart (mode, tp);
1412 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1413 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1417 case SYMBOL_SMALL_TLSIE:
1419 /* In ILP32, the mode of dest can be either SImode or DImode,
1420 while the got entry is always of SImode size. The mode of
1421 dest depends on how dest is used: if dest is assigned to a
1422 pointer (e.g. in the memory), it has SImode; it may have
1423 DImode if dest is dereferenced to access the memory.
1424 This is why we have to handle three different tlsie_small
1425 patterns here (two patterns for ILP32). */
1426 machine_mode mode = GET_MODE (dest);
1427 rtx tmp_reg = gen_reg_rtx (mode);
1428 rtx tp = aarch64_load_tp (NULL);
1430 if (mode == ptr_mode)
1433 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1436 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1437 tp = gen_lowpart (mode, tp);
1442 gcc_assert (mode == Pmode);
1443 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1446 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1447 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1451 case SYMBOL_TLSLE12:
1452 case SYMBOL_TLSLE24:
1453 case SYMBOL_TLSLE32:
1454 case SYMBOL_TLSLE48:
1456 machine_mode mode = GET_MODE (dest);
1457 rtx tp = aarch64_load_tp (NULL);
1460 tp = gen_lowpart (mode, tp);
1464 case SYMBOL_TLSLE12:
1465 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1468 case SYMBOL_TLSLE24:
1469 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1472 case SYMBOL_TLSLE32:
1473 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1475 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1478 case SYMBOL_TLSLE48:
1479 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1481 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1488 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1492 case SYMBOL_TINY_GOT:
1493 emit_insn (gen_ldr_got_tiny (dest, imm));
1496 case SYMBOL_TINY_TLSIE:
1498 machine_mode mode = GET_MODE (dest);
1499 rtx tp = aarch64_load_tp (NULL);
1501 if (mode == ptr_mode)
1504 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1507 tp = gen_lowpart (mode, tp);
1508 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1513 gcc_assert (mode == Pmode);
1514 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1517 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1526 /* Emit a move from SRC to DEST. Assume that the move expanders can
1527 handle all moves if !can_create_pseudo_p (). The distinction is
1528 important because, unlike emit_move_insn, the move expanders know
1529 how to force Pmode objects into the constant pool even when the
1530 constant pool address is not itself legitimate. */
1532 aarch64_emit_move (rtx dest, rtx src)
1534 return (can_create_pseudo_p ()
1535 ? emit_move_insn (dest, src)
1536 : emit_move_insn_1 (dest, src));
1539 /* Split a 128-bit move operation into two 64-bit move operations,
1540 taking care to handle partial overlap of register to register
1541 copies. Special cases are needed when moving between GP regs and
1542 FP regs. SRC can be a register, constant or memory; DST a register
1543 or memory. If either operand is memory it must not have any side
1546 aarch64_split_128bit_move (rtx dst, rtx src)
1551 machine_mode mode = GET_MODE (dst);
1553 gcc_assert (mode == TImode || mode == TFmode);
1554 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1555 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1557 if (REG_P (dst) && REG_P (src))
1559 int src_regno = REGNO (src);
1560 int dst_regno = REGNO (dst);
1562 /* Handle FP <-> GP regs. */
1563 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1565 src_lo = gen_lowpart (word_mode, src);
1566 src_hi = gen_highpart (word_mode, src);
1570 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1571 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1575 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1576 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1580 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1582 dst_lo = gen_lowpart (word_mode, dst);
1583 dst_hi = gen_highpart (word_mode, dst);
1587 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1588 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1592 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1593 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1599 dst_lo = gen_lowpart (word_mode, dst);
1600 dst_hi = gen_highpart (word_mode, dst);
1601 src_lo = gen_lowpart (word_mode, src);
1602 src_hi = gen_highpart_mode (word_mode, mode, src);
1604 /* At most one pairing may overlap. */
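/* For example (illustrative): when copying a TImode value from the pair
   (x0,x1) into (x1,x2), dst_lo (x1) overlaps src_hi (x1), so the high
   halves must be moved first to avoid clobbering the source.  */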
1605 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1607 aarch64_emit_move (dst_hi, src_hi);
1608 aarch64_emit_move (dst_lo, src_lo);
1612 aarch64_emit_move (dst_lo, src_lo);
1613 aarch64_emit_move (dst_hi, src_hi);
1618 aarch64_split_128bit_move_p (rtx dst, rtx src)
1620 return (! REG_P (src)
1621 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1624 /* Split a complex SIMD combine. */
1627 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1629 machine_mode src_mode = GET_MODE (src1);
1630 machine_mode dst_mode = GET_MODE (dst);
1632 gcc_assert (VECTOR_MODE_P (dst_mode));
1634 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1636 rtx (*gen) (rtx, rtx, rtx);
1641 gen = gen_aarch64_simd_combinev8qi;
1644 gen = gen_aarch64_simd_combinev4hi;
1647 gen = gen_aarch64_simd_combinev2si;
1650 gen = gen_aarch64_simd_combinev4hf;
1653 gen = gen_aarch64_simd_combinev2sf;
1656 gen = gen_aarch64_simd_combinedi;
1659 gen = gen_aarch64_simd_combinedf;
1665 emit_insn (gen (dst, src1, src2));
1670 /* Split a complex SIMD move. */
1673 aarch64_split_simd_move (rtx dst, rtx src)
1675 machine_mode src_mode = GET_MODE (src);
1676 machine_mode dst_mode = GET_MODE (dst);
1678 gcc_assert (VECTOR_MODE_P (dst_mode));
1680 if (REG_P (dst) && REG_P (src))
1682 rtx (*gen) (rtx, rtx);
1684 gcc_assert (VECTOR_MODE_P (src_mode));
1689 gen = gen_aarch64_split_simd_movv16qi;
1692 gen = gen_aarch64_split_simd_movv8hi;
1695 gen = gen_aarch64_split_simd_movv4si;
1698 gen = gen_aarch64_split_simd_movv2di;
1701 gen = gen_aarch64_split_simd_movv8hf;
1704 gen = gen_aarch64_split_simd_movv4sf;
1707 gen = gen_aarch64_split_simd_movv2df;
1713 emit_insn (gen (dst, src));
1719 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1720 machine_mode ymode, rtx y)
1722 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1723 gcc_assert (r != NULL);
1724 return rtx_equal_p (x, r);
1729 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1731 if (can_create_pseudo_p ())
1732 return force_reg (mode, value);
1735 x = aarch64_emit_move (x, value);
1742 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1744 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1747 /* Load the full offset into a register. This
1748 might be improvable in the future. */
1749 high = GEN_INT (offset);
1751 high = aarch64_force_temporary (mode, temp, high);
1752 reg = aarch64_force_temporary (mode, temp,
1753 gen_rtx_PLUS (mode, high, reg));
1755 return plus_constant (mode, reg, offset);
1759 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1763 unsigned HOST_WIDE_INT val, val2, mask;
1764 int one_match, zero_match;
1769 if (aarch64_move_imm (val, mode))
1772 emit_insn (gen_rtx_SET (dest, imm));
1776 if ((val >> 32) == 0 || mode == SImode)
1780 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1782 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1783 GEN_INT ((val >> 16) & 0xffff)));
1785 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1786 GEN_INT ((val >> 16) & 0xffff)));
1791 /* Remaining cases are all for DImode. */
1794 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1795 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1796 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1797 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1799 if (zero_match != 2 && one_match != 2)
1801 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1802 For a 64-bit bitmask try whether changing 16 bits to all ones or
1803 zeroes creates a valid bitmask. To check any repeated bitmask,
1804 try using 16 bits from the other 32-bit half of val. */
1806 for (i = 0; i < 64; i += 16, mask <<= 16)
1809 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1812 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1814 val2 = val2 & ~mask;
1815 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1816 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1823 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1824 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1825 GEN_INT ((val >> i) & 0xffff)));
1831 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1832 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1833 otherwise skip zero bits. */
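/* Worked example (illustrative): val = 0x1234000000005678 has two all-zero
   16-bit chunks, so this path emits
       movz    dest, #0x5678
       movk    dest, #0x1234, lsl #48
   for a total of two instructions.  */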
1837 val2 = one_match > zero_match ? ~val : val;
1838 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1841 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1842 ? (val | ~(mask << i))
1843 : (val & (mask << i)))));
1844 for (i += 16; i < 64; i += 16)
1846 if ((val2 & (mask << i)) == 0)
1849 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1850 GEN_INT ((val >> i) & 0xffff)));
1859 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1861 machine_mode mode = GET_MODE (dest);
1863 gcc_assert (mode == SImode || mode == DImode);
1865 /* Check on what type of symbol it is. */
1866 if (GET_CODE (imm) == SYMBOL_REF
1867 || GET_CODE (imm) == LABEL_REF
1868 || GET_CODE (imm) == CONST)
1870 rtx mem, base, offset;
1871 enum aarch64_symbol_type sty;
1873 /* If we have (const (plus symbol offset)), separate out the offset
1874 before we start classifying the symbol. */
1875 split_const (imm, &base, &offset);
1877 sty = aarch64_classify_symbol (base, offset);
1880 case SYMBOL_FORCE_TO_MEM:
1881 if (offset != const0_rtx
1882 && targetm.cannot_force_const_mem (mode, imm))
1884 gcc_assert (can_create_pseudo_p ());
1885 base = aarch64_force_temporary (mode, dest, base);
1886 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1887 aarch64_emit_move (dest, base);
1891 mem = force_const_mem (ptr_mode, imm);
1894 /* If we aren't generating PC relative literals, then
1895 we need to expand the literal pool access carefully.
1896 This is something that needs to be done in a number
1897 of places, so could well live as a separate function. */
1898 if (!aarch64_pcrelative_literal_loads)
1900 gcc_assert (can_create_pseudo_p ());
1901 base = gen_reg_rtx (ptr_mode);
1902 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1903 mem = gen_rtx_MEM (ptr_mode, base);
1906 if (mode != ptr_mode)
1907 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1909 emit_insn (gen_rtx_SET (dest, mem));
1913 case SYMBOL_SMALL_TLSGD:
1914 case SYMBOL_SMALL_TLSDESC:
1915 case SYMBOL_SMALL_TLSIE:
1916 case SYMBOL_SMALL_GOT_28K:
1917 case SYMBOL_SMALL_GOT_4G:
1918 case SYMBOL_TINY_GOT:
1919 case SYMBOL_TINY_TLSIE:
1920 if (offset != const0_rtx)
1922 gcc_assert (can_create_pseudo_p ());
1923 base = aarch64_force_temporary (mode, dest, base);
1924 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1925 aarch64_emit_move (dest, base);
1930 case SYMBOL_SMALL_ABSOLUTE:
1931 case SYMBOL_TINY_ABSOLUTE:
1932 case SYMBOL_TLSLE12:
1933 case SYMBOL_TLSLE24:
1934 case SYMBOL_TLSLE32:
1935 case SYMBOL_TLSLE48:
1936 aarch64_load_symref_appropriately (dest, imm, sty);
1944 if (!CONST_INT_P (imm))
1946 if (GET_CODE (imm) == HIGH)
1947 emit_insn (gen_rtx_SET (dest, imm));
1950 rtx mem = force_const_mem (mode, imm);
1952 emit_insn (gen_rtx_SET (dest, mem));
1958 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1961 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
1962 temporary value if necessary. FRAME_RELATED_P should be true if
1963 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
1964 to the generated instructions. If SCRATCHREG is known to hold
1965 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
1968 Since this function may be used to adjust the stack pointer, we must
1969 ensure that it cannot cause transient stack deallocation (for example
1970 by first incrementing SP and then decrementing when adjusting by a
1971 large immediate). */
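/* For example (illustrative), lowering SP by 0x101008 takes the two-addition
   path below and is emitted as
       sub     sp, sp, #8
       sub     sp, sp, #0x101000
   so SP only ever moves towards its final value and stack memory is never
   transiently deallocated.  */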
1974 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
1975 HOST_WIDE_INT delta, bool frame_related_p,
1978 HOST_WIDE_INT mdelta = abs_hwi (delta);
1979 rtx this_rtx = gen_rtx_REG (mode, regnum);
1985 /* Single instruction adjustment. */
1986 if (aarch64_uimm12_shift (mdelta))
1988 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
1989 RTX_FRAME_RELATED_P (insn) = frame_related_p;
1993 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
1994 Only do this if mdelta is not a 16-bit move as adjusting using a move
1996 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
1998 HOST_WIDE_INT low_off = mdelta & 0xfff;
2000 low_off = delta < 0 ? -low_off : low_off;
2001 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2002 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2003 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2004 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2008 /* Emit a move immediate if required and an addition/subtraction. */
2009 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2011 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2012 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2013 : gen_add2_insn (this_rtx, scratch_rtx));
2014 if (frame_related_p)
2016 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2017 rtx adj = plus_constant (mode, this_rtx, delta);
2018 add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2023 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2024 HOST_WIDE_INT delta)
2026 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2030 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2032 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2033 true, emit_move_imm);
2037 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2039 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2040 frame_related_p, true);
2044 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2045 tree exp ATTRIBUTE_UNUSED)
2047 /* Currently, always true. */
2051 /* Implement TARGET_PASS_BY_REFERENCE. */
2054 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2057 bool named ATTRIBUTE_UNUSED)
2060 machine_mode dummymode;
2063 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2064 size = (mode == BLKmode && type)
2065 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2067 /* Aggregates are passed by reference based on their size. */
2068 if (type && AGGREGATE_TYPE_P (type))
2070 size = int_size_in_bytes (type);
2073 /* Variable sized arguments are always passed by reference. */
2077 /* Can this be a candidate to be passed in fp/simd register(s)? */
2078 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2083 /* Arguments which are variable sized or larger than 2 registers are
2084 passed by reference unless they are a homogeneous floating point
2086 return size > 2 * UNITS_PER_WORD;
2089 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2091 aarch64_return_in_msb (const_tree valtype)
2093 machine_mode dummy_mode;
2096 /* Never happens in little-endian mode. */
2097 if (!BYTES_BIG_ENDIAN)
2100 /* Only composite types smaller than or equal to 16 bytes can
2101 be potentially returned in registers. */
2102 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2103 || int_size_in_bytes (valtype) <= 0
2104 || int_size_in_bytes (valtype) > 16)
2107 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2108 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2109 is always passed/returned in the least significant bits of fp/simd
2111 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2112 &dummy_mode, &dummy_int, NULL))
2118 /* Implement TARGET_FUNCTION_VALUE.
2119 Define how to find the value returned by a function. */
2122 aarch64_function_value (const_tree type, const_tree func,
2123 bool outgoing ATTRIBUTE_UNUSED)
2128 machine_mode ag_mode;
2130 mode = TYPE_MODE (type);
2131 if (INTEGRAL_TYPE_P (type))
2132 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2134 if (aarch64_return_in_msb (type))
2136 HOST_WIDE_INT size = int_size_in_bytes (type);
2138 if (size % UNITS_PER_WORD != 0)
2140 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2141 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2145 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2146 &ag_mode, &count, NULL))
2148 if (!aarch64_composite_type_p (type, mode))
2150 gcc_assert (count == 1 && mode == ag_mode);
2151 return gen_rtx_REG (mode, V0_REGNUM);
2158 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2159 for (i = 0; i < count; i++)
2161 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2162 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2163 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2164 XVECEXP (par, 0, i) = tmp;
2170 return gen_rtx_REG (mode, R0_REGNUM);
2173 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2174 Return true if REGNO is the number of a hard register in which the values
2175 of called function may come back. */
2178 aarch64_function_value_regno_p (const unsigned int regno)
2180 /* Maximum of 16 bytes can be returned in the general registers. Examples
2181 of 16-byte return values are: 128-bit integers and 16-byte small
2182 structures (excluding homogeneous floating-point aggregates). */
2183 if (regno == R0_REGNUM || regno == R1_REGNUM)
2186 /* Up to four fp/simd registers can return a function value, e.g. a
2187 homogeneous floating-point aggregate having four members. */
2188 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2189 return TARGET_FLOAT;
2194 /* Implement TARGET_RETURN_IN_MEMORY.
2196 If the type T of the result of a function is such that
2198 would require that arg be passed as a value in a register (or set of
2199 registers) according to the parameter passing rules, then the result
2200 is returned in the same registers as would be used for such an
2204 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2207 machine_mode ag_mode;
2210 if (!AGGREGATE_TYPE_P (type)
2211 && TREE_CODE (type) != COMPLEX_TYPE
2212 && TREE_CODE (type) != VECTOR_TYPE)
2213 /* Simple scalar types always returned in registers. */
2216 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2223 /* Types larger than 2 registers returned in memory. */
2224 size = int_size_in_bytes (type);
2225 return (size < 0 || size > 2 * UNITS_PER_WORD);
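/* Illustrative consequences under AAPCS64: a 16-byte plain struct comes back
   in x0/x1 and an HFA of four doubles in d0-d3, while a 24-byte struct is
   returned in memory through the indirect result register x8.  */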
2229 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2230 const_tree type, int *nregs)
2232 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2233 return aarch64_vfp_is_call_or_return_candidate (mode,
2235 &pcum->aapcs_vfp_rmode,
2240 /* Given MODE and TYPE of a function argument, return the alignment in
2241 bits. The idea is to suppress any stronger alignment requested by
2242 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2243 This is a helper function for local use only. */
2246 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2249 return GET_MODE_ALIGNMENT (mode);
2251 if (integer_zerop (TYPE_SIZE (type)))
2254 gcc_assert (TYPE_MODE (type) == mode);
2256 if (!AGGREGATE_TYPE_P (type))
2257 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2259 if (TREE_CODE (type) == ARRAY_TYPE)
2260 return TYPE_ALIGN (TREE_TYPE (type));
2262 unsigned int alignment = 0;
2263 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2264 if (TREE_CODE (field) == FIELD_DECL)
2265 alignment = std::max (alignment, DECL_ALIGN (field));
2270 /* Layout a function argument according to the AAPCS64 rules. The rule
2271 numbers refer to the rule numbers in the AAPCS64. */
2274 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2276 bool named ATTRIBUTE_UNUSED)
2278 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2279 int ncrn, nvrn, nregs;
2280 bool allocate_ncrn, allocate_nvrn;
2283 /* We need to do this once per argument. */
2284 if (pcum->aapcs_arg_processed)
2287 pcum->aapcs_arg_processed = true;
2289 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2291 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2294 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2295 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2300 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2301 The following code thus handles passing by SIMD/FP registers first. */
2303 nvrn = pcum->aapcs_nvrn;
2305 /* C1 - C5 for floating point, homogeneous floating point aggregates (HFA)
2306 and homogeneous short-vector aggregates (HVA). */
2310 aarch64_err_no_fpadvsimd (mode, "argument");
2312 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2314 pcum->aapcs_nextnvrn = nvrn + nregs;
2315 if (!aarch64_composite_type_p (type, mode))
2317 gcc_assert (nregs == 1);
2318 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2324 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2325 for (i = 0; i < nregs; i++)
2327 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2328 V0_REGNUM + nvrn + i);
2329 tmp = gen_rtx_EXPR_LIST
2331 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2332 XVECEXP (par, 0, i) = tmp;
2334 pcum->aapcs_reg = par;
2340 /* C.3 NSRN is set to 8. */
2341 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2346 ncrn = pcum->aapcs_ncrn;
2347 nregs = size / UNITS_PER_WORD;
2349 /* C6 - C9, though the sign and zero extension semantics are
2350 handled elsewhere. This is the case where the argument fits
2351 entirely in general registers. */
2352 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2355 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2357 /* C.8 if the argument has an alignment of 16 then the NGRN is
2358 rounded up to the next even number. */
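/* For example (illustrative), a 16-byte aligned __int128 passed after a
   single int argument skips x1 and is allocated to the even-numbered
   register pair x2/x3.  */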
2361 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2362 comparison is there because for > 16 * BITS_PER_UNIT
2363 alignment nregs should be > 2 and therefore it should be
2364 passed by reference rather than value. */
2365 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2368 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2371 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2372 A reg is still generated for it, but the caller should be smart
2373 enough not to use it. */
2374 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2375 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2381 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2382 for (i = 0; i < nregs; i++)
2384 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2385 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2386 GEN_INT (i * UNITS_PER_WORD));
2387 XVECEXP (par, 0, i) = tmp;
2389 pcum->aapcs_reg = par;
2392 pcum->aapcs_nextncrn = ncrn + nregs;
2397 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2399 /* The argument is passed on the stack; record the needed number of words for
2400 this argument and align the total size if necessary. */
2402 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2404 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2405 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2406 16 / UNITS_PER_WORD);
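/* A rough illustration of the rules above (hypothetical arguments, not
   taken from the AAPCS64 text): a homogeneous aggregate of two doubles
   uses two consecutive FP registers, expressed as a PARALLEL of DFmode
   registers at byte offsets 0 and 8; a 12-byte integer struct is rounded
   up to 16 bytes and takes two consecutive general registers; and a
   16-byte-aligned argument first rounds NGRN up to an even register
   number, as rule C.8 above requires.  */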
2410 /* Implement TARGET_FUNCTION_ARG. */
2413 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2414 const_tree type, bool named)
2416 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2417 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2419 if (mode == VOIDmode)
2422 aarch64_layout_arg (pcum_v, mode, type, named);
2423 return pcum->aapcs_reg;
2427 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2428 const_tree fntype ATTRIBUTE_UNUSED,
2429 rtx libname ATTRIBUTE_UNUSED,
2430 const_tree fndecl ATTRIBUTE_UNUSED,
2431 unsigned n_named ATTRIBUTE_UNUSED)
2433 pcum->aapcs_ncrn = 0;
2434 pcum->aapcs_nvrn = 0;
2435 pcum->aapcs_nextncrn = 0;
2436 pcum->aapcs_nextnvrn = 0;
2437 pcum->pcs_variant = ARM_PCS_AAPCS64;
2438 pcum->aapcs_reg = NULL_RTX;
2439 pcum->aapcs_arg_processed = false;
2440 pcum->aapcs_stack_words = 0;
2441 pcum->aapcs_stack_size = 0;
2444 && fndecl && TREE_PUBLIC (fndecl)
2445 && fntype && fntype != error_mark_node)
2447 const_tree type = TREE_TYPE (fntype);
2448 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2449 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2450 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2451 &mode, &nregs, NULL))
2452 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2458 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2463 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2464 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2466 aarch64_layout_arg (pcum_v, mode, type, named);
2467 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2468 != (pcum->aapcs_stack_words != 0));
2469 pcum->aapcs_arg_processed = false;
2470 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2471 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2472 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2473 pcum->aapcs_stack_words = 0;
2474 pcum->aapcs_reg = NULL_RTX;
2479 aarch64_function_arg_regno_p (unsigned regno)
2481 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2482 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2485 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2486 PARM_BOUNDARY bits of alignment, but will be given anything up
2487 to STACK_BOUNDARY bits if the type requires it. This makes sure
2488 that both before and after the layout of each argument, the Next
2489 Stacked Argument Address (NSAA) will have a minimum alignment of
2493 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2495 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2496 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
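/* For illustration only (assuming the usual aarch64 values of
   PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128, which are defined
   elsewhere): a char argument with a natural alignment of 8 bits is
   raised to 64 bits, a 16-byte-aligned type keeps its 128 bits, and
   anything stronger is capped at 128 bits.  */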
2499 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2501 Return true if an argument passed on the stack should be padded upwards,
2502 i.e. if the least-significant byte of the stack slot has useful data.
2504 Small aggregate types are placed in the lowest memory address.
2506 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2509 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2511 /* On little-endian targets, the least significant byte of every stack
2512 argument is passed at the lowest byte address of the stack slot. */
2513 if (!BYTES_BIG_ENDIAN)
2516 /* Otherwise, integral, floating-point and pointer types are padded downward:
2517 the least significant byte of a stack argument is passed at the highest
2518 byte address of the stack slot. */
2520 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2521 || POINTER_TYPE_P (type))
2522 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2525 /* Everything else padded upward, i.e. data in first byte of stack slot. */
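/* As a concrete (illustrative) example: on a big-endian target an int
   passed on the stack is padded downward, so its 4 data bytes occupy
   the highest addresses of the 8-byte slot, whereas a 3-byte struct is
   padded upward and starts at the lowest address of its slot.  */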
2529 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2531 It specifies padding for the last (and possibly the only)
2532 element of a block move between registers and memory.  Assuming
2533 the block is in memory, padding upward means that the last
2534 element is padded after its most significant byte, while with
2535 downward padding the last element is padded at its least
2536 significant byte side.
2538 Small aggregates and small complex types are always padded
2541 We don't need to worry about homogeneous floating-point or
2542 short-vector aggregates; their move is not affected by the
2543 padding direction determined here. Regardless of endianness,
2544 each element of such an aggregate is put in the least
2545 significant bits of a fp/simd register.
2547 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2548 register has useful data, and return the opposite if the most
2549 significant byte does. */
2552 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2553 bool first ATTRIBUTE_UNUSED)
2556 /* Small composite types are always padded upward. */
2557 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2559 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2560 : GET_MODE_SIZE (mode));
2561 if (size < 2 * UNITS_PER_WORD)
2565 /* Otherwise, use the default padding. */
2566 return !BYTES_BIG_ENDIAN;
2570 aarch64_libgcc_cmp_return_mode (void)
2575 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2577 /* We use the 12-bit shifted immediate arithmetic instructions, so values
2578 must be a multiple of (1 << 12), i.e. 4096. */
2579 #define ARITH_FACTOR 4096
2581 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2582 #error Cannot use simple address calculation for stack probing
2585 /* The pair of scratch registers used for stack probing. */
2586 #define PROBE_STACK_FIRST_REG 9
2587 #define PROBE_STACK_SECOND_REG 10
2589 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2590 inclusive. These are offsets from the current stack pointer. */
2593 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2595 rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2597 /* See the same assertion on PROBE_INTERVAL above. */
2598 gcc_assert ((first % ARITH_FACTOR) == 0);
2600 /* See if we have a constant small number of probes to generate. If so,
2601 that's the easy case. */
2602 if (size <= PROBE_INTERVAL)
2604 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2606 emit_set_insn (reg1,
2607 plus_constant (ptr_mode,
2608 stack_pointer_rtx, -(first + base)));
2609 emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2612 /* The run-time loop is made up of 8 insns in the generic case while the
2613 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
2614 else if (size <= 4 * PROBE_INTERVAL)
2616 HOST_WIDE_INT i, rem;
2618 emit_set_insn (reg1,
2619 plus_constant (ptr_mode,
2621 -(first + PROBE_INTERVAL)));
2622 emit_stack_probe (reg1);
2624 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2625 it exceeds SIZE. If only two probes are needed, this will not
2626 generate any code. Then probe at FIRST + SIZE. */
2627 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2629 emit_set_insn (reg1,
2630 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2631 emit_stack_probe (reg1);
2634 rem = size - (i - PROBE_INTERVAL);
2637 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2639 emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2640 emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2643 emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2646 /* Otherwise, do the same as above, but in a loop. Note that we must be
2647 extra careful with variables wrapping around because we might be at
2648 the very top (or the very bottom) of the address space and we have
2649 to be able to handle this case properly; in particular, we use an
2650 equality test for the loop condition. */
2653 rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2655 /* Step 1: round SIZE to the previous multiple of the interval. */
2657 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2660 /* Step 2: compute initial and final value of the loop counter. */
2662 /* TEST_ADDR = SP + FIRST. */
2663 emit_set_insn (reg1,
2664 plus_constant (ptr_mode, stack_pointer_rtx, -first));
2666 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2667 emit_set_insn (reg2,
2668 plus_constant (ptr_mode, stack_pointer_rtx,
2669 -(first + rounded_size)));
2676 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2679 while (TEST_ADDR != LAST_ADDR)
2681 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2682 until it is equal to ROUNDED_SIZE. */
2684 if (ptr_mode == DImode)
2685 emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2687 emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2690 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2691 that SIZE is equal to ROUNDED_SIZE. */
2693 if (size != rounded_size)
2695 HOST_WIDE_INT rem = size - rounded_size;
2699 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2701 emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2702 emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2705 emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2709 /* Make sure nothing is scheduled before we are done. */
2710 emit_insn (gen_blockage ());
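/* As a rough sketch of what the simple (size <= PROBE_INTERVAL) case
   above emits (illustrative only; the actual immediates depend on FIRST,
   SIZE and PROBE_INTERVAL):

	sub	x9, sp, #(first + ROUND_UP (size, ARITH_FACTOR))
	str	xzr, [x9, #(ROUND_UP (size, ARITH_FACTOR) - size)]

   i.e. one address computation into the first scratch register followed
   by a single probe store of xzr at FIRST + SIZE.  */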
2713 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2714 absolute addresses. */
2717 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2719 static int labelno = 0;
2723 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2726 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2728 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2730 xops[1] = GEN_INT (PROBE_INTERVAL);
2731 output_asm_insn ("sub\t%0, %0, %1", xops);
2733 /* Probe at TEST_ADDR. */
2734 output_asm_insn ("str\txzr, [%0]", xops);
2736 /* Test if TEST_ADDR == LAST_ADDR. */
2738 output_asm_insn ("cmp\t%0, %1", xops);
2741 fputs ("\tb.ne\t", asm_out_file);
2742 assemble_name_raw (asm_out_file, loop_lab);
2743 fputc ('\n', asm_out_file);
2749 aarch64_frame_pointer_required (void)
2751 /* In aarch64_override_options_after_change
2752 flag_omit_leaf_frame_pointer turns off the frame pointer by
2753 default. Turn it back on now if we've not got a leaf
2755 if (flag_omit_leaf_frame_pointer
2756 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2759 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2760 if (crtl->calls_eh_return)
2766 /* Mark the registers that need to be saved by the callee and calculate
2767 the size of the callee-saved registers area and frame record (both FP
2768 and LR may be omitted). */
2770 aarch64_layout_frame (void)
2772 HOST_WIDE_INT offset = 0;
2773 int regno, last_fp_reg = INVALID_REGNUM;
2775 if (reload_completed && cfun->machine->frame.laid_out)
2778 #define SLOT_NOT_REQUIRED (-2)
2779 #define SLOT_REQUIRED (-1)
2781 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2782 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2784 /* First mark all the registers that really need to be saved... */
2785 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2786 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2788 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2789 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2791 /* ... that includes the eh data registers (if needed)... */
2792 if (crtl->calls_eh_return)
2793 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2794 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2797 /* ... and any callee saved register that dataflow says is live. */
2798 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2799 if (df_regs_ever_live_p (regno)
2800 && (regno == R30_REGNUM
2801 || !call_used_regs[regno]))
2802 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2804 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2805 if (df_regs_ever_live_p (regno)
2806 && !call_used_regs[regno])
2808 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2809 last_fp_reg = regno;
2812 if (frame_pointer_needed)
2814 /* FP and LR are placed in the linkage record. */
2815 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2816 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2817 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2818 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2819 offset += 2 * UNITS_PER_WORD;
2822 /* Now assign stack slots for them. */
2823 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2824 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2826 cfun->machine->frame.reg_offset[regno] = offset;
2827 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2828 cfun->machine->frame.wb_candidate1 = regno;
2829 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2830 cfun->machine->frame.wb_candidate2 = regno;
2831 offset += UNITS_PER_WORD;
2834 HOST_WIDE_INT max_int_offset = offset;
2835 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2836 bool has_align_gap = offset != max_int_offset;
2838 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2839 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2841 /* If there is an alignment gap between integer and fp callee-saves,
2842 allocate the last fp register to it if possible. */
2843 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2845 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2849 cfun->machine->frame.reg_offset[regno] = offset;
2850 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2851 cfun->machine->frame.wb_candidate1 = regno;
2852 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2853 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2854 cfun->machine->frame.wb_candidate2 = regno;
2855 offset += UNITS_PER_WORD;
2858 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2860 cfun->machine->frame.saved_regs_size = offset;
2862 HOST_WIDE_INT varargs_and_saved_regs_size
2863 = offset + cfun->machine->frame.saved_varargs_size;
2865 cfun->machine->frame.hard_fp_offset
2866 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2867 STACK_BOUNDARY / BITS_PER_UNIT);
2869 cfun->machine->frame.frame_size
2870 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2871 + crtl->outgoing_args_size,
2872 STACK_BOUNDARY / BITS_PER_UNIT);
2874 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2876 cfun->machine->frame.initial_adjust = 0;
2877 cfun->machine->frame.final_adjust = 0;
2878 cfun->machine->frame.callee_adjust = 0;
2879 cfun->machine->frame.callee_offset = 0;
2881 HOST_WIDE_INT max_push_offset = 0;
2882 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2883 max_push_offset = 512;
2884 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2885 max_push_offset = 256;
2887 if (cfun->machine->frame.frame_size < max_push_offset
2888 && crtl->outgoing_args_size == 0)
2890 /* Simple, small frame with no outgoing arguments:
2891 stp reg1, reg2, [sp, -frame_size]!
2892 stp reg3, reg4, [sp, 16] */
2893 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2895 else if ((crtl->outgoing_args_size
2896 + cfun->machine->frame.saved_regs_size < 512)
2897 && !(cfun->calls_alloca
2898 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2900 /* Frame with small outgoing arguments:
2901 sub sp, sp, frame_size
2902 stp reg1, reg2, [sp, outgoing_args_size]
2903 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2904 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2905 cfun->machine->frame.callee_offset
2906 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2908 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
2910 /* Frame with large outgoing arguments but a small local area:
2911 stp reg1, reg2, [sp, -hard_fp_offset]!
2912 stp reg3, reg4, [sp, 16]
2913 sub sp, sp, outgoing_args_size */
2914 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
2915 cfun->machine->frame.final_adjust
2916 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2918 else if (!frame_pointer_needed
2919 && varargs_and_saved_regs_size < max_push_offset)
2921 /* Frame with large local area and outgoing arguments (this pushes the
2922 callee-saves first, followed by the locals and outgoing area):
2923 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2924 stp reg3, reg4, [sp, 16]
2925 sub sp, sp, frame_size - varargs_and_saved_regs_size */
2926 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
2927 cfun->machine->frame.final_adjust
2928 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2929 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
2930 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
2934 /* Frame with large local area and outgoing arguments using frame pointer:
2935 sub sp, sp, hard_fp_offset
2936 stp x29, x30, [sp, 0]
2938 stp reg3, reg4, [sp, 16]
2939 sub sp, sp, outgoing_args_size */
2940 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
2941 cfun->machine->frame.final_adjust
2942 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
2945 cfun->machine->frame.laid_out = true;
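/* A worked example (hypothetical function, rough numbers): with a frame
   pointer, saves of x29/x30 and d8, 16 bytes of locals and no outgoing
   arguments, the code above computes reg_offset[x29] = 0,
   reg_offset[x30] = 8, reg_offset[d8] = 16, saved_regs_size = 32,
   hard_fp_offset = 48 and frame_size = 48.  Since the frame is smaller
   than max_push_offset and there are no outgoing arguments, the first
   case applies: callee_adjust = 48 with no separate initial or final
   adjustment.  */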
2948 /* Return true if the register REGNO is saved on entry to
2949 the current function. */
2952 aarch64_register_saved_on_entry (int regno)
2954 return cfun->machine->frame.reg_offset[regno] >= 0;
2957 /* Return the next register up from REGNO up to LIMIT for the callee
2961 aarch64_next_callee_save (unsigned regno, unsigned limit)
2963 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2968 /* Push the register number REGNO of mode MODE to the stack with write-back
2969 adjusting the stack by ADJUSTMENT. */
2972 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2973 HOST_WIDE_INT adjustment)
2975 rtx base_rtx = stack_pointer_rtx;
2978 reg = gen_rtx_REG (mode, regno);
2979 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2980 plus_constant (Pmode, base_rtx, -adjustment));
2981 mem = gen_rtx_MEM (mode, mem);
2983 insn = emit_move_insn (mem, reg);
2984 RTX_FRAME_RELATED_P (insn) = 1;
2987 /* Generate and return an instruction to store the pair of registers
2988 REG and REG2 of mode MODE to location BASE with write-back adjusting
2989 the stack location BASE by ADJUSTMENT. */
2992 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2993 HOST_WIDE_INT adjustment)
2998 return gen_storewb_pairdi_di (base, base, reg, reg2,
2999 GEN_INT (-adjustment),
3000 GEN_INT (UNITS_PER_WORD - adjustment));
3002 return gen_storewb_pairdf_di (base, base, reg, reg2,
3003 GEN_INT (-adjustment),
3004 GEN_INT (UNITS_PER_WORD - adjustment));
3010 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3011 stack pointer by ADJUSTMENT. */
3014 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3017 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3019 if (regno2 == INVALID_REGNUM)
3020 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3022 rtx reg1 = gen_rtx_REG (mode, regno1);
3023 rtx reg2 = gen_rtx_REG (mode, regno2);
3025 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3027 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3028 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3029 RTX_FRAME_RELATED_P (insn) = 1;
3032 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
3033 adjusting it by ADJUSTMENT afterwards. */
3036 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3037 HOST_WIDE_INT adjustment)
3042 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3043 GEN_INT (UNITS_PER_WORD));
3045 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3046 GEN_INT (UNITS_PER_WORD));
3052 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3053 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3057 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3060 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3061 rtx reg1 = gen_rtx_REG (mode, regno1);
3063 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3065 if (regno2 == INVALID_REGNUM)
3067 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3068 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3069 emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3073 rtx reg2 = gen_rtx_REG (mode, regno2);
3074 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3075 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3080 /* Generate and return a store pair instruction of mode MODE to store
3081 register REG1 to MEM1 and register REG2 to MEM2. */
3084 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3090 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3093 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3100 /* Generate and return a load pair instruction of mode MODE to load register
3101 REG1 from MEM1 and register REG2 from MEM2. */
3104 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3110 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3113 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3120 /* Emit code to save the callee-saved registers from register number START
3121 to LIMIT to the stack at the location starting at offset START_OFFSET,
3122 skipping any write-back candidates if SKIP_WB is true. */
3125 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3126 unsigned start, unsigned limit, bool skip_wb)
3129 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3130 ? gen_frame_mem : gen_rtx_MEM);
3134 for (regno = aarch64_next_callee_save (start, limit);
3136 regno = aarch64_next_callee_save (regno + 1, limit))
3139 HOST_WIDE_INT offset;
3142 && (regno == cfun->machine->frame.wb_candidate1
3143 || regno == cfun->machine->frame.wb_candidate2))
3146 reg = gen_rtx_REG (mode, regno);
3147 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3148 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3151 regno2 = aarch64_next_callee_save (regno + 1, limit);
3154 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3155 == cfun->machine->frame.reg_offset[regno2]))
3158 rtx reg2 = gen_rtx_REG (mode, regno2);
3161 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3162 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3164 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3167 /* The first part of a frame-related parallel insn is
3168 always assumed to be relevant to the frame
3169 calculations; subsequent parts are only
3170 frame-related if explicitly marked. */
3171 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3175 insn = emit_move_insn (mem, reg);
3177 RTX_FRAME_RELATED_P (insn) = 1;
3181 /* Emit code to restore the callee registers of mode MODE from register
3182 number START up to and including LIMIT. Restore from the stack offset
3183 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3184 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3187 aarch64_restore_callee_saves (machine_mode mode,
3188 HOST_WIDE_INT start_offset, unsigned start,
3189 unsigned limit, bool skip_wb, rtx *cfi_ops)
3191 rtx base_rtx = stack_pointer_rtx;
3192 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3193 ? gen_frame_mem : gen_rtx_MEM);
3196 HOST_WIDE_INT offset;
3198 for (regno = aarch64_next_callee_save (start, limit);
3200 regno = aarch64_next_callee_save (regno + 1, limit))
3205 && (regno == cfun->machine->frame.wb_candidate1
3206 || regno == cfun->machine->frame.wb_candidate2))
3209 reg = gen_rtx_REG (mode, regno);
3210 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3211 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3213 regno2 = aarch64_next_callee_save (regno + 1, limit);
3216 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3217 == cfun->machine->frame.reg_offset[regno2]))
3219 rtx reg2 = gen_rtx_REG (mode, regno2);
3222 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3223 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3224 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3226 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3230 emit_move_insn (reg, mem);
3231 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3235 /* AArch64 stack frames generated by this compiler look like:
3237 +-------------------------------+
3239 | incoming stack arguments |
3241 +-------------------------------+
3242 | | <-- incoming stack pointer (aligned)
3243 | callee-allocated save area |
3244 | for register varargs |
3246 +-------------------------------+
3247 | local variables | <-- frame_pointer_rtx
3249 +-------------------------------+
3251 +-------------------------------+ |
3252 | callee-saved registers | | frame.saved_regs_size
3253 +-------------------------------+ |
3255 +-------------------------------+ |
3256 | FP' | / <- hard_frame_pointer_rtx (aligned)
3257 +-------------------------------+
3258 | dynamic allocation |
3259 +-------------------------------+
3261 +-------------------------------+
3262 | outgoing stack arguments | <-- arg_pointer
3264 +-------------------------------+
3265 | | <-- stack_pointer_rtx (aligned)
3267 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3268 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3271 /* Generate the prologue instructions for entry into a function.
3272 Establish the stack frame by decreasing the stack pointer with a
3273 properly calculated size and, if necessary, create a frame record
3274 filled with the values of LR and previous frame pointer. The
3275 current FP is also set up if it is in use. */
3278 aarch64_expand_prologue (void)
3280 aarch64_layout_frame ();
3282 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3283 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3284 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3285 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3286 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3287 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3288 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3291 if (flag_stack_usage_info)
3292 current_function_static_stack_size = frame_size;
3294 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3296 if (crtl->is_leaf && !cfun->calls_alloca)
3298 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3299 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3300 frame_size - STACK_CHECK_PROTECT);
3302 else if (frame_size > 0)
3303 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3306 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3308 if (callee_adjust != 0)
3309 aarch64_push_regs (reg1, reg2, callee_adjust);
3311 if (frame_pointer_needed)
3313 if (callee_adjust == 0)
3314 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3316 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3318 GEN_INT (callee_offset)));
3319 RTX_FRAME_RELATED_P (insn) = 1;
3320 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3323 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3324 callee_adjust != 0 || frame_pointer_needed);
3325 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3326 callee_adjust != 0 || frame_pointer_needed);
3327 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
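/* Continuing the small-frame example given with aarch64_layout_frame
   above, the prologue emitted here would look roughly like (illustrative
   only, ignoring the scheduling tie):

	stp	x29, x30, [sp, -48]!
	add	x29, sp, 0
	str	d8, [sp, 16]

   i.e. the callee_adjust push with writeback, the frame pointer set-up
   at callee_offset, then the remaining callee saves.  */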
3330 /* Return TRUE if we can use a simple_return insn.
3332 This function checks whether the callee-saved stack is empty, which
3333 means no restore actions are needed.  The pro_and_epilogue pass will use
3334 this to check whether the shrink-wrapping optimization is feasible. */
3337 aarch64_use_return_insn_p (void)
3339 if (!reload_completed)
3345 aarch64_layout_frame ();
3347 return cfun->machine->frame.frame_size == 0;
3350 /* Generate the epilogue instructions for returning from a function.
3351 This is almost exactly the reverse of the prologue sequence, except
3352 that we need to insert barriers to avoid scheduling loads that read
3353 from a deallocated stack, and we optimize the unwind records by
3354 emitting them all together if possible. */
3356 aarch64_expand_epilogue (bool for_sibcall)
3358 aarch64_layout_frame ();
3360 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3361 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3362 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3363 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3364 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3365 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3369 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3370 bool need_barrier_p = (get_frame_size ()
3371 + cfun->machine->frame.saved_varargs_size) != 0;
3373 /* Emit a barrier to prevent loads from a deallocated stack. */
3374 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3375 || crtl->calls_eh_return)
3377 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3378 need_barrier_p = false;
3381 /* Restore the stack pointer from the frame pointer if it may not
3382 be the same as the stack pointer. */
3383 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3385 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3386 hard_frame_pointer_rtx,
3387 GEN_INT (-callee_offset)));
3388 /* If writeback is used when restoring callee-saves, the CFA
3389 is restored on the instruction doing the writeback. */
3390 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3393 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3395 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3396 callee_adjust != 0, &cfi_ops);
3397 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3398 callee_adjust != 0, &cfi_ops);
3401 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3403 if (callee_adjust != 0)
3404 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3406 if (callee_adjust != 0 || initial_adjust > 65536)
3408 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3409 insn = get_last_insn ();
3410 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3411 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3412 RTX_FRAME_RELATED_P (insn) = 1;
3416 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3420 /* Emit delayed restores and reset the CFA to be SP. */
3421 insn = get_last_insn ();
3422 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3423 REG_NOTES (insn) = cfi_ops;
3424 RTX_FRAME_RELATED_P (insn) = 1;
3427 /* Stack adjustment for exception handler. */
3428 if (crtl->calls_eh_return)
3430 /* We need to unwind the stack by the offset computed by
3431 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3432 to be SP; letting the CFA move during this adjustment
3433 is just as correct as retaining the CFA from the body
3434 of the function. Therefore, do nothing special. */
3435 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3438 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3440 emit_jump_insn (ret_rtx);
3443 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3444 normally or return to a previous frame after unwinding.
3446 An EH return uses a single shared return sequence. The epilogue is
3447 exactly like a normal epilogue except that it has an extra input
3448 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3449 that must be applied after the frame has been destroyed. An extra label
3450 is inserted before the epilogue which initializes this register to zero,
3451 and this is the entry point for a normal return.
3453 An actual EH return updates the return address, initializes the stack
3454 adjustment and jumps directly into the epilogue (bypassing the zeroing
3455 of the adjustment). Since the return address is typically saved on the
3456 stack when a function makes a call, the saved LR must be updated outside
3459 This poses problems as the store is generated well before the epilogue,
3460 so the offset of LR is not known yet. Also optimizations will remove the
3461 store as it appears dead, even after the epilogue is generated (as the
3462 base or offset for loading LR is different in many cases).
3464 To avoid these problems this implementation forces the frame pointer
3465 in eh_return functions so that the location of LR is fixed and known early.
3466 It also marks the store volatile, so no optimization is permitted to
3467 remove the store. */
3469 aarch64_eh_return_handler_rtx (void)
3471 rtx tmp = gen_frame_mem (Pmode,
3472 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3474 /* Mark the store volatile, so no optimization is permitted to remove it. */
3475 MEM_VOLATILE_P (tmp) = true;
3479 /* Output code to add DELTA to the first argument, and then jump
3480 to FUNCTION. Used for C++ multiple inheritance. */
3482 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3483 HOST_WIDE_INT delta,
3484 HOST_WIDE_INT vcall_offset,
3487 /* The this pointer is always in x0. Note that this differs from
3488 Arm, where the this pointer may be bumped to r1 if r0 is required
3489 to return a pointer to an aggregate. On AArch64 a result value
3490 pointer will be in x8. */
3491 int this_regno = R0_REGNUM;
3492 rtx this_rtx, temp0, temp1, addr, funexp;
3495 reload_completed = 1;
3496 emit_note (NOTE_INSN_PROLOGUE_END);
3498 if (vcall_offset == 0)
3499 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3502 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3504 this_rtx = gen_rtx_REG (Pmode, this_regno);
3505 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3506 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3511 if (delta >= -256 && delta < 256)
3512 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3513 plus_constant (Pmode, this_rtx, delta));
3515 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3518 if (Pmode == ptr_mode)
3519 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3521 aarch64_emit_move (temp0,
3522 gen_rtx_ZERO_EXTEND (Pmode,
3523 gen_rtx_MEM (ptr_mode, addr)));
3525 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3526 addr = plus_constant (Pmode, temp0, vcall_offset);
3529 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3531 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3534 if (Pmode == ptr_mode)
3535 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3537 aarch64_emit_move (temp1,
3538 gen_rtx_SIGN_EXTEND (Pmode,
3539 gen_rtx_MEM (ptr_mode, addr)));
3541 emit_insn (gen_add2_insn (this_rtx, temp1));
3544 /* Generate a tail call to the target function. */
3545 if (!TREE_USED (function))
3547 assemble_external (function);
3548 TREE_USED (function) = 1;
3550 funexp = XEXP (DECL_RTL (function), 0);
3551 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3552 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3553 SIBLING_CALL_P (insn) = 1;
3555 insn = get_insns ();
3556 shorten_branches (insn);
3557 final_start_function (insn, file, 1);
3558 final (insn, file, 1);
3559 final_end_function ();
3561 /* Stop pretending to be a post-reload pass. */
3562 reload_completed = 0;
3566 aarch64_tls_referenced_p (rtx x)
3568 if (!TARGET_HAVE_TLS)
3570 subrtx_iterator::array_type array;
3571 FOR_EACH_SUBRTX (iter, array, x, ALL)
3573 const_rtx x = *iter;
3574 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3576 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3577 TLS offsets, not real symbol references. */
3578 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3579 iter.skip_subrtxes ();
3585 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3586 a left shift of 0 or 12 bits. */
3588 aarch64_uimm12_shift (HOST_WIDE_INT val)
3590 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3591 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3596 /* Return true if val is an immediate that can be loaded into a
3597 register by a MOVZ instruction. */
3599 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3601 if (GET_MODE_SIZE (mode) > 4)
3603 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3604 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3609 /* Ignore sign extension. */
3610 val &= (HOST_WIDE_INT) 0xffffffff;
3612 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3613 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
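/* For example (illustrative values): 0x12340000 has all of its set bits
   within a single 16-bit field at position 16 and so can be loaded with
   one MOVZ, while 0x12340001 spans two 16-bit fields and cannot.  */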
3616 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3618 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3620 0x0000000100000001ull,
3621 0x0001000100010001ull,
3622 0x0101010101010101ull,
3623 0x1111111111111111ull,
3624 0x5555555555555555ull,
3628 /* Return true if val is a valid bitmask immediate. */
3631 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3633 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3636 /* Check for a single sequence of one bits and return quickly if so.
3637 The special cases of all ones and all zeroes return false. */
3638 val = (unsigned HOST_WIDE_INT) val_in;
3639 tmp = val + (val & -val);
3641 if (tmp == (tmp & -tmp))
3642 return (val + 1) > 1;
3644 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3646 val = (val << 32) | (val & 0xffffffff);
3648 /* Invert if the immediate doesn't start with a zero bit - this means we
3649 only need to search for sequences of one bits. */
3653 /* Find the first set bit and set tmp to val with the first sequence of one
3654 bits removed. Return success if there is a single sequence of ones. */
3655 first_one = val & -val;
3656 tmp = val & (val + first_one);
3661 /* Find the next set bit and compute the difference in bit position. */
3662 next_one = tmp & -tmp;
3663 bits = clz_hwi (first_one) - clz_hwi (next_one);
3666 /* Check the bit position difference is a power of 2, and that the first
3667 sequence of one bits fits within 'bits' bits. */
3668 if ((mask >> bits) != 0 || bits != (bits & -bits))
3671 /* Check the sequence of one bits is repeated 64/bits times. */
3672 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
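/* A worked example of the check above (illustrative): for
   val = 0x00ff00ff00ff00ff the low bit is set, so the value is first
   inverted to 0xff00ff00ff00ff00.  The first run of ones starts at bit 8
   and the next at bit 24, so bits = 16 (a power of two) and the first
   run, mask = 0xff00, fits within those 16 bits.  Multiplying mask by
   bitmask_imm_mul[1] (0x0001000100010001) reproduces the inverted value,
   so the original constant is a valid bitmask immediate.  */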
3676 /* Return true if val is an immediate that can be loaded into a
3677 register in a single instruction. */
3679 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3681 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3683 return aarch64_bitmask_imm (val, mode);
3687 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3691 if (GET_CODE (x) == HIGH)
3694 split_const (x, &base, &offset);
3695 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3697 if (aarch64_classify_symbol (base, offset)
3698 != SYMBOL_FORCE_TO_MEM)
3701 /* Avoid generating a 64-bit relocation in ILP32; leave
3702 it to aarch64_expand_mov_immediate to handle properly. */
3703 return mode != ptr_mode;
3706 return aarch64_tls_referenced_p (x);
3709 /* Implement TARGET_CASE_VALUES_THRESHOLD.
3710 The expansion for a table switch is quite expensive due to the number
3711 of instructions, the table lookup and the hard-to-predict indirect jump.
3712 When optimizing for speed with -O3 enabled, use the per-core tuning if
3713 set; otherwise use tables for more than 16 cases as a tradeoff between
3714 size and performance.  When optimizing for size, use the default setting. */
3717 aarch64_case_values_threshold (void)
3719 /* Use the specified limit for the number of cases before using jump
3720 tables at higher optimization levels. */
3722 && selected_cpu->tune->max_case_values != 0)
3723 return selected_cpu->tune->max_case_values;
3725 return optimize_size ? default_case_values_threshold () : 17;
3728 /* Return true if register REGNO is a valid index register.
3729 STRICT_P is true if REG_OK_STRICT is in effect. */
3732 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3734 if (!HARD_REGISTER_NUM_P (regno))
3742 regno = reg_renumber[regno];
3744 return GP_REGNUM_P (regno);
3747 /* Return true if register REGNO is a valid base register for mode MODE.
3748 STRICT_P is true if REG_OK_STRICT is in effect. */
3751 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3753 if (!HARD_REGISTER_NUM_P (regno))
3761 regno = reg_renumber[regno];
3764 /* The fake registers will be eliminated to either the stack or
3765 hard frame pointer, both of which are usually valid base registers.
3766 Reload deals with the cases where the eliminated form isn't valid. */
3767 return (GP_REGNUM_P (regno)
3768 || regno == SP_REGNUM
3769 || regno == FRAME_POINTER_REGNUM
3770 || regno == ARG_POINTER_REGNUM);
3773 /* Return true if X is a valid base register for mode MODE.
3774 STRICT_P is true if REG_OK_STRICT is in effect. */
3777 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3779 if (!strict_p && GET_CODE (x) == SUBREG)
3782 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3785 /* Return true if address offset is a valid index. If it is, fill in INFO
3786 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3789 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3790 machine_mode mode, bool strict_p)
3792 enum aarch64_address_type type;
3797 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3798 && GET_MODE (x) == Pmode)
3800 type = ADDRESS_REG_REG;
3804 /* (sign_extend:DI (reg:SI)) */
3805 else if ((GET_CODE (x) == SIGN_EXTEND
3806 || GET_CODE (x) == ZERO_EXTEND)
3807 && GET_MODE (x) == DImode
3808 && GET_MODE (XEXP (x, 0)) == SImode)
3810 type = (GET_CODE (x) == SIGN_EXTEND)
3811 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3812 index = XEXP (x, 0);
3815 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3816 else if (GET_CODE (x) == MULT
3817 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3818 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3819 && GET_MODE (XEXP (x, 0)) == DImode
3820 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3821 && CONST_INT_P (XEXP (x, 1)))
3823 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3824 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3825 index = XEXP (XEXP (x, 0), 0);
3826 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3828 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3829 else if (GET_CODE (x) == ASHIFT
3830 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3831 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3832 && GET_MODE (XEXP (x, 0)) == DImode
3833 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3834 && CONST_INT_P (XEXP (x, 1)))
3836 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3837 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3838 index = XEXP (XEXP (x, 0), 0);
3839 shift = INTVAL (XEXP (x, 1));
3841 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3842 else if ((GET_CODE (x) == SIGN_EXTRACT
3843 || GET_CODE (x) == ZERO_EXTRACT)
3844 && GET_MODE (x) == DImode
3845 && GET_CODE (XEXP (x, 0)) == MULT
3846 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3847 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3849 type = (GET_CODE (x) == SIGN_EXTRACT)
3850 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3851 index = XEXP (XEXP (x, 0), 0);
3852 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3853 if (INTVAL (XEXP (x, 1)) != 32 + shift
3854 || INTVAL (XEXP (x, 2)) != 0)
3857 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3858 (const_int 0xffffffff<<shift)) */
3859 else if (GET_CODE (x) == AND
3860 && GET_MODE (x) == DImode
3861 && GET_CODE (XEXP (x, 0)) == MULT
3862 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3863 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3864 && CONST_INT_P (XEXP (x, 1)))
3866 type = ADDRESS_REG_UXTW;
3867 index = XEXP (XEXP (x, 0), 0);
3868 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3869 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3872 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3873 else if ((GET_CODE (x) == SIGN_EXTRACT
3874 || GET_CODE (x) == ZERO_EXTRACT)
3875 && GET_MODE (x) == DImode
3876 && GET_CODE (XEXP (x, 0)) == ASHIFT
3877 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3878 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3880 type = (GET_CODE (x) == SIGN_EXTRACT)
3881 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3882 index = XEXP (XEXP (x, 0), 0);
3883 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3884 if (INTVAL (XEXP (x, 1)) != 32 + shift
3885 || INTVAL (XEXP (x, 2)) != 0)
3888 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3889 (const_int 0xffffffff<<shift)) */
3890 else if (GET_CODE (x) == AND
3891 && GET_MODE (x) == DImode
3892 && GET_CODE (XEXP (x, 0)) == ASHIFT
3893 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3894 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3895 && CONST_INT_P (XEXP (x, 1)))
3897 type = ADDRESS_REG_UXTW;
3898 index = XEXP (XEXP (x, 0), 0);
3899 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3900 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3903 /* (mult:P (reg:P) (const_int scale)) */
3904 else if (GET_CODE (x) == MULT
3905 && GET_MODE (x) == Pmode
3906 && GET_MODE (XEXP (x, 0)) == Pmode
3907 && CONST_INT_P (XEXP (x, 1)))
3909 type = ADDRESS_REG_REG;
3910 index = XEXP (x, 0);
3911 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3913 /* (ashift:P (reg:P) (const_int shift)) */
3914 else if (GET_CODE (x) == ASHIFT
3915 && GET_MODE (x) == Pmode
3916 && GET_MODE (XEXP (x, 0)) == Pmode
3917 && CONST_INT_P (XEXP (x, 1)))
3919 type = ADDRESS_REG_REG;
3920 index = XEXP (x, 0);
3921 shift = INTVAL (XEXP (x, 1));
3926 if (GET_CODE (index) == SUBREG)
3927 index = SUBREG_REG (index);
3930 (shift > 0 && shift <= 3
3931 && (1 << shift) == GET_MODE_SIZE (mode)))
3933 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3936 info->offset = index;
3937 info->shift = shift;
3945 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3947 return (offset >= -64 * GET_MODE_SIZE (mode)
3948 && offset < 64 * GET_MODE_SIZE (mode)
3949 && offset % GET_MODE_SIZE (mode) == 0);
3953 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3954 HOST_WIDE_INT offset)
3956 return offset >= -256 && offset < 256;
3960 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3963 && offset < 4096 * GET_MODE_SIZE (mode)
3964 && offset % GET_MODE_SIZE (mode) == 0);
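/* For a DImode (8-byte) access, the three helpers above accept,
   illustratively: a 7-bit signed scaled offset in [-512, 504] in steps
   of 8, a 9-bit signed unscaled offset in [-256, 255], and a 12-bit
   unsigned scaled offset in [0, 32760] in steps of 8.  */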
3967 /* Return true if MODE is one of the modes for which we
3968 support LDP/STP operations. */
3971 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
3973 return mode == SImode || mode == DImode
3974 || mode == SFmode || mode == DFmode
3975 || (aarch64_vector_mode_supported_p (mode)
3976 && GET_MODE_SIZE (mode) == 8);
3979 /* Return true if REGNO is a virtual pointer register, or an eliminable
3980 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
3981 include stack_pointer or hard_frame_pointer. */
3983 virt_or_elim_regno_p (unsigned regno)
3985 return ((regno >= FIRST_VIRTUAL_REGISTER
3986 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
3987 || regno == FRAME_POINTER_REGNUM
3988 || regno == ARG_POINTER_REGNUM);
3991 /* Return true if X is a valid address for machine mode MODE. If it is,
3992 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3993 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3996 aarch64_classify_address (struct aarch64_address_info *info,
3997 rtx x, machine_mode mode,
3998 RTX_CODE outer_code, bool strict_p)
4000 enum rtx_code code = GET_CODE (x);
4003 /* On BE, we use load/store pair for all large int mode load/stores. */
4004 bool load_store_pair_p = (outer_code == PARALLEL
4005 || (BYTES_BIG_ENDIAN
4006 && aarch64_vect_struct_mode_p (mode)));
4008 bool allow_reg_index_p =
4010 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4011 && !aarch64_vect_struct_mode_p (mode);
4013 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4015 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4016 && (code != POST_INC && code != REG))
4023 info->type = ADDRESS_REG_IMM;
4025 info->offset = const0_rtx;
4026 return aarch64_base_register_rtx_p (x, strict_p);
4034 && virt_or_elim_regno_p (REGNO (op0))
4035 && CONST_INT_P (op1))
4037 info->type = ADDRESS_REG_IMM;
4044 if (GET_MODE_SIZE (mode) != 0
4045 && CONST_INT_P (op1)
4046 && aarch64_base_register_rtx_p (op0, strict_p))
4048 HOST_WIDE_INT offset = INTVAL (op1);
4050 info->type = ADDRESS_REG_IMM;
4054 /* TImode and TFmode values are allowed in both pairs of X
4055 registers and individual Q registers. The available
4057 X,X: 7-bit signed scaled offset
4058 Q: 9-bit signed offset
4059 We conservatively require an offset representable in either mode.
4060 When performing the check for pairs of X registers, i.e. LDP/STP,
4061 pass down DImode since that is the natural size of the LDP/STP
4062 instruction memory accesses. */
4063 if (mode == TImode || mode == TFmode)
4064 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4065 && offset_9bit_signed_unscaled_p (mode, offset));
4067 /* A 7-bit offset check because OImode will emit an ldp/stp
4068 instruction (only big endian will get here).
4069 For ldp/stp instructions, the offset is scaled for the size of a
4070 single element of the pair. */
4072 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4074 /* Three 9/12-bit offset checks because CImode will emit three
4075 ldr/str instructions (only big endian will get here). */
4077 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4078 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4079 || offset_12bit_unsigned_scaled_p (V16QImode,
4082 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4083 instructions (only big endian will get here). */
4085 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4086 && aarch64_offset_7bit_signed_scaled_p (TImode,
4089 if (load_store_pair_p)
4090 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4091 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4093 return (offset_9bit_signed_unscaled_p (mode, offset)
4094 || offset_12bit_unsigned_scaled_p (mode, offset));
4097 if (allow_reg_index_p)
4099 /* Look for base + (scaled/extended) index register. */
4100 if (aarch64_base_register_rtx_p (op0, strict_p)
4101 && aarch64_classify_index (info, op1, mode, strict_p))
4106 if (aarch64_base_register_rtx_p (op1, strict_p)
4107 && aarch64_classify_index (info, op0, mode, strict_p))
4120 info->type = ADDRESS_REG_WB;
4121 info->base = XEXP (x, 0);
4122 info->offset = NULL_RTX;
4123 return aarch64_base_register_rtx_p (info->base, strict_p);
4127 info->type = ADDRESS_REG_WB;
4128 info->base = XEXP (x, 0);
4129 if (GET_CODE (XEXP (x, 1)) == PLUS
4130 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4131 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4132 && aarch64_base_register_rtx_p (info->base, strict_p))
4134 HOST_WIDE_INT offset;
4135 info->offset = XEXP (XEXP (x, 1), 1);
4136 offset = INTVAL (info->offset);
4138 /* TImode and TFmode values are allowed in both pairs of X
4139 registers and individual Q registers. The available
4141 X,X: 7-bit signed scaled offset
4142 Q: 9-bit signed offset
4143 We conservatively require an offset representable in either mode.
4145 if (mode == TImode || mode == TFmode)
4146 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4147 && offset_9bit_signed_unscaled_p (mode, offset));
4149 if (load_store_pair_p)
4150 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4151 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4153 return offset_9bit_signed_unscaled_p (mode, offset);
4160 /* load literal: pc-relative constant pool entry. Only supported
4161 for SI mode or larger. */
4162 info->type = ADDRESS_SYMBOLIC;
4164 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4168 split_const (x, &sym, &addend);
4169 return ((GET_CODE (sym) == LABEL_REF
4170 || (GET_CODE (sym) == SYMBOL_REF
4171 && CONSTANT_POOL_ADDRESS_P (sym)
4172 && aarch64_pcrelative_literal_loads)));
4177 info->type = ADDRESS_LO_SUM;
4178 info->base = XEXP (x, 0);
4179 info->offset = XEXP (x, 1);
4180 if (allow_reg_index_p
4181 && aarch64_base_register_rtx_p (info->base, strict_p))
4184 split_const (info->offset, &sym, &offs);
4185 if (GET_CODE (sym) == SYMBOL_REF
4186 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4188 /* The symbol and offset must be aligned to the access size. */
4190 unsigned int ref_size;
4192 if (CONSTANT_POOL_ADDRESS_P (sym))
4193 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4194 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4196 tree exp = SYMBOL_REF_DECL (sym);
4197 align = TYPE_ALIGN (TREE_TYPE (exp));
4198 align = CONSTANT_ALIGNMENT (exp, align);
4200 else if (SYMBOL_REF_DECL (sym))
4201 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4202 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4203 && SYMBOL_REF_BLOCK (sym) != NULL)
4204 align = SYMBOL_REF_BLOCK (sym)->alignment;
4206 align = BITS_PER_UNIT;
4208 ref_size = GET_MODE_SIZE (mode);
4210 ref_size = GET_MODE_SIZE (DImode);
4212 return ((INTVAL (offs) & (ref_size - 1)) == 0
4213 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4224 aarch64_symbolic_address_p (rtx x)
4228 split_const (x, &x, &offset);
4229 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4232 /* Classify the base of symbolic expression X. */
4234 enum aarch64_symbol_type
4235 aarch64_classify_symbolic_expression (rtx x)
4239 split_const (x, &x, &offset);
4240 return aarch64_classify_symbol (x, offset);
4244 /* Return TRUE if X is a legitimate address for accessing memory in
4247 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4249 struct aarch64_address_info addr;
4251 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4254 /* Return TRUE if X is a legitimate address for accessing memory in
4255 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4258 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4259 RTX_CODE outer_code, bool strict_p)
4261 struct aarch64_address_info addr;
4263 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4266 /* Split an out-of-range address displacement into a base and offset.
4267 Use a 4KB range for 1- and 2-byte accesses and a 16KB range otherwise,
4268 to increase opportunities for sharing the base address across accesses of different sizes.
4269 For TI/TFmode and unaligned accesses use a 256-byte range. */
4271 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4273 HOST_WIDE_INT mask = GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3fff;
4275 if (mode == TImode || mode == TFmode ||
4276 (INTVAL (*disp) & (GET_MODE_SIZE (mode) - 1)) != 0)
4279 *off = GEN_INT (INTVAL (*disp) & ~mask);
4280 *disp = GEN_INT (INTVAL (*disp) & mask);
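/* For example (an illustrative SImode access): a displacement of 0x10010
   is word-aligned, so the 16KB mask 0x3fff applies and the split leaves
   an anchor of 0x10000 in *OFF with a residual displacement of 0x10 in
   *DISP.  */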
4284 /* Return TRUE if rtx X is the immediate constant 0.0. */
4286 aarch64_float_const_zero_rtx_p (rtx x)
4288 if (GET_MODE (x) == VOIDmode)
4291 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4292 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4293 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4296 /* Return the fixed registers used for condition codes. */
4299 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4302 *p2 = INVALID_REGNUM;
4306 /* Emit call insn with PAT and do aarch64-specific handling. */
4309 aarch64_emit_call_insn (rtx pat)
4311 rtx insn = emit_call_insn (pat);
4313 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4314 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4315 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4319 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4321 /* All floating point compares return CCFP if it is an equality
4322 comparison, and CCFPE otherwise. */
4323 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4350 /* Equality comparisons of short modes against zero can be performed
4351 using the TST instruction with the appropriate bitmask. */
4352 if (y == const0_rtx && REG_P (x)
4353 && (code == EQ || code == NE)
4354 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4357 /* Similarly, comparisons of zero_extends from shorter modes can
4358 be performed using an ANDS with an immediate mask. */
4359 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4360 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4361 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4362 && (code == EQ || code == NE))
4365 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4367 && (code == EQ || code == NE || code == LT || code == GE)
4368 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4369 || GET_CODE (x) == NEG
4370 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4371 && CONST_INT_P (XEXP (x, 2)))))
4374 /* A compare with a shifted operand. Because of canonicalization,
4375 the comparison will have to be swapped when we emit the assembly
4377 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4378 && (REG_P (y) || GET_CODE (y) == SUBREG)
4379 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4380 || GET_CODE (x) == LSHIFTRT
4381 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4384 /* Similarly for a negated operand, but we can only do this for
4386 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4387 && (REG_P (y) || GET_CODE (y) == SUBREG)
4388 && (code == EQ || code == NE)
4389 && GET_CODE (x) == NEG)
4392 /* A test for unsigned overflow. */
4393 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4395 && GET_CODE (x) == PLUS
4396 && GET_CODE (y) == ZERO_EXTEND)
4399 /* For everything else, return CCmode. */
4404 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4407 aarch64_get_condition_code (rtx x)
4409 machine_mode mode = GET_MODE (XEXP (x, 0));
4410 enum rtx_code comp_code = GET_CODE (x);
4412 if (GET_MODE_CLASS (mode) != MODE_CC)
4413 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4414 return aarch64_get_condition_code_1 (mode, comp_code);
4418 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4426 case GE: return AARCH64_GE;
4427 case GT: return AARCH64_GT;
4428 case LE: return AARCH64_LS;
4429 case LT: return AARCH64_MI;
4430 case NE: return AARCH64_NE;
4431 case EQ: return AARCH64_EQ;
4432 case ORDERED: return AARCH64_VC;
4433 case UNORDERED: return AARCH64_VS;
4434 case UNLT: return AARCH64_LT;
4435 case UNLE: return AARCH64_LE;
4436 case UNGT: return AARCH64_HI;
4437 case UNGE: return AARCH64_PL;
4445 case NE: return AARCH64_NE;
4446 case EQ: return AARCH64_EQ;
4447 case GE: return AARCH64_GE;
4448 case GT: return AARCH64_GT;
4449 case LE: return AARCH64_LE;
4450 case LT: return AARCH64_LT;
4451 case GEU: return AARCH64_CS;
4452 case GTU: return AARCH64_HI;
4453 case LEU: return AARCH64_LS;
4454 case LTU: return AARCH64_CC;
4462 case NE: return AARCH64_NE;
4463 case EQ: return AARCH64_EQ;
4464 case GE: return AARCH64_LE;
4465 case GT: return AARCH64_LT;
4466 case LE: return AARCH64_GE;
4467 case LT: return AARCH64_GT;
4468 case GEU: return AARCH64_LS;
4469 case GTU: return AARCH64_CC;
4470 case LEU: return AARCH64_CS;
4471 case LTU: return AARCH64_HI;
4479 case NE: return AARCH64_NE;
4480 case EQ: return AARCH64_EQ;
4481 case GE: return AARCH64_PL;
4482 case LT: return AARCH64_MI;
4490 case NE: return AARCH64_NE;
4491 case EQ: return AARCH64_EQ;
4499 case NE: return AARCH64_CS;
4500 case EQ: return AARCH64_CC;
4514 aarch64_const_vec_all_same_in_range_p (rtx x,
4515 HOST_WIDE_INT minval,
4516 HOST_WIDE_INT maxval)
4518 HOST_WIDE_INT firstval;
4521 if (GET_CODE (x) != CONST_VECTOR
4522 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4525 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4526 if (firstval < minval || firstval > maxval)
4529 count = CONST_VECTOR_NUNITS (x);
4530 for (i = 1; i < count; i++)
4531 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4538 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4540 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4545 #define AARCH64_CC_V 1
4546 #define AARCH64_CC_C (1 << 1)
4547 #define AARCH64_CC_Z (1 << 2)
4548 #define AARCH64_CC_N (1 << 3)
4550 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
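/* Each entry below gives flag settings under which the annotated condition
   is false; e.g. the NE entry is AARCH64_CC_Z, since with Z set an NE test
   fails.  These are the values used as the NZCV immediate of a conditional
   compare (ccmp) instruction.  */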
4551 static const int aarch64_nzcv_codes[] =
4553 0, /* EQ, Z == 1. */
4554 AARCH64_CC_Z, /* NE, Z == 0. */
4555 0, /* CS, C == 1. */
4556 AARCH64_CC_C, /* CC, C == 0. */
4557 0, /* MI, N == 1. */
4558 AARCH64_CC_N, /* PL, N == 0. */
4559 0, /* VS, V == 1. */
4560 AARCH64_CC_V, /* VC, V == 0. */
4561 0, /* HI, C == 1 && Z == 0. */
4562 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4563 AARCH64_CC_V, /* GE, N == V. */
4564 0, /* LT, N != V. */
4565 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4566 0, /* LE, !(Z == 0 && N == V). */
4572 aarch64_print_operand (FILE *f, rtx x, int code)
4576 /* An integer or symbol address without a preceding # sign. */
4578 switch (GET_CODE (x))
4581 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4585 output_addr_const (f, x);
4589 if (GET_CODE (XEXP (x, 0)) == PLUS
4590 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4592 output_addr_const (f, x);
4598 output_operand_lossage ("Unsupported operand for code '%c'", code);
4603 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4607 if (!CONST_INT_P (x)
4608 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4610 output_operand_lossage ("invalid operand for '%%%c'", code);
4626 output_operand_lossage ("invalid operand for '%%%c'", code);
4636 /* Print N such that 2^N == X. */
4637 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4639 output_operand_lossage ("invalid operand for '%%%c'", code);
4643 asm_fprintf (f, "%d", n);
4648 /* Print the number of non-zero bits in X (a const_int). */
4649 if (!CONST_INT_P (x))
4651 output_operand_lossage ("invalid operand for '%%%c'", code);
4655 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4659 if (x == const0_rtx)
4661 asm_fprintf (f, "xzr");
4664 /* Print the higher numbered register of a pair (TImode) of regs. */
4665 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4667 output_operand_lossage ("invalid operand for '%%%c'", code);
4671 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4678 /* Print a condition (eq, ne, etc) or its inverse. */
4680 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
4681 if (x == const_true_rtx)
4688 if (!COMPARISON_P (x))
4690 output_operand_lossage ("invalid operand for '%%%c'", code);
4694 cond_code = aarch64_get_condition_code (x);
4695 gcc_assert (cond_code >= 0);
4697 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
4698 fputs (aarch64_condition_codes[cond_code], f);
4707 /* Print a scalar FP/SIMD register name. */
4708 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4710 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4713 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4720 /* Print the first FP/SIMD register name in a list. */
4721 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4723 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4726 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4730 /* Print a scalar FP/SIMD register name + 1. */
4731 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4733 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4736 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4740 /* Print bottom 16 bits of integer constant in hex. */
4741 if (!CONST_INT_P (x))
4743 output_operand_lossage ("invalid operand for '%%%c'", code);
4746 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4751 /* Print a general register name or the zero register (32-bit or
4754 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4756 asm_fprintf (f, "%czr", code);
4760 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4762 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4766 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4768 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4775 /* Print a normal operand, if it's a general register, then we
4779 output_operand_lossage ("missing operand");
4783 switch (GET_CODE (x))
4786 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4790 output_address (GET_MODE (x), XEXP (x, 0));
4796 output_addr_const (asm_out_file, x);
4800 asm_fprintf (f, "%wd", INTVAL (x));
4804 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4807 aarch64_const_vec_all_same_in_range_p (x,
4809 HOST_WIDE_INT_MAX));
4810 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4812 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4821 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
4822 be getting CONST_DOUBLEs holding integers. */
4823 gcc_assert (GET_MODE (x) != VOIDmode);
4824 if (aarch64_float_const_zero_rtx_p (x))
4829 else if (aarch64_float_const_representable_p (x))
4832 char float_buf[buf_size] = {'\0'};
4833 real_to_decimal_for_mode (float_buf,
4834 CONST_DOUBLE_REAL_VALUE (x),
4837 asm_fprintf (asm_out_file, "%s", float_buf);
4841 output_operand_lossage ("invalid constant");
4844 output_operand_lossage ("invalid operand");
4850 if (GET_CODE (x) == HIGH)
4853 switch (aarch64_classify_symbolic_expression (x))
4855 case SYMBOL_SMALL_GOT_4G:
4856 asm_fprintf (asm_out_file, ":got:");
4859 case SYMBOL_SMALL_TLSGD:
4860 asm_fprintf (asm_out_file, ":tlsgd:");
4863 case SYMBOL_SMALL_TLSDESC:
4864 asm_fprintf (asm_out_file, ":tlsdesc:");
4867 case SYMBOL_SMALL_TLSIE:
4868 asm_fprintf (asm_out_file, ":gottprel:");
4871 case SYMBOL_TLSLE24:
4872 asm_fprintf (asm_out_file, ":tprel:");
4875 case SYMBOL_TINY_GOT:
4882 output_addr_const (asm_out_file, x);
4886 switch (aarch64_classify_symbolic_expression (x))
4888 case SYMBOL_SMALL_GOT_4G:
4889 asm_fprintf (asm_out_file, ":lo12:");
4892 case SYMBOL_SMALL_TLSGD:
4893 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4896 case SYMBOL_SMALL_TLSDESC:
4897 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4900 case SYMBOL_SMALL_TLSIE:
4901 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4904 case SYMBOL_TLSLE12:
4905 asm_fprintf (asm_out_file, ":tprel_lo12:");
4908 case SYMBOL_TLSLE24:
4909 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4912 case SYMBOL_TINY_GOT:
4913 asm_fprintf (asm_out_file, ":got:");
4916 case SYMBOL_TINY_TLSIE:
4917 asm_fprintf (asm_out_file, ":gottprel:");
4923 output_addr_const (asm_out_file, x);
4928 switch (aarch64_classify_symbolic_expression (x))
4930 case SYMBOL_TLSLE24:
4931 asm_fprintf (asm_out_file, ":tprel_hi12:");
4936 output_addr_const (asm_out_file, x);
4941 HOST_WIDE_INT cond_code;
4944 if (!CONST_INT_P (x))
4946 output_operand_lossage ("invalid operand for '%%%c'", code);
4950 cond_code = INTVAL (x);
4951 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
4952 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
4957 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4963 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
4965 struct aarch64_address_info addr;
4967 if (aarch64_classify_address (&addr, x, mode, MEM, true))
4970 case ADDRESS_REG_IMM:
4971 if (addr.offset == const0_rtx)
4972 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4974 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4975 INTVAL (addr.offset));
4978 case ADDRESS_REG_REG:
4979 if (addr.shift == 0)
4980 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4981 reg_names [REGNO (addr.offset)]);
4983 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4984 reg_names [REGNO (addr.offset)], addr.shift);
4987 case ADDRESS_REG_UXTW:
4988 if (addr.shift == 0)
4989 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4990 REGNO (addr.offset) - R0_REGNUM);
4992 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4993 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4996 case ADDRESS_REG_SXTW:
4997 if (addr.shift == 0)
4998 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4999 REGNO (addr.offset) - R0_REGNUM);
5001 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5002 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5005 case ADDRESS_REG_WB:
5006 switch (GET_CODE (x))
5009 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5010 GET_MODE_SIZE (mode));
5013 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5014 GET_MODE_SIZE (mode));
5017 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5018 GET_MODE_SIZE (mode));
5021 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5022 GET_MODE_SIZE (mode));
5025 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5026 INTVAL (addr.offset));
5029 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5030 INTVAL (addr.offset));
5037 case ADDRESS_LO_SUM:
5038 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5039 output_addr_const (f, addr.offset);
5040 asm_fprintf (f, "]");
5043 case ADDRESS_SYMBOLIC:
5047 output_addr_const (f, x);
5051 aarch64_label_mentioned_p (rtx x)
5056 if (GET_CODE (x) == LABEL_REF)
5059 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5060 referencing instruction, but they are constant offsets, not
5062 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5065 fmt = GET_RTX_FORMAT (GET_CODE (x));
5066 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5072 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5073 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5076 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5083 /* Implement REGNO_REG_CLASS. */
5086 aarch64_regno_regclass (unsigned regno)
5088 if (GP_REGNUM_P (regno))
5089 return GENERAL_REGS;
5091 if (regno == SP_REGNUM)
5094 if (regno == FRAME_POINTER_REGNUM
5095 || regno == ARG_POINTER_REGNUM)
5096 return POINTER_REGS;
5098 if (FP_REGNUM_P (regno))
5099 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5105 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5107 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5108 where mask is selected by alignment and size of the offset.
5109 We try to pick as large a range for the offset as possible to
5110 maximize the chance of a CSE. However, for aligned addresses
5111 we limit the range to 4k so that structures with different sized
5112 elements are likely to use the same base. We need to be careful
5113 not to split a CONST for some forms of address expression, otherwise
5114 it will generate sub-optimal code. */
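/* Illustrative example: for a word-sized access at X + 0x13004 the aligned
   case below picks base_offset == 0x13000, so we materialise
   Y = X + 0x13000 once and address [Y, #4]; neighbouring offsets from the
   same object can then reuse the base Y.  */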
5116 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5118 rtx base = XEXP (x, 0);
5119 rtx offset_rtx = XEXP (x, 1);
5120 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5122 if (GET_CODE (base) == PLUS)
5124 rtx op0 = XEXP (base, 0);
5125 rtx op1 = XEXP (base, 1);
5127 /* Force any scaling into a temp for CSE. */
5128 op0 = force_reg (Pmode, op0);
5129 op1 = force_reg (Pmode, op1);
5131 /* Let the pointer register be in op0. */
5132 if (REG_POINTER (op1))
5133 std::swap (op0, op1);
5135 /* If the pointer is virtual or frame related, then we know that
5136 virtual register instantiation or register elimination is going
5137 to apply a second constant. We want the two constants folded
5138 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5139 if (virt_or_elim_regno_p (REGNO (op0)))
5141 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5142 NULL_RTX, true, OPTAB_DIRECT);
5143 return gen_rtx_PLUS (Pmode, base, op1);
5146 /* Otherwise, in order to encourage CSE (and thence loop strength
5147 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5148 base = expand_binop (Pmode, add_optab, op0, op1,
5149 NULL_RTX, true, OPTAB_DIRECT);
5150 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5153 /* Does it look like we'll need a load/store-pair operation? */
5154 HOST_WIDE_INT base_offset;
5155 if (GET_MODE_SIZE (mode) > 16
5157 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
5158 & ~((128 * GET_MODE_SIZE (mode)) - 1));
5159 /* For offsets that aren't a multiple of the access size, the limit is
5161 else if (offset & (GET_MODE_SIZE (mode) - 1))
5162 base_offset = (offset + 0x100) & ~0x1ff;
5164 base_offset = offset & ~0xfff;
5166 if (base_offset != 0)
5168 base = plus_constant (Pmode, base, base_offset);
5169 base = force_operand (base, NULL_RTX);
5170 return plus_constant (Pmode, base, offset - base_offset);
5177 /* Return the reload icode required for a constant pool in mode. */
5178 static enum insn_code
5179 aarch64_constant_pool_reload_icode (machine_mode mode)
5184 return CODE_FOR_aarch64_reload_movcpsfdi;
5187 return CODE_FOR_aarch64_reload_movcpdfdi;
5190 return CODE_FOR_aarch64_reload_movcptfdi;
5193 return CODE_FOR_aarch64_reload_movcpv8qidi;
5196 return CODE_FOR_aarch64_reload_movcpv16qidi;
5199 return CODE_FOR_aarch64_reload_movcpv4hidi;
5202 return CODE_FOR_aarch64_reload_movcpv8hidi;
5205 return CODE_FOR_aarch64_reload_movcpv2sidi;
5208 return CODE_FOR_aarch64_reload_movcpv4sidi;
5211 return CODE_FOR_aarch64_reload_movcpv2didi;
5214 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5223 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5226 secondary_reload_info *sri)
5229 /* If we have to disable direct literal pool loads and stores because the
5230 function is too big, then we need a scratch register. */
5231 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5232 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5233 || targetm.vector_mode_supported_p (GET_MODE (x)))
5234 && !aarch64_pcrelative_literal_loads)
5236 sri->icode = aarch64_constant_pool_reload_icode (mode);
5240 /* Without the TARGET_SIMD instructions we cannot move a Q register
5241 to a Q register directly. We need a scratch. */
5242 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5243 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5244 && reg_class_subset_p (rclass, FP_REGS))
5247 sri->icode = CODE_FOR_aarch64_reload_movtf;
5248 else if (mode == TImode)
5249 sri->icode = CODE_FOR_aarch64_reload_movti;
5253 /* A TFmode or TImode memory access should be handled via FP_REGS
5254 because AArch64 has richer addressing modes for LDR/STR instructions
5255 than LDP/STP instructions. */
5256 if (TARGET_FLOAT && rclass == GENERAL_REGS
5257 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5260 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5261 return GENERAL_REGS;
5267 aarch64_can_eliminate (const int from, const int to)
5269 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5270 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5272 if (frame_pointer_needed)
5274 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5276 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5278 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5279 && !cfun->calls_alloca)
5281 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5288 /* If we decided that we didn't need a leaf frame pointer but then used
5289 LR in the function, then we'll want a frame pointer after all, so
5290 prevent this elimination to ensure a frame pointer is used. */
5291 if (to == STACK_POINTER_REGNUM
5292 && flag_omit_leaf_frame_pointer
5293 && df_regs_ever_live_p (LR_REGNUM))
5301 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5303 aarch64_layout_frame ();
5305 if (to == HARD_FRAME_POINTER_REGNUM)
5307 if (from == ARG_POINTER_REGNUM)
5308 return cfun->machine->frame.hard_fp_offset;
5310 if (from == FRAME_POINTER_REGNUM)
5311 return cfun->machine->frame.hard_fp_offset
5312 - cfun->machine->frame.locals_offset;
5315 if (to == STACK_POINTER_REGNUM)
5317 if (from == FRAME_POINTER_REGNUM)
5318 return cfun->machine->frame.frame_size
5319 - cfun->machine->frame.locals_offset;
5322 return cfun->machine->frame.frame_size;
5325 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5329 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5333 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5338 aarch64_asm_trampoline_template (FILE *f)
5342 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5343 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5347 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5348 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5350 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5351 assemble_aligned_integer (4, const0_rtx);
5352 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5353 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5357 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5359 rtx fnaddr, mem, a_tramp;
5360 const int tramp_code_sz = 16;
5362 /* Don't need to copy the trailing D-words, we fill those in below. */
5363 emit_block_move (m_tramp, assemble_trampoline_template (),
5364 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5365 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5366 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5367 if (GET_MODE (fnaddr) != ptr_mode)
5368 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5369 emit_move_insn (mem, fnaddr);
5371 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5372 emit_move_insn (mem, chain_value);
5374 /* XXX We should really define a "clear_cache" pattern and use
5375 gen_clear_cache(). */
5376 a_tramp = XEXP (m_tramp, 0);
5377 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5378 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5379 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5383 static unsigned char
5384 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5388 case CALLER_SAVE_REGS:
5395 aarch64_vector_mode_p (mode)
5396 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5397 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
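/* For example, V4SImode (16 bytes, a vector mode) needs a single vector
   register here, whereas TImode (also 16 bytes but not a vector mode)
   counts as two word-sized registers.  */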
5411 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5413 if (regclass == POINTER_REGS)
5414 return GENERAL_REGS;
5416 if (regclass == STACK_REG)
5419 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5425 /* If it's an integer immediate that MOVI can't handle, then
5426 FP_REGS is not an option, so we return NO_REGS instead. */
5427 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5428 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5431 /* Register elimination can result in a request for
5432 SP+constant->FP_REGS. We cannot support such operations which
5433 use SP as source and an FP_REG as destination, so reject out of hand.
5435 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5437 rtx lhs = XEXP (x, 0);
5439 /* Look through a possible SUBREG introduced by ILP32. */
5440 if (GET_CODE (lhs) == SUBREG)
5441 lhs = SUBREG_REG (lhs);
5443 gcc_assert (REG_P (lhs));
5444 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5453 aarch64_asm_output_labelref (FILE* f, const char *name)
5455 asm_fprintf (f, "%U%s", name);
5459 aarch64_elf_asm_constructor (rtx symbol, int priority)
5461 if (priority == DEFAULT_INIT_PRIORITY)
5462 default_ctor_section_asm_out_constructor (symbol, priority);
5466 /* While priority is known to be in range [0, 65535], so 18 bytes
5467 would be enough, the compiler might not know that. To avoid
5468 -Wformat-truncation false positive, use a larger size. */
5470 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5471 s = get_section (buf, SECTION_WRITE, NULL);
5472 switch_to_section (s);
5473 assemble_align (POINTER_SIZE);
5474 assemble_aligned_integer (POINTER_BYTES, symbol);
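/* For example, a constructor with priority 101 is placed in the section
   ".init_array.00101" (the "%.5u" above zero-pads the priority to five
   digits).  */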
5479 aarch64_elf_asm_destructor (rtx symbol, int priority)
5481 if (priority == DEFAULT_INIT_PRIORITY)
5482 default_dtor_section_asm_out_destructor (symbol, priority);
5486 /* While priority is known to be in range [0, 65535], so 18 bytes
5487 would be enough, the compiler might not know that. To avoid
5488 -Wformat-truncation false positive, use a larger size. */
5490 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5491 s = get_section (buf, SECTION_WRITE, NULL);
5492 switch_to_section (s);
5493 assemble_align (POINTER_SIZE);
5494 assemble_aligned_integer (POINTER_BYTES, symbol);
5499 aarch64_output_casesi (rtx *operands)
5503 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5505 static const char *const patterns[4][2] =
5508 "ldrb\t%w3, [%0,%w1,uxtw]",
5509 "add\t%3, %4, %w3, sxtb #2"
5512 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5513 "add\t%3, %4, %w3, sxth #2"
5516 "ldr\t%w3, [%0,%w1,uxtw #2]",
5517 "add\t%3, %4, %w3, sxtw #2"
5519 /* We assume that DImode is only generated when not optimizing and
5520 that we don't really need 64-bit address offsets. That would
5521 imply an object file with 8GB of code in a single function! */
5523 "ldr\t%w3, [%0,%w1,uxtw #2]",
5524 "add\t%3, %4, %w3, sxtw #2"
5528 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5530 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5532 gcc_assert (index >= 0 && index <= 3);
5534 /* Need to implement table size reduction, by changing the code below. */
5535 output_asm_insn (patterns[index][0], operands);
5536 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5537 snprintf (buf, sizeof (buf),
5538 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5539 output_asm_insn (buf, operands);
5540 output_asm_insn (patterns[index][1], operands);
5541 output_asm_insn ("br\t%3", operands);
5542 assemble_label (asm_out_file, label);
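/* Illustrative output for a halfword-entry table (index == 1), with the
   operand placeholders shown as concrete registers, is roughly:
	ldrh	w3, [x0, w1, uxtw #1]
	adr	x4, .LrtxN
	add	x3, x4, w3, sxth #2
	br	x3
   .LrtxN:
   where .LrtxN, emitted just after the br, is the base against which the
   table entries are relative.  */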
5547 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5548 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5552 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5554 if (shift >= 0 && shift <= 3)
5557 for (size = 8; size <= 32; size *= 2)
5559 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5560 if (mask == bits << shift)
5567 /* Constant pools are per function only when PC relative
5568 literal loads are true or we are in the large memory
5572 aarch64_can_use_per_function_literal_pools_p (void)
5574 return (aarch64_pcrelative_literal_loads
5575 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5579 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5581 /* Fixme:: In an ideal world this would work similarly
5582 to the logic in aarch64_select_rtx_section, but this
5583 breaks bootstrap in gccgo. For now we work around
5584 this by returning false here. */
5588 /* Select appropriate section for constants depending
5589 on where we place literal pools. */
5592 aarch64_select_rtx_section (machine_mode mode,
5594 unsigned HOST_WIDE_INT align)
5596 if (aarch64_can_use_per_function_literal_pools_p ())
5597 return function_section (current_function_decl);
5599 return default_elf_select_rtx_section (mode, x, align);
5602 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
5604 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
5605 HOST_WIDE_INT offset)
5607 /* When using per-function literal pools, we must ensure that any code
5608 section is aligned to the minimal instruction length, lest we get
5609 errors from the assembler re "unaligned instructions". */
5610 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
5611 ASM_OUTPUT_ALIGN (f, 2);
5616 /* Helper function for rtx cost calculation. Strip a shift expression
5617 from X. Returns the inner operand if successful, or the original
5618 expression on failure. */
5620 aarch64_strip_shift (rtx x)
5624 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5625 we can convert both to ROR during final output. */
5626 if ((GET_CODE (op) == ASHIFT
5627 || GET_CODE (op) == ASHIFTRT
5628 || GET_CODE (op) == LSHIFTRT
5629 || GET_CODE (op) == ROTATERT
5630 || GET_CODE (op) == ROTATE)
5631 && CONST_INT_P (XEXP (op, 1)))
5632 return XEXP (op, 0);
5634 if (GET_CODE (op) == MULT
5635 && CONST_INT_P (XEXP (op, 1))
5636 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5637 return XEXP (op, 0);
5642 /* Helper function for rtx cost calculation. Strip an extend
5643 expression from X. Returns the inner operand if successful, or the
5644 original expression on failure. We deal with a number of possible
5645 canonicalization variations here. */
5647 aarch64_strip_extend (rtx x)
5651 /* Zero and sign extraction of a widened value. */
5652 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5653 && XEXP (op, 2) == const0_rtx
5654 && GET_CODE (XEXP (op, 0)) == MULT
5655 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5657 return XEXP (XEXP (op, 0), 0);
5659 /* It can also be represented (for zero-extend) as an AND with an
5661 if (GET_CODE (op) == AND
5662 && GET_CODE (XEXP (op, 0)) == MULT
5663 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5664 && CONST_INT_P (XEXP (op, 1))
5665 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5666 INTVAL (XEXP (op, 1))) != 0)
5667 return XEXP (XEXP (op, 0), 0);
5669 /* Now handle extended register, as this may also have an optional
5670 left shift by 1..4. */
5671 if (GET_CODE (op) == ASHIFT
5672 && CONST_INT_P (XEXP (op, 1))
5673 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5676 if (GET_CODE (op) == ZERO_EXTEND
5677 || GET_CODE (op) == SIGN_EXTEND)
5686 /* Return true iff CODE is a shift supported in combination
5687 with arithmetic instructions. */
5690 aarch64_shift_p (enum rtx_code code)
5692 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5695 /* Helper function for rtx cost calculation. Calculate the cost of
5696 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5697 Return the calculated cost of the expression, recursing manually into
5698 operands where needed. */
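/* For example, (plus (mult (reg) (const_int 4)) (reg)) is costed below as
   an add with a shifted operand (LSL #2) rather than as a multiply-add,
   since the power-of-two multiply is canonicalized as a shift.  */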
5701 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5704 const struct cpu_cost_table *extra_cost
5705 = aarch64_tune_params.insn_extra_cost;
5707 bool compound_p = (outer == PLUS || outer == MINUS);
5708 machine_mode mode = GET_MODE (x);
5710 gcc_checking_assert (code == MULT);
5715 if (VECTOR_MODE_P (mode))
5716 mode = GET_MODE_INNER (mode);
5718 /* Integer multiply/fma. */
5719 if (GET_MODE_CLASS (mode) == MODE_INT)
5721 /* The multiply will be canonicalized as a shift, cost it as such. */
5722 if (aarch64_shift_p (GET_CODE (x))
5723 || (CONST_INT_P (op1)
5724 && exact_log2 (INTVAL (op1)) > 0))
5726 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5727 || GET_CODE (op0) == SIGN_EXTEND;
5733 /* ARITH + shift-by-register. */
5734 cost += extra_cost->alu.arith_shift_reg;
5736 /* ARITH + extended register. We don't have a cost field
5737 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5738 cost += extra_cost->alu.extend_arith;
5740 /* ARITH + shift-by-immediate. */
5741 cost += extra_cost->alu.arith_shift;
5744 /* LSL (immediate). */
5745 cost += extra_cost->alu.shift;
5748 /* Strip extends as we will have costed them in the case above. */
5750 op0 = aarch64_strip_extend (op0);
5752 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
5757 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5758 compound and let the below cases handle it. After all, MNEG is a
5759 special-case alias of MSUB. */
5760 if (GET_CODE (op0) == NEG)
5762 op0 = XEXP (op0, 0);
5766 /* Integer multiplies or FMAs have zero/sign extending variants. */
5767 if ((GET_CODE (op0) == ZERO_EXTEND
5768 && GET_CODE (op1) == ZERO_EXTEND)
5769 || (GET_CODE (op0) == SIGN_EXTEND
5770 && GET_CODE (op1) == SIGN_EXTEND))
5772 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
5773 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
5778 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5779 cost += extra_cost->mult[0].extend_add;
5781 /* MUL/SMULL/UMULL. */
5782 cost += extra_cost->mult[0].extend;
5788 /* This is either an integer multiply or a MADD. In both cases
5789 we want to recurse and cost the operands. */
5790 cost += rtx_cost (op0, mode, MULT, 0, speed);
5791 cost += rtx_cost (op1, mode, MULT, 1, speed);
5797 cost += extra_cost->mult[mode == DImode].add;
5800 cost += extra_cost->mult[mode == DImode].simple;
5809 /* Floating-point FMA/FMUL can also support negations of the
5810 operands, unless the rounding mode is upward or downward in
5811 which case FNMUL is different from FMUL with operand negation. */
5812 bool neg0 = GET_CODE (op0) == NEG;
5813 bool neg1 = GET_CODE (op1) == NEG;
5814 if (compound_p || !flag_rounding_math || (neg0 && neg1))
5817 op0 = XEXP (op0, 0);
5819 op1 = XEXP (op1, 0);
5823 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5824 cost += extra_cost->fp[mode == DFmode].fma;
5827 cost += extra_cost->fp[mode == DFmode].mult;
5830 cost += rtx_cost (op0, mode, MULT, 0, speed);
5831 cost += rtx_cost (op1, mode, MULT, 1, speed);
5837 aarch64_address_cost (rtx x,
5839 addr_space_t as ATTRIBUTE_UNUSED,
5842 enum rtx_code c = GET_CODE (x);
5843 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5844 struct aarch64_address_info info;
5848 if (!aarch64_classify_address (&info, x, mode, c, false))
5850 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5852 /* This is a CONST or SYMBOL ref which will be split
5853 in a different way depending on the code model in use.
5854 Cost it through the generic infrastructure. */
5855 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
5856 /* Divide through by the cost of one instruction to
5857 bring it to the same units as the address costs. */
5858 cost_symbol_ref /= COSTS_N_INSNS (1);
5859 /* The cost is then the cost of preparing the address,
5860 followed by an immediate (possibly 0) offset. */
5861 return cost_symbol_ref + addr_cost->imm_offset;
5865 /* This is most likely a jump table from a case
5867 return addr_cost->register_offset;
5873 case ADDRESS_LO_SUM:
5874 case ADDRESS_SYMBOLIC:
5875 case ADDRESS_REG_IMM:
5876 cost += addr_cost->imm_offset;
5879 case ADDRESS_REG_WB:
5880 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5881 cost += addr_cost->pre_modify;
5882 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5883 cost += addr_cost->post_modify;
5889 case ADDRESS_REG_REG:
5890 cost += addr_cost->register_offset;
5893 case ADDRESS_REG_SXTW:
5894 cost += addr_cost->register_sextend;
5897 case ADDRESS_REG_UXTW:
5898 cost += addr_cost->register_zextend;
5908 /* For the sake of calculating the cost of the shifted register
5909 component, we can treat same sized modes in the same way. */
5910 switch (GET_MODE_BITSIZE (mode))
5913 cost += addr_cost->addr_scale_costs.hi;
5917 cost += addr_cost->addr_scale_costs.si;
5921 cost += addr_cost->addr_scale_costs.di;
5924 /* We can't tell, or this is a 128-bit vector. */
5926 cost += addr_cost->addr_scale_costs.ti;
5934 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5935 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5939 aarch64_branch_cost (bool speed_p, bool predictable_p)
5941 /* When optimizing for speed, use the cost of unpredictable branches. */
5942 const struct cpu_branch_cost *branch_costs =
5943 aarch64_tune_params.branch_costs;
5945 if (!speed_p || predictable_p)
5946 return branch_costs->predictable;
5948 return branch_costs->unpredictable;
5951 /* Return true if the RTX X in mode MODE is a zero or sign extract
5952 usable in an ADD or SUB (extended register) instruction. */
5954 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5956 /* Catch add with a sign extract.
5957 This is add_<optab><mode>_multp2. */
5958 if (GET_CODE (x) == SIGN_EXTRACT
5959 || GET_CODE (x) == ZERO_EXTRACT)
5961 rtx op0 = XEXP (x, 0);
5962 rtx op1 = XEXP (x, 1);
5963 rtx op2 = XEXP (x, 2);
5965 if (GET_CODE (op0) == MULT
5966 && CONST_INT_P (op1)
5967 && op2 == const0_rtx
5968 && CONST_INT_P (XEXP (op0, 1))
5969 && aarch64_is_extend_from_extract (mode,
5976 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
5978 else if (GET_CODE (x) == SIGN_EXTEND
5979 || GET_CODE (x) == ZERO_EXTEND)
5980 return REG_P (XEXP (x, 0));
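/* For example, a (zero_extend:DI (reg:SI)) operand of a DImode PLUS
   satisfies the second case above and corresponds to the extended-register
   form ADD Xd, Xn, Wm, uxtw.  */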
5986 aarch64_frint_unspec_p (unsigned int u)
6004 /* Return true iff X is an rtx that will match an extr instruction
6005 i.e. as described in the *extr<mode>5_insn family of patterns.
6006 OP0 and OP1 will be set to the operands of the shifts involved
6007 on success and will be NULL_RTX otherwise. */
6010 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6013 machine_mode mode = GET_MODE (x);
6015 *res_op0 = NULL_RTX;
6016 *res_op1 = NULL_RTX;
6018 if (GET_CODE (x) != IOR)
6024 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6025 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6027 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6028 if (GET_CODE (op1) == ASHIFT)
6029 std::swap (op0, op1);
6031 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6034 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6035 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6037 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6038 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6040 *res_op0 = XEXP (op0, 0);
6041 *res_op1 = XEXP (op1, 0);
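/* Illustrative DImode example: (ior (ashift a (const_int 48))
   (lshiftrt b (const_int 16))) matches because 48 + 16 == 64, giving
   *res_op0 == a and *res_op1 == b, i.e. roughly EXTR Xd, Xa, Xb, #16.  */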
6049 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6050 storing it in *COST. Result is true if the total cost of the operation
6051 has now been calculated. */
6053 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6057 enum rtx_code cmpcode;
6059 if (COMPARISON_P (op0))
6061 inner = XEXP (op0, 0);
6062 comparator = XEXP (op0, 1);
6063 cmpcode = GET_CODE (op0);
6068 comparator = const0_rtx;
6072 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6074 /* Conditional branch. */
6075 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6079 if (cmpcode == NE || cmpcode == EQ)
6081 if (comparator == const0_rtx)
6083 /* TBZ/TBNZ/CBZ/CBNZ. */
6084 if (GET_CODE (inner) == ZERO_EXTRACT)
6086 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6087 ZERO_EXTRACT, 0, speed);
6090 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6095 else if (cmpcode == LT || cmpcode == GE)
6098 if (comparator == const0_rtx)
6103 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6106 if (GET_CODE (op1) == COMPARE)
6108 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6109 if (XEXP (op1, 1) == const0_rtx)
6113 machine_mode mode = GET_MODE (XEXP (op1, 0));
6114 const struct cpu_cost_table *extra_cost
6115 = aarch64_tune_params.insn_extra_cost;
6117 if (GET_MODE_CLASS (mode) == MODE_INT)
6118 *cost += extra_cost->alu.arith;
6120 *cost += extra_cost->fp[mode == DFmode].compare;
6125 /* It's a conditional operation based on the status flags,
6126 so it must be some flavor of CSEL. */
6128 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6129 if (GET_CODE (op1) == NEG
6130 || GET_CODE (op1) == NOT
6131 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6132 op1 = XEXP (op1, 0);
6133 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6135 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6136 op1 = XEXP (op1, 0);
6137 op2 = XEXP (op2, 0);
6140 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6141 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6145 /* We don't know what this is, cost all operands. */
6149 /* Check whether X is a bitfield operation of the form shift + extend that
6150 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6151 operand to which the bitfield operation is applied. Otherwise return
6155 aarch64_extend_bitfield_pattern_p (rtx x)
6157 rtx_code outer_code = GET_CODE (x);
6158 machine_mode outer_mode = GET_MODE (x);
6160 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6161 && outer_mode != SImode && outer_mode != DImode)
6164 rtx inner = XEXP (x, 0);
6165 rtx_code inner_code = GET_CODE (inner);
6166 machine_mode inner_mode = GET_MODE (inner);
6172 if (CONST_INT_P (XEXP (inner, 1))
6173 && (inner_mode == QImode || inner_mode == HImode))
6174 op = XEXP (inner, 0);
6177 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6178 && (inner_mode == QImode || inner_mode == HImode))
6179 op = XEXP (inner, 0);
6182 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6183 && (inner_mode == QImode || inner_mode == HImode))
6184 op = XEXP (inner, 0);
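/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI) (const_int 4)))
   satisfies the ZERO_EXTEND test above and returns the HImode register;
   such a shift + extend combination maps to a UBFX-style extract.  */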
6193 /* Return true if the mask and a shift amount from an RTX of the form
6194 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6195 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6198 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6200 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6201 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6202 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6203 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
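/* Worked example (illustrative): in SImode, mask == 0x00ffff00 with
   shft_amnt == 8 is accepted: (mask >> 8) + 1 == 0x10000 is a power of two
   and the low 8 bits of the mask are clear, so (x << 8) & 0x00ffff00 can
   become UBFIZ Wd, Wn, #8, #16.  */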
6206 /* Calculate the cost of calculating X, storing it in *COST. Result
6207 is true if the total cost of the operation has now been calculated. */
6209 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6210 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6213 const struct cpu_cost_table *extra_cost
6214 = aarch64_tune_params.insn_extra_cost;
6215 int code = GET_CODE (x);
6217 /* By default, assume that everything has equivalent cost to the
6218 cheapest instruction. Any additional costs are applied as a delta
6219 above this default. */
6220 *cost = COSTS_N_INSNS (1);
6225 /* The cost depends entirely on the operands to SET. */
6230 switch (GET_CODE (op0))
6235 rtx address = XEXP (op0, 0);
6236 if (VECTOR_MODE_P (mode))
6237 *cost += extra_cost->ldst.storev;
6238 else if (GET_MODE_CLASS (mode) == MODE_INT)
6239 *cost += extra_cost->ldst.store;
6240 else if (mode == SFmode)
6241 *cost += extra_cost->ldst.storef;
6242 else if (mode == DFmode)
6243 *cost += extra_cost->ldst.stored;
6246 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6250 *cost += rtx_cost (op1, mode, SET, 1, speed);
6254 if (! REG_P (SUBREG_REG (op0)))
6255 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6259 /* The cost is one per vector-register copied. */
6260 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6262 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6263 / GET_MODE_SIZE (V4SImode);
6264 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6266 /* const0_rtx is in general free, but we will use an
6267 instruction to set a register to 0. */
6268 else if (REG_P (op1) || op1 == const0_rtx)
6270 /* The cost is 1 per register copied. */
6271 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6273 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6276 /* Cost is just the cost of the RHS of the set. */
6277 *cost += rtx_cost (op1, mode, SET, 1, speed);
6282 /* Bit-field insertion. Strip any redundant widening of
6283 the RHS to meet the width of the target. */
6284 if (GET_CODE (op1) == SUBREG)
6285 op1 = SUBREG_REG (op1);
6286 if ((GET_CODE (op1) == ZERO_EXTEND
6287 || GET_CODE (op1) == SIGN_EXTEND)
6288 && CONST_INT_P (XEXP (op0, 1))
6289 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6290 >= INTVAL (XEXP (op0, 1))))
6291 op1 = XEXP (op1, 0);
6293 if (CONST_INT_P (op1))
6295 /* MOV immediate is assumed to always be cheap. */
6296 *cost = COSTS_N_INSNS (1);
6302 *cost += extra_cost->alu.bfi;
6303 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6309 /* We can't make sense of this, assume default cost. */
6310 *cost = COSTS_N_INSNS (1);
6316 /* If an instruction can incorporate a constant within the
6317 instruction, the instruction's expression avoids calling
6318 rtx_cost() on the constant. If rtx_cost() is called on a
6319 constant, then it is usually because the constant must be
6320 moved into a register by one or more instructions.
6322 The exception is constant 0, which can be expressed
6323 as XZR/WZR and is therefore free. The exception to this is
6324 if we have (set (reg) (const0_rtx)) in which case we must cost
6325 the move. However, we can catch that when we cost the SET, so
6326 we don't need to consider that here. */
6327 if (x == const0_rtx)
6331 /* To an approximation, building any other constant is
6332 proportionally expensive to the number of instructions
6333 required to build that constant. This is true whether we
6334 are compiling for SPEED or otherwise. */
6335 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6336 (NULL_RTX, x, false, mode));
6343 /* mov[df,sf]_aarch64. */
6344 if (aarch64_float_const_representable_p (x))
6345 /* FMOV (scalar immediate). */
6346 *cost += extra_cost->fp[mode == DFmode].fpconst;
6347 else if (!aarch64_float_const_zero_rtx_p (x))
6349 /* This will be a load from memory. */
6351 *cost += extra_cost->ldst.loadd;
6353 *cost += extra_cost->ldst.loadf;
6356 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6357 or MOV v0.s[0], wzr - neither of which is modeled by the
6358 cost tables. Just use the default cost. */
6368 /* For loads we want the base cost of a load, plus an
6369 approximation for the additional cost of the addressing
6371 rtx address = XEXP (x, 0);
6372 if (VECTOR_MODE_P (mode))
6373 *cost += extra_cost->ldst.loadv;
6374 else if (GET_MODE_CLASS (mode) == MODE_INT)
6375 *cost += extra_cost->ldst.load;
6376 else if (mode == SFmode)
6377 *cost += extra_cost->ldst.loadf;
6378 else if (mode == DFmode)
6379 *cost += extra_cost->ldst.loadd;
6382 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6391 if (VECTOR_MODE_P (mode))
6396 *cost += extra_cost->vect.alu;
6401 if (GET_MODE_CLASS (mode) == MODE_INT)
6403 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6404 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6407 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6411 /* Cost this as SUB wzr, X. */
6412 op0 = CONST0_RTX (mode);
6417 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6419 /* Support (neg(fma...)) as a single instruction only if
6420 sign of zeros is unimportant. This matches the decision
6421 making in aarch64.md. */
6422 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6425 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6428 if (GET_CODE (op0) == MULT)
6431 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6436 *cost += extra_cost->fp[mode == DFmode].neg;
6446 if (VECTOR_MODE_P (mode))
6447 *cost += extra_cost->vect.alu;
6449 *cost += extra_cost->alu.clz;
6458 if (op1 == const0_rtx
6459 && GET_CODE (op0) == AND)
6462 mode = GET_MODE (op0);
6466 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6468 /* TODO: A write to the CC flags possibly costs extra, this
6469 needs encoding in the cost tables. */
6471 mode = GET_MODE (op0);
6473 if (GET_CODE (op0) == AND)
6479 if (GET_CODE (op0) == PLUS)
6481 /* ADDS (and CMN alias). */
6486 if (GET_CODE (op0) == MINUS)
6493 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6494 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6495 && CONST_INT_P (XEXP (op0, 2)))
6497 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6498 Handle it here directly rather than going to cost_logic
6499 since we know the immediate generated for the TST is valid
6500 so we can avoid creating an intermediate rtx for it only
6501 for costing purposes. */
6503 *cost += extra_cost->alu.logical;
6505 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6506 ZERO_EXTRACT, 0, speed);
6510 if (GET_CODE (op1) == NEG)
6514 *cost += extra_cost->alu.arith;
6516 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6517 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6523 Compare can freely swap the order of operands, and
6524 canonicalization puts the more complex operation first.
6525 But the integer MINUS logic expects the shift/extend
6526 operation in op1. */
6528 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6536 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6540 *cost += extra_cost->fp[mode == DFmode].compare;
6542 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6544 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6545 /* FCMP supports constant 0.0 for no extra cost. */
6551 if (VECTOR_MODE_P (mode))
6553 /* Vector compare. */
6555 *cost += extra_cost->vect.alu;
6557 if (aarch64_float_const_zero_rtx_p (op1))
6559 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6573 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6575 /* Detect valid immediates. */
6576 if ((GET_MODE_CLASS (mode) == MODE_INT
6577 || (GET_MODE_CLASS (mode) == MODE_CC
6578 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6579 && CONST_INT_P (op1)
6580 && aarch64_uimm12_shift (INTVAL (op1)))
6583 /* SUB(S) (immediate). */
6584 *cost += extra_cost->alu.arith;
6588 /* Look for SUB (extended register). */
6589 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6592 *cost += extra_cost->alu.extend_arith;
6594 op1 = aarch64_strip_extend (op1);
6595 *cost += rtx_cost (op1, VOIDmode,
6596 (enum rtx_code) GET_CODE (op1), 0, speed);
6600 rtx new_op1 = aarch64_strip_extend (op1);
6602 /* Cost this as an FMA-alike operation. */
6603 if ((GET_CODE (new_op1) == MULT
6604 || aarch64_shift_p (GET_CODE (new_op1)))
6607 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6608 (enum rtx_code) code,
6613 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6617 if (VECTOR_MODE_P (mode))
6620 *cost += extra_cost->vect.alu;
6622 else if (GET_MODE_CLASS (mode) == MODE_INT)
6625 *cost += extra_cost->alu.arith;
6627 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6630 *cost += extra_cost->fp[mode == DFmode].addsub;
6644 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6645 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6648 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6649 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6653 if (GET_MODE_CLASS (mode) == MODE_INT
6654 && CONST_INT_P (op1)
6655 && aarch64_uimm12_shift (INTVAL (op1)))
6657 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6660 /* ADD (immediate). */
6661 *cost += extra_cost->alu.arith;
6665 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6667 /* Look for ADD (extended register). */
6668 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6671 *cost += extra_cost->alu.extend_arith;
6673 op0 = aarch64_strip_extend (op0);
6674 *cost += rtx_cost (op0, VOIDmode,
6675 (enum rtx_code) GET_CODE (op0), 0, speed);
6679 /* Strip any extend, leave shifts behind as we will
6680 cost them through mult_cost. */
6681 new_op0 = aarch64_strip_extend (op0);
6683 if (GET_CODE (new_op0) == MULT
6684 || aarch64_shift_p (GET_CODE (new_op0)))
6686 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6691 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6695 if (VECTOR_MODE_P (mode))
6698 *cost += extra_cost->vect.alu;
6700 else if (GET_MODE_CLASS (mode) == MODE_INT)
6703 *cost += extra_cost->alu.arith;
6705 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6708 *cost += extra_cost->fp[mode == DFmode].addsub;
6715 *cost = COSTS_N_INSNS (1);
6719 if (VECTOR_MODE_P (mode))
6720 *cost += extra_cost->vect.alu;
6722 *cost += extra_cost->alu.rev;
6727 if (aarch_rev16_p (x))
6729 *cost = COSTS_N_INSNS (1);
6733 if (VECTOR_MODE_P (mode))
6734 *cost += extra_cost->vect.alu;
6736 *cost += extra_cost->alu.rev;
6741 if (aarch64_extr_rtx_p (x, &op0, &op1))
6743 *cost += rtx_cost (op0, mode, IOR, 0, speed);
6744 *cost += rtx_cost (op1, mode, IOR, 1, speed);
6746 *cost += extra_cost->alu.shift;
6757 if (VECTOR_MODE_P (mode))
6760 *cost += extra_cost->vect.alu;
6765 && GET_CODE (op0) == MULT
6766 && CONST_INT_P (XEXP (op0, 1))
6767 && CONST_INT_P (op1)
6768 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6771 /* This is a UBFM/SBFM. */
6772 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
6774 *cost += extra_cost->alu.bfx;
6778 if (GET_MODE_CLASS (mode) == MODE_INT)
6780 if (CONST_INT_P (op1))
6782 /* We have a mask + shift version of a UBFIZ
6783 i.e. the *andim_ashift<mode>_bfiz pattern. */
6784 if (GET_CODE (op0) == ASHIFT
6785 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
6788 *cost += rtx_cost (XEXP (op0, 0), mode,
6789 (enum rtx_code) code, 0, speed);
6791 *cost += extra_cost->alu.bfx;
6795 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
6797 /* We possibly get the immediate for free, this is not
6799 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6801 *cost += extra_cost->alu.logical;
6810 /* Handle ORN, EON, or BIC. */
6811 if (GET_CODE (op0) == NOT)
6812 op0 = XEXP (op0, 0);
6814 new_op0 = aarch64_strip_shift (op0);
6816 /* If we had a shift on op0 then this is a logical-shift-
6817 by-register/immediate operation. Otherwise, this is just
6818 a logical operation. */
6823 /* Shift by immediate. */
6824 if (CONST_INT_P (XEXP (op0, 1)))
6825 *cost += extra_cost->alu.log_shift;
6827 *cost += extra_cost->alu.log_shift_reg;
6830 *cost += extra_cost->alu.logical;
6833 /* In both cases we want to cost both operands. */
6834 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
6835 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
6844 op0 = aarch64_strip_shift (x);
6846 if (VECTOR_MODE_P (mode))
6849 *cost += extra_cost->vect.alu;
6853 /* MVN-shifted-reg. */
6856 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6859 *cost += extra_cost->alu.log_shift;
6863 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6864 Handle the second form here taking care that 'a' in the above can
6866 else if (GET_CODE (op0) == XOR)
6868 rtx newop0 = XEXP (op0, 0);
6869 rtx newop1 = XEXP (op0, 1);
6870 rtx op0_stripped = aarch64_strip_shift (newop0);
6872 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
6873 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6877 if (op0_stripped != newop0)
6878 *cost += extra_cost->alu.log_shift;
6880 *cost += extra_cost->alu.logical;
6887 *cost += extra_cost->alu.logical;
6894 /* If a value is written in SI mode, then zero extended to DI
6895 mode, the operation will in general be free as a write to
6896 a 'w' register implicitly zeroes the upper bits of an 'x'
6897 register. However, if this is
6899 (set (reg) (zero_extend (reg)))
6901 we must cost the explicit register move. */
6903 && GET_MODE (op0) == SImode
6906 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
6908 /* If OP_COST is non-zero, then the cost of the zero extend
6909 is effectively the cost of the inner operation. Otherwise
6910 we have a MOV instruction and we take the cost from the MOV
6911 itself. This is true independently of whether we are
6912 optimizing for space or time. */
6918 else if (MEM_P (op0))
6920 /* All loads can zero extend to any size for free. */
6921 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
6925 op0 = aarch64_extend_bitfield_pattern_p (x);
6928 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
6930 *cost += extra_cost->alu.bfx;
6936 if (VECTOR_MODE_P (mode))
6939 *cost += extra_cost->vect.alu;
6943 /* We generate an AND instead of UXTB/UXTH. */
6944 *cost += extra_cost->alu.logical;
6950 if (MEM_P (XEXP (x, 0)))
6955 rtx address = XEXP (XEXP (x, 0), 0);
6956 *cost += extra_cost->ldst.load_sign_extend;
6959 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6965 op0 = aarch64_extend_bitfield_pattern_p (x);
6968 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
6970 *cost += extra_cost->alu.bfx;
6976 if (VECTOR_MODE_P (mode))
6977 *cost += extra_cost->vect.alu;
6979 *cost += extra_cost->alu.extend;
6987 if (CONST_INT_P (op1))
6991 if (VECTOR_MODE_P (mode))
6993 /* Vector shift (immediate). */
6994 *cost += extra_cost->vect.alu;
6998 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
7000 *cost += extra_cost->alu.shift;
7004 /* We can incorporate zero/sign extend for free. */
7005 if (GET_CODE (op0) == ZERO_EXTEND
7006 || GET_CODE (op0) == SIGN_EXTEND)
7007 op0 = XEXP (op0, 0);
7009 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7016 if (VECTOR_MODE_P (mode))
7018 /* Vector shift (register). */
7019 *cost += extra_cost->vect.alu;
7024 *cost += extra_cost->alu.shift_reg;
7027 return false; /* All arguments need to be in registers. */
7037 if (CONST_INT_P (op1))
7039 /* ASR (immediate) and friends. */
7042 if (VECTOR_MODE_P (mode))
7043 *cost += extra_cost->vect.alu;
7045 *cost += extra_cost->alu.shift;
7048 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7054 /* ASR (register) and friends. */
7057 if (VECTOR_MODE_P (mode))
7058 *cost += extra_cost->vect.alu;
7060 *cost += extra_cost->alu.shift_reg;
7062 return false; /* All arguments need to be in registers. */
7067 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7068 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7072 *cost += extra_cost->ldst.load;
7074 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7075 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7077 /* ADRP, followed by ADD. */
7078 *cost += COSTS_N_INSNS (1);
7080 *cost += 2 * extra_cost->alu.arith;
7082 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7083 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7087 *cost += extra_cost->alu.arith;
7092 /* One extra load instruction, after accessing the GOT. */
7093 *cost += COSTS_N_INSNS (1);
7095 *cost += extra_cost->ldst.load;
7101 /* ADRP/ADD (immediate). */
7103 *cost += extra_cost->alu.arith;
7111 if (VECTOR_MODE_P (mode))
7112 *cost += extra_cost->vect.alu;
7114 *cost += extra_cost->alu.bfx;
7117 /* We can trust that the immediates used will be correct (there
7118 are no by-register forms), so we need only cost op0. */
7119 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7123 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7124 /* aarch64_rtx_mult_cost always handles recursion to its
7129 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7130 ANDs and a CSNEG. Assume here that a CSNEG costs the same as
7131 an unconditional negate. This case should only ever be reached through
7132 the set_smod_pow2_cheap check in expmed.c. */
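/* As a rough sketch of such an expansion, x % 4 becomes something like:

       negs  w1, w0            ; w1 = -x, flags set from -x
       and   w0, w0, 3         ; x & 3
       and   w1, w1, 3         ; (-x) & 3
       csneg w0, w0, w1, mi    ; x > 0 ? x & 3 : -((-x) & 3)

   hence the baseline of four instructions below.  */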
7133 if (CONST_INT_P (XEXP (x, 1))
7134 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7135 && (mode == SImode || mode == DImode))
7137 /* We expand to 4 instructions. Reset the baseline. */
7138 *cost = COSTS_N_INSNS (4);
7141 *cost += 2 * extra_cost->alu.logical
7142 + 2 * extra_cost->alu.arith;
7151 if (VECTOR_MODE_P (mode))
7152 *cost += extra_cost->vect.alu;
7153 else if (GET_MODE_CLASS (mode) == MODE_INT)
7154 *cost += (extra_cost->mult[mode == DImode].add
7155 + extra_cost->mult[mode == DImode].idiv);
7156 else if (mode == DFmode)
7157 *cost += (extra_cost->fp[1].mult
7158 + extra_cost->fp[1].div);
7159 else if (mode == SFmode)
7160 *cost += (extra_cost->fp[0].mult
7161 + extra_cost->fp[0].div);
7163 return false; /* All arguments need to be in registers. */
7170 if (VECTOR_MODE_P (mode))
7171 *cost += extra_cost->vect.alu;
7172 else if (GET_MODE_CLASS (mode) == MODE_INT)
7173 /* There is no integer SQRT, so only DIV and UDIV can get
7175 *cost += extra_cost->mult[mode == DImode].idiv;
7177 *cost += extra_cost->fp[mode == DFmode].div;
7179 return false; /* All arguments need to be in registers. */
7182 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7183 XEXP (x, 2), cost, speed);
7196 return false; /* All arguments must be in registers. */
7205 if (VECTOR_MODE_P (mode))
7206 *cost += extra_cost->vect.alu;
7208 *cost += extra_cost->fp[mode == DFmode].fma;
7211 /* FMSUB, FNMADD, and FNMSUB are free. */
7212 if (GET_CODE (op0) == NEG)
7213 op0 = XEXP (op0, 0);
7215 if (GET_CODE (op2) == NEG)
7216 op2 = XEXP (op2, 0);
7218 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7219 and the by-element operand as operand 0. */
7220 if (GET_CODE (op1) == NEG)
7221 op1 = XEXP (op1, 0);
7223 /* Catch vector-by-element operations. The by-element operand can
7224 either be (vec_duplicate (vec_select (x))) or just
7225 (vec_select (x)), depending on whether we are multiplying by
7226 a vector or a scalar.
7228 Canonicalization is not very good in these cases: FMA4 will put the
7229 by-element operand as operand 0, FNMA4 will have it as operand 1. */
7230 if (GET_CODE (op0) == VEC_DUPLICATE)
7231 op0 = XEXP (op0, 0);
7232 else if (GET_CODE (op1) == VEC_DUPLICATE)
7233 op1 = XEXP (op1, 0);
7235 if (GET_CODE (op0) == VEC_SELECT)
7236 op0 = XEXP (op0, 0);
7237 else if (GET_CODE (op1) == VEC_SELECT)
7238 op1 = XEXP (op1, 0);
7240 /* If the remaining parameters are not registers,
7241 get the cost to put them into registers. */
7242 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7243 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7244 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7248 case UNSIGNED_FLOAT:
7250 *cost += extra_cost->fp[mode == DFmode].fromint;
7256 if (VECTOR_MODE_P (mode))
7258 /* Vector truncate. */
7259 *cost += extra_cost->vect.alu;
7262 *cost += extra_cost->fp[mode == DFmode].widen;
7266 case FLOAT_TRUNCATE:
7269 if (VECTOR_MODE_P (mode))
7271 /* Vector conversion. */
7272 *cost += extra_cost->vect.alu;
7275 *cost += extra_cost->fp[mode == DFmode].narrow;
7282 /* Strip the rounding part. They will all be implemented
7283 by the fcvt* family of instructions anyway. */
7284 if (GET_CODE (x) == UNSPEC)
7286 unsigned int uns_code = XINT (x, 1);
7288 if (uns_code == UNSPEC_FRINTA
7289 || uns_code == UNSPEC_FRINTM
7290 || uns_code == UNSPEC_FRINTN
7291 || uns_code == UNSPEC_FRINTP
7292 || uns_code == UNSPEC_FRINTZ)
7293 x = XVECEXP (x, 0, 0);
7298 if (VECTOR_MODE_P (mode))
7299 *cost += extra_cost->vect.alu;
7301 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7304 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7305 fixed-point fcvt. */
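/* For example, (fix:SI (mult:SF (reg:SF) (const 4.0))) maps to
   FCVTZS Wd, Sn, #2 (a conversion with two fractional bits), so only
   the register operand of the multiply needs to be costed below.  */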
7306 if (GET_CODE (x) == MULT
7307 && ((VECTOR_MODE_P (mode)
7308 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7309 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7311 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7316 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7320 if (VECTOR_MODE_P (mode))
7324 *cost += extra_cost->vect.alu;
7326 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7330 /* FABD, which is analogous to FADD. */
7331 if (GET_CODE (op0) == MINUS)
7333 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7334 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7336 *cost += extra_cost->fp[mode == DFmode].addsub;
7340 /* Simple FABS is analogous to FNEG. */
7342 *cost += extra_cost->fp[mode == DFmode].neg;
7346 /* Integer ABS will either be split into
7347 two arithmetic instructions, or will be an ABS
7348 (scalar), which we don't model. */
7349 *cost = COSTS_N_INSNS (2);
7351 *cost += 2 * extra_cost->alu.arith;
7359 if (VECTOR_MODE_P (mode))
7360 *cost += extra_cost->vect.alu;
7363 /* FMAXNM/FMINNM/FMAX/FMIN.
7364 TODO: This may not be accurate for all implementations, but
7365 we do not model this in the cost tables. */
7366 *cost += extra_cost->fp[mode == DFmode].addsub;
7372 /* The floating point round to integer frint* instructions. */
7373 if (aarch64_frint_unspec_p (XINT (x, 1)))
7376 *cost += extra_cost->fp[mode == DFmode].roundint;
7381 if (XINT (x, 1) == UNSPEC_RBIT)
7384 *cost += extra_cost->alu.rev;
7392 /* Decompose <su>muldi3_highpart. */
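/* The pattern being matched is, in full:

     (truncate:DI
       (lshiftrt:TI
         (mult:TI (ANY_EXTEND:TI (reg:DI)) (ANY_EXTEND:TI (reg:DI)))
         (const_int 64)))

   i.e. the high half of a 64x64->128-bit multiply.  */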
7393 if (/* (truncate:DI */
7396 && GET_MODE (XEXP (x, 0)) == TImode
7397 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7399 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7400 /* (ANY_EXTEND:TI (reg:DI))
7401 (ANY_EXTEND:TI (reg:DI))) */
7402 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7403 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7404 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7405 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7406 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7407 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7408 /* (const_int 64) */
7409 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7410 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7414 *cost += extra_cost->mult[mode == DImode].extend;
7415 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7416 mode, MULT, 0, speed);
7417 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7418 mode, MULT, 1, speed);
7427 if (dump_file && (dump_flags & TDF_DETAILS))
7429 "\nFailed to cost RTX. Assuming default cost.\n");
7434 /* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
7435 calculated for X. This cost is stored in *COST. Returns true
7436 if the total cost of X was calculated. */
7438 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7439 int param, int *cost, bool speed)
7441 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7443 if (dump_file && (dump_flags & TDF_DETAILS))
7445 print_rtl_single (dump_file, x);
7446 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7447 speed ? "Hot" : "Cold",
7448 *cost, result ? "final" : "partial");
7455 aarch64_register_move_cost (machine_mode mode,
7456 reg_class_t from_i, reg_class_t to_i)
7458 enum reg_class from = (enum reg_class) from_i;
7459 enum reg_class to = (enum reg_class) to_i;
7460 const struct cpu_regmove_cost *regmove_cost
7461 = aarch64_tune_params.regmove_cost;
7463 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7464 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7467 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7468 from = GENERAL_REGS;
7470 /* Moving between GPRs and the stack costs the same as GP2GP. */
7471 if ((from == GENERAL_REGS && to == STACK_REG)
7472 || (to == GENERAL_REGS && from == STACK_REG))
7473 return regmove_cost->GP2GP;
7475 /* To/from the stack register, we move via the GPRs. */
7476 if (to == STACK_REG || from == STACK_REG)
7477 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7478 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7480 if (GET_MODE_SIZE (mode) == 16)
7482 /* 128-bit operations on general registers require 2 instructions. */
7483 if (from == GENERAL_REGS && to == GENERAL_REGS)
7484 return regmove_cost->GP2GP * 2;
7485 else if (from == GENERAL_REGS)
7486 return regmove_cost->GP2FP * 2;
7487 else if (to == GENERAL_REGS)
7488 return regmove_cost->FP2GP * 2;
7490 /* When AdvSIMD instructions are disabled it is not possible to move
7491 a 128-bit value directly between Q registers. This is handled in
7492 secondary reload. A general register is used as a scratch to move
7493 the upper DI value and the lower DI value is moved directly,
7494 hence the cost is the sum of three moves. */
7496 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7498 return regmove_cost->FP2FP;
7501 if (from == GENERAL_REGS && to == GENERAL_REGS)
7502 return regmove_cost->GP2GP;
7503 else if (from == GENERAL_REGS)
7504 return regmove_cost->GP2FP;
7505 else if (to == GENERAL_REGS)
7506 return regmove_cost->FP2GP;
7508 return regmove_cost->FP2FP;
7512 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7513 reg_class_t rclass ATTRIBUTE_UNUSED,
7514 bool in ATTRIBUTE_UNUSED)
7516 return aarch64_tune_params.memmov_cost;
7519 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7520 to optimize 1.0/sqrt. */
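/* When this returns true the compiler may replace, e.g., 1.0f / sqrtf (x)
   with an FRSQRTE estimate refined by FRSQRTS Newton-Raphson steps
   (see aarch64_emit_approx_sqrt below).  */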
7523 use_rsqrt_p (machine_mode mode)
7525 return (!flag_trapping_math
7526 && flag_unsafe_math_optimizations
7527 && ((aarch64_tune_params.approx_modes->recip_sqrt
7528 & AARCH64_APPROX_MODE (mode))
7529 || flag_mrecip_low_precision_sqrt));
7532 /* Function to decide when to use the approximate reciprocal square root
7536 aarch64_builtin_reciprocal (tree fndecl)
7538 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
7540 if (!use_rsqrt_p (mode))
7542 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7545 typedef rtx (*rsqrte_type) (rtx, rtx);
7547 /* Select reciprocal square root initial estimate insn depending on machine
7551 get_rsqrte_type (machine_mode mode)
7555 case DFmode: return gen_aarch64_rsqrtedf;
7556 case SFmode: return gen_aarch64_rsqrtesf;
7557 case V2DFmode: return gen_aarch64_rsqrtev2df;
7558 case V2SFmode: return gen_aarch64_rsqrtev2sf;
7559 case V4SFmode: return gen_aarch64_rsqrtev4sf;
7560 default: gcc_unreachable ();
7564 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7566 /* Select reciprocal square root series step insn depending on machine mode. */
7569 get_rsqrts_type (machine_mode mode)
7573 case DFmode: return gen_aarch64_rsqrtsdf;
7574 case SFmode: return gen_aarch64_rsqrtssf;
7575 case V2DFmode: return gen_aarch64_rsqrtsv2df;
7576 case V2SFmode: return gen_aarch64_rsqrtsv2sf;
7577 case V4SFmode: return gen_aarch64_rsqrtsv4sf;
7578 default: gcc_unreachable ();
7582 /* Emit instruction sequence to compute either the approximate square root
7583 or its approximate reciprocal, depending on the flag RECP, and return
7584 whether the sequence was emitted or not. */
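/* The method: start from the FRSQRTE estimate x0 ~= 1/sqrt(a) and apply
   Newton-Raphson steps x_{n+1} = x_n * (3 - a * x_n^2) / 2, where FRSQRTS
   computes (3 - a*b) / 2.  For the non-reciprocal case the result is then
   multiplied by a, since sqrt(a) = a * (1/sqrt(a)), with a zero input
   masked off so that sqrt(0) stays 0.  */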
7587 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
7589 machine_mode mode = GET_MODE (dst);
7591 if (GET_MODE_INNER (mode) == HFmode)
7594 machine_mode mmsk = mode_for_vector
7595 (int_mode_for_mode (GET_MODE_INNER (mode)),
7596 GET_MODE_NUNITS (mode));
7597 bool use_approx_sqrt_p = (!recp
7598 && (flag_mlow_precision_sqrt
7599 || (aarch64_tune_params.approx_modes->sqrt
7600 & AARCH64_APPROX_MODE (mode))));
7601 bool use_approx_rsqrt_p = (recp
7602 && (flag_mrecip_low_precision_sqrt
7603 || (aarch64_tune_params.approx_modes->recip_sqrt
7604 & AARCH64_APPROX_MODE (mode))));
7606 if (!flag_finite_math_only
7607 || flag_trapping_math
7608 || !flag_unsafe_math_optimizations
7609 || !(use_approx_sqrt_p || use_approx_rsqrt_p)
7610 || optimize_function_for_size_p (cfun))
7613 rtx xmsk = gen_reg_rtx (mmsk);
7615 /* When calculating the approximate square root, compare the argument with
7616 0.0 and create a mask. */
7617 emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
7618 CONST0_RTX (mode)))));
7620 /* Estimate the approximate reciprocal square root. */
7621 rtx xdst = gen_reg_rtx (mode);
7622 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
7624 /* Iterate over the series twice for SF and thrice for DF. */
7625 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
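/* Each step roughly doubles the number of correct bits in the estimate
   (FRSQRTE gives roughly 8), so two steps cover the 24-bit SFmode
   significand and three the 53-bit DFmode one.  */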
7627 /* Optionally iterate over the series once less for faster performance
7628 while sacrificing accuracy. */
7629 if ((recp && flag_mrecip_low_precision_sqrt)
7630 || (!recp && flag_mlow_precision_sqrt))
7633 /* Iterate over the series to calculate the approximate reciprocal square
7635 rtx x1 = gen_reg_rtx (mode);
7636 while (iterations--)
7638 rtx x2 = gen_reg_rtx (mode);
7639 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
7641 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
7644 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
7649 /* Qualify the approximate reciprocal square root when the argument is
7650 0.0 by squashing the intermediate result to 0.0. */
7651 rtx xtmp = gen_reg_rtx (mmsk);
7652 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
7653 gen_rtx_SUBREG (mmsk, xdst, 0)));
7654 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
7656 /* Calculate the approximate square root. */
7657 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
7660 /* Finalize the approximation. */
7661 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
7666 typedef rtx (*recpe_type) (rtx, rtx);
7668 /* Select reciprocal initial estimate insn depending on machine mode. */
7671 get_recpe_type (machine_mode mode)
7675 case SFmode: return (gen_aarch64_frecpesf);
7676 case V2SFmode: return (gen_aarch64_frecpev2sf);
7677 case V4SFmode: return (gen_aarch64_frecpev4sf);
7678 case DFmode: return (gen_aarch64_frecpedf);
7679 case V2DFmode: return (gen_aarch64_frecpev2df);
7680 default: gcc_unreachable ();
7684 typedef rtx (*recps_type) (rtx, rtx, rtx);
7686 /* Select reciprocal series step insn depending on machine mode. */
7689 get_recps_type (machine_mode mode)
7693 case SFmode: return (gen_aarch64_frecpssf);
7694 case V2SFmode: return (gen_aarch64_frecpsv2sf);
7695 case V4SFmode: return (gen_aarch64_frecpsv4sf);
7696 case DFmode: return (gen_aarch64_frecpsdf);
7697 case V2DFmode: return (gen_aarch64_frecpsv2df);
7698 default: gcc_unreachable ();
7702 /* Emit the instruction sequence to compute the approximation for the division
7703 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
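/* The method mirrors aarch64_emit_approx_sqrt: start from the FRECPE
   estimate x0 ~= 1/den and apply Newton-Raphson steps
   x_{n+1} = x_n * (2 - den * x_n), where FRECPS computes (2 - a*b);
   the quotient is then obtained as num * (1/den).  */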
7706 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
7708 machine_mode mode = GET_MODE (quo);
7710 if (GET_MODE_INNER (mode) == HFmode)
7713 bool use_approx_division_p = (flag_mlow_precision_div
7714 || (aarch64_tune_params.approx_modes->division
7715 & AARCH64_APPROX_MODE (mode)));
7717 if (!flag_finite_math_only
7718 || flag_trapping_math
7719 || !flag_unsafe_math_optimizations
7720 || optimize_function_for_size_p (cfun)
7721 || !use_approx_division_p)
7724 /* Estimate the approximate reciprocal. */
7725 rtx xrcp = gen_reg_rtx (mode);
7726 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
7728 /* Iterate over the series twice for SF and thrice for DF. */
7729 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7731 /* Optionally iterate over the series once less for faster performance,
7732 while sacrificing accuracy. */
7733 if (flag_mlow_precision_div)
7736 /* Iterate over the series to calculate the approximate reciprocal. */
7737 rtx xtmp = gen_reg_rtx (mode);
7738 while (iterations--)
7740 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
7743 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
7746 if (num != CONST1_RTX (mode))
7748 /* As the approximate reciprocal of DEN is already calculated, only
7749 calculate the approximate division when NUM is not 1.0. */
7750 rtx xnum = force_reg (mode, num);
7751 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
7754 /* Finalize the approximation. */
7755 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
7759 /* Return the number of instructions that can be issued per cycle. */
7761 aarch64_sched_issue_rate (void)
7763 return aarch64_tune_params.issue_rate;
7767 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
7769 int issue_rate = aarch64_sched_issue_rate ();
7771 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
7775 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
7776 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
7777 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
7780 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
7783 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
7787 /* Vectorizer cost model target hooks. */
7789 /* Implement targetm.vectorize.builtin_vectorization_cost. */
7791 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
7793 int misalign ATTRIBUTE_UNUSED)
7797 switch (type_of_cost)
7800 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
7803 return aarch64_tune_params.vec_costs->scalar_load_cost;
7806 return aarch64_tune_params.vec_costs->scalar_store_cost;
7809 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7812 return aarch64_tune_params.vec_costs->vec_align_load_cost;
7815 return aarch64_tune_params.vec_costs->vec_store_cost;
7818 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7821 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7823 case unaligned_load:
7824 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7826 case unaligned_store:
7827 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7829 case cond_branch_taken:
7830 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7832 case cond_branch_not_taken:
7833 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7836 return aarch64_tune_params.vec_costs->vec_permute_cost;
7838 case vec_promote_demote:
7839 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7842 elements = TYPE_VECTOR_SUBPARTS (vectype);
7843 return elements / 2 + 1;
7850 /* Implement targetm.vectorize.add_stmt_cost. */
7852 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7853 struct _stmt_vec_info *stmt_info, int misalign,
7854 enum vect_cost_model_location where)
7856 unsigned *cost = (unsigned *) data;
7857 unsigned retval = 0;
7859 if (flag_vect_cost_model)
7861 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7863 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7865 /* Statements in an inner loop relative to the loop being
7866 vectorized are weighted more heavily. The value here is
7867 arbitrary and could potentially be improved with analysis. */
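/* For example, a statement costed at 1 by the hook above counts as 50
   when it sits in a loop nested inside the loop being vectorized.  */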
7868 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7869 count *= 50; /* FIXME */
7871 retval = (unsigned) (count * stmt_cost);
7872 cost[where] += retval;
7878 static void initialize_aarch64_code_model (struct gcc_options *);
7880 /* Parse the TO_PARSE string and put the architecture struct that it
7881 selects into RES and the architectural features into ISA_FLAGS.
7882 Return an aarch64_parse_opt_result describing the parse result.
7883 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
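/* For example, for -march=armv8-a+crc the architecture name is matched
   up to the first '+' and the remainder ("+crc") is handed to
   aarch64_parse_extension.  */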
7885 static enum aarch64_parse_opt_result
7886 aarch64_parse_arch (const char *to_parse, const struct processor **res,
7887 unsigned long *isa_flags)
7890 const struct processor *arch;
7891 char *str = (char *) alloca (strlen (to_parse) + 1);
7894 strcpy (str, to_parse);
7896 ext = strchr (str, '+');
7904 return AARCH64_PARSE_MISSING_ARG;
7907 /* Loop through the list of supported ARCHes to find a match. */
7908 for (arch = all_architectures; arch->name != NULL; arch++)
7910 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7912 unsigned long isa_temp = arch->flags;
7916 /* TO_PARSE string contains at least one extension. */
7917 enum aarch64_parse_opt_result ext_res
7918 = aarch64_parse_extension (ext, &isa_temp);
7920 if (ext_res != AARCH64_PARSE_OK)
7923 /* Extension parsing was successful. Confirm the result
7924 arch and ISA flags. */
7926 *isa_flags = isa_temp;
7927 return AARCH64_PARSE_OK;
7931 /* ARCH name not found in list. */
7932 return AARCH64_PARSE_INVALID_ARG;
7935 /* Parse the TO_PARSE string and put the CPU it selects into RES and its
7936 architecture flags into ISA_FLAGS. Return an aarch64_parse_opt_result
7937 describing the parse result. If there is an error parsing, RES and
7938 ISA_FLAGS are left unchanged. */
7940 static enum aarch64_parse_opt_result
7941 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
7942 unsigned long *isa_flags)
7945 const struct processor *cpu;
7946 char *str = (char *) alloca (strlen (to_parse) + 1);
7949 strcpy (str, to_parse);
7951 ext = strchr (str, '+');
7959 return AARCH64_PARSE_MISSING_ARG;
7962 /* Loop through the list of supported CPUs to find a match. */
7963 for (cpu = all_cores; cpu->name != NULL; cpu++)
7965 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7967 unsigned long isa_temp = cpu->flags;
7972 /* TO_PARSE string contains at least one extension. */
7973 enum aarch64_parse_opt_result ext_res
7974 = aarch64_parse_extension (ext, &isa_temp);
7976 if (ext_res != AARCH64_PARSE_OK)
7979 /* Extension parsing was successful. Confirm the result
7980 cpu and ISA flags. */
7982 *isa_flags = isa_temp;
7983 return AARCH64_PARSE_OK;
7987 /* CPU name not found in list. */
7988 return AARCH64_PARSE_INVALID_ARG;
7991 /* Parse the TO_PARSE string and put the cpu it selects into RES.
7992 Return an aarch64_parse_opt_result describing the parse result.
7993 If the parsing fails, RES does not change. */
7995 static enum aarch64_parse_opt_result
7996 aarch64_parse_tune (const char *to_parse, const struct processor **res)
7998 const struct processor *cpu;
7999 char *str = (char *) alloca (strlen (to_parse) + 1);
8001 strcpy (str, to_parse);
8003 /* Loop through the list of supported CPUs to find a match. */
8004 for (cpu = all_cores; cpu->name != NULL; cpu++)
8006 if (strcmp (cpu->name, str) == 0)
8009 return AARCH64_PARSE_OK;
8013 /* CPU name not found in list. */
8014 return AARCH64_PARSE_INVALID_ARG;
8017 /* Parse TOKEN, which has length LENGTH, to see if it is an option
8018 described in FLAG. If it is, return the index bit for that fusion type.
8019 If not, error (printing OPTION_NAME) and return zero. */
8022 aarch64_parse_one_option_token (const char *token,
8024 const struct aarch64_flag_desc *flag,
8025 const char *option_name)
8027 for (; flag->name != NULL; flag++)
8029 if (length == strlen (flag->name)
8030 && !strncmp (flag->name, token, length))
8034 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8038 /* Parse OPTION which is a comma-separated list of flags to enable.
8039 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8040 default state we inherit from the CPU tuning structures. OPTION_NAME
8041 gives the top-level option we are parsing in the -moverride string,
8042 for use in error messages. */
8045 aarch64_parse_boolean_options (const char *option,
8046 const struct aarch64_flag_desc *flags,
8047 unsigned int initial_state,
8048 const char *option_name)
8050 const char separator = '.';
8051 const char* specs = option;
8052 const char* ntoken = option;
8053 unsigned int found_flags = initial_state;
8055 while ((ntoken = strchr (specs, separator)))
8057 size_t token_length = ntoken - specs;
8058 unsigned token_ops = aarch64_parse_one_option_token (specs,
8062 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8063 in the token stream, reset the supported operations. So:
8065 adrp+add.cmp+branch.none.adrp+add
8067 would have the result of turning on only adrp+add fusion. */
8071 found_flags |= token_ops;
8075 /* We ended with a trailing separator; report the ill-formed string. */
8078 error ("%s string ill-formed\n", option_name);
8082 /* We still have one more token to parse. */
8083 size_t token_length = strlen (specs);
8084 unsigned token_ops = aarch64_parse_one_option_token (specs,
8091 found_flags |= token_ops;
8095 /* Support for overriding instruction fusion. */
8098 aarch64_parse_fuse_string (const char *fuse_string,
8099 struct tune_params *tune)
8101 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8102 aarch64_fusible_pairs,
8107 /* Support for overriding other tuning flags. */
8110 aarch64_parse_tune_string (const char *tune_string,
8111 struct tune_params *tune)
8113 tune->extra_tuning_flags
8114 = aarch64_parse_boolean_options (tune_string,
8115 aarch64_tuning_flags,
8116 tune->extra_tuning_flags,
8120 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8121 we understand. If it is, extract the option string and hand it off to
8122 the appropriate function. */
8125 aarch64_parse_one_override_token (const char* token,
8127 struct tune_params *tune)
8129 const struct aarch64_tuning_override_function *fn
8130 = aarch64_tuning_override_functions;
8132 const char *option_part = strchr (token, '=');
8135 error ("tuning string missing in option (%s)", token);
8139 /* Get the length of the option name. */
8140 length = option_part - token;
8141 /* Skip the '=' to get to the option string. */
8144 for (; fn->name != NULL; fn++)
8146 if (!strncmp (fn->name, token, length))
8148 fn->parse_override (option_part, tune);
8153 error ("unknown tuning option (%s)", token);
8157 /* Validate the requested TLS size, clamping it to what the code model allows. */
8160 initialize_aarch64_tls_size (struct gcc_options *opts)
8162 if (aarch64_tls_size == 0)
8163 aarch64_tls_size = 24;
8165 switch (opts->x_aarch64_cmodel_var)
8167 case AARCH64_CMODEL_TINY:
8168 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
8169 needs two instructions to address, so we clamp the size to 24. */
8170 if (aarch64_tls_size > 24)
8171 aarch64_tls_size = 24;
8173 case AARCH64_CMODEL_SMALL:
8174 /* The maximum TLS size allowed under small is 4G. */
8175 if (aarch64_tls_size > 32)
8176 aarch64_tls_size = 32;
8178 case AARCH64_CMODEL_LARGE:
8179 /* The maximum TLS size allowed under large is 16E.
8180 FIXME: 16E needs a 64-bit offset; we only support a 48-bit offset now. */
8181 if (aarch64_tls_size > 48)
8182 aarch64_tls_size = 48;
8191 /* Parse STRING looking for options in the format:
8192 string :: option:string
8193 option :: name=substring
8195 substring :: defined by option. */
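/* That is, -moverride takes a ':'-separated list of name=value pairs,
   e.g. -moverride=<name1>=<value1>:<name2>=<value2>, and each pair is
   handed to the matching entry in aarch64_tuning_override_functions.  */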
8198 aarch64_parse_override_string (const char* input_string,
8199 struct tune_params* tune)
8201 const char separator = ':';
8202 size_t string_length = strlen (input_string) + 1;
8203 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8204 char *string = string_root;
8205 strncpy (string, input_string, string_length);
8206 string[string_length - 1] = '\0';
8208 char* ntoken = string;
8210 while ((ntoken = strchr (string, separator)))
8212 size_t token_length = ntoken - string;
8213 /* Make this substring look like a string. */
8215 aarch64_parse_one_override_token (string, token_length, tune);
8219 /* One last option to parse. */
8220 aarch64_parse_one_override_token (string, strlen (string), tune);
8226 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8228 /* The logic here is that if we are disabling all frame pointer generation
8229 then we do not need to disable leaf frame pointer generation as a
8230 separate operation. But if we are *only* disabling leaf frame pointer
8231 generation then we set flag_omit_frame_pointer to true, but in
8232 aarch64_frame_pointer_required we return false only for leaf functions.
8234 PR 70044: We have to be careful about being called multiple times for the
8235 same function. Once we have decided to set flag_omit_frame_pointer just
8236 so that we can omit leaf frame pointers, we must then not interpret a
8237 second call as meaning that all frame pointer generation should be
8238 omitted. We do this by setting flag_omit_frame_pointer to a special,
8240 if (opts->x_flag_omit_frame_pointer == 2)
8241 opts->x_flag_omit_frame_pointer = 0;
8243 if (opts->x_flag_omit_frame_pointer)
8244 opts->x_flag_omit_leaf_frame_pointer = false;
8245 else if (opts->x_flag_omit_leaf_frame_pointer)
8246 opts->x_flag_omit_frame_pointer = 2;
8248 /* If not optimizing for size, set the default
8249 alignment to what the target wants. */
8250 if (!opts->x_optimize_size)
8252 if (opts->x_align_loops <= 0)
8253 opts->x_align_loops = aarch64_tune_params.loop_align;
8254 if (opts->x_align_jumps <= 0)
8255 opts->x_align_jumps = aarch64_tune_params.jump_align;
8256 if (opts->x_align_functions <= 0)
8257 opts->x_align_functions = aarch64_tune_params.function_align;
8260 /* We default to no pc-relative literal loads. */
8262 aarch64_pcrelative_literal_loads = false;
8264 /* If -mpc-relative-literal-loads is set on the command line, this
8265 implies that the user asked for PC relative literal loads. */
8266 if (opts->x_pcrelative_literal_loads == 1)
8267 aarch64_pcrelative_literal_loads = true;
8269 /* This is PR70113. When building the Linux kernel with
8270 CONFIG_ARM64_ERRATUM_843419, support for relocations
8271 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8272 removed from the kernel to avoid loading objects with possibly
8273 offending sequences. Without -mpc-relative-literal-loads we would
8274 generate such relocations, preventing the kernel build from
8276 if (opts->x_pcrelative_literal_loads == 2
8277 && TARGET_FIX_ERR_A53_843419)
8278 aarch64_pcrelative_literal_loads = true;
8280 /* In the tiny memory model it makes no sense to disallow PC relative
8281 literal pool loads. */
8282 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8283 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8284 aarch64_pcrelative_literal_loads = true;
8286 /* When enabling the lower precision Newton series for the square root, also
8287 enable it for the reciprocal square root, since the latter is an
8288 intermediary step for the former. */
8289 if (flag_mlow_precision_sqrt)
8290 flag_mrecip_low_precision_sqrt = true;
8293 /* 'Unpack' the internal tuning structs and update the options
8294 in OPTS. The caller must have set up selected_tune and selected_arch
8295 as all the other target-specific codegen decisions are
8296 derived from them. */
8299 aarch64_override_options_internal (struct gcc_options *opts)
8301 aarch64_tune_flags = selected_tune->flags;
8302 aarch64_tune = selected_tune->sched_core;
8303 /* Make a copy of the tuning parameters attached to the core, which
8304 we may later overwrite. */
8305 aarch64_tune_params = *(selected_tune->tune);
8306 aarch64_architecture_version = selected_arch->architecture_version;
8308 if (opts->x_aarch64_override_tune_string)
8309 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8310 &aarch64_tune_params);
8312 /* This target defaults to strict volatile bitfields. */
8313 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8314 opts->x_flag_strict_volatile_bitfields = 1;
8316 initialize_aarch64_code_model (opts);
8317 initialize_aarch64_tls_size (opts);
8319 int queue_depth = 0;
8320 switch (aarch64_tune_params.autoprefetcher_model)
8322 case tune_params::AUTOPREFETCHER_OFF:
8325 case tune_params::AUTOPREFETCHER_WEAK:
8328 case tune_params::AUTOPREFETCHER_STRONG:
8329 queue_depth = max_insn_queue_index + 1;
8335 /* We don't mind passing in global_options_set here as we don't use
8336 the *options_set structs anyway. */
8337 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8339 opts->x_param_values,
8340 global_options_set.x_param_values);
8342 /* Set the L1 cache line size. */
8343 if (selected_cpu->tune->cache_line_size != 0)
8344 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8345 selected_cpu->tune->cache_line_size,
8346 opts->x_param_values,
8347 global_options_set.x_param_values);
8349 aarch64_override_options_after_change_1 (opts);
8352 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8353 specified in STR and throw errors if appropriate. Put the results,
8354 if they are valid, in RES and ISA_FLAGS. Return whether the option is
8358 aarch64_validate_mcpu (const char *str, const struct processor **res,
8359 unsigned long *isa_flags)
8361 enum aarch64_parse_opt_result parse_res
8362 = aarch64_parse_cpu (str, res, isa_flags);
8364 if (parse_res == AARCH64_PARSE_OK)
8369 case AARCH64_PARSE_MISSING_ARG:
8370 error ("missing cpu name in -mcpu=%qs", str);
8372 case AARCH64_PARSE_INVALID_ARG:
8373 error ("unknown value %qs for -mcpu", str);
8375 case AARCH64_PARSE_INVALID_FEATURE:
8376 error ("invalid feature modifier in -mcpu=%qs", str);
8385 /* Validate a command-line -march option. Parse the arch and extensions
8386 (if any) specified in STR and throw errors if appropriate. Put the
8387 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8391 aarch64_validate_march (const char *str, const struct processor **res,
8392 unsigned long *isa_flags)
8394 enum aarch64_parse_opt_result parse_res
8395 = aarch64_parse_arch (str, res, isa_flags);
8397 if (parse_res == AARCH64_PARSE_OK)
8402 case AARCH64_PARSE_MISSING_ARG:
8403 error ("missing arch name in -march=%qs", str);
8405 case AARCH64_PARSE_INVALID_ARG:
8406 error ("unknown value %qs for -march", str);
8408 case AARCH64_PARSE_INVALID_FEATURE:
8409 error ("invalid feature modifier in -march=%qs", str);
8418 /* Validate a command-line -mtune option. Parse the cpu
8419 specified in STR and throw errors if appropriate. Put the
8420 result, if it is valid, in RES. Return whether the option is
8424 aarch64_validate_mtune (const char *str, const struct processor **res)
8426 enum aarch64_parse_opt_result parse_res
8427 = aarch64_parse_tune (str, res);
8429 if (parse_res == AARCH64_PARSE_OK)
8434 case AARCH64_PARSE_MISSING_ARG:
8435 error ("missing cpu name in -mtune=%qs", str);
8437 case AARCH64_PARSE_INVALID_ARG:
8438 error ("unknown value %qs for -mtune", str);
8446 /* Return the CPU corresponding to the enum CPU.
8447 If it doesn't specify a cpu, return the default. */
8449 static const struct processor *
8450 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8452 if (cpu != aarch64_none)
8453 return &all_cores[cpu];
8455 /* The & 0x3f is to extract the bottom 6 bits that encode the
8456 default cpu as selected by the --with-cpu GCC configure option
8458 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8459 flags mechanism should be reworked to make it more sane. */
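/* The remaining upper bits of TARGET_CPU_DEFAULT hold the default ISA
   flags; aarch64_override_options extracts them with a right shift by 6.  */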
8460 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8463 /* Return the architecture corresponding to the enum ARCH.
8464 If it doesn't specify a valid architecture, return the default. */
8466 static const struct processor *
8467 aarch64_get_arch (enum aarch64_arch arch)
8469 if (arch != aarch64_no_arch)
8470 return &all_architectures[arch];
8472 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8474 return &all_architectures[cpu->arch];
8477 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
8478 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8479 tuning structs. In particular it must set selected_tune and
8480 aarch64_isa_flags that define the available ISA features and tuning
8481 decisions. It must also set selected_arch as this will be used to
8482 output the .arch asm tags for each function. */
8485 aarch64_override_options (void)
8487 unsigned long cpu_isa = 0;
8488 unsigned long arch_isa = 0;
8489 aarch64_isa_flags = 0;
8491 bool valid_cpu = true;
8492 bool valid_tune = true;
8493 bool valid_arch = true;
8495 selected_cpu = NULL;
8496 selected_arch = NULL;
8497 selected_tune = NULL;
8499 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8500 If either of -march or -mtune is given, they override their
8501 respective component of -mcpu. */
8502 if (aarch64_cpu_string)
8503 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8506 if (aarch64_arch_string)
8507 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8510 if (aarch64_tune_string)
8511 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8513 /* If the user did not specify a processor, choose the default
8514 one for them. This will be the CPU set during configuration using
8515 --with-cpu, otherwise it is "generic". */
8520 selected_cpu = &all_cores[selected_arch->ident];
8521 aarch64_isa_flags = arch_isa;
8522 explicit_arch = selected_arch->arch;
8526 /* Get default configure-time CPU. */
8527 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8528 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8532 explicit_tune_core = selected_tune->ident;
8534 /* If both -mcpu and -march are specified check that they are architecturally
8535 compatible, warn if they're not and prefer the -march ISA flags. */
8536 else if (selected_arch)
8538 if (selected_arch->arch != selected_cpu->arch)
8540 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8541 all_architectures[selected_cpu->arch].name,
8542 selected_arch->name);
8544 aarch64_isa_flags = arch_isa;
8545 explicit_arch = selected_arch->arch;
8546 explicit_tune_core = selected_tune ? selected_tune->ident
8547 : selected_cpu->ident;
8551 /* -mcpu but no -march. */
8552 aarch64_isa_flags = cpu_isa;
8553 explicit_tune_core = selected_tune ? selected_tune->ident
8554 : selected_cpu->ident;
8555 gcc_assert (selected_cpu);
8556 selected_arch = &all_architectures[selected_cpu->arch];
8557 explicit_arch = selected_arch->arch;
8560 /* Set the arch as well, as we will need it when outputting
8561 the .arch directive in assembly. */
8564 gcc_assert (selected_cpu);
8565 selected_arch = &all_architectures[selected_cpu->arch];
8569 selected_tune = selected_cpu;
8571 #ifndef HAVE_AS_MABI_OPTION
8572 /* The compiler may have been configured with 2.23.* binutils, which does
8573 not have support for ILP32. */
8575 error ("Assembler does not support -mabi=ilp32");
8578 /* Make sure we properly set up the explicit options. */
8579 if ((aarch64_cpu_string && valid_cpu)
8580 || (aarch64_tune_string && valid_tune))
8581 gcc_assert (explicit_tune_core != aarch64_none);
8583 if ((aarch64_cpu_string && valid_cpu)
8584 || (aarch64_arch_string && valid_arch))
8585 gcc_assert (explicit_arch != aarch64_no_arch);
8587 /* The pass to insert speculation tracking runs before
8588 shrink-wrapping and the latter does not know how to update the
8589 tracking status. So disable it in this case. */
8590 if (aarch64_track_speculation)
8591 flag_shrink_wrap = 0;
8593 aarch64_override_options_internal (&global_options);
8595 /* Save these options as the default ones in case we push and pop them later
8596 while processing functions with potential target attributes. */
8597 target_option_default_node = target_option_current_node
8598 = build_target_option_node (&global_options);
8601 /* Implement targetm.override_options_after_change. */
8604 aarch64_override_options_after_change (void)
8606 aarch64_override_options_after_change_1 (&global_options);
8609 static struct machine_function *
8610 aarch64_init_machine_status (void)
8612 struct machine_function *machine;
8613 machine = ggc_cleared_alloc<machine_function> ();
8618 aarch64_init_expanders (void)
8620 init_machine_status = aarch64_init_machine_status;
8623 /* Set the effective code model (aarch64_cmodel) from the selected code model and PIC settings. */
8625 initialize_aarch64_code_model (struct gcc_options *opts)
8627 if (opts->x_flag_pic)
8629 switch (opts->x_aarch64_cmodel_var)
8631 case AARCH64_CMODEL_TINY:
8632 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8634 case AARCH64_CMODEL_SMALL:
8635 #ifdef HAVE_AS_SMALL_PIC_RELOCS
8636 aarch64_cmodel = (flag_pic == 2
8637 ? AARCH64_CMODEL_SMALL_PIC
8638 : AARCH64_CMODEL_SMALL_SPIC);
8640 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
8643 case AARCH64_CMODEL_LARGE:
8644 sorry ("code model %qs with -f%s", "large",
8645 opts->x_flag_pic > 1 ? "PIC" : "pic");
8652 aarch64_cmodel = opts->x_aarch64_cmodel_var;
8655 /* Implement TARGET_OPTION_SAVE. */
8658 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
8660 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
8663 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
8664 using the information saved in PTR. */
8667 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
8669 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
8670 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8671 opts->x_explicit_arch = ptr->x_explicit_arch;
8672 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
8673 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
8675 aarch64_override_options_internal (opts);
8678 /* Implement TARGET_OPTION_PRINT. */
8681 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
8683 const struct processor *cpu
8684 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8685 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
8686 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
8687 std::string extension
8688 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
8690 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
8691 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
8692 arch->name, extension.c_str ());
8695 static GTY(()) tree aarch64_previous_fndecl;
8698 aarch64_reset_previous_fndecl (void)
8700 aarch64_previous_fndecl = NULL;
8703 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
8704 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
8705 make sure optab availability predicates are recomputed when necessary. */
8708 aarch64_save_restore_target_globals (tree new_tree)
8710 if (TREE_TARGET_GLOBALS (new_tree))
8711 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8712 else if (new_tree == target_option_default_node)
8713 restore_target_globals (&default_target_globals);
8715 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
8718 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
8719 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
8720 of the function, if such exists. This function may be called multiple
8721 times on a single function so use aarch64_previous_fndecl to avoid
8722 setting up identical state. */
8725 aarch64_set_current_function (tree fndecl)
8727 if (!fndecl || fndecl == aarch64_previous_fndecl)
8730 tree old_tree = (aarch64_previous_fndecl
8731 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
8734 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
8736 /* If current function has no attributes but the previous one did,
8737 use the default node. */
8738 if (!new_tree && old_tree)
8739 new_tree = target_option_default_node;
8741 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
8742 the default have been handled by aarch64_save_restore_target_globals from
8743 aarch64_pragma_target_parse. */
8744 if (old_tree == new_tree)
8747 aarch64_previous_fndecl = fndecl;
8749 /* First set the target options. */
8750 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
8752 aarch64_save_restore_target_globals (new_tree);
8755 /* Enum describing the various ways we can handle attributes.
8756 In many cases we can reuse the generic option handling machinery. */
8758 enum aarch64_attr_opt_type
8760 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
8761 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
8762 aarch64_attr_enum, /* Attribute sets an enum variable. */
8763 aarch64_attr_custom /* Attribute requires a custom handling function. */
8766 /* All the information needed to handle a target attribute.
8767 NAME is the name of the attribute.
8768 ATTR_TYPE specifies the type of behavior of the attribute as described
8769 in the definition of enum aarch64_attr_opt_type.
8770 ALLOW_NEG is true if the attribute supports a "no-" form.
8771 HANDLER is the function that takes the attribute string and whether
8772 it is a pragma or attribute and handles the option. It is needed only
8773 when the ATTR_TYPE is aarch64_attr_custom.
8774 OPT_NUM is the enum specifying the option that the attribute modifies.
8775 This is needed for attributes that mirror the behavior of a command-line
8776 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
8777 aarch64_attr_enum. */
8779 struct aarch64_attribute_info
8782 enum aarch64_attr_opt_type attr_type;
8784 bool (*handler) (const char *, const char *);
8785 enum opt_code opt_num;
8788 /* Handle the ARCH_STR argument to the arch= target attribute.
8789 PRAGMA_OR_ATTR is used in potential error messages. */
8792 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
8794 const struct processor *tmp_arch = NULL;
8795 enum aarch64_parse_opt_result parse_res
8796 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
8798 if (parse_res == AARCH64_PARSE_OK)
8800 gcc_assert (tmp_arch);
8801 selected_arch = tmp_arch;
8802 explicit_arch = selected_arch->arch;
8808 case AARCH64_PARSE_MISSING_ARG:
8809 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
8811 case AARCH64_PARSE_INVALID_ARG:
8812 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
8814 case AARCH64_PARSE_INVALID_FEATURE:
8815 error ("invalid feature modifier %qs for 'arch' target %s",
8816 str, pragma_or_attr);
8825 /* Handle the argument CPU_STR to the cpu= target attribute.
8826 PRAGMA_OR_ATTR is used in potential error messages. */
8829 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
8831 const struct processor *tmp_cpu = NULL;
8832 enum aarch64_parse_opt_result parse_res
8833 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
8835 if (parse_res == AARCH64_PARSE_OK)
8837 gcc_assert (tmp_cpu);
8838 selected_tune = tmp_cpu;
8839 explicit_tune_core = selected_tune->ident;
8841 selected_arch = &all_architectures[tmp_cpu->arch];
8842 explicit_arch = selected_arch->arch;
8848 case AARCH64_PARSE_MISSING_ARG:
8849 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
8851 case AARCH64_PARSE_INVALID_ARG:
8852 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
8854 case AARCH64_PARSE_INVALID_FEATURE:
8855 error ("invalid feature modifier %qs for 'cpu' target %s",
8856 str, pragma_or_attr);
8865 /* Handle the argument STR to the tune= target attribute.
8866 PRAGMA_OR_ATTR is used in potential error messages. */
8869 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
8871 const struct processor *tmp_tune = NULL;
8872 enum aarch64_parse_opt_result parse_res
8873 = aarch64_parse_tune (str, &tmp_tune);
8875 if (parse_res == AARCH64_PARSE_OK)
8877 gcc_assert (tmp_tune);
8878 selected_tune = tmp_tune;
8879 explicit_tune_core = selected_tune->ident;
8885 case AARCH64_PARSE_INVALID_ARG:
8886 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
8895 /* Parse an architecture extensions target attribute string specified in STR.
8896 For example "+fp+nosimd". Show any errors if needed. Return TRUE
8897 if successful. Update aarch64_isa_flags to reflect the ISA features
8899 PRAGMA_OR_ATTR is used in potential error messages. */
8902 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
8904 enum aarch64_parse_opt_result parse_res;
8905 unsigned long isa_flags = aarch64_isa_flags;
8907 /* We allow "+nothing" in the beginning to clear out all architectural
8908 features if the user wants to handpick specific features. */
8909 if (strncmp ("+nothing", str, 8) == 0)
8915 parse_res = aarch64_parse_extension (str, &isa_flags);
8917 if (parse_res == AARCH64_PARSE_OK)
8919 aarch64_isa_flags = isa_flags;
8925 case AARCH64_PARSE_MISSING_ARG:
8926 error ("missing feature modifier in target %s %qs",
8927 pragma_or_attr, str);
8930 case AARCH64_PARSE_INVALID_FEATURE:
8931 error ("invalid feature modifier in target %s %qs",
8932 pragma_or_attr, str);
8942 /* The target attributes that we support. On top of these we also support just
8943 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
8944 handled explicitly in aarch64_process_one_target_attr. */
8946 static const struct aarch64_attribute_info aarch64_attributes[] =
8948 { "general-regs-only", aarch64_attr_mask, false, NULL,
8949 OPT_mgeneral_regs_only },
8950 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
8951 OPT_mfix_cortex_a53_835769 },
8952 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
8953 OPT_mfix_cortex_a53_843419 },
8954 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
8955 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
8956 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
8957 OPT_momit_leaf_frame_pointer },
8958 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
8959 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
8961 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
8962 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
8964 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
8967 /* Parse ARG_STR which contains the definition of one target attribute.
8968 Show appropriate errors if any or return true if the attribute is valid.
8969 PRAGMA_OR_ATTR holds the string to use in error messages about whether
8970 we're processing a target attribute or pragma. */
8973 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
8975 bool invert = false;
8977 size_t len = strlen (arg_str);
8981 error ("malformed target %s", pragma_or_attr);
8985 char *str_to_check = (char *) alloca (len + 1);
8986 strcpy (str_to_check, arg_str);
8988 /* Skip leading whitespace. */
8989 while (*str_to_check == ' ' || *str_to_check == '\t')
8992 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
8993 It is easier to detect and handle it explicitly here rather than going
8994 through the machinery for the rest of the target attributes in this
8996 if (*str_to_check == '+')
8997 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
8999 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9004 char *arg = strchr (str_to_check, '=');
9006 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9007 and point ARG to "foo". */
9013 const struct aarch64_attribute_info *p_attr;
9015 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9017 /* If the names don't match up, or the user has given an argument
9018 to an attribute that doesn't accept one, or didn't give an argument
9019 to an attribute that expects one, fail to match. */
9020 if (strcmp (str_to_check, p_attr->name) != 0)
9024 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9025 || p_attr->attr_type == aarch64_attr_enum;
9027 if (attr_need_arg_p ^ (arg != NULL))
9029 error ("target %s %qs does not accept an argument",
9030 pragma_or_attr, str_to_check);
9034 /* If the name matches but the attribute does not allow "no-" versions
9035 then we can't match. */
9036 if (invert && !p_attr->allow_neg)
9038 error ("target %s %qs does not allow a negated form",
9039 pragma_or_attr, str_to_check);
9043 switch (p_attr->attr_type)
9045 /* Has a custom handler registered.
9046 For example, cpu=, arch=, tune=. */
9047 case aarch64_attr_custom:
9048 gcc_assert (p_attr->handler);
9049 if (!p_attr->handler (arg, pragma_or_attr))
9053 /* Either set or unset a boolean option. */
9054 case aarch64_attr_bool:
9056 struct cl_decoded_option decoded;
9058 generate_option (p_attr->opt_num, NULL, !invert,
9059 CL_TARGET, &decoded);
9060 aarch64_handle_option (&global_options, &global_options_set,
9061 &decoded, input_location);
9064 /* Set or unset a bit in the target_flags. aarch64_handle_option
9065 should know what mask to apply given the option number. */
9066 case aarch64_attr_mask:
9068 struct cl_decoded_option decoded;
9069 /* We only need to specify the option number.
9070 aarch64_handle_option will know which mask to apply. */
9071 decoded.opt_index = p_attr->opt_num;
9072 decoded.value = !invert;
9073 aarch64_handle_option (&global_options, &global_options_set,
9074 &decoded, input_location);
9077 /* Use the option setting machinery to set an option to an enum. */
9078 case aarch64_attr_enum:
9083 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9087 set_option (&global_options, NULL, p_attr->opt_num, value,
9088 NULL, DK_UNSPECIFIED, input_location,
9093 error ("target %s %s=%s is not valid",
9094 pragma_or_attr, str_to_check, arg);
9103 /* If we reached here we either have found an attribute and validated
9104 it or didn't match any. If we matched an attribute but its arguments
9105 were malformed we will have returned false already. */
9109 /* Count how many times the character C appears in
9110 NULL-terminated string STR. */
9113 num_occurences_in_str (char c, char *str)
9115 unsigned int res = 0;
9116 while (*str != '\0')
9127 /* Parse the tree in ARGS that contains the target attribute information
9128 and update the global target options space. PRAGMA_OR_ATTR is a string
9129 to be used in error messages, specifying whether this is processing
9130 a target attribute or a target pragma. */
9133 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9135 if (TREE_CODE (args) == TREE_LIST)
9139 tree head = TREE_VALUE (args);
9142 if (!aarch64_process_target_attr (head, pragma_or_attr))
9145 args = TREE_CHAIN (args);
9150 /* We expect to find a string to parse. */
9151 gcc_assert (TREE_CODE (args) == STRING_CST);
9153 size_t len = strlen (TREE_STRING_POINTER (args));
9154 char *str_to_check = (char *) alloca (len + 1);
9155 strcpy (str_to_check, TREE_STRING_POINTER (args));
9159 error ("malformed target %s value", pragma_or_attr);
9163 /* Used to catch empty attributes between commas, i.e.
9164 attribute ((target ("attr1,,attr2"))). */
9165 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9167 /* Handle multiple target attributes separated by ','. */
9168 char *token = strtok (str_to_check, ",");
9170 unsigned int num_attrs = 0;
9174 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9176 error ("target %s %qs is invalid", pragma_or_attr, token);
9180 token = strtok (NULL, ",");
9183 if (num_attrs != num_commas + 1)
9185 error ("malformed target %s list %qs",
9186 pragma_or_attr, TREE_STRING_POINTER (args));
9193 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9194 process attribute ((target ("..."))). */
9197 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9199 struct cl_target_option cur_target;
9202 tree new_target, new_optimize;
9203 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9205 /* If what we're processing is the current pragma string then the
9206 target option node is already stored in target_option_current_node
9207 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9208 having to re-parse the string. This is especially useful to keep
9209 arm_neon.h compile times down since that header contains a lot
9210 of intrinsics enclosed in pragmas. */
9211 if (!existing_target && args == current_target_pragma)
9213 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9216 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9218 old_optimize = build_optimization_node (&global_options);
9219 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9221 /* If the function changed the optimization levels as well as setting
9222 target options, start with the optimizations specified. */
9223 if (func_optimize && func_optimize != old_optimize)
9224 cl_optimization_restore (&global_options,
9225 TREE_OPTIMIZATION (func_optimize));
9227 /* Save the current target options to restore at the end. */
9228 cl_target_option_save (&cur_target, &global_options);
9230 /* If fndecl already has some target attributes applied to it, unpack
9231 them so that we add this attribute on top of them, rather than
9232 overwriting them. */
9233 if (existing_target)
9235 struct cl_target_option *existing_options
9236 = TREE_TARGET_OPTION (existing_target);
9238 if (existing_options)
9239 cl_target_option_restore (&global_options, existing_options);
9242 cl_target_option_restore (&global_options,
9243 TREE_TARGET_OPTION (target_option_current_node));
9246 ret = aarch64_process_target_attr (args, "attribute");
9248 /* Set up any additional state. */
9251 aarch64_override_options_internal (&global_options);
9252 /* Initialize SIMD builtins if we haven't already.
9253 Set current_target_pragma to NULL for the duration so that
9254 the builtin initialization code doesn't try to tag the functions
9255 being built with the attributes specified by any current pragma, thus
9256 going into an infinite recursion. */
9259 tree saved_current_target_pragma = current_target_pragma;
9260 current_target_pragma = NULL;
9261 aarch64_init_simd_builtins ();
9262 current_target_pragma = saved_current_target_pragma;
9264 new_target = build_target_option_node (&global_options);
9269 new_optimize = build_optimization_node (&global_options);
9273 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9275 if (old_optimize != new_optimize)
9276 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9279 cl_target_option_restore (&global_options, &cur_target);
9281 if (old_optimize != new_optimize)
9282 cl_optimization_restore (&global_options,
9283 TREE_OPTIMIZATION (old_optimize));
9287 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9288 tri-bool options (yes, no, don't care) and the default value is
9289 DEF, determine whether inlining should be allowed. */
9292 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9293 int dont_care, int def)
9295 /* If the callee doesn't care, always allow inlining. */
9296 if (callee == dont_care)
9299 /* If the caller doesn't care, always allow inlining. */
9300 if (caller == dont_care)
9303 /* Otherwise, allow inlining if either the callee and caller values
9304 agree, or if the callee is using the default value. */
9305 return (callee == caller || callee == def);
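/* A sketch of the behaviour above, with DONT_CARE being the "don't care"
   encoding (the callers below pass 2) and DEF the configured default:

     callee == DONT_CARE          -> inlining allowed
     caller == DONT_CARE          -> inlining allowed
     callee == caller             -> inlining allowed
     callee == DEF                -> inlining allowed
     anything else                -> inlining rejected  */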
9308 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9309 to inline CALLEE into CALLER based on target-specific info.
9310 Make sure that the caller and callee have compatible architectural
9311 features. Then go through the other possible target attributes
9312 and see if they can block inlining. Try not to reject always_inline
9313 callees unless they are incompatible architecturally. */
9316 aarch64_can_inline_p (tree caller, tree callee)
9318 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9319 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9321 /* If callee has no option attributes, then it is ok to inline. */
9325 struct cl_target_option *caller_opts
9326 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9327 : target_option_default_node);
9329 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9332 /* Callee's ISA flags should be a subset of the caller's. */
9333 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9334 != callee_opts->x_aarch64_isa_flags)
9337 /* Allow non-strict aligned functions inlining into strict
9339 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9340 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9341 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9342 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9345 bool always_inline = lookup_attribute ("always_inline",
9346 DECL_ATTRIBUTES (callee));
9348 /* If the architectural features match up and the callee is always_inline
9349 then the other attributes don't matter. */
9353 if (caller_opts->x_aarch64_cmodel_var
9354 != callee_opts->x_aarch64_cmodel_var)
9357 if (caller_opts->x_aarch64_tls_dialect
9358 != callee_opts->x_aarch64_tls_dialect)
9362 /* Honour explicit requests to work around errata. */
9362 if (!aarch64_tribools_ok_for_inlining_p (
9363 caller_opts->x_aarch64_fix_a53_err835769,
9364 callee_opts->x_aarch64_fix_a53_err835769,
9365 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9368 if (!aarch64_tribools_ok_for_inlining_p (
9369 caller_opts->x_aarch64_fix_a53_err843419,
9370 callee_opts->x_aarch64_fix_a53_err843419,
9371 2, TARGET_FIX_ERR_A53_843419))
9374 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9375 caller and callee and they don't match up, reject inlining. */
9376 if (!aarch64_tribools_ok_for_inlining_p (
9377 caller_opts->x_flag_omit_leaf_frame_pointer,
9378 callee_opts->x_flag_omit_leaf_frame_pointer,
9382 /* If the callee has specific tuning overrides, respect them. */
9383 if (callee_opts->x_aarch64_override_tune_string != NULL
9384 && caller_opts->x_aarch64_override_tune_string == NULL)
9387 /* If the user specified tuning override strings for the
9388 caller and callee and they don't match up, reject inlining.
9389 We just do a string compare here; we don't analyze the meaning
9390 of the string, as it would be too costly for little gain. */
9391 if (callee_opts->x_aarch64_override_tune_string
9392 && caller_opts->x_aarch64_override_tune_string
9393 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9394 caller_opts->x_aarch64_override_tune_string) != 0))
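/* An illustrative example (assuming otherwise default options): a callee
   declared with __attribute__ ((target ("+crypto"))) cannot be inlined
   into a caller built without the crypto extension, since the callee's ISA
   flags are then not a subset of the caller's.  Building the caller with
   -march=armv8-a+crypto, or giving it an equivalent target attribute,
   makes the subset check above succeed.  */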
9400 /* Return true if SYMBOL_REF X binds locally. */
9403 aarch64_symbol_binds_local_p (const_rtx x)
9405 return (SYMBOL_REF_DECL (x)
9406 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9407 : SYMBOL_REF_LOCAL_P (x));
9410 /* Return true if SYMBOL_REF X is thread-local. */
9412 aarch64_tls_symbol_p (rtx x)
9414 if (! TARGET_HAVE_TLS)
9417 if (GET_CODE (x) != SYMBOL_REF)
9420 return SYMBOL_REF_TLS_MODEL (x) != 0;
9423 /* Classify a TLS symbol into one of the TLS kinds. */
9424 enum aarch64_symbol_type
9425 aarch64_classify_tls_symbol (rtx x)
9427 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9431 case TLS_MODEL_GLOBAL_DYNAMIC:
9432 case TLS_MODEL_LOCAL_DYNAMIC:
9433 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9435 case TLS_MODEL_INITIAL_EXEC:
9436 switch (aarch64_cmodel)
9438 case AARCH64_CMODEL_TINY:
9439 case AARCH64_CMODEL_TINY_PIC:
9440 return SYMBOL_TINY_TLSIE;
9442 return SYMBOL_SMALL_TLSIE;
9445 case TLS_MODEL_LOCAL_EXEC:
9446 if (aarch64_tls_size == 12)
9447 return SYMBOL_TLSLE12;
9448 else if (aarch64_tls_size == 24)
9449 return SYMBOL_TLSLE24;
9450 else if (aarch64_tls_size == 32)
9451 return SYMBOL_TLSLE32;
9452 else if (aarch64_tls_size == 48)
9453 return SYMBOL_TLSLE48;
9457 case TLS_MODEL_EMULATED:
9458 case TLS_MODEL_NONE:
9459 return SYMBOL_FORCE_TO_MEM;
9466 /* Return the method that should be used to access SYMBOL_REF or
9469 enum aarch64_symbol_type
9470 aarch64_classify_symbol (rtx x, rtx offset)
9472 if (GET_CODE (x) == LABEL_REF)
9474 switch (aarch64_cmodel)
9476 case AARCH64_CMODEL_LARGE:
9477 return SYMBOL_FORCE_TO_MEM;
9479 case AARCH64_CMODEL_TINY_PIC:
9480 case AARCH64_CMODEL_TINY:
9481 return SYMBOL_TINY_ABSOLUTE;
9483 case AARCH64_CMODEL_SMALL_SPIC:
9484 case AARCH64_CMODEL_SMALL_PIC:
9485 case AARCH64_CMODEL_SMALL:
9486 return SYMBOL_SMALL_ABSOLUTE;
9493 if (GET_CODE (x) == SYMBOL_REF)
9495 if (aarch64_tls_symbol_p (x))
9496 return aarch64_classify_tls_symbol (x);
9498 switch (aarch64_cmodel)
9500 case AARCH64_CMODEL_TINY:
9501 /* When we retrieve a symbol + offset address, we have to make sure
9502 the offset does not cause overflow of the final address. But
9503 we have no way of knowing the address of the symbol at compile time,
9504 so we can't accurately say whether the distance between the PC and
9505 symbol + offset is outside the addressable range of +/-1M in the
9506 TINY code model. So we rely on images not being greater than
9507 1M, cap the offset at 1M, and anything beyond that will have to
9508 be loaded using an alternative mechanism. Furthermore, if the
9509 symbol is a weak reference to something that isn't known to
9510 resolve to a symbol in this module, then force it to memory. */
9511 if ((SYMBOL_REF_WEAK (x)
9512 && !aarch64_symbol_binds_local_p (x))
9513 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9514 return SYMBOL_FORCE_TO_MEM;
9515 return SYMBOL_TINY_ABSOLUTE;
9517 case AARCH64_CMODEL_SMALL:
9518 /* Same reasoning as the tiny code model, but the offset cap here is
9520 if ((SYMBOL_REF_WEAK (x)
9521 && !aarch64_symbol_binds_local_p (x))
9522 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9523 HOST_WIDE_INT_C (4294967264)))
9524 return SYMBOL_FORCE_TO_MEM;
9525 return SYMBOL_SMALL_ABSOLUTE;
9527 case AARCH64_CMODEL_TINY_PIC:
9528 if (!aarch64_symbol_binds_local_p (x))
9529 return SYMBOL_TINY_GOT;
9530 return SYMBOL_TINY_ABSOLUTE;
9532 case AARCH64_CMODEL_SMALL_SPIC:
9533 case AARCH64_CMODEL_SMALL_PIC:
9534 if (!aarch64_symbol_binds_local_p (x))
9535 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9536 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9537 return SYMBOL_SMALL_ABSOLUTE;
9539 case AARCH64_CMODEL_LARGE:
9540 /* This is alright even in PIC code as the constant
9541 pool reference is always PC relative and within
9542 the same translation unit. */
9543 if (CONSTANT_POOL_ADDRESS_P (x))
9544 return SYMBOL_SMALL_ABSOLUTE;
9546 return SYMBOL_FORCE_TO_MEM;
9553 /* By default push everything into the constant pool. */
9554 return SYMBOL_FORCE_TO_MEM;
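/* As a rough illustration of what these classifications mean for code
   generation under the small code model (the exact instruction sequences
   are produced elsewhere and depend on assembler relocation support):

     SYMBOL_SMALL_ABSOLUTE   adrp x0, sym; add x0, x0, :lo12:sym
     SYMBOL_SMALL_GOT_4G     adrp x0, :got:sym; ldr x0, [x0, :got_lo12:sym]
     SYMBOL_FORCE_TO_MEM     the address is loaded from the literal pool.  */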
9558 aarch64_constant_address_p (rtx x)
9560 return (CONSTANT_P (x) && memory_address_p (DImode, x));
9564 aarch64_legitimate_pic_operand_p (rtx x)
9566 if (GET_CODE (x) == SYMBOL_REF
9567 || (GET_CODE (x) == CONST
9568 && GET_CODE (XEXP (x, 0)) == PLUS
9569 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9575 /* Return true if X holds either a quarter-precision floating-point
9576 constant or floating-point +0.0. */
9578 aarch64_valid_floating_const (machine_mode mode, rtx x)
9580 if (!CONST_DOUBLE_P (x))
9583 if (aarch64_float_const_zero_rtx_p (x))
9586 /* Only 0.0 (handled above) can be moved to a TFmode register; other constants must be in SFmode or DFmode. */
9587 if (!(mode == SFmode || mode == DFmode))
9590 return aarch64_float_const_representable_p (x);
9594 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9596 /* Do not allow vector struct mode constants. We could support
9597 0 and -1 easily, but they need support in aarch64-simd.md. */
9598 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9601 /* This could probably go away because
9602 we now decompose CONST_INTs according to expand_mov_immediate. */
9603 if ((GET_CODE (x) == CONST_VECTOR
9604 && aarch64_simd_valid_immediate (x, mode, false, NULL))
9605 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9606 return !targetm.cannot_force_const_mem (mode, x);
9608 if (GET_CODE (x) == HIGH
9609 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9612 return aarch64_constant_address_p (x);
9616 aarch64_load_tp (rtx target)
9619 || GET_MODE (target) != Pmode
9620 || !register_operand (target, Pmode))
9621 target = gen_reg_rtx (Pmode);
9623 /* Can return in any reg. */
9624 emit_insn (gen_aarch64_load_tp_hard (target));
9628 /* On AAPCS systems, this is the "struct __va_list". */
9629 static GTY(()) tree va_list_type;
9631 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
9632 Return the type to use as __builtin_va_list.
9634 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
9646 aarch64_build_builtin_va_list (void)
9649 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9651 /* Create the type. */
9652 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
9653 /* Give it the required name. */
9654 va_list_name = build_decl (BUILTINS_LOCATION,
9656 get_identifier ("__va_list"),
9658 DECL_ARTIFICIAL (va_list_name) = 1;
9659 TYPE_NAME (va_list_type) = va_list_name;
9660 TYPE_STUB_DECL (va_list_type) = va_list_name;
9662 /* Create the fields. */
9663 f_stack = build_decl (BUILTINS_LOCATION,
9664 FIELD_DECL, get_identifier ("__stack"),
9666 f_grtop = build_decl (BUILTINS_LOCATION,
9667 FIELD_DECL, get_identifier ("__gr_top"),
9669 f_vrtop = build_decl (BUILTINS_LOCATION,
9670 FIELD_DECL, get_identifier ("__vr_top"),
9672 f_groff = build_decl (BUILTINS_LOCATION,
9673 FIELD_DECL, get_identifier ("__gr_offs"),
9675 f_vroff = build_decl (BUILTINS_LOCATION,
9676 FIELD_DECL, get_identifier ("__vr_offs"),
9679 /* Tell tree-stdarg pass about our internal offset fields.
9680 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
9681 purposes, to identify whether the code is updating va_list internal
9682 offset fields in an irregular way. */
9683 va_list_gpr_counter_field = f_groff;
9684 va_list_fpr_counter_field = f_vroff;
9686 DECL_ARTIFICIAL (f_stack) = 1;
9687 DECL_ARTIFICIAL (f_grtop) = 1;
9688 DECL_ARTIFICIAL (f_vrtop) = 1;
9689 DECL_ARTIFICIAL (f_groff) = 1;
9690 DECL_ARTIFICIAL (f_vroff) = 1;
9692 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
9693 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
9694 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
9695 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
9696 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
9698 TYPE_FIELDS (va_list_type) = f_stack;
9699 DECL_CHAIN (f_stack) = f_grtop;
9700 DECL_CHAIN (f_grtop) = f_vrtop;
9701 DECL_CHAIN (f_vrtop) = f_groff;
9702 DECL_CHAIN (f_groff) = f_vroff;
9704 /* Compute its layout. */
9705 layout_type (va_list_type);
9707 return va_list_type;
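/* A sketch of the AAPCS64 "__va_list" record built above, with the field
   types following AAPCS64 \S 7.1.4 and the field order matching the
   DECL_CHAIN calls:

     struct __va_list
     {
       void *__stack;     address of the next stacked argument
       void *__gr_top;    one past the end of the GP register save area
       void *__vr_top;    one past the end of the FP/SIMD register save area
       int __gr_offs;     negative offset from __gr_top to the next GP arg
       int __vr_offs;     negative offset from __vr_top to the next VR arg
     };  */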
9710 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
9712 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
9714 const CUMULATIVE_ARGS *cum;
9715 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9716 tree stack, grtop, vrtop, groff, vroff;
9718 int gr_save_area_size = cfun->va_list_gpr_size;
9719 int vr_save_area_size = cfun->va_list_fpr_size;
9722 cum = &crtl->args.info;
9723 if (cfun->va_list_gpr_size)
9724 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
9725 cfun->va_list_gpr_size);
9726 if (cfun->va_list_fpr_size)
9727 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
9728 * UNITS_PER_VREG, cfun->va_list_fpr_size);
9732 gcc_assert (cum->aapcs_nvrn == 0);
9733 vr_save_area_size = 0;
9736 f_stack = TYPE_FIELDS (va_list_type_node);
9737 f_grtop = DECL_CHAIN (f_stack);
9738 f_vrtop = DECL_CHAIN (f_grtop);
9739 f_groff = DECL_CHAIN (f_vrtop);
9740 f_vroff = DECL_CHAIN (f_groff);
9742 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
9744 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
9746 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
9748 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
9750 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
9753 /* Emit code to initialize STACK, which points to the next varargs stack
9754 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
9755 by named arguments. STACK is 8-byte aligned. */
9756 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
9757 if (cum->aapcs_stack_size > 0)
9758 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
9759 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
9760 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9762 /* Emit code to initialize GRTOP, the top of the GR save area.
9763 virtual_incoming_args_rtx should have been 16-byte aligned. */
9764 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
9765 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
9766 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9768 /* Emit code to initialize VRTOP, the top of the VR save area.
9769 This address is gr_save_area_bytes below GRTOP, rounded
9770 down to the next 16-byte boundary. */
9771 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
9772 vr_offset = ROUND_UP (gr_save_area_size,
9773 STACK_BOUNDARY / BITS_PER_UNIT);
9776 t = fold_build_pointer_plus_hwi (t, -vr_offset);
9777 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
9778 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9780 /* Emit code to initialize GROFF, the offset from GRTOP of the
9781 next GPR argument. */
9782 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
9783 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
9784 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9786 /* Likewise emit code to initialize VROFF, the offset from VRTOP
9787 of the next VR argument. */
9788 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
9789 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
9790 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
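/* Putting the pieces above together, va_start (ap) initialises the fields
   roughly as follows (a sketch; the arithmetic is really performed on
   virtual_incoming_args_rtx at expand time):

     ap.__stack   = incoming_args + aapcs_stack_size * UNITS_PER_WORD;
     ap.__gr_top  = incoming_args;
     ap.__vr_top  = incoming_args - ROUND_UP (gr_save_area_size, 16);
     ap.__gr_offs = -gr_save_area_size;
     ap.__vr_offs = -vr_save_area_size;  */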
9793 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
9796 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
9797 gimple_seq *post_p ATTRIBUTE_UNUSED)
9801 bool is_ha; /* is HFA or HVA. */
9802 bool dw_align; /* double-word align. */
9803 machine_mode ag_mode = VOIDmode;
9807 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9808 tree stack, f_top, f_off, off, arg, roundup, on_stack;
9809 HOST_WIDE_INT size, rsize, adjust, align;
9810 tree t, u, cond1, cond2;
9812 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9814 type = build_pointer_type (type);
9816 mode = TYPE_MODE (type);
9818 f_stack = TYPE_FIELDS (va_list_type_node);
9819 f_grtop = DECL_CHAIN (f_stack);
9820 f_vrtop = DECL_CHAIN (f_grtop);
9821 f_groff = DECL_CHAIN (f_vrtop);
9822 f_vroff = DECL_CHAIN (f_groff);
9824 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
9825 f_stack, NULL_TREE);
9826 size = int_size_in_bytes (type);
9827 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
9831 if (aarch64_vfp_is_call_or_return_candidate (mode,
9837 /* TYPE passed in fp/simd registers. */
9839 aarch64_err_no_fpadvsimd (mode, "varargs");
9841 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
9842 unshare_expr (valist), f_vrtop, NULL_TREE);
9843 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
9844 unshare_expr (valist), f_vroff, NULL_TREE);
9846 rsize = nregs * UNITS_PER_VREG;
9850 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
9851 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
9853 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
9854 && size < UNITS_PER_VREG)
9856 adjust = UNITS_PER_VREG - size;
9861 /* TYPE passed in general registers. */
9862 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
9863 unshare_expr (valist), f_grtop, NULL_TREE);
9864 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
9865 unshare_expr (valist), f_groff, NULL_TREE);
9866 rsize = ROUND_UP (size, UNITS_PER_WORD);
9867 nregs = rsize / UNITS_PER_WORD;
9872 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9873 && size < UNITS_PER_WORD)
9875 adjust = UNITS_PER_WORD - size;
9879 /* Get a local temporary for the field value. */
9880 off = get_initialized_tmp_var (f_off, pre_p, NULL);
9882 /* Emit code to branch if off >= 0. */
9883 t = build2 (GE_EXPR, boolean_type_node, off,
9884 build_int_cst (TREE_TYPE (off), 0));
9885 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
9889 /* Emit: offs = (offs + 15) & -16. */
9890 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9891 build_int_cst (TREE_TYPE (off), 15));
9892 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
9893 build_int_cst (TREE_TYPE (off), -16));
9894 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
9899 /* Update ap.__[g|v]r_offs */
9900 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9901 build_int_cst (TREE_TYPE (off), rsize));
9902 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
9906 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9908 /* [cond2] if (ap.__[g|v]r_offs > 0) */
9909 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
9910 build_int_cst (TREE_TYPE (f_off), 0));
9911 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
9913 /* String up: make sure the assignment happens before the use. */
9914 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
9915 COND_EXPR_ELSE (cond1) = t;
9917 /* Prepare the trees handling the argument that is passed on the stack;
9918 the top-level node will be stored in ON_STACK. */
9919 arg = get_initialized_tmp_var (stack, pre_p, NULL);
9922 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
9923 t = fold_convert (intDI_type_node, arg);
9924 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9925 build_int_cst (TREE_TYPE (t), 15));
9926 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9927 build_int_cst (TREE_TYPE (t), -16));
9928 t = fold_convert (TREE_TYPE (arg), t);
9929 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
9933 /* Advance ap.__stack */
9934 t = fold_convert (intDI_type_node, arg);
9935 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9936 build_int_cst (TREE_TYPE (t), size + 7));
9937 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9938 build_int_cst (TREE_TYPE (t), -8));
9939 t = fold_convert (TREE_TYPE (arg), t);
9940 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
9941 /* String up roundup and advance. */
9943 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9944 /* String up with arg */
9945 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
9946 /* Big-endianness related address adjustment. */
9947 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9948 && size < UNITS_PER_WORD)
9950 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
9951 size_int (UNITS_PER_WORD - size));
9952 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
9955 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
9956 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
9958 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
9961 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
9962 build_int_cst (TREE_TYPE (off), adjust));
9964 t = fold_convert (sizetype, t);
9965 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
9969 /* type ha; // treat as "struct {ftype field[n];}"
9970 ... [computing offs]
9971 for (i = 0; i < nregs; ++i, offs += 16)
9972 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
9975 tree tmp_ha, field_t, field_ptr_t;
9977 /* Declare a local variable. */
9978 tmp_ha = create_tmp_var_raw (type, "ha");
9979 gimple_add_tmp_var (tmp_ha);
9981 /* Establish the base type. */
9985 field_t = float_type_node;
9986 field_ptr_t = float_ptr_type_node;
9989 field_t = double_type_node;
9990 field_ptr_t = double_ptr_type_node;
9993 field_t = long_double_type_node;
9994 field_ptr_t = long_double_ptr_type_node;
9997 field_t = aarch64_fp16_type_node;
9998 field_ptr_t = aarch64_fp16_ptr_type_node;
10003 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10004 field_t = build_vector_type_for_mode (innertype, ag_mode);
10005 field_ptr_t = build_pointer_type (field_t);
10012 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
10013 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10015 t = fold_convert (field_ptr_t, addr);
10016 t = build2 (MODIFY_EXPR, field_t,
10017 build1 (INDIRECT_REF, field_t, tmp_ha),
10018 build1 (INDIRECT_REF, field_t, t));
10020 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10021 for (i = 1; i < nregs; ++i)
10023 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10024 u = fold_convert (field_ptr_t, addr);
10025 u = build2 (MODIFY_EXPR, field_t,
10026 build2 (MEM_REF, field_t, tmp_ha,
10027 build_int_cst (field_ptr_t,
10029 int_size_in_bytes (field_t)))),
10030 build1 (INDIRECT_REF, field_t, u));
10031 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10034 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10035 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10038 COND_EXPR_ELSE (cond2) = t;
10039 addr = fold_convert (build_pointer_type (type), cond1);
10040 addr = build_va_arg_indirect_ref (addr);
10043 addr = build_va_arg_indirect_ref (addr);
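/* Viewed at the source level, the tree built above behaves roughly like
   the sketch below for an argument taken from the GP registers (the
   FP/SIMD path is analogous, using __vr_top/__vr_offs and 16-byte units,
   with the extra homogeneous-aggregate copy shown further up; the
   double-word alignment round-up of OFF is omitted here):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;
     ap.__gr_offs = off + rsize;
     if (ap.__gr_offs > 0)
       goto on_stack;
     addr = ap.__gr_top + off;
     goto done;
   on_stack:
     addr = ap.__stack;
     ap.__stack = (ap.__stack + size + 7) & -8;
   done:
     result = *(type *) addr;  */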
10048 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10051 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10052 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10055 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10056 CUMULATIVE_ARGS local_cum;
10057 int gr_saved = cfun->va_list_gpr_size;
10058 int vr_saved = cfun->va_list_fpr_size;
10060 /* The caller has advanced CUM up to, but not beyond, the last named
10061 argument. Advance a local copy of CUM past the last "real" named
10062 argument, to find out how many registers are left over. */
10064 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10066 /* Find out how many registers we need to save.
10067 Honor tree-stdarg analysis results. */
10068 if (cfun->va_list_gpr_size)
10069 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10070 cfun->va_list_gpr_size / UNITS_PER_WORD);
10071 if (cfun->va_list_fpr_size)
10072 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10073 cfun->va_list_fpr_size / UNITS_PER_VREG);
10077 gcc_assert (local_cum.aapcs_nvrn == 0);
10087 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10088 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10089 - gr_saved * UNITS_PER_WORD);
10090 mem = gen_frame_mem (BLKmode, ptr);
10091 set_mem_alias_set (mem, get_varargs_alias_set ());
10093 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10098 /* We can't use move_block_from_reg, because it will use
10099 the wrong mode, storing D regs only. */
10100 machine_mode mode = TImode;
10101 int off, i, vr_start;
10103 /* Set OFF to the offset from virtual_incoming_args_rtx of
10104 the first vector register. The VR save area lies below
10105 the GR one, and is aligned to 16 bytes. */
10106 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10107 STACK_BOUNDARY / BITS_PER_UNIT);
10108 off -= vr_saved * UNITS_PER_VREG;
10110 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10111 for (i = 0; i < vr_saved; ++i)
10115 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10116 mem = gen_frame_mem (mode, ptr);
10117 set_mem_alias_set (mem, get_varargs_alias_set ());
10118 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10119 off += UNITS_PER_VREG;
10124 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10125 any complication of having crtl->args.pretend_args_size changed. */
10126 cfun->machine->frame.saved_varargs_size
10127 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10128 STACK_BOUNDARY / BITS_PER_UNIT)
10129 + vr_saved * UNITS_PER_VREG);
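/* A sketch of the resulting register save area for anonymous arguments
   (higher addresses first):

     virtual_incoming_args_rtx
       GR save area: gr_saved * UNITS_PER_WORD bytes
       padding up to a 16-byte boundary, if needed
       VR save area: vr_saved * UNITS_PER_VREG bytes

   saved_varargs_size recorded above is the total size of this block, with
   the GR part rounded up to STACK_BOUNDARY / BITS_PER_UNIT.  */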
10133 aarch64_conditional_register_usage (void)
10138 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10141 call_used_regs[i] = 1;
10145 /* When tracking speculation, we need a couple of call-clobbered registers
10146 to track the speculation state. It would be nice to just use
10147 IP0 and IP1, but currently there are numerous places that just
10148 assume these registers are free for other uses (e.g. pointer
10149 authentication). */
10150 if (aarch64_track_speculation)
10152 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
10153 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
10154 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
10155 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
10159 /* Walk down the type tree of TYPE counting consecutive base elements.
10160 If *MODEP is VOIDmode, then set it to the first valid floating point
10161 type. If a non-floating point type is found, or if a floating point
10162 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10163 otherwise return the count in the sub-tree. */
10165 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10168 HOST_WIDE_INT size;
10170 switch (TREE_CODE (type))
10173 mode = TYPE_MODE (type);
10174 if (mode != DFmode && mode != SFmode
10175 && mode != TFmode && mode != HFmode)
10178 if (*modep == VOIDmode)
10181 if (*modep == mode)
10187 mode = TYPE_MODE (TREE_TYPE (type));
10188 if (mode != DFmode && mode != SFmode
10189 && mode != TFmode && mode != HFmode)
10192 if (*modep == VOIDmode)
10195 if (*modep == mode)
10201 /* Use V2SImode and V4SImode as representatives of all 64-bit
10202 and 128-bit vector types. */
10203 size = int_size_in_bytes (type);
10216 if (*modep == VOIDmode)
10219 /* Vector modes are considered to be opaque: two vectors are
10220 equivalent for the purposes of being homogeneous aggregates
10221 if they are the same size. */
10222 if (*modep == mode)
10230 tree index = TYPE_DOMAIN (type);
10232 /* Can't handle incomplete types or sizes that are not
10234 if (!COMPLETE_TYPE_P (type)
10235 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10238 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10241 || !TYPE_MAX_VALUE (index)
10242 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10243 || !TYPE_MIN_VALUE (index)
10244 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10248 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10249 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10251 /* There must be no padding. */
10252 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10264 /* Can't handle incomplete types or sizes that are not
10266 if (!COMPLETE_TYPE_P (type)
10267 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10270 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10272 if (TREE_CODE (field) != FIELD_DECL)
10275 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10278 count += sub_count;
10281 /* There must be no padding. */
10282 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10289 case QUAL_UNION_TYPE:
10291 /* These aren't very interesting except in a degenerate case. */
10296 /* Can't handle incomplete types or sizes that are not
10298 if (!COMPLETE_TYPE_P (type)
10299 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10302 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10304 if (TREE_CODE (field) != FIELD_DECL)
10307 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10310 count = count > sub_count ? count : sub_count;
10313 /* There must be no padding. */
10314 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10327 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10328 type as described in AAPCS64 \S 4.1.2.
10330 See the comment above aarch64_composite_type_p for the notes on MODE. */
10333 aarch64_short_vector_p (const_tree type,
10336 HOST_WIDE_INT size = -1;
10338 if (type && TREE_CODE (type) == VECTOR_TYPE)
10339 size = int_size_in_bytes (type);
10340 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10341 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10342 size = GET_MODE_SIZE (mode);
10344 return (size == 8 || size == 16);
10347 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10348 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10349 array types. The C99 floating-point complex types are also considered
10350 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10351 types, which are GCC extensions and out of the scope of AAPCS64, are
10352 treated as composite types here as well.
10354 Note that MODE itself is not sufficient in determining whether a type
10355 is such a composite type or not. This is because
10356 stor-layout.c:compute_record_mode may have already changed the MODE
10357 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10358 structure with only one field may have its MODE set to the mode of the
10359 field. Also an integer mode whose size matches the size of the
10360 RECORD_TYPE type may be used to substitute the original mode
10361 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10362 solely relied on. */
10365 aarch64_composite_type_p (const_tree type,
10368 if (aarch64_short_vector_p (type, mode))
10371 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10374 if (mode == BLKmode
10375 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10376 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10382 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10383 shall be passed or returned in simd/fp register(s) (providing these
10384 parameter passing registers are available).
10386 Upon successful return, *COUNT returns the number of needed registers,
10387 *BASE_MODE returns the mode of the individual register, and when IS_HA
10388 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10389 floating-point aggregate or a homogeneous short-vector aggregate. */
10392 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10394 machine_mode *base_mode,
10398 machine_mode new_mode = VOIDmode;
10399 bool composite_p = aarch64_composite_type_p (type, mode);
10401 if (is_ha != NULL) *is_ha = false;
10403 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10404 || aarch64_short_vector_p (type, mode))
10409 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10411 if (is_ha != NULL) *is_ha = true;
10413 new_mode = GET_MODE_INNER (mode);
10415 else if (type && composite_p)
10417 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10419 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10421 if (is_ha != NULL) *is_ha = true;
10430 *base_mode = new_mode;
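/* A worked example (a sketch, assuming no attributes alter the layout):
   for

     struct point { float x, y, z; };

   aapcs_vfp_sub_candidate returns 3 with *modep == SFmode, so the struct
   is a homogeneous floating-point aggregate: *count is 3, *base_mode is
   SFmode and *is_ha is set, meaning the argument is passed in three
   consecutive FP/SIMD registers when enough of them are available.  */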
10434 /* Implement TARGET_STRUCT_VALUE_RTX. */
10437 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10438 int incoming ATTRIBUTE_UNUSED)
10440 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10443 /* Implements target hook vector_mode_supported_p. */
10445 aarch64_vector_mode_supported_p (machine_mode mode)
10448 && (mode == V4SImode || mode == V8HImode
10449 || mode == V16QImode || mode == V2DImode
10450 || mode == V2SImode || mode == V4HImode
10451 || mode == V8QImode || mode == V2SFmode
10452 || mode == V4SFmode || mode == V2DFmode
10453 || mode == V4HFmode || mode == V8HFmode
10454 || mode == V1DFmode))
10460 /* Return appropriate SIMD container
10461 for MODE within a vector of WIDTH bits. */
10462 static machine_mode
10463 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10465 gcc_assert (width == 64 || width == 128);
10504 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10505 static machine_mode
10506 aarch64_preferred_simd_mode (machine_mode mode)
10508 return aarch64_simd_container_mode (mode, 128);
10511 /* Return the bitmask of possible vector sizes for the vectorizer
10512 to iterate over. */
10513 static unsigned int
10514 aarch64_autovectorize_vector_sizes (void)
10519 /* Implement TARGET_MANGLE_TYPE. */
10521 static const char *
10522 aarch64_mangle_type (const_tree type)
10524 /* The AArch64 ABI documents say that "__va_list" has to be
10525 mangled as if it were in the "std" namespace. */
10526 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10527 return "St9__va_list";
10529 /* Half-precision float. */
10530 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10533 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10535 if (TYPE_NAME (type) != NULL)
10536 return aarch64_mangle_builtin_type (type);
10538 /* Use the default mangling. */
10543 /* Return true if the rtx_insn contains a MEM RTX somewhere
10547 has_memory_op (rtx_insn *mem_insn)
10549 subrtx_iterator::array_type array;
10550 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
10557 /* Find the first rtx_insn before insn that will generate an assembly
10561 aarch64_prev_real_insn (rtx_insn *insn)
10568 insn = prev_real_insn (insn);
10570 while (insn && recog_memoized (insn) < 0);
10576 is_madd_op (enum attr_type t1)
10579 /* A number of these may be AArch32 only. */
10580 enum attr_type mlatypes[] = {
10581 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10582 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10583 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10586 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10588 if (t1 == mlatypes[i])
10595 /* Check if there is a register dependency between a load and the insn
10596 for which we hold recog_data. */
10599 dep_between_memop_and_curr (rtx memop)
10604 gcc_assert (GET_CODE (memop) == SET);
10606 if (!REG_P (SET_DEST (memop)))
10609 load_reg = SET_DEST (memop);
10610 for (opno = 1; opno < recog_data.n_operands; opno++)
10612 rtx operand = recog_data.operand[opno];
10613 if (REG_P (operand)
10614 && reg_overlap_mentioned_p (load_reg, operand))
10622 /* When working around the Cortex-A53 erratum 835769,
10623 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10624 instruction and has a preceding memory instruction such that a NOP
10625 should be inserted between them. */
10628 aarch64_madd_needs_nop (rtx_insn* insn)
10630 enum attr_type attr_type;
10634 if (!TARGET_FIX_ERR_A53_835769)
10637 if (!INSN_P (insn) || recog_memoized (insn) < 0)
10640 attr_type = get_attr_type (insn);
10641 if (!is_madd_op (attr_type))
10644 prev = aarch64_prev_real_insn (insn);
10645 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10646 Restore recog state to INSN to avoid state corruption. */
10647 extract_constrain_insn_cached (insn);
10649 if (!prev || !has_memory_op (prev))
10652 body = single_set (prev);
10654 /* If the previous insn is a memory op and there is no dependency between
10655 it and the DImode madd, emit a NOP between them. If body is NULL then we
10656 have a complex memory operation, probably a load/store pair.
10657 Be conservative for now and emit a NOP. */
10658 if (GET_MODE (recog_data.operand[0]) == DImode
10659 && (!body || !dep_between_memop_and_curr (body)))
10667 /* Implement FINAL_PRESCAN_INSN. */
10670 aarch64_final_prescan_insn (rtx_insn *insn)
10672 if (aarch64_madd_needs_nop (insn))
10673 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
10677 /* Return the equivalent letter for size. */
10679 sizetochar (int size)
10683 case 64: return 'd';
10684 case 32: return 's';
10685 case 16: return 'h';
10686 case 8 : return 'b';
10687 default: gcc_unreachable ();
10691 /* Return true iff x is a uniform vector of floating-point
10692 constants, and the constant can be represented in
10693 quarter-precision form. Note that, as aarch64_float_const_representable_p
10694 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
10696 aarch64_vect_float_const_representable_p (rtx x)
10699 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10700 && const_vec_duplicate_p (x, &elt)
10701 && aarch64_float_const_representable_p (elt));
10704 /* Return true if OP is a valid SIMD immediate for MODE, false otherwise; when INFO is nonnull, describe a valid immediate in *INFO. */
10706 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
10707 struct simd_immediate_info *info)
10709 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
10711 for (i = 0; i < idx; i += (STRIDE)) \
10716 immtype = (CLASS); \
10717 elsize = (ELSIZE); \
10718 eshift = (SHIFT); \
10723 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
10724 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
10725 unsigned char bytes[16];
10726 int immtype = -1, matches;
10727 unsigned int invmask = inverse ? 0xff : 0;
10730 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10732 if (! (aarch64_simd_imm_zero_p (op, mode)
10733 || aarch64_vect_float_const_representable_p (op)))
10738 info->value = CONST_VECTOR_ELT (op, 0);
10739 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
10747 /* Splat vector constant out into a byte vector. */
10748 for (i = 0; i < n_elts; i++)
10750 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
10751 it must be laid out in the vector register in reverse order. */
10752 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
10753 unsigned HOST_WIDE_INT elpart;
10755 gcc_assert (CONST_INT_P (el));
10756 elpart = INTVAL (el);
10758 for (unsigned int byte = 0; byte < innersize; byte++)
10760 bytes[idx++] = (elpart & 0xff) ^ invmask;
10761 elpart >>= BITS_PER_UNIT;
10766 /* Sanity check. */
10767 gcc_assert (idx == GET_MODE_SIZE (mode));
10771 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
10772 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
10774 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10775 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10777 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
10778 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10780 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
10781 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
10783 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
10785 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
10787 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
10788 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
10790 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10791 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10793 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
10794 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10796 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
10797 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
10799 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
10801 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
10803 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10804 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10806 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10807 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10809 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
10810 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10812 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
10813 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10815 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
10817 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
10818 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
10827 info->element_width = elsize;
10828 info->mvn = emvn != 0;
10829 info->shift = eshift;
10831 unsigned HOST_WIDE_INT imm = 0;
10833 if (immtype >= 12 && immtype <= 15)
10836 /* Un-invert bytes of recognized vector, if necessary. */
10838 for (i = 0; i < idx; i++)
10839 bytes[i] ^= invmask;
10843 /* FIXME: Broken on 32-bit H_W_I hosts. */
10844 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
10846 for (i = 0; i < 8; i++)
10847 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
10848 << (i * BITS_PER_UNIT);
10851 info->value = GEN_INT (imm);
10855 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
10856 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
10858 /* Construct 'abcdefgh' because the assembler cannot handle
10859 generic constants. */
10862 imm = (imm >> info->shift) & 0xff;
10863 info->value = GEN_INT (imm);
10871 /* Check if immediate shift constants are within range. */
10873 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
10875 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
10877 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
10879 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
10882 /* Return true if X is a uniform vector where all elements
10883 are either the floating-point constant 0.0 or the
10884 integer constant 0. */
10886 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
10888 return x == CONST0_RTX (mode);
10892 /* Return the bitmask CONST_INT to select the bits required by a zero extract
10893 operation of width WIDTH at bit position POS. */
10896 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
10898 gcc_assert (CONST_INT_P (width));
10899 gcc_assert (CONST_INT_P (pos));
10901 unsigned HOST_WIDE_INT mask
10902 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
10903 return GEN_INT (mask << UINTVAL (pos));
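/* For example, WIDTH == 8 and POS == 16 gives
   ((1 << 8) - 1) << 16 == 0xff0000, i.e. the mask selecting bits 16-23.  */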
10907 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
10909 HOST_WIDE_INT imm = INTVAL (x);
10912 for (i = 0; i < 8; i++)
10914 unsigned int byte = imm & 0xff;
10915 if (byte != 0xff && byte != 0)
10924 aarch64_mov_operand_p (rtx x, machine_mode mode)
10926 if (GET_CODE (x) == HIGH
10927 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10930 if (CONST_INT_P (x))
10933 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
10936 return aarch64_classify_symbolic_expression (x)
10937 == SYMBOL_TINY_ABSOLUTE;
10940 /* Return a const_int vector of VAL. */
10942 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
10944 int nunits = GET_MODE_NUNITS (mode);
10945 rtvec v = rtvec_alloc (nunits);
10948 for (i = 0; i < nunits; i++)
10949 RTVEC_ELT (v, i) = GEN_INT (val);
10951 return gen_rtx_CONST_VECTOR (mode, v);
10954 /* Check OP is a legal scalar immediate for the MOVI instruction. */
10957 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
10959 machine_mode vmode;
10961 gcc_assert (!VECTOR_MODE_P (mode));
10962 vmode = aarch64_preferred_simd_mode (mode);
10963 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
10964 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
10967 /* Construct and return a PARALLEL RTX vector with elements numbering the
10968 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
10969 the vector - from the perspective of the architecture. This does not
10970 line up with GCC's perspective on lane numbers, so we end up with
10971 different masks depending on our target endian-ness. The diagram
10972 below may help. We must draw the distinction when building masks
10973 which select one half of the vector. An instruction selecting
10974 architectural low-lanes for a big-endian target, must be described using
10975 a mask selecting GCC high-lanes.
10977 Big-Endian Little-Endian
10979 GCC 0 1 2 3 3 2 1 0
10980 | x | x | x | x | | x | x | x | x |
10981 Architecture 3 2 1 0 3 2 1 0
10983 Low Mask: { 2, 3 } { 0, 1 }
10984 High Mask: { 0, 1 } { 2, 3 }
10988 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
10990 int nunits = GET_MODE_NUNITS (mode);
10991 rtvec v = rtvec_alloc (nunits / 2);
10992 int high_base = nunits / 2;
10998 if (BYTES_BIG_ENDIAN)
10999 base = high ? low_base : high_base;
11001 base = high ? high_base : low_base;
11003 for (i = 0; i < nunits / 2; i++)
11004 RTVEC_ELT (v, i) = GEN_INT (base + i);
11006 t1 = gen_rtx_PARALLEL (mode, v);
11010 /* Check OP for validity as a PARALLEL RTX vector with elements
11011 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11012 from the perspective of the architecture. See the diagram above
11013 aarch64_simd_vect_par_cnst_half for more details. */
11016 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11019 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11020 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11021 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11024 if (!VECTOR_MODE_P (mode))
11027 if (count_op != count_ideal)
11030 for (i = 0; i < count_ideal; i++)
11032 rtx elt_op = XVECEXP (op, 0, i);
11033 rtx elt_ideal = XVECEXP (ideal, 0, i);
11035 if (!CONST_INT_P (elt_op)
11036 || INTVAL (elt_ideal) != INTVAL (elt_op))
11042 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11043 HIGH (exclusive). */
11045 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11048 HOST_WIDE_INT lane;
11049 gcc_assert (CONST_INT_P (operand));
11050 lane = INTVAL (operand);
11052 if (lane < low || lane >= high)
11055 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11057 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11061 /* Return TRUE if OP is a valid vector addressing mode. */
11063 aarch64_simd_mem_operand_p (rtx op)
11065 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11066 || REG_P (XEXP (op, 0)));
11069 /* Emit a register copy from operand to operand, taking care not to
11070 early-clobber source registers in the process.
11072 COUNT is the number of components into which the copy needs to be
11075 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
11076 unsigned int count)
11079 int rdest = REGNO (operands[0]);
11080 int rsrc = REGNO (operands[1]);
11082 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11084 for (i = 0; i < count; i++)
11085 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11086 gen_rtx_REG (mode, rsrc + i));
11088 for (i = 0; i < count; i++)
11089 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11090 gen_rtx_REG (mode, rsrc + count - i - 1));
11093 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11094 one of VSTRUCT modes: OI, CI, or XI. */
11096 aarch64_simd_attr_length_rglist (enum machine_mode mode)
11098 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11101 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11102 alignment of a vector to 128 bits. */
11103 static HOST_WIDE_INT
11104 aarch64_simd_vector_alignment (const_tree type)
11106 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11107 return MIN (align, 128);
11110 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11112 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11117 /* We guarantee alignment for vectors up to 128-bits. */
11118 if (tree_int_cst_compare (TYPE_SIZE (type),
11119 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11122 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11126 /* If VALS is a vector constant that can be loaded into a register
11127 using DUP, generate instructions to do so and return an RTX to
11128 assign to the register. Otherwise return NULL_RTX. */
11130 aarch64_simd_dup_constant (rtx vals)
11132 machine_mode mode = GET_MODE (vals);
11133 machine_mode inner_mode = GET_MODE_INNER (mode);
11136 if (!const_vec_duplicate_p (vals, &x))
11139 /* We can load this constant by using DUP and a constant in a
11140 single ARM register. This will be cheaper than a vector
11142 x = copy_to_mode_reg (inner_mode, x);
11143 return gen_rtx_VEC_DUPLICATE (mode, x);
11147 /* Generate code to load VALS, which is a PARALLEL containing only
11148 constants (for vec_init) or CONST_VECTOR, efficiently into a
11149 register. Returns an RTX to copy into the register, or NULL_RTX
11150 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
11152 aarch64_simd_make_constant (rtx vals)
11154 machine_mode mode = GET_MODE (vals);
11156 rtx const_vec = NULL_RTX;
11157 int n_elts = GET_MODE_NUNITS (mode);
11161 if (GET_CODE (vals) == CONST_VECTOR)
11163 else if (GET_CODE (vals) == PARALLEL)
11165 /* A CONST_VECTOR must contain only CONST_INTs and
11166 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11167 Only store valid constants in a CONST_VECTOR. */
11168 for (i = 0; i < n_elts; ++i)
11170 rtx x = XVECEXP (vals, 0, i);
11171 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11174 if (n_const == n_elts)
11175 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11178 gcc_unreachable ();
11180 if (const_vec != NULL_RTX
11181 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11182 /* Load using MOVI/MVNI. */
11184 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11185 /* Loaded using DUP. */
11187 else if (const_vec != NULL_RTX)
11188 /* Load from constant pool. We cannot take advantage of single-cycle
11189 LD1 because we need a PC-relative addressing mode. */
11192 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11193 We cannot construct an initializer. */
11197 /* Expand a vector initialisation sequence, such that TARGET is
11198 initialised to contain VALS. */
11201 aarch64_expand_vector_init (rtx target, rtx vals)
11203 machine_mode mode = GET_MODE (target);
11204 machine_mode inner_mode = GET_MODE_INNER (mode);
11205 /* The number of vector elements. */
11206 int n_elts = GET_MODE_NUNITS (mode);
11207 /* The number of vector elements which are not constant. */
11209 rtx any_const = NULL_RTX;
11210 /* The first element of vals. */
11211 rtx v0 = XVECEXP (vals, 0, 0);
11212 bool all_same = true;
11214 /* Count the number of variable elements to initialise. */
11215 for (int i = 0; i < n_elts; ++i)
11217 rtx x = XVECEXP (vals, 0, i);
11218 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11223 all_same &= rtx_equal_p (x, v0);
11226 /* No variable elements; hand off to aarch64_simd_make_constant, which knows
11227 how best to handle this. */
11230 rtx constant = aarch64_simd_make_constant (vals);
11231 if (constant != NULL_RTX)
11233 emit_move_insn (target, constant);
11238 /* Splat a single non-constant element if we can. */
11241 rtx x = copy_to_mode_reg (inner_mode, v0);
11242 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11246 /* Initialise a vector which is part-variable. We want to first try
11247 to build those lanes which are constant in the most efficient way we
11249 if (n_var != n_elts)
11251 rtx copy = copy_rtx (vals);
11253 /* Load constant part of vector. We really don't care what goes into the
11254 parts we will overwrite, but we're more likely to be able to load the
11255 constant efficiently if it has fewer, larger, repeating parts
11256 (see aarch64_simd_valid_immediate). */
11257 for (int i = 0; i < n_elts; i++)
11259 rtx x = XVECEXP (vals, 0, i);
11260 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11262 rtx subst = any_const;
11263 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11265 /* Look in the copied vector, as more elements are const. */
11266 rtx test = XVECEXP (copy, 0, i ^ bit);
11267 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11273 XVECEXP (copy, 0, i) = subst;
11275 aarch64_expand_vector_init (target, copy);
11278 /* Insert the variable lanes directly. */
11280 enum insn_code icode = optab_handler (vec_set_optab, mode);
11281 gcc_assert (icode != CODE_FOR_nothing);
11283 for (int i = 0; i < n_elts; i++)
11285 rtx x = XVECEXP (vals, 0, i);
11286 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11288 x = copy_to_mode_reg (inner_mode, x);
11289 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
11293 static unsigned HOST_WIDE_INT
11294 aarch64_shift_truncation_mask (machine_mode mode)
11297 (!SHIFT_COUNT_TRUNCATED
11298 || aarch64_vector_mode_supported_p (mode)
11299 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11302 /* Select a format to encode pointers in exception handling data. */
11304 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11307 switch (aarch64_cmodel)
11309 case AARCH64_CMODEL_TINY:
11310 case AARCH64_CMODEL_TINY_PIC:
11311 case AARCH64_CMODEL_SMALL:
11312 case AARCH64_CMODEL_SMALL_PIC:
11313 case AARCH64_CMODEL_SMALL_SPIC:
11314 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11316 type = DW_EH_PE_sdata4;
11319 /* No assumptions here. 8-byte relocs required. */
11320 type = DW_EH_PE_sdata8;
11323 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11326 /* The last .arch and .tune assembly strings that we printed. */
11327 static std::string aarch64_last_printed_arch_string;
11328 static std::string aarch64_last_printed_tune_string;
11330 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11331 by the function fndecl. */
11334 aarch64_declare_function_name (FILE *stream, const char* name,
11337 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11339 struct cl_target_option *targ_options;
11341 targ_options = TREE_TARGET_OPTION (target_parts);
11343 targ_options = TREE_TARGET_OPTION (target_option_current_node);
11344 gcc_assert (targ_options);
11346 const struct processor *this_arch
11347 = aarch64_get_arch (targ_options->x_explicit_arch);
11349 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11350 std::string extension
11351 = aarch64_get_extension_string_for_isa_flags (isa_flags,
11353 /* Only update the assembler .arch string if it is distinct from the last
11354 such string we printed. */
11355 std::string to_print = this_arch->name + extension;
11356 if (to_print != aarch64_last_printed_arch_string)
11358 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11359 aarch64_last_printed_arch_string = to_print;
11362 /* Print the cpu name we're tuning for in the comments; it might be
11363 useful to readers of the generated asm. Do it only when it changes
11364 from function to function and verbose assembly is requested. */
11365 const struct processor *this_tune
11366 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11368 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11370 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11372 aarch64_last_printed_tune_string = this_tune->name;
11375 /* Don't forget the type directive for ELF. */
11376 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11377 ASM_OUTPUT_LABEL (stream, name);
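/* Illustrative output only (the exact directives come from the target
   macros): for a unit compiled with -march=armv8-a containing one function
   that carries __attribute__ ((target ("arch=armv8-a+crc"))), the assembly
   around that function would switch architecture roughly as:

       .arch armv8-a+crc
       .type foo, %function
     foo:

   and the .arch directive is only re-emitted when the computed string
   differs from aarch64_last_printed_arch_string.  "foo" is a placeholder
   name.  */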
11380 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11383 aarch64_start_file (void)
11385 struct cl_target_option *default_options
11386 = TREE_TARGET_OPTION (target_option_default_node);
11388 const struct processor *default_arch
11389 = aarch64_get_arch (default_options->x_explicit_arch);
11390 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11391 std::string extension
11392 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11393 default_arch->flags);
11395 aarch64_last_printed_arch_string = default_arch->name + extension;
11396 aarch64_last_printed_tune_string = "";
11397 asm_fprintf (asm_out_file, "\t.arch %s\n",
11398 aarch64_last_printed_arch_string.c_str ());
11400 default_file_start ();
11403 /* Emit load exclusive. */
11406 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11407 rtx mem, rtx model_rtx)
11409 rtx (*gen) (rtx, rtx, rtx);
11413 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11414 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11415 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11416 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11418 gcc_unreachable ();
11421 emit_insn (gen (rval, mem, model_rtx));
11424 /* Emit store exclusive. */
11427 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11428 rtx rval, rtx mem, rtx model_rtx)
11430 rtx (*gen) (rtx, rtx, rtx, rtx);
11434 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11435 case HImode: gen = gen_aarch64_store_exclusivehi; break;
11436 case SImode: gen = gen_aarch64_store_exclusivesi; break;
11437 case DImode: gen = gen_aarch64_store_exclusivedi; break;
11439 gcc_unreachable ();
11442 emit_insn (gen (bval, rval, mem, model_rtx));
11445 /* Mark the previous jump instruction as unlikely. */
11448 aarch64_emit_unlikely_jump (rtx insn)
11450 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11452 insn = emit_jump_insn (insn);
11453 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
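/* For reference, REG_BR_PROB_BASE is 10000, so VERY_UNLIKELY above is 99,
   i.e. the jump is annotated as taken with probability just under 1%.  */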
11456 /* Expand a compare and swap pattern. */
11459 aarch64_expand_compare_and_swap (rtx operands[])
11461 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11462 machine_mode mode, cmp_mode;
11463 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11466 const gen_cas_fn split_cas[] =
11468 gen_aarch64_compare_and_swapqi,
11469 gen_aarch64_compare_and_swaphi,
11470 gen_aarch64_compare_and_swapsi,
11471 gen_aarch64_compare_and_swapdi
11473 const gen_cas_fn atomic_cas[] =
11475 gen_aarch64_compare_and_swapqi_lse,
11476 gen_aarch64_compare_and_swaphi_lse,
11477 gen_aarch64_compare_and_swapsi_lse,
11478 gen_aarch64_compare_and_swapdi_lse
11481 bval = operands[0];
11482 rval = operands[1];
11484 oldval = operands[3];
11485 newval = operands[4];
11486 is_weak = operands[5];
11487 mod_s = operands[6];
11488 mod_f = operands[7];
11489 mode = GET_MODE (mem);
11492 /* Normally the succ memory model must be stronger than fail, but in the
11493 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11494 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11496 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11497 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11498 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11504 /* For short modes, we're going to perform the comparison in SImode,
11505 so do the zero-extension now. */
11507 rval = gen_reg_rtx (SImode);
11508 oldval = convert_modes (SImode, mode, oldval, true);
11509 /* Fall through. */
11513 /* Force the value into a register if needed. */
11514 if (!aarch64_plus_operand (oldval, mode))
11515 oldval = force_reg (cmp_mode, oldval);
11519 gcc_unreachable ();
11524 case QImode: idx = 0; break;
11525 case HImode: idx = 1; break;
11526 case SImode: idx = 2; break;
11527 case DImode: idx = 3; break;
11529 gcc_unreachable ();
11532 gen = atomic_cas[idx];
11534 gen = split_cas[idx];
11536 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11538 if (mode == QImode || mode == HImode)
11539 emit_move_insn (operands[1], gen_lowpart (mode, rval));
11541 x = gen_rtx_REG (CCmode, CC_REGNUM);
11542 x = gen_rtx_EQ (SImode, x, const0_rtx);
11543 emit_insn (gen_rtx_SET (bval, x));
11546 /* Test whether the target supports using an atomic load-operate instruction.
11547 CODE is the operation and AFTER is TRUE if the data in memory after the
11548 operation should be returned and FALSE if the data before the operation
11549 should be returned.  Returns FALSE if the operation isn't supported by the
11550 architecture.  */
11553 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11572 /* Emit a barrier that is appropriate for memory model MODEL, at the end of a
11573 sequence implementing an atomic operation. */
11576 aarch64_emit_post_barrier (enum memmodel model)
11578 const enum memmodel base_model = memmodel_base (model);
11580 if (is_mm_sync (model)
11581 && (base_model == MEMMODEL_ACQUIRE
11582 || base_model == MEMMODEL_ACQ_REL
11583 || base_model == MEMMODEL_SEQ_CST))
11585 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
11589 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
11590 for the data in memory. EXPECTED is the value expected to be in memory.
11591 DESIRED is the value to store to memory. MEM is the memory location. MODEL
11592 is the memory ordering to use. */
11595 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11596 rtx expected, rtx desired,
11599 rtx (*gen) (rtx, rtx, rtx, rtx);
11602 mode = GET_MODE (mem);
11606 case QImode: gen = gen_aarch64_atomic_casqi; break;
11607 case HImode: gen = gen_aarch64_atomic_cashi; break;
11608 case SImode: gen = gen_aarch64_atomic_cassi; break;
11609 case DImode: gen = gen_aarch64_atomic_casdi; break;
11611 gcc_unreachable ();
11614 /* Move the expected value into the CAS destination register. */
11615 emit_insn (gen_rtx_SET (rval, expected));
11617 /* Emit the CAS. */
11618 emit_insn (gen (rval, mem, desired, model));
11620 /* Compare the expected value with the value loaded by the CAS, to establish
11621 whether the swap was made. */
11622 aarch64_gen_compare_reg (EQ, rval, expected);
11625 /* Split a compare and swap pattern. */
11628 aarch64_split_compare_and_swap (rtx operands[])
11630 rtx rval, mem, oldval, newval, scratch;
11633 rtx_code_label *label1, *label2;
11635 enum memmodel model;
11638 rval = operands[0];
11640 oldval = operands[2];
11641 newval = operands[3];
11642 is_weak = (operands[4] != const0_rtx);
11643 model_rtx = operands[5];
11644 scratch = operands[7];
11645 mode = GET_MODE (mem);
11646 model = memmodel_from_int (INTVAL (model_rtx));
11648 /* When OLDVAL is zero and we want the strong version we can emit a tighter
11649 loop:
11650 .label1:
11651 LD[A]XR rval, [mem]
11652 CBNZ rval, .label2
11653 ST[L]XR scratch, newval, [mem]
11654 CBNZ scratch, .label1
11655 .label2:
11656 CMP rval, 0.  */
11657 bool strong_zero_p = !is_weak && oldval == const0_rtx;
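/* Rationale: with a zero OLDVAL the loaded value can be tested directly with
   CBNZ, so the separate compare against OLDVAL and its conditional branch are
   not needed inside the loop; the explicit CMP emitted after label2 merely
   recreates the condition flags for the caller.  */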
11662 label1 = gen_label_rtx ();
11663 emit_label (label1);
11665 label2 = gen_label_rtx ();
11667 /* The initial load can be relaxed for a __sync operation since a final
11668 barrier will be emitted to stop code hoisting. */
11669 if (is_mm_sync (model))
11670 aarch64_emit_load_exclusive (mode, rval, mem,
11671 GEN_INT (MEMMODEL_RELAXED));
11673 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
11677 if (aarch64_track_speculation)
11679 /* Emit an explicit compare instruction, so that we can correctly
11680 track the condition codes. */
11681 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
11682 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
11685 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
11687 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11688 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11689 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11693 cond = aarch64_gen_compare_reg (NE, rval, oldval);
11694 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11695 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11696 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11697 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11700 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
11704 if (aarch64_track_speculation)
11706 /* Emit an explicit compare instruction, so that we can correctly
11707 track the condition codes. */
11708 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
11709 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
11712 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
11714 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11715 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
11716 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11720 cond = gen_rtx_REG (CCmode, CC_REGNUM);
11721 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
11722 emit_insn (gen_rtx_SET (cond, x));
11725 emit_label (label2);
11726 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
11727 to set the condition flags. If this is not used it will be removed by
11731 cond = gen_rtx_REG (CCmode, CC_REGNUM);
11732 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
11733 emit_insn (gen_rtx_SET (cond, x));
11735 /* Emit any final barrier needed for a __sync operation. */
11736 if (is_mm_sync (model))
11737 aarch64_emit_post_barrier (model);
11740 /* Emit a BIC instruction. */
11743 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
11745 rtx shift_rtx = GEN_INT (shift);
11746 rtx (*gen) (rtx, rtx, rtx, rtx);
11750 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
11751 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
11753 gcc_unreachable ();
11756 emit_insn (gen (dst, s2, shift_rtx, s1));
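/* A sketch of the effect, assuming the and_one_cmpl_lshr patterns compute
   DST = S1 & ~(S2 >> SHIFT): aarch64_emit_bic (SImode, d, a, b, 0) therefore
   emits d = a & ~b, which is how the A & ~B result of the atomic CLR
   operations below is reconstructed in registers.  */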
11759 /* Emit an atomic swap. */
11762 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
11763 rtx mem, rtx model)
11765 rtx (*gen) (rtx, rtx, rtx, rtx);
11769 case QImode: gen = gen_aarch64_atomic_swpqi; break;
11770 case HImode: gen = gen_aarch64_atomic_swphi; break;
11771 case SImode: gen = gen_aarch64_atomic_swpsi; break;
11772 case DImode: gen = gen_aarch64_atomic_swpdi; break;
11774 gcc_unreachable ();
11777 emit_insn (gen (dst, mem, value, model));
11780 /* Operations supported by aarch64_emit_atomic_load_op. */
11782 enum aarch64_atomic_load_op_code
11784 AARCH64_LDOP_PLUS, /* A + B */
11785 AARCH64_LDOP_XOR, /* A ^ B */
11786 AARCH64_LDOP_OR, /* A | B */
11787 AARCH64_LDOP_BIC /* A & ~B */
11790 /* Emit an atomic load-operate. */
11793 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
11794 machine_mode mode, rtx dst, rtx src,
11795 rtx mem, rtx model)
11797 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
11798 const aarch64_atomic_load_op_fn plus[] =
11800 gen_aarch64_atomic_loadaddqi,
11801 gen_aarch64_atomic_loadaddhi,
11802 gen_aarch64_atomic_loadaddsi,
11803 gen_aarch64_atomic_loadadddi
11805 const aarch64_atomic_load_op_fn eor[] =
11807 gen_aarch64_atomic_loadeorqi,
11808 gen_aarch64_atomic_loadeorhi,
11809 gen_aarch64_atomic_loadeorsi,
11810 gen_aarch64_atomic_loadeordi
11812 const aarch64_atomic_load_op_fn ior[] =
11814 gen_aarch64_atomic_loadsetqi,
11815 gen_aarch64_atomic_loadsethi,
11816 gen_aarch64_atomic_loadsetsi,
11817 gen_aarch64_atomic_loadsetdi
11819 const aarch64_atomic_load_op_fn bic[] =
11821 gen_aarch64_atomic_loadclrqi,
11822 gen_aarch64_atomic_loadclrhi,
11823 gen_aarch64_atomic_loadclrsi,
11824 gen_aarch64_atomic_loadclrdi
11826 aarch64_atomic_load_op_fn gen;
11831 case QImode: idx = 0; break;
11832 case HImode: idx = 1; break;
11833 case SImode: idx = 2; break;
11834 case DImode: idx = 3; break;
11836 gcc_unreachable ();
11841 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
11842 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
11843 case AARCH64_LDOP_OR: gen = ior[idx]; break;
11844 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
11846 gcc_unreachable ();
11849 emit_insn (gen (dst, mem, src, model));
11852 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
11853 location to store the data read from memory. OUT_RESULT is the location to
11854 store the result of the operation. MEM is the memory location to read and
11855 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
11856 operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
11857 be NULL.  */
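/* Roughly, SET becomes a plain SWP, MINUS is negated and handled as PLUS
   (LDADD), and AND is inverted and handled as BIC (LDCLR), since
   A & V == A & ~(~V); see the preprocessing of SRC below.  */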
11860 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
11861 rtx mem, rtx value, rtx model_rtx)
11863 machine_mode mode = GET_MODE (mem);
11864 machine_mode wmode = (mode == DImode ? DImode : SImode);
11865 const bool short_mode = (mode < SImode);
11866 aarch64_atomic_load_op_code ldop_code;
11871 out_data = gen_lowpart (mode, out_data);
11874 out_result = gen_lowpart (mode, out_result);
11876 /* Make sure the value is in a register, putting it into a destination
11877 register if it needs to be manipulated. */
11878 if (!register_operand (value, mode)
11879 || code == AND || code == MINUS)
11881 src = out_result ? out_result : out_data;
11882 emit_move_insn (src, gen_lowpart (mode, value));
11886 gcc_assert (register_operand (src, mode));
11888 /* Preprocess the data for the operation as necessary. If the operation is
11889 a SET then emit a swap instruction and finish. */
11893 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
11897 /* Negate the value and treat it as a PLUS. */
11901 /* Resize the value if necessary. */
11903 src = gen_lowpart (wmode, src);
11905 neg_src = gen_rtx_NEG (wmode, src);
11906 emit_insn (gen_rtx_SET (src, neg_src));
11909 src = gen_lowpart (mode, src);
11911 /* Fall-through. */
11913 ldop_code = AARCH64_LDOP_PLUS;
11917 ldop_code = AARCH64_LDOP_OR;
11921 ldop_code = AARCH64_LDOP_XOR;
11928 /* Resize the value if necessary. */
11930 src = gen_lowpart (wmode, src);
11932 not_src = gen_rtx_NOT (wmode, src);
11933 emit_insn (gen_rtx_SET (src, not_src));
11936 src = gen_lowpart (mode, src);
11938 ldop_code = AARCH64_LDOP_BIC;
11942 /* The operation can't be done with atomic instructions. */
11943 gcc_unreachable ();
11946 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
11948 /* If necessary, calculate the data in memory after the update by redoing the
11949 operation from values in registers. */
11955 src = gen_lowpart (wmode, src);
11956 out_data = gen_lowpart (wmode, out_data);
11957 out_result = gen_lowpart (wmode, out_result);
11966 x = gen_rtx_PLUS (wmode, out_data, src);
11969 x = gen_rtx_IOR (wmode, out_data, src);
11972 x = gen_rtx_XOR (wmode, out_data, src);
11975 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
11978 gcc_unreachable ();
11981 emit_set_insn (out_result, x);
11986 /* Split an atomic operation. */
11989 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
11990 rtx value, rtx model_rtx, rtx cond)
11992 machine_mode mode = GET_MODE (mem);
11993 machine_mode wmode = (mode == DImode ? DImode : SImode);
11994 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
11995 const bool is_sync = is_mm_sync (model);
11996 rtx_code_label *label;
11999 /* Split the atomic operation into a sequence. */
12000 label = gen_label_rtx ();
12001 emit_label (label);
12004 new_out = gen_lowpart (wmode, new_out);
12006 old_out = gen_lowpart (wmode, old_out);
12009 value = simplify_gen_subreg (wmode, value, mode, 0);
12011 /* The initial load can be relaxed for a __sync operation since a final
12012 barrier will be emitted to stop code hoisting. */
12014 aarch64_emit_load_exclusive (mode, old_out, mem,
12015 GEN_INT (MEMMODEL_RELAXED));
12017 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12026 x = gen_rtx_AND (wmode, old_out, value);
12027 emit_insn (gen_rtx_SET (new_out, x));
12028 x = gen_rtx_NOT (wmode, new_out);
12029 emit_insn (gen_rtx_SET (new_out, x));
12033 if (CONST_INT_P (value))
12035 value = GEN_INT (-INTVAL (value));
12038 /* Fall through. */
12041 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12042 emit_insn (gen_rtx_SET (new_out, x));
12046 aarch64_emit_store_exclusive (mode, cond, mem,
12047 gen_lowpart (mode, new_out), model_rtx);
12049 if (aarch64_track_speculation)
12051 /* Emit an explicit compare instruction, so that we can correctly
12052 track the condition codes. */
12053 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
12054 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
12057 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12059 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12060 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12061 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12063 /* Emit any final barrier needed for a __sync operation. */
12065 aarch64_emit_post_barrier (model);
12069 aarch64_init_libfuncs (void)
12071 /* Half-precision float operations. The compiler handles all operations
12072 with NULL libfuncs by converting to SFmode. */
12075 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12076 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12079 set_optab_libfunc (add_optab, HFmode, NULL);
12080 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12081 set_optab_libfunc (smul_optab, HFmode, NULL);
12082 set_optab_libfunc (neg_optab, HFmode, NULL);
12083 set_optab_libfunc (sub_optab, HFmode, NULL);
12086 set_optab_libfunc (eq_optab, HFmode, NULL);
12087 set_optab_libfunc (ne_optab, HFmode, NULL);
12088 set_optab_libfunc (lt_optab, HFmode, NULL);
12089 set_optab_libfunc (le_optab, HFmode, NULL);
12090 set_optab_libfunc (ge_optab, HFmode, NULL);
12091 set_optab_libfunc (gt_optab, HFmode, NULL);
12092 set_optab_libfunc (unord_optab, HFmode, NULL);
12095 /* Target hook for c_mode_for_suffix. */
12096 static machine_mode
12097 aarch64_c_mode_for_suffix (char suffix)
12105 /* We can only represent floating point constants which will fit in
12106 "quarter-precision" values. These values are characterised by
12107 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
12108 by:
12110 (-1)^s * (n/16) * 2^r
12112 where:
12113 's' is the sign bit.
12114 'n' is an integer in the range 16 <= n <= 31.
12115 'r' is an integer in the range -3 <= r <= 4. */
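/* Worked examples: s = 0, n = 20, r = 0 gives 20/16 = 1.25, and the
   representable magnitudes range from (16/16) * 2^-3 = 0.125 up to
   (31/16) * 2^4 = 31.0.  */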
12117 /* Return true iff X can be represented by a quarter-precision
12118 floating point immediate operand.  Note, we cannot represent 0.0.  */
12120 aarch64_float_const_representable_p (rtx x)
12122 /* This represents our current view of how many bits
12123 make up the mantissa. */
12124 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12126 unsigned HOST_WIDE_INT mantissa, mask;
12127 REAL_VALUE_TYPE r, m;
12130 if (!CONST_DOUBLE_P (x))
12133 /* We don't support HFmode constants yet. */
12134 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12137 r = *CONST_DOUBLE_REAL_VALUE (x);
12139 /* We cannot represent infinities, NaNs or +/-zero. We won't
12140 know if we have +zero until we analyse the mantissa, but we
12141 can reject the other invalid values. */
12142 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12143 || REAL_VALUE_MINUS_ZERO (r))
12146 /* Extract exponent. */
12147 r = real_value_abs (&r);
12148 exponent = REAL_EXP (&r);
12150 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12151 highest (sign) bit, with a fixed binary point at bit point_pos.
12152 m1 holds the low part of the mantissa, m2 the high part.
12153 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12154 bits for the mantissa, this can fail (low bits will be lost). */
12155 real_ldexp (&m, &r, point_pos - exponent);
12156 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12158 /* If the low part of the mantissa has bits set we cannot represent
12159 the value.  */
12160 if (w.elt (0) != 0)
12162 /* We have rejected the lower HOST_WIDE_INT, so update our
12163 understanding of how many bits lie in the mantissa and
12164 look only at the high HOST_WIDE_INT. */
12165 mantissa = w.elt (1);
12166 point_pos -= HOST_BITS_PER_WIDE_INT;
12168 /* We can only represent values with a mantissa of the form 1.xxxx. */
12169 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12170 if ((mantissa & mask) != 0)
12173 /* Having filtered unrepresentable values, we may now remove all
12174 but the highest 5 bits. */
12175 mantissa >>= point_pos - 5;
12177 /* We cannot represent the value 0.0, so reject it.  This is handled
12178 elsewhere.  */
12182 /* Then, as bit 4 is always set, we can mask it off, leaving
12183 the mantissa in the range [0, 15]. */
12184 mantissa &= ~(1 << 4);
12185 gcc_assert (mantissa <= 15);
12187 /* GCC internally does not use IEEE754-like encoding (where normalized
12188 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
12189 Our mantissa values are shifted 4 places to the left relative to
12190 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12191 by 5 places to correct for GCC's representation. */
12192 exponent = 5 - exponent;
12194 return (exponent >= 0 && exponent <= 7);
12198 aarch64_output_simd_mov_immediate (rtx const_vector,
12203 static char templ[40];
12204 const char *mnemonic;
12205 const char *shift_op;
12206 unsigned int lane_count = 0;
12209 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12211 /* This will return true to show const_vector is legal for use as either
12212 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
12213 also update INFO to show how the immediate should be generated. */
12214 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12215 gcc_assert (is_valid);
12217 element_char = sizetochar (info.element_width);
12218 lane_count = width / info.element_width;
12220 mode = GET_MODE_INNER (mode);
12221 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12223 gcc_assert (info.shift == 0 && ! info.mvn);
12224 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12225 move immediate path. */
12226 if (aarch64_float_const_zero_rtx_p (info.value))
12227 info.value = GEN_INT (0);
12230 const unsigned int buf_size = 20;
12231 char float_buf[buf_size] = {'\0'};
12232 real_to_decimal_for_mode (float_buf,
12233 CONST_DOUBLE_REAL_VALUE (info.value),
12234 buf_size, buf_size, 1, mode);
12236 if (lane_count == 1)
12237 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12239 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12240 lane_count, element_char, float_buf);
12245 mnemonic = info.mvn ? "mvni" : "movi";
12246 shift_op = info.msl ? "msl" : "lsl";
12248 gcc_assert (CONST_INT_P (info.value));
12249 if (lane_count == 1)
12250 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12251 mnemonic, UINTVAL (info.value));
12252 else if (info.shift)
12253 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12254 ", %s %d", mnemonic, lane_count, element_char,
12255 UINTVAL (info.value), shift_op, info.shift);
12257 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12258 mnemonic, lane_count, element_char, UINTVAL (info.value));
12263 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12266 machine_mode vmode;
12268 gcc_assert (!VECTOR_MODE_P (mode));
12269 vmode = aarch64_simd_container_mode (mode, 64);
12270 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12271 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12274 /* Split operands into moves from op[1] + op[2] into op[0]. */
12277 aarch64_split_combinev16qi (rtx operands[3])
12279 unsigned int dest = REGNO (operands[0]);
12280 unsigned int src1 = REGNO (operands[1]);
12281 unsigned int src2 = REGNO (operands[2]);
12282 machine_mode halfmode = GET_MODE (operands[1]);
12283 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12284 rtx destlo, desthi;
12286 gcc_assert (halfmode == V16QImode);
12288 if (src1 == dest && src2 == dest + halfregs)
12290 /* No-op move. Can't split to nothing; emit something. */
12291 emit_note (NOTE_INSN_DELETED);
12295 /* Preserve register attributes for variable tracking. */
12296 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12297 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12298 GET_MODE_SIZE (halfmode));
12300 /* Special case of reversed high/low parts. */
12301 if (reg_overlap_mentioned_p (operands[2], destlo)
12302 && reg_overlap_mentioned_p (operands[1], desthi))
12304 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12305 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12306 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12308 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12310 /* Try to avoid unnecessary moves if part of the result
12311 is in the right place already. */
12313 emit_move_insn (destlo, operands[1]);
12314 if (src2 != dest + halfregs)
12315 emit_move_insn (desthi, operands[2]);
12319 if (src2 != dest + halfregs)
12320 emit_move_insn (desthi, operands[2]);
12322 emit_move_insn (destlo, operands[1]);
12326 /* vec_perm support. */
12328 #define MAX_VECT_LEN 16
12330 struct expand_vec_perm_d
12332 rtx target, op0, op1;
12333 unsigned char perm[MAX_VECT_LEN];
12334 machine_mode vmode;
12335 unsigned char nelt;
12340 /* Generate a variable permutation. */
12343 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12345 machine_mode vmode = GET_MODE (target);
12346 bool one_vector_p = rtx_equal_p (op0, op1);
12348 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12349 gcc_checking_assert (GET_MODE (op0) == vmode);
12350 gcc_checking_assert (GET_MODE (op1) == vmode);
12351 gcc_checking_assert (GET_MODE (sel) == vmode);
12352 gcc_checking_assert (TARGET_SIMD);
12356 if (vmode == V8QImode)
12358 /* Expand the argument to a V16QI mode by duplicating it. */
12359 rtx pair = gen_reg_rtx (V16QImode);
12360 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12361 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12365 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12372 if (vmode == V8QImode)
12374 pair = gen_reg_rtx (V16QImode);
12375 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12376 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12380 pair = gen_reg_rtx (OImode);
12381 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12382 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12388 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12390 machine_mode vmode = GET_MODE (target);
12391 unsigned int nelt = GET_MODE_NUNITS (vmode);
12392 bool one_vector_p = rtx_equal_p (op0, op1);
12395 /* The TBL instruction does not use a modulo index, so we must take care
12396 of that ourselves. */
12397 mask = aarch64_simd_gen_const_vector_dup (vmode,
12398 one_vector_p ? nelt - 1 : 2 * nelt - 1);
12399 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12401 /* For big-endian, we also need to reverse the index within the vector
12402 (but not which vector). */
12403 if (BYTES_BIG_ENDIAN)
12405 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12407 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12408 sel = expand_simple_binop (vmode, XOR, sel, mask,
12409 NULL, 0, OPTAB_LIB_WIDEN);
12411 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
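/* For instance, with two V16QImode operands the mask above is 31, so a
   selector element of 37 is reduced to 5; with a single operand the mask is
   15, giving the modulo behaviour that TBL itself does not provide.  */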
12414 /* Recognize patterns suitable for the TRN instructions. */
12416 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12418 unsigned int i, odd, mask, nelt = d->nelt;
12419 rtx out, in0, in1, x;
12420 rtx (*gen) (rtx, rtx, rtx);
12421 machine_mode vmode = d->vmode;
12423 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12426 /* Note that these are little-endian tests.
12427 We correct for big-endian later. */
12428 if (d->perm[0] == 0)
12430 else if (d->perm[0] == 1)
12434 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12436 for (i = 0; i < nelt; i += 2)
12438 if (d->perm[i] != i + odd)
12440 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12450 if (BYTES_BIG_ENDIAN)
12452 x = in0, in0 = in1, in1 = x;
12461 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12462 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12463 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12464 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12465 case V4SImode: gen = gen_aarch64_trn2v4si; break;
12466 case V2SImode: gen = gen_aarch64_trn2v2si; break;
12467 case V2DImode: gen = gen_aarch64_trn2v2di; break;
12468 case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
12469 case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
12470 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12471 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12472 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12481 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12482 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12483 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12484 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12485 case V4SImode: gen = gen_aarch64_trn1v4si; break;
12486 case V2SImode: gen = gen_aarch64_trn1v2si; break;
12487 case V2DImode: gen = gen_aarch64_trn1v2di; break;
12488 case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
12489 case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
12490 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12491 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12492 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12498 emit_insn (gen (out, in0, in1));
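/* As a concrete case, for V4SImode the permutation {0, 4, 2, 6} (even lanes
   of both inputs) is matched here as TRN1 and {1, 5, 3, 7} as TRN2.  */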
12502 /* Recognize patterns suitable for the UZP instructions. */
12504 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12506 unsigned int i, odd, mask, nelt = d->nelt;
12507 rtx out, in0, in1, x;
12508 rtx (*gen) (rtx, rtx, rtx);
12509 machine_mode vmode = d->vmode;
12511 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12514 /* Note that these are little-endian tests.
12515 We correct for big-endian later. */
12516 if (d->perm[0] == 0)
12518 else if (d->perm[0] == 1)
12522 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12524 for (i = 0; i < nelt; i++)
12526 unsigned elt = (i * 2 + odd) & mask;
12527 if (d->perm[i] != elt)
12537 if (BYTES_BIG_ENDIAN)
12539 x = in0, in0 = in1, in1 = x;
12548 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12549 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12550 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12551 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12552 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12553 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12554 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12555 case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
12556 case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
12557 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12558 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12559 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12568 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12569 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12570 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12571 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12572 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12573 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12574 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12575 case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
12576 case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
12577 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12578 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12579 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12585 emit_insn (gen (out, in0, in1));
12589 /* Recognize patterns suitable for the ZIP instructions. */
12591 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12593 unsigned int i, high, mask, nelt = d->nelt;
12594 rtx out, in0, in1, x;
12595 rtx (*gen) (rtx, rtx, rtx);
12596 machine_mode vmode = d->vmode;
12598 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12601 /* Note that these are little-endian tests.
12602 We correct for big-endian later. */
12604 if (d->perm[0] == high)
12607 else if (d->perm[0] == 0)
12611 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12613 for (i = 0; i < nelt / 2; i++)
12615 unsigned elt = (i + high) & mask;
12616 if (d->perm[i * 2] != elt)
12618 elt = (elt + nelt) & mask;
12619 if (d->perm[i * 2 + 1] != elt)
12629 if (BYTES_BIG_ENDIAN)
12631 x = in0, in0 = in1, in1 = x;
12640 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12641 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12642 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12643 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12644 case V4SImode: gen = gen_aarch64_zip2v4si; break;
12645 case V2SImode: gen = gen_aarch64_zip2v2si; break;
12646 case V2DImode: gen = gen_aarch64_zip2v2di; break;
12647 case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
12648 case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
12649 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12650 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12651 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12660 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
12661 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
12662 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
12663 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
12664 case V4SImode: gen = gen_aarch64_zip1v4si; break;
12665 case V2SImode: gen = gen_aarch64_zip1v2si; break;
12666 case V2DImode: gen = gen_aarch64_zip1v2di; break;
12667 case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
12668 case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
12669 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
12670 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
12671 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
12677 emit_insn (gen (out, in0, in1));
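/* For example, on V4SImode the permutation {0, 4, 1, 5} (interleave the low
   halves of the inputs) is matched as ZIP1 and {2, 6, 3, 7} (interleave the
   high halves) as ZIP2.  */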
12681 /* Recognize patterns for the EXT insn. */
12684 aarch64_evpc_ext (struct expand_vec_perm_d *d)
12686 unsigned int i, nelt = d->nelt;
12687 rtx (*gen) (rtx, rtx, rtx, rtx);
12690 unsigned int location = d->perm[0]; /* Always < nelt. */
12692 /* Check if the extracted indices are increasing by one. */
12693 for (i = 1; i < nelt; i++)
12695 unsigned int required = location + i;
12696 if (d->one_vector_p)
12698 /* We'll pass the same vector in twice, so allow indices to wrap. */
12699 required &= (nelt - 1);
12701 if (d->perm[i] != required)
12707 case V16QImode: gen = gen_aarch64_extv16qi; break;
12708 case V8QImode: gen = gen_aarch64_extv8qi; break;
12709 case V4HImode: gen = gen_aarch64_extv4hi; break;
12710 case V8HImode: gen = gen_aarch64_extv8hi; break;
12711 case V2SImode: gen = gen_aarch64_extv2si; break;
12712 case V4SImode: gen = gen_aarch64_extv4si; break;
12713 case V4HFmode: gen = gen_aarch64_extv4hf; break;
12714 case V8HFmode: gen = gen_aarch64_extv8hf; break;
12715 case V2SFmode: gen = gen_aarch64_extv2sf; break;
12716 case V4SFmode: gen = gen_aarch64_extv4sf; break;
12717 case V2DImode: gen = gen_aarch64_extv2di; break;
12718 case V2DFmode: gen = gen_aarch64_extv2df; break;
12727 /* The case where (location == 0) is a no-op for both big- and little-endian,
12728 and is removed by the mid-end at optimization levels -O1 and higher. */
12730 if (BYTES_BIG_ENDIAN && (location != 0))
12732 /* After setup, we want the high elements of the first vector (stored
12733 at the LSB end of the register), and the low elements of the second
12734 vector (stored at the MSB end of the register). So swap. */
12735 std::swap (d->op0, d->op1);
12736 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
12737 location = nelt - location;
12740 offset = GEN_INT (location);
12741 emit_insn (gen (d->target, d->op0, d->op1, offset));
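/* E.g. for V8QImode the permutation {3, 4, 5, 6, 7, 8, 9, 10} extracts eight
   consecutive bytes of the concatenated operands starting at element 3 and is
   emitted as EXT with offset 3 (flipped to nelt - location on big-endian, as
   above).  */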
12745 /* Recognize patterns for the REV insns. */
12748 aarch64_evpc_rev (struct expand_vec_perm_d *d)
12750 unsigned int i, j, diff, nelt = d->nelt;
12751 rtx (*gen) (rtx, rtx);
12753 if (!d->one_vector_p)
12762 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
12763 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
12771 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
12772 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
12773 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
12774 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
12782 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
12783 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
12784 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
12785 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
12786 case V4SImode: gen = gen_aarch64_rev64v4si; break;
12787 case V2SImode: gen = gen_aarch64_rev64v2si; break;
12788 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
12789 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
12790 case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
12791 case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
12800 for (i = 0; i < nelt ; i += diff + 1)
12801 for (j = 0; j <= diff; j += 1)
12803 /* This is guaranteed to be true as the value of diff
12804 is 7, 3, 1 and we should have enough elements in the
12805 queue to generate this. Getting a vector mask with a
12806 value of diff other than these values implies that
12807 something is wrong by the time we get here. */
12808 gcc_assert (i + j < nelt);
12809 if (d->perm[i + j] != i + diff - j)
12817 emit_insn (gen (d->target, d->op0));
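/* For instance, the V8QImode permutations {1, 0, 3, 2, 5, 4, 7, 6},
   {3, 2, 1, 0, 7, 6, 5, 4} and {7, 6, 5, 4, 3, 2, 1, 0} correspond to diff
   values of 1, 3 and 7 and are emitted as REV16, REV32 and REV64
   respectively.  */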
12822 aarch64_evpc_dup (struct expand_vec_perm_d *d)
12824 rtx (*gen) (rtx, rtx, rtx);
12825 rtx out = d->target;
12827 machine_mode vmode = d->vmode;
12828 unsigned int i, elt, nelt = d->nelt;
12832 for (i = 1; i < nelt; i++)
12834 if (elt != d->perm[i])
12838 /* The generic preparation in aarch64_expand_vec_perm_const_1
12839 swaps the operand order and the permute indices if it finds
12840 d->perm[0] to be in the second operand. Thus, we can always
12841 use d->op0 and need not do any extra arithmetic to get the
12842 correct lane number. */
12844 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
12848 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
12849 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
12850 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
12851 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
12852 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
12853 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
12854 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
12855 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
12856 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
12857 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
12858 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
12859 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
12864 emit_insn (gen (out, in0, lane));
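/* E.g. a V4SImode permutation of {2, 2, 2, 2} is matched here and emitted as
   a DUP of lane 2 of the first operand.  */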
12869 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
12871 rtx rperm[MAX_VECT_LEN], sel;
12872 machine_mode vmode = d->vmode;
12873 unsigned int i, nelt = d->nelt;
12878 /* Generic code will try constant permutation twice. Once with the
12879 original mode and again with the elements lowered to QImode.
12880 So wait and don't do the selector expansion ourselves. */
12881 if (vmode != V8QImode && vmode != V16QImode)
12884 for (i = 0; i < nelt; ++i)
12886 int nunits = GET_MODE_NUNITS (vmode);
12888 /* If big-endian and two vectors we end up with a weird mixed-endian
12889 mode on NEON.  Reverse the index within each word but not the word
12890 itself.  */
12891 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
12894 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
12895 sel = force_reg (vmode, sel);
12897 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
12902 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
12904 /* The pattern matching functions above are written to look for a small
12905 number to begin the sequence (0, 1, N/2). If we begin with an index
12906 from the second operand, we can swap the operands. */
12907 if (d->perm[0] >= d->nelt)
12909 unsigned i, nelt = d->nelt;
12911 gcc_assert (nelt == (nelt & -nelt));
12912 for (i = 0; i < nelt; ++i)
12913 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
12915 std::swap (d->op0, d->op1);
12920 if (aarch64_evpc_rev (d))
12922 else if (aarch64_evpc_ext (d))
12924 else if (aarch64_evpc_dup (d))
12926 else if (aarch64_evpc_zip (d))
12928 else if (aarch64_evpc_uzp (d))
12930 else if (aarch64_evpc_trn (d))
12932 return aarch64_evpc_tbl (d);
12937 /* Expand a vec_perm_const pattern. */
12940 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
12942 struct expand_vec_perm_d d;
12943 int i, nelt, which;
12949 d.vmode = GET_MODE (target);
12950 gcc_assert (VECTOR_MODE_P (d.vmode));
12951 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12952 d.testing_p = false;
12954 for (i = which = 0; i < nelt; ++i)
12956 rtx e = XVECEXP (sel, 0, i);
12957 int ei = INTVAL (e) & (2 * nelt - 1);
12958 which |= (ei < nelt ? 1 : 2);
12965 gcc_unreachable ();
12968 d.one_vector_p = false;
12969 if (!rtx_equal_p (op0, op1))
12972 /* The elements of PERM do not suggest that only the first operand
12973 is used, but both operands are identical. Allow easier matching
12974 of the permutation by folding the permutation into the single
12975 operand itself.  */
12976 /* Fall Through. */
12978 for (i = 0; i < nelt; ++i)
12979 d.perm[i] &= nelt - 1;
12981 d.one_vector_p = true;
12986 d.one_vector_p = true;
12990 return aarch64_expand_vec_perm_const_1 (&d);
12994 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
12995 const unsigned char *sel)
12997 struct expand_vec_perm_d d;
12998 unsigned int i, nelt, which;
13002 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13003 d.testing_p = true;
13004 memcpy (d.perm, sel, nelt);
13006 /* Calculate whether all elements are in one vector. */
13007 for (i = which = 0; i < nelt; ++i)
13009 unsigned char e = d.perm[i];
13010 gcc_assert (e < 2 * nelt);
13011 which |= (e < nelt ? 1 : 2);
13014 /* If all elements are from the second vector, reindex as if from the
13015 first vector.  */
13017 for (i = 0; i < nelt; ++i)
13020 /* Check whether the mask can be applied to a single vector. */
13021 d.one_vector_p = (which != 3);
13023 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13024 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13025 if (!d.one_vector_p)
13026 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13029 ret = aarch64_expand_vec_perm_const_1 (&d);
13036 aarch64_reverse_mask (enum machine_mode mode)
13038 /* We have to reverse each vector because we don't have
13039 a permuted load that can reverse-load according to ABI rules. */
13041 rtvec v = rtvec_alloc (16);
13043 int nunits = GET_MODE_NUNITS (mode);
13044 int usize = GET_MODE_UNIT_SIZE (mode);
13046 gcc_assert (BYTES_BIG_ENDIAN);
13047 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13049 for (i = 0; i < nunits; i++)
13050 for (j = 0; j < usize; j++)
13051 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13052 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13053 return force_reg (V16QImode, mask);
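/* For V4SImode this builds the byte selector {3, 2, 1, 0, 7, 6, 5, 4,
   11, 10, 9, 8, 15, 14, 13, 12}: each lane's bytes are reversed while the
   lane order itself is preserved.  */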
13056 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13057 However, due to issues with register allocation it is preferable to avoid
13058 tying integer scalar and FP scalar modes.  Executing integer operations
13059 in general registers is better than treating them as scalar vector
13060 operations. This reduces latency and avoids redundant int<->FP moves.
13061 So tie modes if they are either the same class, or vector modes with
13062 other vector modes, vector structs or any scalar mode.
13066 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13068 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13071 /* We specifically want to allow elements of "structure" modes to
13072 be tieable to the structure. This more general condition allows
13073 other rarer situations too. */
13074 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13077 /* Also allow any scalar modes with vectors. */
13078 if (aarch64_vector_mode_supported_p (mode1)
13079 || aarch64_vector_mode_supported_p (mode2))
13085 /* Return a new RTX holding the result of moving POINTER forward by
13089 aarch64_move_pointer (rtx pointer, int amount)
13091 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13093 return adjust_automodify_address (pointer, GET_MODE (pointer),
13097 /* Return a new RTX holding the result of moving POINTER forward by the
13098 size of the mode it points to. */
13101 aarch64_progress_pointer (rtx pointer)
13103 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13105 return aarch64_move_pointer (pointer, amount);
13108 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13112 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13115 rtx reg = gen_reg_rtx (mode);
13117 /* "Cast" the pointers to the correct mode. */
13118 *src = adjust_address (*src, mode, 0);
13119 *dst = adjust_address (*dst, mode, 0);
13120 /* Emit the memcpy. */
13121 emit_move_insn (reg, *src);
13122 emit_move_insn (*dst, reg);
13123 /* Move the pointers forward. */
13124 *src = aarch64_progress_pointer (*src);
13125 *dst = aarch64_progress_pointer (*dst);
13128 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13129 we succeed, otherwise return false. */
13132 aarch64_expand_movmem (rtx *operands)
13135 rtx dst = operands[0];
13136 rtx src = operands[1];
13138 bool speed_p = !optimize_function_for_size_p (cfun);
13140 /* When optimizing for size, give a better estimate of the length of a
13141 memcpy call, but use the default otherwise. */
13142 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13144 /* We can't do anything smart if the amount to copy is not constant. */
13145 if (!CONST_INT_P (operands[2]))
13148 n = UINTVAL (operands[2]);
13150 /* Try to keep the number of instructions low. For cases below 16 bytes we
13151 need to make at most two moves. For cases above 16 bytes it will be one
13152 move for each 16 byte chunk, then at most two additional moves.  */
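/* For example, a 35-byte constant copy is estimated at 35/16 + 2 = 4 moves,
   comfortably under the speed-path limit of 15 / 2 = 7, so it is expanded
   inline rather than left as a memcpy call.  */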
13153 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13156 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13157 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13159 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13160 src = adjust_automodify_address (src, VOIDmode, base, 0);
13162 /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13163 1-byte chunk.  */
13168 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13173 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13178 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13179 4-byte chunk, partially overlapping with the previously copied chunk. */
13182 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13188 src = aarch64_move_pointer (src, move);
13189 dst = aarch64_move_pointer (dst, move);
13190 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13195 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13196 them, then (if applicable) an 8-byte chunk. */
13201 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13206 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13211 /* Finish the final bytes of the copy. We can always do this in one
13212 instruction. We either copy the exact amount we need, or partially
13213 overlap with the previous chunk we copied and copy 8-bytes. */
13217 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13219 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13221 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13226 src = aarch64_move_pointer (src, -1);
13227 dst = aarch64_move_pointer (dst, -1);
13228 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13234 src = aarch64_move_pointer (src, move);
13235 dst = aarch64_move_pointer (dst, move);
13236 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13243 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
13245 static unsigned HOST_WIDE_INT
13246 aarch64_asan_shadow_offset (void)
13248 return (HOST_WIDE_INT_1 << 36);
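/* Under the usual AArch64 ASan mapping (shadow scale of 3) this places the
   shadow at shadow_addr = (addr >> 3) + (1 << 36).  */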
13252 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13253 unsigned int align,
13254 enum by_pieces_operation op,
13257 /* STORE_BY_PIECES can be used when copying a constant string, but
13258 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13259 For now we always fail this and let the move_by_pieces code copy
13260 the string from read-only memory. */
13261 if (op == STORE_BY_PIECES)
13264 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13268 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
13269 int code, tree treeop0, tree treeop1)
13271 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13273 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13275 struct expand_operand ops[4];
13278 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13280 op_mode = GET_MODE (op0);
13281 if (op_mode == VOIDmode)
13282 op_mode = GET_MODE (op1);
13290 icode = CODE_FOR_cmpsi;
13295 icode = CODE_FOR_cmpdi;
13300 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13301 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13306 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13307 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13315 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13316 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13322 *prep_seq = get_insns ();
13325 create_fixed_operand (&ops[0], op0);
13326 create_fixed_operand (&ops[1], op1);
13329 if (!maybe_expand_insn (icode, 2, ops))
13334 *gen_seq = get_insns ();
13337 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13338 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13342 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
13343 tree treeop0, tree treeop1, int bit_code)
13345 rtx op0, op1, target;
13346 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13347 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13349 struct expand_operand ops[6];
13352 push_to_sequence ((rtx_insn*) *prep_seq);
13353 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13355 op_mode = GET_MODE (op0);
13356 if (op_mode == VOIDmode)
13357 op_mode = GET_MODE (op1);
13365 icode = CODE_FOR_ccmpsi;
13370 icode = CODE_FOR_ccmpdi;
13375 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13376 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13381 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13382 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13390 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13391 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13397 *prep_seq = get_insns ();
13400 target = gen_rtx_REG (cc_mode, CC_REGNUM);
13401 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13403 if (bit_code != AND)
13405 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13406 GET_MODE (XEXP (prev, 0))),
13407 VOIDmode, XEXP (prev, 0), const0_rtx);
13408 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13411 create_fixed_operand (&ops[0], XEXP (prev, 0));
13412 create_fixed_operand (&ops[1], target);
13413 create_fixed_operand (&ops[2], op0);
13414 create_fixed_operand (&ops[3], op1);
13415 create_fixed_operand (&ops[4], prev);
13416 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13418 push_to_sequence ((rtx_insn*) *gen_seq);
13419 if (!maybe_expand_insn (icode, 6, ops))
13425 *gen_seq = get_insns ();
13428 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
13431 #undef TARGET_GEN_CCMP_FIRST
13432 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13434 #undef TARGET_GEN_CCMP_NEXT
13435 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13437 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
13438 instruction fusion of some sort. */
13441 aarch64_macro_fusion_p (void)
13443 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13447 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13448 should be kept together during scheduling. */
13451 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13454 rtx prev_set = single_set (prev);
13455 rtx curr_set = single_set (curr);
13456 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
13457 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13459 if (!aarch64_macro_fusion_p ())
13463 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOV_MOVK))
13465 /* We are trying to match:
13466 prev (mov) == (set (reg r0) (const_int imm16))
13467 curr (movk) == (set (zero_extract (reg r0)
13470 (const_int imm16_1)) */
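      /* In assembly terms this is the usual immediate-building pair, e.g.
		mov	x0, #0x1234
		movk	x0, #0x5678, lsl #16  */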
13472 set_dest = SET_DEST (curr_set);
13474 if (GET_CODE (set_dest) == ZERO_EXTRACT
13475 && CONST_INT_P (SET_SRC (curr_set))
13476 && CONST_INT_P (SET_SRC (prev_set))
13477 && CONST_INT_P (XEXP (set_dest, 2))
13478 && INTVAL (XEXP (set_dest, 2)) == 16
13479 && REG_P (XEXP (set_dest, 0))
13480 && REG_P (SET_DEST (prev_set))
13481 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13488 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_ADD))
13491 /* We're trying to match:
13492 prev (adrp) == (set (reg r1)
13493 (high (symbol_ref ("SYM"))))
13494 curr (add) == (set (reg r0)
13496 (symbol_ref ("SYM"))))
13497 Note that r0 need not necessarily be the same as r1, especially
13498 during pre-regalloc scheduling. */
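      /* In assembly terms:
		adrp	x1, SYM
		add	x0, x1, :lo12:SYM  */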
13500 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13501 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13503 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13504 && REG_P (XEXP (SET_SRC (curr_set), 0))
13505 && REGNO (XEXP (SET_SRC (curr_set), 0))
13506 == REGNO (SET_DEST (prev_set))
13507 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13508 XEXP (SET_SRC (curr_set), 1)))
13514 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOVK_MOVK))
13517 /* We're trying to match:
13518 prev (movk) == (set (zero_extract (reg r0)
13521 (const_int imm16_1))
13522 curr (movk) == (set (zero_extract (reg r0)
13525 (const_int imm16_2)) */
13527 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13528 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13529 && REG_P (XEXP (SET_DEST (prev_set), 0))
13530 && REG_P (XEXP (SET_DEST (curr_set), 0))
13531 && REGNO (XEXP (SET_DEST (prev_set), 0))
13532 == REGNO (XEXP (SET_DEST (curr_set), 0))
13533 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13534 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13535 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13536 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13537 && CONST_INT_P (SET_SRC (prev_set))
13538 && CONST_INT_P (SET_SRC (curr_set)))
13543 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_LDR))
13545 /* We're trying to match:
13546 prev (adrp) == (set (reg r0)
13547 (high (symbol_ref ("SYM"))))
13548 curr (ldr) == (set (reg r1)
13549 (mem (lo_sum (reg r0)
13550 (symbol_ref ("SYM")))))
13552 curr (ldr) == (set (reg r1)
13555 (symbol_ref ("SYM")))))) */
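      /* In assembly terms:
		adrp	x0, SYM
		ldr	x1, [x0, #:lo12:SYM]  */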
13556 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13557 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13559 rtx curr_src = SET_SRC (curr_set);
13561 if (GET_CODE (curr_src) == ZERO_EXTEND)
13562 curr_src = XEXP (curr_src, 0);
13564 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13565 && REG_P (XEXP (XEXP (curr_src, 0), 0))
13566 && REGNO (XEXP (XEXP (curr_src, 0), 0))
13567 == REGNO (SET_DEST (prev_set))
13568 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13569 XEXP (SET_SRC (prev_set), 0)))
13574 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_AES_AESMC)
13575 && aarch_crypto_can_dual_issue (prev, curr))
13578 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_CMP_BRANCH)
13579 && any_condjump_p (curr))
13581 enum attr_type prev_type = get_attr_type (prev);
13583 /* FIXME: this misses some instructions which are considered simple
13584 arithmetic for ThunderX.  Simple shifts are missed here.  */
13585 if (prev_type == TYPE_ALUS_SREG
13586 || prev_type == TYPE_ALUS_IMM
13587 || prev_type == TYPE_LOGICS_REG
13588 || prev_type == TYPE_LOGICS_IMM)
13595 /* Return true iff the instruction fusion described by OP is enabled. */
13598 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
13600 return (aarch64_tune_params.fusible_ops & op) != 0;
13603 /* If MEM is in the form of [base+offset], extract the two parts of the
13604 address and store them in BASE and OFFSET; otherwise return false
13605 after clearing BASE and OFFSET.  */
13608 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13612 gcc_assert (MEM_P (mem));
13614 addr = XEXP (mem, 0);
13619 *offset = const0_rtx;
13623 if (GET_CODE (addr) == PLUS
13624 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
13626 *base = XEXP (addr, 0);
13627 *offset = XEXP (addr, 1);
13632 *offset = NULL_RTX;
13637 /* Types for scheduling fusion. */
13638 enum sched_fusion_type
13640 SCHED_FUSION_NONE = 0,
13641 SCHED_FUSION_LD_SIGN_EXTEND,
13642 SCHED_FUSION_LD_ZERO_EXTEND,
13648 /* If INSN is a load or store with an address in the form of [base+offset],
13649 extract the two parts and store them in BASE and OFFSET. Return the
13650 scheduling fusion type of this INSN. */
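/* Illustrative examples (assumed, matching the checks below):
     ldrsw x0, [x1, 8]    -> SCHED_FUSION_LD_SIGN_EXTEND
     ldr   w0, [x1, 8]    -> SCHED_FUSION_LD
     str   wzr, [x1, 8]   -> SCHED_FUSION_ST
   anything else yields SCHED_FUSION_NONE.  */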
13652 static enum sched_fusion_type
13653 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
13656 enum sched_fusion_type fusion = SCHED_FUSION_LD;
13658 gcc_assert (INSN_P (insn));
13659 x = PATTERN (insn);
13660 if (GET_CODE (x) != SET)
13661 return SCHED_FUSION_NONE;
13664 dest = SET_DEST (x);
13666 machine_mode dest_mode = GET_MODE (dest);
13668 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
13669 return SCHED_FUSION_NONE;
13671 if (GET_CODE (src) == SIGN_EXTEND)
13673 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
13674 src = XEXP (src, 0);
13675 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13676 return SCHED_FUSION_NONE;
13678 else if (GET_CODE (src) == ZERO_EXTEND)
13680 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
13681 src = XEXP (src, 0);
13682 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13683 return SCHED_FUSION_NONE;
13686 if (GET_CODE (src) == MEM && REG_P (dest))
13687 extract_base_offset_in_addr (src, base, offset);
13688 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
13690 fusion = SCHED_FUSION_ST;
13691 extract_base_offset_in_addr (dest, base, offset);
13694 return SCHED_FUSION_NONE;
13696 if (*base == NULL_RTX || *offset == NULL_RTX)
13697 fusion = SCHED_FUSION_NONE;
13702 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
13704 Currently we only support fusing ldr and str instructions, so FUSION_PRI
13705 and PRI are only calculated for these instructions. For other instructions,
13706 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
13707 types of instruction fusion can be added by returning different priorities.
13709 It's important that irrelevant instructions get the largest FUSION_PRI. */
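/* Rough intuition (an assumption based on the code below): loads or stores
   of the same fusion type from the same base register share one FUSION_PRI,
   and within that group the insn with the smaller offset receives the larger
   PRI, so the scheduler tends to place candidate pairs next to each other
   with the lower address first.  */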
13712 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
13713 int *fusion_pri, int *pri)
13717 enum sched_fusion_type fusion;
13719 gcc_assert (INSN_P (insn));
13722 fusion = fusion_load_store (insn, &base, &offset);
13723 if (fusion == SCHED_FUSION_NONE)
13730 /* Set FUSION_PRI according to fusion type and base register. */
13731 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
13733 /* Calculate PRI. */
13736 /* INSN with smaller offset goes first. */
13737 off_val = (int)(INTVAL (offset));
13739 tmp -= (off_val & 0xfffff);
13741 tmp += ((- off_val) & 0xfffff);
13747 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
13748 Adjust priority of sha1h instructions so they are scheduled before
13749 other SHA1 instructions. */
13752 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
13754 rtx x = PATTERN (insn);
13756 if (GET_CODE (x) == SET)
13760 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
13761 return priority + 10;
13767 /* Given OPERANDS of consecutive load/store, check if we can merge
13768 them into ldp/stp. LOAD is true if they are load instructions.
13769 MODE is the mode of memory operands. */
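/* An assumed example of a sequence that passes these checks (SImode):
     ldr  w0, [x2, #8]
     ldr  w1, [x2, #12]
   same base register, consecutive offsets, and distinct destination
   registers of the same class, so the pair can be emitted as
     ldp  w0, w1, [x2, #8]  */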
13772 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
13773 enum machine_mode mode)
13775 HOST_WIDE_INT offval_1, offval_2, msize;
13776 enum reg_class rclass_1, rclass_2;
13777 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
13781 mem_1 = operands[1];
13782 mem_2 = operands[3];
13783 reg_1 = operands[0];
13784 reg_2 = operands[2];
13785 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
13786 if (REGNO (reg_1) == REGNO (reg_2))
13791 mem_1 = operands[0];
13792 mem_2 = operands[2];
13793 reg_1 = operands[1];
13794 reg_2 = operands[3];
13797 /* The mems cannot be volatile. */
13798 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
13801 /* If we have SImode and slow unaligned ldp,
13802 check that the alignment is at least 8 bytes. */
13804 && (aarch64_tune_params.extra_tuning_flags
13805 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
13807 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
13810 /* Check if the addresses are in the form of [base+offset]. */
13811 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13812 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13814 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13815 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13818 /* Check if the bases are the same. */
13819 if (!rtx_equal_p (base_1, base_2))
13822 offval_1 = INTVAL (offset_1);
13823 offval_2 = INTVAL (offset_2);
13824 msize = GET_MODE_SIZE (mode);
13825 /* Check if the offsets are consecutive. */
13826 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
13829 /* Check if the addresses are clobbered by the load. */
13832 if (reg_mentioned_p (reg_1, mem_1))
13835 /* In increasing order, the last load can clobber the address. */
13836 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
13840 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13841 rclass_1 = FP_REGS;
13843 rclass_1 = GENERAL_REGS;
13845 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13846 rclass_2 = FP_REGS;
13848 rclass_2 = GENERAL_REGS;
13850 /* Check if the registers are of the same class. */
13851 if (rclass_1 != rclass_2)
13857 /* Given OPERANDS of consecutive load/store, check if we can merge
13858 them into ldp/stp by adjusting the offset. LOAD is true if they
13859 are load instructions. MODE is the mode of memory operands.
13861 Given below consecutive stores:
13863 str w1, [xb, 0x100]
13864 str w1, [xb, 0x104]
13865 str w1, [xb, 0x108]
13866 str w1, [xb, 0x10c]
13868 Though the offsets are out of the range supported by stp, we can
13869 still pair them after adjusting the offset, like:
13871 add scratch, xb, 0x100
13872 stp w1, w1, [scratch]
13873 stp w1, w1, [scratch, 0x8]
13875 The peephole patterns detecting this opportunity should guarantee
13876 the scratch register is available. */
13879 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
13880 enum machine_mode mode)
13882 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
13883 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
13884 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
13885 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
13889 reg_1 = operands[0];
13890 mem_1 = operands[1];
13891 reg_2 = operands[2];
13892 mem_2 = operands[3];
13893 reg_3 = operands[4];
13894 mem_3 = operands[5];
13895 reg_4 = operands[6];
13896 mem_4 = operands[7];
13897 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
13898 && REG_P (reg_3) && REG_P (reg_4));
13899 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
13904 mem_1 = operands[0];
13905 reg_1 = operands[1];
13906 mem_2 = operands[2];
13907 reg_2 = operands[3];
13908 mem_3 = operands[4];
13909 reg_3 = operands[5];
13910 mem_4 = operands[6];
13911 reg_4 = operands[7];
13913 /* Skip if the memory operand is by itself valid for ldp/stp. */
13914 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
13917 /* The mems cannot be volatile. */
13918 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
13919 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
13922 /* Check if the addresses are in the form of [base+offset]. */
13923 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13924 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13926 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13927 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13929 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
13930 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
13932 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
13933 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
13936 /* Check if the bases are the same. */
13937 if (!rtx_equal_p (base_1, base_2)
13938 || !rtx_equal_p (base_2, base_3)
13939 || !rtx_equal_p (base_3, base_4))
13942 offval_1 = INTVAL (offset_1);
13943 offval_2 = INTVAL (offset_2);
13944 offval_3 = INTVAL (offset_3);
13945 offval_4 = INTVAL (offset_4);
13946 msize = GET_MODE_SIZE (mode);
13947 /* Check if the offsets are consecutive. */
13948 if ((offval_1 != (offval_2 + msize)
13949 || offval_1 != (offval_3 + msize * 2)
13950 || offval_1 != (offval_4 + msize * 3))
13951 && (offval_4 != (offval_3 + msize)
13952 || offval_4 != (offval_2 + msize * 2)
13953 || offval_4 != (offval_1 + msize * 3)))
13956 /* Check if the addresses are clobbered by the load. */
13959 if (reg_mentioned_p (reg_1, mem_1)
13960 || reg_mentioned_p (reg_2, mem_2)
13961 || reg_mentioned_p (reg_3, mem_3))
13964 /* In increasing order, the last load can clobber the address. */
13965 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
13969 /* If we have SImode and slow unaligned ldp,
13970 check that the alignment is at least 8 bytes. */
13972 && (aarch64_tune_params.extra_tuning_flags
13973 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
13975 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
13978 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13979 rclass_1 = FP_REGS;
13981 rclass_1 = GENERAL_REGS;
13983 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13984 rclass_2 = FP_REGS;
13986 rclass_2 = GENERAL_REGS;
13988 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
13989 rclass_3 = FP_REGS;
13991 rclass_3 = GENERAL_REGS;
13993 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
13994 rclass_4 = FP_REGS;
13996 rclass_4 = GENERAL_REGS;
13998 /* Check if the registers are of the same class. */
13999 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14005 /* Given OPERANDS of consecutive load/store, this function pairs them
14006 into ldp/stp after adjusting the offset. It depends on the fact
14007 that addresses of load/store instructions are in increasing order.
14008 MODE is the mode of memory operands. CODE is the rtl operator
14009 which should be applied to all memory operands; it is SIGN_EXTEND,
14010 ZERO_EXTEND or UNKNOWN. */
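/* Worked example (a sketch, assuming SImode so msize == 4 and that the
   scratch register is operands[8]): stp_off_limit = 4 * 0x40 = 0x100.
   For an original offset of 0x104 we get abs_off = 0x104, new_off = 0x4 and
   adj_off = 0x100, so the scratch register is set to base + 0x100 and the
   two paired instructions use offsets 0x4 and 0xc from it.  */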
14013 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14014 enum machine_mode mode, RTX_CODE code)
14016 rtx base, offset, t1, t2;
14017 rtx mem_1, mem_2, mem_3, mem_4;
14018 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14022 mem_1 = operands[1];
14023 mem_2 = operands[3];
14024 mem_3 = operands[5];
14025 mem_4 = operands[7];
14029 mem_1 = operands[0];
14030 mem_2 = operands[2];
14031 mem_3 = operands[4];
14032 mem_4 = operands[6];
14033 gcc_assert (code == UNKNOWN);
14036 extract_base_offset_in_addr (mem_1, &base, &offset);
14037 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14039 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
14040 msize = GET_MODE_SIZE (mode);
14041 stp_off_limit = msize * 0x40;
14042 off_val = INTVAL (offset);
14043 abs_off = (off_val < 0) ? -off_val : off_val;
14044 new_off = abs_off % stp_off_limit;
14045 adj_off = abs_off - new_off;
14047 /* Further adjust to make sure all offsets are OK. */
14048 if ((new_off + msize * 2) >= stp_off_limit)
14050 adj_off += stp_off_limit;
14051 new_off -= stp_off_limit;
14054 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14055 if (adj_off >= 0x1000)
14060 adj_off = -adj_off;
14061 new_off = -new_off;
14064 /* Create new memory references. */
14065 mem_1 = change_address (mem_1, VOIDmode,
14066 plus_constant (DImode, operands[8], new_off));
14068 /* Check if the adjusted address is OK for ldp/stp. */
14069 if (!aarch64_mem_pair_operand (mem_1, mode))
14072 msize = GET_MODE_SIZE (mode);
14073 mem_2 = change_address (mem_2, VOIDmode,
14074 plus_constant (DImode,
14077 mem_3 = change_address (mem_3, VOIDmode,
14078 plus_constant (DImode,
14080 new_off + msize * 2));
14081 mem_4 = change_address (mem_4, VOIDmode,
14082 plus_constant (DImode,
14084 new_off + msize * 3));
14086 if (code == ZERO_EXTEND)
14088 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14089 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14090 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14091 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14093 else if (code == SIGN_EXTEND)
14095 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14096 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14097 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14098 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14103 operands[1] = mem_1;
14104 operands[3] = mem_2;
14105 operands[5] = mem_3;
14106 operands[7] = mem_4;
14110 operands[0] = mem_1;
14111 operands[2] = mem_2;
14112 operands[4] = mem_3;
14113 operands[6] = mem_4;
14116 /* Emit adjusting instruction. */
14117 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
14118 /* Emit ldp/stp instructions. */
14119 t1 = gen_rtx_SET (operands[0], operands[1]);
14120 t2 = gen_rtx_SET (operands[2], operands[3]);
14121 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14122 t1 = gen_rtx_SET (operands[4], operands[5]);
14123 t2 = gen_rtx_SET (operands[6], operands[7]);
14124 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14128 /* Return true if a pseudo register should be created and used to hold
14129 the GOT address for PIC code. */
14132 aarch64_use_pseudo_pic_reg (void)
14134 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14137 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
14140 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14142 switch (XINT (x, 1))
14144 case UNSPEC_GOTSMALLPIC:
14145 case UNSPEC_GOTSMALLPIC28K:
14146 case UNSPEC_GOTTINYPIC:
14152 return default_unspec_may_trap_p (x, flags);
14156 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
14157 return the log2 of that value. Otherwise return -1. */
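/* For instance (illustrative): 1.0 -> 0, 4.0 -> 2; 0.5, 3.0 and -2.0 -> -1.  */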
14160 aarch64_fpconst_pow_of_2 (rtx x)
14162 const REAL_VALUE_TYPE *r;
14164 if (!CONST_DOUBLE_P (x))
14167 r = CONST_DOUBLE_REAL_VALUE (x);
14169 if (REAL_VALUE_NEGATIVE (*r)
14170 || REAL_VALUE_ISNAN (*r)
14171 || REAL_VALUE_ISINF (*r)
14172 || !real_isinteger (r, DFmode))
14175 return exact_log2 (real_to_integer (r));
14178 /* If X is a vector of equal CONST_DOUBLE values and that value is
14179 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
14182 aarch64_vec_fpconst_pow_of_2 (rtx x)
14184 if (GET_CODE (x) != CONST_VECTOR)
14187 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14190 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14194 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
14195 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
14201 /* Implement TARGET_PROMOTED_TYPE to promote __fp16 to float. */
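/* For example (an assumed illustration): with this hook in effect, an
   expression such as
     __fp16 a, b;  float f = a + b;
   is evaluated in float rather than in __fp16.  */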
14203 aarch64_promoted_type (const_tree t)
14205 if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
14206 return float_type_node;
14210 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
14213 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
14214 optimization_type opt_type)
14219 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
14226 /* Override the default target speculation_safe_value. */
14228 aarch64_speculation_safe_value (machine_mode mode,
14229 rtx result, rtx val, rtx failval)
14231 /* Maybe we should warn if falling back to hard barriers. They are
14232 likely to be noticeably more expensive than the alternative below. */
14233 if (!aarch64_track_speculation)
14234 return default_speculation_safe_value (mode, result, val, failval);
14237 val = copy_to_mode_reg (mode, val);
14239 if (!aarch64_reg_or_zero (failval, mode))
14240 failval = copy_to_mode_reg (mode, failval);
14245 emit_insn (gen_despeculate_copyqi (result, val, failval));
14248 emit_insn (gen_despeculate_copyhi (result, val, failval));
14251 emit_insn (gen_despeculate_copysi (result, val, failval));
14254 emit_insn (gen_despeculate_copydi (result, val, failval));
14257 emit_insn (gen_despeculate_copyti (result, val, failval));
14260 gcc_unreachable ();
14265 #undef TARGET_ADDRESS_COST
14266 #define TARGET_ADDRESS_COST aarch64_address_cost
14268 /* This hook determines whether unnamed bitfields affect the alignment
14269 of the containing structure. The hook returns true if the structure
14270 should inherit the alignment requirements of an unnamed bitfield's type. */
14272 #undef TARGET_ALIGN_ANON_BITFIELD
14273 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
14275 #undef TARGET_ASM_ALIGNED_DI_OP
14276 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14278 #undef TARGET_ASM_ALIGNED_HI_OP
14279 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14281 #undef TARGET_ASM_ALIGNED_SI_OP
14282 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14284 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14285 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
14286 hook_bool_const_tree_hwi_hwi_const_tree_true
14288 #undef TARGET_ASM_FILE_START
14289 #define TARGET_ASM_FILE_START aarch64_start_file
14291 #undef TARGET_ASM_OUTPUT_MI_THUNK
14292 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14294 #undef TARGET_ASM_SELECT_RTX_SECTION
14295 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14297 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14298 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14300 #undef TARGET_BUILD_BUILTIN_VA_LIST
14301 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14303 #undef TARGET_CALLEE_COPIES
14304 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14306 #undef TARGET_CAN_ELIMINATE
14307 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14309 #undef TARGET_CAN_INLINE_P
14310 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14312 #undef TARGET_CANNOT_FORCE_CONST_MEM
14313 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14315 #undef TARGET_CASE_VALUES_THRESHOLD
14316 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14318 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14319 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14321 /* Only the least significant bit is used for initialization guard variables. */
14323 #undef TARGET_CXX_GUARD_MASK_BIT
14324 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14326 #undef TARGET_C_MODE_FOR_SUFFIX
14327 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14329 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14330 #undef TARGET_DEFAULT_TARGET_FLAGS
14331 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14334 #undef TARGET_CLASS_MAX_NREGS
14335 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14337 #undef TARGET_BUILTIN_DECL
14338 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14340 #undef TARGET_BUILTIN_RECIPROCAL
14341 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14343 #undef TARGET_EXPAND_BUILTIN
14344 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14346 #undef TARGET_EXPAND_BUILTIN_VA_START
14347 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14349 #undef TARGET_FOLD_BUILTIN
14350 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14352 #undef TARGET_FUNCTION_ARG
14353 #define TARGET_FUNCTION_ARG aarch64_function_arg
14355 #undef TARGET_FUNCTION_ARG_ADVANCE
14356 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14358 #undef TARGET_FUNCTION_ARG_BOUNDARY
14359 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14361 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14362 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14364 #undef TARGET_FUNCTION_VALUE
14365 #define TARGET_FUNCTION_VALUE aarch64_function_value
14367 #undef TARGET_FUNCTION_VALUE_REGNO_P
14368 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14370 #undef TARGET_FRAME_POINTER_REQUIRED
14371 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14373 #undef TARGET_GIMPLE_FOLD_BUILTIN
14374 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14376 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14377 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14379 #undef TARGET_INIT_BUILTINS
14380 #define TARGET_INIT_BUILTINS aarch64_init_builtins
14382 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
14383 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
14384 aarch64_ira_change_pseudo_allocno_class
14386 #undef TARGET_LEGITIMATE_ADDRESS_P
14387 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14389 #undef TARGET_LEGITIMATE_CONSTANT_P
14390 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14392 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
14393 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
14394 aarch64_legitimize_address_displacement
14396 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14397 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14399 #undef TARGET_LRA_P
14400 #define TARGET_LRA_P hook_bool_void_true
14402 #undef TARGET_MANGLE_TYPE
14403 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14405 #undef TARGET_MEMORY_MOVE_COST
14406 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14408 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14409 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14411 #undef TARGET_MUST_PASS_IN_STACK
14412 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14414 /* This target hook should return true if accesses to volatile bitfields
14415 should use the narrowest mode possible. It should return false if these
14416 accesses should use the bitfield container type. */
14417 #undef TARGET_NARROW_VOLATILE_BITFIELD
14418 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
14420 #undef TARGET_OPTION_OVERRIDE
14421 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14423 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14424 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14425 aarch64_override_options_after_change
14427 #undef TARGET_OPTION_SAVE
14428 #define TARGET_OPTION_SAVE aarch64_option_save
14430 #undef TARGET_OPTION_RESTORE
14431 #define TARGET_OPTION_RESTORE aarch64_option_restore
14433 #undef TARGET_OPTION_PRINT
14434 #define TARGET_OPTION_PRINT aarch64_option_print
14436 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14437 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14439 #undef TARGET_SET_CURRENT_FUNCTION
14440 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14442 #undef TARGET_PASS_BY_REFERENCE
14443 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14445 #undef TARGET_PREFERRED_RELOAD_CLASS
14446 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
14448 #undef TARGET_SCHED_REASSOCIATION_WIDTH
14449 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
14451 #undef TARGET_PROMOTED_TYPE
14452 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
14454 #undef TARGET_SECONDARY_RELOAD
14455 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
14457 #undef TARGET_SHIFT_TRUNCATION_MASK
14458 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
14460 #undef TARGET_SETUP_INCOMING_VARARGS
14461 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
14463 #undef TARGET_STRUCT_VALUE_RTX
14464 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
14466 #undef TARGET_REGISTER_MOVE_COST
14467 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
14469 #undef TARGET_RETURN_IN_MEMORY
14470 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
14472 #undef TARGET_RETURN_IN_MSB
14473 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
14475 #undef TARGET_RTX_COSTS
14476 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
14478 #undef TARGET_SCHED_ISSUE_RATE
14479 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
14481 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
14482 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
14483 aarch64_sched_first_cycle_multipass_dfa_lookahead
14485 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
14486 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
14487 aarch64_first_cycle_multipass_dfa_lookahead_guard
14489 #undef TARGET_TRAMPOLINE_INIT
14490 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
14492 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
14493 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
14495 #undef TARGET_VECTOR_MODE_SUPPORTED_P
14496 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
14498 #undef TARGET_ARRAY_MODE_SUPPORTED_P
14499 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
14501 #undef TARGET_VECTORIZE_ADD_STMT_COST
14502 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
14504 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
14505 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
14506 aarch64_builtin_vectorization_cost
14508 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
14509 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
14511 #undef TARGET_VECTORIZE_BUILTINS
14512 #define TARGET_VECTORIZE_BUILTINS
14514 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
14515 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
14516 aarch64_builtin_vectorized_function
14518 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
14519 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
14520 aarch64_autovectorize_vector_sizes
14522 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
14523 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
14524 aarch64_atomic_assign_expand_fenv
14526 /* Section anchor support. */
14528 #undef TARGET_MIN_ANCHOR_OFFSET
14529 #define TARGET_MIN_ANCHOR_OFFSET -256
14531 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
14532 byte offset; we can do much more for larger data types, but have no way
14533 to determine the size of the access. We assume accesses are aligned. */
14534 #undef TARGET_MAX_ANCHOR_OFFSET
14535 #define TARGET_MAX_ANCHOR_OFFSET 4095
14537 #undef TARGET_VECTOR_ALIGNMENT
14538 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
14540 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
14541 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
14542 aarch64_simd_vector_alignment_reachable
14544 /* vec_perm support. */
14546 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
14547 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
14548 aarch64_vectorize_vec_perm_const_ok
14550 #undef TARGET_INIT_LIBFUNCS
14551 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
14553 #undef TARGET_FIXED_CONDITION_CODE_REGS
14554 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
14556 #undef TARGET_FLAGS_REGNUM
14557 #define TARGET_FLAGS_REGNUM CC_REGNUM
14559 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
14560 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
14562 #undef TARGET_ASAN_SHADOW_OFFSET
14563 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
14565 #undef TARGET_LEGITIMIZE_ADDRESS
14566 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
14568 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
14569 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
14570 aarch64_use_by_pieces_infrastructure_p
14572 #undef TARGET_CAN_USE_DOLOOP_P
14573 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
14575 #undef TARGET_SCHED_ADJUST_PRIORITY
14576 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
14578 #undef TARGET_SCHED_MACRO_FUSION_P
14579 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
14581 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
14582 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
14584 #undef TARGET_SCHED_FUSION_PRIORITY
14585 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
14587 #undef TARGET_UNSPEC_MAY_TRAP_P
14588 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
14590 #undef TARGET_USE_PSEUDO_PIC_REG
14591 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
14593 #undef TARGET_PRINT_OPERAND
14594 #define TARGET_PRINT_OPERAND aarch64_print_operand
14596 #undef TARGET_PRINT_OPERAND_ADDRESS
14597 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
14599 #undef TARGET_OPTAB_SUPPORTED_P
14600 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
14602 #undef TARGET_SPECULATION_SAFE_VALUE
14603 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
14605 #undef TARGET_OMIT_STRUCT_RETURN_REG
14606 #define TARGET_OMIT_STRUCT_RETURN_REG true
14608 struct gcc_target targetm = TARGET_INITIALIZER;
14610 #include "gt-aarch64.h"