/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2015 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not, see
   <http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "insn-codes.h"
#include "insn-attr.h"
#include "double-int.h"
#include "fold-const.h"
#include "stringpool.h"
#include "stor-layout.h"
#include "dominance.h"
#include "cfgcleanup.h"
#include "basic-block.h"
#include "hard-reg-set.h"
#include "statistics.h"
#include "fixed-value.h"
#include "insn-config.h"
#include "target-def.h"
#include "targhooks.h"
#include "langhooks.h"
#include "diagnostic-core.h"
#include "hash-table.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-fold.h"
#include "gimple-expr.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "cortex-a57-fma-steering.h"
/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
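
/* Illustrative example (not in the original source): under LP64,
   POINTER_SIZE is 64 bits, so POINTER_BYTES is 64 / 8 = 8; under
   ILP32 it is 32 bits, giving 32 / 8 = 4.  */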
/* Classifies an address.

   A simple base register plus immediate offset.

   A base register indexed by immediate offset with writeback.

   A base register indexed by (optionally scaled) register.

   A base register indexed by (optionally scaled) zero-extended register.

   A base register indexed by (optionally scaled) sign-extended register.

   A LO_SUM rtx with a base register and "LO12" symbol relocation.

   A constant symbolic address, in pc-relative literal pool.  */

enum aarch64_address_type {

struct aarch64_address_info {
  enum aarch64_address_type type;
  enum aarch64_symbol_type symbol_type;

struct simd_immediate_info
/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1

static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
                                                     machine_mode *, int *,
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static unsigned bit_count (unsigned HOST_WIDE_INT);
static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
                                                 const unsigned char *sel);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);

/* Major revision number of the ARM Architecture implemented by the target.  */
unsigned aarch64_architecture_version;

/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* The current tuning set.  */
const struct tune_params *aarch64_tune_params;

/* Mask to specify which instructions we are allowed to generate.  */
unsigned long aarch64_isa_flags = 0;

/* Mask to specify which instruction scheduling options should be used.  */
unsigned long aarch64_tune_flags = 0;
/* Tuning parameters.  */

static const struct cpu_addrcost_table generic_addrcost_table =
  0, /* register_offset  */
  0, /* register_extend  */

static const struct cpu_addrcost_table cortexa57_addrcost_table =
  0, /* register_offset  */
  0, /* register_extend  */

static const struct cpu_addrcost_table xgene1_addrcost_table =
  0, /* register_offset  */
  1, /* register_extend  */

static const struct cpu_regmove_cost generic_regmove_cost =
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */

static const struct cpu_regmove_cost cortexa57_regmove_cost =
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */

static const struct cpu_regmove_cost cortexa53_regmove_cost =
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */

static const struct cpu_regmove_cost thunderx_regmove_cost =

static const struct cpu_regmove_cost xgene1_regmove_cost =
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost generic_vector_cost =
  1, /* scalar_stmt_cost  */
  1, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* vec_stmt_cost  */
  1, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* vec_align_load_cost  */
  1, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  1  /* cond_not_taken_branch_cost  */

/* Cortex-A57 costs for vector insn classes.  */
static const struct cpu_vector_cost cortexa57_vector_cost =
  1, /* scalar_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  3, /* vec_stmt_cost  */
  8, /* vec_to_scalar_cost  */
  8, /* scalar_to_vec_cost  */
  5, /* vec_align_load_cost  */
  5, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  1, /* cond_taken_branch_cost  */
  1  /* cond_not_taken_branch_cost  */

/* X-Gene 1 costs for vector insn classes.  */
static const struct cpu_vector_cost xgene1_vector_cost =
  1, /* scalar_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* vec_stmt_cost  */
  4, /* vec_to_scalar_cost  */
  4, /* scalar_to_vec_cost  */
  10, /* vec_align_load_cost  */
  10, /* vec_unalign_load_cost  */
  2, /* vec_unalign_store_cost  */
  2, /* vec_store_cost  */
  2, /* cond_taken_branch_cost  */
  1  /* cond_not_taken_branch_cost  */
#define AARCH64_FUSE_NOTHING (0)
#define AARCH64_FUSE_MOV_MOVK (1 << 0)
#define AARCH64_FUSE_ADRP_ADD (1 << 1)
#define AARCH64_FUSE_MOVK_MOVK (1 << 2)
#define AARCH64_FUSE_ADRP_LDR (1 << 3)
#define AARCH64_FUSE_CMP_BRANCH (1 << 4)

/* Generic costs for branch instructions.  */
static const struct cpu_branch_cost generic_branch_cost =
  2, /* Predictable.  */
  2  /* Unpredictable.  */
static const struct tune_params generic_tunings =
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  AARCH64_FUSE_NOTHING, /* fusible_ops  */
  8, /* function_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2  /* min_div_recip_mul_df.  */

static const struct tune_params cortexa53_tunings =
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  8, /* function_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2  /* min_div_recip_mul_df.  */

static const struct tune_params cortexa57_tunings =
  &cortexa57_extra_costs,
  &cortexa57_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  16, /* function_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2  /* min_div_recip_mul_df.  */

static const struct tune_params thunderx_tunings =
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
  8, /* function_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2  /* min_div_recip_mul_df.  */

static const struct tune_params xgene1_tunings =
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &generic_branch_cost,
  AARCH64_FUSE_NOTHING, /* fusible_ops  */
  16, /* function_align.  */
  16, /* loop_align.  */
  2, /* int_reassoc_width.  */
  4, /* fp_reassoc_width.  */
  1, /* vec_reassoc_width.  */
  2, /* min_div_recip_mul_sf.  */
  2  /* min_div_recip_mul_df.  */
/* A processor implementing AArch64.  */
  const char *const name;
  enum aarch64_processor core;
  unsigned architecture_version;
  const unsigned long flags;
  const struct tune_params *const tune;

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
  {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
#include "aarch64-cores.def"
  {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
  {NULL, aarch64_none, NULL, 0, 0, NULL}

/* Architectures implementing AArch64.  */
static const struct processor all_architectures[] =
#define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
  {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_none, NULL, 0, 0, NULL}

/* Target specification.  These are populated as command-line arguments
   are processed, or NULL if not specified.  */
static const struct processor *selected_arch;
static const struct processor *selected_cpu;
static const struct processor *selected_tune;

#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
/* An ISA extension in the co-processor and main instruction set space.  */
struct aarch64_option_extension
  const char *const name;
  const unsigned long flags_on;
  const unsigned long flags_off;

/* ISA extensions in AArch64.  */
static const struct aarch64_option_extension all_extensions[] =
#define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
  {NAME, FLAGS_ON, FLAGS_OFF},
#include "aarch64-option-extensions.def"
#undef AARCH64_OPT_EXTENSION

/* Used to track the size of an address when generating a pre/post
   increment address.  */
static machine_mode aarch64_memory_reference_mode;

/* A table of valid AArch64 "bitmask immediate" values for
   logical instructions.  */

#define AARCH64_NUM_BITMASKS 5334
static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
typedef enum aarch64_cond_code
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
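
/* Illustrative note (not in the original source): the condition codes
   above are encoded so that each value sits next to its inverse, so
   flipping the low bit inverts the condition, e.g. AARCH64_EQ (0) <->
   AARCH64_NE (1) and AARCH64_GE (10) <-> AARCH64_LT (11).  */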
/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
  if (GET_MODE_UNIT_SIZE (mode) == 4)
    return aarch64_tune_params->min_div_recip_mul_sf;
  return aarch64_tune_params->min_div_recip_mul_df;

aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
                             enum machine_mode mode)
  if (VECTOR_MODE_P (mode))
    return aarch64_tune_params->vec_reassoc_width;
  if (INTEGRAL_MODE_P (mode))
    return aarch64_tune_params->int_reassoc_width;
  if (FLOAT_MODE_P (mode))
    return aarch64_tune_params->fp_reassoc_width;
/* Provide a mapping from gcc register numbers to dwarf register numbers.  */
aarch64_dbx_register_number (unsigned regno)
  if (GP_REGNUM_P (regno))
    return AARCH64_DWARF_R0 + regno - R0_REGNUM;
  else if (regno == SP_REGNUM)
    return AARCH64_DWARF_SP;
  else if (FP_REGNUM_P (regno))
    return AARCH64_DWARF_V0 + regno - V0_REGNUM;

  /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
     equivalent DWARF register.  */
  return DWARF_FRAME_REGISTERS;
/* Return TRUE if MODE is any of the large INT modes.  */
aarch64_vect_struct_mode_p (machine_mode mode)
  return mode == OImode || mode == CImode || mode == XImode;

/* Return TRUE if MODE is any of the vector modes.  */
aarch64_vector_mode_p (machine_mode mode)
  return aarch64_vector_mode_supported_p (mode)
         || aarch64_vect_struct_mode_p (mode);

/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
aarch64_array_mode_supported_p (machine_mode mode,
                                unsigned HOST_WIDE_INT nelems)
      && AARCH64_VALID_SIMD_QREG_MODE (mode)
      && (nelems >= 2 && nelems <= 4))

/* Implement HARD_REGNO_NREGS.  */
aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
  switch (aarch64_regno_regclass (regno))
      return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
      return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
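
/* Worked example (illustrative, not in the original source): with
   UNITS_PER_VREG == 16 and UNITS_PER_WORD == 8, a 16-byte TFmode value
   occupies (16 + 15) / 16 = 1 FP register but (16 + 7) / 8 = 2 general
   registers, and a 32-byte OImode value occupies 2 FP registers.  */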
/* Implement HARD_REGNO_MODE_OK.  */

aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
  if (GET_MODE_CLASS (mode) == MODE_CC)
    return regno == CC_REGNUM;

  if (regno == SP_REGNUM)
    /* The purpose of comparing with ptr_mode is to support the
       global register variable associated with the stack pointer
       register via the syntax of asm ("wsp") in ILP32.  */
    return mode == Pmode || mode == ptr_mode;

  if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
    return mode == Pmode;

  if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))

  if (FP_REGNUM_P (regno))
      if (aarch64_vect_struct_mode_p (mode))
        (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;

/* Implement HARD_REGNO_CALLER_SAVE_MODE.  */

aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
  /* Handle modes that fit within single registers.  */
  if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
      if (GET_MODE_SIZE (mode) >= 4)

  /* Fall back to generic for multi-reg and very large modes.  */
  return choose_hard_reg_mode (regno, nregs, false);
/* Return true if calls to DECL should be treated as
   long-calls (i.e. called via a register).  */
aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)

/* Return true if calls to symbol-ref SYM should be treated as
   long-calls (i.e. called via a register).  */
aarch64_is_long_call_p (rtx sym)
  return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));

/* Return true if the offsets to a zero/sign-extract operation
   represent an expression that matches an extend operation.  The
   operands represent the parameters from

     (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
  HOST_WIDE_INT mult_val, extract_val;

  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))

  mult_val = INTVAL (mult_imm);
  extract_val = INTVAL (extract_imm);

      && extract_val < GET_MODE_BITSIZE (mode)
      && exact_log2 (extract_val & ~7) > 0
      && (extract_val & 7) <= 4
      && mult_val == (1 << (extract_val & 7)))
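
/* Worked example (illustrative, not in the original source): for
   MODE == DImode, EXTRACT_IMM == 34 and MULT_IMM == 4 the checks hold,
   since 34 & ~7 == 32 is a power of two, 34 & 7 == 2 <= 4, and
   4 == 1 << 2; this corresponds to a 32-bit value extended to 64 bits
   and then shifted left by 2.  */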
/* Emit an insn that's a simple single-set.  Both the operands must be
   known to be valid.  */
emit_set_insn (rtx x, rtx y)
  return emit_insn (gen_rtx_SET (x, y));

/* X and Y are two things to compare using CODE.  Emit the compare insn and
   return the rtx for register 0 in the proper mode.  */
aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
  machine_mode mode = SELECT_CC_MODE (code, x, y);
  rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);

  emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));

/* Build the SYMBOL_REF for __tls_get_addr.  */

static GTY(()) rtx tls_get_addr_libfunc;

aarch64_tls_get_addr (void)
  if (!tls_get_addr_libfunc)
    tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
  return tls_get_addr_libfunc;
/* Return the TLS model to use for ADDR.  */

static enum tls_model
tls_symbolic_operand_type (rtx addr)
  enum tls_model tls_kind = TLS_MODEL_NONE;

  if (GET_CODE (addr) == CONST)
      split_const (addr, &sym, &addend);
      if (GET_CODE (sym) == SYMBOL_REF)
        tls_kind = SYMBOL_REF_TLS_MODEL (sym);
  else if (GET_CODE (addr) == SYMBOL_REF)
    tls_kind = SYMBOL_REF_TLS_MODEL (addr);
/* We allow lo_sums in our legitimate addresses so that combine can
   take care of combining addresses where necessary, but for generation
   purposes we generate the address as:

     tmp = hi (symbol_ref);             adrp x1, foo
     dest = lo_sum (tmp, symbol_ref);   add  dest, x1, :lo_12:foo

     adrp x1, :got:foo                  adrp tmp, :tlsgd:foo
     ldr  x1, [:got_lo12:foo]           add  dest, tmp, :tlsgd_lo12:foo

   Load TLS symbol, depending on TLS mechanism and TLS access model.

   Global Dynamic - Traditional TLS:
     add  dest, tmp, #:tlsgd_lo12:imm

   Global Dynamic - TLS Descriptors:
     adrp dest, :tlsdesc:imm
     ldr  tmp, [dest, #:tlsdesc_lo12:imm]
     add  dest, dest, #:tlsdesc_lo12:imm

     adrp tmp, :gottprel:imm
     ldr  dest, [tmp, #:gottprel_lo12:imm]

     add  t0, tp, #:tprel_hi12:imm, lsl #12
     add  t0, t0, #:tprel_lo12_nc:imm  */

aarch64_load_symref_appropriately (rtx dest, rtx imm,
                                   enum aarch64_symbol_type type)
    case SYMBOL_SMALL_ABSOLUTE:
        /* In ILP32, the mode of dest can be either SImode or DImode.  */
        machine_mode mode = GET_MODE (dest);

        gcc_assert (mode == Pmode || mode == ptr_mode);

        if (can_create_pseudo_p ())
          tmp_reg = gen_reg_rtx (mode);

        emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
        emit_insn (gen_add_losym (dest, tmp_reg, imm));

    case SYMBOL_TINY_ABSOLUTE:
      emit_insn (gen_rtx_SET (dest, imm));

    case SYMBOL_SMALL_GOT:
        /* In ILP32, the mode of dest can be either SImode or DImode,
           while the got entry is always of SImode size.  The mode of
           dest depends on how dest is used: if dest is assigned to a
           pointer (e.g. in the memory), it has SImode; it may have
           DImode if dest is dereferenced to access the memory.
           This is why we have to handle three different ldr_got_small
           patterns here (two patterns for ILP32).  */
        machine_mode mode = GET_MODE (dest);

        if (can_create_pseudo_p ())
          tmp_reg = gen_reg_rtx (mode);

        emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
        if (mode == ptr_mode)
            emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
            emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));

          gcc_assert (mode == Pmode);
          emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
    case SYMBOL_SMALL_TLSGD:
        rtx result = gen_rtx_REG (Pmode, R0_REGNUM);

        aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
        insns = get_insns ();

        RTL_CONST_CALL_P (insns) = 1;
        emit_libcall_block (insns, dest, result, imm);

    case SYMBOL_SMALL_TLSDESC:
        machine_mode mode = GET_MODE (dest);
        rtx x0 = gen_rtx_REG (mode, R0_REGNUM);

        gcc_assert (mode == Pmode || mode == ptr_mode);

        /* In ILP32, the got entry is always of SImode size.  Unlike
           small GOT, the dest is fixed at reg 0.  */
          emit_insn (gen_tlsdesc_small_si (imm));
          emit_insn (gen_tlsdesc_small_di (imm));
        tp = aarch64_load_tp (NULL);

          tp = gen_lowpart (mode, tp);

        emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
        set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
    case SYMBOL_SMALL_GOTTPREL:
        /* In ILP32, the mode of dest can be either SImode or DImode,
           while the got entry is always of SImode size.  The mode of
           dest depends on how dest is used: if dest is assigned to a
           pointer (e.g. in the memory), it has SImode; it may have
           DImode if dest is dereferenced to access the memory.
           This is why we have to handle three different tlsie_small
           patterns here (two patterns for ILP32).  */
        machine_mode mode = GET_MODE (dest);
        rtx tmp_reg = gen_reg_rtx (mode);
        rtx tp = aarch64_load_tp (NULL);

        if (mode == ptr_mode)
            emit_insn (gen_tlsie_small_di (tmp_reg, imm));
            emit_insn (gen_tlsie_small_si (tmp_reg, imm));
            tp = gen_lowpart (mode, tp);

          gcc_assert (mode == Pmode);
          emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));

        emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
        set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);

    case SYMBOL_SMALL_TPREL:
        rtx tp = aarch64_load_tp (NULL);

        if (GET_MODE (dest) != Pmode)
          tp = gen_lowpart (GET_MODE (dest), tp);

        emit_insn (gen_tlsle_small (dest, tp, imm));
        set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);

    case SYMBOL_TINY_GOT:
      emit_insn (gen_ldr_got_tiny (dest, imm));
/* Emit a move from SRC to DEST.  Assume that the move expanders can
   handle all moves if !can_create_pseudo_p ().  The distinction is
   important because, unlike emit_move_insn, the move expanders know
   how to force Pmode objects into the constant pool even when the
   constant pool address is not itself legitimate.  */
aarch64_emit_move (rtx dest, rtx src)
  return (can_create_pseudo_p ()
          ? emit_move_insn (dest, src)
          : emit_move_insn_1 (dest, src));

/* Split a 128-bit move operation into two 64-bit move operations,
   taking care to handle partial overlap of register to register
   copies.  Special cases are needed when moving between GP regs and
   FP regs.  SRC can be a register, constant or memory; DST a register
   or memory.  If either operand is memory it must not have any side
   effects.  */
aarch64_split_128bit_move (rtx dst, rtx src)
  machine_mode mode = GET_MODE (dst);

  gcc_assert (mode == TImode || mode == TFmode);
  gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
  gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
  if (REG_P (dst) && REG_P (src))
      int src_regno = REGNO (src);
      int dst_regno = REGNO (dst);

      /* Handle FP <-> GP regs.  */
      if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
          src_lo = gen_lowpart (word_mode, src);
          src_hi = gen_highpart (word_mode, src);

              emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
              emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));

              emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
              emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));

      else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
          dst_lo = gen_lowpart (word_mode, dst);
          dst_hi = gen_highpart (word_mode, dst);

              emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
              emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));

              emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
              emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));

  dst_lo = gen_lowpart (word_mode, dst);
  dst_hi = gen_highpart (word_mode, dst);
  src_lo = gen_lowpart (word_mode, src);
  src_hi = gen_highpart_mode (word_mode, mode, src);

  /* At most one pairing may overlap.  */
  if (reg_overlap_mentioned_p (dst_lo, src_hi))
      aarch64_emit_move (dst_hi, src_hi);
      aarch64_emit_move (dst_lo, src_lo);

      aarch64_emit_move (dst_lo, src_lo);
      aarch64_emit_move (dst_hi, src_hi);
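
/* Worked example (illustrative, not in the original source): for a
   TImode copy with DST in (x1, x2) and SRC in (x0, x1), dst_lo (x1)
   overlaps src_hi (x1), so the high halves are moved first; copying
   the low halves first would clobber the source's high half.  */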
aarch64_split_128bit_move_p (rtx dst, rtx src)
  return (! REG_P (src)
          || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));

/* Split a complex SIMD combine.  */

aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
  machine_mode src_mode = GET_MODE (src1);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));

  if (REG_P (dst) && REG_P (src1) && REG_P (src2))
      rtx (*gen) (rtx, rtx, rtx);

          gen = gen_aarch64_simd_combinev8qi;
          gen = gen_aarch64_simd_combinev4hi;
          gen = gen_aarch64_simd_combinev2si;
          gen = gen_aarch64_simd_combinev2sf;
          gen = gen_aarch64_simd_combinedi;
          gen = gen_aarch64_simd_combinedf;

      emit_insn (gen (dst, src1, src2));
/* Split a complex SIMD move.  */

aarch64_split_simd_move (rtx dst, rtx src)
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));

  if (REG_P (dst) && REG_P (src))
      rtx (*gen) (rtx, rtx);

      gcc_assert (VECTOR_MODE_P (src_mode));

          gen = gen_aarch64_split_simd_movv16qi;
          gen = gen_aarch64_split_simd_movv8hi;
          gen = gen_aarch64_split_simd_movv4si;
          gen = gen_aarch64_split_simd_movv2di;
          gen = gen_aarch64_split_simd_movv4sf;
          gen = gen_aarch64_split_simd_movv2df;

      emit_insn (gen (dst, src));

aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
  if (can_create_pseudo_p ())
    return force_reg (mode, value);

  x = aarch64_emit_move (x, value);
aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
  if (!aarch64_plus_immediate (GEN_INT (offset), mode))
      /* Load the full offset into a register.  This
         might be improvable in the future.  */
      high = GEN_INT (offset);

      high = aarch64_force_temporary (mode, temp, high);
      reg = aarch64_force_temporary (mode, temp,
                                     gen_rtx_PLUS (mode, high, reg));

  return plus_constant (mode, reg, offset);

aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
  unsigned HOST_WIDE_INT mask;
  unsigned HOST_WIDE_INT val;
  int one_match, zero_match, first_not_ffff_match;

  if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
      emit_insn (gen_rtx_SET (dest, imm));

  /* We know we can't do this in 1 insn, and we must be able to do it
     in two; so don't mess around looking for sequences that don't buy
     us anything.  */
      emit_insn (gen_rtx_SET (dest, GEN_INT (INTVAL (imm) & 0xffff)));
      emit_insn (gen_insv_immsi (dest, GEN_INT (16),
                                 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
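      /* Worked example (illustrative, not in the original source):
         for SImode 0x12345678 this emits
           mov  w0, #0x5678
           movk w0, #0x1234, lsl #16
         i.e. a MOVZ of the low halfword followed by a MOVK that
         inserts the high halfword.  */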
  /* Remaining cases are all for DImode.  */

  subtargets = optimize && can_create_pseudo_p ();

  first_not_ffff_match = -1;

  for (i = 0; i < 64; i += 16, mask <<= 16)
      if ((val & mask) == mask)
          if (first_not_ffff_match < 0)
            first_not_ffff_match = i;
          if ((val & mask) == 0)

      /* Set one of the quarters and then insert back into result.  */
      mask = 0xffffll << first_not_ffff_match;
          emit_insn (gen_rtx_SET (dest, GEN_INT (val | mask)));
          emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
                                     GEN_INT ((val >> first_not_ffff_match)

  if (zero_match == 2)
    goto simple_sequence;

  mask = 0x0ffff0000UL;
  for (i = 16; i < 64; i += 16, mask <<= 16)
      HOST_WIDE_INT comp = mask & ~(mask - 1);

      if (aarch64_uimm12_shift (val - (val & mask)))
          subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
          emit_insn (gen_rtx_SET (subtarget, GEN_INT (val & mask)));
          emit_insn (gen_adddi3 (dest, subtarget,
                                 GEN_INT (val - (val & mask))));

      else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
          subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
          emit_insn (gen_rtx_SET (subtarget,
                                  GEN_INT ((val + comp) & mask)));
          emit_insn (gen_adddi3 (dest, subtarget,
                                 GEN_INT (val - ((val + comp) & mask))));

      else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
          subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
          emit_insn (gen_rtx_SET (subtarget,
                                  GEN_INT ((val - comp) | ~mask)));
          emit_insn (gen_adddi3 (dest, subtarget,
                                 GEN_INT (val - ((val - comp) | ~mask))));

      else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
          subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
          emit_insn (gen_rtx_SET (subtarget, GEN_INT (val | ~mask)));
          emit_insn (gen_adddi3 (dest, subtarget,
                                 GEN_INT (val - (val | ~mask))));
  /* See if we can do it by arithmetically combining two
     immediates.  */
  for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
      if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
          || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
          subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
          emit_insn (gen_rtx_SET (subtarget,
                                  GEN_INT (aarch64_bitmasks[i])));
          emit_insn (gen_adddi3 (dest, subtarget,
                                 GEN_INT (val - aarch64_bitmasks[i])));

      for (j = 0; j < 64; j += 16, mask <<= 16)
          if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
              emit_insn (gen_rtx_SET (dest,
                                      GEN_INT (aarch64_bitmasks[i])));
              emit_insn (gen_insv_immdi (dest, GEN_INT (j),
                                         GEN_INT ((val >> j) & 0xffff)));

  /* See if we can do it by logically combining two immediates.  */
  for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
      if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
          for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
            if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
                subtarget = subtargets ? gen_reg_rtx (mode) : dest;
                emit_insn (gen_rtx_SET (subtarget,
                                        GEN_INT (aarch64_bitmasks[i])));
                emit_insn (gen_iordi3 (dest, subtarget,
                                       GEN_INT (aarch64_bitmasks[j])));

      else if ((val & aarch64_bitmasks[i]) == val)
          for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
            if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
                subtarget = subtargets ? gen_reg_rtx (mode) : dest;
                emit_insn (gen_rtx_SET (subtarget,
                                        GEN_INT (aarch64_bitmasks[j])));
                emit_insn (gen_anddi3 (dest, subtarget,
                                       GEN_INT (aarch64_bitmasks[i])));
  if (one_match > zero_match)
      /* Set either first three quarters or all but the third.  */
      mask = 0xffffll << (16 - first_not_ffff_match);
        emit_insn (gen_rtx_SET (dest,
                                GEN_INT (val | mask | 0xffffffff00000000ull)));

      /* Now insert other two quarters.  */
      for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
           i < 64; i += 16, mask <<= 16)
          if ((val & mask) != mask)
              emit_insn (gen_insv_immdi (dest, GEN_INT (i),
                                         GEN_INT ((val >> i) & 0xffff)));

  for (i = 0; i < 64; i += 16, mask <<= 16)
      if ((val & mask) != 0)
              emit_insn (gen_rtx_SET (dest, GEN_INT (val & mask)));

              emit_insn (gen_insv_immdi (dest, GEN_INT (i),
                                         GEN_INT ((val >> i) & 0xffff)));
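
/* Worked example (illustrative, not in the original source): the
   fallback sequence for DImode 0x1234567890abcdef is
     mov  x0, #0xcdef
     movk x0, #0x90ab, lsl #16
     movk x0, #0x5678, lsl #32
     movk x0, #0x1234, lsl #48
   i.e. one MOVZ plus one MOVK for each remaining non-zero 16-bit
   quarter.  */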
aarch64_expand_mov_immediate (rtx dest, rtx imm)
  machine_mode mode = GET_MODE (dest);

  gcc_assert (mode == SImode || mode == DImode);

  /* Check on what type of symbol it is.  */
  if (GET_CODE (imm) == SYMBOL_REF
      || GET_CODE (imm) == LABEL_REF
      || GET_CODE (imm) == CONST)
      rtx mem, base, offset;
      enum aarch64_symbol_type sty;

      /* If we have (const (plus symbol offset)), separate out the offset
         before we start classifying the symbol.  */
      split_const (imm, &base, &offset);

      sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
        case SYMBOL_FORCE_TO_MEM:
          if (offset != const0_rtx
              && targetm.cannot_force_const_mem (mode, imm))
              gcc_assert (can_create_pseudo_p ());
              base = aarch64_force_temporary (mode, dest, base);
              base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
              aarch64_emit_move (dest, base);

          mem = force_const_mem (ptr_mode, imm);

          if (mode != ptr_mode)
            mem = gen_rtx_ZERO_EXTEND (mode, mem);
          emit_insn (gen_rtx_SET (dest, mem));

        case SYMBOL_SMALL_TLSGD:
        case SYMBOL_SMALL_TLSDESC:
        case SYMBOL_SMALL_GOTTPREL:
        case SYMBOL_SMALL_GOT:
        case SYMBOL_TINY_GOT:
          if (offset != const0_rtx)
              gcc_assert (can_create_pseudo_p ());
              base = aarch64_force_temporary (mode, dest, base);
              base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
              aarch64_emit_move (dest, base);

        case SYMBOL_SMALL_TPREL:
        case SYMBOL_SMALL_ABSOLUTE:
        case SYMBOL_TINY_ABSOLUTE:
          aarch64_load_symref_appropriately (dest, imm, sty);

  if (!CONST_INT_P (imm))
      if (GET_CODE (imm) == HIGH)
        emit_insn (gen_rtx_SET (dest, imm));
          rtx mem = force_const_mem (mode, imm);

          emit_insn (gen_rtx_SET (dest, mem));

  aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
                                 tree exp ATTRIBUTE_UNUSED)
  /* Currently, always true.  */

/* Implement TARGET_PASS_BY_REFERENCE.  */

aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
                           bool named ATTRIBUTE_UNUSED)
  machine_mode dummymode;

  /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
  size = (mode == BLKmode && type)
         ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);

  /* Aggregates are passed by reference based on their size.  */
  if (type && AGGREGATE_TYPE_P (type))
      size = int_size_in_bytes (type);

  /* Variable sized arguments are always returned by reference.  */

  /* Can this be a candidate to be passed in fp/simd register(s)?  */
  if (aarch64_vfp_is_call_or_return_candidate (mode, type,

  /* Arguments which are variable sized or larger than 2 registers are
     passed by reference unless they are a homogeneous floating-point
     aggregate.  */
  return size > 2 * UNITS_PER_WORD;
/* Return TRUE if VALTYPE is padded to its least significant bits.  */
aarch64_return_in_msb (const_tree valtype)
  machine_mode dummy_mode;

  /* Never happens in little-endian mode.  */
  if (!BYTES_BIG_ENDIAN)

  /* Only composite types smaller than or equal to 16 bytes can
     be potentially returned in registers.  */
  if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
      || int_size_in_bytes (valtype) <= 0
      || int_size_in_bytes (valtype) > 16)

  /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
     or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
     is always passed/returned in the least significant bits of fp/simd
     registers.  */
  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
                                               &dummy_mode, &dummy_int, NULL))
/* Implement TARGET_FUNCTION_VALUE.
   Define how to find the value returned by a function.  */

aarch64_function_value (const_tree type, const_tree func,
                        bool outgoing ATTRIBUTE_UNUSED)
  machine_mode ag_mode;

  mode = TYPE_MODE (type);
  if (INTEGRAL_TYPE_P (type))
    mode = promote_function_mode (type, mode, &unsignedp, func, 1);

  if (aarch64_return_in_msb (type))
      HOST_WIDE_INT size = int_size_in_bytes (type);

      if (size % UNITS_PER_WORD != 0)
          size += UNITS_PER_WORD - size % UNITS_PER_WORD;
          mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
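
  /* Worked example (illustrative, not in the original source): a
     12-byte composite returned in the most significant bits is
     rounded up to 16 bytes and returned in the 16-byte integer mode
     (TImode), leaving the padding in the low-order bits.  */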
  if (aarch64_vfp_is_call_or_return_candidate (mode, type,
                                               &ag_mode, &count, NULL))
      if (!aarch64_composite_type_p (type, mode))
          gcc_assert (count == 1 && mode == ag_mode);
          return gen_rtx_REG (mode, V0_REGNUM);

      par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
      for (i = 0; i < count; i++)
          rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
          tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
                                   GEN_INT (i * GET_MODE_SIZE (ag_mode)));
          XVECEXP (par, 0, i) = tmp;

  return gen_rtx_REG (mode, R0_REGNUM);

/* Implements TARGET_FUNCTION_VALUE_REGNO_P.
   Return true if REGNO is the number of a hard register in which the
   values of a called function may come back.  */

aarch64_function_value_regno_p (const unsigned int regno)
  /* Maximum of 16 bytes can be returned in the general registers.  Examples
     of 16-byte return values are: 128-bit integers and 16-byte small
     structures (excluding homogeneous floating-point aggregates).  */
  if (regno == R0_REGNUM || regno == R1_REGNUM)

  /* Up to four fp/simd registers can return a function value, e.g. a
     homogeneous floating-point aggregate having four members.  */
  if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
    return TARGET_FLOAT;
/* Implement TARGET_RETURN_IN_MEMORY.

   If the type T of the result of a function is such that
     void func (T arg)
   would require that arg be passed as a value in a register (or set of
   registers) according to the parameter passing rules, then the result
   is returned in the same registers as would be used for such an
   argument.  */

aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
  machine_mode ag_mode;

  if (!AGGREGATE_TYPE_P (type)
      && TREE_CODE (type) != COMPLEX_TYPE
      && TREE_CODE (type) != VECTOR_TYPE)
    /* Simple scalar types are always returned in registers.  */

  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),

  /* Types larger than 2 registers are returned in memory.  */
  size = int_size_in_bytes (type);
  return (size < 0 || size > 2 * UNITS_PER_WORD);
aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
                               const_tree type, int *nregs)
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  return aarch64_vfp_is_call_or_return_candidate (mode,
                                                  &pcum->aapcs_vfp_rmode,

/* Given MODE and TYPE of a function argument, return the alignment in
   bits.  The idea is to suppress any stronger alignment requested by
   the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
   This is a helper function for local use only.  */

aarch64_function_arg_alignment (machine_mode mode, const_tree type)
  unsigned int alignment;

  if (!integer_zerop (TYPE_SIZE (type)))
      if (TYPE_MODE (type) == mode)
        alignment = TYPE_ALIGN (type);
        alignment = GET_MODE_ALIGNMENT (mode);

      alignment = GET_MODE_ALIGNMENT (mode);
/* Layout a function argument according to the AAPCS64 rules.  The rule
   numbers refer to the rule numbers in the AAPCS64.  */

aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
                    bool named ATTRIBUTE_UNUSED)
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  int ncrn, nvrn, nregs;
  bool allocate_ncrn, allocate_nvrn;

  /* We need to do this once per argument.  */
  if (pcum->aapcs_arg_processed)

  pcum->aapcs_arg_processed = true;

  /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
    = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),

  allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
  allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,

  /* allocate_ncrn may be a false positive, but allocate_nvrn is quite
     reliable.  The following code thus handles passing by SIMD/FP
     registers first.  */

  nvrn = pcum->aapcs_nvrn;

  /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
     and homogeneous short-vector aggregates (HVA).  */

      if (nvrn + nregs <= NUM_FP_ARG_REGS)
          pcum->aapcs_nextnvrn = nvrn + nregs;
          if (!aarch64_composite_type_p (type, mode))
              gcc_assert (nregs == 1);
              pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);

              par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
              for (i = 0; i < nregs; i++)
                  rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
                                         V0_REGNUM + nvrn + i);
                  tmp = gen_rtx_EXPR_LIST
                    GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
                  XVECEXP (par, 0, i) = tmp;
              pcum->aapcs_reg = par;

          /* C.3 NSRN is set to 8.  */
          pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;

  ncrn = pcum->aapcs_ncrn;
  nregs = size / UNITS_PER_WORD;

  /* C6 - C9, though the sign and zero extension semantics are
     handled elsewhere.  This is the case where the argument fits
     entirely in general registers.  */
  if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
      unsigned int alignment = aarch64_function_arg_alignment (mode, type);

      gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);

      /* C.8 if the argument has an alignment of 16 then the NGRN is
         rounded up to the next even number.  */
      if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)

      gcc_assert (ncrn + nregs <= NUM_ARG_REGS);

      /* NREGS can be 0 when e.g. an empty structure is to be passed.
         A reg is still generated for it, but the caller should be smart
         enough not to use it.  */
      if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
          pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);

          par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
          for (i = 0; i < nregs; i++)
              rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
              tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
                                       GEN_INT (i * UNITS_PER_WORD));
              XVECEXP (par, 0, i) = tmp;
          pcum->aapcs_reg = par;

      pcum->aapcs_nextncrn = ncrn + nregs;

  pcum->aapcs_nextncrn = NUM_ARG_REGS;

  /* The argument is passed on stack; record the needed number of words for
     this argument and align the total size if necessary.  */
  pcum->aapcs_stack_words = size / UNITS_PER_WORD;
  if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
    pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
                                               16 / UNITS_PER_WORD);
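
/* Worked example of rule C.8 (illustrative, not in the original
   source): a 16-byte struct with 16-byte alignment arriving when
   ncrn == 1 is bumped to ncrn == 2, so it is passed in the even/odd
   pair x2/x3 and x1 stays unused.  */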
/* Implement TARGET_FUNCTION_ARG.  */

aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
                      const_tree type, bool named)
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);

  if (mode == VOIDmode)

  aarch64_layout_arg (pcum_v, mode, type, named);
  return pcum->aapcs_reg;

aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
                              const_tree fntype ATTRIBUTE_UNUSED,
                              rtx libname ATTRIBUTE_UNUSED,
                              const_tree fndecl ATTRIBUTE_UNUSED,
                              unsigned n_named ATTRIBUTE_UNUSED)
  pcum->aapcs_ncrn = 0;
  pcum->aapcs_nvrn = 0;
  pcum->aapcs_nextncrn = 0;
  pcum->aapcs_nextnvrn = 0;
  pcum->pcs_variant = ARM_PCS_AAPCS64;
  pcum->aapcs_reg = NULL_RTX;
  pcum->aapcs_arg_processed = false;
  pcum->aapcs_stack_words = 0;
  pcum->aapcs_stack_size = 0;

aarch64_function_arg_advance (cumulative_args_t pcum_v,
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  if (pcum->pcs_variant == ARM_PCS_AAPCS64)
      aarch64_layout_arg (pcum_v, mode, type, named);
      gcc_assert ((pcum->aapcs_reg != NULL_RTX)
                  != (pcum->aapcs_stack_words != 0));
      pcum->aapcs_arg_processed = false;
      pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
      pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
      pcum->aapcs_stack_size += pcum->aapcs_stack_words;
      pcum->aapcs_stack_words = 0;
      pcum->aapcs_reg = NULL_RTX;

aarch64_function_arg_regno_p (unsigned regno)
  return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
          || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
/* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
   PARM_BOUNDARY bits of alignment, but will be given anything up
   to STACK_BOUNDARY bits if the type requires it.  This makes sure
   that both before and after the layout of each argument, the Next
   Stacked Argument Address (NSAA) will have a minimum alignment of
   8 bytes.  */

aarch64_function_arg_boundary (machine_mode mode, const_tree type)
  unsigned int alignment = aarch64_function_arg_alignment (mode, type);

  if (alignment < PARM_BOUNDARY)
    alignment = PARM_BOUNDARY;
  if (alignment > STACK_BOUNDARY)
    alignment = STACK_BOUNDARY;

/* For use by FUNCTION_ARG_PADDING (MODE, TYPE).

   Return true if an argument passed on the stack should be padded upwards,
   i.e. if the least-significant byte of the stack slot has useful data.

   Small aggregate types are placed in the lowest memory address.

   The related parameter passing rules are B.4, C.3, C.5 and C.14.  */

aarch64_pad_arg_upward (machine_mode mode, const_tree type)
  /* On little-endian targets, the least significant byte of every stack
     argument is passed at the lowest byte address of the stack slot.  */
  if (!BYTES_BIG_ENDIAN)

  /* Otherwise, integral, floating-point and pointer types are padded downward:
     the least significant byte of a stack argument is passed at the highest
     byte address of the stack slot.  */
      ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
         || POINTER_TYPE_P (type))
      : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))

  /* Everything else padded upward, i.e. data in first byte of stack slot.  */
/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).

   It specifies padding for the last (and possibly the only)
   element of a block move between registers and memory.  Assuming
   the block is in memory, padding upward means that the last
   element is padded after its highest significant byte, while in
   downward padding the last element is padded at its least
   significant byte side.

   Small aggregates and small complex types are always padded
   upwards.

   We don't need to worry about homogeneous floating-point or
   short-vector aggregates; their move is not affected by the
   padding direction determined here.  Regardless of endianness,
   each element of such an aggregate is put in the least
   significant bits of a fp/simd register.

   Return !BYTES_BIG_ENDIAN if the least significant byte of the
   register has useful data, and return the opposite if the most
   significant byte does.  */

aarch64_pad_reg_upward (machine_mode mode, const_tree type,
                        bool first ATTRIBUTE_UNUSED)
  /* Small composite types are always padded upward.  */
  if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
      HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
                            : GET_MODE_SIZE (mode));
      if (size < 2 * UNITS_PER_WORD)

  /* Otherwise, use the default padding.  */
  return !BYTES_BIG_ENDIAN;
aarch64_libgcc_cmp_return_mode (void)

aarch64_frame_pointer_required (void)
  /* In aarch64_override_options_after_change
     flag_omit_leaf_frame_pointer turns off the frame pointer by
     default.  Turn it back on now if we've not got a leaf
     function.  */
  if (flag_omit_leaf_frame_pointer
      && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))

/* Mark the registers that need to be saved by the callee and calculate
   the size of the callee-saved registers area and frame record (both FP
   and LR may be omitted).  */
aarch64_layout_frame (void)
  HOST_WIDE_INT offset = 0;

  if (reload_completed && cfun->machine->frame.laid_out)

#define SLOT_NOT_REQUIRED (-2)
#define SLOT_REQUIRED     (-1)

  cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
  cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;

  /* First mark all the registers that really need to be saved...  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;

  /* ... that includes the eh data registers (if needed)...  */
  if (crtl->calls_eh_return)
    for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
      cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]

  /* ... and any callee saved register that dataflow says is live.  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
        && (regno == R30_REGNUM
            || !call_used_regs[regno]))
      cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
        && !call_used_regs[regno])
      cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;

  if (frame_pointer_needed)
      /* FP and LR are placed in the linkage record.  */
      cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
      cfun->machine->frame.wb_candidate1 = R29_REGNUM;
      cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
      cfun->machine->frame.wb_candidate2 = R30_REGNUM;
      cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
      offset += 2 * UNITS_PER_WORD;

  /* Now assign stack slots for them.  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
        cfun->machine->frame.reg_offset[regno] = offset;
        if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
          cfun->machine->frame.wb_candidate1 = regno;
        else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
          cfun->machine->frame.wb_candidate2 = regno;
        offset += UNITS_PER_WORD;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
        cfun->machine->frame.reg_offset[regno] = offset;
        if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
          cfun->machine->frame.wb_candidate1 = regno;
        else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
                 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
          cfun->machine->frame.wb_candidate2 = regno;
        offset += UNITS_PER_WORD;

  cfun->machine->frame.padding0 =
    (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
  offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);

  cfun->machine->frame.saved_regs_size = offset;

  cfun->machine->frame.hard_fp_offset
    = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
                        + cfun->machine->frame.saved_regs_size,
                        STACK_BOUNDARY / BITS_PER_UNIT);

  cfun->machine->frame.frame_size
    = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
                        + crtl->outgoing_args_size,
                        STACK_BOUNDARY / BITS_PER_UNIT);

  cfun->machine->frame.laid_out = true;
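
/* Worked example (illustrative, not in the original source): with
   STACK_BOUNDARY == 128 bits the callee-save area is aligned to 16
   bytes, so a frame record (FP, LR) plus one callee-saved register
   gives offset == 24, padding0 == 8 and saved_regs_size == 32.  */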
aarch64_register_saved_on_entry (int regno)
  return cfun->machine->frame.reg_offset[regno] >= 0;

aarch64_next_callee_save (unsigned regno, unsigned limit)
  while (regno <= limit && !aarch64_register_saved_on_entry (regno))

aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
                           HOST_WIDE_INT adjustment)
  rtx base_rtx = stack_pointer_rtx;

  reg = gen_rtx_REG (mode, regno);
  mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
                            plus_constant (Pmode, base_rtx, -adjustment));
  mem = gen_rtx_MEM (mode, mem);

  insn = emit_move_insn (mem, reg);
  RTX_FRAME_RELATED_P (insn) = 1;

aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
                          HOST_WIDE_INT adjustment)
      return gen_storewb_pairdi_di (base, base, reg, reg2,
                                    GEN_INT (-adjustment),
                                    GEN_INT (UNITS_PER_WORD - adjustment));
      return gen_storewb_pairdf_di (base, base, reg, reg2,
                                    GEN_INT (-adjustment),
                                    GEN_INT (UNITS_PER_WORD - adjustment));
aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
                         unsigned regno2, HOST_WIDE_INT adjustment)
  rtx reg1 = gen_rtx_REG (mode, regno1);
  rtx reg2 = gen_rtx_REG (mode, regno2);

  insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
  RTX_FRAME_RELATED_P (insn) = 1;

aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
                         HOST_WIDE_INT adjustment)
      return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
                                   GEN_INT (UNITS_PER_WORD));
      return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
                                   GEN_INT (UNITS_PER_WORD));

aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
      return gen_store_pairdi (mem1, reg1, mem2, reg2);
      return gen_store_pairdf (mem1, reg1, mem2, reg2);

aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
      return gen_load_pairdi (reg1, mem1, reg2, mem2);
      return gen_load_pairdf (reg1, mem1, reg2, mem2);
aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
                           unsigned start, unsigned limit, bool skip_wb)
  rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
                                            ? gen_frame_mem : gen_rtx_MEM);

  for (regno = aarch64_next_callee_save (start, limit);
       regno = aarch64_next_callee_save (regno + 1, limit))
      HOST_WIDE_INT offset;

          && (regno == cfun->machine->frame.wb_candidate1
              || regno == cfun->machine->frame.wb_candidate2))

      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,

      regno2 = aarch64_next_callee_save (regno + 1, limit);
          && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
              == cfun->machine->frame.reg_offset[regno2]))
          rtx reg2 = gen_rtx_REG (mode, regno2);

          offset = start_offset + cfun->machine->frame.reg_offset[regno2];
          mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
          insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,

          /* The first part of a frame-related parallel insn is
             always assumed to be relevant to the frame
             calculations; subsequent parts are only
             frame-related if explicitly marked.  */
          RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;

        insn = emit_move_insn (mem, reg);

      RTX_FRAME_RELATED_P (insn) = 1;
aarch64_restore_callee_saves (machine_mode mode,
                              HOST_WIDE_INT start_offset, unsigned start,
                              unsigned limit, bool skip_wb, rtx *cfi_ops)
  rtx base_rtx = stack_pointer_rtx;
  rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
                                            ? gen_frame_mem : gen_rtx_MEM);
  HOST_WIDE_INT offset;

  for (regno = aarch64_next_callee_save (start, limit);
       regno = aarch64_next_callee_save (regno + 1, limit))
          && (regno == cfun->machine->frame.wb_candidate1
              || regno == cfun->machine->frame.wb_candidate2))

      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));

      regno2 = aarch64_next_callee_save (regno + 1, limit);
          && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
              == cfun->machine->frame.reg_offset[regno2]))
          rtx reg2 = gen_rtx_REG (mode, regno2);

          offset = start_offset + cfun->machine->frame.reg_offset[regno2];
          mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
          emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

          *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);

        emit_move_insn (reg, mem);
      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
/* AArch64 stack frames generated by this compiler look like:

	+-------------------------------+
	|                               |
	|  incoming stack arguments     |
	|                               |
	+-------------------------------+
	|                               | <-- incoming stack pointer (aligned)
	|  callee-allocated save area   |
	|  for register varargs         |
	|                               |
	+-------------------------------+
	|  local variables              | <-- frame_pointer_rtx
	|                               |
	+-------------------------------+
	|  padding0                     | \
	+-------------------------------+  |
	|  callee-saved registers       |  | frame.saved_regs_size
	+-------------------------------+  |
	|  LR'                          |  |
	+-------------------------------+  |
	|  FP'                          | / <- hard_frame_pointer_rtx (aligned)
	+-------------------------------+
	|  dynamic allocation           |
	+-------------------------------+
	|  padding                      |
	+-------------------------------+
	|  outgoing stack arguments     | <-- arg_pointer
	|                               |
	+-------------------------------+
	|                               | <-- stack_pointer_rtx (aligned)

   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
   but leave frame_pointer_rtx and hard_frame_pointer_rtx
   unchanged.  */

/* Generate the prologue instructions for entry into a function.
   Establish the stack frame by decreasing the stack pointer with a
   properly calculated size and, if necessary, create a frame record
   filled with the values of LR and previous frame pointer.  The
   current FP is also set up if it is in use.  */
2416 aarch64_expand_prologue (void)
2418 /* sub sp, sp, #<frame_size>
2419 stp {fp, lr}, [sp, #<frame_size> - 16]
2420 add fp, sp, #<frame_size> - hardfp_offset
2421 stp {cs_reg}, [fp, #-16] etc.
2423 sub sp, sp, <final_adjustment_if_any>
2425 HOST_WIDE_INT frame_size, offset;
2426 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2427 HOST_WIDE_INT hard_fp_offset;
2430 aarch64_layout_frame ();
2432 offset = frame_size = cfun->machine->frame.frame_size;
2433 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2434 fp_offset = frame_size - hard_fp_offset;
2436 if (flag_stack_usage_info)
2437 current_function_static_stack_size = frame_size;
2439 /* Store pairs and load pairs have a range only -512 to 504. */
2442 /* When the frame has a large size, an initial decrease is done on
2443 the stack pointer to jump over the callee-allocated save area for
2444 register varargs, the local variable area and/or the callee-saved
2445 register area. This will allow the pre-index write-back
2446 store pair instructions to be used for setting up the stack frame efficiently.  */
2448 offset = hard_fp_offset;
2450 offset = cfun->machine->frame.saved_regs_size;
2452 frame_size -= (offset + crtl->outgoing_args_size);
2455 if (frame_size >= 0x1000000)
2457 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2458 emit_move_insn (op0, GEN_INT (-frame_size));
2459 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2461 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2462 gen_rtx_SET (stack_pointer_rtx,
2463 plus_constant (Pmode, stack_pointer_rtx,
2465 RTX_FRAME_RELATED_P (insn) = 1;
2467 else if (frame_size > 0)
2469 int hi_ofs = frame_size & 0xfff000;
2470 int lo_ofs = frame_size & 0x000fff;
2474 insn = emit_insn (gen_add2_insn
2475 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2476 RTX_FRAME_RELATED_P (insn) = 1;
2480 insn = emit_insn (gen_add2_insn
2481 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2482 RTX_FRAME_RELATED_P (insn) = 1;
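/* For example (illustration only): frame_size == 0x12345 splits into
   hi_ofs == 0x12000 and lo_ofs == 0x345, so the adjustment becomes
   "sub sp, sp, #0x12000" followed by "sub sp, sp, #0x345", both of
   which fit the 12-bit (optionally LSL #12) immediate form of SUB.  */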
2491 bool skip_wb = false;
2493 if (frame_pointer_needed)
2499 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2500 GEN_INT (-offset)));
2501 RTX_FRAME_RELATED_P (insn) = 1;
2503 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2507 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2509 /* Set up frame pointer to point to the location of the
2510 previous frame pointer on the stack. */
2511 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2513 GEN_INT (fp_offset)));
2514 RTX_FRAME_RELATED_P (insn) = 1;
2515 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2519 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2520 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2523 || reg1 == FIRST_PSEUDO_REGISTER
2524 || (reg2 == FIRST_PSEUDO_REGISTER
2527 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2528 GEN_INT (-offset)));
2529 RTX_FRAME_RELATED_P (insn) = 1;
2533 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2537 if (reg2 == FIRST_PSEUDO_REGISTER)
2538 aarch64_pushwb_single_reg (mode1, reg1, offset);
2540 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2544 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2546 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2550 /* When offset >= 512,
2551 sub sp, sp, #<outgoing_args_size> */
2552 if (frame_size > -1)
2554 if (crtl->outgoing_args_size > 0)
2556 insn = emit_insn (gen_add2_insn
2558 GEN_INT (- crtl->outgoing_args_size)));
2559 RTX_FRAME_RELATED_P (insn) = 1;
2564 /* Return TRUE if we can use a simple_return insn.
2566 This function checks whether the callee saved stack is empty, which
2567 means no restore actions are needed.  The pro_and_epilogue pass will use
2568 this to check whether the shrink-wrapping optimization is feasible.  */
2571 aarch64_use_return_insn_p (void)
2573 if (!reload_completed)
2579 aarch64_layout_frame ();
2581 return cfun->machine->frame.frame_size == 0;
2584 /* Generate the epilogue instructions for returning from a function. */
2586 aarch64_expand_epilogue (bool for_sibcall)
2588 HOST_WIDE_INT frame_size, offset;
2589 HOST_WIDE_INT fp_offset;
2590 HOST_WIDE_INT hard_fp_offset;
2592 /* We need to add a memory barrier to prevent reads from the deallocated stack.  */
2593 bool need_barrier_p = (get_frame_size () != 0
2594 || cfun->machine->frame.saved_varargs_size);
2596 aarch64_layout_frame ();
2598 offset = frame_size = cfun->machine->frame.frame_size;
2599 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2600 fp_offset = frame_size - hard_fp_offset;
2602 /* Store pairs and load pairs have an immediate offset range of only -512 to 504.  */
2605 offset = hard_fp_offset;
2607 offset = cfun->machine->frame.saved_regs_size;
2609 frame_size -= (offset + crtl->outgoing_args_size);
2611 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2613 insn = emit_insn (gen_add2_insn
2615 GEN_INT (crtl->outgoing_args_size)));
2616 RTX_FRAME_RELATED_P (insn) = 1;
2622 /* If there were outgoing arguments or we've done dynamic stack
2623 allocation, then restore the stack pointer from the frame
2624 pointer. This is at most one insn and more efficient than using
2625 GCC's internal mechanism. */
2626 if (frame_pointer_needed
2627 && (crtl->outgoing_args_size || cfun->calls_alloca))
2629 if (cfun->calls_alloca)
2630 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2632 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2633 hard_frame_pointer_rtx,
2635 offset = offset - fp_offset;
2640 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2641 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2642 bool skip_wb = true;
2645 if (frame_pointer_needed)
2648 || reg1 == FIRST_PSEUDO_REGISTER
2649 || (reg2 == FIRST_PSEUDO_REGISTER
2653 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2655 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2659 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2663 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2664 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2666 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2667 if (reg2 == FIRST_PSEUDO_REGISTER)
2669 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2670 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2671 mem = gen_rtx_MEM (mode1, mem);
2672 insn = emit_move_insn (rreg1, mem);
2676 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2678 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2679 insn = emit_insn (aarch64_gen_loadwb_pair
2680 (mode1, stack_pointer_rtx, rreg1,
2686 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2690 /* Reset the CFA to be SP + FRAME_SIZE. */
2691 rtx new_cfa = stack_pointer_rtx;
2693 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2694 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2695 REG_NOTES (insn) = cfi_ops;
2696 RTX_FRAME_RELATED_P (insn) = 1;
2702 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2704 if (frame_size >= 0x1000000)
2706 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2707 emit_move_insn (op0, GEN_INT (frame_size));
2708 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2712 int hi_ofs = frame_size & 0xfff000;
2713 int lo_ofs = frame_size & 0x000fff;
2715 if (hi_ofs && lo_ofs)
2717 insn = emit_insn (gen_add2_insn
2718 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2719 RTX_FRAME_RELATED_P (insn) = 1;
2720 frame_size = lo_ofs;
2722 insn = emit_insn (gen_add2_insn
2723 (stack_pointer_rtx, GEN_INT (frame_size)));
2726 /* Reset the CFA to be SP + 0. */
2727 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2728 RTX_FRAME_RELATED_P (insn) = 1;
2731 /* Stack adjustment for exception handler. */
2732 if (crtl->calls_eh_return)
2734 /* We need to unwind the stack by the offset computed by
2735 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2736 to be SP; letting the CFA move during this adjustment
2737 is just as correct as retaining the CFA from the body
2738 of the function. Therefore, do nothing special. */
2739 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2742 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2744 emit_jump_insn (ret_rtx);
2747 /* Return the place to copy the exception unwinding return address to.
2748 This will probably be a stack slot, but could (in theory) be the
2749 return register.  */
2751 aarch64_final_eh_return_addr (void)
2753 HOST_WIDE_INT fp_offset;
2755 aarch64_layout_frame ();
2757 fp_offset = cfun->machine->frame.frame_size
2758 - cfun->machine->frame.hard_fp_offset;
2760 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2761 return gen_rtx_REG (DImode, LR_REGNUM);
2763 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2764 result in a store to save LR introduced by builtin_eh_return () being
2765 incorrectly deleted because the alias is not detected.
2766 So in the calculation of the address to copy the exception unwinding
2767 return address to, we note 2 cases.
2768 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2769 we return a SP-relative location since all the addresses are SP-relative
2770 in this case. This prevents the store from being optimized away.
2771 If the fp_offset is not 0, then the addresses will be FP-relative and
2772 therefore we return a FP-relative location. */
2774 if (frame_pointer_needed)
2777 return gen_frame_mem (DImode,
2778 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2780 return gen_frame_mem (DImode,
2781 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2784 /* If FP is not needed, we calculate the location of LR, which would be
2785 at the top of the saved registers block. */
2787 return gen_frame_mem (DImode,
2788 plus_constant (Pmode,
2791 + cfun->machine->frame.saved_regs_size
2792 - 2 * UNITS_PER_WORD));
2795 /* Possibly output code to build up a constant in a register. For
2796 the benefit of the costs infrastructure, returns the number of
2797 instructions which would be emitted. GENERATE inhibits or
2798 enables code generation. */
2801 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2805 if (aarch64_bitmask_imm (val, DImode))
2808 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2816 HOST_WIDE_INT valp = val >> 16;
2820 for (i = 16; i < 64; i += 16)
2822 valm = (valp & 0xffff);
2833 /* zcount contains the number of additional MOVK instructions
2834 required if the constant is built up with an initial MOVZ instruction,
2835 while ncount is the number of MOVK instructions required if starting
2836 with a MOVN instruction.  Choose the sequence that yields the fewest
2837 instructions, preferring MOVZ instructions when both counts are equal.  */
2839 if (ncount < zcount)
2842 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2843 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2850 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2851 GEN_INT (val & 0xffff));
2858 for (i = 16; i < 64; i += 16)
2860 if ((val & 0xffff) != tval)
2863 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2865 GEN_INT (val & 0xffff)));
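/* Worked example (illustrative): 0xffffffffffff1234 is built more
   cheaply starting from MOVN, since "movn x0, #0xedcb" materializes
   it in a single instruction, whereas starting from MOVZ would need
   "movz x0, #0x1234" plus three MOVKs of 0xffff.  */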
2875 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2877 HOST_WIDE_INT mdelta = delta;
2878 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2879 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2884 if (mdelta >= 4096 * 4096)
2886 (void) aarch64_build_constant (scratchreg, delta, true);
2887 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2889 else if (mdelta > 0)
2893 emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
2894 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2896 emit_insn (gen_rtx_SET (this_rtx,
2897 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2899 emit_insn (gen_rtx_SET (this_rtx,
2900 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2902 if (mdelta % 4096 != 0)
2904 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2905 emit_insn (gen_rtx_SET (this_rtx,
2906 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
2911 /* Output code to add DELTA to the first argument, and then jump
2912 to FUNCTION. Used for C++ multiple inheritance. */
2914 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2915 HOST_WIDE_INT delta,
2916 HOST_WIDE_INT vcall_offset,
2919 /* The this pointer is always in x0. Note that this differs from
2920 Arm where the this pointer may be bumped to r1 if r0 is required
2921 to return a pointer to an aggregate. On AArch64 a result value
2922 pointer will be in x8. */
2923 int this_regno = R0_REGNUM;
2924 rtx this_rtx, temp0, temp1, addr, funexp;
2927 reload_completed = 1;
2928 emit_note (NOTE_INSN_PROLOGUE_END);
2930 if (vcall_offset == 0)
2931 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2934 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2936 this_rtx = gen_rtx_REG (Pmode, this_regno);
2937 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2938 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2943 if (delta >= -256 && delta < 256)
2944 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2945 plus_constant (Pmode, this_rtx, delta));
2947 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2950 if (Pmode == ptr_mode)
2951 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2953 aarch64_emit_move (temp0,
2954 gen_rtx_ZERO_EXTEND (Pmode,
2955 gen_rtx_MEM (ptr_mode, addr)));
2957 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2958 addr = plus_constant (Pmode, temp0, vcall_offset);
2961 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2962 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2965 if (Pmode == ptr_mode)
2966 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
2968 aarch64_emit_move (temp1,
2969 gen_rtx_SIGN_EXTEND (Pmode,
2970 gen_rtx_MEM (ptr_mode, addr)));
2972 emit_insn (gen_add2_insn (this_rtx, temp1));
2975 /* Generate a tail call to the target function. */
2976 if (!TREE_USED (function))
2978 assemble_external (function);
2979 TREE_USED (function) = 1;
2981 funexp = XEXP (DECL_RTL (function), 0);
2982 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2983 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2984 SIBLING_CALL_P (insn) = 1;
2986 insn = get_insns ();
2987 shorten_branches (insn);
2988 final_start_function (insn, file, 1);
2989 final (insn, file, 1);
2990 final_end_function ();
2992 /* Stop pretending to be a post-reload pass. */
2993 reload_completed = 0;
2997 aarch64_tls_referenced_p (rtx x)
2999 if (!TARGET_HAVE_TLS)
3001 subrtx_iterator::array_type array;
3002 FOR_EACH_SUBRTX (iter, array, x, ALL)
3004 const_rtx x = *iter;
3005 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3007 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3008 TLS offsets, not real symbol references. */
3009 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3010 iter.skip_subrtxes ();
3017 aarch64_bitmasks_cmp (const void *i1, const void *i2)
3019 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3020 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3031 aarch64_build_bitmask_table (void)
3033 unsigned HOST_WIDE_INT mask, imm;
3034 unsigned int log_e, e, s, r;
3035 unsigned int nimms = 0;
3037 for (log_e = 1; log_e <= 6; log_e++)
3041 mask = ~(HOST_WIDE_INT) 0;
3043 mask = ((HOST_WIDE_INT) 1 << e) - 1;
3044 for (s = 1; s < e; s++)
3046 for (r = 0; r < e; r++)
3048 /* Set s consecutive bits to 1 (s < 64).  */
3049 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3050 /* Rotate right by r.  */
3052 imm = ((imm >> r) | (imm << (e - r))) & mask;
3053 /* Replicate the constant depending on the SIMD element size.  */
3055 case 1: imm |= (imm << 2);
3056 case 2: imm |= (imm << 4);
3057 case 3: imm |= (imm << 8);
3058 case 4: imm |= (imm << 16);
3059 case 5: imm |= (imm << 32);
3065 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3066 aarch64_bitmasks[nimms++] = imm;
3071 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3072 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3073 aarch64_bitmasks_cmp);
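/* A self-contained sketch (hypothetical helper, not part of this file,
   mirroring the loops above) of how a single table entry is formed:
   S consecutive set bits in an element of E bits, rotated right by R,
   then replicated across 64 bits.  For instance E == 8, S == 3, R == 1
   gives 0x8383838383838383.  */

static unsigned HOST_WIDE_INT ATTRIBUTE_UNUSED
aarch64_example_bitmask_element (unsigned int e, unsigned int s,
				 unsigned int r)
{
  unsigned HOST_WIDE_INT mask
    = (e == 64) ? ~(unsigned HOST_WIDE_INT) 0
		: (((unsigned HOST_WIDE_INT) 1 << e) - 1);
  /* S consecutive set bits, rotated right by R within the element.  */
  unsigned HOST_WIDE_INT imm = ((unsigned HOST_WIDE_INT) 1 << s) - 1;
  if (r != 0)
    imm = ((imm >> r) | (imm << (e - r))) & mask;
  /* Replicate the element across the full 64-bit width.  */
  for (unsigned int width = e; width < 64; width *= 2)
    imm |= imm << width;
  return imm;
}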
3077 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3078 a left shift of 0 or 12 bits. */
3080 aarch64_uimm12_shift (HOST_WIDE_INT val)
3082 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3083 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
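/* For example: 0xabc (shift 0) and 0xabc000 (shift 12) are both
   encodable, but 0xabc00 straddles the two fields and is not.  */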
3088 /* Return true if val is an immediate that can be loaded into a
3089 register by a MOVZ instruction. */
3091 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3093 if (GET_MODE_SIZE (mode) > 4)
3095 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3096 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3101 /* Ignore sign extension. */
3102 val &= (HOST_WIDE_INT) 0xffffffff;
3104 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3105 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
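/* For example (DImode): 0x12340000 is a single MOVZ with LSL #16,
   while 0x12345678 spans two 16-bit fields and is rejected here.  */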
3109 /* Return true if val is a valid bitmask immediate. */
3111 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3113 if (GET_MODE_SIZE (mode) < 8)
3115 /* Replicate bit pattern. */
3116 val &= (HOST_WIDE_INT) 0xffffffff;
3119 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3120 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3124 /* Return true if val is an immediate that can be loaded into a
3125 register in a single instruction. */
3127 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3129 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3131 return aarch64_bitmask_imm (val, mode);
3135 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3139 if (GET_CODE (x) == HIGH)
3142 split_const (x, &base, &offset);
3143 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3145 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3146 != SYMBOL_FORCE_TO_MEM)
3149 /* Avoid generating a 64-bit relocation in ILP32; leave
3150 to aarch64_expand_mov_immediate to handle it properly. */
3151 return mode != ptr_mode;
3154 return aarch64_tls_referenced_p (x);
3157 /* Return true if register REGNO is a valid index register.
3158 STRICT_P is true if REG_OK_STRICT is in effect. */
3161 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3163 if (!HARD_REGISTER_NUM_P (regno))
3171 regno = reg_renumber[regno];
3173 return GP_REGNUM_P (regno);
3176 /* Return true if register REGNO is a valid base register for mode MODE.
3177 STRICT_P is true if REG_OK_STRICT is in effect. */
3180 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3182 if (!HARD_REGISTER_NUM_P (regno))
3190 regno = reg_renumber[regno];
3193 /* The fake registers will be eliminated to either the stack or
3194 hard frame pointer, both of which are usually valid base registers.
3195 Reload deals with the cases where the eliminated form isn't valid. */
3196 return (GP_REGNUM_P (regno)
3197 || regno == SP_REGNUM
3198 || regno == FRAME_POINTER_REGNUM
3199 || regno == ARG_POINTER_REGNUM);
3202 /* Return true if X is a valid base register for mode MODE.
3203 STRICT_P is true if REG_OK_STRICT is in effect. */
3206 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3208 if (!strict_p && GET_CODE (x) == SUBREG)
3211 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3214 /* Return true if address offset is a valid index. If it is, fill in INFO
3215 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3218 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3219 machine_mode mode, bool strict_p)
3221 enum aarch64_address_type type;
3226 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3227 && GET_MODE (x) == Pmode)
3229 type = ADDRESS_REG_REG;
3233 /* (sign_extend:DI (reg:SI)) */
3234 else if ((GET_CODE (x) == SIGN_EXTEND
3235 || GET_CODE (x) == ZERO_EXTEND)
3236 && GET_MODE (x) == DImode
3237 && GET_MODE (XEXP (x, 0)) == SImode)
3239 type = (GET_CODE (x) == SIGN_EXTEND)
3240 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3241 index = XEXP (x, 0);
3244 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3245 else if (GET_CODE (x) == MULT
3246 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3247 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3248 && GET_MODE (XEXP (x, 0)) == DImode
3249 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3250 && CONST_INT_P (XEXP (x, 1)))
3252 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3253 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3254 index = XEXP (XEXP (x, 0), 0);
3255 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3257 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3258 else if (GET_CODE (x) == ASHIFT
3259 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3260 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3261 && GET_MODE (XEXP (x, 0)) == DImode
3262 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3263 && CONST_INT_P (XEXP (x, 1)))
3265 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3266 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3267 index = XEXP (XEXP (x, 0), 0);
3268 shift = INTVAL (XEXP (x, 1));
3270 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3271 else if ((GET_CODE (x) == SIGN_EXTRACT
3272 || GET_CODE (x) == ZERO_EXTRACT)
3273 && GET_MODE (x) == DImode
3274 && GET_CODE (XEXP (x, 0)) == MULT
3275 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3276 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3278 type = (GET_CODE (x) == SIGN_EXTRACT)
3279 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3280 index = XEXP (XEXP (x, 0), 0);
3281 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3282 if (INTVAL (XEXP (x, 1)) != 32 + shift
3283 || INTVAL (XEXP (x, 2)) != 0)
3286 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3287 (const_int 0xffffffff<<shift)) */
3288 else if (GET_CODE (x) == AND
3289 && GET_MODE (x) == DImode
3290 && GET_CODE (XEXP (x, 0)) == MULT
3291 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3292 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3293 && CONST_INT_P (XEXP (x, 1)))
3295 type = ADDRESS_REG_UXTW;
3296 index = XEXP (XEXP (x, 0), 0);
3297 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3298 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3301 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3302 else if ((GET_CODE (x) == SIGN_EXTRACT
3303 || GET_CODE (x) == ZERO_EXTRACT)
3304 && GET_MODE (x) == DImode
3305 && GET_CODE (XEXP (x, 0)) == ASHIFT
3306 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3307 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3309 type = (GET_CODE (x) == SIGN_EXTRACT)
3310 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3311 index = XEXP (XEXP (x, 0), 0);
3312 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3313 if (INTVAL (XEXP (x, 1)) != 32 + shift
3314 || INTVAL (XEXP (x, 2)) != 0)
3317 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3318 (const_int 0xffffffff<<shift)) */
3319 else if (GET_CODE (x) == AND
3320 && GET_MODE (x) == DImode
3321 && GET_CODE (XEXP (x, 0)) == ASHIFT
3322 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3323 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3324 && CONST_INT_P (XEXP (x, 1)))
3326 type = ADDRESS_REG_UXTW;
3327 index = XEXP (XEXP (x, 0), 0);
3328 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3329 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3332 /* (mult:P (reg:P) (const_int scale)) */
3333 else if (GET_CODE (x) == MULT
3334 && GET_MODE (x) == Pmode
3335 && GET_MODE (XEXP (x, 0)) == Pmode
3336 && CONST_INT_P (XEXP (x, 1)))
3338 type = ADDRESS_REG_REG;
3339 index = XEXP (x, 0);
3340 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3342 /* (ashift:P (reg:P) (const_int shift)) */
3343 else if (GET_CODE (x) == ASHIFT
3344 && GET_MODE (x) == Pmode
3345 && GET_MODE (XEXP (x, 0)) == Pmode
3346 && CONST_INT_P (XEXP (x, 1)))
3348 type = ADDRESS_REG_REG;
3349 index = XEXP (x, 0);
3350 shift = INTVAL (XEXP (x, 1));
3355 if (GET_CODE (index) == SUBREG)
3356 index = SUBREG_REG (index);
3359 (shift > 0 && shift <= 3
3360 && (1 << shift) == GET_MODE_SIZE (mode)))
3362 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3365 info->offset = index;
3366 info->shift = shift;
3374 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3376 return (offset >= -64 * GET_MODE_SIZE (mode)
3377 && offset < 64 * GET_MODE_SIZE (mode)
3378 && offset % GET_MODE_SIZE (mode) == 0);
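/* E.g. for DImode this accepts multiples of 8 in [-512, 504], exactly
   the immediate range of an X-register LDP/STP.  */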
3382 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3383 HOST_WIDE_INT offset)
3385 return offset >= -256 && offset < 256;
3389 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3392 && offset < 4096 * GET_MODE_SIZE (mode)
3393 && offset % GET_MODE_SIZE (mode) == 0);
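/* E.g. for SImode this accepts multiples of 4 in [0, 16380], matching
   the unsigned scaled 12-bit immediate field of a 32-bit LDR/STR.  */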
3396 /* Return true if X is a valid address for machine mode MODE. If it is,
3397 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3398 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3401 aarch64_classify_address (struct aarch64_address_info *info,
3402 rtx x, machine_mode mode,
3403 RTX_CODE outer_code, bool strict_p)
3405 enum rtx_code code = GET_CODE (x);
3408 /* On BE, we use load/store pair for all large int mode load/stores. */
3409 bool load_store_pair_p = (outer_code == PARALLEL
3410 || (BYTES_BIG_ENDIAN
3411 && aarch64_vect_struct_mode_p (mode)));
3413 bool allow_reg_index_p =
3415 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3416 && !aarch64_vect_struct_mode_p (mode);
3418 /* On LE, for AdvSIMD, don't support anything other than POST_INC or REG addressing.  */
3420 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3421 && (code != POST_INC && code != REG))
3428 info->type = ADDRESS_REG_IMM;
3430 info->offset = const0_rtx;
3431 return aarch64_base_register_rtx_p (x, strict_p);
3439 && (op0 == virtual_stack_vars_rtx
3440 || op0 == frame_pointer_rtx
3441 || op0 == arg_pointer_rtx)
3442 && CONST_INT_P (op1))
3444 info->type = ADDRESS_REG_IMM;
3451 if (GET_MODE_SIZE (mode) != 0
3452 && CONST_INT_P (op1)
3453 && aarch64_base_register_rtx_p (op0, strict_p))
3455 HOST_WIDE_INT offset = INTVAL (op1);
3457 info->type = ADDRESS_REG_IMM;
3461 /* TImode and TFmode values are allowed in both pairs of X
3462 registers and individual Q registers.  The available address modes are:
3464 X,X: 7-bit signed scaled offset
3465 Q: 9-bit signed offset
3466 We conservatively require an offset representable in either mode.  */
3468 if (mode == TImode || mode == TFmode)
3469 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3470 && offset_9bit_signed_unscaled_p (mode, offset));
3472 /* A 7-bit offset check because OImode will emit an ldp/stp
3473 instruction (only big endian will get here).
3474 For ldp/stp instructions, the offset is scaled for the size of a
3475 single element of the pair. */
3477 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3479 /* Three 9/12-bit offset checks because CImode will emit three
3480 ldr/str instructions (only big endian will get here). */
3482 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3483 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3484 || offset_12bit_unsigned_scaled_p (V16QImode,
3487 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3488 instructions (only big endian will get here). */
3490 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3491 && aarch64_offset_7bit_signed_scaled_p (TImode,
3494 if (load_store_pair_p)
3495 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3496 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3498 return (offset_9bit_signed_unscaled_p (mode, offset)
3499 || offset_12bit_unsigned_scaled_p (mode, offset));
3502 if (allow_reg_index_p)
3504 /* Look for base + (scaled/extended) index register. */
3505 if (aarch64_base_register_rtx_p (op0, strict_p)
3506 && aarch64_classify_index (info, op1, mode, strict_p))
3511 if (aarch64_base_register_rtx_p (op1, strict_p)
3512 && aarch64_classify_index (info, op0, mode, strict_p))
3525 info->type = ADDRESS_REG_WB;
3526 info->base = XEXP (x, 0);
3527 info->offset = NULL_RTX;
3528 return aarch64_base_register_rtx_p (info->base, strict_p);
3532 info->type = ADDRESS_REG_WB;
3533 info->base = XEXP (x, 0);
3534 if (GET_CODE (XEXP (x, 1)) == PLUS
3535 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3536 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3537 && aarch64_base_register_rtx_p (info->base, strict_p))
3539 HOST_WIDE_INT offset;
3540 info->offset = XEXP (XEXP (x, 1), 1);
3541 offset = INTVAL (info->offset);
3543 /* TImode and TFmode values are allowed in both pairs of X
3544 registers and individual Q registers.  The available address modes are:
3546 X,X: 7-bit signed scaled offset
3547 Q: 9-bit signed offset
3548 We conservatively require an offset representable in either mode.  */
3550 if (mode == TImode || mode == TFmode)
3551 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3552 && offset_9bit_signed_unscaled_p (mode, offset));
3554 if (load_store_pair_p)
3555 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3556 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3558 return offset_9bit_signed_unscaled_p (mode, offset);
3565 /* Load literal: pc-relative constant pool entry.  Only supported
3566 for SImode or larger.  */
3567 info->type = ADDRESS_SYMBOLIC;
3569 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3573 split_const (x, &sym, &addend);
3574 return (GET_CODE (sym) == LABEL_REF
3575 || (GET_CODE (sym) == SYMBOL_REF
3576 && CONSTANT_POOL_ADDRESS_P (sym)));
3581 info->type = ADDRESS_LO_SUM;
3582 info->base = XEXP (x, 0);
3583 info->offset = XEXP (x, 1);
3584 if (allow_reg_index_p
3585 && aarch64_base_register_rtx_p (info->base, strict_p))
3588 split_const (info->offset, &sym, &offs);
3589 if (GET_CODE (sym) == SYMBOL_REF
3590 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3591 == SYMBOL_SMALL_ABSOLUTE))
3593 /* The symbol and offset must be aligned to the access size. */
3595 unsigned int ref_size;
3597 if (CONSTANT_POOL_ADDRESS_P (sym))
3598 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3599 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3601 tree exp = SYMBOL_REF_DECL (sym);
3602 align = TYPE_ALIGN (TREE_TYPE (exp));
3603 align = CONSTANT_ALIGNMENT (exp, align);
3605 else if (SYMBOL_REF_DECL (sym))
3606 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3607 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3608 && SYMBOL_REF_BLOCK (sym) != NULL)
3609 align = SYMBOL_REF_BLOCK (sym)->alignment;
3611 align = BITS_PER_UNIT;
3613 ref_size = GET_MODE_SIZE (mode);
3615 ref_size = GET_MODE_SIZE (DImode);
3617 return ((INTVAL (offs) & (ref_size - 1)) == 0
3618 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3629 aarch64_symbolic_address_p (rtx x)
3633 split_const (x, &x, &offset);
3634 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3637 /* Classify the base of symbolic expression X, given that X appears in
3640 enum aarch64_symbol_type
3641 aarch64_classify_symbolic_expression (rtx x,
3642 enum aarch64_symbol_context context)
3646 split_const (x, &x, &offset);
3647 return aarch64_classify_symbol (x, offset, context);
3651 /* Return TRUE if X is a legitimate address for accessing memory in
3654 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3656 struct aarch64_address_info addr;
3658 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3661 /* Return TRUE if X is a legitimate address for accessing memory in
3662 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3665 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3666 RTX_CODE outer_code, bool strict_p)
3668 struct aarch64_address_info addr;
3670 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3673 /* Return TRUE if rtx X is the immediate constant 0.0.  */
3675 aarch64_float_const_zero_rtx_p (rtx x)
3679 if (GET_MODE (x) == VOIDmode)
3682 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3683 if (REAL_VALUE_MINUS_ZERO (r))
3684 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3685 return REAL_VALUES_EQUAL (r, dconst0);
3688 /* Return the fixed registers used for condition codes. */
3691 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3694 *p2 = INVALID_REGNUM;
3698 /* Emit call insn with PAT and do aarch64-specific handling. */
3701 aarch64_emit_call_insn (rtx pat)
3703 rtx insn = emit_call_insn (pat);
3705 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3706 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3707 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3711 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3713 /* All floating point compares return CCFP if it is an equality
3714 comparison, and CCFPE otherwise. */
3715 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3742 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3744 && (code == EQ || code == NE || code == LT || code == GE)
3745 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3746 || GET_CODE (x) == NEG))
3749 /* A compare with a shifted operand. Because of canonicalization,
3750 the comparison will have to be swapped when we emit the assembly code.  */
3752 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3753 && (REG_P (y) || GET_CODE (y) == SUBREG)
3754 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3755 || GET_CODE (x) == LSHIFTRT
3756 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3759 /* Similarly for a negated operand, but we can only do this for equalities.  */
3761 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3762 && (REG_P (y) || GET_CODE (y) == SUBREG)
3763 && (code == EQ || code == NE)
3764 && GET_CODE (x) == NEG)
3767 /* A compare of a mode narrower than SI mode against zero can be done
3768 by extending the value in the comparison. */
3769 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3771 /* Only use sign-extension if we really need it. */
3772 return ((code == GT || code == GE || code == LE || code == LT)
3773 ? CC_SESWPmode : CC_ZESWPmode);
3775 /* For everything else, return CCmode. */
3780 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3783 aarch64_get_condition_code (rtx x)
3785 machine_mode mode = GET_MODE (XEXP (x, 0));
3786 enum rtx_code comp_code = GET_CODE (x);
3788 if (GET_MODE_CLASS (mode) != MODE_CC)
3789 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3790 return aarch64_get_condition_code_1 (mode, comp_code);
3794 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3796 int ne = -1, eq = -1;
3803 case GE: return AARCH64_GE;
3804 case GT: return AARCH64_GT;
3805 case LE: return AARCH64_LS;
3806 case LT: return AARCH64_MI;
3807 case NE: return AARCH64_NE;
3808 case EQ: return AARCH64_EQ;
3809 case ORDERED: return AARCH64_VC;
3810 case UNORDERED: return AARCH64_VS;
3811 case UNLT: return AARCH64_LT;
3812 case UNLE: return AARCH64_LE;
3813 case UNGT: return AARCH64_HI;
3814 case UNGE: return AARCH64_PL;
3872 case NE: return AARCH64_NE;
3873 case EQ: return AARCH64_EQ;
3874 case GE: return AARCH64_GE;
3875 case GT: return AARCH64_GT;
3876 case LE: return AARCH64_LE;
3877 case LT: return AARCH64_LT;
3878 case GEU: return AARCH64_CS;
3879 case GTU: return AARCH64_HI;
3880 case LEU: return AARCH64_LS;
3881 case LTU: return AARCH64_CC;
3891 case NE: return AARCH64_NE;
3892 case EQ: return AARCH64_EQ;
3893 case GE: return AARCH64_LE;
3894 case GT: return AARCH64_LT;
3895 case LE: return AARCH64_GE;
3896 case LT: return AARCH64_GT;
3897 case GEU: return AARCH64_LS;
3898 case GTU: return AARCH64_CC;
3899 case LEU: return AARCH64_CS;
3900 case LTU: return AARCH64_HI;
3908 case NE: return AARCH64_NE;
3909 case EQ: return AARCH64_EQ;
3910 case GE: return AARCH64_PL;
3911 case LT: return AARCH64_MI;
3919 case NE: return AARCH64_NE;
3920 case EQ: return AARCH64_EQ;
3930 if (comp_code == NE)
3933 if (comp_code == EQ)
3940 aarch64_const_vec_all_same_in_range_p (rtx x,
3941 HOST_WIDE_INT minval,
3942 HOST_WIDE_INT maxval)
3944 HOST_WIDE_INT firstval;
3947 if (GET_CODE (x) != CONST_VECTOR
3948 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3951 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3952 if (firstval < minval || firstval > maxval)
3955 count = CONST_VECTOR_NUNITS (x);
3956 for (i = 1; i < count; i++)
3957 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3964 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3966 return aarch64_const_vec_all_same_in_range_p (x, val, val);
3970 bit_count (unsigned HOST_WIDE_INT value)
3984 #define AARCH64_CC_V 1
3985 #define AARCH64_CC_C (1 << 1)
3986 #define AARCH64_CC_Z (1 << 2)
3987 #define AARCH64_CC_N (1 << 3)
3989 /* N Z C V flags for ccmp. The first code is for AND op and the other
3990 is for IOR op. Indexed by AARCH64_COND_CODE. */
3991 static const int aarch64_nzcv_codes[][2] =
3993 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
3994 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
3995 {AARCH64_CC_C, 0}, /* CS, C == 1. */
3996 {0, AARCH64_CC_C}, /* CC, C == 0. */
3997 {AARCH64_CC_N, 0}, /* MI, N == 1. */
3998 {0, AARCH64_CC_N}, /* PL, N == 0. */
3999 {AARCH64_CC_V, 0}, /* VS, V == 1. */
4000 {0, AARCH64_CC_V}, /* VC, V == 0. */
4001 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
4002 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
4003 {0, AARCH64_CC_V}, /* GE, N == V. */
4004 {AARCH64_CC_V, 0}, /* LT, N != V. */
4005 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
4006 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
4007 {0, 0}, /* AL, Any. */
4008 {0, 0}, /* NV, Any. */
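/* The NZCV immediate finally printed is the inclusive OR of the bits
   defined above; e.g. AARCH64_CC_Z prints as 4 (Z is bit 2) and
   AARCH64_CC_N as 8 (bit 3).  */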
4012 aarch64_ccmp_mode_to_code (enum machine_mode mode)
4053 aarch64_print_operand (FILE *f, rtx x, char code)
4057 /* An integer or symbol address without a preceding # sign. */
4059 switch (GET_CODE (x))
4062 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4066 output_addr_const (f, x);
4070 if (GET_CODE (XEXP (x, 0)) == PLUS
4071 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4073 output_addr_const (f, x);
4079 output_operand_lossage ("Unsupported operand for code '%c'", code);
4084 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4088 if (!CONST_INT_P (x)
4089 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4091 output_operand_lossage ("invalid operand for '%%%c'", code);
4107 output_operand_lossage ("invalid operand for '%%%c'", code);
4117 /* Print N such that 2^N == X. */
4118 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4120 output_operand_lossage ("invalid operand for '%%%c'", code);
4124 asm_fprintf (f, "%d", n);
4129 /* Print the number of non-zero bits in X (a const_int). */
4130 if (!CONST_INT_P (x))
4132 output_operand_lossage ("invalid operand for '%%%c'", code);
4136 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4140 /* Print the higher numbered register of a pair (TImode) of regs. */
4141 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4143 output_operand_lossage ("invalid operand for '%%%c'", code);
4147 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4153 /* Print a condition (eq, ne, etc). */
4155 /* CONST_TRUE_RTX means always -- that's the default. */
4156 if (x == const_true_rtx)
4159 if (!COMPARISON_P (x))
4161 output_operand_lossage ("invalid operand for '%%%c'", code);
4165 cond_code = aarch64_get_condition_code (x);
4166 gcc_assert (cond_code >= 0);
4167 fputs (aarch64_condition_codes[cond_code], f);
4174 /* Print the inverse of a condition (eq <-> ne, etc). */
4176 /* CONST_TRUE_RTX means never -- that's the default. */
4177 if (x == const_true_rtx)
4183 if (!COMPARISON_P (x))
4185 output_operand_lossage ("invalid operand for '%%%c'", code);
4188 cond_code = aarch64_get_condition_code (x);
4189 gcc_assert (cond_code >= 0);
4190 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4200 /* Print a scalar FP/SIMD register name. */
4201 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4203 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4206 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4213 /* Print the first FP/SIMD register name in a list. */
4214 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4216 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4219 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4223 /* Print a scalar FP/SIMD register name + 1. */
4224 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4226 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4229 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4233 /* Print bottom 16 bits of integer constant in hex. */
4234 if (!CONST_INT_P (x))
4236 output_operand_lossage ("invalid operand for '%%%c'", code);
4239 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4244 /* Print a general register name or the zero register (32-bit or
4247 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4249 asm_fprintf (f, "%czr", code);
4253 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4255 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4259 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4261 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4268 /* Print a normal operand, if it's a general register, then we assume DImode.  */
4272 output_operand_lossage ("missing operand");
4276 switch (GET_CODE (x))
4279 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4283 aarch64_memory_reference_mode = GET_MODE (x);
4284 output_address (XEXP (x, 0));
4289 output_addr_const (asm_out_file, x);
4293 asm_fprintf (f, "%wd", INTVAL (x));
4297 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4300 aarch64_const_vec_all_same_in_range_p (x,
4302 HOST_WIDE_INT_MAX));
4303 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4305 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4314 /* CONST_DOUBLE can represent a double-width integer.
4315 In this case, the mode of x is VOIDmode. */
4316 if (GET_MODE (x) == VOIDmode)
4318 else if (aarch64_float_const_zero_rtx_p (x))
4323 else if (aarch64_float_const_representable_p (x))
4326 char float_buf[buf_size] = {'\0'};
4328 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4329 real_to_decimal_for_mode (float_buf, &r,
4332 asm_fprintf (asm_out_file, "%s", float_buf);
4336 output_operand_lossage ("invalid constant");
4339 output_operand_lossage ("invalid operand");
4345 if (GET_CODE (x) == HIGH)
4348 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4350 case SYMBOL_SMALL_GOT:
4351 asm_fprintf (asm_out_file, ":got:");
4354 case SYMBOL_SMALL_TLSGD:
4355 asm_fprintf (asm_out_file, ":tlsgd:");
4358 case SYMBOL_SMALL_TLSDESC:
4359 asm_fprintf (asm_out_file, ":tlsdesc:");
4362 case SYMBOL_SMALL_GOTTPREL:
4363 asm_fprintf (asm_out_file, ":gottprel:");
4366 case SYMBOL_SMALL_TPREL:
4367 asm_fprintf (asm_out_file, ":tprel:");
4370 case SYMBOL_TINY_GOT:
4377 output_addr_const (asm_out_file, x);
4381 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4383 case SYMBOL_SMALL_GOT:
4384 asm_fprintf (asm_out_file, ":lo12:");
4387 case SYMBOL_SMALL_TLSGD:
4388 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4391 case SYMBOL_SMALL_TLSDESC:
4392 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4395 case SYMBOL_SMALL_GOTTPREL:
4396 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4399 case SYMBOL_SMALL_TPREL:
4400 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4403 case SYMBOL_TINY_GOT:
4404 asm_fprintf (asm_out_file, ":got:");
4410 output_addr_const (asm_out_file, x);
4415 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4417 case SYMBOL_SMALL_TPREL:
4418 asm_fprintf (asm_out_file, ":tprel_hi12:");
4423 output_addr_const (asm_out_file, x);
4431 if (!COMPARISON_P (x))
4433 output_operand_lossage ("invalid operand for '%%%c'", code);
4437 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4438 gcc_assert (cond_code >= 0);
4439 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4448 if (!COMPARISON_P (x))
4450 output_operand_lossage ("invalid operand for '%%%c'", code);
4454 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4455 gcc_assert (cond_code >= 0);
4456 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4461 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4467 aarch64_print_operand_address (FILE *f, rtx x)
4469 struct aarch64_address_info addr;
4471 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4475 case ADDRESS_REG_IMM:
4476 if (addr.offset == const0_rtx)
4477 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4479 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4480 INTVAL (addr.offset));
4483 case ADDRESS_REG_REG:
4484 if (addr.shift == 0)
4485 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4486 reg_names [REGNO (addr.offset)]);
4488 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4489 reg_names [REGNO (addr.offset)], addr.shift);
4492 case ADDRESS_REG_UXTW:
4493 if (addr.shift == 0)
4494 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4495 REGNO (addr.offset) - R0_REGNUM);
4497 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4498 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4501 case ADDRESS_REG_SXTW:
4502 if (addr.shift == 0)
4503 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4504 REGNO (addr.offset) - R0_REGNUM);
4506 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4507 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4510 case ADDRESS_REG_WB:
4511 switch (GET_CODE (x))
4514 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4515 GET_MODE_SIZE (aarch64_memory_reference_mode));
4518 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4519 GET_MODE_SIZE (aarch64_memory_reference_mode));
4522 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4523 GET_MODE_SIZE (aarch64_memory_reference_mode));
4526 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4527 GET_MODE_SIZE (aarch64_memory_reference_mode));
4530 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4531 INTVAL (addr.offset));
4534 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4535 INTVAL (addr.offset));
4542 case ADDRESS_LO_SUM:
4543 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4544 output_addr_const (f, addr.offset);
4545 asm_fprintf (f, "]");
4548 case ADDRESS_SYMBOLIC:
4552 output_addr_const (f, x);
4556 aarch64_label_mentioned_p (rtx x)
4561 if (GET_CODE (x) == LABEL_REF)
4564 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4565 referencing instruction, but they are constant offsets, not real labels.  */
4567 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4570 fmt = GET_RTX_FORMAT (GET_CODE (x));
4571 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4577 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4578 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4581 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4588 /* Implement REGNO_REG_CLASS. */
4591 aarch64_regno_regclass (unsigned regno)
4593 if (GP_REGNUM_P (regno))
4594 return GENERAL_REGS;
4596 if (regno == SP_REGNUM)
4599 if (regno == FRAME_POINTER_REGNUM
4600 || regno == ARG_POINTER_REGNUM)
4601 return POINTER_REGS;
4603 if (FP_REGNUM_P (regno))
4604 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4610 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4612 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4613 where mask is selected by alignment and size of the offset.
4614 We try to pick as large a range for the offset as possible to
4615 maximize the chance of a CSE. However, for aligned addresses
4616 we limit the range to 4k so that structures with different sized
4617 elements are likely to use the same base. */
4619 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4621 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4622 HOST_WIDE_INT base_offset;
4624 /* Does it look like we'll need a load/store-pair operation? */
4625 if (GET_MODE_SIZE (mode) > 16
4627 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4628 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4629 /* For offsets that aren't a multiple of the access size, the limit is -256...255.  */
4631 else if (offset & (GET_MODE_SIZE (mode) - 1))
4632 base_offset = (offset + 0x100) & ~0x1ff;
4634 base_offset = offset & ~0xfff;
4636 if (base_offset == 0)
4639 offset -= base_offset;
4640 rtx base_reg = gen_reg_rtx (Pmode);
4641 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4643 emit_move_insn (base_reg, val);
4644 x = plus_constant (Pmode, base_reg, offset);
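/* Worked example (a sketch): an SImode access at base + 0x12344 is
   rewritten as tmp = base + 0x12000 followed by a load at tmp + 0x344;
   the 4k-aligned anchor makes it likely that neighbouring accesses
   CSE the same tmp.  */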
4650 /* Try a machine-dependent way of reloading an illegitimate address
4651 operand. If we find one, push the reload and return the new rtx. */
4654 aarch64_legitimize_reload_address (rtx *x_p,
4656 int opnum, int type,
4657 int ind_levels ATTRIBUTE_UNUSED)
4661 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4662 if (aarch64_vect_struct_mode_p (mode)
4663 && GET_CODE (x) == PLUS
4664 && REG_P (XEXP (x, 0))
4665 && CONST_INT_P (XEXP (x, 1)))
4669 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4670 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4671 opnum, (enum reload_type) type);
4675 /* We must recognize output that we have already generated ourselves. */
4676 if (GET_CODE (x) == PLUS
4677 && GET_CODE (XEXP (x, 0)) == PLUS
4678 && REG_P (XEXP (XEXP (x, 0), 0))
4679 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4680 && CONST_INT_P (XEXP (x, 1)))
4682 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4683 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4684 opnum, (enum reload_type) type);
4688 /* We wish to handle large displacements off a base register by splitting
4689 the addend across an add and the mem insn. This can cut the number of
4690 extra insns needed from 3 to 1. It is only useful for load/store of a
4691 single register with a 12-bit offset field.  */
4692 if (GET_CODE (x) == PLUS
4693 && REG_P (XEXP (x, 0))
4694 && CONST_INT_P (XEXP (x, 1))
4695 && HARD_REGISTER_P (XEXP (x, 0))
4698 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4700 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4701 HOST_WIDE_INT low = val & 0xfff;
4702 HOST_WIDE_INT high = val - low;
4705 machine_mode xmode = GET_MODE (x);
4707 /* In ILP32, xmode can be either DImode or SImode. */
4708 gcc_assert (xmode == DImode || xmode == SImode);
4710 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4711 BLKmode alignment. */
4712 if (GET_MODE_SIZE (mode) == 0)
4715 offs = low % GET_MODE_SIZE (mode);
4717 /* Align misaligned offset by adjusting high part to compensate. */
4720 if (aarch64_uimm12_shift (high + offs))
4729 offs = GET_MODE_SIZE (mode) - offs;
4731 high = high + (low & 0x1000) - offs;
4736 /* Check for overflow. */
4737 if (high + low != val)
4740 cst = GEN_INT (high);
4741 if (!aarch64_uimm12_shift (high))
4742 cst = force_const_mem (xmode, cst);
4744 /* Reload high part into base reg, leaving the low part
4745 in the mem instruction.
4746 Note that replacing this gen_rtx_PLUS with plus_constant is
4747 wrong in this case because we rely on the
4748 (plus (plus reg c1) c2) structure being preserved so that
4749 XEXP (*p, 0) in push_reload below uses the correct term. */
4750 x = gen_rtx_PLUS (xmode,
4751 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4754 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4755 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4756 opnum, (enum reload_type) type);
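/* For instance (illustration only): a DImode access at reg + 0x12340
   reloads high == 0x12000 into the base register and keeps
   low == 0x340 in the memory reference, costing one extra ADD instead
   of a full constant build.  */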
4765 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4768 secondary_reload_info *sri)
4770 /* Without the TARGET_SIMD instructions we cannot move a Q register
4771 to a Q register directly. We need a scratch. */
4772 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4773 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4774 && reg_class_subset_p (rclass, FP_REGS))
4777 sri->icode = CODE_FOR_aarch64_reload_movtf;
4778 else if (mode == TImode)
4779 sri->icode = CODE_FOR_aarch64_reload_movti;
4783 /* A TFmode or TImode memory access should be handled via FP_REGS
4784 because AArch64 has richer addressing modes for LDR/STR instructions
4785 than LDP/STP instructions. */
4786 if (TARGET_FLOAT && rclass == GENERAL_REGS
4787 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4790 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4791 return GENERAL_REGS;
4797 aarch64_can_eliminate (const int from, const int to)
4799 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4800 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4802 if (frame_pointer_needed)
4804 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4806 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4808 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4809 && !cfun->calls_alloca)
4811 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4818 /* If we decided that we didn't need a leaf frame pointer but then used
4819 LR in the function, then we'll want a frame pointer after all, so
4820 prevent this elimination to ensure a frame pointer is used. */
4821 if (to == STACK_POINTER_REGNUM
4822 && flag_omit_leaf_frame_pointer
4823 && df_regs_ever_live_p (LR_REGNUM))
4831 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4833 aarch64_layout_frame ();
4835 if (to == HARD_FRAME_POINTER_REGNUM)
4837 if (from == ARG_POINTER_REGNUM)
4838 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4840 if (from == FRAME_POINTER_REGNUM)
4841 return (cfun->machine->frame.hard_fp_offset
4842 - cfun->machine->frame.saved_varargs_size);
4845 if (to == STACK_POINTER_REGNUM)
4847 if (from == FRAME_POINTER_REGNUM)
4848 return (cfun->machine->frame.frame_size
4849 - cfun->machine->frame.saved_varargs_size);
4852 return cfun->machine->frame.frame_size;
4855 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4859 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4863 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4868 aarch64_asm_trampoline_template (FILE *f)
4872 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4873 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4877 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4878 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4880 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4881 assemble_aligned_integer (4, const0_rtx);
4882 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4883 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4887 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4889 rtx fnaddr, mem, a_tramp;
4890 const int tramp_code_sz = 16;
4892 /* Don't need to copy the trailing D-words, we fill those in below. */
4893 emit_block_move (m_tramp, assemble_trampoline_template (),
4894 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4895 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4896 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4897 if (GET_MODE (fnaddr) != ptr_mode)
4898 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4899 emit_move_insn (mem, fnaddr);
4901 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4902 emit_move_insn (mem, chain_value);
4904 /* XXX We should really define a "clear_cache" pattern and use
4905 gen_clear_cache(). */
4906 a_tramp = XEXP (m_tramp, 0);
4907 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4908 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4909 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4913 static unsigned char
4914 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4918 case CALLER_SAVE_REGS:
4925 aarch64_vector_mode_p (mode)
4926 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
4927 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
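/* For example: TImode needs (16 + 7) / 8 == 2 general registers,
   while a 16-byte vector such as V4SImode occupies a single V
   register.  */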
4941 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4943 if (regclass == POINTER_REGS)
4944 return GENERAL_REGS;
4946 if (regclass == STACK_REG)
4949 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4955 /* If it's an integer immediate that MOVI can't handle, then
4956 FP_REGS is not an option, so we return NO_REGS instead. */
4957 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4958 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4961 /* Register elimination can result in a request for
4962 SP+constant->FP_REGS.  We cannot support such operations, which
4963 use SP as source and an FP_REG as destination, so reject them out of hand.  */
4965 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4967 rtx lhs = XEXP (x, 0);
4969 /* Look through a possible SUBREG introduced by ILP32. */
4970 if (GET_CODE (lhs) == SUBREG)
4971 lhs = SUBREG_REG (lhs);
4973 gcc_assert (REG_P (lhs));
4974 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4983 aarch64_asm_output_labelref (FILE* f, const char *name)
4985 asm_fprintf (f, "%U%s", name);
4989 aarch64_elf_asm_constructor (rtx symbol, int priority)
4991 if (priority == DEFAULT_INIT_PRIORITY)
4992 default_ctor_section_asm_out_constructor (symbol, priority);
4997 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4998 s = get_section (buf, SECTION_WRITE, NULL);
4999 switch_to_section (s);
5000 assemble_align (POINTER_SIZE);
5001 assemble_aligned_integer (POINTER_BYTES, symbol);
5006 aarch64_elf_asm_destructor (rtx symbol, int priority)
5008 if (priority == DEFAULT_INIT_PRIORITY)
5009 default_dtor_section_asm_out_destructor (symbol, priority);
5014 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5015 s = get_section (buf, SECTION_WRITE, NULL);
5016 switch_to_section (s);
5017 assemble_align (POINTER_SIZE);
5018 assemble_aligned_integer (POINTER_BYTES, symbol);
5023 aarch64_output_casesi (rtx *operands)
5027 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5029 static const char *const patterns[4][2] =
5032 "ldrb\t%w3, [%0,%w1,uxtw]",
5033 "add\t%3, %4, %w3, sxtb #2"
5036 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5037 "add\t%3, %4, %w3, sxth #2"
5040 "ldr\t%w3, [%0,%w1,uxtw #2]",
5041 "add\t%3, %4, %w3, sxtw #2"
5043 /* We assume that DImode is only generated when not optimizing and
5044 that we don't really need 64-bit address offsets. That would
5045 imply an object file with 8GB of code in a single function! */
5047 "ldr\t%w3, [%0,%w1,uxtw #2]",
5048 "add\t%3, %4, %w3, sxtw #2"
5052 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5054 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5056 gcc_assert (index >= 0 && index <= 3);
5058 /* Need to implement table size reduction by changing the code below. */
5059 output_asm_insn (patterns[index][0], operands);
5060 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5061 snprintf (buf, sizeof (buf),
5062 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5063 output_asm_insn (buf, operands);
5064 output_asm_insn (patterns[index][1], operands);
5065 output_asm_insn ("br\t%3", operands);
5066 assemble_label (asm_out_file, label);
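/* For a table of HImode entries (index == 1 above), the emitted
   sequence looks roughly like this sketch, where x0 holds the table
   base, w1 the index, and x3/x4 are the scratch operands:

     ldrh  w3, [x0, w1, uxtw #1]
     adr   x4, .LrtxN
     add   x3, x4, w3, sxth #2
     br    x3
   .LrtxN:  */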
5071 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5072 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5076 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5078 if (shift >= 0 && shift <= 3)
5081 for (size = 8; size <= 32; size *= 2)
5083 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5084 if (mask == bits << shift)
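/* For example, aarch64_uxt_size (1, 0x1fe) returns 8, since 0x1fe is
   0xff << 1 and therefore describes a UXTB combined with a shift of 1;
   aarch64_uxt_size (0, 0xffff) returns 16 (a plain UXTH).  */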
5092 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5093 const_rtx x ATTRIBUTE_UNUSED)
5095 /* We can't use blocks for constants when we're using a per-function
5101 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5102 rtx x ATTRIBUTE_UNUSED,
5103 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5105 /* Force all constant pool entries into the current function section. */
5106 return function_section (current_function_decl);
5112 /* Helper function for rtx cost calculation. Strip a shift expression
5113 from X. Returns the inner operand if successful, or the original
5114 expression on failure. */
5116 aarch64_strip_shift (rtx x)
5120 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5121 we can convert both to ROR during final output. */
5122 if ((GET_CODE (op) == ASHIFT
5123 || GET_CODE (op) == ASHIFTRT
5124 || GET_CODE (op) == LSHIFTRT
5125 || GET_CODE (op) == ROTATERT
5126 || GET_CODE (op) == ROTATE)
5127 && CONST_INT_P (XEXP (op, 1)))
5128 return XEXP (op, 0);
5130 if (GET_CODE (op) == MULT
5131 && CONST_INT_P (XEXP (op, 1))
5132 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5133 return XEXP (op, 0);
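/* For example, stripping (ashift (reg X) (const_int 2)) or its
   canonical multiply form (mult (reg X) (const_int 4)) yields (reg X);
   any other expression is returned unchanged.  */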
5138 /* Helper function for rtx cost calculation. Strip an extend
5139 expression from X. Returns the inner operand if successful, or the
5140 original expression on failure. We deal with a number of possible
5141 canonicalization variations here. */
5143 aarch64_strip_extend (rtx x)
5147 /* Zero and sign extraction of a widened value. */
5148 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5149 && XEXP (op, 2) == const0_rtx
5150 && GET_CODE (XEXP (op, 0)) == MULT
5151 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5153 return XEXP (XEXP (op, 0), 0);
5155 /* It can also be represented (for zero-extend) as an AND with an
5157 if (GET_CODE (op) == AND
5158 && GET_CODE (XEXP (op, 0)) == MULT
5159 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5160 && CONST_INT_P (XEXP (op, 1))
5161 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5162 INTVAL (XEXP (op, 1))) != 0)
5163 return XEXP (XEXP (op, 0), 0);
5165 /* Now handle extended register, as this may also have an optional
5166 left shift by 1..4. */
5167 if (GET_CODE (op) == ASHIFT
5168 && CONST_INT_P (XEXP (op, 1))
5169 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5172 if (GET_CODE (op) == ZERO_EXTEND
5173 || GET_CODE (op) == SIGN_EXTEND)
5182 /* Return true iff CODE is a shift supported in combination
5183 with arithmetic instructions. */
5186 aarch64_shift_p (enum rtx_code code)
5188 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5191 /* Helper function for rtx cost calculation. Calculate the cost of
5192 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5193 Return the calculated cost of the expression, recursing manually into
5194 operands where needed. */
5197 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5200 const struct cpu_cost_table *extra_cost
5201 = aarch64_tune_params->insn_extra_cost;
5203 bool compound_p = (outer == PLUS || outer == MINUS);
5204 machine_mode mode = GET_MODE (x);
5206 gcc_checking_assert (code == MULT);
5211 if (VECTOR_MODE_P (mode))
5212 mode = GET_MODE_INNER (mode);
5214 /* Integer multiply/fma. */
5215 if (GET_MODE_CLASS (mode) == MODE_INT)
5217 /* The multiply will be canonicalized as a shift; cost it as such. */
5218 if (aarch64_shift_p (GET_CODE (x))
5219 || (CONST_INT_P (op1)
5220 && exact_log2 (INTVAL (op1)) > 0))
5222 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5223 || GET_CODE (op0) == SIGN_EXTEND;
5229 /* ARITH + shift-by-register. */
5230 cost += extra_cost->alu.arith_shift_reg;
5232 /* ARITH + extended register. We don't have a cost field
5233 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5234 cost += extra_cost->alu.extend_arith;
5236 /* ARITH + shift-by-immediate. */
5237 cost += extra_cost->alu.arith_shift;
5240 /* LSL (immediate). */
5241 cost += extra_cost->alu.shift;
5244 /* Strip extends as we will have costed them in the case above. */
5246 op0 = aarch64_strip_extend (op0);
5248 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5253 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5254 compound and let the below cases handle it. After all, MNEG is a
5255 special-case alias of MSUB. */
5256 if (GET_CODE (op0) == NEG)
5258 op0 = XEXP (op0, 0);
5262 /* Integer multiplies or FMAs have zero/sign extending variants. */
5263 if ((GET_CODE (op0) == ZERO_EXTEND
5264 && GET_CODE (op1) == ZERO_EXTEND)
5265 || (GET_CODE (op0) == SIGN_EXTEND
5266 && GET_CODE (op1) == SIGN_EXTEND))
5268 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5269 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5274 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5275 cost += extra_cost->mult[0].extend_add;
5277 /* MUL/SMULL/UMULL. */
5278 cost += extra_cost->mult[0].extend;
5284 /* This is either an integer multiply or a MADD. In both cases
5285 we want to recurse and cost the operands. */
5286 cost += rtx_cost (op0, MULT, 0, speed)
5287 + rtx_cost (op1, MULT, 1, speed);
5293 cost += extra_cost->mult[mode == DImode].add;
5296 cost += extra_cost->mult[mode == DImode].simple;
5305 /* Floating-point FMA/FMUL can also support negations of the
5307 if (GET_CODE (op0) == NEG)
5308 op0 = XEXP (op0, 0);
5309 if (GET_CODE (op1) == NEG)
5310 op1 = XEXP (op1, 0);
5313 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5314 cost += extra_cost->fp[mode == DFmode].fma;
5317 cost += extra_cost->fp[mode == DFmode].mult;
5320 cost += rtx_cost (op0, MULT, 0, speed)
5321 + rtx_cost (op1, MULT, 1, speed);
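/* A sketch of the RTL-to-instruction mappings costed above:

     (plus (mult a b) c)                             -> MADD
     (mult (neg a) b)                                -> MNEG
     (mult (sign_extend a) (sign_extend b))          -> SMULL
     (plus (mult (zero_extend a) (zero_extend b)) c) -> UMADDL  */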
5327 aarch64_address_cost (rtx x,
5329 addr_space_t as ATTRIBUTE_UNUSED,
5332 enum rtx_code c = GET_CODE (x);
5333 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
5334 struct aarch64_address_info info;
5338 if (!aarch64_classify_address (&info, x, mode, c, false))
5340 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5342 /* This is a CONST or SYMBOL ref which will be split
5343 in a different way depending on the code model in use.
5344 Cost it through the generic infrastructure. */
5345 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5346 /* Divide through by the cost of one instruction to
5347 bring it to the same units as the address costs. */
5348 cost_symbol_ref /= COSTS_N_INSNS (1);
5349 /* The cost is then the cost of preparing the address,
5350 followed by an immediate (possibly 0) offset. */
5351 return cost_symbol_ref + addr_cost->imm_offset;
5355 /* This is most likely a jump table from a case
5357 return addr_cost->register_offset;
5363 case ADDRESS_LO_SUM:
5364 case ADDRESS_SYMBOLIC:
5365 case ADDRESS_REG_IMM:
5366 cost += addr_cost->imm_offset;
5369 case ADDRESS_REG_WB:
5370 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5371 cost += addr_cost->pre_modify;
5372 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5373 cost += addr_cost->post_modify;
5379 case ADDRESS_REG_REG:
5380 cost += addr_cost->register_offset;
5383 case ADDRESS_REG_UXTW:
5384 case ADDRESS_REG_SXTW:
5385 cost += addr_cost->register_extend;
5395 /* For the sake of calculating the cost of the shifted register
5396 component, we can treat same-sized modes in the same way. */
5397 switch (GET_MODE_BITSIZE (mode))
5400 cost += addr_cost->addr_scale_costs.hi;
5404 cost += addr_cost->addr_scale_costs.si;
5408 cost += addr_cost->addr_scale_costs.di;
5411 /* We can't tell, or this is a 128-bit vector. */
5413 cost += addr_cost->addr_scale_costs.ti;
5421 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5422 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5426 aarch64_branch_cost (bool speed_p, bool predictable_p)
5428 /* When optimizing for speed, use the cost of unpredictable branches. */
5429 const struct cpu_branch_cost *branch_costs =
5430 aarch64_tune_params->branch_costs;
5432 if (!speed_p || predictable_p)
5433 return branch_costs->predictable;
5435 return branch_costs->unpredictable;
5438 /* Return true if the RTX X in mode MODE is a zero or sign extract
5439 usable in an ADD or SUB (extended register) instruction. */
5441 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5443 /* Catch add with a sign extract.
5444 This is add_<optab><mode>_multp2. */
5445 if (GET_CODE (x) == SIGN_EXTRACT
5446 || GET_CODE (x) == ZERO_EXTRACT)
5448 rtx op0 = XEXP (x, 0);
5449 rtx op1 = XEXP (x, 1);
5450 rtx op2 = XEXP (x, 2);
5452 if (GET_CODE (op0) == MULT
5453 && CONST_INT_P (op1)
5454 && op2 == const0_rtx
5455 && CONST_INT_P (XEXP (op0, 1))
5456 && aarch64_is_extend_from_extract (mode,
5468 aarch64_frint_unspec_p (unsigned int u)
5486 /* Return true iff X is an rtx that will match an extr instruction
5487 i.e. as described in the *extr<mode>5_insn family of patterns.
5488 OP0 and OP1 will be set to the operands of the shifts involved
5489 on success and will be NULL_RTX otherwise. */
5492 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5495 machine_mode mode = GET_MODE (x);
5497 *res_op0 = NULL_RTX;
5498 *res_op1 = NULL_RTX;
5500 if (GET_CODE (x) != IOR)
5506 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5507 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5509 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5510 if (GET_CODE (op1) == ASHIFT)
5511 std::swap (op0, op1);
5513 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5516 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5517 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5519 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5520 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5522 *res_op0 = XEXP (op0, 0);
5523 *res_op1 = XEXP (op1, 0);
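/* For example, in DImode (ior (ashift a (const_int 16))
   (lshiftrt b (const_int 48))) matches: the shift amounts sum to 64,
   so it can be emitted as "extr xd, a, b, #48" (a sketch).  */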
5531 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5532 storing it in *COST. Result is true if the total cost of the operation
5533 has now been calculated. */
5535 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5539 enum rtx_code cmpcode;
5541 if (COMPARISON_P (op0))
5543 inner = XEXP (op0, 0);
5544 comparator = XEXP (op0, 1);
5545 cmpcode = GET_CODE (op0);
5550 comparator = const0_rtx;
5554 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5556 /* Conditional branch. */
5557 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5561 if (cmpcode == NE || cmpcode == EQ)
5563 if (comparator == const0_rtx)
5565 /* TBZ/TBNZ/CBZ/CBNZ. */
5566 if (GET_CODE (inner) == ZERO_EXTRACT)
5568 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5572 *cost += rtx_cost (inner, cmpcode, 0, speed);
5577 else if (cmpcode == LT || cmpcode == GE)
5580 if (comparator == const0_rtx)
5585 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5587 /* It's a conditional operation based on the status flags,
5588 so it must be some flavor of CSEL. */
5590 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5591 if (GET_CODE (op1) == NEG
5592 || GET_CODE (op1) == NOT
5593 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5594 op1 = XEXP (op1, 0);
5596 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5597 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
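/* E.g. (if_then_else (ne x (const_int 0)) (label_ref L) (pc)) becomes
   a CBNZ, while a CC-based (if_then_else ... (neg a) b) maps onto
   CSNEG; CSINV and CSINC fall out of the stripping above in the same
   way (a sketch of the cases recognized here).  */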
5601 /* We don't know what this is; cost all operands. */
5605 /* Calculate the cost of calculating X, storing it in *COST. Result
5606 is true if the total cost of the operation has now been calculated. */
5608 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5609 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5612 const struct cpu_cost_table *extra_cost
5613 = aarch64_tune_params->insn_extra_cost;
5614 machine_mode mode = GET_MODE (x);
5616 /* By default, assume that everything has equivalent cost to the
5617 cheapest instruction. Any additional costs are applied as a delta
5618 above this default. */
5619 *cost = COSTS_N_INSNS (1);
5624 /* The cost depends entirely on the operands to SET. */
5629 switch (GET_CODE (op0))
5634 rtx address = XEXP (op0, 0);
5635 if (VECTOR_MODE_P (mode))
5636 *cost += extra_cost->ldst.storev;
5637 else if (GET_MODE_CLASS (mode) == MODE_INT)
5638 *cost += extra_cost->ldst.store;
5639 else if (mode == SFmode)
5640 *cost += extra_cost->ldst.storef;
5641 else if (mode == DFmode)
5642 *cost += extra_cost->ldst.stored;
5645 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5649 *cost += rtx_cost (op1, SET, 1, speed);
5653 if (! REG_P (SUBREG_REG (op0)))
5654 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5658 /* The cost is one per vector-register copied. */
5659 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
5661 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5662 / GET_MODE_SIZE (V4SImode);
5663 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5665 /* const0_rtx is in general free, but we will use an
5666 instruction to set a register to 0. */
5667 else if (REG_P (op1) || op1 == const0_rtx)
5669 /* The cost is 1 per register copied. */
5670 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5672 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5675 /* Cost is just the cost of the RHS of the set. */
5676 *cost += rtx_cost (op1, SET, 1, speed);
5681 /* Bit-field insertion. Strip any redundant widening of
5682 the RHS to meet the width of the target. */
5683 if (GET_CODE (op1) == SUBREG)
5684 op1 = SUBREG_REG (op1);
5685 if ((GET_CODE (op1) == ZERO_EXTEND
5686 || GET_CODE (op1) == SIGN_EXTEND)
5687 && CONST_INT_P (XEXP (op0, 1))
5688 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5689 >= INTVAL (XEXP (op0, 1))))
5690 op1 = XEXP (op1, 0);
5692 if (CONST_INT_P (op1))
5694 /* MOV immediate is assumed to always be cheap. */
5695 *cost = COSTS_N_INSNS (1);
5701 *cost += extra_cost->alu.bfi;
5702 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5708 /* We can't make sense of this, assume default cost. */
5709 *cost = COSTS_N_INSNS (1);
5715 /* If an instruction can incorporate a constant within the
5716 instruction, the instruction's expression avoids calling
5717 rtx_cost() on the constant. If rtx_cost() is called on a
5718 constant, then it is usually because the constant must be
5719 moved into a register by one or more instructions.
5721 The exception is constant 0, which can be expressed
5722 as XZR/WZR and is therefore free. The exception to this is
5723 if we have (set (reg) (const0_rtx)) in which case we must cost
5724 the move. However, we can catch that when we cost the SET, so
5725 we don't need to consider that here. */
5726 if (x == const0_rtx)
5730 /* To an approximation, the cost of building any other constant
5731 is proportional to the number of instructions required to
5732 build that constant. This is true whether we are compiling
5733 for SPEED or otherwise. */
5734 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5735 (NULL_RTX, x, false, mode));
5742 /* mov[df,sf]_aarch64. */
5743 if (aarch64_float_const_representable_p (x))
5744 /* FMOV (scalar immediate). */
5745 *cost += extra_cost->fp[mode == DFmode].fpconst;
5746 else if (!aarch64_float_const_zero_rtx_p (x))
5748 /* This will be a load from memory. */
5750 *cost += extra_cost->ldst.loadd;
5752 *cost += extra_cost->ldst.loadf;
5755 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5756 or MOV v0.s[0], wzr, neither of which is modeled by the
5757 cost tables. Just use the default cost. */
5767 /* For loads we want the base cost of a load, plus an
5768 approximation for the additional cost of the addressing
5770 rtx address = XEXP (x, 0);
5771 if (VECTOR_MODE_P (mode))
5772 *cost += extra_cost->ldst.loadv;
5773 else if (GET_MODE_CLASS (mode) == MODE_INT)
5774 *cost += extra_cost->ldst.load;
5775 else if (mode == SFmode)
5776 *cost += extra_cost->ldst.loadf;
5777 else if (mode == DFmode)
5778 *cost += extra_cost->ldst.loadd;
5781 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5790 if (VECTOR_MODE_P (mode))
5795 *cost += extra_cost->vect.alu;
5800 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5802 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5803 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5806 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5810 /* Cost this as SUB wzr, X. */
5811 op0 = CONST0_RTX (GET_MODE (x));
5816 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5818 /* Support (neg(fma...)) as a single instruction only if
5819 sign of zeros is unimportant. This matches the decision
5820 making in aarch64.md. */
5821 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5824 *cost = rtx_cost (op0, NEG, 0, speed);
5829 *cost += extra_cost->fp[mode == DFmode].neg;
5839 if (VECTOR_MODE_P (mode))
5840 *cost += extra_cost->vect.alu;
5842 *cost += extra_cost->alu.clz;
5851 if (op1 == const0_rtx
5852 && GET_CODE (op0) == AND)
5858 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5860 /* TODO: A write to the CC flags possibly costs extra; this
5861 needs encoding in the cost tables. */
5863 /* CC_ZESWPmode supports zero extend for free. */
5864 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5865 op0 = XEXP (op0, 0);
5868 if (GET_CODE (op0) == AND)
5874 if (GET_CODE (op0) == PLUS)
5876 /* ADDS (and CMN alias). */
5881 if (GET_CODE (op0) == MINUS)
5888 if (GET_CODE (op1) == NEG)
5892 *cost += extra_cost->alu.arith;
5894 *cost += rtx_cost (op0, COMPARE, 0, speed);
5895 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5901 Compare can freely swap the order of operands, and
5902 canonicalization puts the more complex operation first.
5903 But the integer MINUS logic expects the shift/extend
5904 operation in op1. */
5906 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5914 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5918 *cost += extra_cost->fp[mode == DFmode].compare;
5920 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5922 *cost += rtx_cost (op0, COMPARE, 0, speed);
5923 /* FCMP supports constant 0.0 at no extra cost. */
5929 if (VECTOR_MODE_P (mode))
5931 /* Vector compare. */
5933 *cost += extra_cost->vect.alu;
5935 if (aarch64_float_const_zero_rtx_p (op1))
5937 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 at no extra
5951 *cost += rtx_cost (op0, MINUS, 0, speed);
5953 /* Detect valid immediates. */
5954 if ((GET_MODE_CLASS (mode) == MODE_INT
5955 || (GET_MODE_CLASS (mode) == MODE_CC
5956 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5957 && CONST_INT_P (op1)
5958 && aarch64_uimm12_shift (INTVAL (op1)))
5961 /* SUB(S) (immediate). */
5962 *cost += extra_cost->alu.arith;
5966 /* Look for SUB (extended register). */
5967 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5970 *cost += extra_cost->alu.extend_arith;
5972 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5973 (enum rtx_code) GET_CODE (op1),
5978 rtx new_op1 = aarch64_strip_extend (op1);
5980 /* Cost this as an FMA-alike operation. */
5981 if ((GET_CODE (new_op1) == MULT
5982 || aarch64_shift_p (GET_CODE (new_op1)))
5985 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5986 (enum rtx_code) code,
5991 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5995 if (VECTOR_MODE_P (mode))
5998 *cost += extra_cost->vect.alu;
6000 else if (GET_MODE_CLASS (mode) == MODE_INT)
6003 *cost += extra_cost->alu.arith;
6005 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6008 *cost += extra_cost->fp[mode == DFmode].addsub;
6022 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6023 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6026 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
6027 *cost += rtx_cost (op1, PLUS, 1, speed);
6031 if (GET_MODE_CLASS (mode) == MODE_INT
6032 && CONST_INT_P (op1)
6033 && aarch64_uimm12_shift (INTVAL (op1)))
6035 *cost += rtx_cost (op0, PLUS, 0, speed);
6038 /* ADD (immediate). */
6039 *cost += extra_cost->alu.arith;
6043 *cost += rtx_cost (op1, PLUS, 1, speed);
6045 /* Look for ADD (extended register). */
6046 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6049 *cost += extra_cost->alu.extend_arith;
6051 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
6052 (enum rtx_code) GET_CODE (op0),
6057 /* Strip any extend; leave shifts behind, as we will
6058 cost them through mult_cost. */
6059 new_op0 = aarch64_strip_extend (op0);
6061 if (GET_CODE (new_op0) == MULT
6062 || aarch64_shift_p (GET_CODE (new_op0)))
6064 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6069 *cost += rtx_cost (new_op0, PLUS, 0, speed);
6073 if (VECTOR_MODE_P (mode))
6076 *cost += extra_cost->vect.alu;
6078 else if (GET_MODE_CLASS (mode) == MODE_INT)
6081 *cost += extra_cost->alu.arith;
6083 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6086 *cost += extra_cost->fp[mode == DFmode].addsub;
6093 *cost = COSTS_N_INSNS (1);
6097 if (VECTOR_MODE_P (mode))
6098 *cost += extra_cost->vect.alu;
6100 *cost += extra_cost->alu.rev;
6105 if (aarch_rev16_p (x))
6107 *cost = COSTS_N_INSNS (1);
6111 if (VECTOR_MODE_P (mode))
6112 *cost += extra_cost->vect.alu;
6114 *cost += extra_cost->alu.rev;
6119 if (aarch64_extr_rtx_p (x, &op0, &op1))
6121 *cost += rtx_cost (op0, IOR, 0, speed)
6122 + rtx_cost (op1, IOR, 1, speed);
6124 *cost += extra_cost->alu.shift;
6135 if (VECTOR_MODE_P (mode))
6138 *cost += extra_cost->vect.alu;
6143 && GET_CODE (op0) == MULT
6144 && CONST_INT_P (XEXP (op0, 1))
6145 && CONST_INT_P (op1)
6146 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6149 /* This is a UBFM/SBFM. */
6150 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
6152 *cost += extra_cost->alu.bfx;
6156 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6158 /* We possibly get the immediate for free; this is not
6160 if (CONST_INT_P (op1)
6161 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
6163 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6166 *cost += extra_cost->alu.logical;
6174 /* Handle ORN, EON, or BIC. */
6175 if (GET_CODE (op0) == NOT)
6176 op0 = XEXP (op0, 0);
6178 new_op0 = aarch64_strip_shift (op0);
6180 /* If we had a shift on op0 then this is a logical-shift-
6181 by-register/immediate operation. Otherwise, this is just
6182 a logical operation. */
6187 /* Shift by immediate. */
6188 if (CONST_INT_P (XEXP (op0, 1)))
6189 *cost += extra_cost->alu.log_shift;
6191 *cost += extra_cost->alu.log_shift_reg;
6194 *cost += extra_cost->alu.logical;
6197 /* In both cases we want to cost both operands. */
6198 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6199 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
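/* E.g. (and (not a) b) is a single BIC, (ior (not a) b) an ORN, and
   (and (not (ashift a n)) b) a BIC (shifted register); the stripping
   above lets each of these be costed as one logical operation.  */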
6208 op0 = aarch64_strip_shift (x);
6210 if (VECTOR_MODE_P (mode))
6213 *cost += extra_cost->vect.alu;
6217 /* MVN-shifted-reg. */
6220 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6223 *cost += extra_cost->alu.log_shift;
6227 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6228 Handle the second form here taking care that 'a' in the above can
6230 else if (GET_CODE (op0) == XOR)
6232 rtx newop0 = XEXP (op0, 0);
6233 rtx newop1 = XEXP (op0, 1);
6234 rtx op0_stripped = aarch64_strip_shift (newop0);
6236 *cost += rtx_cost (newop1, (enum rtx_code) code, 1, speed)
6237 + rtx_cost (op0_stripped, XOR, 0, speed);
6241 if (op0_stripped != newop0)
6242 *cost += extra_cost->alu.log_shift;
6244 *cost += extra_cost->alu.logical;
6251 *cost += extra_cost->alu.logical;
6258 /* If a value is written in SI mode, then zero extended to DI
6259 mode, the operation will in general be free as a write to
6260 a 'w' register implicitly zeroes the upper bits of an 'x'
6261 register. However, if this is
6263 (set (reg) (zero_extend (reg)))
6265 we must cost the explicit register move. */
6267 && GET_MODE (op0) == SImode
6270 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6272 if (!op_cost && speed)
6274 *cost += extra_cost->alu.extend;
6276 /* Free; the cost is that of the SImode operation. */
6281 else if (MEM_P (XEXP (x, 0)))
6283 /* All loads can zero extend to any size for free. */
6284 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6290 if (VECTOR_MODE_P (mode))
6293 *cost += extra_cost->vect.alu;
6298 *cost += extra_cost->alu.extend;
6304 if (MEM_P (XEXP (x, 0)))
6309 rtx address = XEXP (XEXP (x, 0), 0);
6310 *cost += extra_cost->ldst.load_sign_extend;
6313 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6321 if (VECTOR_MODE_P (mode))
6322 *cost += extra_cost->vect.alu;
6324 *cost += extra_cost->alu.extend;
6332 if (CONST_INT_P (op1))
6336 if (VECTOR_MODE_P (mode))
6338 /* Vector shift (immediate). */
6339 *cost += extra_cost->vect.alu;
6343 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6345 *cost += extra_cost->alu.shift;
6349 /* We can incorporate zero/sign extend for free. */
6350 if (GET_CODE (op0) == ZERO_EXTEND
6351 || GET_CODE (op0) == SIGN_EXTEND)
6352 op0 = XEXP (op0, 0);
6354 *cost += rtx_cost (op0, ASHIFT, 0, speed);
6361 if (VECTOR_MODE_P (mode))
6363 /* Vector shift (register). */
6364 *cost += extra_cost->vect.alu;
6369 *cost += extra_cost->alu.shift_reg;
6372 return false; /* All arguments need to be in registers. */
6382 if (CONST_INT_P (op1))
6384 /* ASR (immediate) and friends. */
6387 if (VECTOR_MODE_P (mode))
6388 *cost += extra_cost->vect.alu;
6390 *cost += extra_cost->alu.shift;
6393 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6399 /* ASR (register) and friends. */
6402 if (VECTOR_MODE_P (mode))
6403 *cost += extra_cost->vect.alu;
6405 *cost += extra_cost->alu.shift_reg;
6407 return false; /* All arguments need to be in registers. */
6412 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6416 *cost += extra_cost->ldst.load;
6418 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6419 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6421 /* ADRP, followed by ADD. */
6422 *cost += COSTS_N_INSNS (1);
6424 *cost += 2 * extra_cost->alu.arith;
6426 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6427 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6431 *cost += extra_cost->alu.arith;
6436 /* One extra load instruction, after accessing the GOT. */
6437 *cost += COSTS_N_INSNS (1);
6439 *cost += extra_cost->ldst.load;
6445 /* ADRP/ADD (immediate). */
6447 *cost += extra_cost->alu.arith;
6455 if (VECTOR_MODE_P (mode))
6456 *cost += extra_cost->vect.alu;
6458 *cost += extra_cost->alu.bfx;
6461 /* We can trust that the immediates used will be correct (there
6462 are no by-register forms), so we need only cost op0. */
6463 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6467 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6468 /* aarch64_rtx_mult_cost always handles recursion to its
6476 if (VECTOR_MODE_P (mode))
6477 *cost += extra_cost->vect.alu;
6478 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6479 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6480 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6481 else if (GET_MODE (x) == DFmode)
6482 *cost += (extra_cost->fp[1].mult
6483 + extra_cost->fp[1].div);
6484 else if (GET_MODE (x) == SFmode)
6485 *cost += (extra_cost->fp[0].mult
6486 + extra_cost->fp[0].div);
6488 return false; /* All arguments need to be in registers. */
6495 if (VECTOR_MODE_P (mode))
6496 *cost += extra_cost->vect.alu;
6497 else if (GET_MODE_CLASS (mode) == MODE_INT)
6498 /* There is no integer SQRT, so only DIV and UDIV can get
6500 *cost += extra_cost->mult[mode == DImode].idiv;
6502 *cost += extra_cost->fp[mode == DFmode].div;
6504 return false; /* All arguments need to be in registers. */
6507 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6508 XEXP (x, 2), cost, speed);
6521 return false; /* All arguments must be in registers. */
6530 if (VECTOR_MODE_P (mode))
6531 *cost += extra_cost->vect.alu;
6533 *cost += extra_cost->fp[mode == DFmode].fma;
6536 /* FMSUB, FNMADD, and FNMSUB are free. */
6537 if (GET_CODE (op0) == NEG)
6538 op0 = XEXP (op0, 0);
6540 if (GET_CODE (op2) == NEG)
6541 op2 = XEXP (op2, 0);
6543 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6544 and the by-element operand as operand 0. */
6545 if (GET_CODE (op1) == NEG)
6546 op1 = XEXP (op1, 0);
6548 /* Catch vector-by-element operations. The by-element operand can
6549 either be (vec_duplicate (vec_select (x))) or just
6550 (vec_select (x)), depending on whether we are multiplying by
6551 a vector or a scalar.
6553 Canonicalization is not very good in these cases: FMA4 will put the
6554 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
6555 if (GET_CODE (op0) == VEC_DUPLICATE)
6556 op0 = XEXP (op0, 0);
6557 else if (GET_CODE (op1) == VEC_DUPLICATE)
6558 op1 = XEXP (op1, 0);
6560 if (GET_CODE (op0) == VEC_SELECT)
6561 op0 = XEXP (op0, 0);
6562 else if (GET_CODE (op1) == VEC_SELECT)
6563 op1 = XEXP (op1, 0);
6565 /* If the remaining parameters are not registers,
6566 get the cost to put them into registers. */
6567 *cost += rtx_cost (op0, FMA, 0, speed);
6568 *cost += rtx_cost (op1, FMA, 1, speed);
6569 *cost += rtx_cost (op2, FMA, 2, speed);
6573 case UNSIGNED_FLOAT:
6575 *cost += extra_cost->fp[mode == DFmode].fromint;
6581 if (VECTOR_MODE_P (mode))
6583 /* Vector truncate. */
6584 *cost += extra_cost->vect.alu;
6587 *cost += extra_cost->fp[mode == DFmode].widen;
6591 case FLOAT_TRUNCATE:
6594 if (VECTOR_MODE_P (mode))
6596 /* Vector conversion. */
6597 *cost += extra_cost->vect.alu;
6600 *cost += extra_cost->fp[mode == DFmode].narrow;
6607 /* Strip the rounding part. They will all be implemented
6608 by the fcvt* family of instructions anyway. */
6609 if (GET_CODE (x) == UNSPEC)
6611 unsigned int uns_code = XINT (x, 1);
6613 if (uns_code == UNSPEC_FRINTA
6614 || uns_code == UNSPEC_FRINTM
6615 || uns_code == UNSPEC_FRINTN
6616 || uns_code == UNSPEC_FRINTP
6617 || uns_code == UNSPEC_FRINTZ)
6618 x = XVECEXP (x, 0, 0);
6623 if (VECTOR_MODE_P (mode))
6624 *cost += extra_cost->vect.alu;
6626 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6628 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6632 if (VECTOR_MODE_P (mode))
6636 *cost += extra_cost->vect.alu;
6638 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6642 /* FABD, which is analogous to FADD. */
6643 if (GET_CODE (op0) == MINUS)
6645 *cost += rtx_cost (XEXP (op0, 0), MINUS, 0, speed)
6646 + rtx_cost (XEXP (op0, 1), MINUS, 1, speed);
6648 *cost += extra_cost->fp[mode == DFmode].addsub;
6652 /* Simple FABS is analogous to FNEG. */
6654 *cost += extra_cost->fp[mode == DFmode].neg;
6658 /* Integer ABS will either be split into
6659 two arithmetic instructions, or will be an ABS
6660 (scalar), which we don't model. */
6661 *cost = COSTS_N_INSNS (2);
6663 *cost += 2 * extra_cost->alu.arith;
6671 if (VECTOR_MODE_P (mode))
6672 *cost += extra_cost->vect.alu;
6675 /* FMAXNM/FMINNM/FMAX/FMIN.
6676 TODO: This may not be accurate for all implementations, but
6677 we do not model this in the cost tables. */
6678 *cost += extra_cost->fp[mode == DFmode].addsub;
6684 /* The floating point round to integer frint* instructions. */
6685 if (aarch64_frint_unspec_p (XINT (x, 1)))
6688 *cost += extra_cost->fp[mode == DFmode].roundint;
6693 if (XINT (x, 1) == UNSPEC_RBIT)
6696 *cost += extra_cost->alu.rev;
6704 /* Decompose <su>muldi3_highpart. */
6705 if (/* (truncate:DI */
6708 && GET_MODE (XEXP (x, 0)) == TImode
6709 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6711 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6712 /* (ANY_EXTEND:TI (reg:DI))
6713 (ANY_EXTEND:TI (reg:DI))) */
6714 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6715 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6716 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6717 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6718 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6719 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6720 /* (const_int 64) */
6721 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6722 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6726 *cost += extra_cost->mult[mode == DImode].extend;
6727 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6729 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6739 if (dump_file && (dump_flags & TDF_DETAILS))
6741 "\nFailed to cost RTX. Assuming default cost.\n");
6746 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6747 calculated for X. This cost is stored in *COST. Returns true
6748 if the total cost of X was calculated. */
6750 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6751 int param, int *cost, bool speed)
6753 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6755 if (dump_file && (dump_flags & TDF_DETAILS))
6757 print_rtl_single (dump_file, x);
6758 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6759 speed ? "Hot" : "Cold",
6760 *cost, result ? "final" : "partial");
6767 aarch64_register_move_cost (machine_mode mode,
6768 reg_class_t from_i, reg_class_t to_i)
6770 enum reg_class from = (enum reg_class) from_i;
6771 enum reg_class to = (enum reg_class) to_i;
6772 const struct cpu_regmove_cost *regmove_cost
6773 = aarch64_tune_params->regmove_cost;
6775 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6776 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6779 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6780 from = GENERAL_REGS;
6782 /* Moving between GPR and stack cost is the same as GP2GP. */
6783 if ((from == GENERAL_REGS && to == STACK_REG)
6784 || (to == GENERAL_REGS && from == STACK_REG))
6785 return regmove_cost->GP2GP;
6787 /* To/from the stack register, we move via the GPRs. */
6788 if (to == STACK_REG || from == STACK_REG)
6789 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6790 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6792 if (GET_MODE_SIZE (mode) == 16)
6794 /* 128-bit operations on general registers require 2 instructions. */
6795 if (from == GENERAL_REGS && to == GENERAL_REGS)
6796 return regmove_cost->GP2GP * 2;
6797 else if (from == GENERAL_REGS)
6798 return regmove_cost->GP2FP * 2;
6799 else if (to == GENERAL_REGS)
6800 return regmove_cost->FP2GP * 2;
6802 /* When AdvSIMD instructions are disabled it is not possible to move
6803 a 128-bit value directly between Q registers. This is handled in
6804 secondary reload. A general register is used as a scratch to move
6805 the upper DI value, and the lower DI value is moved directly;
6806 hence the cost is the sum of three moves. */
6808 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6810 return regmove_cost->FP2FP;
6813 if (from == GENERAL_REGS && to == GENERAL_REGS)
6814 return regmove_cost->GP2GP;
6815 else if (from == GENERAL_REGS)
6816 return regmove_cost->GP2FP;
6817 else if (to == GENERAL_REGS)
6818 return regmove_cost->FP2GP;
6820 return regmove_cost->FP2FP;
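/* For example, a TImode move from GENERAL_REGS to FP_REGS is costed
   as 2 * GP2FP (two 64-bit transfers), whereas a DImode move in the
   same direction costs a single GP2FP.  */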
6824 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6825 reg_class_t rclass ATTRIBUTE_UNUSED,
6826 bool in ATTRIBUTE_UNUSED)
6828 return aarch64_tune_params->memmov_cost;
6831 /* Return the number of instructions that can be issued per cycle. */
6833 aarch64_sched_issue_rate (void)
6835 return aarch64_tune_params->issue_rate;
6839 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6841 int issue_rate = aarch64_sched_issue_rate ();
6843 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6846 /* Vectorizer cost model target hooks. */
6848 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6850 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6852 int misalign ATTRIBUTE_UNUSED)
6856 switch (type_of_cost)
6859 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6862 return aarch64_tune_params->vec_costs->scalar_load_cost;
6865 return aarch64_tune_params->vec_costs->scalar_store_cost;
6868 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6871 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6874 return aarch64_tune_params->vec_costs->vec_store_cost;
6877 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6880 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6882 case unaligned_load:
6883 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6885 case unaligned_store:
6886 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6888 case cond_branch_taken:
6889 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6891 case cond_branch_not_taken:
6892 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6895 case vec_promote_demote:
6896 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6899 elements = TYPE_VECTOR_SUBPARTS (vectype);
6900 return elements / 2 + 1;
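/* E.g. constructing a four-element vector is costed as
   4 / 2 + 1 == 3 units under this deliberately simple model.  */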
6907 /* Implement targetm.vectorize.add_stmt_cost. */
6909 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6910 struct _stmt_vec_info *stmt_info, int misalign,
6911 enum vect_cost_model_location where)
6913 unsigned *cost = (unsigned *) data;
6914 unsigned retval = 0;
6916 if (flag_vect_cost_model)
6918 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6920 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6922 /* Statements in an inner loop relative to the loop being
6923 vectorized are weighted more heavily. The value here is
6924 a function (linear for now) of the loop nest level. */
6925 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6927 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6928 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6929 unsigned nest_level = loop_depth (loop);
6931 count *= nest_level;
6934 retval = (unsigned) (count * stmt_cost);
6935 cost[where] += retval;
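/* E.g. a statement of cost 1 in the body of a loop nested two deep
   contributes 2 * 1 == 2 units to the vect_body bucket.  */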
6941 static void initialize_aarch64_code_model (void);
6943 /* Parse the architecture extension string. */
6946 aarch64_parse_extension (char *str)
6948 /* The extension string is parsed left to right. */
6949 const struct aarch64_option_extension *opt = NULL;
6951 /* Flag to say whether we are adding or removing an extension. */
6952 int adding_ext = -1;
6954 while (str != NULL && *str != 0)
6960 ext = strchr (str, '+');
6967 if (len >= 2 && strncmp (str, "no", 2) == 0)
6978 error ("missing feature modifier after %qs", adding_ext ? "+"
6983 /* Scan over the extensions table trying to find an exact match. */
6984 for (opt = all_extensions; opt->name != NULL; opt++)
6986 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6988 /* Add or remove the extension. */
6990 aarch64_isa_flags |= opt->flags_on;
6992 aarch64_isa_flags &= ~(opt->flags_off);
6997 if (opt->name == NULL)
6999 /* Extension not found in list. */
7000 error ("unknown feature modifier %qs", str);
7010 /* Parse the ARCH string. */
7013 aarch64_parse_arch (void)
7016 const struct processor *arch;
7017 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
7020 strcpy (str, aarch64_arch_string);
7022 ext = strchr (str, '+');
7031 error ("missing arch name in -march=%qs", str);
7035 /* Loop through the list of supported ARCHs to find a match. */
7036 for (arch = all_architectures; arch->name != NULL; arch++)
7038 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7040 selected_arch = arch;
7041 aarch64_isa_flags = selected_arch->flags;
7044 selected_cpu = &all_cores[selected_arch->core];
7048 /* ARCH string contains at least one extension. */
7049 aarch64_parse_extension (ext);
7052 if (strcmp (selected_arch->arch, selected_cpu->arch))
7054 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
7055 selected_cpu->name, selected_arch->name);
7062 /* ARCH name not found in list. */
7063 error ("unknown value %qs for -march", str);
7067 /* Parse the CPU string. */
7070 aarch64_parse_cpu (void)
7073 const struct processor *cpu;
7074 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
7077 strcpy (str, aarch64_cpu_string);
7079 ext = strchr (str, '+');
7088 error ("missing cpu name in -mcpu=%qs", str);
7092 /* Loop through the list of supported CPUs to find a match. */
7093 for (cpu = all_cores; cpu->name != NULL; cpu++)
7095 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7098 aarch64_isa_flags = selected_cpu->flags;
7102 /* CPU string contains at least one extension. */
7103 aarch64_parse_extension (ext);
7110 /* CPU name not found in list. */
7111 error ("unknown value %qs for -mcpu", str);
7115 /* Parse the TUNE string. */
7118 aarch64_parse_tune (void)
7120 const struct processor *cpu;
7121 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
7122 strcpy (str, aarch64_tune_string);
7124 /* Loop through the list of supported CPUs to find a match. */
7125 for (cpu = all_cores; cpu->name != NULL; cpu++)
7127 if (strcmp (cpu->name, str) == 0)
7129 selected_tune = cpu;
7134 /* CPU name not found in list. */
7135 error ("unknown value %qs for -mtune", str);
7140 /* Implement TARGET_OPTION_OVERRIDE. */
7143 aarch64_override_options (void)
7145 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
7146 If either of -march or -mtune is given, they override their
7147 respective component of -mcpu.
7149 So, first parse AARCH64_CPU_STRING, then the others; be careful
7150 with -march since, if -mcpu is not present on the command line,
7151 -march must set a sensible default CPU. */
7152 if (aarch64_cpu_string)
7154 aarch64_parse_cpu ();
7157 if (aarch64_arch_string)
7159 aarch64_parse_arch ();
7162 if (aarch64_tune_string)
7164 aarch64_parse_tune ();
7167 #ifndef HAVE_AS_MABI_OPTION
7168 /* The compiler may have been configured with 2.23.* binutils, which does
7169 not have support for ILP32. */
7171 error ("Assembler does not support -mabi=ilp32");
7174 initialize_aarch64_code_model ();
7176 aarch64_build_bitmask_table ();
7178 /* This target defaults to strict volatile bitfields. */
7179 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
7180 flag_strict_volatile_bitfields = 1;
7182 /* If the user did not specify a processor, choose the default
7183 one for them. This will be the CPU set during configuration using
7184 --with-cpu, otherwise it is "generic". */
7187 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
7188 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
7191 gcc_assert (selected_cpu);
7194 selected_tune = selected_cpu;
7196 aarch64_tune_flags = selected_tune->flags;
7197 aarch64_tune = selected_tune->core;
7198 aarch64_tune_params = selected_tune->tune;
7199 aarch64_architecture_version = selected_cpu->architecture_version;
7201 if (aarch64_fix_a53_err835769 == 2)
7203 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
7204 aarch64_fix_a53_err835769 = 1;
7206 aarch64_fix_a53_err835769 = 0;
7210 aarch64_register_fma_steering ();
7212 aarch64_override_options_after_change ();
7215 /* Implement targetm.override_options_after_change. */
7218 aarch64_override_options_after_change (void)
7220 if (flag_omit_frame_pointer)
7221 flag_omit_leaf_frame_pointer = false;
7222 else if (flag_omit_leaf_frame_pointer)
7223 flag_omit_frame_pointer = true;
7225 /* If not optimizing for size, set the default
7226 alignment to what the target wants. */
7229 if (align_loops <= 0)
7230 align_loops = aarch64_tune_params->loop_align;
7231 if (align_jumps <= 0)
7232 align_jumps = aarch64_tune_params->jump_align;
7233 if (align_functions <= 0)
7234 align_functions = aarch64_tune_params->function_align;
7238 static struct machine_function *
7239 aarch64_init_machine_status (void)
7241 struct machine_function *machine;
7242 machine = ggc_cleared_alloc<machine_function> ();
7247 aarch64_init_expanders (void)
7249 init_machine_status = aarch64_init_machine_status;
7252 /* A checking mechanism for the implementation of the various code models. */
7254 initialize_aarch64_code_model (void)
7258 switch (aarch64_cmodel_var)
7260 case AARCH64_CMODEL_TINY:
7261 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
7263 case AARCH64_CMODEL_SMALL:
7264 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
7266 case AARCH64_CMODEL_LARGE:
7267 sorry ("code model %qs with -f%s", "large",
7268 flag_pic > 1 ? "PIC" : "pic");
7274 aarch64_cmodel = aarch64_cmodel_var;
7277 /* Return true if SYMBOL_REF X binds locally. */
7280 aarch64_symbol_binds_local_p (const_rtx x)
7282 return (SYMBOL_REF_DECL (x)
7283 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
7284 : SYMBOL_REF_LOCAL_P (x));
7287 /* Return true if SYMBOL_REF X is thread-local. */
7289 aarch64_tls_symbol_p (rtx x)
7291 if (! TARGET_HAVE_TLS)
7294 if (GET_CODE (x) != SYMBOL_REF)
7297 return SYMBOL_REF_TLS_MODEL (x) != 0;
7300 /* Classify a TLS symbol into one of the TLS kinds. */
7301 enum aarch64_symbol_type
7302 aarch64_classify_tls_symbol (rtx x)
7304 enum tls_model tls_kind = tls_symbolic_operand_type (x);
7308 case TLS_MODEL_GLOBAL_DYNAMIC:
7309 case TLS_MODEL_LOCAL_DYNAMIC:
7310 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
7312 case TLS_MODEL_INITIAL_EXEC:
7313 return SYMBOL_SMALL_GOTTPREL;
7315 case TLS_MODEL_LOCAL_EXEC:
7316 return SYMBOL_SMALL_TPREL;
7318 case TLS_MODEL_EMULATED:
7319 case TLS_MODEL_NONE:
7320 return SYMBOL_FORCE_TO_MEM;
7327 /* Return the method that should be used to access SYMBOL_REF or
7328 LABEL_REF X in context CONTEXT. */
7330 enum aarch64_symbol_type
7331 aarch64_classify_symbol (rtx x, rtx offset,
7332 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
7334 if (GET_CODE (x) == LABEL_REF)
7336 switch (aarch64_cmodel)
7338 case AARCH64_CMODEL_LARGE:
7339 return SYMBOL_FORCE_TO_MEM;
7341 case AARCH64_CMODEL_TINY_PIC:
7342 case AARCH64_CMODEL_TINY:
7343 return SYMBOL_TINY_ABSOLUTE;
7345 case AARCH64_CMODEL_SMALL_PIC:
7346 case AARCH64_CMODEL_SMALL:
7347 return SYMBOL_SMALL_ABSOLUTE;
7354 if (GET_CODE (x) == SYMBOL_REF)
7356 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7357 return SYMBOL_FORCE_TO_MEM;
7359 if (aarch64_tls_symbol_p (x))
7360 return aarch64_classify_tls_symbol (x);
7362 switch (aarch64_cmodel)
7364 case AARCH64_CMODEL_TINY:
7365 /* When we retrieve a symbol + offset address, we have to make sure
7366 the offset does not cause overflow of the final address. But
7367 we have no way of knowing the address of the symbol at compile time,
7368 so we can't accurately say if the distance between the PC and
7369 symbol + offset is outside the addressable range of +/-1M in the
7370 TINY code model. So we rely on images not being greater than
7371 1M, cap the offset at 1M, and require anything beyond that to
7372 be loaded using an alternative mechanism. */
7373 if (SYMBOL_REF_WEAK (x)
7374 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7375 return SYMBOL_FORCE_TO_MEM;
7376 return SYMBOL_TINY_ABSOLUTE;
7378 case AARCH64_CMODEL_SMALL:
7379 /* Same reasoning as the tiny code model, but the offset cap here is
7381 if (SYMBOL_REF_WEAK (x)
7382 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
7383 HOST_WIDE_INT_C (4294967264)))
7384 return SYMBOL_FORCE_TO_MEM;
7385 return SYMBOL_SMALL_ABSOLUTE;
7387 case AARCH64_CMODEL_TINY_PIC:
7388 if (!aarch64_symbol_binds_local_p (x))
7389 return SYMBOL_TINY_GOT;
7390 return SYMBOL_TINY_ABSOLUTE;
7392 case AARCH64_CMODEL_SMALL_PIC:
7393 if (!aarch64_symbol_binds_local_p (x))
7394 return SYMBOL_SMALL_GOT;
7395 return SYMBOL_SMALL_ABSOLUTE;
7402 /* By default push everything into the constant pool. */
7403 return SYMBOL_FORCE_TO_MEM;
7407 aarch64_constant_address_p (rtx x)
7409 return (CONSTANT_P (x) && memory_address_p (DImode, x));
7413 aarch64_legitimate_pic_operand_p (rtx x)
7415 if (GET_CODE (x) == SYMBOL_REF
7416 || (GET_CODE (x) == CONST
7417 && GET_CODE (XEXP (x, 0)) == PLUS
7418 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7424 /* Return true if X holds either a quarter-precision constant or
7425 a floating-point +0.0 constant. */
7427 aarch64_valid_floating_const (machine_mode mode, rtx x)
7429 if (!CONST_DOUBLE_P (x))
7432 /* TODO: We could handle moving 0.0 to a TFmode register,
7433 but first we would like to refactor the movtf_aarch64
7434 pattern to be more amenable to splitting moves properly and
7435 to gating correctly on TARGET_SIMD. For now, reject all
7436 constants which are not destined for SFmode or DFmode registers. */
7437 if (!(mode == SFmode || mode == DFmode))
7440 if (aarch64_float_const_zero_rtx_p (x))
7442 return aarch64_float_const_representable_p (x);
7446 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7448 /* Do not allow vector struct mode constants. We could support
7449 0 and -1 easily, but they need support in aarch64-simd.md. */
7450 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7453 /* This could probably go away because
7454 we now decompose CONST_INTs according to expand_mov_immediate. */
7455 if ((GET_CODE (x) == CONST_VECTOR
7456 && aarch64_simd_valid_immediate (x, mode, false, NULL))
7457 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7458 return !targetm.cannot_force_const_mem (mode, x);
7460 if (GET_CODE (x) == HIGH
7461 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7464 return aarch64_constant_address_p (x);
7468 aarch64_load_tp (rtx target)
7471 || GET_MODE (target) != Pmode
7472 || !register_operand (target, Pmode))
7473 target = gen_reg_rtx (Pmode);
7475 /* Can return in any reg. */
7476 emit_insn (gen_aarch64_load_tp_hard (target));
7480 /* On AAPCS systems, this is the "struct __va_list". */
7481 static GTY(()) tree va_list_type;
7483 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7484 Return the type to use as __builtin_va_list.
7486 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
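     struct  __va_list
     {
       void *__stack;
       void *__gr_top;
       void *__vr_top;
       int   __gr_offs;
       int   __vr_offs;
     };  */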
7498 aarch64_build_builtin_va_list (void)
7501 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7503 /* Create the type. */
7504 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7505 /* Give it the required name. */
7506 va_list_name = build_decl (BUILTINS_LOCATION,
7508 get_identifier ("__va_list"),
7510 DECL_ARTIFICIAL (va_list_name) = 1;
7511 TYPE_NAME (va_list_type) = va_list_name;
7512 TYPE_STUB_DECL (va_list_type) = va_list_name;
7514 /* Create the fields. */
7515 f_stack = build_decl (BUILTINS_LOCATION,
7516 FIELD_DECL, get_identifier ("__stack"),
7518 f_grtop = build_decl (BUILTINS_LOCATION,
7519 FIELD_DECL, get_identifier ("__gr_top"),
7521 f_vrtop = build_decl (BUILTINS_LOCATION,
7522 FIELD_DECL, get_identifier ("__vr_top"),
7524 f_groff = build_decl (BUILTINS_LOCATION,
7525 FIELD_DECL, get_identifier ("__gr_offs"),
7527 f_vroff = build_decl (BUILTINS_LOCATION,
7528 FIELD_DECL, get_identifier ("__vr_offs"),
7531 DECL_ARTIFICIAL (f_stack) = 1;
7532 DECL_ARTIFICIAL (f_grtop) = 1;
7533 DECL_ARTIFICIAL (f_vrtop) = 1;
7534 DECL_ARTIFICIAL (f_groff) = 1;
7535 DECL_ARTIFICIAL (f_vroff) = 1;
7537 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7538 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7539 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7540 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7541 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7543 TYPE_FIELDS (va_list_type) = f_stack;
7544 DECL_CHAIN (f_stack) = f_grtop;
7545 DECL_CHAIN (f_grtop) = f_vrtop;
7546 DECL_CHAIN (f_vrtop) = f_groff;
7547 DECL_CHAIN (f_groff) = f_vroff;
7549 /* Compute its layout. */
7550 layout_type (va_list_type);
7552 return va_list_type;
7555 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7557 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7559 const CUMULATIVE_ARGS *cum;
7560 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7561 tree stack, grtop, vrtop, groff, vroff;
7563 int gr_save_area_size;
7564 int vr_save_area_size;
7567 cum = &crtl->args.info;
7569 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7571 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7575 if (cum->aapcs_nvrn > 0)
7576 sorry ("%qs and floating point or vector arguments",
7577 "-mgeneral-regs-only");
7578 vr_save_area_size = 0;
7581 f_stack = TYPE_FIELDS (va_list_type_node);
7582 f_grtop = DECL_CHAIN (f_stack);
7583 f_vrtop = DECL_CHAIN (f_grtop);
7584 f_groff = DECL_CHAIN (f_vrtop);
7585 f_vroff = DECL_CHAIN (f_groff);
7587 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7589 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7591 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7593 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7595 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7598 /* Emit code to initialize STACK, which points to the next varargs stack
7599 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7600 by named arguments. STACK is 8-byte aligned. */
7601 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7602 if (cum->aapcs_stack_size > 0)
7603 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7604 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7605 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7607 /* Emit code to initialize GRTOP, the top of the GR save area.
7608 virtual_incoming_args_rtx should have been 16-byte aligned. */
7609 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7610 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7611 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7613 /* Emit code to initialize VRTOP, the top of the VR save area.
7614 This address is gr_save_area_bytes below GRTOP, rounded
7615 down to the next 16-byte boundary. */
7616 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7617 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7618 STACK_BOUNDARY / BITS_PER_UNIT);
7621 t = fold_build_pointer_plus_hwi (t, -vr_offset);
7622 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7623 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7625 /* Emit code to initialize GROFF, the offset from GRTOP of the
7626 next GPR argument. */
7627 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7628 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7629 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7631 /* Likewise emit code to initialize VROFF, the offset from VRTOP
7632 of the next VR argument. */
7633 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7634 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7635 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
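/* A sketch: for a function whose named arguments consume two general
   registers and one vector register, the code above leaves
   __gr_offs == -48 ((8 - 2) * UNITS_PER_WORD) and
   __vr_offs == -112 ((8 - 1) * UNITS_PER_VREG).  */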
7638 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7641 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7642 gimple_seq *post_p ATTRIBUTE_UNUSED)
7646 bool is_ha; /* is HFA or HVA. */
7647 bool dw_align; /* double-word align. */
7648 machine_mode ag_mode = VOIDmode;
7652 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7653 tree stack, f_top, f_off, off, arg, roundup, on_stack;
7654 HOST_WIDE_INT size, rsize, adjust, align;
7655 tree t, u, cond1, cond2;
7657 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7659 type = build_pointer_type (type);
7661 mode = TYPE_MODE (type);
7663 f_stack = TYPE_FIELDS (va_list_type_node);
7664 f_grtop = DECL_CHAIN (f_stack);
7665 f_vrtop = DECL_CHAIN (f_grtop);
7666 f_groff = DECL_CHAIN (f_vrtop);
7667 f_vroff = DECL_CHAIN (f_groff);
7669 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7670 f_stack, NULL_TREE);
7671 size = int_size_in_bytes (type);
7672 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7676 if (aarch64_vfp_is_call_or_return_candidate (mode,
7682 /* TYPE passed in fp/simd registers. */
7684 sorry ("%qs and floating point or vector arguments",
7685 "-mgeneral-regs-only");
7687 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
7688 unshare_expr (valist), f_vrtop, NULL_TREE);
7689 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
7690 unshare_expr (valist), f_vroff, NULL_TREE);
7692 rsize = nregs * UNITS_PER_VREG;
7696 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
7697 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
7699 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
7700 && size < UNITS_PER_VREG)
7702 adjust = UNITS_PER_VREG - size;
7707 /* TYPE passed in general registers. */
7708 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
7709 unshare_expr (valist), f_grtop, NULL_TREE);
7710 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
7711 unshare_expr (valist), f_groff, NULL_TREE);
7712 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
7713 nregs = rsize / UNITS_PER_WORD;
7718 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7719 && size < UNITS_PER_WORD)
7721 adjust = UNITS_PER_WORD - size;
7725 /* Get a local temporary for the field value. */
7726 off = get_initialized_tmp_var (f_off, pre_p, NULL);
7728 /* Emit code to branch if off >= 0. */
7729 t = build2 (GE_EXPR, boolean_type_node, off,
7730 build_int_cst (TREE_TYPE (off), 0));
7731 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
7735 /* Emit: offs = (offs + 15) & -16. */
7736 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7737 build_int_cst (TREE_TYPE (off), 15));
7738 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
7739 build_int_cst (TREE_TYPE (off), -16));
7740 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
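/* For example, an offset of -24 becomes (-24 + 15) & -16 == -16;
   the negative offset is rounded up to the next 16-byte boundary.  */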
7745 /* Update ap.__[g|v]r_offs */
7746 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7747 build_int_cst (TREE_TYPE (off), rsize));
7748 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
7752 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7754 /* [cond2] if (ap.__[g|v]r_offs > 0) */
7755 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
7756 build_int_cst (TREE_TYPE (f_off), 0));
7757 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
7759 /* Chain up: make sure the assignment happens before the use. */
7760 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
7761 COND_EXPR_ELSE (cond1) = t;
7763 /* Prepare the trees handling the argument that is passed on the stack;
7764 the top-level node will be stored in ON_STACK. */
7765 arg = get_initialized_tmp_var (stack, pre_p, NULL);
7768 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7769 t = fold_convert (intDI_type_node, arg);
7770 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7771 build_int_cst (TREE_TYPE (t), 15));
7772 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7773 build_int_cst (TREE_TYPE (t), -16));
7774 t = fold_convert (TREE_TYPE (arg), t);
7775 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7779 /* Advance ap.__stack */
7780 t = fold_convert (intDI_type_node, arg);
7781 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7782 build_int_cst (TREE_TYPE (t), size + 7));
7783 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7784 build_int_cst (TREE_TYPE (t), -8));
7785 t = fold_convert (TREE_TYPE (arg), t);
7786 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
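/* For example, with __stack already 8-byte aligned, a 5-byte argument
   advances it by (5 + 7) & -8 == 8 bytes, i.e. one full stack slot.  */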
7787 /* Chain up roundup and advance. */
7789 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7790 /* Chain up with arg. */
7791 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7792 /* Big-endianness related address adjustment. */
7793 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7794 && size < UNITS_PER_WORD)
7796 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7797 size_int (UNITS_PER_WORD - size));
7798 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7801 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7802 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7804 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7807 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7808 build_int_cst (TREE_TYPE (off), adjust));
7810 t = fold_convert (sizetype, t);
7811 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7815 /* type ha; // treat as "struct {ftype field[n];}"
7816 ... [computing offs]
7817 for (i = 0; i <nregs; ++i, offs += 16)
7818 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7821 tree tmp_ha, field_t, field_ptr_t;
7823 /* Declare a local variable. */
7824 tmp_ha = create_tmp_var_raw (type, "ha");
7825 gimple_add_tmp_var (tmp_ha);
7827 /* Establish the base type. */
7831 field_t = float_type_node;
7832 field_ptr_t = float_ptr_type_node;
7835 field_t = double_type_node;
7836 field_ptr_t = double_ptr_type_node;
7839 field_t = long_double_type_node;
7840 field_ptr_t = long_double_ptr_type_node;
7842 /* Half precision and quad precision are not fully supported yet. Enable
7843 the following code once support is complete; the correct type node for
7844 __fp16 * still needs to be found. */
7847 field_t = float_type_node;
7848 field_ptr_t = float_ptr_type_node;
7854 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7855 field_t = build_vector_type_for_mode (innertype, ag_mode);
7856 field_ptr_t = build_pointer_type (field_t);
7863 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
7864 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7866 t = fold_convert (field_ptr_t, addr);
7867 t = build2 (MODIFY_EXPR, field_t,
7868 build1 (INDIRECT_REF, field_t, tmp_ha),
7869 build1 (INDIRECT_REF, field_t, t));
7871 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7872 for (i = 1; i < nregs; ++i)
7874 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7875 u = fold_convert (field_ptr_t, addr);
7876 u = build2 (MODIFY_EXPR, field_t,
7877 build2 (MEM_REF, field_t, tmp_ha,
7878 build_int_cst (field_ptr_t,
7880 int_size_in_bytes (field_t)))),
7881 build1 (INDIRECT_REF, field_t, u));
7882 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7885 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7886 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7889 COND_EXPR_ELSE (cond2) = t;
7890 addr = fold_convert (build_pointer_type (type), cond1);
7891 addr = build_va_arg_indirect_ref (addr);
7894 addr = build_va_arg_indirect_ref (addr);
7899 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7902 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7903 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7906 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7907 CUMULATIVE_ARGS local_cum;
7908 int gr_saved, vr_saved;
7910 /* The caller has advanced CUM up to, but not beyond, the last named
7911 argument. Advance a local copy of CUM past the last "real" named
7912 argument, to find out how many registers are left over. */
7914 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
7916 /* Find out how many registers we need to save. */
7917 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7918 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7922 if (local_cum.aapcs_nvrn > 0)
7923 sorry ("%qs and floating point or vector arguments",
7924 "-mgeneral-regs-only");
7934 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7935 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7936 - gr_saved * UNITS_PER_WORD);
7937 mem = gen_frame_mem (BLKmode, ptr);
7938 set_mem_alias_set (mem, get_varargs_alias_set ());
7940 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7945 /* We can't use move_block_from_reg, because it will use
7946 the wrong mode, storing D regs only. */
7947 machine_mode mode = TImode;
7950 /* Set OFF to the offset from virtual_incoming_args_rtx of
7951 the first vector register. The VR save area lies below
7952 the GR one, and is aligned to 16 bytes. */
7953 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7954 STACK_BOUNDARY / BITS_PER_UNIT);
7955 off -= vr_saved * UNITS_PER_VREG;
7957 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7961 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7962 mem = gen_frame_mem (mode, ptr);
7963 set_mem_alias_set (mem, get_varargs_alias_set ());
7964 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7965 off += UNITS_PER_VREG;
7970 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7971 any complication of having crtl->args.pretend_args_size changed. */
7972 cfun->machine->frame.saved_varargs_size
7973 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7974 STACK_BOUNDARY / BITS_PER_UNIT)
7975 + vr_saved * UNITS_PER_VREG);
7979 aarch64_conditional_register_usage (void)
7984 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7987 call_used_regs[i] = 1;
7992 /* Walk down the type tree of TYPE counting consecutive base elements.
7993 If *MODEP is VOIDmode, then set it to the first valid floating point
7994 type. If a non-floating point type is found, or if a floating point
7995 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7996 otherwise return the count in the sub-tree. */
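/* For example, for struct { double x; double y; } this returns 2 with
   *MODEP set to DFmode, whereas struct { double x; int i; } returns -1
   because the int field is not a floating-point type.  */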
7998 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
8003 switch (TREE_CODE (type))
8006 mode = TYPE_MODE (type);
8007 if (mode != DFmode && mode != SFmode && mode != TFmode)
8010 if (*modep == VOIDmode)
8019 mode = TYPE_MODE (TREE_TYPE (type));
8020 if (mode != DFmode && mode != SFmode && mode != TFmode)
8023 if (*modep == VOIDmode)
8032 /* Use V2SImode and V4SImode as representatives of all 64-bit
8033 and 128-bit vector types. */
8034 size = int_size_in_bytes (type);
8047 if (*modep == VOIDmode)
8050 /* Vector modes are considered to be opaque: two vectors are
8051 equivalent for the purposes of being homogeneous aggregates
8052 if they are the same size. */
8061 tree index = TYPE_DOMAIN (type);
8063 /* Can't handle incomplete types nor sizes that are not fixed. */
8065 if (!COMPLETE_TYPE_P (type)
8066 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8069 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
8072 || !TYPE_MAX_VALUE (index)
8073 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
8074 || !TYPE_MIN_VALUE (index)
8075 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
8079 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
8080 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
8082 /* There must be no padding. */
8083 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8095 /* Can't handle incomplete types nor sizes that are not fixed. */
8097 if (!COMPLETE_TYPE_P (type)
8098 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8101 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
8103 if (TREE_CODE (field) != FIELD_DECL)
8106 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
8112 /* There must be no padding. */
8113 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8120 case QUAL_UNION_TYPE:
8122 /* These aren't very interesting except in a degenerate case. */
8127 /* Can't handle incomplete types nor sizes that are not fixed. */
8129 if (!COMPLETE_TYPE_P (type)
8130 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8133 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
8135 if (TREE_CODE (field) != FIELD_DECL)
8138 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
8141 count = count > sub_count ? count : sub_count;
8144 /* There must be no padding. */
8145 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8158 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
8159 type as described in AAPCS64 \S 4.1.2.
8161 See the comment above aarch64_composite_type_p for the notes on MODE. */
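/* For example, the 8-byte int32x2_t and the 16-byte float32x4_t AdvSIMD
   types are short vectors; a hypothetical 4-byte vector would not be.  */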
8164 aarch64_short_vector_p (const_tree type,
8167 HOST_WIDE_INT size = -1;
8169 if (type && TREE_CODE (type) == VECTOR_TYPE)
8170 size = int_size_in_bytes (type);
8171 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
8172 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8173 size = GET_MODE_SIZE (mode);
8175 return (size == 8 || size == 16);
8178 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
8179 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
8180 array types. The C99 floating-point complex types are also considered
8181 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
8182 types, which are GCC extensions and out of the scope of AAPCS64, are
8183 treated as composite types here as well.
8185 Note that MODE itself is not sufficient in determining whether a type
8186 is such a composite type or not. This is because
8187 stor-layout.c:compute_record_mode may have already changed the MODE
8188 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
8189 structure with only one field may have its MODE set to the mode of the
8190 field. Also an integer mode whose size matches the size of the
8191 RECORD_TYPE type may be used to substitute the original mode
8192 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
8193 solely relied on. */
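/* For example, struct { float f; } may end up with SFmode as its
   TYPE_MODE, yet it is still a composite type for AAPCS64 purposes.  */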
8196 aarch64_composite_type_p (const_tree type,
8199 if (aarch64_short_vector_p (type, mode))
8202 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
8206 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
8207 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
8213 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
8214 shall be passed or returned in simd/fp register(s) (providing these
8215 parameter passing registers are available).
8217 Upon successful return, *COUNT returns the number of needed registers,
8218 *BASE_MODE returns the mode of the individual register and when IS_HA
8219 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
8220 floating-point aggregate or a homogeneous short-vector aggregate. */
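/* For example, for struct { float a; float b; float c; } this returns
   true with *COUNT == 3, *BASE_MODE == SFmode and *IS_HA set.  */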
8223 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
8225 machine_mode *base_mode,
8229 machine_mode new_mode = VOIDmode;
8230 bool composite_p = aarch64_composite_type_p (type, mode);
8232 if (is_ha != NULL) *is_ha = false;
8234 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
8235 || aarch64_short_vector_p (type, mode))
8240 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
8242 if (is_ha != NULL) *is_ha = true;
8244 new_mode = GET_MODE_INNER (mode);
8246 else if (type && composite_p)
8248 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
8250 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
8252 if (is_ha != NULL) *is_ha = true;
8261 *base_mode = new_mode;
8265 /* Implement TARGET_STRUCT_VALUE_RTX. */
8268 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
8269 int incoming ATTRIBUTE_UNUSED)
8271 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
8274 /* Implements target hook vector_mode_supported_p. */
8276 aarch64_vector_mode_supported_p (machine_mode mode)
8279 && (mode == V4SImode || mode == V8HImode
8280 || mode == V16QImode || mode == V2DImode
8281 || mode == V2SImode || mode == V4HImode
8282 || mode == V8QImode || mode == V2SFmode
8283 || mode == V4SFmode || mode == V2DFmode
8284 || mode == V1DFmode))
8290 /* Return appropriate SIMD container
8291 for MODE within a vector of WIDTH bits. */
8293 aarch64_simd_container_mode (machine_mode mode, unsigned width)
8295 gcc_assert (width == 64 || width == 128);
8334 /* Return 128-bit container as the preferred SIMD mode for MODE. */
8336 aarch64_preferred_simd_mode (machine_mode mode)
8338 return aarch64_simd_container_mode (mode, 128);
8341 /* Return the bitmask of possible vector sizes for the vectorizer to iterate over. */
8344 aarch64_autovectorize_vector_sizes (void)
8349 /* Implement TARGET_MANGLE_TYPE. */
8352 aarch64_mangle_type (const_tree type)
8354 /* The AArch64 ABI documents say that "__va_list" has to be
8355 mangled as if it is in the "std" namespace. */
8356 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8357 return "St9__va_list";
8359 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for builtin types. */
8361 if (TYPE_NAME (type) != NULL)
8362 return aarch64_mangle_builtin_type (type);
8364 /* Use the default mangling. */
8369 /* Return true if the rtx_insn contains a MEM RTX somewhere
8373 has_memory_op (rtx_insn *mem_insn)
8375 subrtx_iterator::array_type array;
8376 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8383 /* Find the first rtx_insn before insn that will generate an assembly instruction. */
8387 aarch64_prev_real_insn (rtx_insn *insn)
8394 insn = prev_real_insn (insn);
8396 while (insn && recog_memoized (insn) < 0);
8402 is_madd_op (enum attr_type t1)
8405 /* A number of these may be AArch32 only. */
8406 enum attr_type mlatypes[] = {
8407 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8408 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8409 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8412 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8414 if (t1 == mlatypes[i])
8421 /* Check if there is a register dependency between a load and the insn
8422 for which we hold recog_data. */
8425 dep_between_memop_and_curr (rtx memop)
8430 gcc_assert (GET_CODE (memop) == SET);
8432 if (!REG_P (SET_DEST (memop)))
8435 load_reg = SET_DEST (memop);
8436 for (opno = 1; opno < recog_data.n_operands; opno++)
8438 rtx operand = recog_data.operand[opno];
8440 && reg_overlap_mentioned_p (load_reg, operand))
8448 /* When working around the Cortex-A53 erratum 835769,
8449 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8450 instruction and has a preceding memory instruction such that a NOP
8451 should be inserted between them. */
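/* An illustrative sequence (the register choices are arbitrary):

       ldr  x4, [x10]
       madd x5, x6, x7, x8

   needs a NOP inserted between the load and the 64-bit
   multiply-accumulate on affected Cortex-A53 parts.  */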
8454 aarch64_madd_needs_nop (rtx_insn* insn)
8456 enum attr_type attr_type;
8460 if (!aarch64_fix_a53_err835769)
8463 if (recog_memoized (insn) < 0)
8466 attr_type = get_attr_type (insn);
8467 if (!is_madd_op (attr_type))
8470 prev = aarch64_prev_real_insn (insn);
8471 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8472 Restore recog state to INSN to avoid state corruption. */
8473 extract_constrain_insn_cached (insn);
8475 if (!prev || !has_memory_op (prev))
8478 body = single_set (prev);
8480 /* If the previous insn is a memory op and there is no dependency between
8481 it and the DImode madd, emit a NOP between them. If body is NULL then we
8482 have a complex memory operation, probably a load/store pair.
8483 Be conservative for now and emit a NOP. */
8484 if (GET_MODE (recog_data.operand[0]) == DImode
8485 && (!body || !dep_between_memop_and_curr (body)))
8493 /* Implement FINAL_PRESCAN_INSN. */
8496 aarch64_final_prescan_insn (rtx_insn *insn)
8498 if (aarch64_madd_needs_nop (insn))
8499 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
8503 /* Return the equivalent letter for size. */
8505 sizetochar (int size)
8509 case 64: return 'd';
8510 case 32: return 's';
8511 case 16: return 'h';
8512 case 8 : return 'b';
8513 default: gcc_unreachable ();
8517 /* Return true iff x is a uniform vector of floating-point
8518 constants, and the constant can be represented in
8519 quarter-precision form. Note, as aarch64_float_const_representable_p
8520 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
8522 aarch64_vect_float_const_representable_p (rtx x)
8525 REAL_VALUE_TYPE r0, ri;
8528 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8531 x0 = CONST_VECTOR_ELT (x, 0);
8532 if (!CONST_DOUBLE_P (x0))
8535 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8537 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8539 xi = CONST_VECTOR_ELT (x, i);
8540 if (!CONST_DOUBLE_P (xi))
8543 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8544 if (!REAL_VALUES_EQUAL (r0, ri))
8548 return aarch64_float_const_representable_p (x0);
8551 /* Return true for valid and false for invalid. */
8553 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8554 struct simd_immediate_info *info)
8556 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8558 for (i = 0; i < idx; i += (STRIDE)) \
8563 immtype = (CLASS); \
8564 elsize = (ELSIZE); \
8570 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8571 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8572 unsigned char bytes[16];
8573 int immtype = -1, matches;
8574 unsigned int invmask = inverse ? 0xff : 0;
8577 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8579 if (! (aarch64_simd_imm_zero_p (op, mode)
8580 || aarch64_vect_float_const_representable_p (op)))
8585 info->value = CONST_VECTOR_ELT (op, 0);
8586 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8594 /* Splat vector constant out into a byte vector. */
8595 for (i = 0; i < n_elts; i++)
8597 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
8598 it must be laid out in the vector register in reverse order. */
8599 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8600 unsigned HOST_WIDE_INT elpart;
8601 unsigned int part, parts;
8603 if (CONST_INT_P (el))
8605 elpart = INTVAL (el);
8608 else if (GET_CODE (el) == CONST_DOUBLE)
8610 elpart = CONST_DOUBLE_LOW (el);
8616 for (part = 0; part < parts; part++)
8619 for (byte = 0; byte < innersize; byte++)
8621 bytes[idx++] = (elpart & 0xff) ^ invmask;
8622 elpart >>= BITS_PER_UNIT;
8624 if (GET_CODE (el) == CONST_DOUBLE)
8625 elpart = CONST_DOUBLE_HIGH (el);
8630 gcc_assert (idx == GET_MODE_SIZE (mode));
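/* For example, the V4HI constant { 0x00ab, 0x00ab, 0x00ab, 0x00ab }
   splats to the byte vector ab 00 ab 00 ab 00 ab 00 and matches the
   CHECK (2, 16, 4, ...) pattern below: a MOVI with element width 16
   and no shift.  */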
8634 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8635 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8637 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8638 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8640 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8641 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8643 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8644 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8646 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8648 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8650 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8651 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8653 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8654 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8656 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8657 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8659 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8660 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8662 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8664 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8666 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8667 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8669 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8670 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8672 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8673 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8675 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8676 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8678 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8680 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8681 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8690 info->element_width = elsize;
8691 info->mvn = emvn != 0;
8692 info->shift = eshift;
8694 unsigned HOST_WIDE_INT imm = 0;
8696 if (immtype >= 12 && immtype <= 15)
8699 /* Un-invert bytes of recognized vector, if necessary. */
8701 for (i = 0; i < idx; i++)
8702 bytes[i] ^= invmask;
8706 /* FIXME: Broken on 32-bit H_W_I hosts. */
8707 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8709 for (i = 0; i < 8; i++)
8710 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8711 << (i * BITS_PER_UNIT);
8714 info->value = GEN_INT (imm);
8718 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8719 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8721 /* Construct 'abcdefgh' because the assembler cannot handle
8722 generic constants. */
8725 imm = (imm >> info->shift) & 0xff;
8726 info->value = GEN_INT (imm);
8734 /* Check if immediate shift constants are within range. */
8736 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8738 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8740 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8742 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
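/* For example, for V4SImode a left-shift count must lie in [0, 31]
   while a right-shift count must lie in [1, 32].  */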
8745 /* Return true if X is a uniform vector where all elements
8746 are either the floating-point constant 0.0 or the
8747 integer constant 0. */
8749 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8751 return x == CONST0_RTX (mode);
8755 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8757 HOST_WIDE_INT imm = INTVAL (x);
8760 for (i = 0; i < 8; i++)
8762 unsigned int byte = imm & 0xff;
8763 if (byte != 0xff && byte != 0)
8772 aarch64_mov_operand_p (rtx x,
8773 enum aarch64_symbol_context context,
8776 if (GET_CODE (x) == HIGH
8777 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8780 if (CONST_INT_P (x))
8783 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8786 return aarch64_classify_symbolic_expression (x, context)
8787 == SYMBOL_TINY_ABSOLUTE;
8790 /* Return a const_int vector of VAL. */
8792 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8794 int nunits = GET_MODE_NUNITS (mode);
8795 rtvec v = rtvec_alloc (nunits);
8798 for (i = 0; i < nunits; i++)
8799 RTVEC_ELT (v, i) = GEN_INT (val);
8801 return gen_rtx_CONST_VECTOR (mode, v);
8804 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8807 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8811 gcc_assert (!VECTOR_MODE_P (mode));
8812 vmode = aarch64_preferred_simd_mode (mode);
8813 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8814 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8817 /* Construct and return a PARALLEL RTX vector with elements numbering the
8818 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8819 the vector - from the perspective of the architecture. This does not
8820 line up with GCC's perspective on lane numbers, so we end up with
8821 different masks depending on our target endian-ness. The diagram
8822 below may help. We must draw the distinction when building masks
8823 which select one half of the vector. An instruction selecting
8824 architectural low-lanes for a big-endian target, must be described using
8825 a mask selecting GCC high-lanes.
8827 Big-Endian Little-Endian
8830 | x | x | x | x | | x | x | x | x |
8831 Architecture 3 2 1 0 3 2 1 0
8833 Low Mask: { 2, 3 } { 0, 1 }
8834 High Mask: { 0, 1 } { 2, 3 }
8838 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8840 int nunits = GET_MODE_NUNITS (mode);
8841 rtvec v = rtvec_alloc (nunits / 2);
8842 int high_base = nunits / 2;
8848 if (BYTES_BIG_ENDIAN)
8849 base = high ? low_base : high_base;
8851 base = high ? high_base : low_base;
8853 for (i = 0; i < nunits / 2; i++)
8854 RTVEC_ELT (v, i) = GEN_INT (base + i);
8856 t1 = gen_rtx_PARALLEL (mode, v);
8860 /* Check OP for validity as a PARALLEL RTX vector with elements
8861 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8862 from the perspective of the architecture. See the diagram above
8863 aarch64_simd_vect_par_cnst_half for more details. */
8866 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8869 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8870 HOST_WIDE_INT count_op = XVECLEN (op, 0);
8871 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8874 if (!VECTOR_MODE_P (mode))
8877 if (count_op != count_ideal)
8880 for (i = 0; i < count_ideal; i++)
8882 rtx elt_op = XVECEXP (op, 0, i);
8883 rtx elt_ideal = XVECEXP (ideal, 0, i);
8885 if (!CONST_INT_P (elt_op)
8886 || INTVAL (elt_ideal) != INTVAL (elt_op))
8892 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8893 HIGH (exclusive). */
8895 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
8899 gcc_assert (CONST_INT_P (operand));
8900 lane = INTVAL (operand);
8902 if (lane < low || lane >= high)
8905 error ("%Klane %ld out of range %ld - %ld", exp, lane, low, high - 1);
8907 error ("lane %ld out of range %ld - %ld", lane, low, high - 1);
8911 /* Return TRUE if OP is a valid vector addressing mode. */
8913 aarch64_simd_mem_operand_p (rtx op)
8915 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8916 || REG_P (XEXP (op, 0)));
8919 /* Emit a register copy from operand to operand, taking care not to
8920 early-clobber source registers in the process.
8922 COUNT is the number of components into which the copy needs to be
8925 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
8929 int rdest = REGNO (operands[0]);
8930 int rsrc = REGNO (operands[1]);
8932 if (!reg_overlap_mentioned_p (operands[0], operands[1])
8934 for (i = 0; i < count; i++)
8935 emit_move_insn (gen_rtx_REG (mode, rdest + i),
8936 gen_rtx_REG (mode, rsrc + i));
8938 for (i = 0; i < count; i++)
8939 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
8940 gen_rtx_REG (mode, rsrc + count - i - 1));
8943 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8944 one of VSTRUCT modes: OI, CI or XI. */
8946 aarch64_simd_attr_length_move (rtx_insn *insn)
8950 extract_insn_cached (insn);
8952 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8954 mode = GET_MODE (recog_data.operand[0]);
8970 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
8971 one of VSTRUCT modes: OI, CI, EI, or XI. */
8973 aarch64_simd_attr_length_rglist (enum machine_mode mode)
8975 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
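/* For example, an OImode list occupies two V registers, giving a
   length of 2 * 4 == 8 bytes, i.e. two move instructions.  */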
8978 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8979 alignment of a vector to 128 bits. */
8980 static HOST_WIDE_INT
8981 aarch64_simd_vector_alignment (const_tree type)
8983 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8984 return MIN (align, 128);
8987 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8989 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8994 /* We guarantee alignment for vectors up to 128 bits. */
8995 if (tree_int_cst_compare (TYPE_SIZE (type),
8996 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8999 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
9003 /* If VALS is a vector constant that can be loaded into a register
9004 using DUP, generate instructions to do so and return an RTX to
9005 assign to the register. Otherwise return NULL_RTX. */
9007 aarch64_simd_dup_constant (rtx vals)
9009 machine_mode mode = GET_MODE (vals);
9010 machine_mode inner_mode = GET_MODE_INNER (mode);
9011 int n_elts = GET_MODE_NUNITS (mode);
9012 bool all_same = true;
9016 if (GET_CODE (vals) != CONST_VECTOR)
9019 for (i = 1; i < n_elts; ++i)
9021 x = CONST_VECTOR_ELT (vals, i);
9022 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
9029 /* We can load this constant by using DUP and a constant in a
9030 single ARM register. This will be cheaper than a vector
9032 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
9033 return gen_rtx_VEC_DUPLICATE (mode, x);
9037 /* Generate code to load VALS, which is a PARALLEL containing only
9038 constants (for vec_init) or CONST_VECTOR, efficiently into a
9039 register. Returns an RTX to copy into the register, or NULL_RTX
9040 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
9042 aarch64_simd_make_constant (rtx vals)
9044 machine_mode mode = GET_MODE (vals);
9046 rtx const_vec = NULL_RTX;
9047 int n_elts = GET_MODE_NUNITS (mode);
9051 if (GET_CODE (vals) == CONST_VECTOR)
9053 else if (GET_CODE (vals) == PARALLEL)
9055 /* A CONST_VECTOR must contain only CONST_INTs and
9056 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
9057 Only store valid constants in a CONST_VECTOR. */
9058 for (i = 0; i < n_elts; ++i)
9060 rtx x = XVECEXP (vals, 0, i);
9061 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9064 if (n_const == n_elts)
9065 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
9070 if (const_vec != NULL_RTX
9071 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
9072 /* Load using MOVI/MVNI. */
9074 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
9075 /* Loaded using DUP. */
9077 else if (const_vec != NULL_RTX)
9078 /* Load from constant pool. We cannot take advantage of single-cycle
9079 LD1 because we need a PC-relative addressing mode. */
9082 /* A PARALLEL containing something not valid inside CONST_VECTOR.
9083 We cannot construct an initializer. */
9088 aarch64_expand_vector_init (rtx target, rtx vals)
9090 machine_mode mode = GET_MODE (target);
9091 machine_mode inner_mode = GET_MODE_INNER (mode);
9092 int n_elts = GET_MODE_NUNITS (mode);
9094 rtx any_const = NULL_RTX;
9095 bool all_same = true;
9097 for (int i = 0; i < n_elts; ++i)
9099 rtx x = XVECEXP (vals, 0, i);
9100 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
9105 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
9111 rtx constant = aarch64_simd_make_constant (vals);
9112 if (constant != NULL_RTX)
9114 emit_move_insn (target, constant);
9119 /* Splat a single non-constant element if we can. */
9122 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
9123 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
9127 /* Half the fields (or fewer) are non-constant. Load constant then overwrite
9128 varying fields. Hope that this is more efficient than using the stack. */
9129 if (n_var <= n_elts/2)
9131 rtx copy = copy_rtx (vals);
9133 /* Load constant part of vector. We really don't care what goes into the
9134 parts we will overwrite, but we're more likely to be able to load the
9135 constant efficiently if it has fewer, larger, repeating parts
9136 (see aarch64_simd_valid_immediate). */
9137 for (int i = 0; i < n_elts; i++)
9139 rtx x = XVECEXP (vals, 0, i);
9140 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9142 rtx subst = any_const;
9143 for (int bit = n_elts / 2; bit > 0; bit /= 2)
9145 /* Look in the copied vector, as more elements are const. */
9146 rtx test = XVECEXP (copy, 0, i ^ bit);
9147 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
9153 XVECEXP (copy, 0, i) = subst;
9155 aarch64_expand_vector_init (target, copy);
9157 /* Insert variables. */
9158 enum insn_code icode = optab_handler (vec_set_optab, mode);
9159 gcc_assert (icode != CODE_FOR_nothing);
9161 for (int i = 0; i < n_elts; i++)
9163 rtx x = XVECEXP (vals, 0, i);
9164 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9166 x = copy_to_mode_reg (inner_mode, x);
9167 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
9172 /* Construct the vector in memory one field at a time
9173 and load the whole vector. */
9174 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
9175 for (int i = 0; i < n_elts; i++)
9176 emit_move_insn (adjust_address_nv (mem, inner_mode,
9177 i * GET_MODE_SIZE (inner_mode)),
9178 XVECEXP (vals, 0, i));
9179 emit_move_insn (target, mem);
9183 static unsigned HOST_WIDE_INT
9184 aarch64_shift_truncation_mask (machine_mode mode)
9187 (aarch64_vector_mode_supported_p (mode)
9188 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
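/* For example, SImode yields a mask of 31 and DImode a mask of 63,
   while vector modes such as V4SImode yield 0 (no truncation).  */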
9191 #ifndef TLS_SECTION_ASM_FLAG
9192 #define TLS_SECTION_ASM_FLAG 'T'
9196 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
9197 tree decl ATTRIBUTE_UNUSED)
9199 char flagchars[10], *f = flagchars;
9201 /* If we have already declared this section, we can use an
9202 abbreviated form to switch back to it -- unless this section is
9203 part of a COMDAT group, in which case GAS requires the full
9204 declaration every time. */
9205 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9206 && (flags & SECTION_DECLARED))
9208 fprintf (asm_out_file, "\t.section\t%s\n", name);
9212 if (!(flags & SECTION_DEBUG))
9214 if (flags & SECTION_WRITE)
9216 if (flags & SECTION_CODE)
9218 if (flags & SECTION_SMALL)
9220 if (flags & SECTION_MERGE)
9222 if (flags & SECTION_STRINGS)
9224 if (flags & SECTION_TLS)
9225 *f++ = TLS_SECTION_ASM_FLAG;
9226 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9230 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
9232 if (!(flags & SECTION_NOTYPE))
9237 if (flags & SECTION_BSS)
9242 #ifdef TYPE_OPERAND_FMT
9243 format = "," TYPE_OPERAND_FMT;
9248 fprintf (asm_out_file, format, type);
9250 if (flags & SECTION_ENTSIZE)
9251 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
9252 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9254 if (TREE_CODE (decl) == IDENTIFIER_NODE)
9255 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
9257 fprintf (asm_out_file, ",%s,comdat",
9258 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
9262 putc ('\n', asm_out_file);
9265 /* Select a format to encode pointers in exception handling data. */
9267 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
9270 switch (aarch64_cmodel)
9272 case AARCH64_CMODEL_TINY:
9273 case AARCH64_CMODEL_TINY_PIC:
9274 case AARCH64_CMODEL_SMALL:
9275 case AARCH64_CMODEL_SMALL_PIC:
9276 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient for everything. */
9278 type = DW_EH_PE_sdata4;
9281 /* No assumptions here. 8-byte relocs required. */
9282 type = DW_EH_PE_sdata8;
9285 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
9288 /* Emit load exclusive. */
9291 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
9292 rtx mem, rtx model_rtx)
9294 rtx (*gen) (rtx, rtx, rtx);
9298 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
9299 case HImode: gen = gen_aarch64_load_exclusivehi; break;
9300 case SImode: gen = gen_aarch64_load_exclusivesi; break;
9301 case DImode: gen = gen_aarch64_load_exclusivedi; break;
9306 emit_insn (gen (rval, mem, model_rtx));
9309 /* Emit store exclusive. */
9312 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
9313 rtx rval, rtx mem, rtx model_rtx)
9315 rtx (*gen) (rtx, rtx, rtx, rtx);
9319 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
9320 case HImode: gen = gen_aarch64_store_exclusivehi; break;
9321 case SImode: gen = gen_aarch64_store_exclusivesi; break;
9322 case DImode: gen = gen_aarch64_store_exclusivedi; break;
9327 emit_insn (gen (bval, rval, mem, model_rtx));
9330 /* Emit a jump instruction and mark it as unlikely to be taken. */
9333 aarch64_emit_unlikely_jump (rtx insn)
9335 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
9337 insn = emit_jump_insn (insn);
9338 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
9341 /* Expand a compare and swap pattern. */
9344 aarch64_expand_compare_and_swap (rtx operands[])
9346 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9347 machine_mode mode, cmp_mode;
9348 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9353 oldval = operands[3];
9354 newval = operands[4];
9355 is_weak = operands[5];
9356 mod_s = operands[6];
9357 mod_f = operands[7];
9358 mode = GET_MODE (mem);
9361 /* Normally the succ memory model must be stronger than fail, but in the
9362 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9363 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
9365 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
9366 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
9367 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9373 /* For short modes, we're going to perform the comparison in SImode,
9374 so do the zero-extension now. */
9376 rval = gen_reg_rtx (SImode);
9377 oldval = convert_modes (SImode, mode, oldval, true);
9382 /* Force the value into a register if needed. */
9383 if (!aarch64_plus_operand (oldval, mode))
9384 oldval = force_reg (cmp_mode, oldval);
9393 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9394 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9395 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9396 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9401 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9403 if (mode == QImode || mode == HImode)
9404 emit_move_insn (operands[1], gen_lowpart (mode, rval));
9406 x = gen_rtx_REG (CCmode, CC_REGNUM);
9407 x = gen_rtx_EQ (SImode, x, const0_rtx);
9408 emit_insn (gen_rtx_SET (bval, x));
9411 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
9412 sequence implementing an atomic operation. */
9415 aarch64_emit_post_barrier (enum memmodel model)
9417 const enum memmodel base_model = memmodel_base (model);
9419 if (is_mm_sync (model)
9420 && (base_model == MEMMODEL_ACQUIRE
9421 || base_model == MEMMODEL_ACQ_REL
9422 || base_model == MEMMODEL_SEQ_CST))
9424 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
9428 /* Split a compare and swap pattern. */
9431 aarch64_split_compare_and_swap (rtx operands[])
9433 rtx rval, mem, oldval, newval, scratch;
9436 rtx_code_label *label1, *label2;
9438 enum memmodel model;
9443 oldval = operands[2];
9444 newval = operands[3];
9445 is_weak = (operands[4] != const0_rtx);
9446 model_rtx = operands[5];
9447 scratch = operands[7];
9448 mode = GET_MODE (mem);
9449 model = memmodel_from_int (INTVAL (model_rtx));
9454 label1 = gen_label_rtx ();
9455 emit_label (label1);
9457 label2 = gen_label_rtx ();
9459 /* The initial load can be relaxed for a __sync operation since a final
9460 barrier will be emitted to stop code hoisting. */
9461 if (is_mm_sync (model))
9462 aarch64_emit_load_exclusive (mode, rval, mem,
9463 GEN_INT (MEMMODEL_RELAXED));
9465 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
9467 cond = aarch64_gen_compare_reg (NE, rval, oldval);
9468 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9469 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9470 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9471 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9473 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
9477 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9478 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9479 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9480 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9484 cond = gen_rtx_REG (CCmode, CC_REGNUM);
9485 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9486 emit_insn (gen_rtx_SET (cond, x));
9489 emit_label (label2);
9491 /* Emit any final barrier needed for a __sync operation. */
9492 if (is_mm_sync (model))
9493 aarch64_emit_post_barrier (model);
9496 /* Split an atomic operation. */
9499 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9500 rtx value, rtx model_rtx, rtx cond)
9502 machine_mode mode = GET_MODE (mem);
9503 machine_mode wmode = (mode == DImode ? DImode : SImode);
9504 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
9505 const bool is_sync = is_mm_sync (model);
9506 rtx_code_label *label;
9509 label = gen_label_rtx ();
9513 new_out = gen_lowpart (wmode, new_out);
9515 old_out = gen_lowpart (wmode, old_out);
9518 value = simplify_gen_subreg (wmode, value, mode, 0);
9520 /* The initial load can be relaxed for a __sync operation since a final
9521 barrier will be emitted to stop code hoisting. */
9523 aarch64_emit_load_exclusive (mode, old_out, mem,
9524 GEN_INT (MEMMODEL_RELAXED));
9526 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9535 x = gen_rtx_AND (wmode, old_out, value);
9536 emit_insn (gen_rtx_SET (new_out, x));
9537 x = gen_rtx_NOT (wmode, new_out);
9538 emit_insn (gen_rtx_SET (new_out, x));
9542 if (CONST_INT_P (value))
9544 value = GEN_INT (-INTVAL (value));
9550 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9551 emit_insn (gen_rtx_SET (new_out, x));
9555 aarch64_emit_store_exclusive (mode, cond, mem,
9556 gen_lowpart (mode, new_out), model_rtx);
9558 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9559 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9560 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9561 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9563 /* Emit any final barrier needed for a __sync operation. */
9565 aarch64_emit_post_barrier (model);
9569 aarch64_print_extension (void)
9571 const struct aarch64_option_extension *opt = NULL;
9573 for (opt = all_extensions; opt->name != NULL; opt++)
9574 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9575 asm_fprintf (asm_out_file, "+%s", opt->name);
9577 asm_fprintf (asm_out_file, "\n");
9581 aarch64_start_file (void)
9585 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9586 aarch64_print_extension ();
9588 else if (selected_cpu)
9590 const char *truncated_name
9591 = aarch64_rewrite_selected_cpu (selected_cpu->name);
9592 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9593 aarch64_print_extension ();
9595 default_file_start ();
9598 /* Target hook for c_mode_for_suffix. */
9600 aarch64_c_mode_for_suffix (char suffix)
9608 /* We can only represent floating point constants which will fit in
9609 "quarter-precision" values. These values are characterised by
9610 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
9613 (-1)^s * (n/16) * 2^r
9616 's' is the sign bit.
9617 'n' is an integer in the range 16 <= n <= 31.
9618 'r' is an integer in the range -3 <= r <= 4. */
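/* For example, 1.0 is (-1)^0 * (16/16) * 2^0; the smallest positive
   representable value is (16/16) * 2^-3 == 0.125 and the largest is
   (31/16) * 2^4 == 31.0.  */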
9620 /* Return true iff X can be represented by a quarter-precision
9621 floating point immediate operand. Note, we cannot represent 0.0. */
9623 aarch64_float_const_representable_p (rtx x)
9625 /* This represents our current view of how many bits
9626 make up the mantissa. */
9627 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9629 unsigned HOST_WIDE_INT mantissa, mask;
9630 REAL_VALUE_TYPE r, m;
9633 if (!CONST_DOUBLE_P (x))
9636 if (GET_MODE (x) == VOIDmode)
9639 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9641 /* We cannot represent infinities, NaNs or +/-zero. We won't
9642 know if we have +zero until we analyse the mantissa, but we
9643 can reject the other invalid values. */
9644 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9645 || REAL_VALUE_MINUS_ZERO (r))
9648 /* Extract exponent. */
9649 r = real_value_abs (&r);
9650 exponent = REAL_EXP (&r);
9652 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9653 highest (sign) bit, with a fixed binary point at bit point_pos.
9654 m1 holds the low part of the mantissa, m2 the high part.
9655 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9656 bits for the mantissa, this can fail (low bits will be lost). */
9657 real_ldexp (&m, &r, point_pos - exponent);
9658 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9660 /* If the low part of the mantissa has bits set we cannot represent the value. */
9664 /* We have rejected the lower HOST_WIDE_INT, so update our
9665 understanding of how many bits lie in the mantissa and
9666 look only at the high HOST_WIDE_INT. */
9667 mantissa = w.elt (1);
9668 point_pos -= HOST_BITS_PER_WIDE_INT;
9670 /* We can only represent values with a mantissa of the form 1.xxxx. */
9671 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9672 if ((mantissa & mask) != 0)
9675 /* Having filtered unrepresentable values, we may now remove all
9676 but the highest 5 bits. */
9677 mantissa >>= point_pos - 5;
9679 /* We cannot represent the value 0.0, so reject it. This is handled elsewhere. */
9684 /* Then, as bit 4 is always set, we can mask it off, leaving
9685 the mantissa in the range [0, 15]. */
9686 mantissa &= ~(1 << 4);
9687 gcc_assert (mantissa <= 15);
9689 /* GCC internally does not use IEEE754-like encoding (where normalized
9690 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
9691 Our mantissa values are shifted 4 places to the left relative to
9692 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9693 by 5 places to correct for GCC's representation. */
9694 exponent = 5 - exponent;
9696 return (exponent >= 0 && exponent <= 7);
9700 aarch64_output_simd_mov_immediate (rtx const_vector,
9705 static char templ[40];
9706 const char *mnemonic;
9707 const char *shift_op;
9708 unsigned int lane_count = 0;
9711 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
9713 /* This will return true to show const_vector is legal for use as an
9714 AdvSIMD MOVI (or, implicitly, MVNI) immediate. It will
9715 also update INFO to show how the immediate should be generated. */
9716 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
9717 gcc_assert (is_valid);
9719 element_char = sizetochar (info.element_width);
9720 lane_count = width / info.element_width;
9722 mode = GET_MODE_INNER (mode);
9723 if (mode == SFmode || mode == DFmode)
9725 gcc_assert (info.shift == 0 && ! info.mvn);
9726 if (aarch64_float_const_zero_rtx_p (info.value))
9727 info.value = GEN_INT (0);
9732 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
9733 char float_buf[buf_size] = {'\0'};
9734 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
9737 if (lane_count == 1)
9738 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
9740 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
9741 lane_count, element_char, float_buf);
9746 mnemonic = info.mvn ? "mvni" : "movi";
9747 shift_op = info.msl ? "msl" : "lsl";
9749 if (lane_count == 1)
9750 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9751 mnemonic, UINTVAL (info.value));
9752 else if (info.shift)
9753 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9754 ", %s %d", mnemonic, lane_count, element_char,
9755 UINTVAL (info.value), shift_op, info.shift);
9757 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9758 mnemonic, lane_count, element_char, UINTVAL (info.value));
9763 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9768 gcc_assert (!VECTOR_MODE_P (mode));
9769 vmode = aarch64_simd_container_mode (mode, 64);
9770 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9771 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9774 /* Split operands into moves from op[1] + op[2] into op[0]. */
9777 aarch64_split_combinev16qi (rtx operands[3])
9779 unsigned int dest = REGNO (operands[0]);
9780 unsigned int src1 = REGNO (operands[1]);
9781 unsigned int src2 = REGNO (operands[2]);
9782 machine_mode halfmode = GET_MODE (operands[1]);
9783 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9786 gcc_assert (halfmode == V16QImode);
9788 if (src1 == dest && src2 == dest + halfregs)
9790 /* No-op move. Can't split to nothing; emit something. */
9791 emit_note (NOTE_INSN_DELETED);
9795 /* Preserve register attributes for variable tracking. */
9796 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9797 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9798 GET_MODE_SIZE (halfmode));
9800 /* Special case of reversed high/low parts. */
9801 if (reg_overlap_mentioned_p (operands[2], destlo)
9802 && reg_overlap_mentioned_p (operands[1], desthi))
9804 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9805 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9806 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9808 else if (!reg_overlap_mentioned_p (operands[2], destlo))
9810 /* Try to avoid unnecessary moves if part of the result
9811 is in the right place already. */
9813 emit_move_insn (destlo, operands[1]);
9814 if (src2 != dest + halfregs)
9815 emit_move_insn (desthi, operands[2]);
9819 if (src2 != dest + halfregs)
9820 emit_move_insn (desthi, operands[2]);
9822 emit_move_insn (destlo, operands[1]);
9826 /* vec_perm support. */
9828 #define MAX_VECT_LEN 16
9830 struct expand_vec_perm_d
9832 rtx target, op0, op1;
9833 unsigned char perm[MAX_VECT_LEN];
9840 /* Generate a variable permutation. */
9843 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9845 machine_mode vmode = GET_MODE (target);
9846 bool one_vector_p = rtx_equal_p (op0, op1);
9848 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9849 gcc_checking_assert (GET_MODE (op0) == vmode);
9850 gcc_checking_assert (GET_MODE (op1) == vmode);
9851 gcc_checking_assert (GET_MODE (sel) == vmode);
9852 gcc_checking_assert (TARGET_SIMD);
9856 if (vmode == V8QImode)
9858 /* Expand the argument to a V16QI mode by duplicating it. */
9859 rtx pair = gen_reg_rtx (V16QImode);
9860 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9861 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9865 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9872 if (vmode == V8QImode)
9874 pair = gen_reg_rtx (V16QImode);
9875 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9876 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9880 pair = gen_reg_rtx (OImode);
9881 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9882 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9888 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9890 machine_mode vmode = GET_MODE (target);
9891 unsigned int nelt = GET_MODE_NUNITS (vmode);
9892 bool one_vector_p = rtx_equal_p (op0, op1);
9895 /* The TBL instruction does not use a modulo index, so we must take care
9896 of that ourselves. */
9897 mask = aarch64_simd_gen_const_vector_dup (vmode,
9898 one_vector_p ? nelt - 1 : 2 * nelt - 1);
9899 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
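/* For example, in a two-operand V16QI permute an index of 35 is
   reduced to 35 & 31 == 3, giving the modulo behaviour that the
   mid-end expects.  */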
9901 /* For big-endian, we also need to reverse the index within the vector
9902 (but not which vector). */
9903 if (BYTES_BIG_ENDIAN)
9905 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9907 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9908 sel = expand_simple_binop (vmode, XOR, sel, mask,
9909 NULL, 0, OPTAB_LIB_WIDEN);
9911 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
9914 /* Recognize patterns suitable for the TRN instructions. */
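/* For example, on V4SI (little-endian lane numbering) TRN1 selects
   { 0, 4, 2, 6 } and TRN2 selects { 1, 5, 3, 7 }.  */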
9916 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9918 unsigned int i, odd, mask, nelt = d->nelt;
9919 rtx out, in0, in1, x;
9920 rtx (*gen) (rtx, rtx, rtx);
9921 machine_mode vmode = d->vmode;
9923 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9926 /* Note that these are little-endian tests.
9927 We correct for big-endian later. */
9928 if (d->perm[0] == 0)
9930 else if (d->perm[0] == 1)
9934 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9936 for (i = 0; i < nelt; i += 2)
9938 if (d->perm[i] != i + odd)
9940 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9950 if (BYTES_BIG_ENDIAN)
9952 x = in0, in0 = in1, in1 = x;
9961 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9962 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9963 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9964 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9965 case V4SImode: gen = gen_aarch64_trn2v4si; break;
9966 case V2SImode: gen = gen_aarch64_trn2v2si; break;
9967 case V2DImode: gen = gen_aarch64_trn2v2di; break;
9968 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9969 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9970 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9979 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9980 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9981 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9982 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9983 case V4SImode: gen = gen_aarch64_trn1v4si; break;
9984 case V2SImode: gen = gen_aarch64_trn1v2si; break;
9985 case V2DImode: gen = gen_aarch64_trn1v2di; break;
9986 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9987 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9988 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9994 emit_insn (gen (out, in0, in1));
9998 /* Recognize patterns suitable for the UZP instructions. */
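/* For example, on V4SI UZP1 selects { 0, 2, 4, 6 } and UZP2 selects
   { 1, 3, 5, 7 }.  */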
10000 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
10002 unsigned int i, odd, mask, nelt = d->nelt;
10003 rtx out, in0, in1, x;
10004 rtx (*gen) (rtx, rtx, rtx);
10005 machine_mode vmode = d->vmode;
10007 if (GET_MODE_UNIT_SIZE (vmode) > 8)
10010 /* Note that these are little-endian tests.
10011 We correct for big-endian later. */
10012 if (d->perm[0] == 0)
10014 else if (d->perm[0] == 1)
10018 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10020 for (i = 0; i < nelt; i++)
10022 unsigned elt = (i * 2 + odd) & mask;
10023 if (d->perm[i] != elt)
10033 if (BYTES_BIG_ENDIAN)
10035 x = in0, in0 = in1, in1 = x;
10044 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
10045 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
10046 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
10047 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
10048 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
10049 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
10050 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
10051 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
10052 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
10053 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
10062 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
10063 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
10064 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
10065 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
10066 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
10067 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
10068 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
10069 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
10070 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
10071 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
10077 emit_insn (gen (out, in0, in1));
10081 /* Recognize patterns suitable for the ZIP instructions. */
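/* For example, on V4SI ZIP1 selects { 0, 4, 1, 5 } and ZIP2 selects
   { 2, 6, 3, 7 }.  */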
10083 aarch64_evpc_zip (struct expand_vec_perm_d *d)
10085 unsigned int i, high, mask, nelt = d->nelt;
10086 rtx out, in0, in1, x;
10087 rtx (*gen) (rtx, rtx, rtx);
10088 machine_mode vmode = d->vmode;
10090 if (GET_MODE_UNIT_SIZE (vmode) > 8)
10093 /* Note that these are little-endian tests.
10094 We correct for big-endian later. */
10096 if (d->perm[0] == high)
10099 else if (d->perm[0] == 0)
10103 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
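/* Illustratively, on V4SImode the loop below accepts ZIP1 as
   { 0, 4, 1, 5 } (high == 0) and ZIP2 as { 2, 6, 3, 7 }
   (high == nelt / 2), i.e. the interleave of the low (resp. high)
   halves of the two inputs.  */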
10105 for (i = 0; i < nelt / 2; i++)
10107 unsigned elt = (i + high) & mask;
10108 if (d->perm[i * 2] != elt)
10110 elt = (elt + nelt) & mask;
10111 if (d->perm[i * 2 + 1] != elt)
10121 if (BYTES_BIG_ENDIAN)
10123 x = in0, in0 = in1, in1 = x;
10132 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
10133 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
10134 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
10135 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
10136 case V4SImode: gen = gen_aarch64_zip2v4si; break;
10137 case V2SImode: gen = gen_aarch64_zip2v2si; break;
10138 case V2DImode: gen = gen_aarch64_zip2v2di; break;
10139 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
10140 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
10141 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
10150 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
10151 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
10152 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
10153 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
10154 case V4SImode: gen = gen_aarch64_zip1v4si; break;
10155 case V2SImode: gen = gen_aarch64_zip1v2si; break;
10156 case V2DImode: gen = gen_aarch64_zip1v2di; break;
10157 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
10158 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
10159 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
10165 emit_insn (gen (out, in0, in1));
10169 /* Recognize patterns for the EXT insn. */
10172 aarch64_evpc_ext (struct expand_vec_perm_d *d)
10174 unsigned int i, nelt = d->nelt;
10175 rtx (*gen) (rtx, rtx, rtx, rtx);
10178 unsigned int location = d->perm[0]; /* Always < nelt. */
10180 /* Check if the extracted indices are increasing by one. */
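/* For instance, on V4SImode with location == 1 the only accepted
   pattern is { 1, 2, 3, 4 }: the last three lanes of op0 followed
   by the first lane of op1 (an illustrative little-endian view).  */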
10181 for (i = 1; i < nelt; i++)
10183 unsigned int required = location + i;
10184 if (d->one_vector_p)
10186 /* We'll pass the same vector in twice, so allow indices to wrap. */
10187 required &= (nelt - 1);
10189 if (d->perm[i] != required)
10195 case V16QImode: gen = gen_aarch64_extv16qi; break;
10196 case V8QImode: gen = gen_aarch64_extv8qi; break;
10197 case V4HImode: gen = gen_aarch64_extv4hi; break;
10198 case V8HImode: gen = gen_aarch64_extv8hi; break;
10199 case V2SImode: gen = gen_aarch64_extv2si; break;
10200 case V4SImode: gen = gen_aarch64_extv4si; break;
10201 case V2SFmode: gen = gen_aarch64_extv2sf; break;
10202 case V4SFmode: gen = gen_aarch64_extv4sf; break;
10203 case V2DImode: gen = gen_aarch64_extv2di; break;
10204 case V2DFmode: gen = gen_aarch64_extv2df; break;
10213 /* The case where (location == 0) is a no-op for both big- and little-endian,
10214 and is removed by the mid-end at optimization levels -O1 and higher. */
10216 if (BYTES_BIG_ENDIAN && (location != 0))
10218 /* After setup, we want the high elements of the first vector (stored
10219 at the LSB end of the register), and the low elements of the second
10220 vector (stored at the MSB end of the register). So swap. */
10221 std::swap (d->op0, d->op1);
10222 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
10223 location = nelt - location;
10226 offset = GEN_INT (location);
10227 emit_insn (gen (d->target, d->op0, d->op1, offset));
10231 /* Recognize patterns for the REV insns. */
10234 aarch64_evpc_rev (struct expand_vec_perm_d *d)
10236 unsigned int i, j, diff, nelt = d->nelt;
10237 rtx (*gen) (rtx, rtx);
10239 if (!d->one_vector_p)
10248 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
10249 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
10257 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
10258 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
10259 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
10260 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
10268 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
10269 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
10270 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
10271 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
10272 case V4SImode: gen = gen_aarch64_rev64v4si; break;
10273 case V2SImode: gen = gen_aarch64_rev64v2si; break;
10274 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
10275 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
10284 for (i = 0; i < nelt ; i += diff + 1)
10285 for (j = 0; j <= diff; j += 1)
/* This is guaranteed to be true as the value of diff
   is 7, 3 or 1, and we should have enough elements in the
   queue to generate this.  Getting a vector mask with a
   value of diff other than these values implies that
   something is wrong by the time we get here.  */
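/* Illustratively: on V4SImode, diff == 1 matches { 1, 0, 3, 2 },
   which is REV64 (reverse the 32-bit lanes within each 64-bit
   chunk); on V8HImode, diff == 3 matches { 3, 2, 1, 0, 7, 6, 5, 4 },
   again REV64 but on 16-bit lanes.  */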
10292 gcc_assert (i + j < nelt);
10293 if (d->perm[i + j] != i + diff - j)
10301 emit_insn (gen (d->target, d->op0));
10306 aarch64_evpc_dup (struct expand_vec_perm_d *d)
10308 rtx (*gen) (rtx, rtx, rtx);
10309 rtx out = d->target;
10311 machine_mode vmode = d->vmode;
10312 unsigned int i, elt, nelt = d->nelt;
10316 for (i = 1; i < nelt; i++)
10318 if (elt != d->perm[i])
10322 /* The generic preparation in aarch64_expand_vec_perm_const_1
10323 swaps the operand order and the permute indices if it finds
10324 d->perm[0] to be in the second operand. Thus, we can always
10325 use d->op0 and need not do any extra arithmetic to get the
10326 correct lane number. */
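/* For example, the constant permutation { 2, 2, 2, 2 } on V4SImode
   broadcasts lane 2 of op0, i.e. something like
   dup v0.4s, v1.s[2] in assembly terms (illustrative).  */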
10328 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
10332 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
10333 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
10334 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
10335 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
10336 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
10337 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
10338 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
10339 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
10340 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
10341 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
10346 emit_insn (gen (out, in0, lane));
10351 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
10353 rtx rperm[MAX_VECT_LEN], sel;
10354 machine_mode vmode = d->vmode;
10355 unsigned int i, nelt = d->nelt;
10360 /* Generic code will try constant permutation twice. Once with the
10361 original mode and again with the elements lowered to QImode.
10362 So wait and don't do the selector expansion ourselves. */
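/* An arbitrary permutation such as { 3, 0, 7, 1, 6, 2, 5, 4 } on
   V8QImode, which none of the single-instruction patterns above can
   match, ends up here as a TBL table lookup (illustrative example).  */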
10363 if (vmode != V8QImode && vmode != V16QImode)
10366 for (i = 0; i < nelt; ++i)
10368 int nunits = GET_MODE_NUNITS (vmode);
/* If big-endian and two vectors we end up with a weird mixed-endian
   mode on NEON.  Reverse the index within each word but not the word
   itself.  */
rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
		    : d->perm[i]);
10376 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
10377 sel = force_reg (vmode, sel);
10379 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
10384 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10386 /* The pattern matching functions above are written to look for a small
10387 number to begin the sequence (0, 1, N/2). If we begin with an index
10388 from the second operand, we can swap the operands. */
10389 if (d->perm[0] >= d->nelt)
10391 unsigned i, nelt = d->nelt;
gcc_assert (nelt == (nelt & -nelt)); /* NELT must be a power of two.  */
10394 for (i = 0; i < nelt; ++i)
10395 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
10397 std::swap (d->op0, d->op1);
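/* For example, with nelt == 4 the permutation { 6, 7, 0, 1 } is
   rewritten as { 2, 3, 4, 5 } on the swapped operands, which
   aarch64_evpc_ext can then match with location == 2
   (illustrative example).  */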
10402 if (aarch64_evpc_rev (d))
10404 else if (aarch64_evpc_ext (d))
10406 else if (aarch64_evpc_dup (d))
10408 else if (aarch64_evpc_zip (d))
10410 else if (aarch64_evpc_uzp (d))
10412 else if (aarch64_evpc_trn (d))
10414 return aarch64_evpc_tbl (d);
10419 /* Expand a vec_perm_const pattern. */
10422 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10424 struct expand_vec_perm_d d;
10425 int i, nelt, which;
10431 d.vmode = GET_MODE (target);
10432 gcc_assert (VECTOR_MODE_P (d.vmode));
10433 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10434 d.testing_p = false;
10436 for (i = which = 0; i < nelt; ++i)
10438 rtx e = XVECEXP (sel, 0, i);
10439 int ei = INTVAL (e) & (2 * nelt - 1);
10440 which |= (ei < nelt ? 1 : 2);
10447 gcc_unreachable ();
10450 d.one_vector_p = false;
10451 if (!rtx_equal_p (op0, op1))
/* The elements of PERM do not suggest that only the first operand
   is used, but both operands are identical.  Allow easier matching
   of the permutation by folding the permutation into the single
   input vector.  */
/* Fall through.  */
10460 for (i = 0; i < nelt; ++i)
10461 d.perm[i] &= nelt - 1;
10463 d.one_vector_p = true;
10468 d.one_vector_p = true;
10472 return aarch64_expand_vec_perm_const_1 (&d);
10476 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10477 const unsigned char *sel)
10479 struct expand_vec_perm_d d;
10480 unsigned int i, nelt, which;
10484 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10485 d.testing_p = true;
10486 memcpy (d.perm, sel, nelt);
10488 /* Calculate whether all elements are in one vector. */
10489 for (i = which = 0; i < nelt; ++i)
10491 unsigned char e = d.perm[i];
10492 gcc_assert (e < 2 * nelt);
10493 which |= (e < nelt ? 1 : 2);
/* If all elements are from the second vector, reindex as if from the
   first vector.  */
for (i = 0; i < nelt; ++i)
  d.perm[i] -= nelt;
10502 /* Check whether the mask can be applied to a single vector. */
10503 d.one_vector_p = (which != 3);
10505 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10506 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10507 if (!d.one_vector_p)
10508 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10511 ret = aarch64_expand_vec_perm_const_1 (&d);
10518 aarch64_reverse_mask (enum machine_mode mode)
/* We have to reverse each vector because we don't have
   a permuted load that can reverse-load according to ABI rules.  */
10523 rtvec v = rtvec_alloc (16);
10525 int nunits = GET_MODE_NUNITS (mode);
10526 int usize = GET_MODE_UNIT_SIZE (mode);
10528 gcc_assert (BYTES_BIG_ENDIAN);
10529 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
10531 for (i = 0; i < nunits; i++)
10532 for (j = 0; j < usize; j++)
10533 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
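/* For example, for V4SImode (usize == 4) the mask built above is
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 },
   i.e. the bytes within each 32-bit element reversed.  */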
10534 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
10535 return force_reg (V16QImode, mask);
10538 /* Implement MODES_TIEABLE_P. */
10541 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10543 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10546 /* We specifically want to allow elements of "structure" modes to
10547 be tieable to the structure. This more general condition allows
10548 other rarer situations too. */
10550 && aarch64_vector_mode_p (mode1)
10551 && aarch64_vector_mode_p (mode2))
/* Return a new RTX holding the result of moving POINTER forward by
   AMOUNT bytes.  */
10561 aarch64_move_pointer (rtx pointer, int amount)
10563 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10565 return adjust_automodify_address (pointer, GET_MODE (pointer),
10569 /* Return a new RTX holding the result of moving POINTER forward by the
10570 size of the mode it points to. */
10573 aarch64_progress_pointer (rtx pointer)
10575 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10577 return aarch64_move_pointer (pointer, amount);
10580 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10584 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10587 rtx reg = gen_reg_rtx (mode);
10589 /* "Cast" the pointers to the correct mode. */
10590 *src = adjust_address (*src, mode, 0);
10591 *dst = adjust_address (*dst, mode, 0);
10592 /* Emit the memcpy. */
10593 emit_move_insn (reg, *src);
10594 emit_move_insn (*dst, reg);
10595 /* Move the pointers forward. */
10596 *src = aarch64_progress_pointer (*src);
10597 *dst = aarch64_progress_pointer (*dst);
10600 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10601 we succeed, otherwise return false. */
10604 aarch64_expand_movmem (rtx *operands)
10607 rtx dst = operands[0];
10608 rtx src = operands[1];
10610 bool speed_p = !optimize_function_for_size_p (cfun);
10612 /* When optimizing for size, give a better estimate of the length of a
10613 memcpy call, but use the default otherwise. */
10614 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10616 /* We can't do anything smart if the amount to copy is not constant. */
10617 if (!CONST_INT_P (operands[2]))
10620 n = UINTVAL (operands[2]);
10622 /* Try to keep the number of instructions low. For cases below 16 bytes we
10623 need to make at most two moves. For cases above 16 bytes it will be one
10624 move for each 16 byte chunk, then at most two additional moves. */
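/* For example, n == 35 gives 35 / 16 == 2 sixteen-byte moves plus at
   most two more moves for the trailing 3 bytes: 4 instructions in
   total, within the limit of 7 when optimizing for speed
   (illustrative arithmetic).  */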
10625 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10628 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10629 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10631 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10632 src = adjust_automodify_address (src, VOIDmode, base, 0);
/* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
   1-byte chunk.  */
10640 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10645 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10650 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10651 4-byte chunk, partially overlapping with the previously copied chunk. */
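/* For example, a 7-byte copy becomes one 4-byte load/store at offset 0
   and a second at offset 3, the two overlapping by a single byte
   (illustrative example).  */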
10654 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10660 src = aarch64_move_pointer (src, move);
10661 dst = aarch64_move_pointer (dst, move);
10662 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10667 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10668 them, then (if applicable) an 8-byte chunk. */
10673 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10678 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
/* Finish the final bytes of the copy.  We can always do this in one
   instruction.  We either copy the exact amount we need, or partially
   overlap with the previous chunk we copied and copy 8 bytes.  */
10689 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10691 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10693 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10698 src = aarch64_move_pointer (src, -1);
10699 dst = aarch64_move_pointer (dst, -1);
10700 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10706 src = aarch64_move_pointer (src, move);
10707 dst = aarch64_move_pointer (dst, move);
10708 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10715 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
10717 static unsigned HOST_WIDE_INT
10718 aarch64_asan_shadow_offset (void)
10720 return (HOST_WIDE_INT_1 << 36);
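/* Under the usual ASan mapping (a shadow scale of 3, an assumption not
   spelled out here), shadow memory then lives at
   (addr >> 3) + (1 << 36).  */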
10724 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
10725 unsigned int align,
10726 enum by_pieces_operation op,
10729 /* STORE_BY_PIECES can be used when copying a constant string, but
10730 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10731 For now we always fail this and let the move_by_pieces code copy
10732 the string from read-only memory. */
10733 if (op == STORE_BY_PIECES)
10736 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
10739 static enum machine_mode
10740 aarch64_code_to_ccmode (enum rtx_code code)
    case LEU:
      return CC_DLEUmode;
    case LTU:
      return CC_DLTUmode;
    case GEU:
      return CC_DGEUmode;
    case GTU:
      return CC_DGTUmode;
10780 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
10781 int code, tree treeop0, tree treeop1)
10783 enum machine_mode op_mode, cmp_mode, cc_mode;
10784 rtx op0, op1, cmp, target;
10785 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10786 enum insn_code icode;
10787 struct expand_operand ops[4];
10789 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
10790 if (cc_mode == CCmode)
10794 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10796 op_mode = GET_MODE (op0);
10797 if (op_mode == VOIDmode)
10798 op_mode = GET_MODE (op1);
10806 icode = CODE_FOR_cmpsi;
10811 icode = CODE_FOR_cmpdi;
10819 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10820 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10826 *prep_seq = get_insns ();
10829 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
10830 target = gen_rtx_REG (CCmode, CC_REGNUM);
10832 create_output_operand (&ops[0], target, CCmode);
10833 create_fixed_operand (&ops[1], cmp);
10834 create_fixed_operand (&ops[2], op0);
10835 create_fixed_operand (&ops[3], op1);
10838 if (!maybe_expand_insn (icode, 4, ops))
10843 *gen_seq = get_insns ();
10846 return gen_rtx_REG (cc_mode, CC_REGNUM);
10850 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
10851 tree treeop0, tree treeop1, int bit_code)
10853 rtx op0, op1, cmp0, cmp1, target;
10854 enum machine_mode op_mode, cmp_mode, cc_mode;
10855 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10856 enum insn_code icode = CODE_FOR_ccmp_andsi;
10857 struct expand_operand ops[6];
10859 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
10860 if (cc_mode == CCmode)
10863 push_to_sequence ((rtx_insn*) *prep_seq);
10864 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10866 op_mode = GET_MODE (op0);
10867 if (op_mode == VOIDmode)
10868 op_mode = GET_MODE (op1);
10876 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
10877 : CODE_FOR_ccmp_iorsi;
10882 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
10883 : CODE_FOR_ccmp_iordi;
10891 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10892 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10898 *prep_seq = get_insns ();
10901 target = gen_rtx_REG (cc_mode, CC_REGNUM);
10902 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
10903 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
10905 create_fixed_operand (&ops[0], prev);
10906 create_fixed_operand (&ops[1], target);
10907 create_fixed_operand (&ops[2], op0);
10908 create_fixed_operand (&ops[3], op1);
10909 create_fixed_operand (&ops[4], cmp0);
10910 create_fixed_operand (&ops[5], cmp1);
10912 push_to_sequence ((rtx_insn*) *gen_seq);
10913 if (!maybe_expand_insn (icode, 6, ops))
10919 *gen_seq = get_insns ();
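/* Together with aarch64_gen_ccmp_first above, this lets a condition
   chain such as (a == 0 && b > 17) expand to, illustratively:
       cmp  w0, 0
       ccmp w1, 17, 4, eq
   with a single flag result consumed by the final conditional (the
   exact nzcv immediate shown is an assumption for illustration).  */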
10925 #undef TARGET_GEN_CCMP_FIRST
10926 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10928 #undef TARGET_GEN_CCMP_NEXT
10929 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
10931 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
10932 instruction fusion of some sort. */
10935 aarch64_macro_fusion_p (void)
10937 return aarch64_tune_params->fusible_ops != AARCH64_FUSE_NOTHING;
10941 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
10942 should be kept together during scheduling. */
10945 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
10948 rtx prev_set = single_set (prev);
10949 rtx curr_set = single_set (curr);
/* prev and curr are simple SET insns, i.e. no flag setting or branching.  */
10951 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
10953 if (!aarch64_macro_fusion_p ())
10957 && (aarch64_tune_params->fusible_ops & AARCH64_FUSE_MOV_MOVK))
10959 /* We are trying to match:
10960 prev (mov) == (set (reg r0) (const_int imm16))
10961 curr (movk) == (set (zero_extract (reg r0)
10964 (const_int imm16_1)) */
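/* In assembly terms this is, illustratively:
       mov  x0, 0x1234
       movk x0, 0x5678, lsl 16  */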
10966 set_dest = SET_DEST (curr_set);
10968 if (GET_CODE (set_dest) == ZERO_EXTRACT
10969 && CONST_INT_P (SET_SRC (curr_set))
10970 && CONST_INT_P (SET_SRC (prev_set))
10971 && CONST_INT_P (XEXP (set_dest, 2))
10972 && INTVAL (XEXP (set_dest, 2)) == 16
10973 && REG_P (XEXP (set_dest, 0))
10974 && REG_P (SET_DEST (prev_set))
10975 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
10982 && (aarch64_tune_params->fusible_ops & AARCH64_FUSE_ADRP_ADD))
10985 /* We're trying to match:
10986 prev (adrp) == (set (reg r1)
10987 (high (symbol_ref ("SYM"))))
10988 curr (add) == (set (reg r0)
10990 (symbol_ref ("SYM"))))
10991 Note that r0 need not necessarily be the same as r1, especially
10992 during pre-regalloc scheduling. */
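/* I.e. the usual two-instruction address materialization:
       adrp x1, SYM
       add  x0, x1, :lo12:SYM  */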
10994 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10995 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10997 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
10998 && REG_P (XEXP (SET_SRC (curr_set), 0))
10999 && REGNO (XEXP (SET_SRC (curr_set), 0))
11000 == REGNO (SET_DEST (prev_set))
11001 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
11002 XEXP (SET_SRC (curr_set), 1)))
11008 && (aarch64_tune_params->fusible_ops & AARCH64_FUSE_MOVK_MOVK))
11011 /* We're trying to match:
11012 prev (movk) == (set (zero_extract (reg r0)
11015 (const_int imm16_1))
11016 curr (movk) == (set (zero_extract (reg r0)
11019 (const_int imm16_2)) */
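/* I.e. back-to-back immediate moves such as (illustratively):
       movk x0, 0x1234, lsl 32
       movk x0, 0x5678, lsl 48  */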
11021 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
11022 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
11023 && REG_P (XEXP (SET_DEST (prev_set), 0))
11024 && REG_P (XEXP (SET_DEST (curr_set), 0))
11025 && REGNO (XEXP (SET_DEST (prev_set), 0))
11026 == REGNO (XEXP (SET_DEST (curr_set), 0))
11027 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
11028 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
11029 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
11030 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
11031 && CONST_INT_P (SET_SRC (prev_set))
11032 && CONST_INT_P (SET_SRC (curr_set)))
11037 && (aarch64_tune_params->fusible_ops & AARCH64_FUSE_ADRP_LDR))
11039 /* We're trying to match:
11040 prev (adrp) == (set (reg r0)
11041 (high (symbol_ref ("SYM"))))
11042 curr (ldr) == (set (reg r1)
11043 (mem (lo_sum (reg r0)
11044 (symbol_ref ("SYM")))))
11046 curr (ldr) == (set (reg r1)
11049 (symbol_ref ("SYM")))))) */
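/* I.e., illustratively:
       adrp x0, SYM
       ldr  w1, [x0, #:lo12:SYM]
   optionally with the loaded value zero-extended.  */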
11050 if (satisfies_constraint_Ush (SET_SRC (prev_set))
11051 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
11053 rtx curr_src = SET_SRC (curr_set);
11055 if (GET_CODE (curr_src) == ZERO_EXTEND)
11056 curr_src = XEXP (curr_src, 0);
11058 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
11059 && REG_P (XEXP (XEXP (curr_src, 0), 0))
11060 && REGNO (XEXP (XEXP (curr_src, 0), 0))
11061 == REGNO (SET_DEST (prev_set))
11062 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
11063 XEXP (SET_SRC (prev_set), 0)))
11068 if ((aarch64_tune_params->fusible_ops & AARCH64_FUSE_CMP_BRANCH)
11069 && any_condjump_p (curr))
11071 enum attr_type prev_type = get_attr_type (prev);
/* FIXME: this misses some cases that are considered simple arithmetic
   instructions for ThunderX.  Simple shifts are missed here.  */
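/* E.g. a flag-setting ALU instruction immediately followed by a
   conditional branch, such as (illustratively):
       cmp  w0, 0
       b.ne target  */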
11075 if (prev_type == TYPE_ALUS_SREG
11076 || prev_type == TYPE_ALUS_IMM
11077 || prev_type == TYPE_LOGICS_REG
11078 || prev_type == TYPE_LOGICS_IMM)
/* If MEM is in the form of [base+offset], extract the two parts of the
   address and set them to BASE and OFFSET; otherwise return false after
   clearing BASE and OFFSET.  */
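/* E.g. (mem (plus (reg x1) (const_int 8))) yields BASE == x1 and
   OFFSET == 8, while a bare (mem (reg x1)) yields OFFSET == const0_rtx.  */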
11090 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
11094 gcc_assert (MEM_P (mem));
11096 addr = XEXP (mem, 0);
11101 *offset = const0_rtx;
11105 if (GET_CODE (addr) == PLUS
11106 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
11108 *base = XEXP (addr, 0);
11109 *offset = XEXP (addr, 1);
11114 *offset = NULL_RTX;
11119 /* Types for scheduling fusion. */
11120 enum sched_fusion_type
11122 SCHED_FUSION_NONE = 0,
11123 SCHED_FUSION_LD_SIGN_EXTEND,
11124 SCHED_FUSION_LD_ZERO_EXTEND,
/* If INSN is a load or store of an address in the form of [base+offset],
   extract the two parts and set BASE and OFFSET accordingly.  Return the
   scheduling fusion type of this INSN.  */
11134 static enum sched_fusion_type
11135 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
11138 enum sched_fusion_type fusion = SCHED_FUSION_LD;
11140 gcc_assert (INSN_P (insn));
11141 x = PATTERN (insn);
11142 if (GET_CODE (x) != SET)
11143 return SCHED_FUSION_NONE;
11146 dest = SET_DEST (x);
11148 if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
11149 && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
11150 return SCHED_FUSION_NONE;
11152 if (GET_CODE (src) == SIGN_EXTEND)
11154 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
11155 src = XEXP (src, 0);
11156 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
11157 return SCHED_FUSION_NONE;
11159 else if (GET_CODE (src) == ZERO_EXTEND)
11161 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
11162 src = XEXP (src, 0);
11163 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
11164 return SCHED_FUSION_NONE;
11167 if (GET_CODE (src) == MEM && REG_P (dest))
11168 extract_base_offset_in_addr (src, base, offset);
11169 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
11171 fusion = SCHED_FUSION_ST;
11172 extract_base_offset_in_addr (dest, base, offset);
11175 return SCHED_FUSION_NONE;
11177 if (*base == NULL_RTX || *offset == NULL_RTX)
11178 fusion = SCHED_FUSION_NONE;
11183 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
   Currently we only support fusing ldr and str instructions, so FUSION_PRI
   and PRI are only calculated for these instructions.  For other
   instructions, FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the
   future, other kinds of instruction fusion can be added by returning
   different priorities.

   It's important that irrelevant instructions get the largest FUSION_PRI.  */
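/* For example, two SImode loads from [x1, 4] and [x1, 8] get the same
   FUSION_PRI (same fusion type and base register) and PRIs ordered by
   offset, so the scheduler keeps them adjacent for the ldp peepholes
   (an illustrative reading of the computation below).  */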
11193 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
11194 int *fusion_pri, int *pri)
11198 enum sched_fusion_type fusion;
11200 gcc_assert (INSN_P (insn));
11203 fusion = fusion_load_store (insn, &base, &offset);
11204 if (fusion == SCHED_FUSION_NONE)
11211 /* Set FUSION_PRI according to fusion type and base register. */
11212 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
11214 /* Calculate PRI. */
11217 /* INSN with smaller offset goes first. */
11218 off_val = (int)(INTVAL (offset));
11220 tmp -= (off_val & 0xfffff);
11222 tmp += ((- off_val) & 0xfffff);
/* Given OPERANDS of consecutive load/store instructions, check whether
   we can merge them into an ldp or stp.  LOAD is true if they are load
   instructions.  MODE is the mode of the memory operands.  */
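/* For example (illustrative):
       ldr w0, [x2]
       ldr w1, [x2, 4]
   can be merged into
       ldp w0, w1, [x2]  */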
11233 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
11234 enum machine_mode mode)
11236 HOST_WIDE_INT offval_1, offval_2, msize;
11237 enum reg_class rclass_1, rclass_2;
11238 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
11242 mem_1 = operands[1];
11243 mem_2 = operands[3];
11244 reg_1 = operands[0];
11245 reg_2 = operands[2];
11246 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
11247 if (REGNO (reg_1) == REGNO (reg_2))
11252 mem_1 = operands[0];
11253 mem_2 = operands[2];
11254 reg_1 = operands[1];
11255 reg_2 = operands[3];
11258 /* The mems cannot be volatile. */
11259 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
11262 /* Check if the addresses are in the form of [base+offset]. */
11263 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11264 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11266 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11267 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11270 /* Check if the bases are same. */
11271 if (!rtx_equal_p (base_1, base_2))
11274 offval_1 = INTVAL (offset_1);
11275 offval_2 = INTVAL (offset_2);
11276 msize = GET_MODE_SIZE (mode);
11277 /* Check if the offsets are consecutive. */
11278 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
11281 /* Check if the addresses are clobbered by load. */
11284 if (reg_mentioned_p (reg_1, mem_1))
11287 /* In increasing order, the last load can clobber the address. */
11288 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
11292 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11293 rclass_1 = FP_REGS;
11295 rclass_1 = GENERAL_REGS;
11297 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11298 rclass_2 = FP_REGS;
11300 rclass_2 = GENERAL_REGS;
11302 /* Check if the registers are of same class. */
11303 if (rclass_1 != rclass_2)
11309 /* Given OPERANDS of consecutive load/store, check if we can merge
11310 them into ldp/stp by adjusting the offset. LOAD is true if they
11311 are load instructions. MODE is the mode of memory operands.
   Given the following consecutive stores:
11315 str w1, [xb, 0x100]
11316 str w1, [xb, 0x104]
11317 str w1, [xb, 0x108]
11318 str w1, [xb, 0x10c]
11320 Though the offsets are out of the range supported by stp, we can
11321 still pair them after adjusting the offset, like:
11323 add scratch, xb, 0x100
11324 stp w1, w1, [scratch]
11325 stp w1, w1, [scratch, 0x8]
   The peephole patterns detecting this opportunity should guarantee
   the scratch register is available.  */
11331 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
11332 enum machine_mode mode)
11334 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
11335 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
11336 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
11337 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
11341 reg_1 = operands[0];
11342 mem_1 = operands[1];
11343 reg_2 = operands[2];
11344 mem_2 = operands[3];
11345 reg_3 = operands[4];
11346 mem_3 = operands[5];
11347 reg_4 = operands[6];
11348 mem_4 = operands[7];
11349 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
11350 && REG_P (reg_3) && REG_P (reg_4));
11351 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
11356 mem_1 = operands[0];
11357 reg_1 = operands[1];
11358 mem_2 = operands[2];
11359 reg_2 = operands[3];
11360 mem_3 = operands[4];
11361 reg_3 = operands[5];
11362 mem_4 = operands[6];
11363 reg_4 = operands[7];
/* Skip if the memory operand is by itself valid for ldp/stp.  */
11366 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
11369 /* The mems cannot be volatile. */
11370 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
    || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
11374 /* Check if the addresses are in the form of [base+offset]. */
11375 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11376 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11378 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11379 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11381 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11382 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11384 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11385 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11388 /* Check if the bases are same. */
11389 if (!rtx_equal_p (base_1, base_2)
11390 || !rtx_equal_p (base_2, base_3)
11391 || !rtx_equal_p (base_3, base_4))
11394 offval_1 = INTVAL (offset_1);
11395 offval_2 = INTVAL (offset_2);
11396 offval_3 = INTVAL (offset_3);
11397 offval_4 = INTVAL (offset_4);
11398 msize = GET_MODE_SIZE (mode);
11399 /* Check if the offsets are consecutive. */
11400 if ((offval_1 != (offval_2 + msize)
11401 || offval_1 != (offval_3 + msize * 2)
11402 || offval_1 != (offval_4 + msize * 3))
11403 && (offval_4 != (offval_3 + msize)
11404 || offval_4 != (offval_2 + msize * 2)
11405 || offval_4 != (offval_1 + msize * 3)))
11408 /* Check if the addresses are clobbered by load. */
11411 if (reg_mentioned_p (reg_1, mem_1)
11412 || reg_mentioned_p (reg_2, mem_2)
11413 || reg_mentioned_p (reg_3, mem_3))
11416 /* In increasing order, the last load can clobber the address. */
11417 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11421 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11422 rclass_1 = FP_REGS;
11424 rclass_1 = GENERAL_REGS;
11426 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11427 rclass_2 = FP_REGS;
11429 rclass_2 = GENERAL_REGS;
11431 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11432 rclass_3 = FP_REGS;
11434 rclass_3 = GENERAL_REGS;
11436 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11437 rclass_4 = FP_REGS;
11439 rclass_4 = GENERAL_REGS;
11441 /* Check if the registers are of same class. */
11442 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
/* Given OPERANDS of consecutive load/store instructions, this function
   pairs them into ldp/stp after adjusting the offset.  It depends on the
   fact that the addresses of the load/store instructions are in
   increasing order.  MODE is the mode of the memory operands.  CODE is
   the rtl operator which should be applied to all memory operands; it is
   SIGN_EXTEND, ZERO_EXTEND or UNKNOWN.  */
11456 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11457 enum machine_mode mode, RTX_CODE code)
11459 rtx base, offset, t1, t2;
11460 rtx mem_1, mem_2, mem_3, mem_4;
11461 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11465 mem_1 = operands[1];
11466 mem_2 = operands[3];
11467 mem_3 = operands[5];
11468 mem_4 = operands[7];
11472 mem_1 = operands[0];
11473 mem_2 = operands[2];
11474 mem_3 = operands[4];
11475 mem_4 = operands[6];
11476 gcc_assert (code == UNKNOWN);
11479 extract_base_offset_in_addr (mem_1, &base, &offset);
11480 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
/* Adjust the offset so that it can fit in an ldp/stp instruction.  */
11483 msize = GET_MODE_SIZE (mode);
11484 stp_off_limit = msize * 0x40;
11485 off_val = INTVAL (offset);
11486 abs_off = (off_val < 0) ? -off_val : off_val;
11487 new_off = abs_off % stp_off_limit;
11488 adj_off = abs_off - new_off;
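/* Continuing the str/stp example in the comment above (msize == 4,
   off_val == 0x100): stp_off_limit == 0x100, so new_off == 0 and
   adj_off == 0x100, matching the add/stp sequence shown there.  */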
11490 /* Further adjust to make sure all offsets are OK. */
11491 if ((new_off + msize * 2) >= stp_off_limit)
11493 adj_off += stp_off_limit;
11494 new_off -= stp_off_limit;
11497 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11498 if (adj_off >= 0x1000)
11503 adj_off = -adj_off;
11504 new_off = -new_off;
11507 /* Create new memory references. */
11508 mem_1 = change_address (mem_1, VOIDmode,
11509 plus_constant (DImode, operands[8], new_off));
11511 /* Check if the adjusted address is OK for ldp/stp. */
11512 if (!aarch64_mem_pair_operand (mem_1, mode))
11515 msize = GET_MODE_SIZE (mode);
11516 mem_2 = change_address (mem_2, VOIDmode,
11517 plus_constant (DImode,
11520 mem_3 = change_address (mem_3, VOIDmode,
11521 plus_constant (DImode,
11523 new_off + msize * 2));
11524 mem_4 = change_address (mem_4, VOIDmode,
11525 plus_constant (DImode,
11527 new_off + msize * 3));
11529 if (code == ZERO_EXTEND)
11531 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11532 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11533 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11534 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11536 else if (code == SIGN_EXTEND)
11538 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11539 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11540 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11541 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11546 operands[1] = mem_1;
11547 operands[3] = mem_2;
11548 operands[5] = mem_3;
11549 operands[7] = mem_4;
11553 operands[0] = mem_1;
11554 operands[2] = mem_2;
11555 operands[4] = mem_3;
11556 operands[6] = mem_4;
11559 /* Emit adjusting instruction. */
11560 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
11561 /* Emit ldp/stp instructions. */
11562 t1 = gen_rtx_SET (operands[0], operands[1]);
11563 t2 = gen_rtx_SET (operands[2], operands[3]);
11564 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11565 t1 = gen_rtx_SET (operands[4], operands[5]);
11566 t2 = gen_rtx_SET (operands[6], operands[7]);
11567 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11571 #undef TARGET_ADDRESS_COST
11572 #define TARGET_ADDRESS_COST aarch64_address_cost
/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
11578 #undef TARGET_ALIGN_ANON_BITFIELD
11579 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11581 #undef TARGET_ASM_ALIGNED_DI_OP
11582 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11584 #undef TARGET_ASM_ALIGNED_HI_OP
11585 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11587 #undef TARGET_ASM_ALIGNED_SI_OP
11588 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11590 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11591 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11592 hook_bool_const_tree_hwi_hwi_const_tree_true
11594 #undef TARGET_ASM_FILE_START
11595 #define TARGET_ASM_FILE_START aarch64_start_file
11597 #undef TARGET_ASM_OUTPUT_MI_THUNK
11598 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11600 #undef TARGET_ASM_SELECT_RTX_SECTION
11601 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11603 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11604 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11606 #undef TARGET_BUILD_BUILTIN_VA_LIST
11607 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11609 #undef TARGET_CALLEE_COPIES
11610 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11612 #undef TARGET_CAN_ELIMINATE
11613 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11615 #undef TARGET_CANNOT_FORCE_CONST_MEM
11616 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11618 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11619 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
/* Only the least significant bit is used for initialization guard
   variables.  */
11623 #undef TARGET_CXX_GUARD_MASK_BIT
11624 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11626 #undef TARGET_C_MODE_FOR_SUFFIX
11627 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11629 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11630 #undef TARGET_DEFAULT_TARGET_FLAGS
11631 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11634 #undef TARGET_CLASS_MAX_NREGS
11635 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11637 #undef TARGET_BUILTIN_DECL
11638 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11640 #undef TARGET_EXPAND_BUILTIN
11641 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11643 #undef TARGET_EXPAND_BUILTIN_VA_START
11644 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11646 #undef TARGET_FOLD_BUILTIN
11647 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11649 #undef TARGET_FUNCTION_ARG
11650 #define TARGET_FUNCTION_ARG aarch64_function_arg
11652 #undef TARGET_FUNCTION_ARG_ADVANCE
11653 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11655 #undef TARGET_FUNCTION_ARG_BOUNDARY
11656 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11658 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11659 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11661 #undef TARGET_FUNCTION_VALUE
11662 #define TARGET_FUNCTION_VALUE aarch64_function_value
11664 #undef TARGET_FUNCTION_VALUE_REGNO_P
11665 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11667 #undef TARGET_FRAME_POINTER_REQUIRED
11668 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11670 #undef TARGET_GIMPLE_FOLD_BUILTIN
11671 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11673 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11674 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11676 #undef TARGET_INIT_BUILTINS
11677 #define TARGET_INIT_BUILTINS aarch64_init_builtins
11679 #undef TARGET_LEGITIMATE_ADDRESS_P
11680 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
11682 #undef TARGET_LEGITIMATE_CONSTANT_P
11683 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
11685 #undef TARGET_LIBGCC_CMP_RETURN_MODE
11686 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
11688 #undef TARGET_LRA_P
11689 #define TARGET_LRA_P hook_bool_void_true
11691 #undef TARGET_MANGLE_TYPE
11692 #define TARGET_MANGLE_TYPE aarch64_mangle_type
11694 #undef TARGET_MEMORY_MOVE_COST
11695 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
11697 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
11698 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
11700 #undef TARGET_MUST_PASS_IN_STACK
11701 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
11703 /* This target hook should return true if accesses to volatile bitfields
11704 should use the narrowest mode possible. It should return false if these
11705 accesses should use the bitfield container type. */
11706 #undef TARGET_NARROW_VOLATILE_BITFIELD
11707 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
11709 #undef TARGET_OPTION_OVERRIDE
11710 #define TARGET_OPTION_OVERRIDE aarch64_override_options
11712 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
11713 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
11714 aarch64_override_options_after_change
11716 #undef TARGET_PASS_BY_REFERENCE
11717 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
11719 #undef TARGET_PREFERRED_RELOAD_CLASS
11720 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
11722 #undef TARGET_SCHED_REASSOCIATION_WIDTH
11723 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
11725 #undef TARGET_SECONDARY_RELOAD
11726 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
11728 #undef TARGET_SHIFT_TRUNCATION_MASK
11729 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
11731 #undef TARGET_SETUP_INCOMING_VARARGS
11732 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
11734 #undef TARGET_STRUCT_VALUE_RTX
11735 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
11737 #undef TARGET_REGISTER_MOVE_COST
11738 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
11740 #undef TARGET_RETURN_IN_MEMORY
11741 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
11743 #undef TARGET_RETURN_IN_MSB
11744 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
11746 #undef TARGET_RTX_COSTS
11747 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
11749 #undef TARGET_SCHED_ISSUE_RATE
11750 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
11752 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
11753 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
11754 aarch64_sched_first_cycle_multipass_dfa_lookahead
11756 #undef TARGET_TRAMPOLINE_INIT
11757 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
11759 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
11760 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
11762 #undef TARGET_VECTOR_MODE_SUPPORTED_P
11763 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
11765 #undef TARGET_ARRAY_MODE_SUPPORTED_P
11766 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
11768 #undef TARGET_VECTORIZE_ADD_STMT_COST
11769 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
11771 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
11772 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
11773 aarch64_builtin_vectorization_cost
11775 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
11776 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
11778 #undef TARGET_VECTORIZE_BUILTINS
11779 #define TARGET_VECTORIZE_BUILTINS
11781 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
11782 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
11783 aarch64_builtin_vectorized_function
11785 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
11786 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
11787 aarch64_autovectorize_vector_sizes
11789 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
11790 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
11791 aarch64_atomic_assign_expand_fenv
11793 /* Section anchor support. */
11795 #undef TARGET_MIN_ANCHOR_OFFSET
11796 #define TARGET_MIN_ANCHOR_OFFSET -256
11798 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
11799 byte offset; we can do much more for larger data types, but have no way
11800 to determine the size of the access. We assume accesses are aligned. */
11801 #undef TARGET_MAX_ANCHOR_OFFSET
11802 #define TARGET_MAX_ANCHOR_OFFSET 4095
11804 #undef TARGET_VECTOR_ALIGNMENT
11805 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
11807 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
11808 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
11809 aarch64_simd_vector_alignment_reachable
11811 /* vec_perm support. */
11813 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
11814 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
11815 aarch64_vectorize_vec_perm_const_ok
11818 #undef TARGET_FIXED_CONDITION_CODE_REGS
11819 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
11821 #undef TARGET_FLAGS_REGNUM
11822 #define TARGET_FLAGS_REGNUM CC_REGNUM
11824 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
11825 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
11827 #undef TARGET_ASAN_SHADOW_OFFSET
11828 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
11830 #undef TARGET_LEGITIMIZE_ADDRESS
11831 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
11833 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
11834 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
11835 aarch64_use_by_pieces_infrastructure_p
11837 #undef TARGET_CAN_USE_DOLOOP_P
11838 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
11840 #undef TARGET_SCHED_MACRO_FUSION_P
11841 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
11843 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
11844 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
11846 #undef TARGET_SCHED_FUSION_PRIORITY
11847 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11849 struct gcc_target targetm = TARGET_INITIALIZER;
11851 #include "gt-aarch64.h"