1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2014 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
23 #include "coretypes.h"
25 #include "insn-codes.h"
27 #include "insn-attr.h"
29 #include "stringpool.h"
30 #include "stor-layout.h"
35 #include "hard-reg-set.h"
41 #include "target-def.h"
42 #include "targhooks.h"
47 #include "langhooks.h"
48 #include "diagnostic-core.h"
49 #include "pointer-set.h"
50 #include "hash-table.h"
52 #include "basic-block.h"
53 #include "tree-ssa-alias.h"
54 #include "internal-fn.h"
55 #include "gimple-fold.h"
57 #include "gimple-expr.h"
64 #include "tree-vectorizer.h"
65 #include "config/arm/aarch-cost-tables.h"
68 /* Defined for convenience. */
69 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
71 /* Classifies an address.
74 A simple base register plus immediate offset.
77 A base register indexed by immediate offset with writeback.
80 A base register indexed by (optionally scaled) register.
83 A base register indexed by (optionally scaled) zero-extended register.
86 A base register indexed by (optionally scaled) sign-extended register.
89 A LO_SUM rtx with a base register and "LO12" symbol relocation.
92 A constant symbolic address, in pc-relative literal pool. */
94 enum aarch64_address_type {
104 struct aarch64_address_info {
105 enum aarch64_address_type type;
109 enum aarch64_symbol_type symbol_type;
112 struct simd_immediate_info
121 /* The current code model. */
122 enum aarch64_code_model aarch64_cmodel;
125 #undef TARGET_HAVE_TLS
126 #define TARGET_HAVE_TLS 1
129 static bool aarch64_lra_p (void);
130 static bool aarch64_composite_type_p (const_tree, enum machine_mode);
131 static bool aarch64_vfp_is_call_or_return_candidate (enum machine_mode,
133 enum machine_mode *, int *,
135 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
136 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
137 static void aarch64_override_options_after_change (void);
138 static bool aarch64_vector_mode_supported_p (enum machine_mode);
139 static unsigned bit_count (unsigned HOST_WIDE_INT);
140 static bool aarch64_const_vec_all_same_int_p (rtx,
141 HOST_WIDE_INT, HOST_WIDE_INT);
143 static bool aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
144 const unsigned char *sel);
145 static int aarch64_address_cost (rtx, enum machine_mode, addr_space_t, bool);
147 /* The processor for which instructions should be scheduled. */
148 enum aarch64_processor aarch64_tune = cortexa53;
150 /* The current tuning set. */
151 const struct tune_params *aarch64_tune_params;
153 /* Mask to specify which instructions we are allowed to generate. */
154 unsigned long aarch64_isa_flags = 0;
156 /* Mask to specify which instruction scheduling options should be used. */
157 unsigned long aarch64_tune_flags = 0;
159 /* Tuning parameters. */
161 #if HAVE_DESIGNATED_INITIALIZERS
162 #define NAMED_PARAM(NAME, VAL) .NAME = (VAL)
164 #define NAMED_PARAM(NAME, VAL) (VAL)
167 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
171 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
174 static const struct cpu_addrcost_table generic_addrcost_table =
176 #if HAVE_DESIGNATED_INITIALIZERS
185 NAMED_PARAM (pre_modify, 0),
186 NAMED_PARAM (post_modify, 0),
187 NAMED_PARAM (register_offset, 0),
188 NAMED_PARAM (register_extend, 0),
189 NAMED_PARAM (imm_offset, 0)
192 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
195 static const struct cpu_addrcost_table cortexa57_addrcost_table =
197 #if HAVE_DESIGNATED_INITIALIZERS
206 NAMED_PARAM (pre_modify, 0),
207 NAMED_PARAM (post_modify, 0),
208 NAMED_PARAM (register_offset, 0),
209 NAMED_PARAM (register_extend, 0),
210 NAMED_PARAM (imm_offset, 0),
213 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
216 static const struct cpu_regmove_cost generic_regmove_cost =
218 NAMED_PARAM (GP2GP, 1),
219 NAMED_PARAM (GP2FP, 2),
220 NAMED_PARAM (FP2GP, 2),
221 /* We currently do not provide direct support for TFmode Q->Q move.
222 Therefore we need to raise the cost above 2 in order to have
223 reload handle the situation. */
224 NAMED_PARAM (FP2FP, 4)
227 /* Generic costs for vector insn classes. */
228 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
231 static const struct cpu_vector_cost generic_vector_cost =
233 NAMED_PARAM (scalar_stmt_cost, 1),
234 NAMED_PARAM (scalar_load_cost, 1),
235 NAMED_PARAM (scalar_store_cost, 1),
236 NAMED_PARAM (vec_stmt_cost, 1),
237 NAMED_PARAM (vec_to_scalar_cost, 1),
238 NAMED_PARAM (scalar_to_vec_cost, 1),
239 NAMED_PARAM (vec_align_load_cost, 1),
240 NAMED_PARAM (vec_unalign_load_cost, 1),
241 NAMED_PARAM (vec_unalign_store_cost, 1),
242 NAMED_PARAM (vec_store_cost, 1),
243 NAMED_PARAM (cond_taken_branch_cost, 3),
244 NAMED_PARAM (cond_not_taken_branch_cost, 1)
247 /* Generic costs for vector insn classes. */
248 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
251 static const struct cpu_vector_cost cortexa57_vector_cost =
253 NAMED_PARAM (scalar_stmt_cost, 1),
254 NAMED_PARAM (scalar_load_cost, 4),
255 NAMED_PARAM (scalar_store_cost, 1),
256 NAMED_PARAM (vec_stmt_cost, 3),
257 NAMED_PARAM (vec_to_scalar_cost, 8),
258 NAMED_PARAM (scalar_to_vec_cost, 8),
259 NAMED_PARAM (vec_align_load_cost, 5),
260 NAMED_PARAM (vec_unalign_load_cost, 5),
261 NAMED_PARAM (vec_unalign_store_cost, 1),
262 NAMED_PARAM (vec_store_cost, 1),
263 NAMED_PARAM (cond_taken_branch_cost, 1),
264 NAMED_PARAM (cond_not_taken_branch_cost, 1)
267 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
270 static const struct tune_params generic_tunings =
272 &cortexa57_extra_costs,
273 &generic_addrcost_table,
274 &generic_regmove_cost,
275 &generic_vector_cost,
276 NAMED_PARAM (memmov_cost, 4),
277 NAMED_PARAM (issue_rate, 2)
280 static const struct tune_params cortexa53_tunings =
282 &cortexa53_extra_costs,
283 &generic_addrcost_table,
284 &generic_regmove_cost,
285 &generic_vector_cost,
286 NAMED_PARAM (memmov_cost, 4),
287 NAMED_PARAM (issue_rate, 2)
290 static const struct tune_params cortexa57_tunings =
292 &cortexa57_extra_costs,
293 &cortexa57_addrcost_table,
294 &generic_regmove_cost,
295 &cortexa57_vector_cost,
296 NAMED_PARAM (memmov_cost, 4),
297 NAMED_PARAM (issue_rate, 3)
300 /* A processor implementing AArch64. */
303 const char *const name;
304 enum aarch64_processor core;
306 const unsigned long flags;
307 const struct tune_params *const tune;
310 /* Processor cores implementing AArch64. */
311 static const struct processor all_cores[] =
313 #define AARCH64_CORE(NAME, X, IDENT, ARCH, FLAGS, COSTS) \
314 {NAME, IDENT, #ARCH, FLAGS | AARCH64_FL_FOR_ARCH##ARCH, &COSTS##_tunings},
315 #include "aarch64-cores.def"
317 {"generic", cortexa53, "8", AARCH64_FL_FPSIMD | AARCH64_FL_FOR_ARCH8, &generic_tunings},
318 {NULL, aarch64_none, NULL, 0, NULL}
321 /* Architectures implementing AArch64. */
322 static const struct processor all_architectures[] =
324 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
325 {NAME, CORE, #ARCH, FLAGS, NULL},
326 #include "aarch64-arches.def"
328 {NULL, aarch64_none, NULL, 0, NULL}
331 /* Target specification. These are populated as commandline arguments
332 are processed, or NULL if not specified. */
333 static const struct processor *selected_arch;
334 static const struct processor *selected_cpu;
335 static const struct processor *selected_tune;
337 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
339 /* An ISA extension in the co-processor and main instruction set space. */
340 struct aarch64_option_extension
342 const char *const name;
343 const unsigned long flags_on;
344 const unsigned long flags_off;
347 /* ISA extensions in AArch64. */
348 static const struct aarch64_option_extension all_extensions[] =
350 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF) \
351 {NAME, FLAGS_ON, FLAGS_OFF},
352 #include "aarch64-option-extensions.def"
353 #undef AARCH64_OPT_EXTENSION
357 /* Used to track the size of an address when generating a pre/post
358 increment address. */
359 static enum machine_mode aarch64_memory_reference_mode;
361 /* Used to force GTY into this file. */
362 static GTY(()) int gty_dummy;
364 /* A table of valid AArch64 "bitmask immediate" values for
365 logical instructions. */
367 #define AARCH64_NUM_BITMASKS 5334
368 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
370 typedef enum aarch64_cond_code
372 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
373 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
374 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
378 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
380 /* The condition codes of the processor, and the inverse function. */
381 static const char * const aarch64_condition_codes[] =
383 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
384 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
387 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
389 aarch64_dbx_register_number (unsigned regno)
391 if (GP_REGNUM_P (regno))
392 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
393 else if (regno == SP_REGNUM)
394 return AARCH64_DWARF_SP;
395 else if (FP_REGNUM_P (regno))
396 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
398 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
399 equivalent DWARF register. */
400 return DWARF_FRAME_REGISTERS;
403 /* Return TRUE if MODE is any of the large INT modes. */
405 aarch64_vect_struct_mode_p (enum machine_mode mode)
407 return mode == OImode || mode == CImode || mode == XImode;
410 /* Return TRUE if MODE is any of the vector modes. */
412 aarch64_vector_mode_p (enum machine_mode mode)
414 return aarch64_vector_mode_supported_p (mode)
415 || aarch64_vect_struct_mode_p (mode);
418 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
420 aarch64_array_mode_supported_p (enum machine_mode mode,
421 unsigned HOST_WIDE_INT nelems)
424 && AARCH64_VALID_SIMD_QREG_MODE (mode)
425 && (nelems >= 2 && nelems <= 4))
431 /* Implement HARD_REGNO_NREGS. */
434 aarch64_hard_regno_nregs (unsigned regno, enum machine_mode mode)
436 switch (aarch64_regno_regclass (regno))
440 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
442 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
447 /* Implement HARD_REGNO_MODE_OK. */
450 aarch64_hard_regno_mode_ok (unsigned regno, enum machine_mode mode)
452 if (GET_MODE_CLASS (mode) == MODE_CC)
453 return regno == CC_REGNUM;
455 if (regno == SP_REGNUM)
456 /* The purpose of comparing with ptr_mode is to support the
457 global register variable associated with the stack pointer
458 register via the syntax of asm ("wsp") in ILP32. */
459 return mode == Pmode || mode == ptr_mode;
461 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
462 return mode == Pmode;
464 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
467 if (FP_REGNUM_P (regno))
469 if (aarch64_vect_struct_mode_p (mode))
471 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
479 /* Return true if calls to DECL should be treated as
480 long-calls (ie called via a register). */
482 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
487 /* Return true if calls to symbol-ref SYM should be treated as
488 long-calls (ie called via a register). */
490 aarch64_is_long_call_p (rtx sym)
492 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
495 /* Return true if the offsets to a zero/sign-extract operation
496 represent an expression that matches an extend operation. The
497 operands represent the paramters from
499 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
501 aarch64_is_extend_from_extract (enum machine_mode mode, rtx mult_imm,
504 HOST_WIDE_INT mult_val, extract_val;
506 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
509 mult_val = INTVAL (mult_imm);
510 extract_val = INTVAL (extract_imm);
513 && extract_val < GET_MODE_BITSIZE (mode)
514 && exact_log2 (extract_val & ~7) > 0
515 && (extract_val & 7) <= 4
516 && mult_val == (1 << (extract_val & 7)))
522 /* Emit an insn that's a simple single-set. Both the operands must be
523 known to be valid. */
525 emit_set_insn (rtx x, rtx y)
527 return emit_insn (gen_rtx_SET (VOIDmode, x, y));
530 /* X and Y are two things to compare using CODE. Emit the compare insn and
531 return the rtx for register 0 in the proper mode. */
533 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
535 enum machine_mode mode = SELECT_CC_MODE (code, x, y);
536 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
538 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
542 /* Build the SYMBOL_REF for __tls_get_addr. */
544 static GTY(()) rtx tls_get_addr_libfunc;
547 aarch64_tls_get_addr (void)
549 if (!tls_get_addr_libfunc)
550 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
551 return tls_get_addr_libfunc;
554 /* Return the TLS model to use for ADDR. */
556 static enum tls_model
557 tls_symbolic_operand_type (rtx addr)
559 enum tls_model tls_kind = TLS_MODEL_NONE;
562 if (GET_CODE (addr) == CONST)
564 split_const (addr, &sym, &addend);
565 if (GET_CODE (sym) == SYMBOL_REF)
566 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
568 else if (GET_CODE (addr) == SYMBOL_REF)
569 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
574 /* We'll allow lo_sum's in addresses in our legitimate addresses
575 so that combine would take care of combining addresses where
576 necessary, but for generation purposes, we'll generate the address
579 tmp = hi (symbol_ref); adrp x1, foo
580 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
584 adrp x1, :got:foo adrp tmp, :tlsgd:foo
585 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
589 Load TLS symbol, depending on TLS mechanism and TLS access model.
591 Global Dynamic - Traditional TLS:
593 add dest, tmp, #:tlsgd_lo12:imm
596 Global Dynamic - TLS Descriptors:
597 adrp dest, :tlsdesc:imm
598 ldr tmp, [dest, #:tlsdesc_lo12:imm]
599 add dest, dest, #:tlsdesc_lo12:imm
606 adrp tmp, :gottprel:imm
607 ldr dest, [tmp, #:gottprel_lo12:imm]
612 add t0, tp, #:tprel_hi12:imm
613 add t0, #:tprel_lo12_nc:imm
617 aarch64_load_symref_appropriately (rtx dest, rtx imm,
618 enum aarch64_symbol_type type)
622 case SYMBOL_SMALL_ABSOLUTE:
624 /* In ILP32, the mode of dest can be either SImode or DImode. */
626 enum machine_mode mode = GET_MODE (dest);
628 gcc_assert (mode == Pmode || mode == ptr_mode);
630 if (can_create_pseudo_p ())
631 tmp_reg = gen_reg_rtx (mode);
633 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
634 emit_insn (gen_add_losym (dest, tmp_reg, imm));
638 case SYMBOL_TINY_ABSOLUTE:
639 emit_insn (gen_rtx_SET (Pmode, dest, imm));
642 case SYMBOL_SMALL_GOT:
644 /* In ILP32, the mode of dest can be either SImode or DImode,
645 while the got entry is always of SImode size. The mode of
646 dest depends on how dest is used: if dest is assigned to a
647 pointer (e.g. in the memory), it has SImode; it may have
648 DImode if dest is dereferenced to access the memeory.
649 This is why we have to handle three different ldr_got_small
650 patterns here (two patterns for ILP32). */
652 enum machine_mode mode = GET_MODE (dest);
654 if (can_create_pseudo_p ())
655 tmp_reg = gen_reg_rtx (mode);
657 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
658 if (mode == ptr_mode)
661 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
663 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
667 gcc_assert (mode == Pmode);
668 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
674 case SYMBOL_SMALL_TLSGD:
677 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
680 emit_call_insn (gen_tlsgd_small (result, imm));
681 insns = get_insns ();
684 RTL_CONST_CALL_P (insns) = 1;
685 emit_libcall_block (insns, dest, result, imm);
689 case SYMBOL_SMALL_TLSDESC:
691 rtx x0 = gen_rtx_REG (Pmode, R0_REGNUM);
694 emit_insn (gen_tlsdesc_small (imm));
695 tp = aarch64_load_tp (NULL);
696 emit_insn (gen_rtx_SET (Pmode, dest, gen_rtx_PLUS (Pmode, tp, x0)));
697 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
701 case SYMBOL_SMALL_GOTTPREL:
703 rtx tmp_reg = gen_reg_rtx (Pmode);
704 rtx tp = aarch64_load_tp (NULL);
705 emit_insn (gen_tlsie_small (tmp_reg, imm));
706 emit_insn (gen_rtx_SET (Pmode, dest, gen_rtx_PLUS (Pmode, tp, tmp_reg)));
707 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
711 case SYMBOL_SMALL_TPREL:
713 rtx tp = aarch64_load_tp (NULL);
714 emit_insn (gen_tlsle_small (dest, tp, imm));
715 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
719 case SYMBOL_TINY_GOT:
720 emit_insn (gen_ldr_got_tiny (dest, imm));
728 /* Emit a move from SRC to DEST. Assume that the move expanders can
729 handle all moves if !can_create_pseudo_p (). The distinction is
730 important because, unlike emit_move_insn, the move expanders know
731 how to force Pmode objects into the constant pool even when the
732 constant pool address is not itself legitimate. */
734 aarch64_emit_move (rtx dest, rtx src)
736 return (can_create_pseudo_p ()
737 ? emit_move_insn (dest, src)
738 : emit_move_insn_1 (dest, src));
741 /* Split a 128-bit move operation into two 64-bit move operations,
742 taking care to handle partial overlap of register to register
743 copies. Special cases are needed when moving between GP regs and
744 FP regs. SRC can be a register, constant or memory; DST a register
745 or memory. If either operand is memory it must not have any side
748 aarch64_split_128bit_move (rtx dst, rtx src)
753 enum machine_mode mode = GET_MODE (dst);
755 gcc_assert (mode == TImode || mode == TFmode);
756 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
757 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
759 if (REG_P (dst) && REG_P (src))
761 int src_regno = REGNO (src);
762 int dst_regno = REGNO (dst);
764 /* Handle FP <-> GP regs. */
765 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
767 src_lo = gen_lowpart (word_mode, src);
768 src_hi = gen_highpart (word_mode, src);
772 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
773 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
777 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
778 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
782 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
784 dst_lo = gen_lowpart (word_mode, dst);
785 dst_hi = gen_highpart (word_mode, dst);
789 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
790 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
794 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
795 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
801 dst_lo = gen_lowpart (word_mode, dst);
802 dst_hi = gen_highpart (word_mode, dst);
803 src_lo = gen_lowpart (word_mode, src);
804 src_hi = gen_highpart_mode (word_mode, mode, src);
806 /* At most one pairing may overlap. */
807 if (reg_overlap_mentioned_p (dst_lo, src_hi))
809 aarch64_emit_move (dst_hi, src_hi);
810 aarch64_emit_move (dst_lo, src_lo);
814 aarch64_emit_move (dst_lo, src_lo);
815 aarch64_emit_move (dst_hi, src_hi);
820 aarch64_split_128bit_move_p (rtx dst, rtx src)
822 return (! REG_P (src)
823 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
826 /* Split a complex SIMD combine. */
829 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
831 enum machine_mode src_mode = GET_MODE (src1);
832 enum machine_mode dst_mode = GET_MODE (dst);
834 gcc_assert (VECTOR_MODE_P (dst_mode));
836 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
838 rtx (*gen) (rtx, rtx, rtx);
843 gen = gen_aarch64_simd_combinev8qi;
846 gen = gen_aarch64_simd_combinev4hi;
849 gen = gen_aarch64_simd_combinev2si;
852 gen = gen_aarch64_simd_combinev2sf;
855 gen = gen_aarch64_simd_combinedi;
858 gen = gen_aarch64_simd_combinedf;
864 emit_insn (gen (dst, src1, src2));
869 /* Split a complex SIMD move. */
872 aarch64_split_simd_move (rtx dst, rtx src)
874 enum machine_mode src_mode = GET_MODE (src);
875 enum machine_mode dst_mode = GET_MODE (dst);
877 gcc_assert (VECTOR_MODE_P (dst_mode));
879 if (REG_P (dst) && REG_P (src))
881 rtx (*gen) (rtx, rtx);
883 gcc_assert (VECTOR_MODE_P (src_mode));
888 gen = gen_aarch64_split_simd_movv16qi;
891 gen = gen_aarch64_split_simd_movv8hi;
894 gen = gen_aarch64_split_simd_movv4si;
897 gen = gen_aarch64_split_simd_movv2di;
900 gen = gen_aarch64_split_simd_movv4sf;
903 gen = gen_aarch64_split_simd_movv2df;
909 emit_insn (gen (dst, src));
915 aarch64_force_temporary (enum machine_mode mode, rtx x, rtx value)
917 if (can_create_pseudo_p ())
918 return force_reg (mode, value);
921 x = aarch64_emit_move (x, value);
928 aarch64_add_offset (enum machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
930 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
933 /* Load the full offset into a register. This
934 might be improvable in the future. */
935 high = GEN_INT (offset);
937 high = aarch64_force_temporary (mode, temp, high);
938 reg = aarch64_force_temporary (mode, temp,
939 gen_rtx_PLUS (mode, high, reg));
941 return plus_constant (mode, reg, offset);
945 aarch64_expand_mov_immediate (rtx dest, rtx imm)
947 enum machine_mode mode = GET_MODE (dest);
948 unsigned HOST_WIDE_INT mask;
951 unsigned HOST_WIDE_INT val;
954 int one_match, zero_match;
956 gcc_assert (mode == SImode || mode == DImode);
958 /* Check on what type of symbol it is. */
959 if (GET_CODE (imm) == SYMBOL_REF
960 || GET_CODE (imm) == LABEL_REF
961 || GET_CODE (imm) == CONST)
963 rtx mem, base, offset;
964 enum aarch64_symbol_type sty;
966 /* If we have (const (plus symbol offset)), separate out the offset
967 before we start classifying the symbol. */
968 split_const (imm, &base, &offset);
970 sty = aarch64_classify_symbol (base, SYMBOL_CONTEXT_ADR);
973 case SYMBOL_FORCE_TO_MEM:
974 if (offset != const0_rtx
975 && targetm.cannot_force_const_mem (mode, imm))
977 gcc_assert (can_create_pseudo_p ());
978 base = aarch64_force_temporary (mode, dest, base);
979 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
980 aarch64_emit_move (dest, base);
983 mem = force_const_mem (ptr_mode, imm);
985 if (mode != ptr_mode)
986 mem = gen_rtx_ZERO_EXTEND (mode, mem);
987 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
990 case SYMBOL_SMALL_TLSGD:
991 case SYMBOL_SMALL_TLSDESC:
992 case SYMBOL_SMALL_GOTTPREL:
993 case SYMBOL_SMALL_GOT:
994 case SYMBOL_TINY_GOT:
995 if (offset != const0_rtx)
997 gcc_assert(can_create_pseudo_p ());
998 base = aarch64_force_temporary (mode, dest, base);
999 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1000 aarch64_emit_move (dest, base);
1005 case SYMBOL_SMALL_TPREL:
1006 case SYMBOL_SMALL_ABSOLUTE:
1007 case SYMBOL_TINY_ABSOLUTE:
1008 aarch64_load_symref_appropriately (dest, imm, sty);
1016 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1018 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1022 if (!CONST_INT_P (imm))
1024 if (GET_CODE (imm) == HIGH)
1025 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1028 rtx mem = force_const_mem (mode, imm);
1030 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1038 /* We know we can't do this in 1 insn, and we must be able to do it
1039 in two; so don't mess around looking for sequences that don't buy
1041 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (INTVAL (imm) & 0xffff)));
1042 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1043 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1047 /* Remaining cases are all for DImode. */
1050 subtargets = optimize && can_create_pseudo_p ();
1056 for (i = 0; i < 64; i += 16, mask <<= 16)
1058 if ((val & mask) == 0)
1060 else if ((val & mask) == mask)
1067 for (i = 0; i < 64; i += 16, mask <<= 16)
1069 if ((val & mask) != mask)
1071 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1072 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1073 GEN_INT ((val >> i) & 0xffff)));
1080 if (zero_match == 2)
1081 goto simple_sequence;
1083 mask = 0x0ffff0000UL;
1084 for (i = 16; i < 64; i += 16, mask <<= 16)
1086 HOST_WIDE_INT comp = mask & ~(mask - 1);
1088 if (aarch64_uimm12_shift (val - (val & mask)))
1090 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1092 emit_insn (gen_rtx_SET (VOIDmode, subtarget, GEN_INT (val & mask)));
1093 emit_insn (gen_adddi3 (dest, subtarget,
1094 GEN_INT (val - (val & mask))));
1097 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1099 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1101 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1102 GEN_INT ((val + comp) & mask)));
1103 emit_insn (gen_adddi3 (dest, subtarget,
1104 GEN_INT (val - ((val + comp) & mask))));
1107 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1109 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1111 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1112 GEN_INT ((val - comp) | ~mask)));
1113 emit_insn (gen_adddi3 (dest, subtarget,
1114 GEN_INT (val - ((val - comp) | ~mask))));
1117 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1119 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1121 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1122 GEN_INT (val | ~mask)));
1123 emit_insn (gen_adddi3 (dest, subtarget,
1124 GEN_INT (val - (val | ~mask))));
1129 /* See if we can do it by arithmetically combining two
1131 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1136 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1137 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1139 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1140 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1141 GEN_INT (aarch64_bitmasks[i])));
1142 emit_insn (gen_adddi3 (dest, subtarget,
1143 GEN_INT (val - aarch64_bitmasks[i])));
1147 for (j = 0; j < 64; j += 16, mask <<= 16)
1149 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1151 emit_insn (gen_rtx_SET (VOIDmode, dest,
1152 GEN_INT (aarch64_bitmasks[i])));
1153 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1154 GEN_INT ((val >> j) & 0xffff)));
1160 /* See if we can do it by logically combining two immediates. */
1161 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1163 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1167 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1168 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1170 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1171 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1172 GEN_INT (aarch64_bitmasks[i])));
1173 emit_insn (gen_iordi3 (dest, subtarget,
1174 GEN_INT (aarch64_bitmasks[j])));
1178 else if ((val & aarch64_bitmasks[i]) == val)
1182 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1183 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1186 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1187 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1188 GEN_INT (aarch64_bitmasks[j])));
1189 emit_insn (gen_anddi3 (dest, subtarget,
1190 GEN_INT (aarch64_bitmasks[i])));
1199 for (i = 0; i < 64; i += 16, mask <<= 16)
1201 if ((val & mask) != 0)
1205 emit_insn (gen_rtx_SET (VOIDmode, dest,
1206 GEN_INT (val & mask)));
1210 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1211 GEN_INT ((val >> i) & 0xffff)));
1217 aarch64_function_ok_for_sibcall (tree decl, tree exp ATTRIBUTE_UNUSED)
1219 /* Indirect calls are not currently supported. */
1223 /* Cannot tail-call to long-calls, since these are outside of the
1224 range of a branch instruction (we could handle this if we added
1225 support for indirect tail-calls. */
1226 if (aarch64_decl_is_long_call_p (decl))
1232 /* Implement TARGET_PASS_BY_REFERENCE. */
1235 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1236 enum machine_mode mode,
1238 bool named ATTRIBUTE_UNUSED)
1241 enum machine_mode dummymode;
1244 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1245 size = (mode == BLKmode && type)
1246 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1248 /* Aggregates are passed by reference based on their size. */
1249 if (type && AGGREGATE_TYPE_P (type))
1251 size = int_size_in_bytes (type);
1254 /* Variable sized arguments are always returned by reference. */
1258 /* Can this be a candidate to be passed in fp/simd register(s)? */
1259 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1264 /* Arguments which are variable sized or larger than 2 registers are
1265 passed by reference unless they are a homogenous floating point
1267 return size > 2 * UNITS_PER_WORD;
1270 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1272 aarch64_return_in_msb (const_tree valtype)
1274 enum machine_mode dummy_mode;
1277 /* Never happens in little-endian mode. */
1278 if (!BYTES_BIG_ENDIAN)
1281 /* Only composite types smaller than or equal to 16 bytes can
1282 be potentially returned in registers. */
1283 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1284 || int_size_in_bytes (valtype) <= 0
1285 || int_size_in_bytes (valtype) > 16)
1288 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1289 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1290 is always passed/returned in the least significant bits of fp/simd
1292 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1293 &dummy_mode, &dummy_int, NULL))
1299 /* Implement TARGET_FUNCTION_VALUE.
1300 Define how to find the value returned by a function. */
1303 aarch64_function_value (const_tree type, const_tree func,
1304 bool outgoing ATTRIBUTE_UNUSED)
1306 enum machine_mode mode;
1309 enum machine_mode ag_mode;
1311 mode = TYPE_MODE (type);
1312 if (INTEGRAL_TYPE_P (type))
1313 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1315 if (aarch64_return_in_msb (type))
1317 HOST_WIDE_INT size = int_size_in_bytes (type);
1319 if (size % UNITS_PER_WORD != 0)
1321 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1322 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1326 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1327 &ag_mode, &count, NULL))
1329 if (!aarch64_composite_type_p (type, mode))
1331 gcc_assert (count == 1 && mode == ag_mode);
1332 return gen_rtx_REG (mode, V0_REGNUM);
1339 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1340 for (i = 0; i < count; i++)
1342 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1343 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1344 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1345 XVECEXP (par, 0, i) = tmp;
1351 return gen_rtx_REG (mode, R0_REGNUM);
1354 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1355 Return true if REGNO is the number of a hard register in which the values
1356 of called function may come back. */
1359 aarch64_function_value_regno_p (const unsigned int regno)
1361 /* Maximum of 16 bytes can be returned in the general registers. Examples
1362 of 16-byte return values are: 128-bit integers and 16-byte small
1363 structures (excluding homogeneous floating-point aggregates). */
1364 if (regno == R0_REGNUM || regno == R1_REGNUM)
1367 /* Up to four fp/simd registers can return a function value, e.g. a
1368 homogeneous floating-point aggregate having four members. */
1369 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1370 return !TARGET_GENERAL_REGS_ONLY;
1375 /* Implement TARGET_RETURN_IN_MEMORY.
1377 If the type T of the result of a function is such that
1379 would require that arg be passed as a value in a register (or set of
1380 registers) according to the parameter passing rules, then the result
1381 is returned in the same registers as would be used for such an
1385 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1388 enum machine_mode ag_mode;
1391 if (!AGGREGATE_TYPE_P (type)
1392 && TREE_CODE (type) != COMPLEX_TYPE
1393 && TREE_CODE (type) != VECTOR_TYPE)
1394 /* Simple scalar types always returned in registers. */
1397 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1404 /* Types larger than 2 registers returned in memory. */
1405 size = int_size_in_bytes (type);
1406 return (size < 0 || size > 2 * UNITS_PER_WORD);
1410 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, enum machine_mode mode,
1411 const_tree type, int *nregs)
1413 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1414 return aarch64_vfp_is_call_or_return_candidate (mode,
1416 &pcum->aapcs_vfp_rmode,
1421 /* Given MODE and TYPE of a function argument, return the alignment in
1422 bits. The idea is to suppress any stronger alignment requested by
1423 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1424 This is a helper function for local use only. */
1427 aarch64_function_arg_alignment (enum machine_mode mode, const_tree type)
1429 unsigned int alignment;
1433 if (!integer_zerop (TYPE_SIZE (type)))
1435 if (TYPE_MODE (type) == mode)
1436 alignment = TYPE_ALIGN (type);
1438 alignment = GET_MODE_ALIGNMENT (mode);
1444 alignment = GET_MODE_ALIGNMENT (mode);
1449 /* Layout a function argument according to the AAPCS64 rules. The rule
1450 numbers refer to the rule numbers in the AAPCS64. */
1453 aarch64_layout_arg (cumulative_args_t pcum_v, enum machine_mode mode,
1455 bool named ATTRIBUTE_UNUSED)
1457 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1458 int ncrn, nvrn, nregs;
1459 bool allocate_ncrn, allocate_nvrn;
1462 /* We need to do this once per argument. */
1463 if (pcum->aapcs_arg_processed)
1466 pcum->aapcs_arg_processed = true;
1468 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1470 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1473 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1474 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1479 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
1480 The following code thus handles passing by SIMD/FP registers first. */
1482 nvrn = pcum->aapcs_nvrn;
1484 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
1485 and homogenous short-vector aggregates (HVA). */
1488 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1490 pcum->aapcs_nextnvrn = nvrn + nregs;
1491 if (!aarch64_composite_type_p (type, mode))
1493 gcc_assert (nregs == 1);
1494 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1500 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1501 for (i = 0; i < nregs; i++)
1503 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1504 V0_REGNUM + nvrn + i);
1505 tmp = gen_rtx_EXPR_LIST
1507 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1508 XVECEXP (par, 0, i) = tmp;
1510 pcum->aapcs_reg = par;
1516 /* C.3 NSRN is set to 8. */
1517 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1522 ncrn = pcum->aapcs_ncrn;
1523 nregs = size / UNITS_PER_WORD;
1525 /* C6 - C9. though the sign and zero extension semantics are
1526 handled elsewhere. This is the case where the argument fits
1527 entirely general registers. */
1528 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1530 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1532 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1534 /* C.8 if the argument has an alignment of 16 then the NGRN is
1535 rounded up to the next even number. */
1536 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1539 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1541 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1542 A reg is still generated for it, but the caller should be smart
1543 enough not to use it. */
1544 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1546 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1553 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1554 for (i = 0; i < nregs; i++)
1556 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1557 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1558 GEN_INT (i * UNITS_PER_WORD));
1559 XVECEXP (par, 0, i) = tmp;
1561 pcum->aapcs_reg = par;
1564 pcum->aapcs_nextncrn = ncrn + nregs;
1569 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1571 /* The argument is passed on stack; record the needed number of words for
1572 this argument and align the total size if necessary. */
1574 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1575 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1576 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1577 16 / UNITS_PER_WORD);
1581 /* Implement TARGET_FUNCTION_ARG. */
1584 aarch64_function_arg (cumulative_args_t pcum_v, enum machine_mode mode,
1585 const_tree type, bool named)
1587 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1588 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1590 if (mode == VOIDmode)
1593 aarch64_layout_arg (pcum_v, mode, type, named);
1594 return pcum->aapcs_reg;
1598 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1599 const_tree fntype ATTRIBUTE_UNUSED,
1600 rtx libname ATTRIBUTE_UNUSED,
1601 const_tree fndecl ATTRIBUTE_UNUSED,
1602 unsigned n_named ATTRIBUTE_UNUSED)
1604 pcum->aapcs_ncrn = 0;
1605 pcum->aapcs_nvrn = 0;
1606 pcum->aapcs_nextncrn = 0;
1607 pcum->aapcs_nextnvrn = 0;
1608 pcum->pcs_variant = ARM_PCS_AAPCS64;
1609 pcum->aapcs_reg = NULL_RTX;
1610 pcum->aapcs_arg_processed = false;
1611 pcum->aapcs_stack_words = 0;
1612 pcum->aapcs_stack_size = 0;
1618 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1619 enum machine_mode mode,
1623 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1624 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1626 aarch64_layout_arg (pcum_v, mode, type, named);
1627 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1628 != (pcum->aapcs_stack_words != 0));
1629 pcum->aapcs_arg_processed = false;
1630 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1631 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1632 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1633 pcum->aapcs_stack_words = 0;
1634 pcum->aapcs_reg = NULL_RTX;
1639 aarch64_function_arg_regno_p (unsigned regno)
1641 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1642 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1645 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1646 PARM_BOUNDARY bits of alignment, but will be given anything up
1647 to STACK_BOUNDARY bits if the type requires it. This makes sure
1648 that both before and after the layout of each argument, the Next
1649 Stacked Argument Address (NSAA) will have a minimum alignment of
1653 aarch64_function_arg_boundary (enum machine_mode mode, const_tree type)
1655 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1657 if (alignment < PARM_BOUNDARY)
1658 alignment = PARM_BOUNDARY;
1659 if (alignment > STACK_BOUNDARY)
1660 alignment = STACK_BOUNDARY;
1664 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1666 Return true if an argument passed on the stack should be padded upwards,
1667 i.e. if the least-significant byte of the stack slot has useful data.
1669 Small aggregate types are placed in the lowest memory address.
1671 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
1674 aarch64_pad_arg_upward (enum machine_mode mode, const_tree type)
1676 /* On little-endian targets, the least significant byte of every stack
1677 argument is passed at the lowest byte address of the stack slot. */
1678 if (!BYTES_BIG_ENDIAN)
1681 /* Otherwise, integral, floating-point and pointer types are padded downward:
1682 the least significant byte of a stack argument is passed at the highest
1683 byte address of the stack slot. */
1685 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1686 || POINTER_TYPE_P (type))
1687 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1690 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1694 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1696 It specifies padding for the last (may also be the only)
1697 element of a block move between registers and memory. If
1698 assuming the block is in the memory, padding upward means that
1699 the last element is padded after its highest significant byte,
1700 while in downward padding, the last element is padded at the
1701 its least significant byte side.
1703 Small aggregates and small complex types are always padded
1706 We don't need to worry about homogeneous floating-point or
1707 short-vector aggregates; their move is not affected by the
1708 padding direction determined here. Regardless of endianness,
1709 each element of such an aggregate is put in the least
1710 significant bits of a fp/simd register.
1712 Return !BYTES_BIG_ENDIAN if the least significant byte of the
1713 register has useful data, and return the opposite if the most
1714 significant byte does. */
1717 aarch64_pad_reg_upward (enum machine_mode mode, const_tree type,
1718 bool first ATTRIBUTE_UNUSED)
1721 /* Small composite types are always padded upward. */
1722 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
1724 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
1725 : GET_MODE_SIZE (mode));
1726 if (size < 2 * UNITS_PER_WORD)
1730 /* Otherwise, use the default padding. */
1731 return !BYTES_BIG_ENDIAN;
1734 static enum machine_mode
1735 aarch64_libgcc_cmp_return_mode (void)
1741 aarch64_frame_pointer_required (void)
1743 /* If the function contains dynamic stack allocations, we need to
1744 use the frame pointer to access the static parts of the frame. */
1745 if (cfun->calls_alloca)
1748 /* In aarch64_override_options_after_change
1749 flag_omit_leaf_frame_pointer turns off the frame pointer by
1750 default. Turn it back on now if we've not got a leaf
1752 if (flag_omit_leaf_frame_pointer
1753 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
1759 /* Mark the registers that need to be saved by the callee and calculate
1760 the size of the callee-saved registers area and frame record (both FP
1761 and LR may be omitted). */
1763 aarch64_layout_frame (void)
1765 HOST_WIDE_INT offset = 0;
1768 if (reload_completed && cfun->machine->frame.laid_out)
1771 cfun->machine->frame.fp_lr_offset = 0;
1773 /* First mark all the registers that really need to be saved... */
1774 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1775 cfun->machine->frame.reg_offset[regno] = -1;
1777 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1778 cfun->machine->frame.reg_offset[regno] = -1;
1780 /* ... that includes the eh data registers (if needed)... */
1781 if (crtl->calls_eh_return)
1782 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
1783 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = 0;
1785 /* ... and any callee saved register that dataflow says is live. */
1786 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1787 if (df_regs_ever_live_p (regno)
1788 && !call_used_regs[regno])
1789 cfun->machine->frame.reg_offset[regno] = 0;
1791 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1792 if (df_regs_ever_live_p (regno)
1793 && !call_used_regs[regno])
1794 cfun->machine->frame.reg_offset[regno] = 0;
1796 if (frame_pointer_needed)
1798 cfun->machine->frame.reg_offset[R30_REGNUM] = 0;
1799 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
1800 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
1803 /* Now assign stack slots for them. */
1804 for (regno = R0_REGNUM; regno <= R28_REGNUM; regno++)
1805 if (cfun->machine->frame.reg_offset[regno] != -1)
1807 cfun->machine->frame.reg_offset[regno] = offset;
1808 offset += UNITS_PER_WORD;
1811 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1812 if (cfun->machine->frame.reg_offset[regno] != -1)
1814 cfun->machine->frame.reg_offset[regno] = offset;
1815 offset += UNITS_PER_WORD;
1818 if (frame_pointer_needed)
1820 cfun->machine->frame.reg_offset[R29_REGNUM] = offset;
1821 offset += UNITS_PER_WORD;
1822 cfun->machine->frame.fp_lr_offset = UNITS_PER_WORD;
1825 if (cfun->machine->frame.reg_offset[R30_REGNUM] != -1)
1827 cfun->machine->frame.reg_offset[R30_REGNUM] = offset;
1828 offset += UNITS_PER_WORD;
1829 cfun->machine->frame.fp_lr_offset += UNITS_PER_WORD;
1832 cfun->machine->frame.padding0 =
1833 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
1834 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1836 cfun->machine->frame.saved_regs_size = offset;
1837 cfun->machine->frame.laid_out = true;
1840 /* Make the last instruction frame-related and note that it performs
1841 the operation described by FRAME_PATTERN. */
1844 aarch64_set_frame_expr (rtx frame_pattern)
1848 insn = get_last_insn ();
1849 RTX_FRAME_RELATED_P (insn) = 1;
1850 RTX_FRAME_RELATED_P (frame_pattern) = 1;
1851 REG_NOTES (insn) = alloc_EXPR_LIST (REG_FRAME_RELATED_EXPR,
1857 aarch64_register_saved_on_entry (int regno)
1859 return cfun->machine->frame.reg_offset[regno] != -1;
1864 aarch64_save_or_restore_fprs (int start_offset, int increment,
1865 bool restore, rtx base_rtx)
1871 rtx (*gen_mem_ref)(enum machine_mode, rtx)
1872 = (frame_pointer_needed)? gen_frame_mem : gen_rtx_MEM;
1875 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1877 if (aarch64_register_saved_on_entry (regno))
1880 mem = gen_mem_ref (DFmode,
1881 plus_constant (Pmode,
1885 for (regno2 = regno + 1;
1886 regno2 <= V31_REGNUM
1887 && !aarch64_register_saved_on_entry (regno2);
1892 if (regno2 <= V31_REGNUM &&
1893 aarch64_register_saved_on_entry (regno2))
1896 /* Next highest register to be saved. */
1897 mem2 = gen_mem_ref (DFmode,
1901 start_offset + increment));
1902 if (restore == false)
1905 ( gen_store_pairdf (mem, gen_rtx_REG (DFmode, regno),
1906 mem2, gen_rtx_REG (DFmode, regno2)));
1912 ( gen_load_pairdf (gen_rtx_REG (DFmode, regno), mem,
1913 gen_rtx_REG (DFmode, regno2), mem2));
1915 add_reg_note (insn, REG_CFA_RESTORE,
1916 gen_rtx_REG (DFmode, regno));
1917 add_reg_note (insn, REG_CFA_RESTORE,
1918 gen_rtx_REG (DFmode, regno2));
1921 /* The first part of a frame-related parallel insn
1922 is always assumed to be relevant to the frame
1923 calculations; subsequent parts, are only
1924 frame-related if explicitly marked. */
1925 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
1927 start_offset += increment * 2;
1931 if (restore == false)
1932 insn = emit_move_insn (mem, gen_rtx_REG (DFmode, regno));
1935 insn = emit_move_insn (gen_rtx_REG (DFmode, regno), mem);
1936 add_reg_note (insn, REG_CFA_RESTORE,
1937 gen_rtx_REG (DImode, regno));
1939 start_offset += increment;
1941 RTX_FRAME_RELATED_P (insn) = 1;
1948 /* offset from the stack pointer of where the saves and
1949 restore's have to happen. */
1951 aarch64_save_or_restore_callee_save_registers (HOST_WIDE_INT offset,
1955 rtx base_rtx = stack_pointer_rtx;
1956 HOST_WIDE_INT start_offset = offset;
1957 HOST_WIDE_INT increment = UNITS_PER_WORD;
1958 rtx (*gen_mem_ref)(enum machine_mode, rtx) = (frame_pointer_needed)? gen_frame_mem : gen_rtx_MEM;
1959 unsigned limit = (frame_pointer_needed)? R28_REGNUM: R30_REGNUM;
1963 for (regno = R0_REGNUM; regno <= limit; regno++)
1965 if (aarch64_register_saved_on_entry (regno))
1968 mem = gen_mem_ref (Pmode,
1969 plus_constant (Pmode,
1973 for (regno2 = regno + 1;
1975 && !aarch64_register_saved_on_entry (regno2);
1980 if (regno2 <= limit &&
1981 aarch64_register_saved_on_entry (regno2))
1984 /* Next highest register to be saved. */
1985 mem2 = gen_mem_ref (Pmode,
1989 start_offset + increment));
1990 if (restore == false)
1993 ( gen_store_pairdi (mem, gen_rtx_REG (DImode, regno),
1994 mem2, gen_rtx_REG (DImode, regno2)));
2000 ( gen_load_pairdi (gen_rtx_REG (DImode, regno), mem,
2001 gen_rtx_REG (DImode, regno2), mem2));
2003 add_reg_note (insn, REG_CFA_RESTORE, gen_rtx_REG (DImode, regno));
2004 add_reg_note (insn, REG_CFA_RESTORE, gen_rtx_REG (DImode, regno2));
2007 /* The first part of a frame-related parallel insn
2008 is always assumed to be relevant to the frame
2009 calculations; subsequent parts, are only
2010 frame-related if explicitly marked. */
2011 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0,
2014 start_offset += increment * 2;
2018 if (restore == false)
2019 insn = emit_move_insn (mem, gen_rtx_REG (DImode, regno));
2022 insn = emit_move_insn (gen_rtx_REG (DImode, regno), mem);
2023 add_reg_note (insn, REG_CFA_RESTORE, gen_rtx_REG (DImode, regno));
2025 start_offset += increment;
2027 RTX_FRAME_RELATED_P (insn) = 1;
2031 aarch64_save_or_restore_fprs (start_offset, increment, restore, base_rtx);
2035 /* AArch64 stack frames generated by this compiler look like:
2037 +-------------------------------+
2039 | incoming stack arguments |
2041 +-------------------------------+ <-- arg_pointer_rtx
2043 | callee-allocated save area |
2044 | for register varargs |
2046 +-------------------------------+ <-- frame_pointer_rtx
2050 +-------------------------------+
2052 +-------------------------------+ |
2055 | callee-saved registers | | frame.saved_regs_size
2057 +-------------------------------+ |
2059 +-------------------------------+ |
2061 P +-------------------------------+ <-- hard_frame_pointer_rtx
2062 | dynamic allocation |
2063 +-------------------------------+
2065 | outgoing stack arguments |
2067 +-------------------------------+ <-- stack_pointer_rtx
2069 Dynamic stack allocations such as alloca insert data at point P.
2070 They decrease stack_pointer_rtx but leave frame_pointer_rtx and
2071 hard_frame_pointer_rtx unchanged. */
2073 /* Generate the prologue instructions for entry into a function.
2074 Establish the stack frame by decreasing the stack pointer with a
2075 properly calculated size and, if necessary, create a frame record
2076 filled with the values of LR and previous frame pointer. The
2077 current FP is also set up if it is in use. */
2080 aarch64_expand_prologue (void)
2082 /* sub sp, sp, #<frame_size>
2083 stp {fp, lr}, [sp, #<frame_size> - 16]
2084 add fp, sp, #<frame_size> - hardfp_offset
2085 stp {cs_reg}, [fp, #-16] etc.
2087 sub sp, sp, <final_adjustment_if_any>
2089 HOST_WIDE_INT original_frame_size; /* local variables + vararg save */
2090 HOST_WIDE_INT frame_size, offset;
2091 HOST_WIDE_INT fp_offset; /* FP offset from SP */
2094 aarch64_layout_frame ();
2095 original_frame_size = get_frame_size () + cfun->machine->saved_varargs_size;
2096 gcc_assert ((!cfun->machine->saved_varargs_size || cfun->stdarg)
2097 && (cfun->stdarg || !cfun->machine->saved_varargs_size));
2098 frame_size = (original_frame_size + cfun->machine->frame.saved_regs_size
2099 + crtl->outgoing_args_size);
2100 offset = frame_size = AARCH64_ROUND_UP (frame_size,
2101 STACK_BOUNDARY / BITS_PER_UNIT);
2103 if (flag_stack_usage_info)
2104 current_function_static_stack_size = frame_size;
2107 - original_frame_size
2108 - cfun->machine->frame.saved_regs_size);
2110 /* Store pairs and load pairs have a range only -512 to 504. */
2113 /* When the frame has a large size, an initial decrease is done on
2114 the stack pointer to jump over the callee-allocated save area for
2115 register varargs, the local variable area and/or the callee-saved
2116 register area. This will allow the pre-index write-back
2117 store pair instructions to be used for setting up the stack frame
2119 offset = original_frame_size + cfun->machine->frame.saved_regs_size;
2121 offset = cfun->machine->frame.saved_regs_size;
2123 frame_size -= (offset + crtl->outgoing_args_size);
2126 if (frame_size >= 0x1000000)
2128 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2129 emit_move_insn (op0, GEN_INT (-frame_size));
2130 emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2131 aarch64_set_frame_expr (gen_rtx_SET
2132 (Pmode, stack_pointer_rtx,
2133 plus_constant (Pmode,
2137 else if (frame_size > 0)
2139 if ((frame_size & 0xfff) != frame_size)
2141 insn = emit_insn (gen_add2_insn
2143 GEN_INT (-(frame_size
2144 & ~(HOST_WIDE_INT)0xfff))));
2145 RTX_FRAME_RELATED_P (insn) = 1;
2147 if ((frame_size & 0xfff) != 0)
2149 insn = emit_insn (gen_add2_insn
2151 GEN_INT (-(frame_size
2152 & (HOST_WIDE_INT)0xfff))));
2153 RTX_FRAME_RELATED_P (insn) = 1;
2162 /* Save the frame pointer and lr if the frame pointer is needed
2163 first. Make the frame pointer point to the location of the
2164 old frame pointer on the stack. */
2165 if (frame_pointer_needed)
2171 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2172 GEN_INT (-offset)));
2173 RTX_FRAME_RELATED_P (insn) = 1;
2174 aarch64_set_frame_expr (gen_rtx_SET
2175 (Pmode, stack_pointer_rtx,
2176 gen_rtx_MINUS (Pmode,
2178 GEN_INT (offset))));
2179 mem_fp = gen_frame_mem (DImode,
2180 plus_constant (Pmode,
2183 mem_lr = gen_frame_mem (DImode,
2184 plus_constant (Pmode,
2188 insn = emit_insn (gen_store_pairdi (mem_fp,
2189 hard_frame_pointer_rtx,
2191 gen_rtx_REG (DImode,
2196 insn = emit_insn (gen_storewb_pairdi_di
2197 (stack_pointer_rtx, stack_pointer_rtx,
2198 hard_frame_pointer_rtx,
2199 gen_rtx_REG (DImode, LR_REGNUM),
2201 GEN_INT (GET_MODE_SIZE (DImode) - offset)));
2202 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2205 /* The first part of a frame-related parallel insn is always
2206 assumed to be relevant to the frame calculations;
2207 subsequent parts, are only frame-related if explicitly
2209 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2210 RTX_FRAME_RELATED_P (insn) = 1;
2212 /* Set up frame pointer to point to the location of the
2213 previous frame pointer on the stack. */
2214 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2216 GEN_INT (fp_offset)));
2217 aarch64_set_frame_expr (gen_rtx_SET
2218 (Pmode, hard_frame_pointer_rtx,
2219 plus_constant (Pmode,
2222 RTX_FRAME_RELATED_P (insn) = 1;
2223 insn = emit_insn (gen_stack_tie (stack_pointer_rtx,
2224 hard_frame_pointer_rtx));
2228 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2229 GEN_INT (-offset)));
2230 RTX_FRAME_RELATED_P (insn) = 1;
2233 aarch64_save_or_restore_callee_save_registers
2234 (fp_offset + cfun->machine->frame.hardfp_offset, 0);
2237 /* when offset >= 512,
2238 sub sp, sp, #<outgoing_args_size> */
2239 if (frame_size > -1)
2241 if (crtl->outgoing_args_size > 0)
2243 insn = emit_insn (gen_add2_insn
2245 GEN_INT (- crtl->outgoing_args_size)));
2246 RTX_FRAME_RELATED_P (insn) = 1;
2251 /* Generate the epilogue instructions for returning from a function. */
2253 aarch64_expand_epilogue (bool for_sibcall)
2255 HOST_WIDE_INT original_frame_size, frame_size, offset;
2256 HOST_WIDE_INT fp_offset;
2260 aarch64_layout_frame ();
2261 original_frame_size = get_frame_size () + cfun->machine->saved_varargs_size;
2262 frame_size = (original_frame_size + cfun->machine->frame.saved_regs_size
2263 + crtl->outgoing_args_size);
2264 offset = frame_size = AARCH64_ROUND_UP (frame_size,
2265 STACK_BOUNDARY / BITS_PER_UNIT);
2268 - original_frame_size
2269 - cfun->machine->frame.saved_regs_size);
2271 cfa_reg = frame_pointer_needed ? hard_frame_pointer_rtx : stack_pointer_rtx;
2273 /* Store pairs and load pairs have a range only -512 to 504. */
2276 offset = original_frame_size + cfun->machine->frame.saved_regs_size;
2278 offset = cfun->machine->frame.saved_regs_size;
2280 frame_size -= (offset + crtl->outgoing_args_size);
2282 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2284 insn = emit_insn (gen_add2_insn
2286 GEN_INT (crtl->outgoing_args_size)));
2287 RTX_FRAME_RELATED_P (insn) = 1;
2293 /* If there were outgoing arguments or we've done dynamic stack
2294 allocation, then restore the stack pointer from the frame
2295 pointer. This is at most one insn and more efficient than using
2296 GCC's internal mechanism. */
2297 if (frame_pointer_needed
2298 && (crtl->outgoing_args_size || cfun->calls_alloca))
2300 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2301 hard_frame_pointer_rtx,
2302 GEN_INT (- fp_offset)));
2303 RTX_FRAME_RELATED_P (insn) = 1;
2304 /* As SP is set to (FP - fp_offset), according to the rules in
2305 dwarf2cfi.c:dwarf2out_frame_debug_expr, CFA should be calculated
2306 from the value of SP from now on. */
2307 cfa_reg = stack_pointer_rtx;
2310 aarch64_save_or_restore_callee_save_registers
2311 (fp_offset + cfun->machine->frame.hardfp_offset, 1);
2313 /* Restore the frame pointer and lr if the frame pointer is needed. */
2316 if (frame_pointer_needed)
2322 mem_fp = gen_frame_mem (DImode,
2323 plus_constant (Pmode,
2326 mem_lr = gen_frame_mem (DImode,
2327 plus_constant (Pmode,
2331 insn = emit_insn (gen_load_pairdi (hard_frame_pointer_rtx,
2333 gen_rtx_REG (DImode,
2339 insn = emit_insn (gen_loadwb_pairdi_di
2342 hard_frame_pointer_rtx,
2343 gen_rtx_REG (DImode, LR_REGNUM),
2345 GEN_INT (GET_MODE_SIZE (DImode) + offset)));
2346 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2347 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2348 (gen_rtx_SET (Pmode, stack_pointer_rtx,
2349 plus_constant (Pmode, cfa_reg,
2353 /* The first part of a frame-related parallel insn
2354 is always assumed to be relevant to the frame
2355 calculations; subsequent parts, are only
2356 frame-related if explicitly marked. */
2357 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2358 RTX_FRAME_RELATED_P (insn) = 1;
2359 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
2360 add_reg_note (insn, REG_CFA_RESTORE,
2361 gen_rtx_REG (DImode, LR_REGNUM));
2365 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2367 RTX_FRAME_RELATED_P (insn) = 1;
2372 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2374 RTX_FRAME_RELATED_P (insn) = 1;
2378 /* Stack adjustment for exception handler. */
2379 if (crtl->calls_eh_return)
2381 /* We need to unwind the stack by the offset computed by
2382 EH_RETURN_STACKADJ_RTX. However, at this point the CFA is
2383 based on SP. Ideally we would update the SP and define the
2384 CFA along the lines of:
2386 SP = SP + EH_RETURN_STACKADJ_RTX
2387 (regnote CFA = SP - EH_RETURN_STACKADJ_RTX)
2389 However the dwarf emitter only understands a constant
2392 The solution chosen here is to use the otherwise unused IP0
2393 as a temporary register to hold the current SP value. The
2394 CFA is described using IP0 then SP is modified. */
2396 rtx ip0 = gen_rtx_REG (DImode, IP0_REGNUM);
2398 insn = emit_move_insn (ip0, stack_pointer_rtx);
2399 add_reg_note (insn, REG_CFA_DEF_CFA, ip0);
2400 RTX_FRAME_RELATED_P (insn) = 1;
2402 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2404 /* Ensure the assignment to IP0 does not get optimized away. */
2408 if (frame_size > -1)
2410 if (frame_size >= 0x1000000)
2412 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2413 emit_move_insn (op0, GEN_INT (frame_size));
2414 emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2415 aarch64_set_frame_expr (gen_rtx_SET
2416 (Pmode, stack_pointer_rtx,
2417 plus_constant (Pmode,
2421 else if (frame_size > 0)
2423 if ((frame_size & 0xfff) != 0)
2425 insn = emit_insn (gen_add2_insn
2427 GEN_INT ((frame_size
2428 & (HOST_WIDE_INT) 0xfff))));
2429 RTX_FRAME_RELATED_P (insn) = 1;
2431 if ((frame_size & 0xfff) != frame_size)
2433 insn = emit_insn (gen_add2_insn
2435 GEN_INT ((frame_size
2436 & ~ (HOST_WIDE_INT) 0xfff))));
2437 RTX_FRAME_RELATED_P (insn) = 1;
2441 aarch64_set_frame_expr (gen_rtx_SET (Pmode, stack_pointer_rtx,
2442 plus_constant (Pmode,
2447 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2449 emit_jump_insn (ret_rtx);
2452 /* Return the place to copy the exception unwinding return address to.
2453 This will probably be a stack slot, but could (in theory be the
2454 return register). */
2456 aarch64_final_eh_return_addr (void)
2458 HOST_WIDE_INT original_frame_size, frame_size, offset, fp_offset;
2459 aarch64_layout_frame ();
2460 original_frame_size = get_frame_size () + cfun->machine->saved_varargs_size;
2461 frame_size = (original_frame_size + cfun->machine->frame.saved_regs_size
2462 + crtl->outgoing_args_size);
2463 offset = frame_size = AARCH64_ROUND_UP (frame_size,
2464 STACK_BOUNDARY / BITS_PER_UNIT);
2466 - original_frame_size
2467 - cfun->machine->frame.saved_regs_size;
2469 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2470 return gen_rtx_REG (DImode, LR_REGNUM);
2472 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2473 result in a store to save LR introduced by builtin_eh_return () being
2474 incorrectly deleted because the alias is not detected.
2475 So in the calculation of the address to copy the exception unwinding
2476 return address to, we note 2 cases.
2477 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2478 we return a SP-relative location since all the addresses are SP-relative
2479 in this case. This prevents the store from being optimized away.
2480 If the fp_offset is not 0, then the addresses will be FP-relative and
2481 therefore we return a FP-relative location. */
2483 if (frame_pointer_needed)
2486 return gen_frame_mem (DImode,
2487 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2489 return gen_frame_mem (DImode,
2490 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2493 /* If FP is not needed, we calculate the location of LR, which would be
2494 at the top of the saved registers block. */
2496 return gen_frame_mem (DImode,
2497 plus_constant (Pmode,
2500 + cfun->machine->frame.saved_regs_size
2501 - 2 * UNITS_PER_WORD));
2504 /* Possibly output code to build up a constant in a register. For
2505 the benefit of the costs infrastructure, returns the number of
2506 instructions which would be emitted. GENERATE inhibits or
2507 enables code generation. */
2510 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2514 if (aarch64_bitmask_imm (val, DImode))
2517 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2525 HOST_WIDE_INT valp = val >> 16;
2529 for (i = 16; i < 64; i += 16)
2531 valm = (valp & 0xffff);
2542 /* zcount contains the number of additional MOVK instructions
2543 required if the constant is built up with an initial MOVZ instruction,
2544 while ncount is the number of MOVK instructions required if starting
2545 with a MOVN instruction. Choose the sequence that yields the fewest
2546 number of instructions, preferring MOVZ instructions when they are both
2548 if (ncount < zcount)
2551 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2552 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2559 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2560 GEN_INT (val & 0xffff));
2567 for (i = 16; i < 64; i += 16)
2569 if ((val & 0xffff) != tval)
2572 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2574 GEN_INT (val & 0xffff)));
2584 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2586 HOST_WIDE_INT mdelta = delta;
2587 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2588 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2593 if (mdelta >= 4096 * 4096)
2595 (void) aarch64_build_constant (scratchreg, delta, true);
2596 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2598 else if (mdelta > 0)
2602 emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2603 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2605 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2606 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2608 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2609 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2611 if (mdelta % 4096 != 0)
2613 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2614 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2615 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
2620 /* Output code to add DELTA to the first argument, and then jump
2621 to FUNCTION. Used for C++ multiple inheritance. */
2623 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2624 HOST_WIDE_INT delta,
2625 HOST_WIDE_INT vcall_offset,
2628 /* The this pointer is always in x0. Note that this differs from
2629 Arm where the this pointer maybe bumped to r1 if r0 is required
2630 to return a pointer to an aggregate. On AArch64 a result value
2631 pointer will be in x8. */
2632 int this_regno = R0_REGNUM;
2633 rtx this_rtx, temp0, temp1, addr, insn, funexp;
2635 reload_completed = 1;
2636 emit_note (NOTE_INSN_PROLOGUE_END);
2638 if (vcall_offset == 0)
2639 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2642 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2644 this_rtx = gen_rtx_REG (Pmode, this_regno);
2645 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2646 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2651 if (delta >= -256 && delta < 256)
2652 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2653 plus_constant (Pmode, this_rtx, delta));
2655 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2658 if (Pmode == ptr_mode)
2659 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2661 aarch64_emit_move (temp0,
2662 gen_rtx_ZERO_EXTEND (Pmode,
2663 gen_rtx_MEM (ptr_mode, addr)));
2665 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2666 addr = plus_constant (Pmode, temp0, vcall_offset);
2669 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2670 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2673 if (Pmode == ptr_mode)
2674 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
2676 aarch64_emit_move (temp1,
2677 gen_rtx_SIGN_EXTEND (Pmode,
2678 gen_rtx_MEM (ptr_mode, addr)));
2680 emit_insn (gen_add2_insn (this_rtx, temp1));
2683 /* Generate a tail call to the target function. */
2684 if (!TREE_USED (function))
2686 assemble_external (function);
2687 TREE_USED (function) = 1;
2689 funexp = XEXP (DECL_RTL (function), 0);
2690 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2691 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2692 SIBLING_CALL_P (insn) = 1;
2694 insn = get_insns ();
2695 shorten_branches (insn);
2696 final_start_function (insn, file, 1);
2697 final (insn, file, 1);
2698 final_end_function ();
2700 /* Stop pretending to be a post-reload pass. */
2701 reload_completed = 0;
2705 aarch64_tls_operand_p_1 (rtx *x, void *data ATTRIBUTE_UNUSED)
2707 if (GET_CODE (*x) == SYMBOL_REF)
2708 return SYMBOL_REF_TLS_MODEL (*x) != 0;
2710 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2711 TLS offsets, not real symbol references. */
2712 if (GET_CODE (*x) == UNSPEC
2713 && XINT (*x, 1) == UNSPEC_TLS)
2720 aarch64_tls_referenced_p (rtx x)
2722 if (!TARGET_HAVE_TLS)
2725 return for_each_rtx (&x, aarch64_tls_operand_p_1, NULL);
2730 aarch64_bitmasks_cmp (const void *i1, const void *i2)
2732 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
2733 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
2744 aarch64_build_bitmask_table (void)
2746 unsigned HOST_WIDE_INT mask, imm;
2747 unsigned int log_e, e, s, r;
2748 unsigned int nimms = 0;
2750 for (log_e = 1; log_e <= 6; log_e++)
2754 mask = ~(HOST_WIDE_INT) 0;
2756 mask = ((HOST_WIDE_INT) 1 << e) - 1;
2757 for (s = 1; s < e; s++)
2759 for (r = 0; r < e; r++)
2761 /* set s consecutive bits to 1 (s < 64) */
2762 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
2763 /* rotate right by r */
2765 imm = ((imm >> r) | (imm << (e - r))) & mask;
2766 /* replicate the constant depending on SIMD size */
2768 case 1: imm |= (imm << 2);
2769 case 2: imm |= (imm << 4);
2770 case 3: imm |= (imm << 8);
2771 case 4: imm |= (imm << 16);
2772 case 5: imm |= (imm << 32);
2778 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
2779 aarch64_bitmasks[nimms++] = imm;
2784 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
2785 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
2786 aarch64_bitmasks_cmp);
2790 /* Return true if val can be encoded as a 12-bit unsigned immediate with
2791 a left shift of 0 or 12 bits. */
2793 aarch64_uimm12_shift (HOST_WIDE_INT val)
2795 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
2796 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
2801 /* Return true if val is an immediate that can be loaded into a
2802 register by a MOVZ instruction. */
2804 aarch64_movw_imm (HOST_WIDE_INT val, enum machine_mode mode)
2806 if (GET_MODE_SIZE (mode) > 4)
2808 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
2809 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
2814 /* Ignore sign extension. */
2815 val &= (HOST_WIDE_INT) 0xffffffff;
2817 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
2818 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
2822 /* Return true if val is a valid bitmask immediate. */
2824 aarch64_bitmask_imm (HOST_WIDE_INT val, enum machine_mode mode)
2826 if (GET_MODE_SIZE (mode) < 8)
2828 /* Replicate bit pattern. */
2829 val &= (HOST_WIDE_INT) 0xffffffff;
2832 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
2833 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
2837 /* Return true if val is an immediate that can be loaded into a
2838 register in a single instruction. */
2840 aarch64_move_imm (HOST_WIDE_INT val, enum machine_mode mode)
2842 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
2844 return aarch64_bitmask_imm (val, mode);
2848 aarch64_cannot_force_const_mem (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
2852 if (GET_CODE (x) == HIGH)
2855 split_const (x, &base, &offset);
2856 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
2858 if (aarch64_classify_symbol (base, SYMBOL_CONTEXT_ADR)
2859 != SYMBOL_FORCE_TO_MEM)
2862 /* Avoid generating a 64-bit relocation in ILP32; leave
2863 to aarch64_expand_mov_immediate to handle it properly. */
2864 return mode != ptr_mode;
2867 return aarch64_tls_referenced_p (x);
2870 /* Return true if register REGNO is a valid index register.
2871 STRICT_P is true if REG_OK_STRICT is in effect. */
2874 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
2876 if (!HARD_REGISTER_NUM_P (regno))
2884 regno = reg_renumber[regno];
2886 return GP_REGNUM_P (regno);
2889 /* Return true if register REGNO is a valid base register for mode MODE.
2890 STRICT_P is true if REG_OK_STRICT is in effect. */
2893 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
2895 if (!HARD_REGISTER_NUM_P (regno))
2903 regno = reg_renumber[regno];
2906 /* The fake registers will be eliminated to either the stack or
2907 hard frame pointer, both of which are usually valid base registers.
2908 Reload deals with the cases where the eliminated form isn't valid. */
2909 return (GP_REGNUM_P (regno)
2910 || regno == SP_REGNUM
2911 || regno == FRAME_POINTER_REGNUM
2912 || regno == ARG_POINTER_REGNUM);
2915 /* Return true if X is a valid base register for mode MODE.
2916 STRICT_P is true if REG_OK_STRICT is in effect. */
2919 aarch64_base_register_rtx_p (rtx x, bool strict_p)
2921 if (!strict_p && GET_CODE (x) == SUBREG)
2924 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
2927 /* Return true if address offset is a valid index. If it is, fill in INFO
2928 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
2931 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
2932 enum machine_mode mode, bool strict_p)
2934 enum aarch64_address_type type;
2939 if ((REG_P (x) || GET_CODE (x) == SUBREG)
2940 && GET_MODE (x) == Pmode)
2942 type = ADDRESS_REG_REG;
2946 /* (sign_extend:DI (reg:SI)) */
2947 else if ((GET_CODE (x) == SIGN_EXTEND
2948 || GET_CODE (x) == ZERO_EXTEND)
2949 && GET_MODE (x) == DImode
2950 && GET_MODE (XEXP (x, 0)) == SImode)
2952 type = (GET_CODE (x) == SIGN_EXTEND)
2953 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
2954 index = XEXP (x, 0);
2957 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
2958 else if (GET_CODE (x) == MULT
2959 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
2960 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
2961 && GET_MODE (XEXP (x, 0)) == DImode
2962 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
2963 && CONST_INT_P (XEXP (x, 1)))
2965 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
2966 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
2967 index = XEXP (XEXP (x, 0), 0);
2968 shift = exact_log2 (INTVAL (XEXP (x, 1)));
2970 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
2971 else if (GET_CODE (x) == ASHIFT
2972 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
2973 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
2974 && GET_MODE (XEXP (x, 0)) == DImode
2975 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
2976 && CONST_INT_P (XEXP (x, 1)))
2978 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
2979 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
2980 index = XEXP (XEXP (x, 0), 0);
2981 shift = INTVAL (XEXP (x, 1));
2983 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
2984 else if ((GET_CODE (x) == SIGN_EXTRACT
2985 || GET_CODE (x) == ZERO_EXTRACT)
2986 && GET_MODE (x) == DImode
2987 && GET_CODE (XEXP (x, 0)) == MULT
2988 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
2989 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
2991 type = (GET_CODE (x) == SIGN_EXTRACT)
2992 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
2993 index = XEXP (XEXP (x, 0), 0);
2994 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
2995 if (INTVAL (XEXP (x, 1)) != 32 + shift
2996 || INTVAL (XEXP (x, 2)) != 0)
2999 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3000 (const_int 0xffffffff<<shift)) */
3001 else if (GET_CODE (x) == AND
3002 && GET_MODE (x) == DImode
3003 && GET_CODE (XEXP (x, 0)) == MULT
3004 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3005 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3006 && CONST_INT_P (XEXP (x, 1)))
3008 type = ADDRESS_REG_UXTW;
3009 index = XEXP (XEXP (x, 0), 0);
3010 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3011 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3014 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3015 else if ((GET_CODE (x) == SIGN_EXTRACT
3016 || GET_CODE (x) == ZERO_EXTRACT)
3017 && GET_MODE (x) == DImode
3018 && GET_CODE (XEXP (x, 0)) == ASHIFT
3019 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3020 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3022 type = (GET_CODE (x) == SIGN_EXTRACT)
3023 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3024 index = XEXP (XEXP (x, 0), 0);
3025 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3026 if (INTVAL (XEXP (x, 1)) != 32 + shift
3027 || INTVAL (XEXP (x, 2)) != 0)
3030 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3031 (const_int 0xffffffff<<shift)) */
3032 else if (GET_CODE (x) == AND
3033 && GET_MODE (x) == DImode
3034 && GET_CODE (XEXP (x, 0)) == ASHIFT
3035 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3036 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3037 && CONST_INT_P (XEXP (x, 1)))
3039 type = ADDRESS_REG_UXTW;
3040 index = XEXP (XEXP (x, 0), 0);
3041 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3042 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3045 /* (mult:P (reg:P) (const_int scale)) */
3046 else if (GET_CODE (x) == MULT
3047 && GET_MODE (x) == Pmode
3048 && GET_MODE (XEXP (x, 0)) == Pmode
3049 && CONST_INT_P (XEXP (x, 1)))
3051 type = ADDRESS_REG_REG;
3052 index = XEXP (x, 0);
3053 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3055 /* (ashift:P (reg:P) (const_int shift)) */
3056 else if (GET_CODE (x) == ASHIFT
3057 && GET_MODE (x) == Pmode
3058 && GET_MODE (XEXP (x, 0)) == Pmode
3059 && CONST_INT_P (XEXP (x, 1)))
3061 type = ADDRESS_REG_REG;
3062 index = XEXP (x, 0);
3063 shift = INTVAL (XEXP (x, 1));
3068 if (GET_CODE (index) == SUBREG)
3069 index = SUBREG_REG (index);
3072 (shift > 0 && shift <= 3
3073 && (1 << shift) == GET_MODE_SIZE (mode)))
3075 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3078 info->offset = index;
3079 info->shift = shift;
3087 offset_7bit_signed_scaled_p (enum machine_mode mode, HOST_WIDE_INT offset)
3089 return (offset >= -64 * GET_MODE_SIZE (mode)
3090 && offset < 64 * GET_MODE_SIZE (mode)
3091 && offset % GET_MODE_SIZE (mode) == 0);
3095 offset_9bit_signed_unscaled_p (enum machine_mode mode ATTRIBUTE_UNUSED,
3096 HOST_WIDE_INT offset)
3098 return offset >= -256 && offset < 256;
3102 offset_12bit_unsigned_scaled_p (enum machine_mode mode, HOST_WIDE_INT offset)
3105 && offset < 4096 * GET_MODE_SIZE (mode)
3106 && offset % GET_MODE_SIZE (mode) == 0);
3109 /* Return true if X is a valid address for machine mode MODE. If it is,
3110 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3111 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3114 aarch64_classify_address (struct aarch64_address_info *info,
3115 rtx x, enum machine_mode mode,
3116 RTX_CODE outer_code, bool strict_p)
3118 enum rtx_code code = GET_CODE (x);
3120 bool allow_reg_index_p =
3121 outer_code != PARALLEL && GET_MODE_SIZE(mode) != 16;
3123 /* Don't support anything other than POST_INC or REG addressing for
3125 if (aarch64_vector_mode_p (mode)
3126 && (code != POST_INC && code != REG))
3133 info->type = ADDRESS_REG_IMM;
3135 info->offset = const0_rtx;
3136 return aarch64_base_register_rtx_p (x, strict_p);
3141 if (GET_MODE_SIZE (mode) != 0
3142 && CONST_INT_P (op1)
3143 && aarch64_base_register_rtx_p (op0, strict_p))
3145 HOST_WIDE_INT offset = INTVAL (op1);
3147 info->type = ADDRESS_REG_IMM;
3151 /* TImode and TFmode values are allowed in both pairs of X
3152 registers and individual Q registers. The available
3154 X,X: 7-bit signed scaled offset
3155 Q: 9-bit signed offset
3156 We conservatively require an offset representable in either mode.
3158 if (mode == TImode || mode == TFmode)
3159 return (offset_7bit_signed_scaled_p (mode, offset)
3160 && offset_9bit_signed_unscaled_p (mode, offset));
3162 if (outer_code == PARALLEL)
3163 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3164 && offset_7bit_signed_scaled_p (mode, offset));
3166 return (offset_9bit_signed_unscaled_p (mode, offset)
3167 || offset_12bit_unsigned_scaled_p (mode, offset));
3170 if (allow_reg_index_p)
3172 /* Look for base + (scaled/extended) index register. */
3173 if (aarch64_base_register_rtx_p (op0, strict_p)
3174 && aarch64_classify_index (info, op1, mode, strict_p))
3179 if (aarch64_base_register_rtx_p (op1, strict_p)
3180 && aarch64_classify_index (info, op0, mode, strict_p))
3193 info->type = ADDRESS_REG_WB;
3194 info->base = XEXP (x, 0);
3195 info->offset = NULL_RTX;
3196 return aarch64_base_register_rtx_p (info->base, strict_p);
3200 info->type = ADDRESS_REG_WB;
3201 info->base = XEXP (x, 0);
3202 if (GET_CODE (XEXP (x, 1)) == PLUS
3203 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3204 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3205 && aarch64_base_register_rtx_p (info->base, strict_p))
3207 HOST_WIDE_INT offset;
3208 info->offset = XEXP (XEXP (x, 1), 1);
3209 offset = INTVAL (info->offset);
3211 /* TImode and TFmode values are allowed in both pairs of X
3212 registers and individual Q registers. The available
3214 X,X: 7-bit signed scaled offset
3215 Q: 9-bit signed offset
3216 We conservatively require an offset representable in either mode.
3218 if (mode == TImode || mode == TFmode)
3219 return (offset_7bit_signed_scaled_p (mode, offset)
3220 && offset_9bit_signed_unscaled_p (mode, offset));
3222 if (outer_code == PARALLEL)
3223 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3224 && offset_7bit_signed_scaled_p (mode, offset));
3226 return offset_9bit_signed_unscaled_p (mode, offset);
3233 /* load literal: pc-relative constant pool entry. Only supported
3234 for SI mode or larger. */
3235 info->type = ADDRESS_SYMBOLIC;
3236 if (outer_code != PARALLEL && GET_MODE_SIZE (mode) >= 4)
3240 split_const (x, &sym, &addend);
3241 return (GET_CODE (sym) == LABEL_REF
3242 || (GET_CODE (sym) == SYMBOL_REF
3243 && CONSTANT_POOL_ADDRESS_P (sym)));
3248 info->type = ADDRESS_LO_SUM;
3249 info->base = XEXP (x, 0);
3250 info->offset = XEXP (x, 1);
3251 if (allow_reg_index_p
3252 && aarch64_base_register_rtx_p (info->base, strict_p))
3255 split_const (info->offset, &sym, &offs);
3256 if (GET_CODE (sym) == SYMBOL_REF
3257 && (aarch64_classify_symbol (sym, SYMBOL_CONTEXT_MEM)
3258 == SYMBOL_SMALL_ABSOLUTE))
3260 /* The symbol and offset must be aligned to the access size. */
3262 unsigned int ref_size;
3264 if (CONSTANT_POOL_ADDRESS_P (sym))
3265 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3266 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3268 tree exp = SYMBOL_REF_DECL (sym);
3269 align = TYPE_ALIGN (TREE_TYPE (exp));
3270 align = CONSTANT_ALIGNMENT (exp, align);
3272 else if (SYMBOL_REF_DECL (sym))
3273 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3274 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3275 && SYMBOL_REF_BLOCK (sym) != NULL)
3276 align = SYMBOL_REF_BLOCK (sym)->alignment;
3278 align = BITS_PER_UNIT;
3280 ref_size = GET_MODE_SIZE (mode);
3282 ref_size = GET_MODE_SIZE (DImode);
3284 return ((INTVAL (offs) & (ref_size - 1)) == 0
3285 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3296 aarch64_symbolic_address_p (rtx x)
3300 split_const (x, &x, &offset);
3301 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3304 /* Classify the base of symbolic expression X, given that X appears in
3307 enum aarch64_symbol_type
3308 aarch64_classify_symbolic_expression (rtx x,
3309 enum aarch64_symbol_context context)
3313 split_const (x, &x, &offset);
3314 return aarch64_classify_symbol (x, context);
3318 /* Return TRUE if X is a legitimate address for accessing memory in
3321 aarch64_legitimate_address_hook_p (enum machine_mode mode, rtx x, bool strict_p)
3323 struct aarch64_address_info addr;
3325 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3328 /* Return TRUE if X is a legitimate address for accessing memory in
3329 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3332 aarch64_legitimate_address_p (enum machine_mode mode, rtx x,
3333 RTX_CODE outer_code, bool strict_p)
3335 struct aarch64_address_info addr;
3337 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3340 /* Return TRUE if rtx X is immediate constant 0.0 */
3342 aarch64_float_const_zero_rtx_p (rtx x)
3346 if (GET_MODE (x) == VOIDmode)
3349 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3350 if (REAL_VALUE_MINUS_ZERO (r))
3351 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3352 return REAL_VALUES_EQUAL (r, dconst0);
3355 /* Return the fixed registers used for condition codes. */
3358 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3361 *p2 = INVALID_REGNUM;
3366 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3368 /* All floating point compares return CCFP if it is an equality
3369 comparison, and CCFPE otherwise. */
3370 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3397 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3399 && (code == EQ || code == NE || code == LT || code == GE)
3400 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3401 || GET_CODE (x) == NEG))
3404 /* A compare with a shifted operand. Because of canonicalization,
3405 the comparison will have to be swapped when we emit the assembly
3407 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3408 && (GET_CODE (y) == REG || GET_CODE (y) == SUBREG)
3409 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3410 || GET_CODE (x) == LSHIFTRT
3411 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3414 /* Similarly for a negated operand, but we can only do this for
3416 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3417 && (GET_CODE (y) == REG || GET_CODE (y) == SUBREG)
3418 && (code == EQ || code == NE)
3419 && GET_CODE (x) == NEG)
3422 /* A compare of a mode narrower than SI mode against zero can be done
3423 by extending the value in the comparison. */
3424 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3426 /* Only use sign-extension if we really need it. */
3427 return ((code == GT || code == GE || code == LE || code == LT)
3428 ? CC_SESWPmode : CC_ZESWPmode);
3430 /* For everything else, return CCmode. */
3435 aarch64_get_condition_code (rtx x)
3437 enum machine_mode mode = GET_MODE (XEXP (x, 0));
3438 enum rtx_code comp_code = GET_CODE (x);
3440 if (GET_MODE_CLASS (mode) != MODE_CC)
3441 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3449 case GE: return AARCH64_GE;
3450 case GT: return AARCH64_GT;
3451 case LE: return AARCH64_LS;
3452 case LT: return AARCH64_MI;
3453 case NE: return AARCH64_NE;
3454 case EQ: return AARCH64_EQ;
3455 case ORDERED: return AARCH64_VC;
3456 case UNORDERED: return AARCH64_VS;
3457 case UNLT: return AARCH64_LT;
3458 case UNLE: return AARCH64_LE;
3459 case UNGT: return AARCH64_HI;
3460 case UNGE: return AARCH64_PL;
3461 default: gcc_unreachable ();
3468 case NE: return AARCH64_NE;
3469 case EQ: return AARCH64_EQ;
3470 case GE: return AARCH64_GE;
3471 case GT: return AARCH64_GT;
3472 case LE: return AARCH64_LE;
3473 case LT: return AARCH64_LT;
3474 case GEU: return AARCH64_CS;
3475 case GTU: return AARCH64_HI;
3476 case LEU: return AARCH64_LS;
3477 case LTU: return AARCH64_CC;
3478 default: gcc_unreachable ();
3487 case NE: return AARCH64_NE;
3488 case EQ: return AARCH64_EQ;
3489 case GE: return AARCH64_LE;
3490 case GT: return AARCH64_LT;
3491 case LE: return AARCH64_GE;
3492 case LT: return AARCH64_GT;
3493 case GEU: return AARCH64_LS;
3494 case GTU: return AARCH64_CC;
3495 case LEU: return AARCH64_CS;
3496 case LTU: return AARCH64_HI;
3497 default: gcc_unreachable ();
3504 case NE: return AARCH64_NE;
3505 case EQ: return AARCH64_EQ;
3506 case GE: return AARCH64_PL;
3507 case LT: return AARCH64_MI;
3508 default: gcc_unreachable ();
3515 case NE: return AARCH64_NE;
3516 case EQ: return AARCH64_EQ;
3517 default: gcc_unreachable ();
3528 bit_count (unsigned HOST_WIDE_INT value)
3542 aarch64_print_operand (FILE *f, rtx x, char code)
3546 /* An integer or symbol address without a preceding # sign. */
3548 switch (GET_CODE (x))
3551 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
3555 output_addr_const (f, x);
3559 if (GET_CODE (XEXP (x, 0)) == PLUS
3560 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
3562 output_addr_const (f, x);
3568 output_operand_lossage ("Unsupported operand for code '%c'", code);
3573 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
3577 if (GET_CODE (x) != CONST_INT
3578 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
3580 output_operand_lossage ("invalid operand for '%%%c'", code);
3596 output_operand_lossage ("invalid operand for '%%%c'", code);
3606 /* Print N such that 2^N == X. */
3607 if (GET_CODE (x) != CONST_INT || (n = exact_log2 (INTVAL (x))) < 0)
3609 output_operand_lossage ("invalid operand for '%%%c'", code);
3613 asm_fprintf (f, "%d", n);
3618 /* Print the number of non-zero bits in X (a const_int). */
3619 if (GET_CODE (x) != CONST_INT)
3621 output_operand_lossage ("invalid operand for '%%%c'", code);
3625 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
3629 /* Print the higher numbered register of a pair (TImode) of regs. */
3630 if (GET_CODE (x) != REG || !GP_REGNUM_P (REGNO (x) + 1))
3632 output_operand_lossage ("invalid operand for '%%%c'", code);
3636 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
3640 /* Print a condition (eq, ne, etc). */
3642 /* CONST_TRUE_RTX means always -- that's the default. */
3643 if (x == const_true_rtx)
3646 if (!COMPARISON_P (x))
3648 output_operand_lossage ("invalid operand for '%%%c'", code);
3652 fputs (aarch64_condition_codes[aarch64_get_condition_code (x)], f);
3656 /* Print the inverse of a condition (eq <-> ne, etc). */
3658 /* CONST_TRUE_RTX means never -- that's the default. */
3659 if (x == const_true_rtx)
3665 if (!COMPARISON_P (x))
3667 output_operand_lossage ("invalid operand for '%%%c'", code);
3671 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
3672 (aarch64_get_condition_code (x))], f);
3680 /* Print a scalar FP/SIMD register name. */
3681 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
3683 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
3686 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
3693 /* Print the first FP/SIMD register name in a list. */
3694 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
3696 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
3699 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
3703 /* Print bottom 16 bits of integer constant in hex. */
3704 if (GET_CODE (x) != CONST_INT)
3706 output_operand_lossage ("invalid operand for '%%%c'", code);
3709 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
3714 /* Print a general register name or the zero register (32-bit or
3717 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
3719 asm_fprintf (f, "%czr", code);
3723 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
3725 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
3729 if (REG_P (x) && REGNO (x) == SP_REGNUM)
3731 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
3738 /* Print a normal operand, if it's a general register, then we
3742 output_operand_lossage ("missing operand");
3746 switch (GET_CODE (x))
3749 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
3753 aarch64_memory_reference_mode = GET_MODE (x);
3754 output_address (XEXP (x, 0));
3759 output_addr_const (asm_out_file, x);
3763 asm_fprintf (f, "%wd", INTVAL (x));
3767 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
3769 gcc_assert (aarch64_const_vec_all_same_int_p (x,
3771 HOST_WIDE_INT_MAX));
3772 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
3774 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
3783 /* CONST_DOUBLE can represent a double-width integer.
3784 In this case, the mode of x is VOIDmode. */
3785 if (GET_MODE (x) == VOIDmode)
3787 else if (aarch64_float_const_zero_rtx_p (x))
3792 else if (aarch64_float_const_representable_p (x))
3795 char float_buf[buf_size] = {'\0'};
3797 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3798 real_to_decimal_for_mode (float_buf, &r,
3801 asm_fprintf (asm_out_file, "%s", float_buf);
3805 output_operand_lossage ("invalid constant");
3808 output_operand_lossage ("invalid operand");
3814 if (GET_CODE (x) == HIGH)
3817 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3819 case SYMBOL_SMALL_GOT:
3820 asm_fprintf (asm_out_file, ":got:");
3823 case SYMBOL_SMALL_TLSGD:
3824 asm_fprintf (asm_out_file, ":tlsgd:");
3827 case SYMBOL_SMALL_TLSDESC:
3828 asm_fprintf (asm_out_file, ":tlsdesc:");
3831 case SYMBOL_SMALL_GOTTPREL:
3832 asm_fprintf (asm_out_file, ":gottprel:");
3835 case SYMBOL_SMALL_TPREL:
3836 asm_fprintf (asm_out_file, ":tprel:");
3839 case SYMBOL_TINY_GOT:
3846 output_addr_const (asm_out_file, x);
3850 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3852 case SYMBOL_SMALL_GOT:
3853 asm_fprintf (asm_out_file, ":lo12:");
3856 case SYMBOL_SMALL_TLSGD:
3857 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
3860 case SYMBOL_SMALL_TLSDESC:
3861 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
3864 case SYMBOL_SMALL_GOTTPREL:
3865 asm_fprintf (asm_out_file, ":gottprel_lo12:");
3868 case SYMBOL_SMALL_TPREL:
3869 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
3872 case SYMBOL_TINY_GOT:
3873 asm_fprintf (asm_out_file, ":got:");
3879 output_addr_const (asm_out_file, x);
3884 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3886 case SYMBOL_SMALL_TPREL:
3887 asm_fprintf (asm_out_file, ":tprel_hi12:");
3892 output_addr_const (asm_out_file, x);
3896 output_operand_lossage ("invalid operand prefix '%%%c'", code);
3902 aarch64_print_operand_address (FILE *f, rtx x)
3904 struct aarch64_address_info addr;
3906 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
3910 case ADDRESS_REG_IMM:
3911 if (addr.offset == const0_rtx)
3912 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
3914 asm_fprintf (f, "[%s,%wd]", reg_names [REGNO (addr.base)],
3915 INTVAL (addr.offset));
3918 case ADDRESS_REG_REG:
3919 if (addr.shift == 0)
3920 asm_fprintf (f, "[%s,%s]", reg_names [REGNO (addr.base)],
3921 reg_names [REGNO (addr.offset)]);
3923 asm_fprintf (f, "[%s,%s,lsl %u]", reg_names [REGNO (addr.base)],
3924 reg_names [REGNO (addr.offset)], addr.shift);
3927 case ADDRESS_REG_UXTW:
3928 if (addr.shift == 0)
3929 asm_fprintf (f, "[%s,w%d,uxtw]", reg_names [REGNO (addr.base)],
3930 REGNO (addr.offset) - R0_REGNUM);
3932 asm_fprintf (f, "[%s,w%d,uxtw %u]", reg_names [REGNO (addr.base)],
3933 REGNO (addr.offset) - R0_REGNUM, addr.shift);
3936 case ADDRESS_REG_SXTW:
3937 if (addr.shift == 0)
3938 asm_fprintf (f, "[%s,w%d,sxtw]", reg_names [REGNO (addr.base)],
3939 REGNO (addr.offset) - R0_REGNUM);
3941 asm_fprintf (f, "[%s,w%d,sxtw %u]", reg_names [REGNO (addr.base)],
3942 REGNO (addr.offset) - R0_REGNUM, addr.shift);
3945 case ADDRESS_REG_WB:
3946 switch (GET_CODE (x))
3949 asm_fprintf (f, "[%s,%d]!", reg_names [REGNO (addr.base)],
3950 GET_MODE_SIZE (aarch64_memory_reference_mode));
3953 asm_fprintf (f, "[%s],%d", reg_names [REGNO (addr.base)],
3954 GET_MODE_SIZE (aarch64_memory_reference_mode));
3957 asm_fprintf (f, "[%s,-%d]!", reg_names [REGNO (addr.base)],
3958 GET_MODE_SIZE (aarch64_memory_reference_mode));
3961 asm_fprintf (f, "[%s],-%d", reg_names [REGNO (addr.base)],
3962 GET_MODE_SIZE (aarch64_memory_reference_mode));
3965 asm_fprintf (f, "[%s,%wd]!", reg_names [REGNO (addr.base)],
3966 INTVAL (addr.offset));
3969 asm_fprintf (f, "[%s],%wd", reg_names [REGNO (addr.base)],
3970 INTVAL (addr.offset));
3977 case ADDRESS_LO_SUM:
3978 asm_fprintf (f, "[%s,#:lo12:", reg_names [REGNO (addr.base)]);
3979 output_addr_const (f, addr.offset);
3980 asm_fprintf (f, "]");
3983 case ADDRESS_SYMBOLIC:
3987 output_addr_const (f, x);
3991 aarch64_label_mentioned_p (rtx x)
3996 if (GET_CODE (x) == LABEL_REF)
3999 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4000 referencing instruction, but they are constant offsets, not
4002 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4005 fmt = GET_RTX_FORMAT (GET_CODE (x));
4006 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4012 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4013 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4016 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4023 /* Implement REGNO_REG_CLASS. */
4026 aarch64_regno_regclass (unsigned regno)
4028 if (GP_REGNUM_P (regno))
4031 if (regno == SP_REGNUM)
4034 if (regno == FRAME_POINTER_REGNUM
4035 || regno == ARG_POINTER_REGNUM)
4036 return POINTER_REGS;
4038 if (FP_REGNUM_P (regno))
4039 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4044 /* Try a machine-dependent way of reloading an illegitimate address
4045 operand. If we find one, push the reload and return the new rtx. */
4048 aarch64_legitimize_reload_address (rtx *x_p,
4049 enum machine_mode mode,
4050 int opnum, int type,
4051 int ind_levels ATTRIBUTE_UNUSED)
4055 /* Do not allow mem (plus (reg, const)) if vector mode. */
4056 if (aarch64_vector_mode_p (mode)
4057 && GET_CODE (x) == PLUS
4058 && REG_P (XEXP (x, 0))
4059 && CONST_INT_P (XEXP (x, 1)))
4063 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4064 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4065 opnum, (enum reload_type) type);
4069 /* We must recognize output that we have already generated ourselves. */
4070 if (GET_CODE (x) == PLUS
4071 && GET_CODE (XEXP (x, 0)) == PLUS
4072 && REG_P (XEXP (XEXP (x, 0), 0))
4073 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4074 && CONST_INT_P (XEXP (x, 1)))
4076 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4077 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4078 opnum, (enum reload_type) type);
4082 /* We wish to handle large displacements off a base register by splitting
4083 the addend across an add and the mem insn. This can cut the number of
4084 extra insns needed from 3 to 1. It is only useful for load/store of a
4085 single register with 12 bit offset field. */
4086 if (GET_CODE (x) == PLUS
4087 && REG_P (XEXP (x, 0))
4088 && CONST_INT_P (XEXP (x, 1))
4089 && HARD_REGISTER_P (XEXP (x, 0))
4092 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4094 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4095 HOST_WIDE_INT low = val & 0xfff;
4096 HOST_WIDE_INT high = val - low;
4099 enum machine_mode xmode = GET_MODE (x);
4101 /* In ILP32, xmode can be either DImode or SImode. */
4102 gcc_assert (xmode == DImode || xmode == SImode);
4104 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4105 BLKmode alignment. */
4106 if (GET_MODE_SIZE (mode) == 0)
4109 offs = low % GET_MODE_SIZE (mode);
4111 /* Align misaligned offset by adjusting high part to compensate. */
4114 if (aarch64_uimm12_shift (high + offs))
4123 offs = GET_MODE_SIZE (mode) - offs;
4125 high = high + (low & 0x1000) - offs;
4130 /* Check for overflow. */
4131 if (high + low != val)
4134 cst = GEN_INT (high);
4135 if (!aarch64_uimm12_shift (high))
4136 cst = force_const_mem (xmode, cst);
4138 /* Reload high part into base reg, leaving the low part
4139 in the mem instruction.
4140 Note that replacing this gen_rtx_PLUS with plus_constant is
4141 wrong in this case because we rely on the
4142 (plus (plus reg c1) c2) structure being preserved so that
4143 XEXP (*p, 0) in push_reload below uses the correct term. */
4144 x = gen_rtx_PLUS (xmode,
4145 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4148 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4149 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4150 opnum, (enum reload_type) type);
4159 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4161 enum machine_mode mode,
4162 secondary_reload_info *sri)
4164 /* Without the TARGET_SIMD instructions we cannot move a Q register
4165 to a Q register directly. We need a scratch. */
4166 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4167 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4168 && reg_class_subset_p (rclass, FP_REGS))
4171 sri->icode = CODE_FOR_aarch64_reload_movtf;
4172 else if (mode == TImode)
4173 sri->icode = CODE_FOR_aarch64_reload_movti;
4177 /* A TFmode or TImode memory access should be handled via an FP_REGS
4178 because AArch64 has richer addressing modes for LDR/STR instructions
4179 than LDP/STP instructions. */
4180 if (!TARGET_GENERAL_REGS_ONLY && rclass == CORE_REGS
4181 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4184 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4191 aarch64_can_eliminate (const int from, const int to)
4193 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4194 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4196 if (frame_pointer_needed)
4198 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4200 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4202 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4203 && !cfun->calls_alloca)
4205 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4215 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4217 HOST_WIDE_INT frame_size;
4218 HOST_WIDE_INT offset;
4220 aarch64_layout_frame ();
4221 frame_size = (get_frame_size () + cfun->machine->frame.saved_regs_size
4222 + crtl->outgoing_args_size
4223 + cfun->machine->saved_varargs_size);
4225 frame_size = AARCH64_ROUND_UP (frame_size, STACK_BOUNDARY / BITS_PER_UNIT);
4226 offset = frame_size;
4228 if (to == HARD_FRAME_POINTER_REGNUM)
4230 if (from == ARG_POINTER_REGNUM)
4231 return offset - crtl->outgoing_args_size;
4233 if (from == FRAME_POINTER_REGNUM)
4234 return cfun->machine->frame.saved_regs_size + get_frame_size ();
4237 if (to == STACK_POINTER_REGNUM)
4239 if (from == FRAME_POINTER_REGNUM)
4241 HOST_WIDE_INT elim = crtl->outgoing_args_size
4242 + cfun->machine->frame.saved_regs_size
4244 - cfun->machine->frame.fp_lr_offset;
4245 elim = AARCH64_ROUND_UP (elim, STACK_BOUNDARY / BITS_PER_UNIT);
4254 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4258 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4262 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4267 aarch64_asm_trampoline_template (FILE *f)
4271 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4272 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4276 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4277 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4279 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4280 assemble_aligned_integer (4, const0_rtx);
4281 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4282 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4286 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4288 rtx fnaddr, mem, a_tramp;
4289 const int tramp_code_sz = 16;
4291 /* Don't need to copy the trailing D-words, we fill those in below. */
4292 emit_block_move (m_tramp, assemble_trampoline_template (),
4293 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4294 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4295 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4296 if (GET_MODE (fnaddr) != ptr_mode)
4297 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4298 emit_move_insn (mem, fnaddr);
4300 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4301 emit_move_insn (mem, chain_value);
4303 /* XXX We should really define a "clear_cache" pattern and use
4304 gen_clear_cache(). */
4305 a_tramp = XEXP (m_tramp, 0);
4306 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4307 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4308 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4312 static unsigned char
4313 aarch64_class_max_nregs (reg_class_t regclass, enum machine_mode mode)
4324 aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4325 (GET_MODE_SIZE (mode) + 7) / 8;
4339 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4341 if (regclass == POINTER_REGS)
4342 return GENERAL_REGS;
4344 if (regclass == STACK_REG)
4347 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4353 /* If it's an integer immediate that MOVI can't handle, then
4354 FP_REGS is not an option, so we return NO_REGS instead. */
4355 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4356 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4359 /* Register eliminiation can result in a request for
4360 SP+constant->FP_REGS. We cannot support such operations which
4361 use SP as source and an FP_REG as destination, so reject out
4363 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4365 rtx lhs = XEXP (x, 0);
4367 /* Look through a possible SUBREG introduced by ILP32. */
4368 if (GET_CODE (lhs) == SUBREG)
4369 lhs = SUBREG_REG (lhs);
4371 gcc_assert (REG_P (lhs));
4372 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4381 aarch64_asm_output_labelref (FILE* f, const char *name)
4383 asm_fprintf (f, "%U%s", name);
4387 aarch64_elf_asm_constructor (rtx symbol, int priority)
4389 if (priority == DEFAULT_INIT_PRIORITY)
4390 default_ctor_section_asm_out_constructor (symbol, priority);
4395 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4396 s = get_section (buf, SECTION_WRITE, NULL);
4397 switch_to_section (s);
4398 assemble_align (POINTER_SIZE);
4399 assemble_aligned_integer (POINTER_BYTES, symbol);
4404 aarch64_elf_asm_destructor (rtx symbol, int priority)
4406 if (priority == DEFAULT_INIT_PRIORITY)
4407 default_dtor_section_asm_out_destructor (symbol, priority);
4412 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
4413 s = get_section (buf, SECTION_WRITE, NULL);
4414 switch_to_section (s);
4415 assemble_align (POINTER_SIZE);
4416 assemble_aligned_integer (POINTER_BYTES, symbol);
4421 aarch64_output_casesi (rtx *operands)
4425 rtx diff_vec = PATTERN (NEXT_INSN (operands[2]));
4427 static const char *const patterns[4][2] =
4430 "ldrb\t%w3, [%0,%w1,uxtw]",
4431 "add\t%3, %4, %w3, sxtb #2"
4434 "ldrh\t%w3, [%0,%w1,uxtw #1]",
4435 "add\t%3, %4, %w3, sxth #2"
4438 "ldr\t%w3, [%0,%w1,uxtw #2]",
4439 "add\t%3, %4, %w3, sxtw #2"
4441 /* We assume that DImode is only generated when not optimizing and
4442 that we don't really need 64-bit address offsets. That would
4443 imply an object file with 8GB of code in a single function! */
4445 "ldr\t%w3, [%0,%w1,uxtw #2]",
4446 "add\t%3, %4, %w3, sxtw #2"
4450 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
4452 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
4454 gcc_assert (index >= 0 && index <= 3);
4456 /* Need to implement table size reduction, by chaning the code below. */
4457 output_asm_insn (patterns[index][0], operands);
4458 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
4459 snprintf (buf, sizeof (buf),
4460 "adr\t%%4, %s", targetm.strip_name_encoding (label));
4461 output_asm_insn (buf, operands);
4462 output_asm_insn (patterns[index][1], operands);
4463 output_asm_insn ("br\t%3", operands);
4464 assemble_label (asm_out_file, label);
4469 /* Return size in bits of an arithmetic operand which is shifted/scaled and
4470 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
4474 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
4476 if (shift >= 0 && shift <= 3)
4479 for (size = 8; size <= 32; size *= 2)
4481 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
4482 if (mask == bits << shift)
4490 aarch64_use_blocks_for_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED,
4491 const_rtx x ATTRIBUTE_UNUSED)
4493 /* We can't use blocks for constants when we're using a per-function
4499 aarch64_select_rtx_section (enum machine_mode mode ATTRIBUTE_UNUSED,
4500 rtx x ATTRIBUTE_UNUSED,
4501 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
4503 /* Force all constant pool entries into the current function section. */
4504 return function_section (current_function_decl);
4510 /* Helper function for rtx cost calculation. Strip a shift expression
4511 from X. Returns the inner operand if successful, or the original
4512 expression on failure. */
4514 aarch64_strip_shift (rtx x)
4518 if ((GET_CODE (op) == ASHIFT
4519 || GET_CODE (op) == ASHIFTRT
4520 || GET_CODE (op) == LSHIFTRT)
4521 && CONST_INT_P (XEXP (op, 1)))
4522 return XEXP (op, 0);
4524 if (GET_CODE (op) == MULT
4525 && CONST_INT_P (XEXP (op, 1))
4526 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
4527 return XEXP (op, 0);
4532 /* Helper function for rtx cost calculation. Strip an extend
4533 expression from X. Returns the inner operand if successful, or the
4534 original expression on failure. We deal with a number of possible
4535 canonicalization variations here. */
4537 aarch64_strip_extend (rtx x)
4541 /* Zero and sign extraction of a widened value. */
4542 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
4543 && XEXP (op, 2) == const0_rtx
4544 && GET_CODE (XEXP (op, 0)) == MULT
4545 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
4547 return XEXP (XEXP (op, 0), 0);
4549 /* It can also be represented (for zero-extend) as an AND with an
4551 if (GET_CODE (op) == AND
4552 && GET_CODE (XEXP (op, 0)) == MULT
4553 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
4554 && CONST_INT_P (XEXP (op, 1))
4555 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
4556 INTVAL (XEXP (op, 1))) != 0)
4557 return XEXP (XEXP (op, 0), 0);
4559 /* Now handle extended register, as this may also have an optional
4560 left shift by 1..4. */
4561 if (GET_CODE (op) == ASHIFT
4562 && CONST_INT_P (XEXP (op, 1))
4563 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
4566 if (GET_CODE (op) == ZERO_EXTEND
4567 || GET_CODE (op) == SIGN_EXTEND)
4576 /* Helper function for rtx cost calculation. Calculate the cost of
4577 a MULT, which may be part of a multiply-accumulate rtx. Return
4578 the calculated cost of the expression, recursing manually in to
4579 operands where needed. */
4582 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
4585 const struct cpu_cost_table *extra_cost
4586 = aarch64_tune_params->insn_extra_cost;
4588 bool maybe_fma = (outer == PLUS || outer == MINUS);
4589 enum machine_mode mode = GET_MODE (x);
4591 gcc_checking_assert (code == MULT);
4596 if (VECTOR_MODE_P (mode))
4597 mode = GET_MODE_INNER (mode);
4599 /* Integer multiply/fma. */
4600 if (GET_MODE_CLASS (mode) == MODE_INT)
4602 /* The multiply will be canonicalized as a shift, cost it as such. */
4603 if (CONST_INT_P (op1)
4604 && exact_log2 (INTVAL (op1)) > 0)
4609 /* ADD (shifted register). */
4610 cost += extra_cost->alu.arith_shift;
4612 /* LSL (immediate). */
4613 cost += extra_cost->alu.shift;
4616 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
4621 /* Integer multiplies or FMAs have zero/sign extending variants. */
4622 if ((GET_CODE (op0) == ZERO_EXTEND
4623 && GET_CODE (op1) == ZERO_EXTEND)
4624 || (GET_CODE (op0) == SIGN_EXTEND
4625 && GET_CODE (op1) == SIGN_EXTEND))
4627 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
4628 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
4633 /* MADD/SMADDL/UMADDL. */
4634 cost += extra_cost->mult[0].extend_add;
4636 /* MUL/SMULL/UMULL. */
4637 cost += extra_cost->mult[0].extend;
4643 /* This is either an integer multiply or an FMA. In both cases
4644 we want to recurse and cost the operands. */
4645 cost += rtx_cost (op0, MULT, 0, speed)
4646 + rtx_cost (op1, MULT, 1, speed);
4652 cost += extra_cost->mult[mode == DImode].add;
4655 cost += extra_cost->mult[mode == DImode].simple;
4664 /* Floating-point FMA can also support negations of the
4666 if (GET_CODE (op0) == NEG)
4669 op0 = XEXP (op0, 0);
4671 if (GET_CODE (op1) == NEG)
4674 op1 = XEXP (op1, 0);
4678 /* FMADD/FNMADD/FNMSUB/FMSUB. */
4679 cost += extra_cost->fp[mode == DFmode].fma;
4682 cost += extra_cost->fp[mode == DFmode].mult;
4685 cost += rtx_cost (op0, MULT, 0, speed)
4686 + rtx_cost (op1, MULT, 1, speed);
4692 aarch64_address_cost (rtx x,
4693 enum machine_mode mode,
4694 addr_space_t as ATTRIBUTE_UNUSED,
4697 enum rtx_code c = GET_CODE (x);
4698 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
4699 struct aarch64_address_info info;
4703 if (!aarch64_classify_address (&info, x, mode, c, false))
4705 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
4707 /* This is a CONST or SYMBOL ref which will be split
4708 in a different way depending on the code model in use.
4709 Cost it through the generic infrastructure. */
4710 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
4711 /* Divide through by the cost of one instruction to
4712 bring it to the same units as the address costs. */
4713 cost_symbol_ref /= COSTS_N_INSNS (1);
4714 /* The cost is then the cost of preparing the address,
4715 followed by an immediate (possibly 0) offset. */
4716 return cost_symbol_ref + addr_cost->imm_offset;
4720 /* This is most likely a jump table from a case
4722 return addr_cost->register_offset;
4728 case ADDRESS_LO_SUM:
4729 case ADDRESS_SYMBOLIC:
4730 case ADDRESS_REG_IMM:
4731 cost += addr_cost->imm_offset;
4734 case ADDRESS_REG_WB:
4735 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
4736 cost += addr_cost->pre_modify;
4737 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
4738 cost += addr_cost->post_modify;
4744 case ADDRESS_REG_REG:
4745 cost += addr_cost->register_offset;
4748 case ADDRESS_REG_UXTW:
4749 case ADDRESS_REG_SXTW:
4750 cost += addr_cost->register_extend;
4760 /* For the sake of calculating the cost of the shifted register
4761 component, we can treat same sized modes in the same way. */
4762 switch (GET_MODE_BITSIZE (mode))
4765 cost += addr_cost->addr_scale_costs.hi;
4769 cost += addr_cost->addr_scale_costs.si;
4773 cost += addr_cost->addr_scale_costs.di;
4776 /* We can't tell, or this is a 128-bit vector. */
4778 cost += addr_cost->addr_scale_costs.ti;
4786 /* Return true if the RTX X in mode MODE is a zero or sign extract
4787 usable in an ADD or SUB (extended register) instruction. */
4789 aarch64_rtx_arith_op_extract_p (rtx x, enum machine_mode mode)
4791 /* Catch add with a sign extract.
4792 This is add_<optab><mode>_multp2. */
4793 if (GET_CODE (x) == SIGN_EXTRACT
4794 || GET_CODE (x) == ZERO_EXTRACT)
4796 rtx op0 = XEXP (x, 0);
4797 rtx op1 = XEXP (x, 1);
4798 rtx op2 = XEXP (x, 2);
4800 if (GET_CODE (op0) == MULT
4801 && CONST_INT_P (op1)
4802 && op2 == const0_rtx
4803 && CONST_INT_P (XEXP (op0, 1))
4804 && aarch64_is_extend_from_extract (mode,
4815 /* Calculate the cost of calculating X, storing it in *COST. Result
4816 is true if the total cost of the operation has now been calculated. */
4818 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
4819 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
4822 const struct cpu_cost_table *extra_cost
4823 = aarch64_tune_params->insn_extra_cost;
4824 enum machine_mode mode = GET_MODE (x);
4826 /* By default, assume that everything has equivalent cost to the
4827 cheapest instruction. Any additional costs are applied as a delta
4828 above this default. */
4829 *cost = COSTS_N_INSNS (1);
4831 /* TODO: The cost infrastructure currently does not handle
4832 vector operations. Assume that all vector operations
4833 are equally expensive. */
4834 if (VECTOR_MODE_P (mode))
4837 *cost += extra_cost->vect.alu;
4844 /* The cost depends entirely on the operands to SET. */
4849 switch (GET_CODE (op0))
4854 rtx address = XEXP (op0, 0);
4855 if (GET_MODE_CLASS (mode) == MODE_INT)
4856 *cost += extra_cost->ldst.store;
4857 else if (mode == SFmode)
4858 *cost += extra_cost->ldst.storef;
4859 else if (mode == DFmode)
4860 *cost += extra_cost->ldst.stored;
4863 COSTS_N_INSNS (aarch64_address_cost (address, mode,
4867 *cost += rtx_cost (op1, SET, 1, speed);
4871 if (! REG_P (SUBREG_REG (op0)))
4872 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
4876 /* const0_rtx is in general free, but we will use an
4877 instruction to set a register to 0. */
4878 if (REG_P (op1) || op1 == const0_rtx)
4880 /* The cost is 1 per register copied. */
4881 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
4883 *cost = COSTS_N_INSNS (n_minus_1 + 1);
4886 /* Cost is just the cost of the RHS of the set. */
4887 *cost += rtx_cost (op1, SET, 1, speed);
4892 /* Bit-field insertion. Strip any redundant widening of
4893 the RHS to meet the width of the target. */
4894 if (GET_CODE (op1) == SUBREG)
4895 op1 = SUBREG_REG (op1);
4896 if ((GET_CODE (op1) == ZERO_EXTEND
4897 || GET_CODE (op1) == SIGN_EXTEND)
4898 && GET_CODE (XEXP (op0, 1)) == CONST_INT
4899 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
4900 >= INTVAL (XEXP (op0, 1))))
4901 op1 = XEXP (op1, 0);
4903 if (CONST_INT_P (op1))
4905 /* MOV immediate is assumed to always be cheap. */
4906 *cost = COSTS_N_INSNS (1);
4912 *cost += extra_cost->alu.bfi;
4913 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
4919 /* We can't make sense of this, assume default cost. */
4920 *cost = COSTS_N_INSNS (1);
4926 /* If an instruction can incorporate a constant within the
4927 instruction, the instruction's expression avoids calling
4928 rtx_cost() on the constant. If rtx_cost() is called on a
4929 constant, then it is usually because the constant must be
4930 moved into a register by one or more instructions.
4932 The exception is constant 0, which can be expressed
4933 as XZR/WZR and is therefore free. The exception to this is
4934 if we have (set (reg) (const0_rtx)) in which case we must cost
4935 the move. However, we can catch that when we cost the SET, so
4936 we don't need to consider that here. */
4937 if (x == const0_rtx)
4941 /* To an approximation, building any other constant is
4942 proportionally expensive to the number of instructions
4943 required to build that constant. This is true whether we
4944 are compiling for SPEED or otherwise. */
4945 *cost = COSTS_N_INSNS (aarch64_build_constant (0,
4954 /* mov[df,sf]_aarch64. */
4955 if (aarch64_float_const_representable_p (x))
4956 /* FMOV (scalar immediate). */
4957 *cost += extra_cost->fp[mode == DFmode].fpconst;
4958 else if (!aarch64_float_const_zero_rtx_p (x))
4960 /* This will be a load from memory. */
4962 *cost += extra_cost->ldst.loadd;
4964 *cost += extra_cost->ldst.loadf;
4967 /* Otherwise this is +0.0. We get this using MOVI d0, #0
4968 or MOV v0.s[0], wzr - neither of which are modeled by the
4969 cost tables. Just use the default cost. */
4979 /* For loads we want the base cost of a load, plus an
4980 approximation for the additional cost of the addressing
4982 rtx address = XEXP (x, 0);
4983 if (GET_MODE_CLASS (mode) == MODE_INT)
4984 *cost += extra_cost->ldst.load;
4985 else if (mode == SFmode)
4986 *cost += extra_cost->ldst.loadf;
4987 else if (mode == DFmode)
4988 *cost += extra_cost->ldst.loadd;
4991 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5000 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5002 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5003 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5006 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5010 /* Cost this as SUB wzr, X. */
5011 op0 = CONST0_RTX (GET_MODE (x));
5016 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5018 /* Support (neg(fma...)) as a single instruction only if
5019 sign of zeros is unimportant. This matches the decision
5020 making in aarch64.md. */
5021 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5024 *cost = rtx_cost (op0, NEG, 0, speed);
5029 *cost += extra_cost->fp[mode == DFmode].neg;
5039 if (op1 == const0_rtx
5040 && GET_CODE (op0) == AND)
5046 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5048 /* TODO: A write to the CC flags possibly costs extra, this
5049 needs encoding in the cost tables. */
5051 /* CC_ZESWPmode supports zero extend for free. */
5052 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5053 op0 = XEXP (op0, 0);
5056 if (GET_CODE (op0) == AND)
5062 if (GET_CODE (op0) == PLUS)
5064 /* ADDS (and CMN alias). */
5069 if (GET_CODE (op0) == MINUS)
5076 if (GET_CODE (op1) == NEG)
5080 *cost += extra_cost->alu.arith;
5082 *cost += rtx_cost (op0, COMPARE, 0, speed);
5083 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5089 Compare can freely swap the order of operands, and
5090 canonicalization puts the more complex operation first.
5091 But the integer MINUS logic expects the shift/extend
5092 operation in op1. */
5094 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5102 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5106 *cost += extra_cost->fp[mode == DFmode].compare;
5108 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5110 /* FCMP supports constant 0.0 for no extra cost. */
5124 /* Detect valid immediates. */
5125 if ((GET_MODE_CLASS (mode) == MODE_INT
5126 || (GET_MODE_CLASS (mode) == MODE_CC
5127 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5128 && CONST_INT_P (op1)
5129 && aarch64_uimm12_shift (INTVAL (op1)))
5131 *cost += rtx_cost (op0, MINUS, 0, speed);
5134 /* SUB(S) (immediate). */
5135 *cost += extra_cost->alu.arith;
5140 /* Look for SUB (extended register). */
5141 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5144 *cost += extra_cost->alu.arith_shift;
5146 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5147 (enum rtx_code) GET_CODE (op1),
5152 rtx new_op1 = aarch64_strip_extend (op1);
5154 /* Cost this as an FMA-alike operation. */
5155 if ((GET_CODE (new_op1) == MULT
5156 || GET_CODE (new_op1) == ASHIFT)
5159 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5160 (enum rtx_code) code,
5162 *cost += rtx_cost (op0, MINUS, 0, speed);
5166 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5170 if (GET_MODE_CLASS (mode) == MODE_INT)
5172 *cost += extra_cost->alu.arith;
5173 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5175 *cost += extra_cost->fp[mode == DFmode].addsub;
5188 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5189 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5192 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5193 *cost += rtx_cost (op1, PLUS, 1, speed);
5197 if (GET_MODE_CLASS (mode) == MODE_INT
5198 && CONST_INT_P (op1)
5199 && aarch64_uimm12_shift (INTVAL (op1)))
5201 *cost += rtx_cost (op0, PLUS, 0, speed);
5204 /* ADD (immediate). */
5205 *cost += extra_cost->alu.arith;
5209 /* Look for ADD (extended register). */
5210 if (aarch64_rtx_arith_op_extract_p (op0, mode))
5213 *cost += extra_cost->alu.arith_shift;
5215 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5216 (enum rtx_code) GET_CODE (op0),
5221 /* Strip any extend, leave shifts behind as we will
5222 cost them through mult_cost. */
5223 new_op0 = aarch64_strip_extend (op0);
5225 if (GET_CODE (new_op0) == MULT
5226 || GET_CODE (new_op0) == ASHIFT)
5228 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5230 *cost += rtx_cost (op1, PLUS, 1, speed);
5234 *cost += (rtx_cost (new_op0, PLUS, 0, speed)
5235 + rtx_cost (op1, PLUS, 1, speed));
5239 if (GET_MODE_CLASS (mode) == MODE_INT)
5241 *cost += extra_cost->alu.arith;
5242 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5244 *cost += extra_cost->fp[mode == DFmode].addsub;
5257 && GET_CODE (op0) == MULT
5258 && CONST_INT_P (XEXP (op0, 1))
5259 && CONST_INT_P (op1)
5260 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
5263 /* This is a UBFM/SBFM. */
5264 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
5266 *cost += extra_cost->alu.bfx;
5270 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5272 /* We possibly get the immediate for free, this is not
5274 if (CONST_INT_P (op1)
5275 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
5277 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5280 *cost += extra_cost->alu.logical;
5288 /* Handle ORN, EON, or BIC. */
5289 if (GET_CODE (op0) == NOT)
5290 op0 = XEXP (op0, 0);
5292 new_op0 = aarch64_strip_shift (op0);
5294 /* If we had a shift on op0 then this is a logical-shift-
5295 by-register/immediate operation. Otherwise, this is just
5296 a logical operation. */
5301 /* Shift by immediate. */
5302 if (CONST_INT_P (XEXP (op0, 1)))
5303 *cost += extra_cost->alu.log_shift;
5305 *cost += extra_cost->alu.log_shift_reg;
5308 *cost += extra_cost->alu.logical;
5311 /* In both cases we want to cost both operands. */
5312 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
5313 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
5323 *cost += extra_cost->alu.logical;
5325 /* The logical instruction could have the shifted register form,
5326 but the cost is the same if the shift is processed as a separate
5327 instruction, so we don't bother with it here. */
5333 /* If a value is written in SI mode, then zero extended to DI
5334 mode, the operation will in general be free as a write to
5335 a 'w' register implicitly zeroes the upper bits of an 'x'
5336 register. However, if this is
5338 (set (reg) (zero_extend (reg)))
5340 we must cost the explicit register move. */
5342 && GET_MODE (op0) == SImode
5345 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
5347 if (!op_cost && speed)
5349 *cost += extra_cost->alu.extend;
5351 /* Free, the cost is that of the SI mode operation. */
5356 else if (MEM_P (XEXP (x, 0)))
5358 /* All loads can zero extend to any size for free. */
5359 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
5365 *cost += extra_cost->alu.extend;
5370 if (MEM_P (XEXP (x, 0)))
5375 rtx address = XEXP (XEXP (x, 0), 0);
5376 *cost += extra_cost->ldst.load_sign_extend;
5379 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5386 *cost += extra_cost->alu.extend;
5393 if (CONST_INT_P (op1))
5395 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
5398 *cost += extra_cost->alu.shift;
5400 /* We can incorporate zero/sign extend for free. */
5401 if (GET_CODE (op0) == ZERO_EXTEND
5402 || GET_CODE (op0) == SIGN_EXTEND)
5403 op0 = XEXP (op0, 0);
5405 *cost += rtx_cost (op0, ASHIFT, 0, speed);
5412 *cost += extra_cost->alu.shift_reg;
5414 return false; /* All arguments need to be in registers. */
5424 if (CONST_INT_P (op1))
5426 /* ASR (immediate) and friends. */
5428 *cost += extra_cost->alu.shift;
5430 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5436 /* ASR (register) and friends. */
5438 *cost += extra_cost->alu.shift_reg;
5440 return false; /* All arguments need to be in registers. */
5444 if (!CONSTANT_P (XEXP (x, 0)))
5445 *cost += rtx_cost (XEXP (x, 0), HIGH, 0, speed);
5449 if (!CONSTANT_P (XEXP (x, 1)))
5450 *cost += rtx_cost (XEXP (x, 1), LO_SUM, 1, speed);
5451 *cost += rtx_cost (XEXP (x, 0), LO_SUM, 0, speed);
5458 *cost += extra_cost->alu.bfx;
5460 /* We can trust that the immediates used will be correct (there
5461 are no by-register forms), so we need only cost op0. */
5462 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
5466 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
5467 /* aarch64_rtx_mult_cost always handles recursion to its
5475 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5476 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
5477 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
5478 else if (GET_MODE (x) == DFmode)
5479 *cost += (extra_cost->fp[1].mult
5480 + extra_cost->fp[1].div);
5481 else if (GET_MODE (x) == SFmode)
5482 *cost += (extra_cost->fp[0].mult
5483 + extra_cost->fp[0].div);
5485 return false; /* All arguments need to be in registers. */
5492 if (GET_MODE_CLASS (mode) == MODE_INT)
5493 /* There is no integer SQRT, so only DIV and UDIV can get
5495 *cost += extra_cost->mult[mode == DImode].idiv;
5497 *cost += extra_cost->fp[mode == DFmode].div;
5499 return false; /* All arguments need to be in registers. */
5506 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5508 /* Conditional branch. */
5509 if (GET_MODE_CLASS (GET_MODE (XEXP (op0, 0))) == MODE_CC)
5513 if (GET_CODE (op0) == NE
5514 || GET_CODE (op0) == EQ)
5516 rtx inner = XEXP (op0, 0);
5517 rtx comparator = XEXP (op0, 1);
5519 if (comparator == const0_rtx)
5521 /* TBZ/TBNZ/CBZ/CBNZ. */
5522 if (GET_CODE (inner) == ZERO_EXTRACT)
5524 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5528 *cost += rtx_cost (inner, GET_CODE (op0), 0, speed);
5533 else if (GET_CODE (op0) == LT
5534 || GET_CODE (op0) == GE)
5536 rtx comparator = XEXP (op0, 1);
5539 if (comparator == const0_rtx)
5544 else if (GET_MODE_CLASS (GET_MODE (XEXP (op0, 0))) == MODE_CC)
5546 /* It's a conditional operation based on the status flags,
5547 so it must be some flavor of CSEL. */
5549 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5550 if (GET_CODE (op1) == NEG
5551 || GET_CODE (op1) == NOT
5552 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5553 op1 = XEXP (op1, 0);
5555 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5556 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5560 /* We don't know what this is, cost all operands. */
5574 return false; /* All arguments must be in registers. */
5582 *cost += extra_cost->fp[mode == DFmode].fma;
5584 /* FMSUB, FNMADD, and FNMSUB are free. */
5585 if (GET_CODE (op0) == NEG)
5586 op0 = XEXP (op0, 0);
5588 if (GET_CODE (op2) == NEG)
5589 op2 = XEXP (op2, 0);
5591 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
5592 and the by-element operand as operand 0. */
5593 if (GET_CODE (op1) == NEG)
5594 op1 = XEXP (op1, 0);
5596 /* Catch vector-by-element operations. The by-element operand can
5597 either be (vec_duplicate (vec_select (x))) or just
5598 (vec_select (x)), depending on whether we are multiplying by
5599 a vector or a scalar.
5601 Canonicalization is not very good in these cases, FMA4 will put the
5602 by-element operand as operand 0, FNMA4 will have it as operand 1. */
5603 if (GET_CODE (op0) == VEC_DUPLICATE)
5604 op0 = XEXP (op0, 0);
5605 else if (GET_CODE (op1) == VEC_DUPLICATE)
5606 op1 = XEXP (op1, 0);
5608 if (GET_CODE (op0) == VEC_SELECT)
5609 op0 = XEXP (op0, 0);
5610 else if (GET_CODE (op1) == VEC_SELECT)
5611 op1 = XEXP (op1, 0);
5613 /* If the remaining parameters are not registers,
5614 get the cost to put them into registers. */
5615 *cost += rtx_cost (op0, FMA, 0, speed);
5616 *cost += rtx_cost (op1, FMA, 1, speed);
5617 *cost += rtx_cost (op2, FMA, 2, speed);
5622 *cost += extra_cost->fp[mode == DFmode].widen;
5625 case FLOAT_TRUNCATE:
5627 *cost += extra_cost->fp[mode == DFmode].narrow;
5631 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5633 /* FABS and FNEG are analogous. */
5635 *cost += extra_cost->fp[mode == DFmode].neg;
5639 /* Integer ABS will either be split to
5640 two arithmetic instructions, or will be an ABS
5641 (scalar), which we don't model. */
5642 *cost = COSTS_N_INSNS (2);
5644 *cost += 2 * extra_cost->alu.arith;
5652 /* FMAXNM/FMINNM/FMAX/FMIN.
5653 TODO: This may not be accurate for all implementations, but
5654 we do not model this in the cost tables. */
5655 *cost += extra_cost->fp[mode == DFmode].addsub;
5665 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
5666 calculated for X. This cost is stored in *COST. Returns true
5667 if the total cost of X was calculated. */
5669 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
5670 int param, int *cost, bool speed)
5672 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
5674 if (dump_file && (dump_flags & TDF_DETAILS))
5676 print_rtl_single (dump_file, x);
5677 fprintf (dump_file, "\n%s cost: %d (%s)\n",
5678 speed ? "Hot" : "Cold",
5679 *cost, result ? "final" : "partial");
5686 aarch64_register_move_cost (enum machine_mode mode,
5687 reg_class_t from_i, reg_class_t to_i)
5689 enum reg_class from = (enum reg_class) from_i;
5690 enum reg_class to = (enum reg_class) to_i;
5691 const struct cpu_regmove_cost *regmove_cost
5692 = aarch64_tune_params->regmove_cost;
5694 /* Moving between GPR and stack cost is the same as GP2GP. */
5695 if ((from == GENERAL_REGS && to == STACK_REG)
5696 || (to == GENERAL_REGS && from == STACK_REG))
5697 return regmove_cost->GP2GP;
5699 /* To/From the stack register, we move via the gprs. */
5700 if (to == STACK_REG || from == STACK_REG)
5701 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
5702 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
5704 if (from == GENERAL_REGS && to == GENERAL_REGS)
5705 return regmove_cost->GP2GP;
5706 else if (from == GENERAL_REGS)
5707 return regmove_cost->GP2FP;
5708 else if (to == GENERAL_REGS)
5709 return regmove_cost->FP2GP;
5711 /* When AdvSIMD instructions are disabled it is not possible to move
5712 a 128-bit value directly between Q registers. This is handled in
5713 secondary reload. A general register is used as a scratch to move
5714 the upper DI value and the lower DI value is moved directly,
5715 hence the cost is the sum of three moves. */
5716 if (! TARGET_SIMD && GET_MODE_SIZE (mode) == 128)
5717 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
5719 return regmove_cost->FP2FP;
5723 aarch64_memory_move_cost (enum machine_mode mode ATTRIBUTE_UNUSED,
5724 reg_class_t rclass ATTRIBUTE_UNUSED,
5725 bool in ATTRIBUTE_UNUSED)
5727 return aarch64_tune_params->memmov_cost;
5730 /* Return the number of instructions that can be issued per cycle. */
5732 aarch64_sched_issue_rate (void)
5734 return aarch64_tune_params->issue_rate;
5737 /* Vectorizer cost model target hooks. */
5739 /* Implement targetm.vectorize.builtin_vectorization_cost. */
5741 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
5743 int misalign ATTRIBUTE_UNUSED)
5747 switch (type_of_cost)
5750 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
5753 return aarch64_tune_params->vec_costs->scalar_load_cost;
5756 return aarch64_tune_params->vec_costs->scalar_store_cost;
5759 return aarch64_tune_params->vec_costs->vec_stmt_cost;
5762 return aarch64_tune_params->vec_costs->vec_align_load_cost;
5765 return aarch64_tune_params->vec_costs->vec_store_cost;
5768 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
5771 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
5773 case unaligned_load:
5774 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
5776 case unaligned_store:
5777 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
5779 case cond_branch_taken:
5780 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
5782 case cond_branch_not_taken:
5783 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
5786 case vec_promote_demote:
5787 return aarch64_tune_params->vec_costs->vec_stmt_cost;
5790 elements = TYPE_VECTOR_SUBPARTS (vectype);
5791 return elements / 2 + 1;
5798 /* Implement targetm.vectorize.add_stmt_cost. */
5800 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
5801 struct _stmt_vec_info *stmt_info, int misalign,
5802 enum vect_cost_model_location where)
5804 unsigned *cost = (unsigned *) data;
5805 unsigned retval = 0;
5807 if (flag_vect_cost_model)
5809 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
5811 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
5813 /* Statements in an inner loop relative to the loop being
5814 vectorized are weighted more heavily. The value here is
5815 a function (linear for now) of the loop nest level. */
5816 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
5818 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
5819 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
5820 unsigned nest_level = loop_depth (loop);
5822 count *= nest_level;
5825 retval = (unsigned) (count * stmt_cost);
5826 cost[where] += retval;
5832 static void initialize_aarch64_code_model (void);
5834 /* Parse the architecture extension string. */
5837 aarch64_parse_extension (char *str)
5839 /* The extension string is parsed left to right. */
5840 const struct aarch64_option_extension *opt = NULL;
5842 /* Flag to say whether we are adding or removing an extension. */
5843 int adding_ext = -1;
5845 while (str != NULL && *str != 0)
5851 ext = strchr (str, '+');
5858 if (len >= 2 && strncmp (str, "no", 2) == 0)
5869 error ("missing feature modifier after %qs", "+no");
5873 /* Scan over the extensions table trying to find an exact match. */
5874 for (opt = all_extensions; opt->name != NULL; opt++)
5876 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
5878 /* Add or remove the extension. */
5880 aarch64_isa_flags |= opt->flags_on;
5882 aarch64_isa_flags &= ~(opt->flags_off);
5887 if (opt->name == NULL)
5889 /* Extension not found in list. */
5890 error ("unknown feature modifier %qs", str);
5900 /* Parse the ARCH string. */
5903 aarch64_parse_arch (void)
5906 const struct processor *arch;
5907 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
5910 strcpy (str, aarch64_arch_string);
5912 ext = strchr (str, '+');
5921 error ("missing arch name in -march=%qs", str);
5925 /* Loop through the list of supported ARCHs to find a match. */
5926 for (arch = all_architectures; arch->name != NULL; arch++)
5928 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
5930 selected_arch = arch;
5931 aarch64_isa_flags = selected_arch->flags;
5934 selected_cpu = &all_cores[selected_arch->core];
5938 /* ARCH string contains at least one extension. */
5939 aarch64_parse_extension (ext);
5942 if (strcmp (selected_arch->arch, selected_cpu->arch))
5944 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
5945 selected_cpu->name, selected_arch->name);
5952 /* ARCH name not found in list. */
5953 error ("unknown value %qs for -march", str);
5957 /* Parse the CPU string. */
5960 aarch64_parse_cpu (void)
5963 const struct processor *cpu;
5964 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
5967 strcpy (str, aarch64_cpu_string);
5969 ext = strchr (str, '+');
5978 error ("missing cpu name in -mcpu=%qs", str);
5982 /* Loop through the list of supported CPUs to find a match. */
5983 for (cpu = all_cores; cpu->name != NULL; cpu++)
5985 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
5988 selected_tune = cpu;
5989 aarch64_isa_flags = selected_cpu->flags;
5993 /* CPU string contains at least one extension. */
5994 aarch64_parse_extension (ext);
6001 /* CPU name not found in list. */
6002 error ("unknown value %qs for -mcpu", str);
6006 /* Parse the TUNE string. */
6009 aarch64_parse_tune (void)
6011 const struct processor *cpu;
6012 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6013 strcpy (str, aarch64_tune_string);
6015 /* Loop through the list of supported CPUs to find a match. */
6016 for (cpu = all_cores; cpu->name != NULL; cpu++)
6018 if (strcmp (cpu->name, str) == 0)
6020 selected_tune = cpu;
6025 /* CPU name not found in list. */
6026 error ("unknown value %qs for -mtune", str);
6031 /* Implement TARGET_OPTION_OVERRIDE. */
6034 aarch64_override_options (void)
6036 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6037 If either of -march or -mtune is given, they override their
6038 respective component of -mcpu.
6040 So, first parse AARCH64_CPU_STRING, then the others, be careful
6041 with -march as, if -mcpu is not present on the command line, march
6042 must set a sensible default CPU. */
6043 if (aarch64_cpu_string)
6045 aarch64_parse_cpu ();
6048 if (aarch64_arch_string)
6050 aarch64_parse_arch ();
6053 if (aarch64_tune_string)
6055 aarch64_parse_tune ();
6058 #ifndef HAVE_AS_MABI_OPTION
6059 /* The compiler may have been configured with 2.23.* binutils, which does
6060 not have support for ILP32. */
6062 error ("Assembler does not support -mabi=ilp32");
6065 initialize_aarch64_code_model ();
6067 aarch64_build_bitmask_table ();
6069 /* This target defaults to strict volatile bitfields. */
6070 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6071 flag_strict_volatile_bitfields = 1;
6073 /* If the user did not specify a processor, choose the default
6074 one for them. This will be the CPU set during configuration using
6075 --with-cpu, otherwise it is "generic". */
6078 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6079 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6082 gcc_assert (selected_cpu);
6084 /* The selected cpu may be an architecture, so lookup tuning by core ID. */
6086 selected_tune = &all_cores[selected_cpu->core];
6088 aarch64_tune_flags = selected_tune->flags;
6089 aarch64_tune = selected_tune->core;
6090 aarch64_tune_params = selected_tune->tune;
6092 if (aarch64_fix_a53_err835769 == 2)
6094 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
6095 aarch64_fix_a53_err835769 = 1;
6097 aarch64_fix_a53_err835769 = 0;
6101 aarch64_override_options_after_change ();
6104 /* Implement targetm.override_options_after_change. */
6107 aarch64_override_options_after_change (void)
6109 if (flag_omit_frame_pointer)
6110 flag_omit_leaf_frame_pointer = false;
6111 else if (flag_omit_leaf_frame_pointer)
6112 flag_omit_frame_pointer = true;
6115 static struct machine_function *
6116 aarch64_init_machine_status (void)
6118 struct machine_function *machine;
6119 machine = ggc_alloc_cleared_machine_function ();
6124 aarch64_init_expanders (void)
6126 init_machine_status = aarch64_init_machine_status;
6129 /* A checking mechanism for the implementation of the various code models. */
6131 initialize_aarch64_code_model (void)
6135 switch (aarch64_cmodel_var)
6137 case AARCH64_CMODEL_TINY:
6138 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
6140 case AARCH64_CMODEL_SMALL:
6141 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
6143 case AARCH64_CMODEL_LARGE:
6144 sorry ("code model %qs with -f%s", "large",
6145 flag_pic > 1 ? "PIC" : "pic");
6151 aarch64_cmodel = aarch64_cmodel_var;
6154 /* Return true if SYMBOL_REF X binds locally. */
6157 aarch64_symbol_binds_local_p (const_rtx x)
6159 return (SYMBOL_REF_DECL (x)
6160 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
6161 : SYMBOL_REF_LOCAL_P (x));
6164 /* Return true if SYMBOL_REF X is thread local */
6166 aarch64_tls_symbol_p (rtx x)
6168 if (! TARGET_HAVE_TLS)
6171 if (GET_CODE (x) != SYMBOL_REF)
6174 return SYMBOL_REF_TLS_MODEL (x) != 0;
6177 /* Classify a TLS symbol into one of the TLS kinds. */
6178 enum aarch64_symbol_type
6179 aarch64_classify_tls_symbol (rtx x)
6181 enum tls_model tls_kind = tls_symbolic_operand_type (x);
6185 case TLS_MODEL_GLOBAL_DYNAMIC:
6186 case TLS_MODEL_LOCAL_DYNAMIC:
6187 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
6189 case TLS_MODEL_INITIAL_EXEC:
6190 return SYMBOL_SMALL_GOTTPREL;
6192 case TLS_MODEL_LOCAL_EXEC:
6193 return SYMBOL_SMALL_TPREL;
6195 case TLS_MODEL_EMULATED:
6196 case TLS_MODEL_NONE:
6197 return SYMBOL_FORCE_TO_MEM;
6204 /* Return the method that should be used to access SYMBOL_REF or
6205 LABEL_REF X in context CONTEXT. */
6207 enum aarch64_symbol_type
6208 aarch64_classify_symbol (rtx x,
6209 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
6211 if (GET_CODE (x) == LABEL_REF)
6213 switch (aarch64_cmodel)
6215 case AARCH64_CMODEL_LARGE:
6216 return SYMBOL_FORCE_TO_MEM;
6218 case AARCH64_CMODEL_TINY_PIC:
6219 case AARCH64_CMODEL_TINY:
6220 return SYMBOL_TINY_ABSOLUTE;
6222 case AARCH64_CMODEL_SMALL_PIC:
6223 case AARCH64_CMODEL_SMALL:
6224 return SYMBOL_SMALL_ABSOLUTE;
6231 if (GET_CODE (x) == SYMBOL_REF)
6233 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6234 return SYMBOL_FORCE_TO_MEM;
6236 if (aarch64_tls_symbol_p (x))
6237 return aarch64_classify_tls_symbol (x);
6239 switch (aarch64_cmodel)
6241 case AARCH64_CMODEL_TINY:
6242 if (SYMBOL_REF_WEAK (x))
6243 return SYMBOL_FORCE_TO_MEM;
6244 return SYMBOL_TINY_ABSOLUTE;
6246 case AARCH64_CMODEL_SMALL:
6247 if (SYMBOL_REF_WEAK (x))
6248 return SYMBOL_FORCE_TO_MEM;
6249 return SYMBOL_SMALL_ABSOLUTE;
6251 case AARCH64_CMODEL_TINY_PIC:
6252 if (!aarch64_symbol_binds_local_p (x))
6253 return SYMBOL_TINY_GOT;
6254 return SYMBOL_TINY_ABSOLUTE;
6256 case AARCH64_CMODEL_SMALL_PIC:
6257 if (!aarch64_symbol_binds_local_p (x))
6258 return SYMBOL_SMALL_GOT;
6259 return SYMBOL_SMALL_ABSOLUTE;
6266 /* By default push everything into the constant pool. */
6267 return SYMBOL_FORCE_TO_MEM;
6271 aarch64_constant_address_p (rtx x)
6273 return (CONSTANT_P (x) && memory_address_p (DImode, x));
6277 aarch64_legitimate_pic_operand_p (rtx x)
6279 if (GET_CODE (x) == SYMBOL_REF
6280 || (GET_CODE (x) == CONST
6281 && GET_CODE (XEXP (x, 0)) == PLUS
6282 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
6288 /* Return true if X holds either a quarter-precision or
6289 floating-point +0.0 constant. */
6291 aarch64_valid_floating_const (enum machine_mode mode, rtx x)
6293 if (!CONST_DOUBLE_P (x))
6296 /* TODO: We could handle moving 0.0 to a TFmode register,
6297 but first we would like to refactor the movtf_aarch64
6298 to be more amicable to split moves properly and
6299 correctly gate on TARGET_SIMD. For now - reject all
6300 constants which are not to SFmode or DFmode registers. */
6301 if (!(mode == SFmode || mode == DFmode))
6304 if (aarch64_float_const_zero_rtx_p (x))
6306 return aarch64_float_const_representable_p (x);
6310 aarch64_legitimate_constant_p (enum machine_mode mode, rtx x)
6312 /* Do not allow vector struct mode constants. We could support
6313 0 and -1 easily, but they need support in aarch64-simd.md. */
6314 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
6317 /* This could probably go away because
6318 we now decompose CONST_INTs according to expand_mov_immediate. */
6319 if ((GET_CODE (x) == CONST_VECTOR
6320 && aarch64_simd_valid_immediate (x, mode, false, NULL))
6321 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
6322 return !targetm.cannot_force_const_mem (mode, x);
6324 if (GET_CODE (x) == HIGH
6325 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
6328 return aarch64_constant_address_p (x);
6332 aarch64_load_tp (rtx target)
6335 || GET_MODE (target) != Pmode
6336 || !register_operand (target, Pmode))
6337 target = gen_reg_rtx (Pmode);
6339 /* Can return in any reg. */
6340 emit_insn (gen_aarch64_load_tp_hard (target));
6344 /* On AAPCS systems, this is the "struct __va_list". */
6345 static GTY(()) tree va_list_type;
6347 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
6348 Return the type to use as __builtin_va_list.
6350 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
6362 aarch64_build_builtin_va_list (void)
6365 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6367 /* Create the type. */
6368 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
6369 /* Give it the required name. */
6370 va_list_name = build_decl (BUILTINS_LOCATION,
6372 get_identifier ("__va_list"),
6374 DECL_ARTIFICIAL (va_list_name) = 1;
6375 TYPE_NAME (va_list_type) = va_list_name;
6376 TYPE_STUB_DECL (va_list_type) = va_list_name;
6378 /* Create the fields. */
6379 f_stack = build_decl (BUILTINS_LOCATION,
6380 FIELD_DECL, get_identifier ("__stack"),
6382 f_grtop = build_decl (BUILTINS_LOCATION,
6383 FIELD_DECL, get_identifier ("__gr_top"),
6385 f_vrtop = build_decl (BUILTINS_LOCATION,
6386 FIELD_DECL, get_identifier ("__vr_top"),
6388 f_groff = build_decl (BUILTINS_LOCATION,
6389 FIELD_DECL, get_identifier ("__gr_offs"),
6391 f_vroff = build_decl (BUILTINS_LOCATION,
6392 FIELD_DECL, get_identifier ("__vr_offs"),
6395 DECL_ARTIFICIAL (f_stack) = 1;
6396 DECL_ARTIFICIAL (f_grtop) = 1;
6397 DECL_ARTIFICIAL (f_vrtop) = 1;
6398 DECL_ARTIFICIAL (f_groff) = 1;
6399 DECL_ARTIFICIAL (f_vroff) = 1;
6401 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
6402 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
6403 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
6404 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
6405 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
6407 TYPE_FIELDS (va_list_type) = f_stack;
6408 DECL_CHAIN (f_stack) = f_grtop;
6409 DECL_CHAIN (f_grtop) = f_vrtop;
6410 DECL_CHAIN (f_vrtop) = f_groff;
6411 DECL_CHAIN (f_groff) = f_vroff;
6413 /* Compute its layout. */
6414 layout_type (va_list_type);
6416 return va_list_type;
6419 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
6421 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
6423 const CUMULATIVE_ARGS *cum;
6424 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6425 tree stack, grtop, vrtop, groff, vroff;
6427 int gr_save_area_size;
6428 int vr_save_area_size;
6431 cum = &crtl->args.info;
6433 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
6435 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
6437 if (TARGET_GENERAL_REGS_ONLY)
6439 if (cum->aapcs_nvrn > 0)
6440 sorry ("%qs and floating point or vector arguments",
6441 "-mgeneral-regs-only");
6442 vr_save_area_size = 0;
6445 f_stack = TYPE_FIELDS (va_list_type_node);
6446 f_grtop = DECL_CHAIN (f_stack);
6447 f_vrtop = DECL_CHAIN (f_grtop);
6448 f_groff = DECL_CHAIN (f_vrtop);
6449 f_vroff = DECL_CHAIN (f_groff);
6451 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
6453 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
6455 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
6457 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
6459 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
6462 /* Emit code to initialize STACK, which points to the next varargs stack
6463 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
6464 by named arguments. STACK is 8-byte aligned. */
6465 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
6466 if (cum->aapcs_stack_size > 0)
6467 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
6468 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
6469 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6471 /* Emit code to initialize GRTOP, the top of the GR save area.
6472 virtual_incoming_args_rtx should have been 16 byte aligned. */
6473 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
6474 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
6475 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6477 /* Emit code to initialize VRTOP, the top of the VR save area.
6478 This address is gr_save_area_bytes below GRTOP, rounded
6479 down to the next 16-byte boundary. */
6480 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
6481 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
6482 STACK_BOUNDARY / BITS_PER_UNIT);
6485 t = fold_build_pointer_plus_hwi (t, -vr_offset);
6486 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
6487 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6489 /* Emit code to initialize GROFF, the offset from GRTOP of the
6490 next GPR argument. */
6491 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
6492 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
6493 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6495 /* Likewise emit code to initialize VROFF, the offset from FTOP
6496 of the next VR argument. */
6497 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
6498 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
6499 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6502 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
6505 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
6506 gimple_seq *post_p ATTRIBUTE_UNUSED)
6510 bool is_ha; /* is HFA or HVA. */
6511 bool dw_align; /* double-word align. */
6512 enum machine_mode ag_mode = VOIDmode;
6514 enum machine_mode mode;
6516 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6517 tree stack, f_top, f_off, off, arg, roundup, on_stack;
6518 HOST_WIDE_INT size, rsize, adjust, align;
6519 tree t, u, cond1, cond2;
6521 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
6523 type = build_pointer_type (type);
6525 mode = TYPE_MODE (type);
6527 f_stack = TYPE_FIELDS (va_list_type_node);
6528 f_grtop = DECL_CHAIN (f_stack);
6529 f_vrtop = DECL_CHAIN (f_grtop);
6530 f_groff = DECL_CHAIN (f_vrtop);
6531 f_vroff = DECL_CHAIN (f_groff);
6533 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
6534 f_stack, NULL_TREE);
6535 size = int_size_in_bytes (type);
6536 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
6540 if (aarch64_vfp_is_call_or_return_candidate (mode,
6546 /* TYPE passed in fp/simd registers. */
6547 if (TARGET_GENERAL_REGS_ONLY)
6548 sorry ("%qs and floating point or vector arguments",
6549 "-mgeneral-regs-only");
6551 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
6552 unshare_expr (valist), f_vrtop, NULL_TREE);
6553 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
6554 unshare_expr (valist), f_vroff, NULL_TREE);
6556 rsize = nregs * UNITS_PER_VREG;
6560 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
6561 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
6563 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
6564 && size < UNITS_PER_VREG)
6566 adjust = UNITS_PER_VREG - size;
6571 /* TYPE passed in general registers. */
6572 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
6573 unshare_expr (valist), f_grtop, NULL_TREE);
6574 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
6575 unshare_expr (valist), f_groff, NULL_TREE);
6576 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
6577 nregs = rsize / UNITS_PER_WORD;
6582 if (BLOCK_REG_PADDING (mode, type, 1) == downward
6583 && size < UNITS_PER_WORD)
6585 adjust = UNITS_PER_WORD - size;
6589 /* Get a local temporary for the field value. */
6590 off = get_initialized_tmp_var (f_off, pre_p, NULL);
6592 /* Emit code to branch if off >= 0. */
6593 t = build2 (GE_EXPR, boolean_type_node, off,
6594 build_int_cst (TREE_TYPE (off), 0));
6595 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
6599 /* Emit: offs = (offs + 15) & -16. */
6600 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
6601 build_int_cst (TREE_TYPE (off), 15));
6602 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
6603 build_int_cst (TREE_TYPE (off), -16));
6604 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
6609 /* Update ap.__[g|v]r_offs */
6610 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
6611 build_int_cst (TREE_TYPE (off), rsize));
6612 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
6616 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
6618 /* [cond2] if (ap.__[g|v]r_offs > 0) */
6619 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
6620 build_int_cst (TREE_TYPE (f_off), 0));
6621 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
6623 /* String up: make sure the assignment happens before the use. */
6624 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
6625 COND_EXPR_ELSE (cond1) = t;
6627 /* Prepare the trees handling the argument that is passed on the stack;
6628 the top level node will store in ON_STACK. */
6629 arg = get_initialized_tmp_var (stack, pre_p, NULL);
6632 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
6633 t = fold_convert (intDI_type_node, arg);
6634 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
6635 build_int_cst (TREE_TYPE (t), 15));
6636 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
6637 build_int_cst (TREE_TYPE (t), -16));
6638 t = fold_convert (TREE_TYPE (arg), t);
6639 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
6643 /* Advance ap.__stack */
6644 t = fold_convert (intDI_type_node, arg);
6645 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
6646 build_int_cst (TREE_TYPE (t), size + 7));
6647 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
6648 build_int_cst (TREE_TYPE (t), -8));
6649 t = fold_convert (TREE_TYPE (arg), t);
6650 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
6651 /* String up roundup and advance. */
6653 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
6654 /* String up with arg */
6655 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
6656 /* Big-endianness related address adjustment. */
6657 if (BLOCK_REG_PADDING (mode, type, 1) == downward
6658 && size < UNITS_PER_WORD)
6660 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
6661 size_int (UNITS_PER_WORD - size));
6662 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
6665 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
6666 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
6668 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
6671 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
6672 build_int_cst (TREE_TYPE (off), adjust));
6674 t = fold_convert (sizetype, t);
6675 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
6679 /* type ha; // treat as "struct {ftype field[n];}"
6680 ... [computing offs]
6681 for (i = 0; i <nregs; ++i, offs += 16)
6682 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
6685 tree tmp_ha, field_t, field_ptr_t;
6687 /* Declare a local variable. */
6688 tmp_ha = create_tmp_var_raw (type, "ha");
6689 gimple_add_tmp_var (tmp_ha);
6691 /* Establish the base type. */
6695 field_t = float_type_node;
6696 field_ptr_t = float_ptr_type_node;
6699 field_t = double_type_node;
6700 field_ptr_t = double_ptr_type_node;
6703 field_t = long_double_type_node;
6704 field_ptr_t = long_double_ptr_type_node;
6706 /* The half precision and quad precision are not fully supported yet. Enable
6707 the following code after the support is complete. Need to find the correct
6708 type node for __fp16 *. */
6711 field_t = float_type_node;
6712 field_ptr_t = float_ptr_type_node;
6718 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
6719 field_t = build_vector_type_for_mode (innertype, ag_mode);
6720 field_ptr_t = build_pointer_type (field_t);
6727 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
6728 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
6730 t = fold_convert (field_ptr_t, addr);
6731 t = build2 (MODIFY_EXPR, field_t,
6732 build1 (INDIRECT_REF, field_t, tmp_ha),
6733 build1 (INDIRECT_REF, field_t, t));
6735 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
6736 for (i = 1; i < nregs; ++i)
6738 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
6739 u = fold_convert (field_ptr_t, addr);
6740 u = build2 (MODIFY_EXPR, field_t,
6741 build2 (MEM_REF, field_t, tmp_ha,
6742 build_int_cst (field_ptr_t,
6744 int_size_in_bytes (field_t)))),
6745 build1 (INDIRECT_REF, field_t, u));
6746 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
6749 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
6750 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
6753 COND_EXPR_ELSE (cond2) = t;
6754 addr = fold_convert (build_pointer_type (type), cond1);
6755 addr = build_va_arg_indirect_ref (addr);
6758 addr = build_va_arg_indirect_ref (addr);
6763 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
6766 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
6767 tree type, int *pretend_size ATTRIBUTE_UNUSED,
6770 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6771 CUMULATIVE_ARGS local_cum;
6772 int gr_saved, vr_saved;
6774 /* The caller has advanced CUM up to, but not beyond, the last named
6775 argument. Advance a local copy of CUM past the last "real" named
6776 argument, to find out how many registers are left over. */
6778 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
6780 /* Found out how many registers we need to save. */
6781 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
6782 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
6784 if (TARGET_GENERAL_REGS_ONLY)
6786 if (local_cum.aapcs_nvrn > 0)
6787 sorry ("%qs and floating point or vector arguments",
6788 "-mgeneral-regs-only");
6798 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
6799 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
6800 - gr_saved * UNITS_PER_WORD);
6801 mem = gen_frame_mem (BLKmode, ptr);
6802 set_mem_alias_set (mem, get_varargs_alias_set ());
6804 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
6809 /* We can't use move_block_from_reg, because it will use
6810 the wrong mode, storing D regs only. */
6811 enum machine_mode mode = TImode;
6814 /* Set OFF to the offset from virtual_incoming_args_rtx of
6815 the first vector register. The VR save area lies below
6816 the GR one, and is aligned to 16 bytes. */
6817 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
6818 STACK_BOUNDARY / BITS_PER_UNIT);
6819 off -= vr_saved * UNITS_PER_VREG;
6821 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
6825 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
6826 mem = gen_frame_mem (mode, ptr);
6827 set_mem_alias_set (mem, get_varargs_alias_set ());
6828 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
6829 off += UNITS_PER_VREG;
6834 /* We don't save the size into *PRETEND_SIZE because we want to avoid
6835 any complication of having crtl->args.pretend_args_size changed. */
6836 cfun->machine->saved_varargs_size
6837 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
6838 STACK_BOUNDARY / BITS_PER_UNIT)
6839 + vr_saved * UNITS_PER_VREG);
6843 aarch64_conditional_register_usage (void)
6848 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
6851 call_used_regs[i] = 1;
6856 /* Walk down the type tree of TYPE counting consecutive base elements.
6857 If *MODEP is VOIDmode, then set it to the first valid floating point
6858 type. If a non-floating point type is found, or if a floating point
6859 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
6860 otherwise return the count in the sub-tree. */
6862 aapcs_vfp_sub_candidate (const_tree type, enum machine_mode *modep)
6864 enum machine_mode mode;
6867 switch (TREE_CODE (type))
6870 mode = TYPE_MODE (type);
6871 if (mode != DFmode && mode != SFmode && mode != TFmode)
6874 if (*modep == VOIDmode)
6883 mode = TYPE_MODE (TREE_TYPE (type));
6884 if (mode != DFmode && mode != SFmode && mode != TFmode)
6887 if (*modep == VOIDmode)
6896 /* Use V2SImode and V4SImode as representatives of all 64-bit
6897 and 128-bit vector types. */
6898 size = int_size_in_bytes (type);
6911 if (*modep == VOIDmode)
6914 /* Vector modes are considered to be opaque: two vectors are
6915 equivalent for the purposes of being homogeneous aggregates
6916 if they are the same size. */
6925 tree index = TYPE_DOMAIN (type);
6927 /* Can't handle incomplete types. */
6928 if (!COMPLETE_TYPE_P (type))
6931 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
6934 || !TYPE_MAX_VALUE (index)
6935 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
6936 || !TYPE_MIN_VALUE (index)
6937 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
6941 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
6942 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
6944 /* There must be no padding. */
6945 if (!tree_fits_uhwi_p (TYPE_SIZE (type))
6946 || ((HOST_WIDE_INT) tree_to_uhwi (TYPE_SIZE (type))
6947 != count * GET_MODE_BITSIZE (*modep)))
6959 /* Can't handle incomplete types. */
6960 if (!COMPLETE_TYPE_P (type))
6963 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
6965 if (TREE_CODE (field) != FIELD_DECL)
6968 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
6974 /* There must be no padding. */
6975 if (!tree_fits_uhwi_p (TYPE_SIZE (type))
6976 || ((HOST_WIDE_INT) tree_to_uhwi (TYPE_SIZE (type))
6977 != count * GET_MODE_BITSIZE (*modep)))
6984 case QUAL_UNION_TYPE:
6986 /* These aren't very interesting except in a degenerate case. */
6991 /* Can't handle incomplete types. */
6992 if (!COMPLETE_TYPE_P (type))
6995 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
6997 if (TREE_CODE (field) != FIELD_DECL)
7000 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7003 count = count > sub_count ? count : sub_count;
7006 /* There must be no padding. */
7007 if (!tree_fits_uhwi_p (TYPE_SIZE (type))
7008 || ((HOST_WIDE_INT) tree_to_uhwi (TYPE_SIZE (type))
7009 != count * GET_MODE_BITSIZE (*modep)))
7022 /* Return true if we use LRA instead of reload pass. */
7024 aarch64_lra_p (void)
7026 return aarch64_lra_flag;
7029 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7030 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7031 array types. The C99 floating-point complex types are also considered
7032 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7033 types, which are GCC extensions and out of the scope of AAPCS64, are
7034 treated as composite types here as well.
7036 Note that MODE itself is not sufficient in determining whether a type
7037 is such a composite type or not. This is because
7038 stor-layout.c:compute_record_mode may have already changed the MODE
7039 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7040 structure with only one field may have its MODE set to the mode of the
7041 field. Also an integer mode whose size matches the size of the
7042 RECORD_TYPE type may be used to substitute the original mode
7043 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7044 solely relied on. */
7047 aarch64_composite_type_p (const_tree type,
7048 enum machine_mode mode)
7050 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7054 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7055 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7061 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7062 type as described in AAPCS64 \S 4.1.2.
7064 See the comment above aarch64_composite_type_p for the notes on MODE. */
7067 aarch64_short_vector_p (const_tree type,
7068 enum machine_mode mode)
7070 HOST_WIDE_INT size = -1;
7072 if (type && TREE_CODE (type) == VECTOR_TYPE)
7073 size = int_size_in_bytes (type);
7074 else if (!aarch64_composite_type_p (type, mode)
7075 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
7076 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
7077 size = GET_MODE_SIZE (mode);
7079 return (size == 8 || size == 16) ? true : false;
7082 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
7083 shall be passed or returned in simd/fp register(s) (providing these
7084 parameter passing registers are available).
7086 Upon successful return, *COUNT returns the number of needed registers,
7087 *BASE_MODE returns the mode of the individual register and when IS_HAF
7088 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7089 floating-point aggregate or a homogeneous short-vector aggregate. */
7092 aarch64_vfp_is_call_or_return_candidate (enum machine_mode mode,
7094 enum machine_mode *base_mode,
7098 enum machine_mode new_mode = VOIDmode;
7099 bool composite_p = aarch64_composite_type_p (type, mode);
7101 if (is_ha != NULL) *is_ha = false;
7103 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
7104 || aarch64_short_vector_p (type, mode))
7109 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
7111 if (is_ha != NULL) *is_ha = true;
7113 new_mode = GET_MODE_INNER (mode);
7115 else if (type && composite_p)
7117 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
7119 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
7121 if (is_ha != NULL) *is_ha = true;
7130 *base_mode = new_mode;
7134 /* Implement TARGET_STRUCT_VALUE_RTX. */
7137 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
7138 int incoming ATTRIBUTE_UNUSED)
7140 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
7143 /* Implements target hook vector_mode_supported_p. */
7145 aarch64_vector_mode_supported_p (enum machine_mode mode)
7148 && (mode == V4SImode || mode == V8HImode
7149 || mode == V16QImode || mode == V2DImode
7150 || mode == V2SImode || mode == V4HImode
7151 || mode == V8QImode || mode == V2SFmode
7152 || mode == V4SFmode || mode == V2DFmode
7153 || mode == V1DFmode))
7159 /* Return appropriate SIMD container
7160 for MODE within a vector of WIDTH bits. */
7161 static enum machine_mode
7162 aarch64_simd_container_mode (enum machine_mode mode, unsigned width)
7164 gcc_assert (width == 64 || width == 128);
7203 /* Return 128-bit container as the preferred SIMD mode for MODE. */
7204 static enum machine_mode
7205 aarch64_preferred_simd_mode (enum machine_mode mode)
7207 return aarch64_simd_container_mode (mode, 128);
7210 /* Return the bitmask of possible vector sizes for the vectorizer
7213 aarch64_autovectorize_vector_sizes (void)
7218 /* A table to help perform AArch64-specific name mangling for AdvSIMD
7219 vector types in order to conform to the AAPCS64 (see "Procedure
7220 Call Standard for the ARM 64-bit Architecture", Appendix A). To
7221 qualify for emission with the mangled names defined in that document,
7222 a vector type must not only be of the correct mode but also be
7223 composed of AdvSIMD vector element types (e.g.
7224 _builtin_aarch64_simd_qi); these types are registered by
7225 aarch64_init_simd_builtins (). In other words, vector types defined
7226 in other ways e.g. via vector_size attribute will get default
7230 enum machine_mode mode;
7231 const char *element_type_name;
7232 const char *mangled_name;
7233 } aarch64_simd_mangle_map_entry;
7235 static aarch64_simd_mangle_map_entry aarch64_simd_mangle_map[] = {
7236 /* 64-bit containerized types. */
7237 { V8QImode, "__builtin_aarch64_simd_qi", "10__Int8x8_t" },
7238 { V8QImode, "__builtin_aarch64_simd_uqi", "11__Uint8x8_t" },
7239 { V4HImode, "__builtin_aarch64_simd_hi", "11__Int16x4_t" },
7240 { V4HImode, "__builtin_aarch64_simd_uhi", "12__Uint16x4_t" },
7241 { V2SImode, "__builtin_aarch64_simd_si", "11__Int32x2_t" },
7242 { V2SImode, "__builtin_aarch64_simd_usi", "12__Uint32x2_t" },
7243 { V2SFmode, "__builtin_aarch64_simd_sf", "13__Float32x2_t" },
7244 { V8QImode, "__builtin_aarch64_simd_poly8", "11__Poly8x8_t" },
7245 { V4HImode, "__builtin_aarch64_simd_poly16", "12__Poly16x4_t" },
7246 /* 128-bit containerized types. */
7247 { V16QImode, "__builtin_aarch64_simd_qi", "11__Int8x16_t" },
7248 { V16QImode, "__builtin_aarch64_simd_uqi", "12__Uint8x16_t" },
7249 { V8HImode, "__builtin_aarch64_simd_hi", "11__Int16x8_t" },
7250 { V8HImode, "__builtin_aarch64_simd_uhi", "12__Uint16x8_t" },
7251 { V4SImode, "__builtin_aarch64_simd_si", "11__Int32x4_t" },
7252 { V4SImode, "__builtin_aarch64_simd_usi", "12__Uint32x4_t" },
7253 { V2DImode, "__builtin_aarch64_simd_di", "11__Int64x2_t" },
7254 { V2DImode, "__builtin_aarch64_simd_udi", "12__Uint64x2_t" },
7255 { V4SFmode, "__builtin_aarch64_simd_sf", "13__Float32x4_t" },
7256 { V2DFmode, "__builtin_aarch64_simd_df", "13__Float64x2_t" },
7257 { V16QImode, "__builtin_aarch64_simd_poly8", "12__Poly8x16_t" },
7258 { V8HImode, "__builtin_aarch64_simd_poly16", "12__Poly16x8_t" },
7259 { V2DImode, "__builtin_aarch64_simd_poly64", "12__Poly64x2_t" },
7260 { VOIDmode, NULL, NULL }
7263 /* Implement TARGET_MANGLE_TYPE. */
7266 aarch64_mangle_type (const_tree type)
7268 /* The AArch64 ABI documents say that "__va_list" has to be
7269 managled as if it is in the "std" namespace. */
7270 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
7271 return "St9__va_list";
7273 /* Check the mode of the vector type, and the name of the vector
7274 element type, against the table. */
7275 if (TREE_CODE (type) == VECTOR_TYPE)
7277 aarch64_simd_mangle_map_entry *pos = aarch64_simd_mangle_map;
7279 while (pos->mode != VOIDmode)
7281 tree elt_type = TREE_TYPE (type);
7283 if (pos->mode == TYPE_MODE (type)
7284 && TREE_CODE (TYPE_NAME (elt_type)) == TYPE_DECL
7285 && !strcmp (IDENTIFIER_POINTER (DECL_NAME (TYPE_NAME (elt_type))),
7286 pos->element_type_name))
7287 return pos->mangled_name;
7293 /* Use the default mangling. */
7298 is_mem_p (rtx *x, void *data ATTRIBUTE_UNUSED)
7304 is_memory_op (rtx mem_insn)
7306 rtx pattern = PATTERN (mem_insn);
7307 return for_each_rtx (&pattern, is_mem_p, NULL);
7310 /* Find the first rtx before insn that will generate an assembly
7314 aarch64_prev_real_insn (rtx insn)
7321 insn = prev_real_insn (insn);
7323 while (insn && recog_memoized (insn) < 0);
7329 is_madd_op (enum attr_type t1)
7332 /* A number of these may be AArch32 only. */
7333 enum attr_type mlatypes[] = {
7334 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
7335 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
7336 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
7339 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
7341 if (t1 == mlatypes[i])
7348 /* Check if there is a register dependency between a load and the insn
7349 for which we hold recog_data. */
7352 dep_between_memop_and_curr (rtx memop)
7360 if (!REG_P (SET_DEST (memop)))
7363 load_reg = SET_DEST (memop);
7364 for (opno = 0; opno < recog_data.n_operands; opno++)
7366 rtx operand = recog_data.operand[opno];
7368 && reg_overlap_mentioned_p (load_reg, operand))
7376 aarch64_madd_needs_nop (rtx insn)
7378 enum attr_type attr_type;
7382 if (!aarch64_fix_a53_err835769)
7385 if (recog_memoized (insn) < 0)
7388 attr_type = get_attr_type (insn);
7389 if (!is_madd_op (attr_type))
7392 prev = aarch64_prev_real_insn (insn);
7393 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
7394 Restore recog state to INSN to avoid state corruption. */
7395 extract_constrain_insn_cached (insn);
7400 body = single_set (prev);
7402 /* If the previous insn is a memory op and there is no dependency between
7403 it and the madd, emit a nop between them. If we know the previous insn is
7404 a memory op but body is NULL, emit the nop to be safe, it's probably a
7405 load/store pair insn. */
7406 if (is_memory_op (prev)
7407 && GET_MODE (recog_data.operand[0]) == DImode
7408 && (!dep_between_memop_and_curr (body)))
7416 aarch64_final_prescan_insn (rtx insn)
7418 if (aarch64_madd_needs_nop (insn))
7419 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
7423 /* Return the equivalent letter for size. */
7425 sizetochar (int size)
7429 case 64: return 'd';
7430 case 32: return 's';
7431 case 16: return 'h';
7432 case 8 : return 'b';
7433 default: gcc_unreachable ();
7437 /* Return true iff x is a uniform vector of floating-point
7438 constants, and the constant can be represented in
7439 quarter-precision form. Note, as aarch64_float_const_representable
7440 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
7442 aarch64_vect_float_const_representable_p (rtx x)
7445 REAL_VALUE_TYPE r0, ri;
7448 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
7451 x0 = CONST_VECTOR_ELT (x, 0);
7452 if (!CONST_DOUBLE_P (x0))
7455 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
7457 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
7459 xi = CONST_VECTOR_ELT (x, i);
7460 if (!CONST_DOUBLE_P (xi))
7463 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
7464 if (!REAL_VALUES_EQUAL (r0, ri))
7468 return aarch64_float_const_representable_p (x0);
7471 /* Return true for valid and false for invalid. */
7473 aarch64_simd_valid_immediate (rtx op, enum machine_mode mode, bool inverse,
7474 struct simd_immediate_info *info)
7476 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
7478 for (i = 0; i < idx; i += (STRIDE)) \
7483 immtype = (CLASS); \
7484 elsize = (ELSIZE); \
7490 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
7491 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
7492 unsigned char bytes[16];
7493 int immtype = -1, matches;
7494 unsigned int invmask = inverse ? 0xff : 0;
7497 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
7499 if (! (aarch64_simd_imm_zero_p (op, mode)
7500 || aarch64_vect_float_const_representable_p (op)))
7505 info->value = CONST_VECTOR_ELT (op, 0);
7506 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
7514 /* Splat vector constant out into a byte vector. */
7515 for (i = 0; i < n_elts; i++)
7517 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
7518 it must be laid out in the vector register in reverse order. */
7519 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
7520 unsigned HOST_WIDE_INT elpart;
7521 unsigned int part, parts;
7523 if (GET_CODE (el) == CONST_INT)
7525 elpart = INTVAL (el);
7528 else if (GET_CODE (el) == CONST_DOUBLE)
7530 elpart = CONST_DOUBLE_LOW (el);
7536 for (part = 0; part < parts; part++)
7539 for (byte = 0; byte < innersize; byte++)
7541 bytes[idx++] = (elpart & 0xff) ^ invmask;
7542 elpart >>= BITS_PER_UNIT;
7544 if (GET_CODE (el) == CONST_DOUBLE)
7545 elpart = CONST_DOUBLE_HIGH (el);
7550 gcc_assert (idx == GET_MODE_SIZE (mode));
7554 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
7555 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
7557 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
7558 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
7560 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
7561 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
7563 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
7564 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
7566 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
7568 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
7570 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
7571 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
7573 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
7574 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
7576 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
7577 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
7579 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
7580 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
7582 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
7584 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
7586 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
7587 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
7589 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
7590 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
7592 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
7593 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
7595 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
7596 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
7598 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
7600 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
7601 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
7610 info->element_width = elsize;
7611 info->mvn = emvn != 0;
7612 info->shift = eshift;
7614 unsigned HOST_WIDE_INT imm = 0;
7616 if (immtype >= 12 && immtype <= 15)
7619 /* Un-invert bytes of recognized vector, if necessary. */
7621 for (i = 0; i < idx; i++)
7622 bytes[i] ^= invmask;
7626 /* FIXME: Broken on 32-bit H_W_I hosts. */
7627 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
7629 for (i = 0; i < 8; i++)
7630 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
7631 << (i * BITS_PER_UNIT);
7634 info->value = GEN_INT (imm);
7638 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
7639 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
7641 /* Construct 'abcdefgh' because the assembler cannot handle
7642 generic constants. */
7645 imm = (imm >> info->shift) & 0xff;
7646 info->value = GEN_INT (imm);
7655 aarch64_const_vec_all_same_int_p (rtx x,
7656 HOST_WIDE_INT minval,
7657 HOST_WIDE_INT maxval)
7659 HOST_WIDE_INT firstval;
7662 if (GET_CODE (x) != CONST_VECTOR
7663 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
7666 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
7667 if (firstval < minval || firstval > maxval)
7670 count = CONST_VECTOR_NUNITS (x);
7671 for (i = 1; i < count; i++)
7672 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
7678 /* Check of immediate shift constants are within range. */
7680 aarch64_simd_shift_imm_p (rtx x, enum machine_mode mode, bool left)
7682 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
7684 return aarch64_const_vec_all_same_int_p (x, 0, bit_width - 1);
7686 return aarch64_const_vec_all_same_int_p (x, 1, bit_width);
7689 /* Return true if X is a uniform vector where all elements
7690 are either the floating-point constant 0.0 or the
7691 integer constant 0. */
7693 aarch64_simd_imm_zero_p (rtx x, enum machine_mode mode)
7695 return x == CONST0_RTX (mode);
7699 aarch64_simd_imm_scalar_p (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED)
7701 HOST_WIDE_INT imm = INTVAL (x);
7704 for (i = 0; i < 8; i++)
7706 unsigned int byte = imm & 0xff;
7707 if (byte != 0xff && byte != 0)
7716 aarch64_mov_operand_p (rtx x,
7717 enum aarch64_symbol_context context,
7718 enum machine_mode mode)
7720 if (GET_CODE (x) == HIGH
7721 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7724 if (CONST_INT_P (x) && aarch64_move_imm (INTVAL (x), mode))
7727 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
7730 return aarch64_classify_symbolic_expression (x, context)
7731 == SYMBOL_TINY_ABSOLUTE;
7734 /* Return a const_int vector of VAL. */
7736 aarch64_simd_gen_const_vector_dup (enum machine_mode mode, int val)
7738 int nunits = GET_MODE_NUNITS (mode);
7739 rtvec v = rtvec_alloc (nunits);
7742 for (i=0; i < nunits; i++)
7743 RTVEC_ELT (v, i) = GEN_INT (val);
7745 return gen_rtx_CONST_VECTOR (mode, v);
7748 /* Check OP is a legal scalar immediate for the MOVI instruction. */
7751 aarch64_simd_scalar_immediate_valid_for_move (rtx op, enum machine_mode mode)
7753 enum machine_mode vmode;
7755 gcc_assert (!VECTOR_MODE_P (mode));
7756 vmode = aarch64_preferred_simd_mode (mode);
7757 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
7758 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
7761 /* Construct and return a PARALLEL RTX vector. */
7763 aarch64_simd_vect_par_cnst_half (enum machine_mode mode, bool high)
7765 int nunits = GET_MODE_NUNITS (mode);
7766 rtvec v = rtvec_alloc (nunits / 2);
7767 int base = high ? nunits / 2 : 0;
7771 for (i=0; i < nunits / 2; i++)
7772 RTVEC_ELT (v, i) = GEN_INT (base + i);
7774 t1 = gen_rtx_PARALLEL (mode, v);
7778 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
7779 HIGH (exclusive). */
7781 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high)
7784 gcc_assert (GET_CODE (operand) == CONST_INT);
7785 lane = INTVAL (operand);
7787 if (lane < low || lane >= high)
7788 error ("lane out of range");
7792 aarch64_simd_const_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high)
7794 gcc_assert (GET_CODE (operand) == CONST_INT);
7795 HOST_WIDE_INT lane = INTVAL (operand);
7797 if (lane < low || lane >= high)
7798 error ("constant out of range");
7801 /* Emit code to reinterpret one AdvSIMD type as another,
7802 without altering bits. */
7804 aarch64_simd_reinterpret (rtx dest, rtx src)
7806 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), src));
7809 /* Emit code to place a AdvSIMD pair result in memory locations (with equal
7812 aarch64_simd_emit_pair_result_insn (enum machine_mode mode,
7813 rtx (*intfn) (rtx, rtx, rtx), rtx destaddr,
7816 rtx mem = gen_rtx_MEM (mode, destaddr);
7817 rtx tmp1 = gen_reg_rtx (mode);
7818 rtx tmp2 = gen_reg_rtx (mode);
7820 emit_insn (intfn (tmp1, op1, tmp2));
7822 emit_move_insn (mem, tmp1);
7823 mem = adjust_address (mem, mode, GET_MODE_SIZE (mode));
7824 emit_move_insn (mem, tmp2);
7827 /* Return TRUE if OP is a valid vector addressing mode. */
7829 aarch64_simd_mem_operand_p (rtx op)
7831 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
7832 || GET_CODE (XEXP (op, 0)) == REG);
7835 /* Set up OPERANDS for a register copy from SRC to DEST, taking care
7836 not to early-clobber SRC registers in the process.
7838 We assume that the operands described by SRC and DEST represent a
7839 decomposed copy of OPERANDS[1] into OPERANDS[0]. COUNT is the
7840 number of components into which the copy has been decomposed. */
7842 aarch64_simd_disambiguate_copy (rtx *operands, rtx *dest,
7843 rtx *src, unsigned int count)
7847 if (!reg_overlap_mentioned_p (operands[0], operands[1])
7848 || REGNO (operands[0]) < REGNO (operands[1]))
7850 for (i = 0; i < count; i++)
7852 operands[2 * i] = dest[i];
7853 operands[2 * i + 1] = src[i];
7858 for (i = 0; i < count; i++)
7860 operands[2 * i] = dest[count - i - 1];
7861 operands[2 * i + 1] = src[count - i - 1];
7866 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
7867 one of VSTRUCT modes: OI, CI or XI. */
7869 aarch64_simd_attr_length_move (rtx insn)
7871 enum machine_mode mode;
7873 extract_insn_cached (insn);
7875 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
7877 mode = GET_MODE (recog_data.operand[0]);
7893 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
7894 alignment of a vector to 128 bits. */
7895 static HOST_WIDE_INT
7896 aarch64_simd_vector_alignment (const_tree type)
7898 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
7899 return MIN (align, 128);
7902 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
7904 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
7909 /* We guarantee alignment for vectors up to 128-bits. */
7910 if (tree_int_cst_compare (TYPE_SIZE (type),
7911 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
7914 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
7918 /* If VALS is a vector constant that can be loaded into a register
7919 using DUP, generate instructions to do so and return an RTX to
7920 assign to the register. Otherwise return NULL_RTX. */
7922 aarch64_simd_dup_constant (rtx vals)
7924 enum machine_mode mode = GET_MODE (vals);
7925 enum machine_mode inner_mode = GET_MODE_INNER (mode);
7926 int n_elts = GET_MODE_NUNITS (mode);
7927 bool all_same = true;
7931 if (GET_CODE (vals) != CONST_VECTOR)
7934 for (i = 1; i < n_elts; ++i)
7936 x = CONST_VECTOR_ELT (vals, i);
7937 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
7944 /* We can load this constant by using DUP and a constant in a
7945 single ARM register. This will be cheaper than a vector
7947 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
7948 return gen_rtx_VEC_DUPLICATE (mode, x);
7952 /* Generate code to load VALS, which is a PARALLEL containing only
7953 constants (for vec_init) or CONST_VECTOR, efficiently into a
7954 register. Returns an RTX to copy into the register, or NULL_RTX
7955 for a PARALLEL that can not be converted into a CONST_VECTOR. */
7957 aarch64_simd_make_constant (rtx vals)
7959 enum machine_mode mode = GET_MODE (vals);
7961 rtx const_vec = NULL_RTX;
7962 int n_elts = GET_MODE_NUNITS (mode);
7966 if (GET_CODE (vals) == CONST_VECTOR)
7968 else if (GET_CODE (vals) == PARALLEL)
7970 /* A CONST_VECTOR must contain only CONST_INTs and
7971 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
7972 Only store valid constants in a CONST_VECTOR. */
7973 for (i = 0; i < n_elts; ++i)
7975 rtx x = XVECEXP (vals, 0, i);
7976 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
7979 if (n_const == n_elts)
7980 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
7985 if (const_vec != NULL_RTX
7986 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
7987 /* Load using MOVI/MVNI. */
7989 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
7990 /* Loaded using DUP. */
7992 else if (const_vec != NULL_RTX)
7993 /* Load from constant pool. We can not take advantage of single-cycle
7994 LD1 because we need a PC-relative addressing mode. */
7997 /* A PARALLEL containing something not valid inside CONST_VECTOR.
7998 We can not construct an initializer. */
8003 aarch64_expand_vector_init (rtx target, rtx vals)
8005 enum machine_mode mode = GET_MODE (target);
8006 enum machine_mode inner_mode = GET_MODE_INNER (mode);
8007 int n_elts = GET_MODE_NUNITS (mode);
8008 int n_var = 0, one_var = -1;
8009 bool all_same = true;
8013 x = XVECEXP (vals, 0, 0);
8014 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8015 n_var = 1, one_var = 0;
8017 for (i = 1; i < n_elts; ++i)
8019 x = XVECEXP (vals, 0, i);
8020 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8021 ++n_var, one_var = i;
8023 if (!rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8029 rtx constant = aarch64_simd_make_constant (vals);
8030 if (constant != NULL_RTX)
8032 emit_move_insn (target, constant);
8037 /* Splat a single non-constant element if we can. */
8040 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8041 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8045 /* One field is non-constant. Load constant then overwrite varying
8046 field. This is more efficient than using the stack. */
8049 rtx copy = copy_rtx (vals);
8050 rtx index = GEN_INT (one_var);
8051 enum insn_code icode;
8053 /* Load constant part of vector, substitute neighboring value for
8055 XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, one_var ^ 1);
8056 aarch64_expand_vector_init (target, copy);
8058 /* Insert variable. */
8059 x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var));
8060 icode = optab_handler (vec_set_optab, mode);
8061 gcc_assert (icode != CODE_FOR_nothing);
8062 emit_insn (GEN_FCN (icode) (target, x, index));
8066 /* Construct the vector in memory one field at a time
8067 and load the whole vector. */
8068 mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8069 for (i = 0; i < n_elts; i++)
8070 emit_move_insn (adjust_address_nv (mem, inner_mode,
8071 i * GET_MODE_SIZE (inner_mode)),
8072 XVECEXP (vals, 0, i));
8073 emit_move_insn (target, mem);
8077 static unsigned HOST_WIDE_INT
8078 aarch64_shift_truncation_mask (enum machine_mode mode)
8081 (aarch64_vector_mode_supported_p (mode)
8082 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
8085 #ifndef TLS_SECTION_ASM_FLAG
8086 #define TLS_SECTION_ASM_FLAG 'T'
8090 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8091 tree decl ATTRIBUTE_UNUSED)
8093 char flagchars[10], *f = flagchars;
8095 /* If we have already declared this section, we can use an
8096 abbreviated form to switch back to it -- unless this section is
8097 part of a COMDAT groups, in which case GAS requires the full
8098 declaration every time. */
8099 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8100 && (flags & SECTION_DECLARED))
8102 fprintf (asm_out_file, "\t.section\t%s\n", name);
8106 if (!(flags & SECTION_DEBUG))
8108 if (flags & SECTION_WRITE)
8110 if (flags & SECTION_CODE)
8112 if (flags & SECTION_SMALL)
8114 if (flags & SECTION_MERGE)
8116 if (flags & SECTION_STRINGS)
8118 if (flags & SECTION_TLS)
8119 *f++ = TLS_SECTION_ASM_FLAG;
8120 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8124 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
8126 if (!(flags & SECTION_NOTYPE))
8131 if (flags & SECTION_BSS)
8136 #ifdef TYPE_OPERAND_FMT
8137 format = "," TYPE_OPERAND_FMT;
8142 fprintf (asm_out_file, format, type);
8144 if (flags & SECTION_ENTSIZE)
8145 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
8146 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8148 if (TREE_CODE (decl) == IDENTIFIER_NODE)
8149 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
8151 fprintf (asm_out_file, ",%s,comdat",
8152 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
8156 putc ('\n', asm_out_file);
8159 /* Select a format to encode pointers in exception handling data. */
8161 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
8164 switch (aarch64_cmodel)
8166 case AARCH64_CMODEL_TINY:
8167 case AARCH64_CMODEL_TINY_PIC:
8168 case AARCH64_CMODEL_SMALL:
8169 case AARCH64_CMODEL_SMALL_PIC:
8170 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
8172 type = DW_EH_PE_sdata4;
8175 /* No assumptions here. 8-byte relocs required. */
8176 type = DW_EH_PE_sdata8;
8179 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
8182 /* Emit load exclusive. */
8185 aarch64_emit_load_exclusive (enum machine_mode mode, rtx rval,
8186 rtx mem, rtx model_rtx)
8188 rtx (*gen) (rtx, rtx, rtx);
8192 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
8193 case HImode: gen = gen_aarch64_load_exclusivehi; break;
8194 case SImode: gen = gen_aarch64_load_exclusivesi; break;
8195 case DImode: gen = gen_aarch64_load_exclusivedi; break;
8200 emit_insn (gen (rval, mem, model_rtx));
8203 /* Emit store exclusive. */
8206 aarch64_emit_store_exclusive (enum machine_mode mode, rtx bval,
8207 rtx rval, rtx mem, rtx model_rtx)
8209 rtx (*gen) (rtx, rtx, rtx, rtx);
8213 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
8214 case HImode: gen = gen_aarch64_store_exclusivehi; break;
8215 case SImode: gen = gen_aarch64_store_exclusivesi; break;
8216 case DImode: gen = gen_aarch64_store_exclusivedi; break;
8221 emit_insn (gen (bval, rval, mem, model_rtx));
8224 /* Mark the previous jump instruction as unlikely. */
8227 aarch64_emit_unlikely_jump (rtx insn)
8229 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
8231 insn = emit_jump_insn (insn);
8232 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
8235 /* Expand a compare and swap pattern. */
8238 aarch64_expand_compare_and_swap (rtx operands[])
8240 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
8241 enum machine_mode mode, cmp_mode;
8242 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
8247 oldval = operands[3];
8248 newval = operands[4];
8249 is_weak = operands[5];
8250 mod_s = operands[6];
8251 mod_f = operands[7];
8252 mode = GET_MODE (mem);
8255 /* Normally the succ memory model must be stronger than fail, but in the
8256 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
8257 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
8259 if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
8260 && INTVAL (mod_s) == MEMMODEL_RELEASE)
8261 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
8267 /* For short modes, we're going to perform the comparison in SImode,
8268 so do the zero-extension now. */
8270 rval = gen_reg_rtx (SImode);
8271 oldval = convert_modes (SImode, mode, oldval, true);
8276 /* Force the value into a register if needed. */
8277 if (!aarch64_plus_operand (oldval, mode))
8278 oldval = force_reg (cmp_mode, oldval);
8287 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
8288 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
8289 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
8290 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
8295 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
8297 if (mode == QImode || mode == HImode)
8298 emit_move_insn (operands[1], gen_lowpart (mode, rval));
8300 x = gen_rtx_REG (CCmode, CC_REGNUM);
8301 x = gen_rtx_EQ (SImode, x, const0_rtx);
8302 emit_insn (gen_rtx_SET (VOIDmode, bval, x));
8305 /* Split a compare and swap pattern. */
8308 aarch64_split_compare_and_swap (rtx operands[])
8310 rtx rval, mem, oldval, newval, scratch;
8311 enum machine_mode mode;
8313 rtx label1, label2, x, cond;
8317 oldval = operands[2];
8318 newval = operands[3];
8319 is_weak = (operands[4] != const0_rtx);
8320 scratch = operands[7];
8321 mode = GET_MODE (mem);
8326 label1 = gen_label_rtx ();
8327 emit_label (label1);
8329 label2 = gen_label_rtx ();
8331 aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
8333 cond = aarch64_gen_compare_reg (NE, rval, oldval);
8334 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
8335 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8336 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
8337 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8339 aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
8343 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
8344 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8345 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
8346 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8350 cond = gen_rtx_REG (CCmode, CC_REGNUM);
8351 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
8352 emit_insn (gen_rtx_SET (VOIDmode, cond, x));
8355 emit_label (label2);
8358 /* Split an atomic operation. */
8361 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
8362 rtx value, rtx model_rtx, rtx cond)
8364 enum machine_mode mode = GET_MODE (mem);
8365 enum machine_mode wmode = (mode == DImode ? DImode : SImode);
8368 label = gen_label_rtx ();
8372 new_out = gen_lowpart (wmode, new_out);
8374 old_out = gen_lowpart (wmode, old_out);
8377 value = simplify_gen_subreg (wmode, value, mode, 0);
8379 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
8388 x = gen_rtx_AND (wmode, old_out, value);
8389 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8390 x = gen_rtx_NOT (wmode, new_out);
8391 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8395 if (CONST_INT_P (value))
8397 value = GEN_INT (-INTVAL (value));
8403 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
8404 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8408 aarch64_emit_store_exclusive (mode, cond, mem,
8409 gen_lowpart (mode, new_out), model_rtx);
8411 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
8412 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8413 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
8414 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8418 aarch64_print_extension (void)
8420 const struct aarch64_option_extension *opt = NULL;
8422 for (opt = all_extensions; opt->name != NULL; opt++)
8423 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
8424 asm_fprintf (asm_out_file, "+%s", opt->name);
8426 asm_fprintf (asm_out_file, "\n");
8430 aarch64_start_file (void)
8434 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
8435 aarch64_print_extension ();
8437 else if (selected_cpu)
8439 const char *truncated_name
8440 = aarch64_rewrite_selected_cpu (selected_cpu->name);
8441 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
8442 aarch64_print_extension ();
8444 default_file_start();
8447 /* Target hook for c_mode_for_suffix. */
8448 static enum machine_mode
8449 aarch64_c_mode_for_suffix (char suffix)
8457 /* We can only represent floating point constants which will fit in
8458 "quarter-precision" values. These values are characterised by
8459 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
8462 (-1)^s * (n/16) * 2^r
8465 's' is the sign bit.
8466 'n' is an integer in the range 16 <= n <= 31.
8467 'r' is an integer in the range -3 <= r <= 4. */
8469 /* Return true iff X can be represented by a quarter-precision
8470 floating point immediate operand X. Note, we cannot represent 0.0. */
8472 aarch64_float_const_representable_p (rtx x)
8474 /* This represents our current view of how many bits
8475 make up the mantissa. */
8476 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
8478 unsigned HOST_WIDE_INT mantissa, mask;
8479 HOST_WIDE_INT m1, m2;
8480 REAL_VALUE_TYPE r, m;
8482 if (!CONST_DOUBLE_P (x))
8485 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8487 /* We cannot represent infinities, NaNs or +/-zero. We won't
8488 know if we have +zero until we analyse the mantissa, but we
8489 can reject the other invalid values. */
8490 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
8491 || REAL_VALUE_MINUS_ZERO (r))
8494 /* Extract exponent. */
8495 r = real_value_abs (&r);
8496 exponent = REAL_EXP (&r);
8498 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
8499 highest (sign) bit, with a fixed binary point at bit point_pos.
8500 m1 holds the low part of the mantissa, m2 the high part.
8501 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
8502 bits for the mantissa, this can fail (low bits will be lost). */
8503 real_ldexp (&m, &r, point_pos - exponent);
8504 REAL_VALUE_TO_INT (&m1, &m2, m);
8506 /* If the low part of the mantissa has bits set we cannot represent
8510 /* We have rejected the lower HOST_WIDE_INT, so update our
8511 understanding of how many bits lie in the mantissa and
8512 look only at the high HOST_WIDE_INT. */
8514 point_pos -= HOST_BITS_PER_WIDE_INT;
8516 /* We can only represent values with a mantissa of the form 1.xxxx. */
8517 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
8518 if ((mantissa & mask) != 0)
8521 /* Having filtered unrepresentable values, we may now remove all
8522 but the highest 5 bits. */
8523 mantissa >>= point_pos - 5;
8525 /* We cannot represent the value 0.0, so reject it. This is handled
8530 /* Then, as bit 4 is always set, we can mask it off, leaving
8531 the mantissa in the range [0, 15]. */
8532 mantissa &= ~(1 << 4);
8533 gcc_assert (mantissa <= 15);
8535 /* GCC internally does not use IEEE754-like encoding (where normalized
8536 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
8537 Our mantissa values are shifted 4 places to the left relative to
8538 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
8539 by 5 places to correct for GCC's representation. */
8540 exponent = 5 - exponent;
8542 return (exponent >= 0 && exponent <= 7);
8546 aarch64_output_simd_mov_immediate (rtx const_vector,
8547 enum machine_mode mode,
8551 static char templ[40];
8552 const char *mnemonic;
8553 const char *shift_op;
8554 unsigned int lane_count = 0;
8557 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
8559 /* This will return true to show const_vector is legal for use as either
8560 a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
8561 also update INFO to show how the immediate should be generated. */
8562 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
8563 gcc_assert (is_valid);
8565 element_char = sizetochar (info.element_width);
8566 lane_count = width / info.element_width;
8568 mode = GET_MODE_INNER (mode);
8569 if (mode == SFmode || mode == DFmode)
8571 gcc_assert (info.shift == 0 && ! info.mvn);
8572 if (aarch64_float_const_zero_rtx_p (info.value))
8573 info.value = GEN_INT (0);
8578 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
8579 char float_buf[buf_size] = {'\0'};
8580 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
8583 if (lane_count == 1)
8584 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
8586 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
8587 lane_count, element_char, float_buf);
8592 mnemonic = info.mvn ? "mvni" : "movi";
8593 shift_op = info.msl ? "msl" : "lsl";
8595 if (lane_count == 1)
8596 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
8597 mnemonic, UINTVAL (info.value));
8598 else if (info.shift)
8599 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
8600 ", %s %d", mnemonic, lane_count, element_char,
8601 UINTVAL (info.value), shift_op, info.shift);
8603 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
8604 mnemonic, lane_count, element_char, UINTVAL (info.value));
8609 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
8610 enum machine_mode mode)
8612 enum machine_mode vmode;
8614 gcc_assert (!VECTOR_MODE_P (mode));
8615 vmode = aarch64_simd_container_mode (mode, 64);
8616 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
8617 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
8620 /* Split operands into moves from op[1] + op[2] into op[0]. */
8623 aarch64_split_combinev16qi (rtx operands[3])
8625 unsigned int dest = REGNO (operands[0]);
8626 unsigned int src1 = REGNO (operands[1]);
8627 unsigned int src2 = REGNO (operands[2]);
8628 enum machine_mode halfmode = GET_MODE (operands[1]);
8629 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
8632 gcc_assert (halfmode == V16QImode);
8634 if (src1 == dest && src2 == dest + halfregs)
8636 /* No-op move. Can't split to nothing; emit something. */
8637 emit_note (NOTE_INSN_DELETED);
8641 /* Preserve register attributes for variable tracking. */
8642 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
8643 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
8644 GET_MODE_SIZE (halfmode));
8646 /* Special case of reversed high/low parts. */
8647 if (reg_overlap_mentioned_p (operands[2], destlo)
8648 && reg_overlap_mentioned_p (operands[1], desthi))
8650 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
8651 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
8652 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
8654 else if (!reg_overlap_mentioned_p (operands[2], destlo))
8656 /* Try to avoid unnecessary moves if part of the result
8657 is in the right place already. */
8659 emit_move_insn (destlo, operands[1]);
8660 if (src2 != dest + halfregs)
8661 emit_move_insn (desthi, operands[2]);
8665 if (src2 != dest + halfregs)
8666 emit_move_insn (desthi, operands[2]);
8668 emit_move_insn (destlo, operands[1]);
8672 /* vec_perm support. */
8674 #define MAX_VECT_LEN 16
8676 struct expand_vec_perm_d
8678 rtx target, op0, op1;
8679 unsigned char perm[MAX_VECT_LEN];
8680 enum machine_mode vmode;
8686 /* Generate a variable permutation. */
8689 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
8691 enum machine_mode vmode = GET_MODE (target);
8692 bool one_vector_p = rtx_equal_p (op0, op1);
8694 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
8695 gcc_checking_assert (GET_MODE (op0) == vmode);
8696 gcc_checking_assert (GET_MODE (op1) == vmode);
8697 gcc_checking_assert (GET_MODE (sel) == vmode);
8698 gcc_checking_assert (TARGET_SIMD);
8702 if (vmode == V8QImode)
8704 /* Expand the argument to a V16QI mode by duplicating it. */
8705 rtx pair = gen_reg_rtx (V16QImode);
8706 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
8707 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
8711 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
8718 if (vmode == V8QImode)
8720 pair = gen_reg_rtx (V16QImode);
8721 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
8722 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
8726 pair = gen_reg_rtx (OImode);
8727 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
8728 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
8734 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
8736 enum machine_mode vmode = GET_MODE (target);
8737 unsigned int i, nelt = GET_MODE_NUNITS (vmode);
8738 bool one_vector_p = rtx_equal_p (op0, op1);
8739 rtx rmask[MAX_VECT_LEN], mask;
8741 gcc_checking_assert (!BYTES_BIG_ENDIAN);
8743 /* The TBL instruction does not use a modulo index, so we must take care
8744 of that ourselves. */
8745 mask = GEN_INT (one_vector_p ? nelt - 1 : 2 * nelt - 1);
8746 for (i = 0; i < nelt; ++i)
8748 mask = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rmask));
8749 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
8751 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
8754 /* Recognize patterns suitable for the TRN instructions. */
8756 aarch64_evpc_trn (struct expand_vec_perm_d *d)
8758 unsigned int i, odd, mask, nelt = d->nelt;
8759 rtx out, in0, in1, x;
8760 rtx (*gen) (rtx, rtx, rtx);
8761 enum machine_mode vmode = d->vmode;
8763 if (GET_MODE_UNIT_SIZE (vmode) > 8)
8766 /* Note that these are little-endian tests.
8767 We correct for big-endian later. */
8768 if (d->perm[0] == 0)
8770 else if (d->perm[0] == 1)
8774 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
8776 for (i = 0; i < nelt; i += 2)
8778 if (d->perm[i] != i + odd)
8780 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
8790 if (BYTES_BIG_ENDIAN)
8792 x = in0, in0 = in1, in1 = x;
8801 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
8802 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
8803 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
8804 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
8805 case V4SImode: gen = gen_aarch64_trn2v4si; break;
8806 case V2SImode: gen = gen_aarch64_trn2v2si; break;
8807 case V2DImode: gen = gen_aarch64_trn2v2di; break;
8808 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
8809 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
8810 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
8819 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
8820 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
8821 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
8822 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
8823 case V4SImode: gen = gen_aarch64_trn1v4si; break;
8824 case V2SImode: gen = gen_aarch64_trn1v2si; break;
8825 case V2DImode: gen = gen_aarch64_trn1v2di; break;
8826 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
8827 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
8828 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
8834 emit_insn (gen (out, in0, in1));
8838 /* Recognize patterns suitable for the UZP instructions. */
8840 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
8842 unsigned int i, odd, mask, nelt = d->nelt;
8843 rtx out, in0, in1, x;
8844 rtx (*gen) (rtx, rtx, rtx);
8845 enum machine_mode vmode = d->vmode;
8847 if (GET_MODE_UNIT_SIZE (vmode) > 8)
8850 /* Note that these are little-endian tests.
8851 We correct for big-endian later. */
8852 if (d->perm[0] == 0)
8854 else if (d->perm[0] == 1)
8858 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
8860 for (i = 0; i < nelt; i++)
8862 unsigned elt = (i * 2 + odd) & mask;
8863 if (d->perm[i] != elt)
8873 if (BYTES_BIG_ENDIAN)
8875 x = in0, in0 = in1, in1 = x;
8884 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
8885 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
8886 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
8887 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
8888 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
8889 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
8890 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
8891 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
8892 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
8893 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
8902 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
8903 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
8904 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
8905 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
8906 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
8907 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
8908 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
8909 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
8910 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
8911 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
8917 emit_insn (gen (out, in0, in1));
8921 /* Recognize patterns suitable for the ZIP instructions. */
8923 aarch64_evpc_zip (struct expand_vec_perm_d *d)
8925 unsigned int i, high, mask, nelt = d->nelt;
8926 rtx out, in0, in1, x;
8927 rtx (*gen) (rtx, rtx, rtx);
8928 enum machine_mode vmode = d->vmode;
8930 if (GET_MODE_UNIT_SIZE (vmode) > 8)
8933 /* Note that these are little-endian tests.
8934 We correct for big-endian later. */
8936 if (d->perm[0] == high)
8939 else if (d->perm[0] == 0)
8943 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
8945 for (i = 0; i < nelt / 2; i++)
8947 unsigned elt = (i + high) & mask;
8948 if (d->perm[i * 2] != elt)
8950 elt = (elt + nelt) & mask;
8951 if (d->perm[i * 2 + 1] != elt)
8961 if (BYTES_BIG_ENDIAN)
8963 x = in0, in0 = in1, in1 = x;
8972 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
8973 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
8974 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
8975 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
8976 case V4SImode: gen = gen_aarch64_zip2v4si; break;
8977 case V2SImode: gen = gen_aarch64_zip2v2si; break;
8978 case V2DImode: gen = gen_aarch64_zip2v2di; break;
8979 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
8980 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
8981 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
8990 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
8991 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
8992 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
8993 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
8994 case V4SImode: gen = gen_aarch64_zip1v4si; break;
8995 case V2SImode: gen = gen_aarch64_zip1v2si; break;
8996 case V2DImode: gen = gen_aarch64_zip1v2di; break;
8997 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
8998 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
8999 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9005 emit_insn (gen (out, in0, in1));
9010 aarch64_evpc_dup (struct expand_vec_perm_d *d)
9012 rtx (*gen) (rtx, rtx, rtx);
9013 rtx out = d->target;
9015 enum machine_mode vmode = d->vmode;
9016 unsigned int i, elt, nelt = d->nelt;
9019 /* TODO: This may not be big-endian safe. */
9020 if (BYTES_BIG_ENDIAN)
9024 for (i = 1; i < nelt; i++)
9026 if (elt != d->perm[i])
9030 /* The generic preparation in aarch64_expand_vec_perm_const_1
9031 swaps the operand order and the permute indices if it finds
9032 d->perm[0] to be in the second operand. Thus, we can always
9033 use d->op0 and need not do any extra arithmetic to get the
9034 correct lane number. */
9036 lane = GEN_INT (elt);
9040 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
9041 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
9042 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
9043 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
9044 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
9045 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
9046 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
9047 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
9048 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
9049 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
9054 emit_insn (gen (out, in0, lane));
9059 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
9061 rtx rperm[MAX_VECT_LEN], sel;
9062 enum machine_mode vmode = d->vmode;
9063 unsigned int i, nelt = d->nelt;
9065 /* TODO: ARM's TBL indexing is little-endian. In order to handle GCC's
9066 numbering of elements for big-endian, we must reverse the order. */
9067 if (BYTES_BIG_ENDIAN)
9073 /* Generic code will try constant permutation twice. Once with the
9074 original mode and again with the elements lowered to QImode.
9075 So wait and don't do the selector expansion ourselves. */
9076 if (vmode != V8QImode && vmode != V16QImode)
9079 for (i = 0; i < nelt; ++i)
9080 rperm[i] = GEN_INT (d->perm[i]);
9081 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
9082 sel = force_reg (vmode, sel);
9084 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
9089 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
9091 /* The pattern matching functions above are written to look for a small
9092 number to begin the sequence (0, 1, N/2). If we begin with an index
9093 from the second operand, we can swap the operands. */
9094 if (d->perm[0] >= d->nelt)
9096 unsigned i, nelt = d->nelt;
9099 for (i = 0; i < nelt; ++i)
9100 d->perm[i] = (d->perm[i] + nelt) & (2 * nelt - 1);
9109 if (aarch64_evpc_zip (d))
9111 else if (aarch64_evpc_uzp (d))
9113 else if (aarch64_evpc_trn (d))
9115 else if (aarch64_evpc_dup (d))
9117 return aarch64_evpc_tbl (d);
9122 /* Expand a vec_perm_const pattern. */
9125 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
9127 struct expand_vec_perm_d d;
9134 d.vmode = GET_MODE (target);
9135 gcc_assert (VECTOR_MODE_P (d.vmode));
9136 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
9137 d.testing_p = false;
9139 for (i = which = 0; i < nelt; ++i)
9141 rtx e = XVECEXP (sel, 0, i);
9142 int ei = INTVAL (e) & (2 * nelt - 1);
9143 which |= (ei < nelt ? 1 : 2);
9153 d.one_vector_p = false;
9154 if (!rtx_equal_p (op0, op1))
9157 /* The elements of PERM do not suggest that only the first operand
9158 is used, but both operands are identical. Allow easier matching
9159 of the permutation by folding the permutation into the single
9163 for (i = 0; i < nelt; ++i)
9164 d.perm[i] &= nelt - 1;
9166 d.one_vector_p = true;
9171 d.one_vector_p = true;
9175 return aarch64_expand_vec_perm_const_1 (&d);
9179 aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
9180 const unsigned char *sel)
9182 struct expand_vec_perm_d d;
9183 unsigned int i, nelt, which;
9187 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
9189 memcpy (d.perm, sel, nelt);
9191 /* Calculate whether all elements are in one vector. */
9192 for (i = which = 0; i < nelt; ++i)
9194 unsigned char e = d.perm[i];
9195 gcc_assert (e < 2 * nelt);
9196 which |= (e < nelt ? 1 : 2);
9199 /* If all elements are from the second vector, reindex as if from the
9202 for (i = 0; i < nelt; ++i)
9205 /* Check whether the mask can be applied to a single vector. */
9206 d.one_vector_p = (which != 3);
9208 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
9209 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
9210 if (!d.one_vector_p)
9211 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
9214 ret = aarch64_expand_vec_perm_const_1 (&d);
9220 /* Implement target hook CANNOT_CHANGE_MODE_CLASS. */
9222 aarch64_cannot_change_mode_class (enum machine_mode from,
9223 enum machine_mode to,
9224 enum reg_class rclass)
9226 /* Full-reg subregs are allowed on general regs or any class if they are
9228 if (GET_MODE_SIZE (from) == GET_MODE_SIZE (to)
9229 || !reg_classes_intersect_p (FP_REGS, rclass))
9232 /* Limited combinations of subregs are safe on FPREGs. Particularly,
9233 1. Vector Mode to Scalar mode where 1 unit of the vector is accessed.
9234 2. Scalar to Scalar for integer modes or same size float modes.
9235 3. Vector to Vector modes. */
9236 if (GET_MODE_SIZE (from) > GET_MODE_SIZE (to))
9238 if (aarch64_vector_mode_supported_p (from)
9239 && GET_MODE_SIZE (GET_MODE_INNER (from)) == GET_MODE_SIZE (to))
9242 if (GET_MODE_NUNITS (from) == 1
9243 && GET_MODE_NUNITS (to) == 1
9244 && (GET_MODE_CLASS (from) == MODE_INT
9248 if (aarch64_vector_mode_supported_p (from)
9249 && aarch64_vector_mode_supported_p (to))
9256 #undef TARGET_ADDRESS_COST
9257 #define TARGET_ADDRESS_COST aarch64_address_cost
9259 /* This hook will determines whether unnamed bitfields affect the alignment
9260 of the containing structure. The hook returns true if the structure
9261 should inherit the alignment requirements of an unnamed bitfield's
9263 #undef TARGET_ALIGN_ANON_BITFIELD
9264 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
9266 #undef TARGET_ASM_ALIGNED_DI_OP
9267 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
9269 #undef TARGET_ASM_ALIGNED_HI_OP
9270 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
9272 #undef TARGET_ASM_ALIGNED_SI_OP
9273 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
9275 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
9276 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
9277 hook_bool_const_tree_hwi_hwi_const_tree_true
9279 #undef TARGET_ASM_FILE_START
9280 #define TARGET_ASM_FILE_START aarch64_start_file
9282 #undef TARGET_ASM_OUTPUT_MI_THUNK
9283 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
9285 #undef TARGET_ASM_SELECT_RTX_SECTION
9286 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
9288 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
9289 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
9291 #undef TARGET_BUILD_BUILTIN_VA_LIST
9292 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
9294 #undef TARGET_CALLEE_COPIES
9295 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
9297 #undef TARGET_CAN_ELIMINATE
9298 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
9300 #undef TARGET_CANNOT_FORCE_CONST_MEM
9301 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
9303 #undef TARGET_CONDITIONAL_REGISTER_USAGE
9304 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
9306 /* Only the least significant bit is used for initialization guard
9308 #undef TARGET_CXX_GUARD_MASK_BIT
9309 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
9311 #undef TARGET_C_MODE_FOR_SUFFIX
9312 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
9314 #ifdef TARGET_BIG_ENDIAN_DEFAULT
9315 #undef TARGET_DEFAULT_TARGET_FLAGS
9316 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
9319 #undef TARGET_CLASS_MAX_NREGS
9320 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
9322 #undef TARGET_BUILTIN_DECL
9323 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
9325 #undef TARGET_EXPAND_BUILTIN
9326 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
9328 #undef TARGET_EXPAND_BUILTIN_VA_START
9329 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
9331 #undef TARGET_FOLD_BUILTIN
9332 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
9334 #undef TARGET_FUNCTION_ARG
9335 #define TARGET_FUNCTION_ARG aarch64_function_arg
9337 #undef TARGET_FUNCTION_ARG_ADVANCE
9338 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
9340 #undef TARGET_FUNCTION_ARG_BOUNDARY
9341 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
9343 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
9344 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
9346 #undef TARGET_FUNCTION_VALUE
9347 #define TARGET_FUNCTION_VALUE aarch64_function_value
9349 #undef TARGET_FUNCTION_VALUE_REGNO_P
9350 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
9352 #undef TARGET_FRAME_POINTER_REQUIRED
9353 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
9355 #undef TARGET_GIMPLE_FOLD_BUILTIN
9356 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
9358 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
9359 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
9361 #undef TARGET_INIT_BUILTINS
9362 #define TARGET_INIT_BUILTINS aarch64_init_builtins
9364 #undef TARGET_LEGITIMATE_ADDRESS_P
9365 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
9367 #undef TARGET_LEGITIMATE_CONSTANT_P
9368 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
9370 #undef TARGET_LIBGCC_CMP_RETURN_MODE
9371 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
9374 #define TARGET_LRA_P aarch64_lra_p
9376 #undef TARGET_MANGLE_TYPE
9377 #define TARGET_MANGLE_TYPE aarch64_mangle_type
9379 #undef TARGET_MEMORY_MOVE_COST
9380 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
9382 #undef TARGET_MUST_PASS_IN_STACK
9383 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
9385 /* This target hook should return true if accesses to volatile bitfields
9386 should use the narrowest mode possible. It should return false if these
9387 accesses should use the bitfield container type. */
9388 #undef TARGET_NARROW_VOLATILE_BITFIELD
9389 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
9391 #undef TARGET_OPTION_OVERRIDE
9392 #define TARGET_OPTION_OVERRIDE aarch64_override_options
9394 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
9395 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
9396 aarch64_override_options_after_change
9398 #undef TARGET_PASS_BY_REFERENCE
9399 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
9401 #undef TARGET_PREFERRED_RELOAD_CLASS
9402 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
9404 #undef TARGET_SECONDARY_RELOAD
9405 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
9407 #undef TARGET_SHIFT_TRUNCATION_MASK
9408 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
9410 #undef TARGET_SETUP_INCOMING_VARARGS
9411 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
9413 #undef TARGET_STRUCT_VALUE_RTX
9414 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
9416 #undef TARGET_REGISTER_MOVE_COST
9417 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
9419 #undef TARGET_RETURN_IN_MEMORY
9420 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
9422 #undef TARGET_RETURN_IN_MSB
9423 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
9425 #undef TARGET_RTX_COSTS
9426 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
9428 #undef TARGET_SCHED_ISSUE_RATE
9429 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
9431 #undef TARGET_TRAMPOLINE_INIT
9432 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
9434 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
9435 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
9437 #undef TARGET_VECTOR_MODE_SUPPORTED_P
9438 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
9440 #undef TARGET_ARRAY_MODE_SUPPORTED_P
9441 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
9443 #undef TARGET_VECTORIZE_ADD_STMT_COST
9444 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
9446 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
9447 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
9448 aarch64_builtin_vectorization_cost
9450 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
9451 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
9453 #undef TARGET_VECTORIZE_BUILTINS
9454 #define TARGET_VECTORIZE_BUILTINS
9456 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
9457 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
9458 aarch64_builtin_vectorized_function
9460 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
9461 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
9462 aarch64_autovectorize_vector_sizes
9464 /* Section anchor support. */
9466 #undef TARGET_MIN_ANCHOR_OFFSET
9467 #define TARGET_MIN_ANCHOR_OFFSET -256
9469 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
9470 byte offset; we can do much more for larger data types, but have no way
9471 to determine the size of the access. We assume accesses are aligned. */
9472 #undef TARGET_MAX_ANCHOR_OFFSET
9473 #define TARGET_MAX_ANCHOR_OFFSET 4095
9475 #undef TARGET_VECTOR_ALIGNMENT
9476 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
9478 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
9479 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
9480 aarch64_simd_vector_alignment_reachable
9482 /* vec_perm support. */
9484 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
9485 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
9486 aarch64_vectorize_vec_perm_const_ok
9489 #undef TARGET_FIXED_CONDITION_CODE_REGS
9490 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
9492 #undef TARGET_FLAGS_REGNUM
9493 #define TARGET_FLAGS_REGNUM CC_REGNUM
9495 struct gcc_target targetm = TARGET_INITIALIZER;
9497 #include "gt-aarch64.h"