1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2016 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_STRING
24 #include "coretypes.h"
35 #include "stringpool.h"
40 #include "diagnostic.h"
41 #include "insn-attr.h"
43 #include "fold-const.h"
44 #include "stor-layout.h"
52 #include "langhooks.h"
57 #include "gimple-iterator.h"
58 #include "tree-vectorizer.h"
59 #include "aarch64-cost-tables.h"
63 #include "tm-constrs.h"
64 #include "sched-int.h"
65 #include "cortex-a57-fma-steering.h"
66 #include "target-globals.h"
67 #include "common/common-target.h"
69 /* This file should be included last. */
70 #include "target-def.h"
72 /* Defined for convenience. */
73 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
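/* For example, POINTER_BYTES is 8 under the default LP64 ABI and 4 under
   ILP32 (-mabi=ilp32), where POINTER_SIZE is 32 bits.  */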
75 /* Classifies an address.
78 A simple base register plus immediate offset.
81 A base register indexed by immediate offset with writeback.
84 A base register indexed by (optionally scaled) register.
87 A base register indexed by (optionally scaled) zero-extended register.
90 A base register indexed by (optionally scaled) sign-extended register.
93 A LO_SUM rtx with a base register and "LO12" symbol relocation.
96 A constant symbolic address, in pc-relative literal pool. */
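/* Illustrative AArch64 forms for each class (examples only, not tied to any
   particular pattern in this file):
     base + immediate            ldr  x0, [x1, 16]
     base + imm with writeback   ldr  x0, [x1, 16]!    or    ldr  x0, [x1], 16
     base + register             ldr  x0, [x1, x2, lsl 3]
     base + zero-extended reg    ldr  x0, [x1, w2, uxtw 3]
     base + sign-extended reg    ldr  x0, [x1, w2, sxtw 3]
     LO_SUM ("LO12")             add  x0, x0, :lo12:sym   (paired with adrp)
     literal pool                ldr  x0, .LC0  */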
98 enum aarch64_address_type {
108 struct aarch64_address_info {
109 enum aarch64_address_type type;
113 enum aarch64_symbol_type symbol_type;
116 struct simd_immediate_info
125 /* The current code model. */
126 enum aarch64_code_model aarch64_cmodel;
129 #undef TARGET_HAVE_TLS
130 #define TARGET_HAVE_TLS 1
133 static bool aarch64_composite_type_p (const_tree, machine_mode);
134 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
136 machine_mode *, int *,
138 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
139 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
140 static void aarch64_override_options_after_change (void);
141 static bool aarch64_vector_mode_supported_p (machine_mode);
142 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
143 const unsigned char *sel);
144 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
146 /* Major revision number of the ARM Architecture implemented by the target. */
147 unsigned aarch64_architecture_version;
149 /* The processor for which instructions should be scheduled. */
150 enum aarch64_processor aarch64_tune = cortexa53;
152 /* Mask to specify which instruction scheduling options should be used. */
153 unsigned long aarch64_tune_flags = 0;
155 /* Global flag for PC relative loads. */
156 bool aarch64_pcrelative_literal_loads;
158 /* Support for command line parsing of boolean flags in the tuning structures.  */
160 struct aarch64_flag_desc
166 #define AARCH64_FUSION_PAIR(name, internal_name) \
167 { name, AARCH64_FUSE_##internal_name },
168 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
170 { "none", AARCH64_FUSE_NOTHING },
171 #include "aarch64-fusion-pairs.def"
172 { "all", AARCH64_FUSE_ALL },
173 { NULL, AARCH64_FUSE_NOTHING }
175 #undef AARCH64_FUSION_PAIR
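/* Each AARCH64_FUSION_PAIR entry in aarch64-fusion-pairs.def expands to one
   row of the table above; for instance (illustrative),
     AARCH64_FUSION_PAIR ("mov+movk", MOV_MOVK)
   becomes { "mov+movk", AARCH64_FUSE_MOV_MOVK }, making the pair selectable
   through the -moverride fuse= string parsed later in this file.  */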
177 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
178 { name, AARCH64_EXTRA_TUNE_##internal_name },
179 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
181 { "none", AARCH64_EXTRA_TUNE_NONE },
182 #include "aarch64-tuning-flags.def"
183 { "all", AARCH64_EXTRA_TUNE_ALL },
184 { NULL, AARCH64_EXTRA_TUNE_NONE }
186 #undef AARCH64_EXTRA_TUNING_OPTION
188 /* Tuning parameters. */
190 static const struct cpu_addrcost_table generic_addrcost_table =
200 0, /* register_offset */
201 0, /* register_sextend */
202 0, /* register_zextend */
206 static const struct cpu_addrcost_table cortexa57_addrcost_table =
216 0, /* register_offset */
217 0, /* register_sextend */
218 0, /* register_zextend */
222 static const struct cpu_addrcost_table exynosm1_addrcost_table =
232 1, /* register_offset */
233 1, /* register_sextend */
234 2, /* register_zextend */
238 static const struct cpu_addrcost_table xgene1_addrcost_table =
248 0, /* register_offset */
249 1, /* register_sextend */
250 1, /* register_zextend */
254 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
264 0, /* register_offset */
265 0, /* register_sextend */
266 0, /* register_zextend */
270 static const struct cpu_addrcost_table vulcan_addrcost_table =
280 2, /* register_offset */
281 3, /* register_sextend */
282 3, /* register_zextend */
286 static const struct cpu_regmove_cost generic_regmove_cost =
289 /* Avoid the use of slow int<->fp moves for spilling by setting
290 their cost higher than memmov_cost. */
296 static const struct cpu_regmove_cost cortexa57_regmove_cost =
299 /* Avoid the use of slow int<->fp moves for spilling by setting
300 their cost higher than memmov_cost. */
306 static const struct cpu_regmove_cost cortexa53_regmove_cost =
309 /* Avoid the use of slow int<->fp moves for spilling by setting
310 their cost higher than memmov_cost. */
316 static const struct cpu_regmove_cost exynosm1_regmove_cost =
319 /* Avoid the use of slow int<->fp moves for spilling by setting
320 their cost higher than memmov_cost (the actual costs are 4 and 9).  */
326 static const struct cpu_regmove_cost thunderx_regmove_cost =
334 static const struct cpu_regmove_cost xgene1_regmove_cost =
337 /* Avoid the use of slow int<->fp moves for spilling by setting
338 their cost higher than memmov_cost. */
344 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
347 /* Avoid the use of int<->fp moves for spilling. */
353 static const struct cpu_regmove_cost vulcan_regmove_cost =
356 /* Avoid the use of int<->fp moves for spilling. */
362 /* Generic costs for vector insn classes. */
363 static const struct cpu_vector_cost generic_vector_cost =
365 1, /* scalar_stmt_cost */
366 1, /* scalar_load_cost */
367 1, /* scalar_store_cost */
368 1, /* vec_stmt_cost */
369 2, /* vec_permute_cost */
370 1, /* vec_to_scalar_cost */
371 1, /* scalar_to_vec_cost */
372 1, /* vec_align_load_cost */
373 1, /* vec_unalign_load_cost */
374 1, /* vec_unalign_store_cost */
375 1, /* vec_store_cost */
376 3, /* cond_taken_branch_cost */
377 1 /* cond_not_taken_branch_cost */
380 /* ThunderX costs for vector insn classes. */
381 static const struct cpu_vector_cost thunderx_vector_cost =
383 1, /* scalar_stmt_cost */
384 3, /* scalar_load_cost */
385 1, /* scalar_store_cost */
386 4, /* vec_stmt_cost */
387 4, /* vec_permute_cost */
388 2, /* vec_to_scalar_cost */
389 2, /* scalar_to_vec_cost */
390 3, /* vec_align_load_cost */
391 10, /* vec_unalign_load_cost */
392 10, /* vec_unalign_store_cost */
393 1, /* vec_store_cost */
394 3, /* cond_taken_branch_cost */
395 3 /* cond_not_taken_branch_cost */
398 /* Cortex-A57 costs for vector insn classes.  */
399 static const struct cpu_vector_cost cortexa57_vector_cost =
401 1, /* scalar_stmt_cost */
402 4, /* scalar_load_cost */
403 1, /* scalar_store_cost */
404 2, /* vec_stmt_cost */
405 3, /* vec_permute_cost */
406 8, /* vec_to_scalar_cost */
407 8, /* scalar_to_vec_cost */
408 4, /* vec_align_load_cost */
409 4, /* vec_unalign_load_cost */
410 1, /* vec_unalign_store_cost */
411 1, /* vec_store_cost */
412 1, /* cond_taken_branch_cost */
413 1 /* cond_not_taken_branch_cost */
416 static const struct cpu_vector_cost exynosm1_vector_cost =
418 1, /* scalar_stmt_cost */
419 5, /* scalar_load_cost */
420 1, /* scalar_store_cost */
421 3, /* vec_stmt_cost */
422 3, /* vec_permute_cost */
423 3, /* vec_to_scalar_cost */
424 3, /* scalar_to_vec_cost */
425 5, /* vec_align_load_cost */
426 5, /* vec_unalign_load_cost */
427 1, /* vec_unalign_store_cost */
428 1, /* vec_store_cost */
429 1, /* cond_taken_branch_cost */
430 1 /* cond_not_taken_branch_cost */
433 /* X-Gene 1 costs for vector insn classes.  */
434 static const struct cpu_vector_cost xgene1_vector_cost =
436 1, /* scalar_stmt_cost */
437 5, /* scalar_load_cost */
438 1, /* scalar_store_cost */
439 2, /* vec_stmt_cost */
440 2, /* vec_permute_cost */
441 4, /* vec_to_scalar_cost */
442 4, /* scalar_to_vec_cost */
443 10, /* vec_align_load_cost */
444 10, /* vec_unalign_load_cost */
445 2, /* vec_unalign_store_cost */
446 2, /* vec_store_cost */
447 2, /* cond_taken_branch_cost */
448 1 /* cond_not_taken_branch_cost */
451 /* Costs for vector insn classes for Vulcan. */
452 static const struct cpu_vector_cost vulcan_vector_cost =
454 6, /* scalar_stmt_cost */
455 4, /* scalar_load_cost */
456 1, /* scalar_store_cost */
457 6, /* vec_stmt_cost */
458 3, /* vec_permute_cost */
459 6, /* vec_to_scalar_cost */
460 5, /* scalar_to_vec_cost */
461 8, /* vec_align_load_cost */
462 8, /* vec_unalign_load_cost */
463 4, /* vec_unalign_store_cost */
464 4, /* vec_store_cost */
465 2, /* cond_taken_branch_cost */
466 1 /* cond_not_taken_branch_cost */
469 /* Generic costs for branch instructions. */
470 static const struct cpu_branch_cost generic_branch_cost =
472 2, /* Predictable. */
473 2 /* Unpredictable. */
476 /* Branch costs for Cortex-A57. */
477 static const struct cpu_branch_cost cortexa57_branch_cost =
479 1, /* Predictable. */
480 3 /* Unpredictable. */
483 /* Branch costs for Vulcan. */
484 static const struct cpu_branch_cost vulcan_branch_cost =
486 1, /* Predictable. */
487 3 /* Unpredictable. */
490 /* Generic approximation modes. */
491 static const cpu_approx_modes generic_approx_modes =
493 AARCH64_APPROX_NONE, /* division */
494 AARCH64_APPROX_NONE, /* sqrt */
495 AARCH64_APPROX_NONE /* recip_sqrt */
498 /* Approximation modes for Exynos M1. */
499 static const cpu_approx_modes exynosm1_approx_modes =
501 AARCH64_APPROX_NONE, /* division */
502 AARCH64_APPROX_ALL, /* sqrt */
503 AARCH64_APPROX_ALL /* recip_sqrt */
506 /* Approximation modes for X-Gene 1. */
507 static const cpu_approx_modes xgene1_approx_modes =
509 AARCH64_APPROX_NONE, /* division */
510 AARCH64_APPROX_NONE, /* sqrt */
511 AARCH64_APPROX_ALL /* recip_sqrt */
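/* A core whose table enables AARCH64_APPROX_ALL for sqrt or recip_sqrt (such
   as Exynos M1 above) permits expanding sqrt (x) or 1/sqrt (x), under the
   appropriate -mlow-precision-* / fast-math options, into an FRSQRTE estimate
   refined by FRSQRTS Newton-Raphson steps instead of an FSQRT/FDIV sequence.
   This is an illustrative note; the actual expansion lives elsewhere in this
   file.  */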
514 static const struct tune_params generic_tunings =
516 &cortexa57_extra_costs,
517 &generic_addrcost_table,
518 &generic_regmove_cost,
519 &generic_vector_cost,
520 &generic_branch_cost,
521 &generic_approx_modes,
524 AARCH64_FUSE_NOTHING, /* fusible_ops */
525 8, /* function_align. */
528 2, /* int_reassoc_width. */
529 4, /* fp_reassoc_width. */
530 1, /* vec_reassoc_width. */
531 2, /* min_div_recip_mul_sf. */
532 2, /* min_div_recip_mul_df. */
533 0, /* max_case_values. */
534 0, /* cache_line_size. */
535 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
536 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
539 static const struct tune_params cortexa35_tunings =
541 &cortexa53_extra_costs,
542 &generic_addrcost_table,
543 &cortexa53_regmove_cost,
544 &generic_vector_cost,
545 &cortexa57_branch_cost,
546 &generic_approx_modes,
549 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
550 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
551 16, /* function_align. */
554 2, /* int_reassoc_width. */
555 4, /* fp_reassoc_width. */
556 1, /* vec_reassoc_width. */
557 2, /* min_div_recip_mul_sf. */
558 2, /* min_div_recip_mul_df. */
559 0, /* max_case_values. */
560 0, /* cache_line_size. */
561 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
562 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
565 static const struct tune_params cortexa53_tunings =
567 &cortexa53_extra_costs,
568 &generic_addrcost_table,
569 &cortexa53_regmove_cost,
570 &generic_vector_cost,
571 &cortexa57_branch_cost,
572 &generic_approx_modes,
575 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
576 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
577 16, /* function_align. */
580 2, /* int_reassoc_width. */
581 4, /* fp_reassoc_width. */
582 1, /* vec_reassoc_width. */
583 2, /* min_div_recip_mul_sf. */
584 2, /* min_div_recip_mul_df. */
585 0, /* max_case_values. */
586 0, /* cache_line_size. */
587 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
588 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
591 static const struct tune_params cortexa57_tunings =
593 &cortexa57_extra_costs,
594 &cortexa57_addrcost_table,
595 &cortexa57_regmove_cost,
596 &cortexa57_vector_cost,
597 &cortexa57_branch_cost,
598 &generic_approx_modes,
601 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
602 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
603 16, /* function_align. */
606 2, /* int_reassoc_width. */
607 4, /* fp_reassoc_width. */
608 1, /* vec_reassoc_width. */
609 2, /* min_div_recip_mul_sf. */
610 2, /* min_div_recip_mul_df. */
611 0, /* max_case_values. */
612 0, /* cache_line_size. */
613 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
614 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
617 static const struct tune_params cortexa72_tunings =
619 &cortexa57_extra_costs,
620 &cortexa57_addrcost_table,
621 &cortexa57_regmove_cost,
622 &cortexa57_vector_cost,
623 &cortexa57_branch_cost,
624 &generic_approx_modes,
627 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
628 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
629 16, /* function_align. */
632 2, /* int_reassoc_width. */
633 4, /* fp_reassoc_width. */
634 1, /* vec_reassoc_width. */
635 2, /* min_div_recip_mul_sf. */
636 2, /* min_div_recip_mul_df. */
637 0, /* max_case_values. */
638 0, /* cache_line_size. */
639 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
640 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
643 static const struct tune_params cortexa73_tunings =
645 &cortexa57_extra_costs,
646 &cortexa57_addrcost_table,
647 &cortexa57_regmove_cost,
648 &cortexa57_vector_cost,
649 &cortexa57_branch_cost,
650 &generic_approx_modes,
651 4, /* memmov_cost. */
653 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
654 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
655 16, /* function_align. */
658 2, /* int_reassoc_width. */
659 4, /* fp_reassoc_width. */
660 1, /* vec_reassoc_width. */
661 2, /* min_div_recip_mul_sf. */
662 2, /* min_div_recip_mul_df. */
663 0, /* max_case_values. */
664 0, /* cache_line_size. */
665 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
666 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
669 static const struct tune_params exynosm1_tunings =
671 &exynosm1_extra_costs,
672 &exynosm1_addrcost_table,
673 &exynosm1_regmove_cost,
674 &exynosm1_vector_cost,
675 &generic_branch_cost,
676 &exynosm1_approx_modes,
679 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
680 4, /* function_align. */
683 2, /* int_reassoc_width. */
684 4, /* fp_reassoc_width. */
685 1, /* vec_reassoc_width. */
686 2, /* min_div_recip_mul_sf. */
687 2, /* min_div_recip_mul_df. */
688 48, /* max_case_values. */
689 64, /* cache_line_size. */
690 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
691 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
694 static const struct tune_params thunderx_tunings =
696 &thunderx_extra_costs,
697 &generic_addrcost_table,
698 &thunderx_regmove_cost,
699 &thunderx_vector_cost,
700 &generic_branch_cost,
701 &generic_approx_modes,
704 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
705 8, /* function_align. */
708 2, /* int_reassoc_width. */
709 4, /* fp_reassoc_width. */
710 1, /* vec_reassoc_width. */
711 2, /* min_div_recip_mul_sf. */
712 2, /* min_div_recip_mul_df. */
713 0, /* max_case_values. */
714 0, /* cache_line_size. */
715 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
716 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags. */
719 static const struct tune_params xgene1_tunings =
722 &xgene1_addrcost_table,
723 &xgene1_regmove_cost,
725 &generic_branch_cost,
726 &xgene1_approx_modes,
729 AARCH64_FUSE_NOTHING, /* fusible_ops */
730 16, /* function_align. */
732 16, /* loop_align. */
733 2, /* int_reassoc_width. */
734 4, /* fp_reassoc_width. */
735 1, /* vec_reassoc_width. */
736 2, /* min_div_recip_mul_sf. */
737 2, /* min_div_recip_mul_df. */
738 0, /* max_case_values. */
739 0, /* cache_line_size. */
740 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
741 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
744 static const struct tune_params qdf24xx_tunings =
746 &qdf24xx_extra_costs,
747 &qdf24xx_addrcost_table,
748 &qdf24xx_regmove_cost,
749 &generic_vector_cost,
750 &generic_branch_cost,
751 &generic_approx_modes,
754 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
755 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
756 16, /* function_align. */
758 16, /* loop_align. */
759 2, /* int_reassoc_width. */
760 4, /* fp_reassoc_width. */
761 1, /* vec_reassoc_width. */
762 2, /* min_div_recip_mul_sf. */
763 2, /* min_div_recip_mul_df. */
764 0, /* max_case_values. */
765 64, /* cache_line_size. */
766 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
767 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
770 static const struct tune_params vulcan_tunings =
773 &vulcan_addrcost_table,
774 &vulcan_regmove_cost,
777 &generic_approx_modes,
778 4, /* memmov_cost. */
780 AARCH64_FUSE_NOTHING, /* fusible_ops.  */
781 16, /* function_align. */
783 16, /* loop_align. */
784 3, /* int_reassoc_width. */
785 2, /* fp_reassoc_width. */
786 2, /* vec_reassoc_width. */
787 2, /* min_div_recip_mul_sf. */
788 2, /* min_div_recip_mul_df. */
789 0, /* max_case_values. */
790 64, /* cache_line_size. */
791 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
792 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
795 /* Support for fine-grained override of the tuning structures. */
796 struct aarch64_tuning_override_function
799 void (*parse_override)(const char*, struct tune_params*);
802 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
803 static void aarch64_parse_tune_string (const char*, struct tune_params*);
805 static const struct aarch64_tuning_override_function
806 aarch64_tuning_override_functions[] =
808 { "fuse", aarch64_parse_fuse_string },
809 { "tune", aarch64_parse_tune_string },
813 /* A processor implementing AArch64. */
816 const char *const name;
817 enum aarch64_processor ident;
818 enum aarch64_processor sched_core;
819 enum aarch64_arch arch;
820 unsigned architecture_version;
821 const unsigned long flags;
822 const struct tune_params *const tune;
825 /* Architectures implementing AArch64. */
826 static const struct processor all_architectures[] =
828 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
829 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
830 #include "aarch64-arches.def"
832 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
835 /* Processor cores implementing AArch64. */
836 static const struct processor all_cores[] =
838 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
839 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
840 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
841 FLAGS, &COSTS##_tunings},
842 #include "aarch64-cores.def"
844 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
845 AARCH64_FL_FOR_ARCH8, &generic_tunings},
846 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
850 /* Target specification. These are populated by the -march, -mtune, -mcpu
851 handling code or by target attributes. */
852 static const struct processor *selected_arch;
853 static const struct processor *selected_cpu;
854 static const struct processor *selected_tune;
856 /* The current tuning set. */
857 struct tune_params aarch64_tune_params = generic_tunings;
859 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
861 /* An ISA extension in the co-processor and main instruction set space. */
862 struct aarch64_option_extension
864 const char *const name;
865 const unsigned long flags_on;
866 const unsigned long flags_off;
869 typedef enum aarch64_cond_code
871 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
872 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
873 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
877 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
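/* The enum above is laid out so that each condition and its inverse differ
   only in bit 0: EQ (0) / NE (1), CS (2) / CC (3), ..., GE (10) / LT (11),
   GT (12) / LE (13).  Flipping the low bit therefore yields the inverse
   condition, which is all the macro above does.  */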
879 /* The condition codes of the processor, and the inverse function. */
880 static const char * const aarch64_condition_codes[] =
882 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
883 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
886 /* Generate code to enable conditional branches in functions over 1 MiB. */
888 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
889 const char * branch_format)
891 rtx_code_label * tmp_label = gen_label_rtx ();
894 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
895 CODE_LABEL_NUMBER (tmp_label));
896 const char *label_ptr = targetm.strip_name_encoding (label_buf);
897 rtx dest_label = operands[pos_label];
898 operands[pos_label] = tmp_label;
900 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
901 output_asm_insn (buffer, operands);
903 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
904 operands[pos_label] = dest_label;
905 output_asm_insn (buffer, operands);
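/* Sketch of the emitted sequence (illustrative; the label name here is made
   up): for a conditional branch whose target is out of range, the caller
   passes the *inverted* branch as BRANCH_FORMAT, so a far "cbz x0, target"
   comes out roughly as

	cbnz	x0, .Lcb4
	b	target
   .Lcb4:

   i.e. a short inverted branch that skips over an unconditional branch able
   to reach the distant target.  */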
910 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
912 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
913 if (TARGET_GENERAL_REGS_ONLY)
914 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
916 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
919 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
920 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
921 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
922 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
923 cost (in this case the best class is the lowest cost one). Using ALL_REGS
924 irrespectively of its cost results in bad allocations with many redundant
925 int<->FP moves which are expensive on various cores.
926 To avoid this we don't allow ALL_REGS as the allocno class, but force a
927 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
928 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
929 Otherwise set the allocno class depending on the mode.
930 The result of this is that it is no longer inefficient to have a higher
931 memory move cost than the register move cost.
935 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
936 reg_class_t best_class)
938 enum machine_mode mode;
940 if (allocno_class != ALL_REGS)
941 return allocno_class;
943 if (best_class != ALL_REGS)
946 mode = PSEUDO_REGNO_MODE (regno);
947 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
951 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
953 if (GET_MODE_UNIT_SIZE (mode) == 4)
954 return aarch64_tune_params.min_div_recip_mul_sf;
955 return aarch64_tune_params.min_div_recip_mul_df;
959 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
960 enum machine_mode mode)
962 if (VECTOR_MODE_P (mode))
963 return aarch64_tune_params.vec_reassoc_width;
964 if (INTEGRAL_MODE_P (mode))
965 return aarch64_tune_params.int_reassoc_width;
966 if (FLOAT_MODE_P (mode))
967 return aarch64_tune_params.fp_reassoc_width;
971 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
973 aarch64_dbx_register_number (unsigned regno)
975 if (GP_REGNUM_P (regno))
976 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
977 else if (regno == SP_REGNUM)
978 return AARCH64_DWARF_SP;
979 else if (FP_REGNUM_P (regno))
980 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
982 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
983 equivalent DWARF register. */
984 return DWARF_FRAME_REGISTERS;
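/* For example, x0-x30 map to DWARF registers 0-30, sp to 31 and v0-v31 to
   64-95; anything else (e.g. the condition flags) reports "no DWARF
   equivalent" through the return value above.  */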
987 /* Return TRUE if MODE is any of the large INT modes. */
989 aarch64_vect_struct_mode_p (machine_mode mode)
991 return mode == OImode || mode == CImode || mode == XImode;
994 /* Return TRUE if MODE is any of the vector modes. */
996 aarch64_vector_mode_p (machine_mode mode)
998 return aarch64_vector_mode_supported_p (mode)
999 || aarch64_vect_struct_mode_p (mode);
1002 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1004 aarch64_array_mode_supported_p (machine_mode mode,
1005 unsigned HOST_WIDE_INT nelems)
1008 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1009 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1010 && (nelems >= 2 && nelems <= 4))
1016 /* Implement HARD_REGNO_NREGS. */
1019 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1021 switch (aarch64_regno_regclass (regno))
1025 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1027 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1032 /* Implement HARD_REGNO_MODE_OK. */
1035 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1037 if (GET_MODE_CLASS (mode) == MODE_CC)
1038 return regno == CC_REGNUM;
1040 if (regno == SP_REGNUM)
1041 /* The purpose of comparing with ptr_mode is to support the
1042 global register variable associated with the stack pointer
1043 register via the syntax of asm ("wsp") in ILP32. */
1044 return mode == Pmode || mode == ptr_mode;
1046 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1047 return mode == Pmode;
1049 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1052 if (FP_REGNUM_P (regno))
1054 if (aarch64_vect_struct_mode_p (mode))
1056 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1064 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1066 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1069 /* Handle modes that fit within single registers. */
1070 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1072 if (GET_MODE_SIZE (mode) >= 4)
1077 /* Fall back to generic for multi-reg and very large modes. */
1079 return choose_hard_reg_mode (regno, nregs, false);
1082 /* Return true if calls to DECL should be treated as
1083 long-calls (i.e. called via a register).  */
1085 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1090 /* Return true if calls to symbol-ref SYM should be treated as
1091 long-calls (i.e. called via a register).  */
1093 aarch64_is_long_call_p (rtx sym)
1095 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1098 /* Return true if calls to symbol-ref SYM should not go through PLT entries.  */
1102 aarch64_is_noplt_call_p (rtx sym)
1104 const_tree decl = SYMBOL_REF_DECL (sym);
1109 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1110 && !targetm.binds_local_p (decl))
1116 /* Return true if the offsets to a zero/sign-extract operation
1117 represent an expression that matches an extend operation. The
1118 operands represent the parameters from
1120 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1122 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1125 HOST_WIDE_INT mult_val, extract_val;
1127 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1130 mult_val = INTVAL (mult_imm);
1131 extract_val = INTVAL (extract_imm);
1134 && extract_val < GET_MODE_BITSIZE (mode)
1135 && exact_log2 (extract_val & ~7) > 0
1136 && (extract_val & 7) <= 4
1137 && mult_val == (1 << (extract_val & 7)))
1143 /* Emit an insn that's a simple single-set. Both the operands must be
1144 known to be valid. */
1146 emit_set_insn (rtx x, rtx y)
1148 return emit_insn (gen_rtx_SET (x, y));
1151 /* X and Y are two things to compare using CODE. Emit the compare insn and
1152 return the rtx for register 0 in the proper mode. */
1154 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1156 machine_mode mode = SELECT_CC_MODE (code, x, y);
1157 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1159 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1163 /* Build the SYMBOL_REF for __tls_get_addr. */
1165 static GTY(()) rtx tls_get_addr_libfunc;
1168 aarch64_tls_get_addr (void)
1170 if (!tls_get_addr_libfunc)
1171 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1172 return tls_get_addr_libfunc;
1175 /* Return the TLS model to use for ADDR. */
1177 static enum tls_model
1178 tls_symbolic_operand_type (rtx addr)
1180 enum tls_model tls_kind = TLS_MODEL_NONE;
1183 if (GET_CODE (addr) == CONST)
1185 split_const (addr, &sym, &addend);
1186 if (GET_CODE (sym) == SYMBOL_REF)
1187 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1189 else if (GET_CODE (addr) == SYMBOL_REF)
1190 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1195 /* We allow LO_SUM rtxes as legitimate addresses so that combine can take
1196 care of merging addresses where necessary, but for code generation
1197 purposes we generate the address as:
1200 tmp = hi (symbol_ref); adrp x1, foo
1201 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1205 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1206 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1210 Load TLS symbol, depending on TLS mechanism and TLS access model.
1212 Global Dynamic - Traditional TLS:
1213 adrp tmp, :tlsgd:imm
1214 add dest, tmp, #:tlsgd_lo12:imm
1217 Global Dynamic - TLS Descriptors:
1218 adrp dest, :tlsdesc:imm
1219 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1220 add dest, dest, #:tlsdesc_lo12:imm
1227 adrp tmp, :gottprel:imm
1228 ldr dest, [tmp, #:gottprel_lo12:imm]
1233 add t0, tp, #:tprel_hi12:imm, lsl #12
1234 add t0, t0, #:tprel_lo12_nc:imm
1238 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1239 enum aarch64_symbol_type type)
1243 case SYMBOL_SMALL_ABSOLUTE:
1245 /* In ILP32, the mode of dest can be either SImode or DImode. */
1247 machine_mode mode = GET_MODE (dest);
1249 gcc_assert (mode == Pmode || mode == ptr_mode);
1251 if (can_create_pseudo_p ())
1252 tmp_reg = gen_reg_rtx (mode);
1254 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1255 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1259 case SYMBOL_TINY_ABSOLUTE:
1260 emit_insn (gen_rtx_SET (dest, imm));
1263 case SYMBOL_SMALL_GOT_28K:
1265 machine_mode mode = GET_MODE (dest);
1266 rtx gp_rtx = pic_offset_table_rtx;
1270 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1271 here before rtl expand. Tree IVOPT will generate rtl pattern to
1272 decide rtx costs, in which case pic_offset_table_rtx is not
1273 initialized. For that case no need to generate the first adrp
1274 instruction as the final cost for global variable access is one instruction.  */
1278 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we use
1279 the page base as the GOT base, the first page may be wasted; in the
1280 worst case only 28K of GOT space remains).
1282 The generated instruction sequence for accessing a global variable is:
1285 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1287 Only one instruction is needed, but we must initialize
1288 pic_offset_table_rtx properly.  We generate the initializing insn for
1289 every global access and let CSE remove all redundant copies.
1291 The final instruction sequence will look like the following
1292 for multiple global variable accesses.
1294 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1296 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1297 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1298 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1301 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1302 crtl->uses_pic_offset_table = 1;
1303 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1305 if (mode != GET_MODE (gp_rtx))
1306 gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
1309 if (mode == ptr_mode)
1312 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1314 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1316 mem = XVECEXP (SET_SRC (insn), 0, 0);
1320 gcc_assert (mode == Pmode);
1322 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1323 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1326 /* The operand is expected to be a MEM.  Whenever the related insn
1327 pattern changes, the code above that computes MEM should be updated.  */
1329 gcc_assert (GET_CODE (mem) == MEM);
1330 MEM_READONLY_P (mem) = 1;
1331 MEM_NOTRAP_P (mem) = 1;
1336 case SYMBOL_SMALL_GOT_4G:
1338 /* In ILP32, the mode of dest can be either SImode or DImode,
1339 while the got entry is always of SImode size. The mode of
1340 dest depends on how dest is used: if dest is assigned to a
1341 pointer (e.g. in the memory), it has SImode; it may have
1342 DImode if dest is dereferenced to access the memory.
1343 This is why we have to handle three different ldr_got_small
1344 patterns here (two patterns for ILP32). */
1349 machine_mode mode = GET_MODE (dest);
1351 if (can_create_pseudo_p ())
1352 tmp_reg = gen_reg_rtx (mode);
1354 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1355 if (mode == ptr_mode)
1358 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1360 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1362 mem = XVECEXP (SET_SRC (insn), 0, 0);
1366 gcc_assert (mode == Pmode);
1368 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1369 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1372 gcc_assert (GET_CODE (mem) == MEM);
1373 MEM_READONLY_P (mem) = 1;
1374 MEM_NOTRAP_P (mem) = 1;
1379 case SYMBOL_SMALL_TLSGD:
1382 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
1385 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
1386 insns = get_insns ();
1389 RTL_CONST_CALL_P (insns) = 1;
1390 emit_libcall_block (insns, dest, result, imm);
1394 case SYMBOL_SMALL_TLSDESC:
1396 machine_mode mode = GET_MODE (dest);
1397 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1400 gcc_assert (mode == Pmode || mode == ptr_mode);
1402 /* In ILP32, the got entry is always of SImode size. Unlike
1403 small GOT, the dest is fixed at reg 0. */
1405 emit_insn (gen_tlsdesc_small_si (imm));
1407 emit_insn (gen_tlsdesc_small_di (imm));
1408 tp = aarch64_load_tp (NULL);
1411 tp = gen_lowpart (mode, tp);
1413 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1414 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1418 case SYMBOL_SMALL_TLSIE:
1420 /* In ILP32, the mode of dest can be either SImode or DImode,
1421 while the got entry is always of SImode size. The mode of
1422 dest depends on how dest is used: if dest is assigned to a
1423 pointer (e.g. in the memory), it has SImode; it may have
1424 DImode if dest is dereferenced to access the memory.
1425 This is why we have to handle three different tlsie_small
1426 patterns here (two patterns for ILP32). */
1427 machine_mode mode = GET_MODE (dest);
1428 rtx tmp_reg = gen_reg_rtx (mode);
1429 rtx tp = aarch64_load_tp (NULL);
1431 if (mode == ptr_mode)
1434 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1437 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1438 tp = gen_lowpart (mode, tp);
1443 gcc_assert (mode == Pmode);
1444 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1447 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1448 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1452 case SYMBOL_TLSLE12:
1453 case SYMBOL_TLSLE24:
1454 case SYMBOL_TLSLE32:
1455 case SYMBOL_TLSLE48:
1457 machine_mode mode = GET_MODE (dest);
1458 rtx tp = aarch64_load_tp (NULL);
1461 tp = gen_lowpart (mode, tp);
1465 case SYMBOL_TLSLE12:
1466 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1469 case SYMBOL_TLSLE24:
1470 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1473 case SYMBOL_TLSLE32:
1474 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1476 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1479 case SYMBOL_TLSLE48:
1480 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1482 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1489 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1493 case SYMBOL_TINY_GOT:
1494 emit_insn (gen_ldr_got_tiny (dest, imm));
1497 case SYMBOL_TINY_TLSIE:
1499 machine_mode mode = GET_MODE (dest);
1500 rtx tp = aarch64_load_tp (NULL);
1502 if (mode == ptr_mode)
1505 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1508 tp = gen_lowpart (mode, tp);
1509 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1514 gcc_assert (mode == Pmode);
1515 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1518 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1527 /* Emit a move from SRC to DEST. Assume that the move expanders can
1528 handle all moves if !can_create_pseudo_p (). The distinction is
1529 important because, unlike emit_move_insn, the move expanders know
1530 how to force Pmode objects into the constant pool even when the
1531 constant pool address is not itself legitimate. */
1533 aarch64_emit_move (rtx dest, rtx src)
1535 return (can_create_pseudo_p ()
1536 ? emit_move_insn (dest, src)
1537 : emit_move_insn_1 (dest, src));
1540 /* Split a 128-bit move operation into two 64-bit move operations,
1541 taking care to handle partial overlap of register to register
1542 copies. Special cases are needed when moving between GP regs and
1543 FP regs. SRC can be a register, constant or memory; DST a register
1544 or memory.  If either operand is memory it must not have any side effects.  */
1547 aarch64_split_128bit_move (rtx dst, rtx src)
1552 machine_mode mode = GET_MODE (dst);
1554 gcc_assert (mode == TImode || mode == TFmode);
1555 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1556 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1558 if (REG_P (dst) && REG_P (src))
1560 int src_regno = REGNO (src);
1561 int dst_regno = REGNO (dst);
1563 /* Handle FP <-> GP regs. */
1564 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1566 src_lo = gen_lowpart (word_mode, src);
1567 src_hi = gen_highpart (word_mode, src);
1571 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1572 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1576 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1577 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1581 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1583 dst_lo = gen_lowpart (word_mode, dst);
1584 dst_hi = gen_highpart (word_mode, dst);
1588 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1589 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1593 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1594 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1600 dst_lo = gen_lowpart (word_mode, dst);
1601 dst_hi = gen_highpart (word_mode, dst);
1602 src_lo = gen_lowpart (word_mode, src);
1603 src_hi = gen_highpart_mode (word_mode, mode, src);
1605 /* At most one pairing may overlap. */
1606 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1608 aarch64_emit_move (dst_hi, src_hi);
1609 aarch64_emit_move (dst_lo, src_lo);
1613 aarch64_emit_move (dst_lo, src_lo);
1614 aarch64_emit_move (dst_hi, src_hi);
1619 aarch64_split_128bit_move_p (rtx dst, rtx src)
1621 return (! REG_P (src)
1622 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1625 /* Split a complex SIMD combine. */
1628 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1630 machine_mode src_mode = GET_MODE (src1);
1631 machine_mode dst_mode = GET_MODE (dst);
1633 gcc_assert (VECTOR_MODE_P (dst_mode));
1635 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1637 rtx (*gen) (rtx, rtx, rtx);
1642 gen = gen_aarch64_simd_combinev8qi;
1645 gen = gen_aarch64_simd_combinev4hi;
1648 gen = gen_aarch64_simd_combinev2si;
1651 gen = gen_aarch64_simd_combinev4hf;
1654 gen = gen_aarch64_simd_combinev2sf;
1657 gen = gen_aarch64_simd_combinedi;
1660 gen = gen_aarch64_simd_combinedf;
1666 emit_insn (gen (dst, src1, src2));
1671 /* Split a complex SIMD move. */
1674 aarch64_split_simd_move (rtx dst, rtx src)
1676 machine_mode src_mode = GET_MODE (src);
1677 machine_mode dst_mode = GET_MODE (dst);
1679 gcc_assert (VECTOR_MODE_P (dst_mode));
1681 if (REG_P (dst) && REG_P (src))
1683 rtx (*gen) (rtx, rtx);
1685 gcc_assert (VECTOR_MODE_P (src_mode));
1690 gen = gen_aarch64_split_simd_movv16qi;
1693 gen = gen_aarch64_split_simd_movv8hi;
1696 gen = gen_aarch64_split_simd_movv4si;
1699 gen = gen_aarch64_split_simd_movv2di;
1702 gen = gen_aarch64_split_simd_movv8hf;
1705 gen = gen_aarch64_split_simd_movv4sf;
1708 gen = gen_aarch64_split_simd_movv2df;
1714 emit_insn (gen (dst, src));
1720 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1721 machine_mode ymode, rtx y)
1723 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1724 gcc_assert (r != NULL);
1725 return rtx_equal_p (x, r);
1730 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1732 if (can_create_pseudo_p ())
1733 return force_reg (mode, value);
1736 x = aarch64_emit_move (x, value);
1743 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1745 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1748 /* Load the full offset into a register. This
1749 might be improvable in the future. */
1750 high = GEN_INT (offset);
1752 high = aarch64_force_temporary (mode, temp, high);
1753 reg = aarch64_force_temporary (mode, temp,
1754 gen_rtx_PLUS (mode, high, reg));
1756 return plus_constant (mode, reg, offset);
1760 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1764 unsigned HOST_WIDE_INT val, val2, mask;
1765 int one_match, zero_match;
1770 if (aarch64_move_imm (val, mode))
1773 emit_insn (gen_rtx_SET (dest, imm));
1777 if ((val >> 32) == 0 || mode == SImode)
1781 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1783 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1784 GEN_INT ((val >> 16) & 0xffff)));
1786 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1787 GEN_INT ((val >> 16) & 0xffff)));
1792 /* Remaining cases are all for DImode. */
1795 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1796 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1797 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1798 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1800 if (zero_match != 2 && one_match != 2)
1802 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1803 For a 64-bit bitmask try whether changing 16 bits to all ones or
1804 zeroes creates a valid bitmask. To check any repeated bitmask,
1805 try using 16 bits from the other 32-bit half of val. */
1807 for (i = 0; i < 64; i += 16, mask <<= 16)
1810 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1813 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1815 val2 = val2 & ~mask;
1816 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1817 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1824 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1825 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1826 GEN_INT ((val >> i) & 0xffff)));
1832 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1833 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1834 otherwise skip zero bits. */
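/* Worked example (illustrative): 0x1234cafe0000beef is neither a single
   MOV/MOVN nor a bitmask immediate, and has one all-zero 16-bit chunk, so it
   is built as

	mov	x0, 0xbeef
	movk	x0, 0xcafe, lsl 32
	movk	x0, 0x1234, lsl 48

   with the zero chunk at bits 16-31 skipped by the loop below.  */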
1838 val2 = one_match > zero_match ? ~val : val;
1839 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1842 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1843 ? (val | ~(mask << i))
1844 : (val & (mask << i)))));
1845 for (i += 16; i < 64; i += 16)
1847 if ((val2 & (mask << i)) == 0)
1850 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1851 GEN_INT ((val >> i) & 0xffff)));
1860 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1862 machine_mode mode = GET_MODE (dest);
1864 gcc_assert (mode == SImode || mode == DImode);
1866 /* Check on what type of symbol it is. */
1867 if (GET_CODE (imm) == SYMBOL_REF
1868 || GET_CODE (imm) == LABEL_REF
1869 || GET_CODE (imm) == CONST)
1871 rtx mem, base, offset;
1872 enum aarch64_symbol_type sty;
1874 /* If we have (const (plus symbol offset)), separate out the offset
1875 before we start classifying the symbol. */
1876 split_const (imm, &base, &offset);
1878 sty = aarch64_classify_symbol (base, offset);
1881 case SYMBOL_FORCE_TO_MEM:
1882 if (offset != const0_rtx
1883 && targetm.cannot_force_const_mem (mode, imm))
1885 gcc_assert (can_create_pseudo_p ());
1886 base = aarch64_force_temporary (mode, dest, base);
1887 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1888 aarch64_emit_move (dest, base);
1892 mem = force_const_mem (ptr_mode, imm);
1895 /* If we aren't generating PC relative literals, then
1896 we need to expand the literal pool access carefully.
1897 This is something that needs to be done in a number
1898 of places, so could well live as a separate function. */
1899 if (!aarch64_pcrelative_literal_loads)
1901 gcc_assert (can_create_pseudo_p ());
1902 base = gen_reg_rtx (ptr_mode);
1903 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1904 mem = gen_rtx_MEM (ptr_mode, base);
1907 if (mode != ptr_mode)
1908 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1910 emit_insn (gen_rtx_SET (dest, mem));
1914 case SYMBOL_SMALL_TLSGD:
1915 case SYMBOL_SMALL_TLSDESC:
1916 case SYMBOL_SMALL_TLSIE:
1917 case SYMBOL_SMALL_GOT_28K:
1918 case SYMBOL_SMALL_GOT_4G:
1919 case SYMBOL_TINY_GOT:
1920 case SYMBOL_TINY_TLSIE:
1921 if (offset != const0_rtx)
1923 gcc_assert(can_create_pseudo_p ());
1924 base = aarch64_force_temporary (mode, dest, base);
1925 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1926 aarch64_emit_move (dest, base);
1931 case SYMBOL_SMALL_ABSOLUTE:
1932 case SYMBOL_TINY_ABSOLUTE:
1933 case SYMBOL_TLSLE12:
1934 case SYMBOL_TLSLE24:
1935 case SYMBOL_TLSLE32:
1936 case SYMBOL_TLSLE48:
1937 aarch64_load_symref_appropriately (dest, imm, sty);
1945 if (!CONST_INT_P (imm))
1947 if (GET_CODE (imm) == HIGH)
1948 emit_insn (gen_rtx_SET (dest, imm));
1951 rtx mem = force_const_mem (mode, imm);
1953 emit_insn (gen_rtx_SET (dest, mem));
1959 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1962 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
1963 temporary value if necessary. FRAME_RELATED_P should be true if
1964 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
1965 to the generated instructions. If SCRATCHREG is known to hold
1966 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
1969 Since this function may be used to adjust the stack pointer, we must
1970 ensure that it cannot cause transient stack deallocation (for example
1971 by first incrementing SP and then decrementing when adjusting by a
1972 large immediate). */
1975 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
1976 HOST_WIDE_INT delta, bool frame_related_p,
1979 HOST_WIDE_INT mdelta = abs_hwi (delta);
1980 rtx this_rtx = gen_rtx_REG (mode, regnum);
1986 /* Single instruction adjustment. */
1987 if (aarch64_uimm12_shift (mdelta))
1989 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
1990 RTX_FRAME_RELATED_P (insn) = frame_related_p;
1994 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
1995 Only do this if mdelta is not a 16-bit move, as adjusting using a move is better.  */
1997 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
1999 HOST_WIDE_INT low_off = mdelta & 0xfff;
2001 low_off = delta < 0 ? -low_off : low_off;
2002 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2003 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2004 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2005 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2009 /* Emit a move immediate if required and an addition/subtraction. */
2010 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2012 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2013 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2014 : gen_add2_insn (this_rtx, scratch_rtx));
2015 if (frame_related_p)
2017 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2018 rtx adj = plus_constant (mode, this_rtx, delta);
2019 add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2024 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2025 HOST_WIDE_INT delta)
2027 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2031 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2033 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2034 true, emit_move_imm);
2038 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2040 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2041 frame_related_p, true);
2045 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2046 tree exp ATTRIBUTE_UNUSED)
2048 /* Currently, always true. */
2052 /* Implement TARGET_PASS_BY_REFERENCE. */
2055 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2058 bool named ATTRIBUTE_UNUSED)
2061 machine_mode dummymode;
2064 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2065 size = (mode == BLKmode && type)
2066 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2068 /* Aggregates are passed by reference based on their size. */
2069 if (type && AGGREGATE_TYPE_P (type))
2071 size = int_size_in_bytes (type);
2074 /* Variable sized arguments are always passed by reference.  */
2078 /* Can this be a candidate to be passed in fp/simd register(s)? */
2079 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2084 /* Arguments which are variable sized or larger than 2 registers are
2085 passed by reference unless they are a homogeneous floating-point aggregate.  */
2087 return size > 2 * UNITS_PER_WORD;
2090 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2092 aarch64_return_in_msb (const_tree valtype)
2094 machine_mode dummy_mode;
2097 /* Never happens in little-endian mode. */
2098 if (!BYTES_BIG_ENDIAN)
2101 /* Only composite types smaller than or equal to 16 bytes can
2102 be potentially returned in registers. */
2103 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2104 || int_size_in_bytes (valtype) <= 0
2105 || int_size_in_bytes (valtype) > 16)
2108 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2109 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2110 is always passed/returned in the least significant bits of fp/simd registers.  */
2112 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2113 &dummy_mode, &dummy_int, NULL))
2119 /* Implement TARGET_FUNCTION_VALUE.
2120 Define how to find the value returned by a function. */
2123 aarch64_function_value (const_tree type, const_tree func,
2124 bool outgoing ATTRIBUTE_UNUSED)
2129 machine_mode ag_mode;
2131 mode = TYPE_MODE (type);
2132 if (INTEGRAL_TYPE_P (type))
2133 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2135 if (aarch64_return_in_msb (type))
2137 HOST_WIDE_INT size = int_size_in_bytes (type);
2139 if (size % UNITS_PER_WORD != 0)
2141 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2142 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2146 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2147 &ag_mode, &count, NULL))
2149 if (!aarch64_composite_type_p (type, mode))
2151 gcc_assert (count == 1 && mode == ag_mode);
2152 return gen_rtx_REG (mode, V0_REGNUM);
2159 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2160 for (i = 0; i < count; i++)
2162 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2163 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2164 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2165 XVECEXP (par, 0, i) = tmp;
2171 return gen_rtx_REG (mode, R0_REGNUM);
2174 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2175 Return true if REGNO is the number of a hard register in which the values
2176 of called function may come back. */
2179 aarch64_function_value_regno_p (const unsigned int regno)
2181 /* Maximum of 16 bytes can be returned in the general registers. Examples
2182 of 16-byte return values are: 128-bit integers and 16-byte small
2183 structures (excluding homogeneous floating-point aggregates). */
2184 if (regno == R0_REGNUM || regno == R1_REGNUM)
2187 /* Up to four fp/simd registers can return a function value, e.g. a
2188 homogeneous floating-point aggregate having four members. */
2189 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2190 return TARGET_FLOAT;
2195 /* Implement TARGET_RETURN_IN_MEMORY.
2197 If the type T of the result of a function is such that
     void func (T arg)
2199 would require that arg be passed as a value in a register (or set of
2200 registers) according to the parameter passing rules, then the result
2201 is returned in the same registers as would be used for such an argument.  */
2205 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2208 machine_mode ag_mode;
2211 if (!AGGREGATE_TYPE_P (type)
2212 && TREE_CODE (type) != COMPLEX_TYPE
2213 && TREE_CODE (type) != VECTOR_TYPE)
2214 /* Simple scalar types always returned in registers. */
2217 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2224 /* Types larger than 2 registers returned in memory. */
2225 size = int_size_in_bytes (type);
2226 return (size < 0 || size > 2 * UNITS_PER_WORD);
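/* Illustrative examples: struct { long a, b; } (16 bytes) is returned in
   x0/x1, struct { long a, b, c; } (24 bytes) is returned in memory via the
   x8 result pointer, while struct { double a, b, c, d; } is an HFA and is
   returned in d0-d3 even though it is 32 bytes.  */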
2230 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2231 const_tree type, int *nregs)
2233 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2234 return aarch64_vfp_is_call_or_return_candidate (mode,
2236 &pcum->aapcs_vfp_rmode,
2241 /* Given MODE and TYPE of a function argument, return the alignment in
2242 bits. The idea is to suppress any stronger alignment requested by
2243 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2244 This is a helper function for local use only. */
2247 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2250 return GET_MODE_ALIGNMENT (mode);
2252 if (integer_zerop (TYPE_SIZE (type)))
2255 gcc_assert (TYPE_MODE (type) == mode);
2257 if (!AGGREGATE_TYPE_P (type))
2258 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2260 if (TREE_CODE (type) == ARRAY_TYPE)
2261 return TYPE_ALIGN (TREE_TYPE (type));
2263 unsigned int alignment = 0;
2264 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2265 if (TREE_CODE (field) == FIELD_DECL)
2266 alignment = std::max (alignment, DECL_ALIGN (field));
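/* For instance (illustrative), struct __attribute__ ((aligned (16))) { int i; }
   has a type alignment of 128 bits, but the field walk above yields 32 bits,
   the natural alignment of its only member, so such an argument does not
   trigger the 16-byte rounding rules used in aarch64_layout_arg below.  */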
2271 /* Layout a function argument according to the AAPCS64 rules. The rule
2272 numbers refer to the rule numbers in the AAPCS64. */
2275 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2277 bool named ATTRIBUTE_UNUSED)
2279 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2280 int ncrn, nvrn, nregs;
2281 bool allocate_ncrn, allocate_nvrn;
2284 /* We need to do this once per argument. */
2285 if (pcum->aapcs_arg_processed)
2288 pcum->aapcs_arg_processed = true;
2290 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2292 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2295 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2296 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2301 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2302 The following code thus handles passing by SIMD/FP registers first. */
2304 nvrn = pcum->aapcs_nvrn;
2306 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
2307 and homogeneous short-vector aggregates (HVA).  */
2311 aarch64_err_no_fpadvsimd (mode, "argument");
2313 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2315 pcum->aapcs_nextnvrn = nvrn + nregs;
2316 if (!aarch64_composite_type_p (type, mode))
2318 gcc_assert (nregs == 1);
2319 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2325 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2326 for (i = 0; i < nregs; i++)
2328 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2329 V0_REGNUM + nvrn + i);
2330 tmp = gen_rtx_EXPR_LIST
2332 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2333 XVECEXP (par, 0, i) = tmp;
2335 pcum->aapcs_reg = par;
2341 /* C.3 NSRN is set to 8. */
2342 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2347 ncrn = pcum->aapcs_ncrn;
2348 nregs = size / UNITS_PER_WORD;
2350 /* C6 - C9, though the sign and zero extension semantics are
2351 handled elsewhere.  This is the case where the argument fits
2352 entirely in general registers.  */
2353 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2356 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2358 /* C.8 if the argument has an alignment of 16 then the NGRN is
2359 rounded up to the next even number. */
2362 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2363 comparison is there because for > 16 * BITS_PER_UNIT
2364 alignment nregs should be > 2 and therefore it should be
2365 passed by reference rather than value. */
2366 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2369 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2372 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2373 A reg is still generated for it, but the caller should be smart
2374 enough not to use it. */
2375 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2376 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2382 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2383 for (i = 0; i < nregs; i++)
2385 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2386 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2387 GEN_INT (i * UNITS_PER_WORD));
2388 XVECEXP (par, 0, i) = tmp;
2390 pcum->aapcs_reg = par;
2393 pcum->aapcs_nextncrn = ncrn + nregs;
2398 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2400 /* The argument is passed on the stack; record the needed number of words for
2401 this argument and align the total size if necessary. */
2403 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2405 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2406 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2407 16 / UNITS_PER_WORD);
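/* Worked example (illustrative only): once the general registers are
   exhausted, a 16-byte aligned argument such as an __int128 that ends up
   on the stack has the accumulated aapcs_stack_size rounded up to an even
   number of 8-byte words by the ROUND_UP above, so the argument starts at
   a 16-byte aligned stack offset.  */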
2411 /* Implement TARGET_FUNCTION_ARG. */
2414 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2415 const_tree type, bool named)
2417 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2418 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2420 if (mode == VOIDmode)
2423 aarch64_layout_arg (pcum_v, mode, type, named);
2424 return pcum->aapcs_reg;
2428 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2429 const_tree fntype ATTRIBUTE_UNUSED,
2430 rtx libname ATTRIBUTE_UNUSED,
2431 const_tree fndecl ATTRIBUTE_UNUSED,
2432 unsigned n_named ATTRIBUTE_UNUSED)
2434 pcum->aapcs_ncrn = 0;
2435 pcum->aapcs_nvrn = 0;
2436 pcum->aapcs_nextncrn = 0;
2437 pcum->aapcs_nextnvrn = 0;
2438 pcum->pcs_variant = ARM_PCS_AAPCS64;
2439 pcum->aapcs_reg = NULL_RTX;
2440 pcum->aapcs_arg_processed = false;
2441 pcum->aapcs_stack_words = 0;
2442 pcum->aapcs_stack_size = 0;
2445 && fndecl && TREE_PUBLIC (fndecl)
2446 && fntype && fntype != error_mark_node)
2448 const_tree type = TREE_TYPE (fntype);
2449 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2450 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2451 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2452 &mode, &nregs, NULL))
2453 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2459 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2464 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2465 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2467 aarch64_layout_arg (pcum_v, mode, type, named);
2468 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2469 != (pcum->aapcs_stack_words != 0));
2470 pcum->aapcs_arg_processed = false;
2471 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2472 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2473 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2474 pcum->aapcs_stack_words = 0;
2475 pcum->aapcs_reg = NULL_RTX;
2480 aarch64_function_arg_regno_p (unsigned regno)
2482 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2483 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2486 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2487 PARM_BOUNDARY bits of alignment, but will be given anything up
2488 to STACK_BOUNDARY bits if the type requires it. This makes sure
2489 that both before and after the layout of each argument, the Next
2490 Stacked Argument Address (NSAA) will have a minimum alignment of
2494 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2496 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2497 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
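/* Illustrative values (an assumption about the usual AArch64 settings,
   not taken from this file): with PARM_BOUNDARY of 64 and STACK_BOUNDARY
   of 128, a char or int argument is reported as 64-bit aligned here,
   while a 16-byte aligned type such as __int128 is reported as 128-bit
   aligned; anything stronger is capped at STACK_BOUNDARY.  */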
2500 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2502 Return true if an argument passed on the stack should be padded upwards,
2503 i.e. if the least-significant byte of the stack slot has useful data.
2505 Small aggregate types are placed at the lowest memory address.
2507 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2510 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2512 /* On little-endian targets, the least significant byte of every stack
2513 argument is passed at the lowest byte address of the stack slot. */
2514 if (!BYTES_BIG_ENDIAN)
2517 /* Otherwise, integral, floating-point and pointer types are padded downward:
2518 the least significant byte of a stack argument is passed at the highest
2519 byte address of the stack slot. */
2521 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2522 || POINTER_TYPE_P (type))
2523 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2526 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2530 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2532 It specifies padding for the last (and possibly the only)
2533 element of a block move between registers and memory. Viewing
2534 the block as if it were in memory, padding upward means that
2535 the last element is padded after its most significant byte,
2536 while with downward padding the last element is padded on
2537 its least significant byte side.
2539 Small aggregates and small complex types are always padded
2542 We don't need to worry about homogeneous floating-point or
2543 short-vector aggregates; their move is not affected by the
2544 padding direction determined here. Regardless of endianness,
2545 each element of such an aggregate is put in the least
2546 significant bits of a fp/simd register.
2548 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2549 register has useful data, and return the opposite if the most
2550 significant byte does. */
2553 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2554 bool first ATTRIBUTE_UNUSED)
2557 /* Small composite types are always padded upward. */
2558 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2560 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2561 : GET_MODE_SIZE (mode));
2562 if (size < 2 * UNITS_PER_WORD)
2566 /* Otherwise, use the default padding. */
2567 return !BYTES_BIG_ENDIAN;
2571 aarch64_libgcc_cmp_return_mode (void)
2576 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2578 /* We use the 12-bit shifted immediate arithmetic instructions so values
2579 must be a multiple of (1 << 12), i.e. 4096. */
2580 #define ARITH_FACTOR 4096
2582 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2583 #error Cannot use simple address calculation for stack probing
2586 /* The pair of scratch registers used for stack probing. */
2587 #define PROBE_STACK_FIRST_REG 9
2588 #define PROBE_STACK_SECOND_REG 10
2590 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2591 inclusive. These are offsets from the current stack pointer. */
2594 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2596 rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2598 /* See the same assertion on PROBE_INTERVAL above. */
2599 gcc_assert ((first % ARITH_FACTOR) == 0);
2601 /* See if we have a constant small number of probes to generate. If so,
2602 that's the easy case. */
2603 if (size <= PROBE_INTERVAL)
2605 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2607 emit_set_insn (reg1,
2608 plus_constant (ptr_mode,
2609 stack_pointer_rtx, -(first + base)));
2610 emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2613 /* The run-time loop is made up of 8 insns in the generic case while the
2614 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
2615 else if (size <= 4 * PROBE_INTERVAL)
2617 HOST_WIDE_INT i, rem;
2619 emit_set_insn (reg1,
2620 plus_constant (ptr_mode,
2622 -(first + PROBE_INTERVAL)));
2623 emit_stack_probe (reg1);
2625 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2626 it exceeds SIZE. If only two probes are needed, this will not
2627 generate any code. Then probe at FIRST + SIZE. */
2628 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2630 emit_set_insn (reg1,
2631 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2632 emit_stack_probe (reg1);
2635 rem = size - (i - PROBE_INTERVAL);
2638 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2640 emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2641 emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2644 emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2647 /* Otherwise, do the same as above, but in a loop. Note that we must be
2648 extra careful with variables wrapping around because we might be at
2649 the very top (or the very bottom) of the address space and we have
2650 to be able to handle this case properly; in particular, we use an
2651 equality test for the loop condition. */
2654 rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2656 /* Step 1: round SIZE to the previous multiple of the interval. */
2658 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2661 /* Step 2: compute initial and final value of the loop counter. */
2663 /* TEST_ADDR = SP + FIRST. */
2664 emit_set_insn (reg1,
2665 plus_constant (ptr_mode, stack_pointer_rtx, -first));
2667 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2668 emit_set_insn (reg2,
2669 plus_constant (ptr_mode, stack_pointer_rtx,
2670 -(first + rounded_size)));
2677 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2680 while (TEST_ADDR != LAST_ADDR)
2682 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2683 until it is equal to ROUNDED_SIZE. */
2685 if (ptr_mode == DImode)
2686 emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2688 emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2691 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2692 that SIZE is equal to ROUNDED_SIZE. */
2694 if (size != rounded_size)
2696 HOST_WIDE_INT rem = size - rounded_size;
2700 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2702 emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2703 emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2706 emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2710 /* Make sure nothing is scheduled before we are done. */
2711 emit_insn (gen_blockage ());
2714 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2715 absolute addresses. */
2718 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2720 static int labelno = 0;
2724 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2727 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2729 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2731 xops[1] = GEN_INT (PROBE_INTERVAL);
2732 output_asm_insn ("sub\t%0, %0, %1", xops);
2734 /* Probe at TEST_ADDR. */
2735 output_asm_insn ("str\txzr, [%0]", xops);
2737 /* Test if TEST_ADDR == LAST_ADDR. */
2739 output_asm_insn ("cmp\t%0, %1", xops);
2742 fputs ("\tb.ne\t", asm_out_file);
2743 assemble_name_raw (asm_out_file, loop_lab);
2744 fputc ('\n', asm_out_file);
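/* For reference (illustrative, assuming the default 4 KiB probe interval,
   LP64 and the x9/x10 scratch registers defined above), the emitted loop
   looks roughly like:

	.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9]
	cmp	x9, x10
	b.ne	.LPSRL0

   i.e. one probe per interval until TEST_ADDR reaches LAST_ADDR.  */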
2750 aarch64_frame_pointer_required (void)
2752 /* In aarch64_override_options_after_change
2753 flag_omit_leaf_frame_pointer turns off the frame pointer by
2754 default. Turn it back on now if we've not got a leaf function. */
2756 if (flag_omit_leaf_frame_pointer
2757 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2760 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2761 if (crtl->calls_eh_return)
2767 /* Mark the registers that need to be saved by the callee and calculate
2768 the size of the callee-saved registers area and frame record (both FP
2769 and LR may be omitted). */
2771 aarch64_layout_frame (void)
2773 HOST_WIDE_INT offset = 0;
2774 int regno, last_fp_reg = INVALID_REGNUM;
2776 if (reload_completed && cfun->machine->frame.laid_out)
2779 #define SLOT_NOT_REQUIRED (-2)
2780 #define SLOT_REQUIRED (-1)
2782 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2783 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2785 /* First mark all the registers that really need to be saved... */
2786 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2787 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2789 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2790 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2792 /* ... that includes the eh data registers (if needed)... */
2793 if (crtl->calls_eh_return)
2794 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2795 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2798 /* ... and any callee saved register that dataflow says is live. */
2799 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2800 if (df_regs_ever_live_p (regno)
2801 && (regno == R30_REGNUM
2802 || !call_used_regs[regno]))
2803 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2805 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2806 if (df_regs_ever_live_p (regno)
2807 && !call_used_regs[regno])
2809 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2810 last_fp_reg = regno;
2813 if (frame_pointer_needed)
2815 /* FP and LR are placed in the linkage record. */
2816 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2817 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2818 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2819 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2820 offset += 2 * UNITS_PER_WORD;
2823 /* Now assign stack slots for them. */
2824 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2825 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2827 cfun->machine->frame.reg_offset[regno] = offset;
2828 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2829 cfun->machine->frame.wb_candidate1 = regno;
2830 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2831 cfun->machine->frame.wb_candidate2 = regno;
2832 offset += UNITS_PER_WORD;
2835 HOST_WIDE_INT max_int_offset = offset;
2836 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2837 bool has_align_gap = offset != max_int_offset;
2839 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2840 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2842 /* If there is an alignment gap between integer and fp callee-saves,
2843 allocate the last fp register to it if possible. */
2844 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2846 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2850 cfun->machine->frame.reg_offset[regno] = offset;
2851 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2852 cfun->machine->frame.wb_candidate1 = regno;
2853 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2854 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2855 cfun->machine->frame.wb_candidate2 = regno;
2856 offset += UNITS_PER_WORD;
2859 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2861 cfun->machine->frame.saved_regs_size = offset;
2863 HOST_WIDE_INT varargs_and_saved_regs_size
2864 = offset + cfun->machine->frame.saved_varargs_size;
2866 cfun->machine->frame.hard_fp_offset
2867 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2868 STACK_BOUNDARY / BITS_PER_UNIT);
2870 cfun->machine->frame.frame_size
2871 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2872 + crtl->outgoing_args_size,
2873 STACK_BOUNDARY / BITS_PER_UNIT);
2875 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2877 cfun->machine->frame.initial_adjust = 0;
2878 cfun->machine->frame.final_adjust = 0;
2879 cfun->machine->frame.callee_adjust = 0;
2880 cfun->machine->frame.callee_offset = 0;
2882 HOST_WIDE_INT max_push_offset = 0;
2883 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2884 max_push_offset = 512;
2885 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2886 max_push_offset = 256;
2888 if (cfun->machine->frame.frame_size < max_push_offset
2889 && crtl->outgoing_args_size == 0)
2891 /* Simple, small frame with no outgoing arguments:
2892 stp reg1, reg2, [sp, -frame_size]!
2893 stp reg3, reg4, [sp, 16] */
2894 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2896 else if ((crtl->outgoing_args_size
2897 + cfun->machine->frame.saved_regs_size < 512)
2898 && !(cfun->calls_alloca
2899 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2901 /* Frame with small outgoing arguments:
2902 sub sp, sp, frame_size
2903 stp reg1, reg2, [sp, outgoing_args_size]
2904 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2905 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2906 cfun->machine->frame.callee_offset
2907 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2909 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
2911 /* Frame with large outgoing arguments but a small local area:
2912 stp reg1, reg2, [sp, -hard_fp_offset]!
2913 stp reg3, reg4, [sp, 16]
2914 sub sp, sp, outgoing_args_size */
2915 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
2916 cfun->machine->frame.final_adjust
2917 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2919 else if (!frame_pointer_needed
2920 && varargs_and_saved_regs_size < max_push_offset)
2922 /* Frame with large local area and outgoing arguments (this pushes the
2923 callee-saves first, followed by the locals and outgoing area):
2924 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2925 stp reg3, reg4, [sp, 16]
2926 sub sp, sp, frame_size - varargs_and_saved_regs_size */
2927 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
2928 cfun->machine->frame.final_adjust
2929 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2930 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
2931 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
2935 /* Frame with large local area and outgoing arguments using frame pointer:
2936 sub sp, sp, hard_fp_offset
2937 stp x29, x30, [sp, 0]
2939 stp reg3, reg4, [sp, 16]
2940 sub sp, sp, outgoing_args_size */
2941 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
2942 cfun->machine->frame.final_adjust
2943 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
2946 cfun->machine->frame.laid_out = true;
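/* Worked example (illustrative only): a function with 32 bytes of locals,
   no outgoing arguments, a frame pointer and x19 live gets
   reg_offset[x29] = 0, reg_offset[x30] = 8, reg_offset[x19] = 16,
   saved_regs_size = 32, hard_fp_offset = 64 and frame_size = 64.  Since
   frame_size is below the 512 push limit and there are no outgoing
   arguments, the first strategy above applies (callee_adjust = 64) and
   the prologue is roughly:

	stp	x29, x30, [sp, -64]!
	add	x29, sp, 0
	str	x19, [sp, 16]  */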
2949 /* Return true if the register REGNO is saved on entry to
2950 the current function. */
2953 aarch64_register_saved_on_entry (int regno)
2955 return cfun->machine->frame.reg_offset[regno] >= 0;
2958 /* Return the next register up from REGNO up to LIMIT for the callee to save. */
2962 aarch64_next_callee_save (unsigned regno, unsigned limit)
2964 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2969 /* Push the register number REGNO of mode MODE to the stack with write-back
2970 adjusting the stack by ADJUSTMENT. */
2973 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2974 HOST_WIDE_INT adjustment)
2976 rtx base_rtx = stack_pointer_rtx;
2979 reg = gen_rtx_REG (mode, regno);
2980 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2981 plus_constant (Pmode, base_rtx, -adjustment));
2982 mem = gen_rtx_MEM (mode, mem);
2984 insn = emit_move_insn (mem, reg);
2985 RTX_FRAME_RELATED_P (insn) = 1;
2988 /* Generate and return an instruction to store the pair of registers
2989 REG and REG2 of mode MODE to location BASE with write-back adjusting
2990 the stack location BASE by ADJUSTMENT. */
2993 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2994 HOST_WIDE_INT adjustment)
2999 return gen_storewb_pairdi_di (base, base, reg, reg2,
3000 GEN_INT (-adjustment),
3001 GEN_INT (UNITS_PER_WORD - adjustment));
3003 return gen_storewb_pairdf_di (base, base, reg, reg2,
3004 GEN_INT (-adjustment),
3005 GEN_INT (UNITS_PER_WORD - adjustment));
3011 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3012 stack pointer by ADJUSTMENT. */
3015 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3018 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3020 if (regno2 == INVALID_REGNUM)
3021 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3023 rtx reg1 = gen_rtx_REG (mode, regno1);
3024 rtx reg2 = gen_rtx_REG (mode, regno2);
3026 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3028 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3029 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3030 RTX_FRAME_RELATED_P (insn) = 1;
3033 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
3034 adjusting it by ADJUSTMENT afterwards. */
3037 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3038 HOST_WIDE_INT adjustment)
3043 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3044 GEN_INT (UNITS_PER_WORD));
3046 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3047 GEN_INT (UNITS_PER_WORD));
3053 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3054 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3058 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3061 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3062 rtx reg1 = gen_rtx_REG (mode, regno1);
3064 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3066 if (regno2 == INVALID_REGNUM)
3068 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3069 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3070 emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3074 rtx reg2 = gen_rtx_REG (mode, regno2);
3075 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3076 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3081 /* Generate and return a store pair instruction of mode MODE to store
3082 register REG1 to MEM1 and register REG2 to MEM2. */
3085 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3091 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3094 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3101 /* Generate and return a load pair instruction of mode MODE to load register
3102 REG1 from MEM1 and register REG2 from MEM2. */
3105 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3111 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3114 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3121 /* Emit code to save the callee-saved registers from register number START
3122 to LIMIT to the stack at the location starting at offset START_OFFSET,
3123 skipping any write-back candidates if SKIP_WB is true. */
3126 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3127 unsigned start, unsigned limit, bool skip_wb)
3130 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3131 ? gen_frame_mem : gen_rtx_MEM);
3135 for (regno = aarch64_next_callee_save (start, limit);
3137 regno = aarch64_next_callee_save (regno + 1, limit))
3140 HOST_WIDE_INT offset;
3143 && (regno == cfun->machine->frame.wb_candidate1
3144 || regno == cfun->machine->frame.wb_candidate2))
3147 reg = gen_rtx_REG (mode, regno);
3148 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3149 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3152 regno2 = aarch64_next_callee_save (regno + 1, limit);
3155 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3156 == cfun->machine->frame.reg_offset[regno2]))
3159 rtx reg2 = gen_rtx_REG (mode, regno2);
3162 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3163 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3165 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3168 /* The first part of a frame-related parallel insn is
3169 always assumed to be relevant to the frame
3170 calculations; subsequent parts are only
3171 frame-related if explicitly marked. */
3172 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3176 insn = emit_move_insn (mem, reg);
3178 RTX_FRAME_RELATED_P (insn) = 1;
3182 /* Emit code to restore the callee registers of mode MODE from register
3183 number START up to and including LIMIT. Restore from the stack offset
3184 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3185 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3188 aarch64_restore_callee_saves (machine_mode mode,
3189 HOST_WIDE_INT start_offset, unsigned start,
3190 unsigned limit, bool skip_wb, rtx *cfi_ops)
3192 rtx base_rtx = stack_pointer_rtx;
3193 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3194 ? gen_frame_mem : gen_rtx_MEM);
3197 HOST_WIDE_INT offset;
3199 for (regno = aarch64_next_callee_save (start, limit);
3201 regno = aarch64_next_callee_save (regno + 1, limit))
3206 && (regno == cfun->machine->frame.wb_candidate1
3207 || regno == cfun->machine->frame.wb_candidate2))
3210 reg = gen_rtx_REG (mode, regno);
3211 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3212 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3214 regno2 = aarch64_next_callee_save (regno + 1, limit);
3217 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3218 == cfun->machine->frame.reg_offset[regno2]))
3220 rtx reg2 = gen_rtx_REG (mode, regno2);
3223 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3224 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3225 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3227 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3231 emit_move_insn (reg, mem);
3232 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3236 /* AArch64 stack frames generated by this compiler look like:
3238 +-------------------------------+
3240 | incoming stack arguments |
3242 +-------------------------------+
3243 | | <-- incoming stack pointer (aligned)
3244 | callee-allocated save area |
3245 | for register varargs |
3247 +-------------------------------+
3248 | local variables | <-- frame_pointer_rtx
3250 +-------------------------------+
3252 +-------------------------------+ |
3253 | callee-saved registers | | frame.saved_regs_size
3254 +-------------------------------+ |
3256 +-------------------------------+ |
3257 | FP' | / <- hard_frame_pointer_rtx (aligned)
3258 +-------------------------------+
3259 | dynamic allocation |
3260 +-------------------------------+
3262 +-------------------------------+
3263 | outgoing stack arguments | <-- arg_pointer
3265 +-------------------------------+
3266 | | <-- stack_pointer_rtx (aligned)
3268 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3269 but leave frame_pointer_rtx and hard_frame_pointer_rtx unchanged. */
3272 /* Generate the prologue instructions for entry into a function.
3273 Establish the stack frame by decreasing the stack pointer with a
3274 properly calculated size and, if necessary, create a frame record
3275 filled with the values of LR and previous frame pointer. The
3276 current FP is also set up if it is in use. */
3279 aarch64_expand_prologue (void)
3281 aarch64_layout_frame ();
3283 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3284 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3285 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3286 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3287 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3288 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3289 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3292 if (flag_stack_usage_info)
3293 current_function_static_stack_size = frame_size;
3295 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3297 if (crtl->is_leaf && !cfun->calls_alloca)
3299 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3300 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3301 frame_size - STACK_CHECK_PROTECT);
3303 else if (frame_size > 0)
3304 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3307 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3309 if (callee_adjust != 0)
3310 aarch64_push_regs (reg1, reg2, callee_adjust);
3312 if (frame_pointer_needed)
3314 if (callee_adjust == 0)
3315 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3317 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3319 GEN_INT (callee_offset)));
3320 RTX_FRAME_RELATED_P (insn) = 1;
3321 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3324 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3325 callee_adjust != 0 || frame_pointer_needed);
3326 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3327 callee_adjust != 0 || frame_pointer_needed);
3328 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3331 /* Return TRUE if we can use a simple_return insn.
3333 This function checks whether the callee-saved stack area is empty, which
3334 means no restore actions are needed. The pro_and_epilogue pass will use
3335 this to check whether the shrink-wrapping optimization is feasible. */
3338 aarch64_use_return_insn_p (void)
3340 if (!reload_completed)
3346 aarch64_layout_frame ();
3348 return cfun->machine->frame.frame_size == 0;
3351 /* Generate the epilogue instructions for returning from a function.
3352 This is almost exactly the reverse of the prologue sequence, except
3353 that we need to insert barriers to avoid scheduling loads that read
3354 from a deallocated stack, and we optimize the unwind records by
3355 emitting them all together if possible. */
3357 aarch64_expand_epilogue (bool for_sibcall)
3359 aarch64_layout_frame ();
3361 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3362 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3363 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3364 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3365 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3366 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3370 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3371 bool need_barrier_p = (get_frame_size ()
3372 + cfun->machine->frame.saved_varargs_size) != 0;
3374 /* Emit a barrier to prevent loads from a deallocated stack. */
3375 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3376 || crtl->calls_eh_return)
3378 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3379 need_barrier_p = false;
3382 /* Restore the stack pointer from the frame pointer if it may not
3383 be the same as the stack pointer. */
3384 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3386 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3387 hard_frame_pointer_rtx,
3388 GEN_INT (-callee_offset)));
3389 /* If writeback is used when restoring callee-saves, the CFA
3390 is restored on the instruction doing the writeback. */
3391 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3394 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3396 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3397 callee_adjust != 0, &cfi_ops);
3398 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3399 callee_adjust != 0, &cfi_ops);
3402 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3404 if (callee_adjust != 0)
3405 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3407 if (callee_adjust != 0 || initial_adjust > 65536)
3409 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3410 insn = get_last_insn ();
3411 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3412 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3413 RTX_FRAME_RELATED_P (insn) = 1;
3417 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3421 /* Emit delayed restores and reset the CFA to be SP. */
3422 insn = get_last_insn ();
3423 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3424 REG_NOTES (insn) = cfi_ops;
3425 RTX_FRAME_RELATED_P (insn) = 1;
3428 /* Stack adjustment for exception handler. */
3429 if (crtl->calls_eh_return)
3431 /* We need to unwind the stack by the offset computed by
3432 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3433 to be SP; letting the CFA move during this adjustment
3434 is just as correct as retaining the CFA from the body
3435 of the function. Therefore, do nothing special. */
3436 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3439 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3441 emit_jump_insn (ret_rtx);
3444 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3445 normally or return to a previous frame after unwinding.
3447 An EH return uses a single shared return sequence. The epilogue is
3448 exactly like a normal epilogue except that it has an extra input
3449 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3450 that must be applied after the frame has been destroyed. An extra label
3451 is inserted before the epilogue which initializes this register to zero,
3452 and this is the entry point for a normal return.
3454 An actual EH return updates the return address, initializes the stack
3455 adjustment and jumps directly into the epilogue (bypassing the zeroing
3456 of the adjustment). Since the return address is typically saved on the
3457 stack when a function makes a call, the saved LR must be updated outside the epilogue.
3460 This poses problems as the store is generated well before the epilogue,
3461 so the offset of LR is not known yet. Also optimizations will remove the
3462 store as it appears dead, even after the epilogue is generated (as the
3463 base or offset for loading LR is different in many cases).
3465 To avoid these problems, this implementation forces the frame pointer
3466 in eh_return functions so that the location of LR is fixed and known early.
3467 It also marks the store volatile, so no optimization is permitted to
3468 remove the store. */
3470 aarch64_eh_return_handler_rtx (void)
3472 rtx tmp = gen_frame_mem (Pmode,
3473 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3475 /* Mark the store volatile, so no optimization is permitted to remove it. */
3476 MEM_VOLATILE_P (tmp) = true;
3480 /* Output code to add DELTA to the first argument, and then jump
3481 to FUNCTION. Used for C++ multiple inheritance. */
3483 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3484 HOST_WIDE_INT delta,
3485 HOST_WIDE_INT vcall_offset,
3488 /* The this pointer is always in x0. Note that this differs from
3489 Arm where the this pointer may be bumped to r1 if r0 is required
3490 to return a pointer to an aggregate. On AArch64 a result value
3491 pointer will be in x8. */
3492 int this_regno = R0_REGNUM;
3493 rtx this_rtx, temp0, temp1, addr, funexp;
3496 reload_completed = 1;
3497 emit_note (NOTE_INSN_PROLOGUE_END);
3499 if (vcall_offset == 0)
3500 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3503 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3505 this_rtx = gen_rtx_REG (Pmode, this_regno);
3506 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3507 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3512 if (delta >= -256 && delta < 256)
3513 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3514 plus_constant (Pmode, this_rtx, delta));
3516 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3519 if (Pmode == ptr_mode)
3520 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3522 aarch64_emit_move (temp0,
3523 gen_rtx_ZERO_EXTEND (Pmode,
3524 gen_rtx_MEM (ptr_mode, addr)));
3526 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3527 addr = plus_constant (Pmode, temp0, vcall_offset);
3530 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3532 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3535 if (Pmode == ptr_mode)
3536 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3538 aarch64_emit_move (temp1,
3539 gen_rtx_SIGN_EXTEND (Pmode,
3540 gen_rtx_MEM (ptr_mode, addr)));
3542 emit_insn (gen_add2_insn (this_rtx, temp1));
3545 /* Generate a tail call to the target function. */
3546 if (!TREE_USED (function))
3548 assemble_external (function);
3549 TREE_USED (function) = 1;
3551 funexp = XEXP (DECL_RTL (function), 0);
3552 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3553 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3554 SIBLING_CALL_P (insn) = 1;
3556 insn = get_insns ();
3557 shorten_branches (insn);
3558 final_start_function (insn, file, 1);
3559 final (insn, file, 1);
3560 final_end_function ();
3562 /* Stop pretending to be a post-reload pass. */
3563 reload_completed = 0;
3567 aarch64_tls_referenced_p (rtx x)
3569 if (!TARGET_HAVE_TLS)
3571 subrtx_iterator::array_type array;
3572 FOR_EACH_SUBRTX (iter, array, x, ALL)
3574 const_rtx x = *iter;
3575 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3577 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3578 TLS offsets, not real symbol references. */
3579 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3580 iter.skip_subrtxes ();
3586 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3587 a left shift of 0 or 12 bits. */
3589 aarch64_uimm12_shift (HOST_WIDE_INT val)
3591 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3592 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
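/* Illustrative values (not from the original sources): 0xabc and 0xabc000
   both pass this test (the latter is 0xabc shifted left by 12), whereas
   0xabc00 fails because its set bits straddle the two 12-bit fields.  */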
3597 /* Return true if val is an immediate that can be loaded into a
3598 register by a MOVZ instruction. */
3600 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3602 if (GET_MODE_SIZE (mode) > 4)
3604 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3605 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3610 /* Ignore sign extension. */
3611 val &= (HOST_WIDE_INT) 0xffffffff;
3613 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3614 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3617 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3619 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3621 0x0000000100000001ull,
3622 0x0001000100010001ull,
3623 0x0101010101010101ull,
3624 0x1111111111111111ull,
3625 0x5555555555555555ull,
3629 /* Return true if val is a valid bitmask immediate. */
3632 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3634 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3637 /* Check for a single sequence of one bits and return quickly if so.
3638 The special cases of all ones and all zeroes return false. */
3639 val = (unsigned HOST_WIDE_INT) val_in;
3640 tmp = val + (val & -val);
3642 if (tmp == (tmp & -tmp))
3643 return (val + 1) > 1;
3645 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3647 val = (val << 32) | (val & 0xffffffff);
3649 /* Invert if the immediate doesn't start with a zero bit - this means we
3650 only need to search for sequences of one bits. */
3654 /* Find the first set bit and set tmp to val with the first sequence of one
3655 bits removed. Return success if there is a single sequence of ones. */
3656 first_one = val & -val;
3657 tmp = val & (val + first_one);
3662 /* Find the next set bit and compute the difference in bit position. */
3663 next_one = tmp & -tmp;
3664 bits = clz_hwi (first_one) - clz_hwi (next_one);
3667 /* Check the bit position difference is a power of 2, and that the first
3668 sequence of one bits fits within 'bits' bits. */
3669 if ((mask >> bits) != 0 || bits != (bits & -bits))
3672 /* Check the sequence of one bits is repeated 64/bits times. */
3673 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
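/* Worked example (illustrative only): for val = 0x00ff00ff00ff00ff the low
   bit is set, so the value is inverted to 0xff00ff00ff00ff00; the first
   run of ones is 8 bits wide, the runs repeat every 16 bits, and
   multiplying the run by bitmask_imm_mul[1] (0x0001000100010001)
   reproduces the inverted value, so the function returns true.  A value
   such as 0x00ff00ff00ff00fe is rejected because it is not a repeating
   run-of-ones pattern.  */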
3677 /* Return true if val is an immediate that can be loaded into a
3678 register in a single instruction. */
3680 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3682 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3684 return aarch64_bitmask_imm (val, mode);
3688 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3692 if (GET_CODE (x) == HIGH)
3695 split_const (x, &base, &offset);
3696 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3698 if (aarch64_classify_symbol (base, offset)
3699 != SYMBOL_FORCE_TO_MEM)
3702 /* Avoid generating a 64-bit relocation in ILP32; leave it
3703 to aarch64_expand_mov_immediate to handle it properly. */
3704 return mode != ptr_mode;
3707 return aarch64_tls_referenced_p (x);
3710 /* Implement TARGET_CASE_VALUES_THRESHOLD.
3711 The expansion for a table switch is quite expensive due to the number
3712 of instructions, the table lookup and the hard-to-predict indirect jump.
3713 When optimizing for speed with -O3 enabled, use the per-core tuning if
3714 set, otherwise use tables for > 16 cases as a tradeoff between size and
3715 performance. When optimizing for size, use the default setting. */
3718 aarch64_case_values_threshold (void)
3720 /* Use the specified limit for the number of cases before using jump
3721 tables at higher optimization levels. */
3723 && selected_cpu->tune->max_case_values != 0)
3724 return selected_cpu->tune->max_case_values;
3726 return optimize_size ? default_case_values_threshold () : 17;
3729 /* Return true if register REGNO is a valid index register.
3730 STRICT_P is true if REG_OK_STRICT is in effect. */
3733 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3735 if (!HARD_REGISTER_NUM_P (regno))
3743 regno = reg_renumber[regno];
3745 return GP_REGNUM_P (regno);
3748 /* Return true if register REGNO is a valid base register for mode MODE.
3749 STRICT_P is true if REG_OK_STRICT is in effect. */
3752 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3754 if (!HARD_REGISTER_NUM_P (regno))
3762 regno = reg_renumber[regno];
3765 /* The fake registers will be eliminated to either the stack or
3766 hard frame pointer, both of which are usually valid base registers.
3767 Reload deals with the cases where the eliminated form isn't valid. */
3768 return (GP_REGNUM_P (regno)
3769 || regno == SP_REGNUM
3770 || regno == FRAME_POINTER_REGNUM
3771 || regno == ARG_POINTER_REGNUM);
3774 /* Return true if X is a valid base register for mode MODE.
3775 STRICT_P is true if REG_OK_STRICT is in effect. */
3778 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3780 if (!strict_p && GET_CODE (x) == SUBREG)
3783 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3786 /* Return true if address offset is a valid index. If it is, fill in INFO
3787 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3790 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3791 machine_mode mode, bool strict_p)
3793 enum aarch64_address_type type;
3798 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3799 && GET_MODE (x) == Pmode)
3801 type = ADDRESS_REG_REG;
3805 /* (sign_extend:DI (reg:SI)) */
3806 else if ((GET_CODE (x) == SIGN_EXTEND
3807 || GET_CODE (x) == ZERO_EXTEND)
3808 && GET_MODE (x) == DImode
3809 && GET_MODE (XEXP (x, 0)) == SImode)
3811 type = (GET_CODE (x) == SIGN_EXTEND)
3812 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3813 index = XEXP (x, 0);
3816 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3817 else if (GET_CODE (x) == MULT
3818 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3819 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3820 && GET_MODE (XEXP (x, 0)) == DImode
3821 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3822 && CONST_INT_P (XEXP (x, 1)))
3824 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3825 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3826 index = XEXP (XEXP (x, 0), 0);
3827 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3829 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3830 else if (GET_CODE (x) == ASHIFT
3831 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3832 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3833 && GET_MODE (XEXP (x, 0)) == DImode
3834 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3835 && CONST_INT_P (XEXP (x, 1)))
3837 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3838 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3839 index = XEXP (XEXP (x, 0), 0);
3840 shift = INTVAL (XEXP (x, 1));
3842 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3843 else if ((GET_CODE (x) == SIGN_EXTRACT
3844 || GET_CODE (x) == ZERO_EXTRACT)
3845 && GET_MODE (x) == DImode
3846 && GET_CODE (XEXP (x, 0)) == MULT
3847 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3848 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3850 type = (GET_CODE (x) == SIGN_EXTRACT)
3851 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3852 index = XEXP (XEXP (x, 0), 0);
3853 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3854 if (INTVAL (XEXP (x, 1)) != 32 + shift
3855 || INTVAL (XEXP (x, 2)) != 0)
3858 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3859 (const_int 0xffffffff<<shift)) */
3860 else if (GET_CODE (x) == AND
3861 && GET_MODE (x) == DImode
3862 && GET_CODE (XEXP (x, 0)) == MULT
3863 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3864 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3865 && CONST_INT_P (XEXP (x, 1)))
3867 type = ADDRESS_REG_UXTW;
3868 index = XEXP (XEXP (x, 0), 0);
3869 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3870 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3873 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3874 else if ((GET_CODE (x) == SIGN_EXTRACT
3875 || GET_CODE (x) == ZERO_EXTRACT)
3876 && GET_MODE (x) == DImode
3877 && GET_CODE (XEXP (x, 0)) == ASHIFT
3878 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3879 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3881 type = (GET_CODE (x) == SIGN_EXTRACT)
3882 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3883 index = XEXP (XEXP (x, 0), 0);
3884 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3885 if (INTVAL (XEXP (x, 1)) != 32 + shift
3886 || INTVAL (XEXP (x, 2)) != 0)
3889 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3890 (const_int 0xffffffff<<shift)) */
3891 else if (GET_CODE (x) == AND
3892 && GET_MODE (x) == DImode
3893 && GET_CODE (XEXP (x, 0)) == ASHIFT
3894 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3895 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3896 && CONST_INT_P (XEXP (x, 1)))
3898 type = ADDRESS_REG_UXTW;
3899 index = XEXP (XEXP (x, 0), 0);
3900 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3901 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3904 /* (mult:P (reg:P) (const_int scale)) */
3905 else if (GET_CODE (x) == MULT
3906 && GET_MODE (x) == Pmode
3907 && GET_MODE (XEXP (x, 0)) == Pmode
3908 && CONST_INT_P (XEXP (x, 1)))
3910 type = ADDRESS_REG_REG;
3911 index = XEXP (x, 0);
3912 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3914 /* (ashift:P (reg:P) (const_int shift)) */
3915 else if (GET_CODE (x) == ASHIFT
3916 && GET_MODE (x) == Pmode
3917 && GET_MODE (XEXP (x, 0)) == Pmode
3918 && CONST_INT_P (XEXP (x, 1)))
3920 type = ADDRESS_REG_REG;
3921 index = XEXP (x, 0);
3922 shift = INTVAL (XEXP (x, 1));
3927 if (GET_CODE (index) == SUBREG)
3928 index = SUBREG_REG (index);
3931 (shift > 0 && shift <= 3
3932 && (1 << shift) == GET_MODE_SIZE (mode)))
3934 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3937 info->offset = index;
3938 info->shift = shift;
3946 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3948 return (offset >= -64 * GET_MODE_SIZE (mode)
3949 && offset < 64 * GET_MODE_SIZE (mode)
3950 && offset % GET_MODE_SIZE (mode) == 0);
3954 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3955 HOST_WIDE_INT offset)
3957 return offset >= -256 && offset < 256;
3961 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3964 && offset < 4096 * GET_MODE_SIZE (mode)
3965 && offset % GET_MODE_SIZE (mode) == 0);
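/* Illustrative ranges for DImode accesses (derived from the checks above,
   not stated in the original sources): the 7-bit signed scaled form
   covers -512..504 in steps of 8, the 9-bit signed unscaled form covers
   -256..255, and the 12-bit unsigned scaled form covers 0..32760 in
   steps of 8.  */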
3968 /* Return true if MODE is one of the modes for which we
3969 support LDP/STP operations. */
3972 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
3974 return mode == SImode || mode == DImode
3975 || mode == SFmode || mode == DFmode
3976 || (aarch64_vector_mode_supported_p (mode)
3977 && GET_MODE_SIZE (mode) == 8);
3980 /* Return true if REGNO is a virtual pointer register, or an eliminable
3981 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
3982 include stack_pointer or hard_frame_pointer. */
3984 virt_or_elim_regno_p (unsigned regno)
3986 return ((regno >= FIRST_VIRTUAL_REGISTER
3987 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
3988 || regno == FRAME_POINTER_REGNUM
3989 || regno == ARG_POINTER_REGNUM);
3992 /* Return true if X is a valid address for machine mode MODE. If it is,
3993 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3994 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3997 aarch64_classify_address (struct aarch64_address_info *info,
3998 rtx x, machine_mode mode,
3999 RTX_CODE outer_code, bool strict_p)
4001 enum rtx_code code = GET_CODE (x);
4004 /* On BE, we use load/store pair for all large int mode load/stores. */
4005 bool load_store_pair_p = (outer_code == PARALLEL
4006 || (BYTES_BIG_ENDIAN
4007 && aarch64_vect_struct_mode_p (mode)));
4009 bool allow_reg_index_p =
4011 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4012 && !aarch64_vect_struct_mode_p (mode);
4014 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4016 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4017 && (code != POST_INC && code != REG))
4024 info->type = ADDRESS_REG_IMM;
4026 info->offset = const0_rtx;
4027 return aarch64_base_register_rtx_p (x, strict_p);
4035 && virt_or_elim_regno_p (REGNO (op0))
4036 && CONST_INT_P (op1))
4038 info->type = ADDRESS_REG_IMM;
4045 if (GET_MODE_SIZE (mode) != 0
4046 && CONST_INT_P (op1)
4047 && aarch64_base_register_rtx_p (op0, strict_p))
4049 HOST_WIDE_INT offset = INTVAL (op1);
4051 info->type = ADDRESS_REG_IMM;
4055 /* TImode and TFmode values are allowed in both pairs of X
4056 registers and individual Q registers. The available addressing modes are:
4058 X,X: 7-bit signed scaled offset
4059 Q: 9-bit signed offset
4060 We conservatively require an offset representable in either mode.
4061 When performing the check for pairs of X registers i.e. LDP/STP
4062 pass down DImode since that is the natural size of the LDP/STP
4063 instruction memory accesses. */
4064 if (mode == TImode || mode == TFmode)
4065 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4066 && offset_9bit_signed_unscaled_p (mode, offset));
4068 /* A 7-bit offset check because OImode will emit an ldp/stp
4069 instruction (only big endian will get here).
4070 For ldp/stp instructions, the offset is scaled for the size of a
4071 single element of the pair. */
4073 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4075 /* Three 9/12-bit offset checks because CImode will emit three
4076 ldr/str instructions (only big endian will get here). */
4078 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4079 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4080 || offset_12bit_unsigned_scaled_p (V16QImode,
4083 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4084 instructions (only big endian will get here). */
4086 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4087 && aarch64_offset_7bit_signed_scaled_p (TImode,
4090 if (load_store_pair_p)
4091 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4092 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4094 return (offset_9bit_signed_unscaled_p (mode, offset)
4095 || offset_12bit_unsigned_scaled_p (mode, offset));
4098 if (allow_reg_index_p)
4100 /* Look for base + (scaled/extended) index register. */
4101 if (aarch64_base_register_rtx_p (op0, strict_p)
4102 && aarch64_classify_index (info, op1, mode, strict_p))
4107 if (aarch64_base_register_rtx_p (op1, strict_p)
4108 && aarch64_classify_index (info, op0, mode, strict_p))
4121 info->type = ADDRESS_REG_WB;
4122 info->base = XEXP (x, 0);
4123 info->offset = NULL_RTX;
4124 return aarch64_base_register_rtx_p (info->base, strict_p);
4128 info->type = ADDRESS_REG_WB;
4129 info->base = XEXP (x, 0);
4130 if (GET_CODE (XEXP (x, 1)) == PLUS
4131 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4132 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4133 && aarch64_base_register_rtx_p (info->base, strict_p))
4135 HOST_WIDE_INT offset;
4136 info->offset = XEXP (XEXP (x, 1), 1);
4137 offset = INTVAL (info->offset);
4139 /* TImode and TFmode values are allowed in both pairs of X
4140 registers and individual Q registers. The available addressing modes are:
4142 X,X: 7-bit signed scaled offset
4143 Q: 9-bit signed offset
4144 We conservatively require an offset representable in either mode.
4146 if (mode == TImode || mode == TFmode)
4147 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4148 && offset_9bit_signed_unscaled_p (mode, offset));
4150 if (load_store_pair_p)
4151 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4152 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4154 return offset_9bit_signed_unscaled_p (mode, offset);
4161 /* load literal: pc-relative constant pool entry. Only supported
4162 for SI mode or larger. */
4163 info->type = ADDRESS_SYMBOLIC;
4165 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4169 split_const (x, &sym, &addend);
4170 return ((GET_CODE (sym) == LABEL_REF
4171 || (GET_CODE (sym) == SYMBOL_REF
4172 && CONSTANT_POOL_ADDRESS_P (sym)
4173 && aarch64_pcrelative_literal_loads)));
4178 info->type = ADDRESS_LO_SUM;
4179 info->base = XEXP (x, 0);
4180 info->offset = XEXP (x, 1);
4181 if (allow_reg_index_p
4182 && aarch64_base_register_rtx_p (info->base, strict_p))
4185 split_const (info->offset, &sym, &offs);
4186 if (GET_CODE (sym) == SYMBOL_REF
4187 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4189 /* The symbol and offset must be aligned to the access size. */
4191 unsigned int ref_size;
4193 if (CONSTANT_POOL_ADDRESS_P (sym))
4194 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4195 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4197 tree exp = SYMBOL_REF_DECL (sym);
4198 align = TYPE_ALIGN (TREE_TYPE (exp));
4199 align = CONSTANT_ALIGNMENT (exp, align);
4201 else if (SYMBOL_REF_DECL (sym))
4202 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4203 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4204 && SYMBOL_REF_BLOCK (sym) != NULL)
4205 align = SYMBOL_REF_BLOCK (sym)->alignment;
4207 align = BITS_PER_UNIT;
4209 ref_size = GET_MODE_SIZE (mode);
4211 ref_size = GET_MODE_SIZE (DImode);
4213 return ((INTVAL (offs) & (ref_size - 1)) == 0
4214 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
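/* For illustration (an assumption about typical code, not taken from the
   original sources): a "ldr x1, [x0, #:lo12:sym]" style access is only
   accepted here when the symbol's alignment and the added offset are both
   multiples of the access size, so a DImode load from a merely 4-byte
   aligned symbol is rejected and must be addressed another way.  */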
4225 aarch64_symbolic_address_p (rtx x)
4229 split_const (x, &x, &offset);
4230 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4233 /* Classify the base of symbolic expression X. */
4235 enum aarch64_symbol_type
4236 aarch64_classify_symbolic_expression (rtx x)
4240 split_const (x, &x, &offset);
4241 return aarch64_classify_symbol (x, offset);
4245 /* Return TRUE if X is a legitimate address for accessing memory in
4248 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4250 struct aarch64_address_info addr;
4252 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4255 /* Return TRUE if X is a legitimate address for accessing memory in
4256 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4259 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4260 RTX_CODE outer_code, bool strict_p)
4262 struct aarch64_address_info addr;
4264 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4267 /* Split an out-of-range address displacement into a base and offset.
4268 Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4269 to increase opportunities for sharing the base address of different sizes.
4270 For TI/TFmode and unaligned accesses use a 256-byte range. */
4272 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4274 HOST_WIDE_INT mask = GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3fff;
4276 if (mode == TImode || mode == TFmode
4277 || (INTVAL (*disp) & (GET_MODE_SIZE (mode) - 1)) != 0)
4280 *off = GEN_INT (INTVAL (*disp) & ~mask);
4281 *disp = GEN_INT (INTVAL (*disp) & mask);
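/* Worked example (illustrative only): for an aligned DImode access at
   displacement 0x12340 the mask above is 0x3fff, so *off becomes 0x10000
   and *disp becomes 0x2340; the residual 0x2340 then fits the scaled
   12-bit addressing form.  */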
4285 /* Return TRUE if rtx X is immediate constant 0.0 */
4287 aarch64_float_const_zero_rtx_p (rtx x)
4289 if (GET_MODE (x) == VOIDmode)
4292 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4293 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4294 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4297 /* Return the fixed registers used for condition codes. */
4300 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4303 *p2 = INVALID_REGNUM;
4307 /* Emit call insn with PAT and do aarch64-specific handling. */
4310 aarch64_emit_call_insn (rtx pat)
4312 rtx insn = emit_call_insn (pat);
4314 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4315 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4316 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4320 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4322 /* All floating point compares return CCFP if it is an equality
4323 comparison, and CCFPE otherwise. */
4324 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4351 /* Equality comparisons of short modes against zero can be performed
4352 using the TST instruction with the appropriate bitmask. */
4353 if (y == const0_rtx && REG_P (x)
4354 && (code == EQ || code == NE)
4355 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4358 /* Similarly, comparisons of zero_extends from shorter modes can
4359 be performed using an ANDS with an immediate mask. */
4360 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4361 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4362 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4363 && (code == EQ || code == NE))
4366 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4368 && (code == EQ || code == NE || code == LT || code == GE)
4369 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4370 || GET_CODE (x) == NEG
4371 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4372 && CONST_INT_P (XEXP (x, 2)))))
4375 /* A compare with a shifted operand. Because of canonicalization,
4376 the comparison will have to be swapped when we emit the assembly
4378 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4379 && (REG_P (y) || GET_CODE (y) == SUBREG)
4380 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4381 || GET_CODE (x) == LSHIFTRT
4382 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4385 /* Similarly for a negated operand, but we can only do this for
4387 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4388 && (REG_P (y) || GET_CODE (y) == SUBREG)
4389 && (code == EQ || code == NE)
4390 && GET_CODE (x) == NEG)
4393 /* A test for unsigned overflow. */
4394 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4396 && GET_CODE (x) == PLUS
4397 && GET_CODE (y) == ZERO_EXTEND)
4400 /* For everything else, return CCmode. */
4405 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4408 aarch64_get_condition_code (rtx x)
4410 machine_mode mode = GET_MODE (XEXP (x, 0));
4411 enum rtx_code comp_code = GET_CODE (x);
4413 if (GET_MODE_CLASS (mode) != MODE_CC)
4414 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4415 return aarch64_get_condition_code_1 (mode, comp_code);
4419 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4427 case GE: return AARCH64_GE;
4428 case GT: return AARCH64_GT;
4429 case LE: return AARCH64_LS;
4430 case LT: return AARCH64_MI;
4431 case NE: return AARCH64_NE;
4432 case EQ: return AARCH64_EQ;
4433 case ORDERED: return AARCH64_VC;
4434 case UNORDERED: return AARCH64_VS;
4435 case UNLT: return AARCH64_LT;
4436 case UNLE: return AARCH64_LE;
4437 case UNGT: return AARCH64_HI;
4438 case UNGE: return AARCH64_PL;
4446 case NE: return AARCH64_NE;
4447 case EQ: return AARCH64_EQ;
4448 case GE: return AARCH64_GE;
4449 case GT: return AARCH64_GT;
4450 case LE: return AARCH64_LE;
4451 case LT: return AARCH64_LT;
4452 case GEU: return AARCH64_CS;
4453 case GTU: return AARCH64_HI;
4454 case LEU: return AARCH64_LS;
4455 case LTU: return AARCH64_CC;
4463 case NE: return AARCH64_NE;
4464 case EQ: return AARCH64_EQ;
4465 case GE: return AARCH64_LE;
4466 case GT: return AARCH64_LT;
4467 case LE: return AARCH64_GE;
4468 case LT: return AARCH64_GT;
4469 case GEU: return AARCH64_LS;
4470 case GTU: return AARCH64_CC;
4471 case LEU: return AARCH64_CS;
4472 case LTU: return AARCH64_HI;
4480 case NE: return AARCH64_NE;
4481 case EQ: return AARCH64_EQ;
4482 case GE: return AARCH64_PL;
4483 case LT: return AARCH64_MI;
4491 case NE: return AARCH64_NE;
4492 case EQ: return AARCH64_EQ;
4500 case NE: return AARCH64_CS;
4501 case EQ: return AARCH64_CC;
4515 aarch64_const_vec_all_same_in_range_p (rtx x,
4516 HOST_WIDE_INT minval,
4517 HOST_WIDE_INT maxval)
4519 HOST_WIDE_INT firstval;
4522 if (GET_CODE (x) != CONST_VECTOR
4523 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4526 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4527 if (firstval < minval || firstval > maxval)
4530 count = CONST_VECTOR_NUNITS (x);
4531 for (i = 1; i < count; i++)
4532 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4539 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4541 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4546 #define AARCH64_CC_V 1
4547 #define AARCH64_CC_C (1 << 1)
4548 #define AARCH64_CC_Z (1 << 2)
4549 #define AARCH64_CC_N (1 << 3)
4551 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
4552 static const int aarch64_nzcv_codes[] =
4554 0, /* EQ, Z == 1. */
4555 AARCH64_CC_Z, /* NE, Z == 0. */
4556 0, /* CS, C == 1. */
4557 AARCH64_CC_C, /* CC, C == 0. */
4558 0, /* MI, N == 1. */
4559 AARCH64_CC_N, /* PL, N == 0. */
4560 0, /* VS, V == 1. */
4561 AARCH64_CC_V, /* VC, V == 0. */
4562 0, /* HI, C == 1 && Z == 0. */
4563 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4564 AARCH64_CC_V, /* GE, N == V. */
4565 0, /* LT, N != V. */
4566 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4567 0, /* LE, !(Z == 0 && N == V). */
4573 aarch64_print_operand (FILE *f, rtx x, int code)
4577 /* An integer or symbol address without a preceding # sign. */
4579 switch (GET_CODE (x))
4582 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4586 output_addr_const (f, x);
4590 if (GET_CODE (XEXP (x, 0)) == PLUS
4591 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4593 output_addr_const (f, x);
4599 output_operand_lossage ("Unsupported operand for code '%c'", code);
4604 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4608 if (!CONST_INT_P (x)
4609 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4611 output_operand_lossage ("invalid operand for '%%%c'", code);
4627 output_operand_lossage ("invalid operand for '%%%c'", code);
4637 /* Print N such that 2^N == X. */
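/* For example (illustrative): X == (const_int 64) prints "6".  */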
4638 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4640 output_operand_lossage ("invalid operand for '%%%c'", code);
4644 asm_fprintf (f, "%d", n);
4649 /* Print the number of non-zero bits in X (a const_int). */
4650 if (!CONST_INT_P (x))
4652 output_operand_lossage ("invalid operand for '%%%c'", code);
4656 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4660 /* Print the higher numbered register of a pair (TImode) of regs. */
4661 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4663 output_operand_lossage ("invalid operand for '%%%c'", code);
4667 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4674 /* Print a condition (eq, ne, etc) or its inverse. */
4676 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
4677 if (x == const_true_rtx)
4684 if (!COMPARISON_P (x))
4686 output_operand_lossage ("invalid operand for '%%%c'", code);
4690 cond_code = aarch64_get_condition_code (x);
4691 gcc_assert (cond_code >= 0);
4693 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
4694 fputs (aarch64_condition_codes[cond_code], f);
4703 /* Print a scalar FP/SIMD register name. */
4704 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4706 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4709 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4716 /* Print the first FP/SIMD register name in a list. */
4717 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4719 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4722 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4726 /* Print a scalar FP/SIMD register name + 1. */
4727 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4729 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4732 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4736 /* Print bottom 16 bits of integer constant in hex. */
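/* For example (illustrative): (const_int 0x12345678) prints "0x5678".  */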
4737 if (!CONST_INT_P (x))
4739 output_operand_lossage ("invalid operand for '%%%c'", code);
4742 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4747 /* Print a general register name or the zero register (32-bit or
4750 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4752 asm_fprintf (f, "%czr", code);
4756 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4758 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4762 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4764 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4771 /* Print a normal operand; if it's a general register, then we
4775 output_operand_lossage ("missing operand");
4779 switch (GET_CODE (x))
4782 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4786 output_address (GET_MODE (x), XEXP (x, 0));
4792 output_addr_const (asm_out_file, x);
4796 asm_fprintf (f, "%wd", INTVAL (x));
4800 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4803 aarch64_const_vec_all_same_in_range_p (x,
4805 HOST_WIDE_INT_MAX));
4806 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4808 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4817 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
4818 be getting CONST_DOUBLEs holding integers. */
4819 gcc_assert (GET_MODE (x) != VOIDmode);
4820 if (aarch64_float_const_zero_rtx_p (x))
4825 else if (aarch64_float_const_representable_p (x))
4828 char float_buf[buf_size] = {'\0'};
4829 real_to_decimal_for_mode (float_buf,
4830 CONST_DOUBLE_REAL_VALUE (x),
4833 asm_fprintf (asm_out_file, "%s", float_buf);
4837 output_operand_lossage ("invalid constant");
4840 output_operand_lossage ("invalid operand");
4846 if (GET_CODE (x) == HIGH)
4849 switch (aarch64_classify_symbolic_expression (x))
4851 case SYMBOL_SMALL_GOT_4G:
4852 asm_fprintf (asm_out_file, ":got:");
4855 case SYMBOL_SMALL_TLSGD:
4856 asm_fprintf (asm_out_file, ":tlsgd:");
4859 case SYMBOL_SMALL_TLSDESC:
4860 asm_fprintf (asm_out_file, ":tlsdesc:");
4863 case SYMBOL_SMALL_TLSIE:
4864 asm_fprintf (asm_out_file, ":gottprel:");
4867 case SYMBOL_TLSLE24:
4868 asm_fprintf (asm_out_file, ":tprel:");
4871 case SYMBOL_TINY_GOT:
4878 output_addr_const (asm_out_file, x);
4882 switch (aarch64_classify_symbolic_expression (x))
4884 case SYMBOL_SMALL_GOT_4G:
4885 asm_fprintf (asm_out_file, ":lo12:");
4888 case SYMBOL_SMALL_TLSGD:
4889 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4892 case SYMBOL_SMALL_TLSDESC:
4893 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4896 case SYMBOL_SMALL_TLSIE:
4897 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4900 case SYMBOL_TLSLE12:
4901 asm_fprintf (asm_out_file, ":tprel_lo12:");
4904 case SYMBOL_TLSLE24:
4905 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4908 case SYMBOL_TINY_GOT:
4909 asm_fprintf (asm_out_file, ":got:");
4912 case SYMBOL_TINY_TLSIE:
4913 asm_fprintf (asm_out_file, ":gottprel:");
4919 output_addr_const (asm_out_file, x);
4924 switch (aarch64_classify_symbolic_expression (x))
4926 case SYMBOL_TLSLE24:
4927 asm_fprintf (asm_out_file, ":tprel_hi12:");
4932 output_addr_const (asm_out_file, x);
4937 HOST_WIDE_INT cond_code;
4940 if (!CONST_INT_P (x))
4942 output_operand_lossage ("invalid operand for '%%%c'", code);
4946 cond_code = INTVAL (x);
4947 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
4948 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
4953 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4959 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
4961 struct aarch64_address_info addr;
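  /* Illustrative output (annotation, not in the original source), based on
     the asm_fprintf formats below: "[x0]", "[x0, 16]", "[x0, x1, lsl 3]",
     "[x0, w1, uxtw 2]", "[x0, 16]!", "[x0], 16", "[x0, #:lo12:sym]".  */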
4963 if (aarch64_classify_address (&addr, x, mode, MEM, true))
4966 case ADDRESS_REG_IMM:
4967 if (addr.offset == const0_rtx)
4968 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4970 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4971 INTVAL (addr.offset));
4974 case ADDRESS_REG_REG:
4975 if (addr.shift == 0)
4976 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4977 reg_names [REGNO (addr.offset)]);
4979 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4980 reg_names [REGNO (addr.offset)], addr.shift);
4983 case ADDRESS_REG_UXTW:
4984 if (addr.shift == 0)
4985 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4986 REGNO (addr.offset) - R0_REGNUM);
4988 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4989 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4992 case ADDRESS_REG_SXTW:
4993 if (addr.shift == 0)
4994 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4995 REGNO (addr.offset) - R0_REGNUM);
4997 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4998 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5001 case ADDRESS_REG_WB:
5002 switch (GET_CODE (x))
5005 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5006 GET_MODE_SIZE (mode));
5009 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5010 GET_MODE_SIZE (mode));
5013 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5014 GET_MODE_SIZE (mode));
5017 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5018 GET_MODE_SIZE (mode));
5021 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5022 INTVAL (addr.offset));
5025 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5026 INTVAL (addr.offset));
5033 case ADDRESS_LO_SUM:
5034 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5035 output_addr_const (f, addr.offset);
5036 asm_fprintf (f, "]");
5039 case ADDRESS_SYMBOLIC:
5043 output_addr_const (f, x);
5047 aarch64_label_mentioned_p (rtx x)
5052 if (GET_CODE (x) == LABEL_REF)
5055 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5056 referencing instruction, but they are constant offsets, not
5058 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5061 fmt = GET_RTX_FORMAT (GET_CODE (x));
5062 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5068 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5069 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5072 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5079 /* Implement REGNO_REG_CLASS. */
5082 aarch64_regno_regclass (unsigned regno)
5084 if (GP_REGNUM_P (regno))
5085 return GENERAL_REGS;
5087 if (regno == SP_REGNUM)
5090 if (regno == FRAME_POINTER_REGNUM
5091 || regno == ARG_POINTER_REGNUM)
5092 return POINTER_REGS;
5094 if (FP_REGNUM_P (regno))
5095 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5101 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5103 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5104 where mask is selected by alignment and size of the offset.
5105 We try to pick as large a range for the offset as possible to
5106 maximize the chance of a CSE. However, for aligned addresses
5107 we limit the range to 4k so that structures with different sized
5108 elements are likely to use the same base. We need to be careful
5109 not to split a CONST for some forms of address expression, otherwise
5110 it will generate sub-optimal code. */
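/* Worked example (illustrative annotation): a DImode access at X + 0x13458
   (aligned) is split as base_offset = 0x13458 & ~0xfff = 0x13000 plus a
   residual offset of 0x458, so the expensive add of 0x13000 can be shared
   by neighbouring accesses.  */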
5112 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5114 rtx base = XEXP (x, 0);
5115 rtx offset_rtx = XEXP (x, 1);
5116 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5118 if (GET_CODE (base) == PLUS)
5120 rtx op0 = XEXP (base, 0);
5121 rtx op1 = XEXP (base, 1);
5123 /* Force any scaling into a temp for CSE. */
5124 op0 = force_reg (Pmode, op0);
5125 op1 = force_reg (Pmode, op1);
5127 /* Let the pointer register be in op0. */
5128 if (REG_POINTER (op1))
5129 std::swap (op0, op1);
5131 /* If the pointer is virtual or frame related, then we know that
5132 virtual register instantiation or register elimination is going
5133 to apply a second constant. We want the two constants folded
5134 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5135 if (virt_or_elim_regno_p (REGNO (op0)))
5137 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5138 NULL_RTX, true, OPTAB_DIRECT);
5139 return gen_rtx_PLUS (Pmode, base, op1);
5142 /* Otherwise, in order to encourage CSE (and thence loop strength
5143 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5144 base = expand_binop (Pmode, add_optab, op0, op1,
5145 NULL_RTX, true, OPTAB_DIRECT);
5146 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5149 /* Does it look like we'll need a load/store-pair operation? */
5150 HOST_WIDE_INT base_offset;
5151 if (GET_MODE_SIZE (mode) > 16
5153 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
5154 & ~((128 * GET_MODE_SIZE (mode)) - 1));
5155 /* For offsets that aren't a multiple of the access size, the limit is
5157 else if (offset & (GET_MODE_SIZE (mode) - 1))
5158 base_offset = (offset + 0x100) & ~0x1ff;
5160 base_offset = offset & ~0xfff;
5162 if (base_offset != 0)
5164 base = plus_constant (Pmode, base, base_offset);
5165 base = force_operand (base, NULL_RTX);
5166 return plus_constant (Pmode, base, offset - base_offset);
5173 /* Return the reload icode required for a constant pool in mode. */
5174 static enum insn_code
5175 aarch64_constant_pool_reload_icode (machine_mode mode)
5180 return CODE_FOR_aarch64_reload_movcpsfdi;
5183 return CODE_FOR_aarch64_reload_movcpdfdi;
5186 return CODE_FOR_aarch64_reload_movcptfdi;
5189 return CODE_FOR_aarch64_reload_movcpv8qidi;
5192 return CODE_FOR_aarch64_reload_movcpv16qidi;
5195 return CODE_FOR_aarch64_reload_movcpv4hidi;
5198 return CODE_FOR_aarch64_reload_movcpv8hidi;
5201 return CODE_FOR_aarch64_reload_movcpv2sidi;
5204 return CODE_FOR_aarch64_reload_movcpv4sidi;
5207 return CODE_FOR_aarch64_reload_movcpv2didi;
5210 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5219 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5222 secondary_reload_info *sri)
5225 /* If we have to disable direct literal pool loads and stores because the
5226 function is too big, then we need a scratch register. */
5227 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5228 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5229 || targetm.vector_mode_supported_p (GET_MODE (x)))
5230 && !aarch64_pcrelative_literal_loads)
5232 sri->icode = aarch64_constant_pool_reload_icode (mode);
5236 /* Without the TARGET_SIMD instructions we cannot move a Q register
5237 to a Q register directly. We need a scratch. */
5238 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5239 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5240 && reg_class_subset_p (rclass, FP_REGS))
5243 sri->icode = CODE_FOR_aarch64_reload_movtf;
5244 else if (mode == TImode)
5245 sri->icode = CODE_FOR_aarch64_reload_movti;
5249 /* A TFmode or TImode memory access should be handled via an FP_REGS
5250 because AArch64 has richer addressing modes for LDR/STR instructions
5251 than LDP/STP instructions. */
5252 if (TARGET_FLOAT && rclass == GENERAL_REGS
5253 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5256 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
5257 return GENERAL_REGS;
5263 aarch64_can_eliminate (const int from, const int to)
5265 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5266 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5268 if (frame_pointer_needed)
5270 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5272 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5274 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5275 && !cfun->calls_alloca)
5277 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5284 /* If we decided that we didn't need a leaf frame pointer but then used
5285 LR in the function, then we'll want a frame pointer after all, so
5286 prevent this elimination to ensure a frame pointer is used. */
5287 if (to == STACK_POINTER_REGNUM
5288 && flag_omit_leaf_frame_pointer
5289 && df_regs_ever_live_p (LR_REGNUM))
5297 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5299 aarch64_layout_frame ();
5301 if (to == HARD_FRAME_POINTER_REGNUM)
5303 if (from == ARG_POINTER_REGNUM)
5304 return cfun->machine->frame.hard_fp_offset;
5306 if (from == FRAME_POINTER_REGNUM)
5307 return cfun->machine->frame.hard_fp_offset
5308 - cfun->machine->frame.locals_offset;
5311 if (to == STACK_POINTER_REGNUM)
5313 if (from == FRAME_POINTER_REGNUM)
5314 return cfun->machine->frame.frame_size
5315 - cfun->machine->frame.locals_offset;
5318 return cfun->machine->frame.frame_size;
5321 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5325 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5329 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5334 aarch64_asm_trampoline_template (FILE *f)
5338 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5339 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5343 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5344 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5346 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5347 assemble_aligned_integer (4, const0_rtx);
5348 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5349 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
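/* Rough layout of the emitted LP64 trampoline (illustrative annotation;
   assumes IP1 is x17 and the static chain register is x18):
     0:  ldr  x17, .+16	  // target address
     4:  ldr  x18, .+20	  // static chain value
     8:  br   x17
     12: .word 0	  // padding
     16: <target address> // filled in by aarch64_trampoline_init
     24: <static chain>	  // filled in by aarch64_trampoline_init
   aarch64_trampoline_init below writes the two data slots at offsets
   tramp_code_sz (16) and tramp_code_sz + POINTER_BYTES.  */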
5353 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5355 rtx fnaddr, mem, a_tramp;
5356 const int tramp_code_sz = 16;
5358 /* Don't need to copy the trailing D-words; we fill those in below. */
5359 emit_block_move (m_tramp, assemble_trampoline_template (),
5360 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5361 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5362 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5363 if (GET_MODE (fnaddr) != ptr_mode)
5364 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5365 emit_move_insn (mem, fnaddr);
5367 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5368 emit_move_insn (mem, chain_value);
5370 /* XXX We should really define a "clear_cache" pattern and use
5371 gen_clear_cache(). */
5372 a_tramp = XEXP (m_tramp, 0);
5373 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5374 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5375 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5379 static unsigned char
5380 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5384 case CALLER_SAVE_REGS:
5391 aarch64_vector_mode_p (mode)
5392 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5393 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5407 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5409 if (regclass == POINTER_REGS)
5410 return GENERAL_REGS;
5412 if (regclass == STACK_REG)
5415 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5421 /* If it's an integer immediate that MOVI can't handle, then
5422 FP_REGS is not an option, so we return NO_REGS instead. */
5423 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5424 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5427 /* Register elimination can result in a request for
5428 SP+constant->FP_REGS. We cannot support such operations which
5429 use SP as source and an FP_REG as destination, so reject out
5431 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5433 rtx lhs = XEXP (x, 0);
5435 /* Look through a possible SUBREG introduced by ILP32. */
5436 if (GET_CODE (lhs) == SUBREG)
5437 lhs = SUBREG_REG (lhs);
5439 gcc_assert (REG_P (lhs));
5440 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5449 aarch64_asm_output_labelref (FILE* f, const char *name)
5451 asm_fprintf (f, "%U%s", name);
5455 aarch64_elf_asm_constructor (rtx symbol, int priority)
5457 if (priority == DEFAULT_INIT_PRIORITY)
5458 default_ctor_section_asm_out_constructor (symbol, priority);
5462 /* Priority is known to be in the range [0, 65535], so 18 bytes
5463 would be enough, but the compiler might not know that. To avoid a
5464 -Wformat-truncation false positive, use a larger size. */
5466 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5467 s = get_section (buf, SECTION_WRITE, NULL);
5468 switch_to_section (s);
5469 assemble_align (POINTER_SIZE);
5470 assemble_aligned_integer (POINTER_BYTES, symbol);
5475 aarch64_elf_asm_destructor (rtx symbol, int priority)
5477 if (priority == DEFAULT_INIT_PRIORITY)
5478 default_dtor_section_asm_out_destructor (symbol, priority);
5482 /* Priority is known to be in the range [0, 65535], so 18 bytes
5483 would be enough, but the compiler might not know that. To avoid a
5484 -Wformat-truncation false positive, use a larger size. */
5486 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5487 s = get_section (buf, SECTION_WRITE, NULL);
5488 switch_to_section (s);
5489 assemble_align (POINTER_SIZE);
5490 assemble_aligned_integer (POINTER_BYTES, symbol);
5495 aarch64_output_casesi (rtx *operands)
5499 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5501 static const char *const patterns[4][2] =
5504 "ldrb\t%w3, [%0,%w1,uxtw]",
5505 "add\t%3, %4, %w3, sxtb #2"
5508 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5509 "add\t%3, %4, %w3, sxth #2"
5512 "ldr\t%w3, [%0,%w1,uxtw #2]",
5513 "add\t%3, %4, %w3, sxtw #2"
5515 /* We assume that DImode is only generated when not optimizing and
5516 that we don't really need 64-bit address offsets. That would
5517 imply an object file with 8GB of code in a single function! */
5519 "ldr\t%w3, [%0,%w1,uxtw #2]",
5520 "add\t%3, %4, %w3, sxtw #2"
5524 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5526 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5528 gcc_assert (index >= 0 && index <= 3);
5530 /* Need to implement table size reduction, by changing the code below. */
5531 output_asm_insn (patterns[index][0], operands);
5532 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5533 snprintf (buf, sizeof (buf),
5534 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5535 output_asm_insn (buf, operands);
5536 output_asm_insn (patterns[index][1], operands);
5537 output_asm_insn ("br\t%3", operands);
5538 assemble_label (asm_out_file, label);
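/* Example of the emitted sequence for a halfword dispatch table
   (illustrative annotation; register operands and label name are
   schematic):
     ldrh  w3, [x0,w1,uxtw #1]
     adr   x4, .Lrtx<N>
     add   x3, x4, w3, sxth #2
     br    x3
   .Lrtx<N>:  */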
5543 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5544 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5548 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5550 if (shift >= 0 && shift <= 3)
5553 for (size = 8; size <= 32; size *= 2)
5555 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5556 if (mask == bits << shift)
5563 /* Constant pools are per-function only when PC-relative literal
5564 loads are enabled or we are in the large memory model. */
5568 aarch64_can_use_per_function_literal_pools_p (void)
5570 return (aarch64_pcrelative_literal_loads
5571 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5575 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5577 /* FIXME: In an ideal world this would work similarly
5578 to the logic in aarch64_select_rtx_section, but this
5579 breaks bootstrap in gcc go. For now we work around
5580 this by returning false here. */
5584 /* Select appropriate section for constants depending
5585 on where we place literal pools. */
5588 aarch64_select_rtx_section (machine_mode mode,
5590 unsigned HOST_WIDE_INT align)
5592 if (aarch64_can_use_per_function_literal_pools_p ())
5593 return function_section (current_function_decl);
5595 return default_elf_select_rtx_section (mode, x, align);
5598 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
5600 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
5601 HOST_WIDE_INT offset)
5603 /* When using per-function literal pools, we must ensure that any code
5604 section is aligned to the minimal instruction length, lest we get
5605 errors from the assembler re "unaligned instructions". */
5606 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
5607 ASM_OUTPUT_ALIGN (f, 2);
5612 /* Helper function for rtx cost calculation. Strip a shift expression
5613 from X. Returns the inner operand if successful, or the original
5614 expression on failure. */
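/* For example (illustrative): both (ashift:DI (reg:DI x1) (const_int 3))
   and (mult:DI (reg:DI x1) (const_int 8)) strip to (reg:DI x1).  */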
5616 aarch64_strip_shift (rtx x)
5620 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5621 we can convert both to ROR during final output. */
5622 if ((GET_CODE (op) == ASHIFT
5623 || GET_CODE (op) == ASHIFTRT
5624 || GET_CODE (op) == LSHIFTRT
5625 || GET_CODE (op) == ROTATERT
5626 || GET_CODE (op) == ROTATE)
5627 && CONST_INT_P (XEXP (op, 1)))
5628 return XEXP (op, 0);
5630 if (GET_CODE (op) == MULT
5631 && CONST_INT_P (XEXP (op, 1))
5632 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5633 return XEXP (op, 0);
5638 /* Helper function for rtx cost calculation. Strip an extend
5639 expression from X. Returns the inner operand if successful, or the
5640 original expression on failure. We deal with a number of possible
5641 canonicalization variations here. */
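/* For example (illustrative): (zero_extend:DI (reg:SI w1)) strips to
   (reg:SI w1).  */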
5643 aarch64_strip_extend (rtx x)
5647 /* Zero and sign extraction of a widened value. */
5648 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5649 && XEXP (op, 2) == const0_rtx
5650 && GET_CODE (XEXP (op, 0)) == MULT
5651 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5653 return XEXP (XEXP (op, 0), 0);
5655 /* It can also be represented (for zero-extend) as an AND with an
5657 if (GET_CODE (op) == AND
5658 && GET_CODE (XEXP (op, 0)) == MULT
5659 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5660 && CONST_INT_P (XEXP (op, 1))
5661 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5662 INTVAL (XEXP (op, 1))) != 0)
5663 return XEXP (XEXP (op, 0), 0);
5665 /* Now handle extended register, as this may also have an optional
5666 left shift by 1..4. */
5667 if (GET_CODE (op) == ASHIFT
5668 && CONST_INT_P (XEXP (op, 1))
5669 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5672 if (GET_CODE (op) == ZERO_EXTEND
5673 || GET_CODE (op) == SIGN_EXTEND)
5682 /* Return true iff CODE is a shift supported in combination
5683 with arithmetic instructions. */
5686 aarch64_shift_p (enum rtx_code code)
5688 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5691 /* Helper function for rtx cost calculation. Calculate the cost of
5692 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5693 Return the calculated cost of the expression, recursing manually in to
5694 operands where needed. */
5697 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5700 const struct cpu_cost_table *extra_cost
5701 = aarch64_tune_params.insn_extra_cost;
5703 bool compound_p = (outer == PLUS || outer == MINUS);
5704 machine_mode mode = GET_MODE (x);
5706 gcc_checking_assert (code == MULT);
5711 if (VECTOR_MODE_P (mode))
5712 mode = GET_MODE_INNER (mode);
5714 /* Integer multiply/fma. */
5715 if (GET_MODE_CLASS (mode) == MODE_INT)
5717 /* The multiply will be canonicalized as a shift, cost it as such. */
5718 if (aarch64_shift_p (GET_CODE (x))
5719 || (CONST_INT_P (op1)
5720 && exact_log2 (INTVAL (op1)) > 0))
5722 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5723 || GET_CODE (op0) == SIGN_EXTEND;
5729 /* ARITH + shift-by-register. */
5730 cost += extra_cost->alu.arith_shift_reg;
5732 /* ARITH + extended register. We don't have a cost field
5733 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5734 cost += extra_cost->alu.extend_arith;
5736 /* ARITH + shift-by-immediate. */
5737 cost += extra_cost->alu.arith_shift;
5740 /* LSL (immediate). */
5741 cost += extra_cost->alu.shift;
5744 /* Strip extends as we will have costed them in the case above. */
5746 op0 = aarch64_strip_extend (op0);
5748 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
5753 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5754 compound and let the cases below handle it. After all, MNEG is a
5755 special-case alias of MSUB. */
5756 if (GET_CODE (op0) == NEG)
5758 op0 = XEXP (op0, 0);
5762 /* Integer multiplies or FMAs have zero/sign extending variants. */
5763 if ((GET_CODE (op0) == ZERO_EXTEND
5764 && GET_CODE (op1) == ZERO_EXTEND)
5765 || (GET_CODE (op0) == SIGN_EXTEND
5766 && GET_CODE (op1) == SIGN_EXTEND))
5768 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
5769 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
5774 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5775 cost += extra_cost->mult[0].extend_add;
5777 /* MUL/SMULL/UMULL. */
5778 cost += extra_cost->mult[0].extend;
5784 /* This is either an integer multiply or a MADD. In both cases
5785 we want to recurse and cost the operands. */
5786 cost += rtx_cost (op0, mode, MULT, 0, speed);
5787 cost += rtx_cost (op1, mode, MULT, 1, speed);
5793 cost += extra_cost->mult[mode == DImode].add;
5796 cost += extra_cost->mult[mode == DImode].simple;
5805 /* Floating-point FMA/FMUL can also support negations of the
5806 operands, unless the rounding mode is upward or downward in
5807 which case FNMUL is different from FMUL with operand negation. */
5808 bool neg0 = GET_CODE (op0) == NEG;
5809 bool neg1 = GET_CODE (op1) == NEG;
5810 if (compound_p || !flag_rounding_math || (neg0 && neg1))
5813 op0 = XEXP (op0, 0);
5815 op1 = XEXP (op1, 0);
5819 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5820 cost += extra_cost->fp[mode == DFmode].fma;
5823 cost += extra_cost->fp[mode == DFmode].mult;
5826 cost += rtx_cost (op0, mode, MULT, 0, speed);
5827 cost += rtx_cost (op1, mode, MULT, 1, speed);
5833 aarch64_address_cost (rtx x,
5835 addr_space_t as ATTRIBUTE_UNUSED,
5838 enum rtx_code c = GET_CODE (x);
5839 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5840 struct aarch64_address_info info;
5844 if (!aarch64_classify_address (&info, x, mode, c, false))
5846 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5848 /* This is a CONST or SYMBOL ref which will be split
5849 in a different way depending on the code model in use.
5850 Cost it through the generic infrastructure. */
5851 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
5852 /* Divide through by the cost of one instruction to
5853 bring it to the same units as the address costs. */
5854 cost_symbol_ref /= COSTS_N_INSNS (1);
5855 /* The cost is then the cost of preparing the address,
5856 followed by an immediate (possibly 0) offset. */
5857 return cost_symbol_ref + addr_cost->imm_offset;
5861 /* This is most likely a jump table from a case
5863 return addr_cost->register_offset;
5869 case ADDRESS_LO_SUM:
5870 case ADDRESS_SYMBOLIC:
5871 case ADDRESS_REG_IMM:
5872 cost += addr_cost->imm_offset;
5875 case ADDRESS_REG_WB:
5876 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5877 cost += addr_cost->pre_modify;
5878 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5879 cost += addr_cost->post_modify;
5885 case ADDRESS_REG_REG:
5886 cost += addr_cost->register_offset;
5889 case ADDRESS_REG_SXTW:
5890 cost += addr_cost->register_sextend;
5893 case ADDRESS_REG_UXTW:
5894 cost += addr_cost->register_zextend;
5904 /* For the sake of calculating the cost of the shifted register
5905 component, we can treat same sized modes in the same way. */
5906 switch (GET_MODE_BITSIZE (mode))
5909 cost += addr_cost->addr_scale_costs.hi;
5913 cost += addr_cost->addr_scale_costs.si;
5917 cost += addr_cost->addr_scale_costs.di;
5920 /* We can't tell, or this is a 128-bit vector. */
5922 cost += addr_cost->addr_scale_costs.ti;
5930 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5931 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5935 aarch64_branch_cost (bool speed_p, bool predictable_p)
5937 /* When optimizing for speed, use the cost of unpredictable branches. */
5938 const struct cpu_branch_cost *branch_costs =
5939 aarch64_tune_params.branch_costs;
5941 if (!speed_p || predictable_p)
5942 return branch_costs->predictable;
5944 return branch_costs->unpredictable;
5947 /* Return true if the RTX X in mode MODE is a zero or sign extract
5948 usable in an ADD or SUB (extended register) instruction. */
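/* For example (illustrative): (zero_extend:DI (reg:SI w1)) is accepted,
   matching an extended-register form such as "add x0, x2, w1, uxtw".  */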
5950 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5952 /* Catch add with a sign extract.
5953 This is add_<optab><mode>_multp2. */
5954 if (GET_CODE (x) == SIGN_EXTRACT
5955 || GET_CODE (x) == ZERO_EXTRACT)
5957 rtx op0 = XEXP (x, 0);
5958 rtx op1 = XEXP (x, 1);
5959 rtx op2 = XEXP (x, 2);
5961 if (GET_CODE (op0) == MULT
5962 && CONST_INT_P (op1)
5963 && op2 == const0_rtx
5964 && CONST_INT_P (XEXP (op0, 1))
5965 && aarch64_is_extend_from_extract (mode,
5972 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
5974 else if (GET_CODE (x) == SIGN_EXTEND
5975 || GET_CODE (x) == ZERO_EXTEND)
5976 return REG_P (XEXP (x, 0));
5982 aarch64_frint_unspec_p (unsigned int u)
6000 /* Return true iff X is an rtx that will match an extr instruction
6001 i.e. as described in the *extr<mode>5_insn family of patterns.
6002 OP0 and OP1 will be set to the operands of the shifts involved
6003 on success and will be NULL_RTX otherwise. */
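/* For example (illustrative): in DImode,
     (ior:DI (ashift:DI (reg:DI x1) (const_int 48))
	     (lshiftrt:DI (reg:DI x2) (const_int 16)))
   matches because 48 + 16 == 64; *RES_OP0 is set to x1 and *RES_OP1
   to x2.  */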
6006 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6009 machine_mode mode = GET_MODE (x);
6011 *res_op0 = NULL_RTX;
6012 *res_op1 = NULL_RTX;
6014 if (GET_CODE (x) != IOR)
6020 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6021 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6023 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6024 if (GET_CODE (op1) == ASHIFT)
6025 std::swap (op0, op1);
6027 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6030 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6031 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6033 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6034 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6036 *res_op0 = XEXP (op0, 0);
6037 *res_op1 = XEXP (op1, 0);
6045 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6046 storing it in *COST. Result is true if the total cost of the operation
6047 has now been calculated. */
6049 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6053 enum rtx_code cmpcode;
6055 if (COMPARISON_P (op0))
6057 inner = XEXP (op0, 0);
6058 comparator = XEXP (op0, 1);
6059 cmpcode = GET_CODE (op0);
6064 comparator = const0_rtx;
6068 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6070 /* Conditional branch. */
6071 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6075 if (cmpcode == NE || cmpcode == EQ)
6077 if (comparator == const0_rtx)
6079 /* TBZ/TBNZ/CBZ/CBNZ. */
6080 if (GET_CODE (inner) == ZERO_EXTRACT)
6082 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6083 ZERO_EXTRACT, 0, speed);
6086 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6091 else if (cmpcode == LT || cmpcode == GE)
6094 if (comparator == const0_rtx)
6099 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6102 if (GET_CODE (op1) == COMPARE)
6104 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6105 if (XEXP (op1, 1) == const0_rtx)
6109 machine_mode mode = GET_MODE (XEXP (op1, 0));
6110 const struct cpu_cost_table *extra_cost
6111 = aarch64_tune_params.insn_extra_cost;
6113 if (GET_MODE_CLASS (mode) == MODE_INT)
6114 *cost += extra_cost->alu.arith;
6116 *cost += extra_cost->fp[mode == DFmode].compare;
6121 /* It's a conditional operation based on the status flags,
6122 so it must be some flavor of CSEL. */
6124 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6125 if (GET_CODE (op1) == NEG
6126 || GET_CODE (op1) == NOT
6127 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6128 op1 = XEXP (op1, 0);
6129 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6131 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6132 op1 = XEXP (op1, 0);
6133 op2 = XEXP (op2, 0);
6136 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6137 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6141 /* We don't know what this is, cost all operands. */
6145 /* Check whether X is a bitfield operation of the form shift + extend that
6146 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6147 operand to which the bitfield operation is applied. Otherwise return
6151 aarch64_extend_bitfield_pattern_p (rtx x)
6153 rtx_code outer_code = GET_CODE (x);
6154 machine_mode outer_mode = GET_MODE (x);
6156 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6157 && outer_mode != SImode && outer_mode != DImode)
6160 rtx inner = XEXP (x, 0);
6161 rtx_code inner_code = GET_CODE (inner);
6162 machine_mode inner_mode = GET_MODE (inner);
6168 if (CONST_INT_P (XEXP (inner, 1))
6169 && (inner_mode == QImode || inner_mode == HImode))
6170 op = XEXP (inner, 0);
6173 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6174 && (inner_mode == QImode || inner_mode == HImode))
6175 op = XEXP (inner, 0);
6178 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6179 && (inner_mode == QImode || inner_mode == HImode))
6180 op = XEXP (inner, 0);
6189 /* Return true if the mask and a shift amount from an RTX of the form
6190 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6191 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
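/* Worked example (illustrative annotation): in SImode, mask 0x3f0 with
   shift amount 4 is accepted: (0x3f0 >> 4) + 1 == 0x40 is a power of two
   and no mask bits fall below the shift amount.  */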
6194 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6196 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6197 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6198 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6199 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6202 /* Calculate the cost of calculating X, storing it in *COST. Result
6203 is true if the total cost of the operation has now been calculated. */
6205 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6206 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6209 const struct cpu_cost_table *extra_cost
6210 = aarch64_tune_params.insn_extra_cost;
6211 int code = GET_CODE (x);
6213 /* By default, assume that everything has equivalent cost to the
6214 cheapest instruction. Any additional costs are applied as a delta
6215 above this default. */
6216 *cost = COSTS_N_INSNS (1);
6221 /* The cost depends entirely on the operands to SET. */
6226 switch (GET_CODE (op0))
6231 rtx address = XEXP (op0, 0);
6232 if (VECTOR_MODE_P (mode))
6233 *cost += extra_cost->ldst.storev;
6234 else if (GET_MODE_CLASS (mode) == MODE_INT)
6235 *cost += extra_cost->ldst.store;
6236 else if (mode == SFmode)
6237 *cost += extra_cost->ldst.storef;
6238 else if (mode == DFmode)
6239 *cost += extra_cost->ldst.stored;
6242 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6246 *cost += rtx_cost (op1, mode, SET, 1, speed);
6250 if (! REG_P (SUBREG_REG (op0)))
6251 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6255 /* The cost is one per vector-register copied. */
6256 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6258 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6259 / GET_MODE_SIZE (V4SImode);
6260 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6262 /* const0_rtx is in general free, but we will use an
6263 instruction to set a register to 0. */
6264 else if (REG_P (op1) || op1 == const0_rtx)
6266 /* The cost is 1 per register copied. */
6267 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6269 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6272 /* Cost is just the cost of the RHS of the set. */
6273 *cost += rtx_cost (op1, mode, SET, 1, speed);
6278 /* Bit-field insertion. Strip any redundant widening of
6279 the RHS to meet the width of the target. */
6280 if (GET_CODE (op1) == SUBREG)
6281 op1 = SUBREG_REG (op1);
6282 if ((GET_CODE (op1) == ZERO_EXTEND
6283 || GET_CODE (op1) == SIGN_EXTEND)
6284 && CONST_INT_P (XEXP (op0, 1))
6285 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6286 >= INTVAL (XEXP (op0, 1))))
6287 op1 = XEXP (op1, 0);
6289 if (CONST_INT_P (op1))
6291 /* MOV immediate is assumed to always be cheap. */
6292 *cost = COSTS_N_INSNS (1);
6298 *cost += extra_cost->alu.bfi;
6299 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6305 /* We can't make sense of this, assume default cost. */
6306 *cost = COSTS_N_INSNS (1);
6312 /* If an instruction can incorporate a constant within the
6313 instruction, the instruction's expression avoids calling
6314 rtx_cost() on the constant. If rtx_cost() is called on a
6315 constant, then it is usually because the constant must be
6316 moved into a register by one or more instructions.
6318 The exception is constant 0, which can be expressed
6319 as XZR/WZR and is therefore free. The exception to this is
6320 if we have (set (reg) (const0_rtx)) in which case we must cost
6321 the move. However, we can catch that when we cost the SET, so
6322 we don't need to consider that here. */
6323 if (x == const0_rtx)
6327 /* To an approximation, building any other constant is
6328 proportionally expensive to the number of instructions
6329 required to build that constant. This is true whether we
6330 are compiling for SPEED or otherwise. */
6331 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6332 (NULL_RTX, x, false, mode));
6339 /* mov[df,sf]_aarch64. */
6340 if (aarch64_float_const_representable_p (x))
6341 /* FMOV (scalar immediate). */
6342 *cost += extra_cost->fp[mode == DFmode].fpconst;
6343 else if (!aarch64_float_const_zero_rtx_p (x))
6345 /* This will be a load from memory. */
6347 *cost += extra_cost->ldst.loadd;
6349 *cost += extra_cost->ldst.loadf;
6352 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6353 or MOV v0.s[0], wzr - neither of which are modeled by the
6354 cost tables. Just use the default cost. */
6364 /* For loads we want the base cost of a load, plus an
6365 approximation for the additional cost of the addressing
6367 rtx address = XEXP (x, 0);
6368 if (VECTOR_MODE_P (mode))
6369 *cost += extra_cost->ldst.loadv;
6370 else if (GET_MODE_CLASS (mode) == MODE_INT)
6371 *cost += extra_cost->ldst.load;
6372 else if (mode == SFmode)
6373 *cost += extra_cost->ldst.loadf;
6374 else if (mode == DFmode)
6375 *cost += extra_cost->ldst.loadd;
6378 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6387 if (VECTOR_MODE_P (mode))
6392 *cost += extra_cost->vect.alu;
6397 if (GET_MODE_CLASS (mode) == MODE_INT)
6399 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6400 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6403 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6407 /* Cost this as SUB wzr, X. */
6408 op0 = CONST0_RTX (mode);
6413 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6415 /* Support (neg(fma...)) as a single instruction only if
6416 sign of zeros is unimportant. This matches the decision
6417 making in aarch64.md. */
6418 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6421 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6424 if (GET_CODE (op0) == MULT)
6427 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6432 *cost += extra_cost->fp[mode == DFmode].neg;
6442 if (VECTOR_MODE_P (mode))
6443 *cost += extra_cost->vect.alu;
6445 *cost += extra_cost->alu.clz;
6454 if (op1 == const0_rtx
6455 && GET_CODE (op0) == AND)
6458 mode = GET_MODE (op0);
6462 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6464 /* TODO: A write to the CC flags possibly costs extra, this
6465 needs encoding in the cost tables. */
6467 mode = GET_MODE (op0);
6469 if (GET_CODE (op0) == AND)
6475 if (GET_CODE (op0) == PLUS)
6477 /* ADDS (and CMN alias). */
6482 if (GET_CODE (op0) == MINUS)
6489 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6490 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6491 && CONST_INT_P (XEXP (op0, 2)))
6493 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6494 Handle it here directly rather than going to cost_logic
6495 since we know the immediate generated for the TST is valid
6496 so we can avoid creating an intermediate rtx for it only
6497 for costing purposes. */
6499 *cost += extra_cost->alu.logical;
6501 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6502 ZERO_EXTRACT, 0, speed);
6506 if (GET_CODE (op1) == NEG)
6510 *cost += extra_cost->alu.arith;
6512 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6513 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6519 Compare can freely swap the order of operands, and
6520 canonicalization puts the more complex operation first.
6521 But the integer MINUS logic expects the shift/extend
6522 operation in op1. */
6524 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6532 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6536 *cost += extra_cost->fp[mode == DFmode].compare;
6538 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6540 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6541 /* FCMP supports constant 0.0 for no extra cost. */
6547 if (VECTOR_MODE_P (mode))
6549 /* Vector compare. */
6551 *cost += extra_cost->vect.alu;
6553 if (aarch64_float_const_zero_rtx_p (op1))
6555 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6569 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6571 /* Detect valid immediates. */
6572 if ((GET_MODE_CLASS (mode) == MODE_INT
6573 || (GET_MODE_CLASS (mode) == MODE_CC
6574 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6575 && CONST_INT_P (op1)
6576 && aarch64_uimm12_shift (INTVAL (op1)))
6579 /* SUB(S) (immediate). */
6580 *cost += extra_cost->alu.arith;
6584 /* Look for SUB (extended register). */
6585 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6588 *cost += extra_cost->alu.extend_arith;
6590 op1 = aarch64_strip_extend (op1);
6591 *cost += rtx_cost (op1, VOIDmode,
6592 (enum rtx_code) GET_CODE (op1), 0, speed);
6596 rtx new_op1 = aarch64_strip_extend (op1);
6598 /* Cost this as an FMA-alike operation. */
6599 if ((GET_CODE (new_op1) == MULT
6600 || aarch64_shift_p (GET_CODE (new_op1)))
6603 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6604 (enum rtx_code) code,
6609 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6613 if (VECTOR_MODE_P (mode))
6616 *cost += extra_cost->vect.alu;
6618 else if (GET_MODE_CLASS (mode) == MODE_INT)
6621 *cost += extra_cost->alu.arith;
6623 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6626 *cost += extra_cost->fp[mode == DFmode].addsub;
6640 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6641 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6644 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6645 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6649 if (GET_MODE_CLASS (mode) == MODE_INT
6650 && CONST_INT_P (op1)
6651 && aarch64_uimm12_shift (INTVAL (op1)))
6653 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6656 /* ADD (immediate). */
6657 *cost += extra_cost->alu.arith;
6661 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6663 /* Look for ADD (extended register). */
6664 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6667 *cost += extra_cost->alu.extend_arith;
6669 op0 = aarch64_strip_extend (op0);
6670 *cost += rtx_cost (op0, VOIDmode,
6671 (enum rtx_code) GET_CODE (op0), 0, speed);
6675 /* Strip any extend; leave shifts behind, as we will
6676 cost them through mult_cost. */
6677 new_op0 = aarch64_strip_extend (op0);
6679 if (GET_CODE (new_op0) == MULT
6680 || aarch64_shift_p (GET_CODE (new_op0)))
6682 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6687 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6691 if (VECTOR_MODE_P (mode))
6694 *cost += extra_cost->vect.alu;
6696 else if (GET_MODE_CLASS (mode) == MODE_INT)
6699 *cost += extra_cost->alu.arith;
6701 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6704 *cost += extra_cost->fp[mode == DFmode].addsub;
6711 *cost = COSTS_N_INSNS (1);
6715 if (VECTOR_MODE_P (mode))
6716 *cost += extra_cost->vect.alu;
6718 *cost += extra_cost->alu.rev;
6723 if (aarch_rev16_p (x))
6725 *cost = COSTS_N_INSNS (1);
6729 if (VECTOR_MODE_P (mode))
6730 *cost += extra_cost->vect.alu;
6732 *cost += extra_cost->alu.rev;
6737 if (aarch64_extr_rtx_p (x, &op0, &op1))
6739 *cost += rtx_cost (op0, mode, IOR, 0, speed);
6740 *cost += rtx_cost (op1, mode, IOR, 1, speed);
6742 *cost += extra_cost->alu.shift;
6753 if (VECTOR_MODE_P (mode))
6756 *cost += extra_cost->vect.alu;
6761 && GET_CODE (op0) == MULT
6762 && CONST_INT_P (XEXP (op0, 1))
6763 && CONST_INT_P (op1)
6764 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6767 /* This is a UBFM/SBFM. */
6768 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
6770 *cost += extra_cost->alu.bfx;
6774 if (GET_MODE_CLASS (mode) == MODE_INT)
6776 if (CONST_INT_P (op1))
6778 /* We have a mask + shift version of a UBFIZ
6779 i.e. the *andim_ashift<mode>_bfiz pattern. */
6780 if (GET_CODE (op0) == ASHIFT
6781 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
6784 *cost += rtx_cost (XEXP (op0, 0), mode,
6785 (enum rtx_code) code, 0, speed);
6787 *cost += extra_cost->alu.bfx;
6791 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
6793 /* We possibly get the immediate for free, this is not
6795 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6797 *cost += extra_cost->alu.logical;
6806 /* Handle ORN, EON, or BIC. */
6807 if (GET_CODE (op0) == NOT)
6808 op0 = XEXP (op0, 0);
6810 new_op0 = aarch64_strip_shift (op0);
6812 /* If we had a shift on op0 then this is a logical-shift-
6813 by-register/immediate operation. Otherwise, this is just
6814 a logical operation. */
6819 /* Shift by immediate. */
6820 if (CONST_INT_P (XEXP (op0, 1)))
6821 *cost += extra_cost->alu.log_shift;
6823 *cost += extra_cost->alu.log_shift_reg;
6826 *cost += extra_cost->alu.logical;
6829 /* In both cases we want to cost both operands. */
6830 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
6831 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
6840 op0 = aarch64_strip_shift (x);
6842 if (VECTOR_MODE_P (mode))
6845 *cost += extra_cost->vect.alu;
6849 /* MVN-shifted-reg. */
6852 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6855 *cost += extra_cost->alu.log_shift;
6859 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6860 Handle the second form here taking care that 'a' in the above can
6862 else if (GET_CODE (op0) == XOR)
6864 rtx newop0 = XEXP (op0, 0);
6865 rtx newop1 = XEXP (op0, 1);
6866 rtx op0_stripped = aarch64_strip_shift (newop0);
6868 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
6869 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6873 if (op0_stripped != newop0)
6874 *cost += extra_cost->alu.log_shift;
6876 *cost += extra_cost->alu.logical;
6883 *cost += extra_cost->alu.logical;
6890 /* If a value is written in SI mode, then zero extended to DI
6891 mode, the operation will in general be free, as a write to
6892 a 'w' register implicitly zeroes the upper bits of an 'x'
6893 register. However, if this is
6895 (set (reg) (zero_extend (reg)))
6897 we must cost the explicit register move. */
6899 && GET_MODE (op0) == SImode
6902 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
6904 /* If OP_COST is non-zero, then the cost of the zero extend
6905 is effectively the cost of the inner operation. Otherwise
6906 we have a MOV instruction and we take the cost from the MOV
6907 itself. This is true independently of whether we are
6908 optimizing for space or time. */
6914 else if (MEM_P (op0))
6916 /* All loads can zero extend to any size for free. */
6917 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
6921 op0 = aarch64_extend_bitfield_pattern_p (x);
6924 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
6926 *cost += extra_cost->alu.bfx;
6932 if (VECTOR_MODE_P (mode))
6935 *cost += extra_cost->vect.alu;
6939 /* We generate an AND instead of UXTB/UXTH. */
6940 *cost += extra_cost->alu.logical;
6946 if (MEM_P (XEXP (x, 0)))
6951 rtx address = XEXP (XEXP (x, 0), 0);
6952 *cost += extra_cost->ldst.load_sign_extend;
6955 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6961 op0 = aarch64_extend_bitfield_pattern_p (x);
6964 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
6966 *cost += extra_cost->alu.bfx;
6972 if (VECTOR_MODE_P (mode))
6973 *cost += extra_cost->vect.alu;
6975 *cost += extra_cost->alu.extend;
6983 if (CONST_INT_P (op1))
6987 if (VECTOR_MODE_P (mode))
6989 /* Vector shift (immediate). */
6990 *cost += extra_cost->vect.alu;
6994 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6996 *cost += extra_cost->alu.shift;
7000 /* We can incorporate zero/sign extend for free. */
7001 if (GET_CODE (op0) == ZERO_EXTEND
7002 || GET_CODE (op0) == SIGN_EXTEND)
7003 op0 = XEXP (op0, 0);
7005 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7012 if (VECTOR_MODE_P (mode))
7014 /* Vector shift (register). */
7015 *cost += extra_cost->vect.alu;
7020 *cost += extra_cost->alu.shift_reg;
7023 return false; /* All arguments need to be in registers. */
7033 if (CONST_INT_P (op1))
7035 /* ASR (immediate) and friends. */
7038 if (VECTOR_MODE_P (mode))
7039 *cost += extra_cost->vect.alu;
7041 *cost += extra_cost->alu.shift;
7044 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7050 /* ASR (register) and friends. */
7053 if (VECTOR_MODE_P (mode))
7054 *cost += extra_cost->vect.alu;
7056 *cost += extra_cost->alu.shift_reg;
7058 return false; /* All arguments need to be in registers. */
7063 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7064 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7068 *cost += extra_cost->ldst.load;
7070 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7071 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7073 /* ADRP, followed by ADD. */
7074 *cost += COSTS_N_INSNS (1);
7076 *cost += 2 * extra_cost->alu.arith;
7078 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7079 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7083 *cost += extra_cost->alu.arith;
7088 /* One extra load instruction, after accessing the GOT. */
7089 *cost += COSTS_N_INSNS (1);
7091 *cost += extra_cost->ldst.load;
7097 /* ADRP/ADD (immediate). */
7099 *cost += extra_cost->alu.arith;
7107 if (VECTOR_MODE_P (mode))
7108 *cost += extra_cost->vect.alu;
7110 *cost += extra_cost->alu.bfx;
7113 /* We can trust that the immediates used will be correct (there
7114 are no by-register forms), so we need only cost op0. */
7115 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7119 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7120 /* aarch64_rtx_mult_cost always handles recursion to its operands.  */
7125 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7126 ANDs and a CSNEG. Assume here that CSNEG costs the same as
7127 an unconditional negate. This case should only ever be reached through
7128 the set_smod_pow2_cheap check in expmed.c. */
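/* As an illustration, x % 4 in SImode is expanded to roughly:
     negs  w1, w0
     and   w0, w0, 3
     and   w1, w1, 3
     csneg w0, w0, w1, mi
   i.e. one NEGS, two ANDs and one CSNEG, matching the four-insn
   baseline set below.  */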
7129 if (CONST_INT_P (XEXP (x, 1))
7130 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7131 && (mode == SImode || mode == DImode))
7133 /* We expand to 4 instructions. Reset the baseline. */
7134 *cost = COSTS_N_INSNS (4);
7137 *cost += 2 * extra_cost->alu.logical
7138 + 2 * extra_cost->alu.arith;
7147 if (VECTOR_MODE_P (mode))
7148 *cost += extra_cost->vect.alu;
7149 else if (GET_MODE_CLASS (mode) == MODE_INT)
7150 *cost += (extra_cost->mult[mode == DImode].add
7151 + extra_cost->mult[mode == DImode].idiv);
7152 else if (mode == DFmode)
7153 *cost += (extra_cost->fp[1].mult
7154 + extra_cost->fp[1].div);
7155 else if (mode == SFmode)
7156 *cost += (extra_cost->fp[0].mult
7157 + extra_cost->fp[0].div);
7159 return false; /* All arguments need to be in registers. */
7166 if (VECTOR_MODE_P (mode))
7167 *cost += extra_cost->vect.alu;
7168 else if (GET_MODE_CLASS (mode) == MODE_INT)
7169 /* There is no integer SQRT, so only DIV and UDIV can get here.  */
7171 *cost += extra_cost->mult[mode == DImode].idiv;
7173 *cost += extra_cost->fp[mode == DFmode].div;
7175 return false; /* All arguments need to be in registers. */
7178 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7179 XEXP (x, 2), cost, speed);
7192 return false; /* All arguments must be in registers. */
7201 if (VECTOR_MODE_P (mode))
7202 *cost += extra_cost->vect.alu;
7204 *cost += extra_cost->fp[mode == DFmode].fma;
7207 /* FMSUB, FNMADD, and FNMSUB are free. */
7208 if (GET_CODE (op0) == NEG)
7209 op0 = XEXP (op0, 0);
7211 if (GET_CODE (op2) == NEG)
7212 op2 = XEXP (op2, 0);
7214 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7215 and the by-element operand as operand 0. */
7216 if (GET_CODE (op1) == NEG)
7217 op1 = XEXP (op1, 0);
7219 /* Catch vector-by-element operations. The by-element operand can
7220 either be (vec_duplicate (vec_select (x))) or just
7221 (vec_select (x)), depending on whether we are multiplying by
7222 a vector or a scalar.
7224 Canonicalization is not very good in these cases: FMA4 will put the
7225 by-element operand as operand 0, FNMA4 will have it as operand 1. */
7226 if (GET_CODE (op0) == VEC_DUPLICATE)
7227 op0 = XEXP (op0, 0);
7228 else if (GET_CODE (op1) == VEC_DUPLICATE)
7229 op1 = XEXP (op1, 0);
7231 if (GET_CODE (op0) == VEC_SELECT)
7232 op0 = XEXP (op0, 0);
7233 else if (GET_CODE (op1) == VEC_SELECT)
7234 op1 = XEXP (op1, 0);
7236 /* If the remaining parameters are not registers,
7237 get the cost to put them into registers. */
7238 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7239 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7240 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7244 case UNSIGNED_FLOAT:
7246 *cost += extra_cost->fp[mode == DFmode].fromint;
7252 if (VECTOR_MODE_P (mode))
7254 /* Vector truncate.  */
7255 *cost += extra_cost->vect.alu;
7258 *cost += extra_cost->fp[mode == DFmode].widen;
7262 case FLOAT_TRUNCATE:
7265 if (VECTOR_MODE_P (mode))
7267 /* Vector conversion.  */
7268 *cost += extra_cost->vect.alu;
7271 *cost += extra_cost->fp[mode == DFmode].narrow;
7278 /* Strip the rounding part. They will all be implemented
7279 by the fcvt* family of instructions anyway. */
7280 if (GET_CODE (x) == UNSPEC)
7282 unsigned int uns_code = XINT (x, 1);
7284 if (uns_code == UNSPEC_FRINTA
7285 || uns_code == UNSPEC_FRINTM
7286 || uns_code == UNSPEC_FRINTN
7287 || uns_code == UNSPEC_FRINTP
7288 || uns_code == UNSPEC_FRINTZ)
7289 x = XVECEXP (x, 0, 0);
7294 if (VECTOR_MODE_P (mode))
7295 *cost += extra_cost->vect.alu;
7297 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7300 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7301 fixed-point fcvt. */
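/* For example, (int) (f * 16.0f) can be emitted as a single
   "fcvtzs w0, s0, #4", the fixed-point form of the conversion,
   rather than an fmul followed by an fcvtzs.  */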
7302 if (GET_CODE (x) == MULT
7303 && ((VECTOR_MODE_P (mode)
7304 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7305 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7307 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7312 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7316 if (VECTOR_MODE_P (mode))
7320 *cost += extra_cost->vect.alu;
7322 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7326 /* FABD, which is analogous to FADD. */
7327 if (GET_CODE (op0) == MINUS)
7329 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7330 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7332 *cost += extra_cost->fp[mode == DFmode].addsub;
7336 /* Simple FABS is analogous to FNEG. */
7338 *cost += extra_cost->fp[mode == DFmode].neg;
7342 /* Integer ABS will either be split to
7343 two arithmetic instructions, or will be an ABS
7344 (scalar), which we don't model. */
7345 *cost = COSTS_N_INSNS (2);
7347 *cost += 2 * extra_cost->alu.arith;
7355 if (VECTOR_MODE_P (mode))
7356 *cost += extra_cost->vect.alu;
7359 /* FMAXNM/FMINNM/FMAX/FMIN.
7360 TODO: This may not be accurate for all implementations, but
7361 we do not model this in the cost tables. */
7362 *cost += extra_cost->fp[mode == DFmode].addsub;
7368 /* The floating point round to integer frint* instructions. */
7369 if (aarch64_frint_unspec_p (XINT (x, 1)))
7372 *cost += extra_cost->fp[mode == DFmode].roundint;
7377 if (XINT (x, 1) == UNSPEC_RBIT)
7380 *cost += extra_cost->alu.rev;
7388 /* Decompose <su>muldi3_highpart. */
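/* The RTL matched below is what an expression such as
     (long) (((__int128) a * b) >> 64)
   expands to, i.e. the high half of a 64x64->128 multiply, which is
   implemented with a single UMULH/SMULH.  */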
7389 if (/* (truncate:DI */
7392 && GET_MODE (XEXP (x, 0)) == TImode
7393 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7395 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7396 /* (ANY_EXTEND:TI (reg:DI))
7397 (ANY_EXTEND:TI (reg:DI))) */
7398 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7399 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7400 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7401 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7402 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7403 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7404 /* (const_int 64) */
7405 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7406 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7410 *cost += extra_cost->mult[mode == DImode].extend;
7411 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7412 mode, MULT, 0, speed);
7413 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7414 mode, MULT, 1, speed);
7423 if (dump_file && (dump_flags & TDF_DETAILS))
7425 "\nFailed to cost RTX. Assuming default cost.\n");
7430 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
7431 calculated for X. This cost is stored in *COST. Returns true
7432 if the total cost of X was calculated. */
7434 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7435 int param, int *cost, bool speed)
7437 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7439 if (dump_file && (dump_flags & TDF_DETAILS))
7441 print_rtl_single (dump_file, x);
7442 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7443 speed ? "Hot" : "Cold",
7444 *cost, result ? "final" : "partial");
7451 aarch64_register_move_cost (machine_mode mode,
7452 reg_class_t from_i, reg_class_t to_i)
7454 enum reg_class from = (enum reg_class) from_i;
7455 enum reg_class to = (enum reg_class) to_i;
7456 const struct cpu_regmove_cost *regmove_cost
7457 = aarch64_tune_params.regmove_cost;
7459 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7460 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7463 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7464 from = GENERAL_REGS;
7466 /* Moving between a GPR and the stack costs the same as GP2GP. */
7467 if ((from == GENERAL_REGS && to == STACK_REG)
7468 || (to == GENERAL_REGS && from == STACK_REG))
7469 return regmove_cost->GP2GP;
7471 /* To/From the stack register, we move via the gprs. */
7472 if (to == STACK_REG || from == STACK_REG)
7473 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7474 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7476 if (GET_MODE_SIZE (mode) == 16)
7478 /* 128-bit operations on general registers require 2 instructions. */
7479 if (from == GENERAL_REGS && to == GENERAL_REGS)
7480 return regmove_cost->GP2GP * 2;
7481 else if (from == GENERAL_REGS)
7482 return regmove_cost->GP2FP * 2;
7483 else if (to == GENERAL_REGS)
7484 return regmove_cost->FP2GP * 2;
7486 /* When AdvSIMD instructions are disabled it is not possible to move
7487 a 128-bit value directly between Q registers. This is handled in
7488 secondary reload. A general register is used as a scratch to move
7489 the upper DI value and the lower DI value is moved directly,
7490 hence the cost is the sum of three moves. */
7492 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7494 return regmove_cost->FP2FP;
7497 if (from == GENERAL_REGS && to == GENERAL_REGS)
7498 return regmove_cost->GP2GP;
7499 else if (from == GENERAL_REGS)
7500 return regmove_cost->GP2FP;
7501 else if (to == GENERAL_REGS)
7502 return regmove_cost->FP2GP;
7504 return regmove_cost->FP2FP;
7508 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7509 reg_class_t rclass ATTRIBUTE_UNUSED,
7510 bool in ATTRIBUTE_UNUSED)
7512 return aarch64_tune_params.memmov_cost;
7515 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7516 to optimize 1.0/sqrt. */
7519 use_rsqrt_p (machine_mode mode)
7521 return (!flag_trapping_math
7522 && flag_unsafe_math_optimizations
7523 && ((aarch64_tune_params.approx_modes->recip_sqrt
7524 & AARCH64_APPROX_MODE (mode))
7525 || flag_mrecip_low_precision_sqrt));
7528 /* Function to decide when to use the approximate reciprocal square root builtin.  */
7532 aarch64_builtin_reciprocal (tree fndecl)
7534 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
7536 if (!use_rsqrt_p (mode))
7538 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7541 typedef rtx (*rsqrte_type) (rtx, rtx);
7543 /* Select reciprocal square root initial estimate insn depending on machine mode.  */
7547 get_rsqrte_type (machine_mode mode)
7551 case DFmode: return gen_aarch64_rsqrtedf;
7552 case SFmode: return gen_aarch64_rsqrtesf;
7553 case V2DFmode: return gen_aarch64_rsqrtev2df;
7554 case V2SFmode: return gen_aarch64_rsqrtev2sf;
7555 case V4SFmode: return gen_aarch64_rsqrtev4sf;
7556 default: gcc_unreachable ();
7560 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7562 /* Select reciprocal square root series step insn depending on machine mode. */
7565 get_rsqrts_type (machine_mode mode)
7569 case DFmode: return gen_aarch64_rsqrtsdf;
7570 case SFmode: return gen_aarch64_rsqrtssf;
7571 case V2DFmode: return gen_aarch64_rsqrtsv2df;
7572 case V2SFmode: return gen_aarch64_rsqrtsv2sf;
7573 case V4SFmode: return gen_aarch64_rsqrtsv4sf;
7574 default: gcc_unreachable ();
7578 /* Emit instruction sequence to compute either the approximate square root
7579 or its approximate reciprocal, depending on the flag RECP, and return
7580 whether the sequence was emitted or not. */
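/* A sketch of the math behind the loop below: FRSQRTE gives an
   initial estimate e ~= 1/sqrt(d), and FRSQRTS computes
   (3 - a * b) / 2, so each iteration refines the estimate as
     e' = e * (3 - d * e * e) / 2
   which is the Newton-Raphson recurrence for 1/sqrt(d).  */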
7583 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
7585 machine_mode mode = GET_MODE (dst);
7587 if (GET_MODE_INNER (mode) == HFmode)
7590 machine_mode mmsk = mode_for_vector
7591 (int_mode_for_mode (GET_MODE_INNER (mode)),
7592 GET_MODE_NUNITS (mode));
7593 bool use_approx_sqrt_p = (!recp
7594 && (flag_mlow_precision_sqrt
7595 || (aarch64_tune_params.approx_modes->sqrt
7596 & AARCH64_APPROX_MODE (mode))));
7597 bool use_approx_rsqrt_p = (recp
7598 && (flag_mrecip_low_precision_sqrt
7599 || (aarch64_tune_params.approx_modes->recip_sqrt
7600 & AARCH64_APPROX_MODE (mode))));
7602 if (!flag_finite_math_only
7603 || flag_trapping_math
7604 || !flag_unsafe_math_optimizations
7605 || !(use_approx_sqrt_p || use_approx_rsqrt_p)
7606 || optimize_function_for_size_p (cfun))
7609 rtx xmsk = gen_reg_rtx (mmsk);
7611 /* When calculating the approximate square root, compare the argument with
7612 0.0 and create a mask. */
7613 emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
7614 CONST0_RTX (mode)))));
7616 /* Estimate the approximate reciprocal square root. */
7617 rtx xdst = gen_reg_rtx (mode);
7618 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
7620 /* Iterate over the series twice for SF and thrice for DF. */
7621 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7623 /* Optionally iterate over the series once less for faster performance
7624 while sacrificing some accuracy. */
7625 if ((recp && flag_mrecip_low_precision_sqrt)
7626 || (!recp && flag_mlow_precision_sqrt))
7629 /* Iterate over the series to calculate the approximate reciprocal square root.  */
7631 rtx x1 = gen_reg_rtx (mode);
7632 while (iterations--)
7634 rtx x2 = gen_reg_rtx (mode);
7635 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
7637 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
7640 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
7645 /* Qualify the approximate reciprocal square root when the argument is
7646 0.0 by squashing the intermediary result to 0.0. */
7647 rtx xtmp = gen_reg_rtx (mmsk);
7648 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
7649 gen_rtx_SUBREG (mmsk, xdst, 0)));
7650 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
7652 /* Calculate the approximate square root. */
7653 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
7656 /* Finalize the approximation. */
7657 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
7662 typedef rtx (*recpe_type) (rtx, rtx);
7664 /* Select reciprocal initial estimate insn depending on machine mode. */
7667 get_recpe_type (machine_mode mode)
7671 case SFmode: return (gen_aarch64_frecpesf);
7672 case V2SFmode: return (gen_aarch64_frecpev2sf);
7673 case V4SFmode: return (gen_aarch64_frecpev4sf);
7674 case DFmode: return (gen_aarch64_frecpedf);
7675 case V2DFmode: return (gen_aarch64_frecpev2df);
7676 default: gcc_unreachable ();
7680 typedef rtx (*recps_type) (rtx, rtx, rtx);
7682 /* Select reciprocal series step insn depending on machine mode. */
7685 get_recps_type (machine_mode mode)
7689 case SFmode: return (gen_aarch64_frecpssf);
7690 case V2SFmode: return (gen_aarch64_frecpsv2sf);
7691 case V4SFmode: return (gen_aarch64_frecpsv4sf);
7692 case DFmode: return (gen_aarch64_frecpsdf);
7693 case V2DFmode: return (gen_aarch64_frecpsv2df);
7694 default: gcc_unreachable ();
7698 /* Emit the instruction sequence to compute the approximation for the division
7699 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
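/* A sketch of the math behind the loop below: FRECPE gives an
   initial estimate x ~= 1/den, and FRECPS computes 2 - a * b, so
   each iteration refines the estimate as
     x' = x * (2 - den * x)
   which is the Newton-Raphson recurrence for 1/den; the quotient is
   then approximated as num * x.  */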
7702 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
7704 machine_mode mode = GET_MODE (quo);
7706 if (GET_MODE_INNER (mode) == HFmode)
7709 bool use_approx_division_p = (flag_mlow_precision_div
7710 || (aarch64_tune_params.approx_modes->division
7711 & AARCH64_APPROX_MODE (mode)));
7713 if (!flag_finite_math_only
7714 || flag_trapping_math
7715 || !flag_unsafe_math_optimizations
7716 || optimize_function_for_size_p (cfun)
7717 || !use_approx_division_p)
7720 /* Estimate the approximate reciprocal. */
7721 rtx xrcp = gen_reg_rtx (mode);
7722 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
7724 /* Iterate over the series twice for SF and thrice for DF. */
7725 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7727 /* Optionally iterate over the series once less for faster performance,
7728 while sacrificing some accuracy. */
7729 if (flag_mlow_precision_div)
7732 /* Iterate over the series to calculate the approximate reciprocal. */
7733 rtx xtmp = gen_reg_rtx (mode);
7734 while (iterations--)
7736 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
7739 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
7742 if (num != CONST1_RTX (mode))
7744 /* As the approximate reciprocal of DEN is already calculated, only
7745 calculate the approximate division when NUM is not 1.0. */
7746 rtx xnum = force_reg (mode, num);
7747 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
7750 /* Finalize the approximation. */
7751 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
7755 /* Return the number of instructions that can be issued per cycle. */
7757 aarch64_sched_issue_rate (void)
7759 return aarch64_tune_params.issue_rate;
7763 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
7765 int issue_rate = aarch64_sched_issue_rate ();
7767 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
7771 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
7772 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
7773 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
7776 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
7779 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
7783 /* Vectorizer cost model target hooks. */
7785 /* Implement targetm.vectorize.builtin_vectorization_cost. */
7787 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
7789 int misalign ATTRIBUTE_UNUSED)
7793 switch (type_of_cost)
7796 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
7799 return aarch64_tune_params.vec_costs->scalar_load_cost;
7802 return aarch64_tune_params.vec_costs->scalar_store_cost;
7805 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7808 return aarch64_tune_params.vec_costs->vec_align_load_cost;
7811 return aarch64_tune_params.vec_costs->vec_store_cost;
7814 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7817 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7819 case unaligned_load:
7820 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7822 case unaligned_store:
7823 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7825 case cond_branch_taken:
7826 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7828 case cond_branch_not_taken:
7829 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7832 return aarch64_tune_params.vec_costs->vec_permute_cost;
7834 case vec_promote_demote:
7835 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7838 elements = TYPE_VECTOR_SUBPARTS (vectype);
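/* E.g. for a V4SF constructor TYPE_VECTOR_SUBPARTS is 4, giving a
   cost of 4 / 2 + 1 = 3 (a rough heuristic rather than an exact
   instruction count).  */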
7839 return elements / 2 + 1;
7846 /* Implement targetm.vectorize.add_stmt_cost. */
7848 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7849 struct _stmt_vec_info *stmt_info, int misalign,
7850 enum vect_cost_model_location where)
7852 unsigned *cost = (unsigned *) data;
7853 unsigned retval = 0;
7855 if (flag_vect_cost_model)
7857 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7859 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7861 /* Statements in an inner loop relative to the loop being
7862 vectorized are weighted more heavily. The value here is
7863 arbitrary and could potentially be improved with analysis. */
7864 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7865 count *= 50; /* FIXME */
7867 retval = (unsigned) (count * stmt_cost);
7868 cost[where] += retval;
7874 static void initialize_aarch64_code_model (struct gcc_options *);
7876 /* Parse the TO_PARSE string and put the architecture struct that it
7877 selects into RES and the architectural features into ISA_FLAGS.
7878 Return an aarch64_parse_opt_result describing the parse result.
7879 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
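/* For example, given "-march=armv8-a+crc" this receives
   "armv8-a+crc" in TO_PARSE: everything up to the first '+' is
   matched against all_architectures and the remaining "+crc" is
   handed to aarch64_parse_extension.  */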
7881 static enum aarch64_parse_opt_result
7882 aarch64_parse_arch (const char *to_parse, const struct processor **res,
7883 unsigned long *isa_flags)
7886 const struct processor *arch;
7887 char *str = (char *) alloca (strlen (to_parse) + 1);
7890 strcpy (str, to_parse);
7892 ext = strchr (str, '+');
7900 return AARCH64_PARSE_MISSING_ARG;
7903 /* Loop through the list of supported ARCHes to find a match. */
7904 for (arch = all_architectures; arch->name != NULL; arch++)
7906 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7908 unsigned long isa_temp = arch->flags;
7912 /* TO_PARSE string contains at least one extension. */
7913 enum aarch64_parse_opt_result ext_res
7914 = aarch64_parse_extension (ext, &isa_temp);
7916 if (ext_res != AARCH64_PARSE_OK)
7919 /* Extension parsing was successful. Confirm the result
7920 arch and ISA flags. */
7922 *isa_flags = isa_temp;
7923 return AARCH64_PARSE_OK;
7927 /* ARCH name not found in list. */
7928 return AARCH64_PARSE_INVALID_ARG;
7931 /* Parse the TO_PARSE string and put the cpu it selects into RES and the
7932 ISA flags it implies into ISA_FLAGS. Return an aarch64_parse_opt_result
7933 describing the parse result. If there is an error parsing, RES and
7934 ISA_FLAGS are left unchanged. */
7936 static enum aarch64_parse_opt_result
7937 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
7938 unsigned long *isa_flags)
7941 const struct processor *cpu;
7942 char *str = (char *) alloca (strlen (to_parse) + 1);
7945 strcpy (str, to_parse);
7947 ext = strchr (str, '+');
7955 return AARCH64_PARSE_MISSING_ARG;
7958 /* Loop through the list of supported CPUs to find a match. */
7959 for (cpu = all_cores; cpu->name != NULL; cpu++)
7961 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7963 unsigned long isa_temp = cpu->flags;
7968 /* TO_PARSE string contains at least one extension. */
7969 enum aarch64_parse_opt_result ext_res
7970 = aarch64_parse_extension (ext, &isa_temp);
7972 if (ext_res != AARCH64_PARSE_OK)
7975 /* Extension parsing was successful. Confirm the result
7976 cpu and ISA flags. */
7978 *isa_flags = isa_temp;
7979 return AARCH64_PARSE_OK;
7983 /* CPU name not found in list. */
7984 return AARCH64_PARSE_INVALID_ARG;
7987 /* Parse the TO_PARSE string and put the cpu it selects into RES.
7988 Return an aarch64_parse_opt_result describing the parse result.
7989 If the parsing fails, RES does not change. */
7991 static enum aarch64_parse_opt_result
7992 aarch64_parse_tune (const char *to_parse, const struct processor **res)
7994 const struct processor *cpu;
7995 char *str = (char *) alloca (strlen (to_parse) + 1);
7997 strcpy (str, to_parse);
7999 /* Loop through the list of supported CPUs to find a match. */
8000 for (cpu = all_cores; cpu->name != NULL; cpu++)
8002 if (strcmp (cpu->name, str) == 0)
8005 return AARCH64_PARSE_OK;
8009 /* CPU name not found in list. */
8010 return AARCH64_PARSE_INVALID_ARG;
8013 /* Parse TOKEN, which has length LENGTH to see if it is an option
8014 described in FLAG. If it is, return the index bit for that fusion type.
8015 If not, error (printing OPTION_NAME) and return zero. */
8018 aarch64_parse_one_option_token (const char *token,
8020 const struct aarch64_flag_desc *flag,
8021 const char *option_name)
8023 for (; flag->name != NULL; flag++)
8025 if (length == strlen (flag->name)
8026 && !strncmp (flag->name, token, length))
8030 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8034 /* Parse OPTION which is a comma-separated list of flags to enable.
8035 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8036 default state we inherit from the CPU tuning structures. OPTION_NAME
8037 gives the top-level option we are parsing in the -moverride string,
8038 for use in error messages. */
8041 aarch64_parse_boolean_options (const char *option,
8042 const struct aarch64_flag_desc *flags,
8043 unsigned int initial_state,
8044 const char *option_name)
8046 const char separator = '.';
8047 const char* specs = option;
8048 const char* ntoken = option;
8049 unsigned int found_flags = initial_state;
8051 while ((ntoken = strchr (specs, separator)))
8053 size_t token_length = ntoken - specs;
8054 unsigned token_ops = aarch64_parse_one_option_token (specs,
8058 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8059 in the token stream, reset the supported operations. So:
8061 adrp+add.cmp+branch.none.adrp+add
8063 would have the result of turning on only adrp+add fusion. */
8067 found_flags |= token_ops;
8071 /* The string ended with a trailing separator; report an error. */
8074 error ("%s string ill-formed\n", option_name);
8078 /* We still have one more token to parse. */
8079 size_t token_length = strlen (specs);
8080 unsigned token_ops = aarch64_parse_one_option_token (specs,
8087 found_flags |= token_ops;
8091 /* Support for overriding instruction fusion. */
8094 aarch64_parse_fuse_string (const char *fuse_string,
8095 struct tune_params *tune)
8097 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8098 aarch64_fusible_pairs,
8103 /* Support for overriding other tuning flags. */
8106 aarch64_parse_tune_string (const char *tune_string,
8107 struct tune_params *tune)
8109 tune->extra_tuning_flags
8110 = aarch64_parse_boolean_options (tune_string,
8111 aarch64_tuning_flags,
8112 tune->extra_tuning_flags,
8116 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8117 we understand. If it is, extract the option string and hand off to
8118 the appropriate function. */
8121 aarch64_parse_one_override_token (const char* token,
8123 struct tune_params *tune)
8125 const struct aarch64_tuning_override_function *fn
8126 = aarch64_tuning_override_functions;
8128 const char *option_part = strchr (token, '=');
8131 error ("tuning string missing in option (%s)", token);
8135 /* Get the length of the option name. */
8136 length = option_part - token;
8137 /* Skip the '=' to get to the option string. */
8140 for (; fn->name != NULL; fn++)
8142 if (!strncmp (fn->name, token, length))
8144 fn->parse_override (option_part, tune);
8149 error ("unknown tuning option (%s)", token);
8153 /* A checking mechanism for the implementation of the tls size. */
8156 initialize_aarch64_tls_size (struct gcc_options *opts)
8158 if (aarch64_tls_size == 0)
8159 aarch64_tls_size = 24;
8161 switch (opts->x_aarch64_cmodel_var)
8163 case AARCH64_CMODEL_TINY:
8164 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
8165 needs two instructions to address, so we clamp the size to 24. */
8166 if (aarch64_tls_size > 24)
8167 aarch64_tls_size = 24;
8169 case AARCH64_CMODEL_SMALL:
8170 /* The maximum TLS size allowed under small is 4G. */
8171 if (aarch64_tls_size > 32)
8172 aarch64_tls_size = 32;
8174 case AARCH64_CMODEL_LARGE:
8175 /* The maximum TLS size allowed under large is 16E.
8176 FIXME: 16E should be 64bit; we only support 48bit offsets now. */
8177 if (aarch64_tls_size > 48)
8178 aarch64_tls_size = 48;
8187 /* Parse STRING looking for options in the format:
8188 string :: option:string
8189 option :: name=substring
8191 substring :: defined by option. */
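/* For instance, an option such as
     -moverride=fuse=adrp+add.cmp+branch
   is split at each ':' into name=substring tokens; the name ("fuse"
   here, assuming that is the key aarch64_parse_fuse_string is
   registered under) selects the handler and the substring is passed
   to it.  */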
8194 aarch64_parse_override_string (const char* input_string,
8195 struct tune_params* tune)
8197 const char separator = ':';
8198 size_t string_length = strlen (input_string) + 1;
8199 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8200 char *string = string_root;
8201 strncpy (string, input_string, string_length);
8202 string[string_length - 1] = '\0';
8204 char* ntoken = string;
8206 while ((ntoken = strchr (string, separator)))
8208 size_t token_length = ntoken - string;
8209 /* Make this substring look like a string. */
8211 aarch64_parse_one_override_token (string, token_length, tune);
8215 /* One last option to parse. */
8216 aarch64_parse_one_override_token (string, strlen (string), tune);
8222 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8224 /* The logic here is that if we are disabling all frame pointer generation
8225 then we do not need to disable leaf frame pointer generation as a
8226 separate operation. But if we are *only* disabling leaf frame pointer
8227 generation then we set flag_omit_frame_pointer to true, but in
8228 aarch64_frame_pointer_required we return false only for leaf functions.
8230 PR 70044: We have to be careful about being called multiple times for the
8231 same function. Once we have decided to set flag_omit_frame_pointer just
8232 so that we can omit leaf frame pointers, we must then not interpret a
8233 second call as meaning that all frame pointer generation should be
8234 omitted. We do this by setting flag_omit_frame_pointer to a special,
8236 if (opts->x_flag_omit_frame_pointer == 2)
8237 opts->x_flag_omit_frame_pointer = 0;
8239 if (opts->x_flag_omit_frame_pointer)
8240 opts->x_flag_omit_leaf_frame_pointer = false;
8241 else if (opts->x_flag_omit_leaf_frame_pointer)
8242 opts->x_flag_omit_frame_pointer = 2;
8244 /* If not optimizing for size, set the default
8245 alignment to what the target wants. */
8246 if (!opts->x_optimize_size)
8248 if (opts->x_align_loops <= 0)
8249 opts->x_align_loops = aarch64_tune_params.loop_align;
8250 if (opts->x_align_jumps <= 0)
8251 opts->x_align_jumps = aarch64_tune_params.jump_align;
8252 if (opts->x_align_functions <= 0)
8253 opts->x_align_functions = aarch64_tune_params.function_align;
8256 /* We default to no pc-relative literal loads. */
8258 aarch64_pcrelative_literal_loads = false;
8260 /* If -mpc-relative-literal-loads is set on the command line, this
8261 implies that the user asked for PC relative literal loads. */
8262 if (opts->x_pcrelative_literal_loads == 1)
8263 aarch64_pcrelative_literal_loads = true;
8265 /* This is PR70113. When building the Linux kernel with
8266 CONFIG_ARM64_ERRATUM_843419, support for relocations
8267 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8268 removed from the kernel to avoid loading objects with possibly
8269 offending sequences. Without -mpc-relative-literal-loads we would
8270 generate such relocations, preventing the kernel build from
8272 if (opts->x_pcrelative_literal_loads == 2
8273 && TARGET_FIX_ERR_A53_843419)
8274 aarch64_pcrelative_literal_loads = true;
8276 /* In the tiny memory model it makes no sense to disallow PC relative
8277 literal pool loads. */
8278 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8279 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8280 aarch64_pcrelative_literal_loads = true;
8282 /* When enabling the lower precision Newton series for the square root, also
8283 enable it for the reciprocal square root, since the latter is an
8284 intermediary step for the former. */
8285 if (flag_mlow_precision_sqrt)
8286 flag_mrecip_low_precision_sqrt = true;
8289 /* 'Unpack' the internal tuning structs and update the options
8290 in OPTS. The caller must have set up selected_tune and selected_arch
8291 as all the other target-specific codegen decisions are
8292 derived from them. */
8295 aarch64_override_options_internal (struct gcc_options *opts)
8297 aarch64_tune_flags = selected_tune->flags;
8298 aarch64_tune = selected_tune->sched_core;
8299 /* Make a copy of the tuning parameters attached to the core, which
8300 we may later overwrite. */
8301 aarch64_tune_params = *(selected_tune->tune);
8302 aarch64_architecture_version = selected_arch->architecture_version;
8304 if (opts->x_aarch64_override_tune_string)
8305 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8306 &aarch64_tune_params);
8308 /* This target defaults to strict volatile bitfields. */
8309 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8310 opts->x_flag_strict_volatile_bitfields = 1;
8312 initialize_aarch64_code_model (opts);
8313 initialize_aarch64_tls_size (opts);
8315 int queue_depth = 0;
8316 switch (aarch64_tune_params.autoprefetcher_model)
8318 case tune_params::AUTOPREFETCHER_OFF:
8321 case tune_params::AUTOPREFETCHER_WEAK:
8324 case tune_params::AUTOPREFETCHER_STRONG:
8325 queue_depth = max_insn_queue_index + 1;
8331 /* We don't mind passing in global_options_set here as we don't use
8332 the *options_set structs anyway. */
8333 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8335 opts->x_param_values,
8336 global_options_set.x_param_values);
8338 /* Set the L1 cache line size. */
8339 if (selected_cpu->tune->cache_line_size != 0)
8340 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8341 selected_cpu->tune->cache_line_size,
8342 opts->x_param_values,
8343 global_options_set.x_param_values);
8345 aarch64_override_options_after_change_1 (opts);
8348 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8349 specified in STR and throw errors if appropriate. Put the results, if
8350 they are valid, in RES and ISA_FLAGS. Return whether the option is
8354 aarch64_validate_mcpu (const char *str, const struct processor **res,
8355 unsigned long *isa_flags)
8357 enum aarch64_parse_opt_result parse_res
8358 = aarch64_parse_cpu (str, res, isa_flags);
8360 if (parse_res == AARCH64_PARSE_OK)
8365 case AARCH64_PARSE_MISSING_ARG:
8366 error ("missing cpu name in -mcpu=%qs", str);
8368 case AARCH64_PARSE_INVALID_ARG:
8369 error ("unknown value %qs for -mcpu", str);
8371 case AARCH64_PARSE_INVALID_FEATURE:
8372 error ("invalid feature modifier in -mcpu=%qs", str);
8381 /* Validate a command-line -march option. Parse the arch and extensions
8382 (if any) specified in STR and throw errors if appropriate. Put the
8383 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8387 aarch64_validate_march (const char *str, const struct processor **res,
8388 unsigned long *isa_flags)
8390 enum aarch64_parse_opt_result parse_res
8391 = aarch64_parse_arch (str, res, isa_flags);
8393 if (parse_res == AARCH64_PARSE_OK)
8398 case AARCH64_PARSE_MISSING_ARG:
8399 error ("missing arch name in -march=%qs", str);
8401 case AARCH64_PARSE_INVALID_ARG:
8402 error ("unknown value %qs for -march", str);
8404 case AARCH64_PARSE_INVALID_FEATURE:
8405 error ("invalid feature modifier in -march=%qs", str);
8414 /* Validate a command-line -mtune option. Parse the cpu
8415 specified in STR and throw errors if appropriate. Put the
8416 result, if it is valid, in RES. Return whether the option is
8420 aarch64_validate_mtune (const char *str, const struct processor **res)
8422 enum aarch64_parse_opt_result parse_res
8423 = aarch64_parse_tune (str, res);
8425 if (parse_res == AARCH64_PARSE_OK)
8430 case AARCH64_PARSE_MISSING_ARG:
8431 error ("missing cpu name in -mtune=%qs", str);
8433 case AARCH64_PARSE_INVALID_ARG:
8434 error ("unknown value %qs for -mtune", str);
8442 /* Return the CPU corresponding to the enum CPU.
8443 If it doesn't specify a cpu, return the default. */
8445 static const struct processor *
8446 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8448 if (cpu != aarch64_none)
8449 return &all_cores[cpu];
8451 /* The & 0x3f is to extract the bottom 6 bits that encode the
8452 default cpu as selected by the --with-cpu GCC configure option
8454 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8455 flags mechanism should be reworked to make it more sane. */
8456 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8459 /* Return the architecture corresponding to the enum ARCH.
8460 If it doesn't specify a valid architecture, return the default. */
8462 static const struct processor *
8463 aarch64_get_arch (enum aarch64_arch arch)
8465 if (arch != aarch64_no_arch)
8466 return &all_architectures[arch];
8468 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8470 return &all_architectures[cpu->arch];
8473 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
8474 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8475 tuning structs. In particular it must set selected_tune and
8476 aarch64_isa_flags that define the available ISA features and tuning
8477 decisions. It must also set selected_arch as this will be used to
8478 output the .arch asm tags for each function. */
8481 aarch64_override_options (void)
8483 unsigned long cpu_isa = 0;
8484 unsigned long arch_isa = 0;
8485 aarch64_isa_flags = 0;
8487 bool valid_cpu = true;
8488 bool valid_tune = true;
8489 bool valid_arch = true;
8491 selected_cpu = NULL;
8492 selected_arch = NULL;
8493 selected_tune = NULL;
8495 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8496 If either of -march or -mtune is given, they override their
8497 respective component of -mcpu. */
8498 if (aarch64_cpu_string)
8499 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8502 if (aarch64_arch_string)
8503 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8506 if (aarch64_tune_string)
8507 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8509 /* If the user did not specify a processor, choose the default
8510 one for them. This will be the CPU set during configuration using
8511 --with-cpu, otherwise it is "generic". */
8516 selected_cpu = &all_cores[selected_arch->ident];
8517 aarch64_isa_flags = arch_isa;
8518 explicit_arch = selected_arch->arch;
8522 /* Get default configure-time CPU. */
8523 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
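/* The bottom 6 bits of TARGET_CPU_DEFAULT encode the cpu itself (see
   the & 0x3f in aarch64_get_tune_cpu); the remaining upper bits hold
   the default ISA flags, which is why they are recovered here with a
   shift by 6.  */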
8524 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8528 explicit_tune_core = selected_tune->ident;
8530 /* If both -mcpu and -march are specified check that they are architecturally
8531 compatible, warn if they're not and prefer the -march ISA flags. */
8532 else if (selected_arch)
8534 if (selected_arch->arch != selected_cpu->arch)
8536 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8537 all_architectures[selected_cpu->arch].name,
8538 selected_arch->name);
8540 aarch64_isa_flags = arch_isa;
8541 explicit_arch = selected_arch->arch;
8542 explicit_tune_core = selected_tune ? selected_tune->ident
8543 : selected_cpu->ident;
8547 /* -mcpu but no -march. */
8548 aarch64_isa_flags = cpu_isa;
8549 explicit_tune_core = selected_tune ? selected_tune->ident
8550 : selected_cpu->ident;
8551 gcc_assert (selected_cpu);
8552 selected_arch = &all_architectures[selected_cpu->arch];
8553 explicit_arch = selected_arch->arch;
8556 /* Set the arch as well, as we will need it when outputting
8557 the .arch directive in assembly. */
8560 gcc_assert (selected_cpu);
8561 selected_arch = &all_architectures[selected_cpu->arch];
8565 selected_tune = selected_cpu;
8567 #ifndef HAVE_AS_MABI_OPTION
8568 /* The compiler may have been configured with 2.23.* binutils, which does
8569 not have support for ILP32. */
8571 error ("Assembler does not support -mabi=ilp32");
8574 /* Make sure we properly set up the explicit options. */
8575 if ((aarch64_cpu_string && valid_cpu)
8576 || (aarch64_tune_string && valid_tune))
8577 gcc_assert (explicit_tune_core != aarch64_none);
8579 if ((aarch64_cpu_string && valid_cpu)
8580 || (aarch64_arch_string && valid_arch))
8581 gcc_assert (explicit_arch != aarch64_no_arch);
8583 aarch64_override_options_internal (&global_options);
8585 /* Save these options as the default ones in case we push and pop them later
8586 while processing functions with potential target attributes. */
8587 target_option_default_node = target_option_current_node
8588 = build_target_option_node (&global_options);
8590 aarch64_register_fma_steering ();
8594 /* Implement targetm.override_options_after_change. */
8597 aarch64_override_options_after_change (void)
8599 aarch64_override_options_after_change_1 (&global_options);
8602 static struct machine_function *
8603 aarch64_init_machine_status (void)
8605 struct machine_function *machine;
8606 machine = ggc_cleared_alloc<machine_function> ();
8611 aarch64_init_expanders (void)
8613 init_machine_status = aarch64_init_machine_status;
8616 /* A checking mechanism for the implementation of the various code models. */
8618 initialize_aarch64_code_model (struct gcc_options *opts)
8620 if (opts->x_flag_pic)
8622 switch (opts->x_aarch64_cmodel_var)
8624 case AARCH64_CMODEL_TINY:
8625 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8627 case AARCH64_CMODEL_SMALL:
8628 #ifdef HAVE_AS_SMALL_PIC_RELOCS
8629 aarch64_cmodel = (flag_pic == 2
8630 ? AARCH64_CMODEL_SMALL_PIC
8631 : AARCH64_CMODEL_SMALL_SPIC);
8633 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
8636 case AARCH64_CMODEL_LARGE:
8637 sorry ("code model %qs with -f%s", "large",
8638 opts->x_flag_pic > 1 ? "PIC" : "pic");
8645 aarch64_cmodel = opts->x_aarch64_cmodel_var;
8648 /* Implement TARGET_OPTION_SAVE. */
8651 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
8653 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
8656 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
8657 using the information saved in PTR. */
8660 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
8662 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
8663 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8664 opts->x_explicit_arch = ptr->x_explicit_arch;
8665 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
8666 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
8668 aarch64_override_options_internal (opts);
8671 /* Implement TARGET_OPTION_PRINT. */
8674 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
8676 const struct processor *cpu
8677 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8678 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
8679 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
8680 std::string extension
8681 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
8683 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
8684 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
8685 arch->name, extension.c_str ());
8688 static GTY(()) tree aarch64_previous_fndecl;
8691 aarch64_reset_previous_fndecl (void)
8693 aarch64_previous_fndecl = NULL;
8696 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
8697 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
8698 make sure optab availability predicates are recomputed when necessary. */
8701 aarch64_save_restore_target_globals (tree new_tree)
8703 if (TREE_TARGET_GLOBALS (new_tree))
8704 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8705 else if (new_tree == target_option_default_node)
8706 restore_target_globals (&default_target_globals);
8708 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
8711 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
8712 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
8713 of the function, if such exists. This function may be called multiple
8714 times on a single function so use aarch64_previous_fndecl to avoid
8715 setting up identical state. */
8718 aarch64_set_current_function (tree fndecl)
8720 if (!fndecl || fndecl == aarch64_previous_fndecl)
8723 tree old_tree = (aarch64_previous_fndecl
8724 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
8727 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
8729 /* If current function has no attributes but the previous one did,
8730 use the default node. */
8731 if (!new_tree && old_tree)
8732 new_tree = target_option_default_node;
8734 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
8735 the default have been handled by aarch64_save_restore_target_globals from
8736 aarch64_pragma_target_parse. */
8737 if (old_tree == new_tree)
8740 aarch64_previous_fndecl = fndecl;
8742 /* First set the target options. */
8743 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
8745 aarch64_save_restore_target_globals (new_tree);
8748 /* Enum describing the various ways we can handle attributes.
8749 In many cases we can reuse the generic option handling machinery. */
8751 enum aarch64_attr_opt_type
8753 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
8754 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
8755 aarch64_attr_enum, /* Attribute sets an enum variable. */
8756 aarch64_attr_custom /* Attribute requires a custom handling function. */
8759 /* All the information needed to handle a target attribute.
8760 NAME is the name of the attribute.
8761 ATTR_TYPE specifies the type of behavior of the attribute as described
8762 in the definition of enum aarch64_attr_opt_type.
8763 ALLOW_NEG is true if the attribute supports a "no-" form.
8764 HANDLER is the function that takes the attribute string and whether
8765 it is a pragma or attribute and handles the option. It is needed only
8766 when the ATTR_TYPE is aarch64_attr_custom.
8767 OPT_NUM is the enum specifying the option that the attribute modifies.
8768 This is needed for attributes that mirror the behavior of a command-line
8769 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
8770 aarch64_attr_enum. */
8772 struct aarch64_attribute_info
8775 enum aarch64_attr_opt_type attr_type;
8777 bool (*handler) (const char *, const char *);
8778 enum opt_code opt_num;
8781 /* Handle the ARCH_STR argument to the arch= target attribute.
8782 PRAGMA_OR_ATTR is used in potential error messages. */
8785 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
8787 const struct processor *tmp_arch = NULL;
8788 enum aarch64_parse_opt_result parse_res
8789 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
8791 if (parse_res == AARCH64_PARSE_OK)
8793 gcc_assert (tmp_arch);
8794 selected_arch = tmp_arch;
8795 explicit_arch = selected_arch->arch;
8801 case AARCH64_PARSE_MISSING_ARG:
8802 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
8804 case AARCH64_PARSE_INVALID_ARG:
8805 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
8807 case AARCH64_PARSE_INVALID_FEATURE:
8808 error ("invalid feature modifier %qs for 'arch' target %s",
8809 str, pragma_or_attr);
8818 /* Handle the argument CPU_STR to the cpu= target attribute.
8819 PRAGMA_OR_ATTR is used in potential error messages. */
8822 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
8824 const struct processor *tmp_cpu = NULL;
8825 enum aarch64_parse_opt_result parse_res
8826 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
8828 if (parse_res == AARCH64_PARSE_OK)
8830 gcc_assert (tmp_cpu);
8831 selected_tune = tmp_cpu;
8832 explicit_tune_core = selected_tune->ident;
8834 selected_arch = &all_architectures[tmp_cpu->arch];
8835 explicit_arch = selected_arch->arch;
8841 case AARCH64_PARSE_MISSING_ARG:
8842 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
8844 case AARCH64_PARSE_INVALID_ARG:
8845 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
8847 case AARCH64_PARSE_INVALID_FEATURE:
8848 error ("invalid feature modifier %qs for 'cpu' target %s",
8849 str, pragma_or_attr);
8858 /* Handle the argument STR to the tune= target attribute.
8859 PRAGMA_OR_ATTR is used in potential error messages. */
8862 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
8864 const struct processor *tmp_tune = NULL;
8865 enum aarch64_parse_opt_result parse_res
8866 = aarch64_parse_tune (str, &tmp_tune);
8868 if (parse_res == AARCH64_PARSE_OK)
8870 gcc_assert (tmp_tune);
8871 selected_tune = tmp_tune;
8872 explicit_tune_core = selected_tune->ident;
8878 case AARCH64_PARSE_INVALID_ARG:
8879 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
8888 /* Parse an architecture extensions target attribute string specified in STR.
8889 For example "+fp+nosimd". Show any errors if needed. Return TRUE
8890 if successful. Update aarch64_isa_flags to reflect the ISA features
8892 PRAGMA_OR_ATTR is used in potential error messages. */
8895 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
8897 enum aarch64_parse_opt_result parse_res;
8898 unsigned long isa_flags = aarch64_isa_flags;
8900 /* We allow "+nothing" in the beginning to clear out all architectural
8901 features if the user wants to handpick specific features. */
8902 if (strncmp ("+nothing", str, 8) == 0)
8908 parse_res = aarch64_parse_extension (str, &isa_flags);
8910 if (parse_res == AARCH64_PARSE_OK)
8912 aarch64_isa_flags = isa_flags;
8918 case AARCH64_PARSE_MISSING_ARG:
8919 error ("missing feature modifier in target %s %qs",
8920 pragma_or_attr, str);
8923 case AARCH64_PARSE_INVALID_FEATURE:
8924 error ("invalid feature modifier in target %s %qs",
8925 pragma_or_attr, str);
8935 /* The target attributes that we support. On top of these we also support just
8936 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
8937 handled explicitly in aarch64_process_one_target_attr. */
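/* So, for instance, something like
     __attribute__ ((target ("no-omit-leaf-frame-pointer,arch=armv8-a+crc")))
   combines a negated boolean attribute from this table with the
   custom "arch" handler (an illustrative combination).  */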
8939 static const struct aarch64_attribute_info aarch64_attributes[] =
8941 { "general-regs-only", aarch64_attr_mask, false, NULL,
8942 OPT_mgeneral_regs_only },
8943 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
8944 OPT_mfix_cortex_a53_835769 },
8945 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
8946 OPT_mfix_cortex_a53_843419 },
8947 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
8948 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
8949 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
8950 OPT_momit_leaf_frame_pointer },
8951 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
8952 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
8954 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
8955 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
8957 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
8960 /* Parse ARG_STR which contains the definition of one target attribute.
8961 Show appropriate errors if any or return true if the attribute is valid.
8962 PRAGMA_OR_ATTR holds the string to use in error messages about whether
8963 we're processing a target attribute or pragma. */
8966 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
8968 bool invert = false;
8970 size_t len = strlen (arg_str);
8974 error ("malformed target %s", pragma_or_attr);
8978 char *str_to_check = (char *) alloca (len + 1);
8979 strcpy (str_to_check, arg_str);
8981 /* Skip leading whitespace. */
8982 while (*str_to_check == ' ' || *str_to_check == '\t')
8985 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
8986 It is easier to detect and handle it explicitly here rather than going
8987 through the machinery for the rest of the target attributes in this
8989 if (*str_to_check == '+')
8990 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
8992 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
8997 char *arg = strchr (str_to_check, '=');
8999 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9000 and point ARG to "foo". */
9006 const struct aarch64_attribute_info *p_attr;
9008 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9010 /* If the names don't match up, or the user has given an argument
9011 to an attribute that doesn't accept one, or didn't give an argument
9012 to an attribute that expects one, fail to match. */
9013 if (strcmp (str_to_check, p_attr->name) != 0)
9017 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9018 || p_attr->attr_type == aarch64_attr_enum;
9020 if (attr_need_arg_p ^ (arg != NULL))
9022 error ("target %s %qs does not accept an argument",
9023 pragma_or_attr, str_to_check);
9027 /* If the name matches but the attribute does not allow "no-" versions
9028 then we can't match. */
9029 if (invert && !p_attr->allow_neg)
9031 error ("target %s %qs does not allow a negated form",
9032 pragma_or_attr, str_to_check);
9036 switch (p_attr->attr_type)
9038 /* Has a custom handler registered.
9039 For example, cpu=, arch=, tune=. */
9040 case aarch64_attr_custom:
9041 gcc_assert (p_attr->handler);
9042 if (!p_attr->handler (arg, pragma_or_attr))
9046 /* Either set or unset a boolean option. */
9047 case aarch64_attr_bool:
9049 struct cl_decoded_option decoded;
9051 generate_option (p_attr->opt_num, NULL, !invert,
9052 CL_TARGET, &decoded);
9053 aarch64_handle_option (&global_options, &global_options_set,
9054 &decoded, input_location);
9057 /* Set or unset a bit in the target_flags. aarch64_handle_option
9058 should know what mask to apply given the option number. */
9059 case aarch64_attr_mask:
9061 struct cl_decoded_option decoded;
9062 /* We only need to specify the option number.
9063 aarch64_handle_option will know which mask to apply. */
9064 decoded.opt_index = p_attr->opt_num;
9065 decoded.value = !invert;
9066 aarch64_handle_option (&global_options, &global_options_set,
9067 &decoded, input_location);
9070 /* Use the option setting machinery to set an option to an enum. */
9071 case aarch64_attr_enum:
9076 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9080 set_option (&global_options, NULL, p_attr->opt_num, value,
9081 NULL, DK_UNSPECIFIED, input_location,
9086 error ("target %s %s=%s is not valid",
9087 pragma_or_attr, str_to_check, arg);
9096 /* If we reached here we either have found an attribute and validated
9097 it or didn't match any. If we matched an attribute but its arguments
9098 were malformed we will have returned false already. */
9102 /* Count how many times the character C appears in
9103 NULL-terminated string STR. */
9106 num_occurences_in_str (char c, char *str)
9108 unsigned int res = 0;
9109 while (*str != '\0')
9120 /* Parse the tree in ARGS that contains the target attribute information
9121 and update the global target options space. PRAGMA_OR_ATTR is a string
9122 to be used in error messages, specifying whether this is processing
9123 a target attribute or a target pragma. */
9126 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9128 if (TREE_CODE (args) == TREE_LIST)
9132 tree head = TREE_VALUE (args);
9135 if (!aarch64_process_target_attr (head, pragma_or_attr))
9138 args = TREE_CHAIN (args);
9143 /* We expect to find a string to parse. */
9144 gcc_assert (TREE_CODE (args) == STRING_CST);
9146 size_t len = strlen (TREE_STRING_POINTER (args));
9147 char *str_to_check = (char *) alloca (len + 1);
9148 strcpy (str_to_check, TREE_STRING_POINTER (args));
9152 error ("malformed target %s value", pragma_or_attr);
9156 /* Used to catch empty spaces between commas i.e.
9157 attribute ((target ("attr1,,attr2"))). */
9158 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9160 /* Handle multiple target attributes separated by ','. */
9161 char *token = strtok (str_to_check, ",");
9163 unsigned int num_attrs = 0;
9167 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9169 error ("target %s %qs is invalid", pragma_or_attr, token);
9173 token = strtok (NULL, ",");
9176 if (num_attrs != num_commas + 1)
9178 error ("malformed target %s list %qs",
9179 pragma_or_attr, TREE_STRING_POINTER (args));
9186 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9187 process attribute ((target ("..."))). */
9190 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9192 struct cl_target_option cur_target;
9195 tree new_target, new_optimize;
9196 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9198 /* If what we're processing is the current pragma string then the
9199 target option node is already stored in target_option_current_node
9200 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9201 having to re-parse the string. This is especially useful to keep
9202 arm_neon.h compile times down since that header contains a lot
9203 of intrinsics enclosed in pragmas. */
9204 if (!existing_target && args == current_target_pragma)
9206 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9209 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9211 old_optimize = build_optimization_node (&global_options);
9212 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9214 /* If the function changed the optimization levels as well as setting
9215 target options, start with the optimizations specified. */
9216 if (func_optimize && func_optimize != old_optimize)
9217 cl_optimization_restore (&global_options,
9218 TREE_OPTIMIZATION (func_optimize));
9220 /* Save the current target options to restore at the end. */
9221 cl_target_option_save (&cur_target, &global_options);
9223 /* If fndecl already has some target attributes applied to it, unpack
9224 them so that we add this attribute on top of them, rather than
9225 overwriting them. */
9226 if (existing_target)
9228 struct cl_target_option *existing_options
9229 = TREE_TARGET_OPTION (existing_target);
9231 if (existing_options)
9232 cl_target_option_restore (&global_options, existing_options);
9235 cl_target_option_restore (&global_options,
9236 TREE_TARGET_OPTION (target_option_current_node));
9239 ret = aarch64_process_target_attr (args, "attribute");
9241 /* Set up any additional state. */
9244 aarch64_override_options_internal (&global_options);
9245 /* Initialize SIMD builtins if we haven't already.
9246 Set current_target_pragma to NULL for the duration so that
9247 the builtin initialization code doesn't try to tag the functions
9248 being built with the attributes specified by any current pragma, thus
9249 going into an infinite recursion. */
9252 tree saved_current_target_pragma = current_target_pragma;
9253 current_target_pragma = NULL;
9254 aarch64_init_simd_builtins ();
9255 current_target_pragma = saved_current_target_pragma;
9257 new_target = build_target_option_node (&global_options);
9262 new_optimize = build_optimization_node (&global_options);
9266 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9268 if (old_optimize != new_optimize)
9269 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9272 cl_target_option_restore (&global_options, &cur_target);
9274 if (old_optimize != new_optimize)
9275 cl_optimization_restore (&global_options,
9276 TREE_OPTIMIZATION (old_optimize));
9280 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9281 tri-bool options (yes, no, don't care) and the default value is
9282 DEF, determine whether to reject inlining. */
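/* Illustrative example only: for the Cortex-A53 erratum workarounds below
   the "don't care" value is 2.  Inlining is allowed when either side does
   not care, when caller and callee agree, or when the callee simply uses
   the default value DEF; only an explicit mismatch blocks inlining.  */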
9285 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9286 int dont_care, int def)
9288 /* If the callee doesn't care, always allow inlining. */
9289 if (callee == dont_care)
9292 /* If the caller doesn't care, always allow inlining. */
9293 if (caller == dont_care)
9296 /* Otherwise, allow inlining if either the callee and caller values
9297 agree, or if the callee is using the default value. */
9298 return (callee == caller || callee == def);
9301 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9302 to inline CALLEE into CALLER based on target-specific info.
9303 Make sure that the caller and callee have compatible architectural
9304 features. Then go through the other possible target attributes
9305 and see if they can block inlining. Try not to reject always_inline
9306 callees unless they are incompatible architecturally. */
9309 aarch64_can_inline_p (tree caller, tree callee)
9311 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9312 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9314 /* If callee has no option attributes, then it is ok to inline. */
9318 struct cl_target_option *caller_opts
9319 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9320 : target_option_default_node);
9322 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9325 /* Callee's ISA flags should be a subset of the caller's. */
9326 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9327 != callee_opts->x_aarch64_isa_flags)
9330 /* Allow non-strict aligned functions inlining into strict
9332 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9333 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9334 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9335 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9338 bool always_inline = lookup_attribute ("always_inline",
9339 DECL_ATTRIBUTES (callee));
9341 /* If the architectural features match up and the callee is always_inline
9342 then the other attributes don't matter. */
9346 if (caller_opts->x_aarch64_cmodel_var
9347 != callee_opts->x_aarch64_cmodel_var)
9350 if (caller_opts->x_aarch64_tls_dialect
9351 != callee_opts->x_aarch64_tls_dialect)
9354 /* Honour explicit requests to work around errata. */
9355 if (!aarch64_tribools_ok_for_inlining_p (
9356 caller_opts->x_aarch64_fix_a53_err835769,
9357 callee_opts->x_aarch64_fix_a53_err835769,
9358 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9361 if (!aarch64_tribools_ok_for_inlining_p (
9362 caller_opts->x_aarch64_fix_a53_err843419,
9363 callee_opts->x_aarch64_fix_a53_err843419,
9364 2, TARGET_FIX_ERR_A53_843419))
9367 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9368 caller and callee and they don't match up, reject inlining. */
9369 if (!aarch64_tribools_ok_for_inlining_p (
9370 caller_opts->x_flag_omit_leaf_frame_pointer,
9371 callee_opts->x_flag_omit_leaf_frame_pointer,
9375 /* If the callee has specific tuning overrides, respect them. */
9376 if (callee_opts->x_aarch64_override_tune_string != NULL
9377 && caller_opts->x_aarch64_override_tune_string == NULL)
9380 /* If the user specified tuning override strings for the
9381 caller and callee and they don't match up, reject inlining.
9382 We just do a string compare here, we don't analyze the meaning
9383 of the string, as it would be too costly for little gain. */
9384 if (callee_opts->x_aarch64_override_tune_string
9385 && caller_opts->x_aarch64_override_tune_string
9386 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9387 caller_opts->x_aarch64_override_tune_string) != 0))
9393 /* Return true if SYMBOL_REF X binds locally. */
9396 aarch64_symbol_binds_local_p (const_rtx x)
9398 return (SYMBOL_REF_DECL (x)
9399 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9400 : SYMBOL_REF_LOCAL_P (x));
9403 /* Return true if SYMBOL_REF X is thread local */
9405 aarch64_tls_symbol_p (rtx x)
9407 if (! TARGET_HAVE_TLS)
9410 if (GET_CODE (x) != SYMBOL_REF)
9413 return SYMBOL_REF_TLS_MODEL (x) != 0;
9416 /* Classify a TLS symbol into one of the TLS kinds. */
9417 enum aarch64_symbol_type
9418 aarch64_classify_tls_symbol (rtx x)
9420 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9424 case TLS_MODEL_GLOBAL_DYNAMIC:
9425 case TLS_MODEL_LOCAL_DYNAMIC:
9426 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9428 case TLS_MODEL_INITIAL_EXEC:
9429 switch (aarch64_cmodel)
9431 case AARCH64_CMODEL_TINY:
9432 case AARCH64_CMODEL_TINY_PIC:
9433 return SYMBOL_TINY_TLSIE;
9435 return SYMBOL_SMALL_TLSIE;
9438 case TLS_MODEL_LOCAL_EXEC:
9439 if (aarch64_tls_size == 12)
9440 return SYMBOL_TLSLE12;
9441 else if (aarch64_tls_size == 24)
9442 return SYMBOL_TLSLE24;
9443 else if (aarch64_tls_size == 32)
9444 return SYMBOL_TLSLE32;
9445 else if (aarch64_tls_size == 48)
9446 return SYMBOL_TLSLE48;
9450 case TLS_MODEL_EMULATED:
9451 case TLS_MODEL_NONE:
9452 return SYMBOL_FORCE_TO_MEM;
9459 /* Return the method that should be used to access SYMBOL_REF or
9462 enum aarch64_symbol_type
9463 aarch64_classify_symbol (rtx x, rtx offset)
9465 if (GET_CODE (x) == LABEL_REF)
9467 switch (aarch64_cmodel)
9469 case AARCH64_CMODEL_LARGE:
9470 return SYMBOL_FORCE_TO_MEM;
9472 case AARCH64_CMODEL_TINY_PIC:
9473 case AARCH64_CMODEL_TINY:
9474 return SYMBOL_TINY_ABSOLUTE;
9476 case AARCH64_CMODEL_SMALL_SPIC:
9477 case AARCH64_CMODEL_SMALL_PIC:
9478 case AARCH64_CMODEL_SMALL:
9479 return SYMBOL_SMALL_ABSOLUTE;
9486 if (GET_CODE (x) == SYMBOL_REF)
9488 if (aarch64_tls_symbol_p (x))
9489 return aarch64_classify_tls_symbol (x);
9491 switch (aarch64_cmodel)
9493 case AARCH64_CMODEL_TINY:
9494 /* When we retrieve symbol + offset address, we have to make sure
9495 the offset does not cause overflow of the final address. But
9496 we have no way of knowing the address of symbol at compile time
9497 so we can't accurately say if the distance between the PC and
9498 symbol + offset is outside the addressable range of +/-1M in the
9499 TINY code model. So we rely on images not being greater than
9500 1M, cap the offset at 1M, and require anything beyond 1M to
9501 be loaded using an alternative mechanism. Furthermore if the
9502 symbol is a weak reference to something that isn't known to
9503 resolve to a symbol in this module, then force to memory. */
9504 if ((SYMBOL_REF_WEAK (x)
9505 && !aarch64_symbol_binds_local_p (x))
9506 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9507 return SYMBOL_FORCE_TO_MEM;
9508 return SYMBOL_TINY_ABSOLUTE;
9510 case AARCH64_CMODEL_SMALL:
9511 /* Same reasoning as the tiny code model, but the offset cap here is
9513 if ((SYMBOL_REF_WEAK (x)
9514 && !aarch64_symbol_binds_local_p (x))
9515 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9516 HOST_WIDE_INT_C (4294967264)))
9517 return SYMBOL_FORCE_TO_MEM;
9518 return SYMBOL_SMALL_ABSOLUTE;
9520 case AARCH64_CMODEL_TINY_PIC:
9521 if (!aarch64_symbol_binds_local_p (x))
9522 return SYMBOL_TINY_GOT;
9523 return SYMBOL_TINY_ABSOLUTE;
9525 case AARCH64_CMODEL_SMALL_SPIC:
9526 case AARCH64_CMODEL_SMALL_PIC:
9527 if (!aarch64_symbol_binds_local_p (x))
9528 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9529 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9530 return SYMBOL_SMALL_ABSOLUTE;
9532 case AARCH64_CMODEL_LARGE:
9533 /* This is alright even in PIC code as the constant
9534 pool reference is always PC relative and within
9535 the same translation unit. */
9536 if (CONSTANT_POOL_ADDRESS_P (x))
9537 return SYMBOL_SMALL_ABSOLUTE;
9539 return SYMBOL_FORCE_TO_MEM;
9546 /* By default push everything into the constant pool. */
9547 return SYMBOL_FORCE_TO_MEM;
9551 aarch64_constant_address_p (rtx x)
9553 return (CONSTANT_P (x) && memory_address_p (DImode, x));
9557 aarch64_legitimate_pic_operand_p (rtx x)
9559 if (GET_CODE (x) == SYMBOL_REF
9560 || (GET_CODE (x) == CONST
9561 && GET_CODE (XEXP (x, 0)) == PLUS
9562 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9568 /* Return true if X holds either a quarter-precision floating-point
9569 constant or the floating-point constant +0.0. */
9571 aarch64_valid_floating_const (machine_mode mode, rtx x)
9573 if (!CONST_DOUBLE_P (x))
9576 if (aarch64_float_const_zero_rtx_p (x))
9579 /* We only handle moving 0.0 to a TFmode register. */
9580 if (!(mode == SFmode || mode == DFmode))
9583 return aarch64_float_const_representable_p (x);
9587 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9589 /* Do not allow vector struct mode constants. We could support
9590 0 and -1 easily, but they need support in aarch64-simd.md. */
9591 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9594 /* This could probably go away because
9595 we now decompose CONST_INTs according to expand_mov_immediate. */
9596 if ((GET_CODE (x) == CONST_VECTOR
9597 && aarch64_simd_valid_immediate (x, mode, false, NULL))
9598 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9599 return !targetm.cannot_force_const_mem (mode, x);
9601 if (GET_CODE (x) == HIGH
9602 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9605 return aarch64_constant_address_p (x);
9609 aarch64_load_tp (rtx target)
9612 || GET_MODE (target) != Pmode
9613 || !register_operand (target, Pmode))
9614 target = gen_reg_rtx (Pmode);
9616 /* Can return in any reg. */
9617 emit_insn (gen_aarch64_load_tp_hard (target));
9621 /* On AAPCS systems, this is the "struct __va_list". */
9622 static GTY(()) tree va_list_type;
9624 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
9625 Return the type to use as __builtin_va_list.
9627 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
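     struct {
       void *__stack;
       void *__gr_top;
       void *__vr_top;
       int   __gr_offs;
       int   __vr_offs;
     };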
9639 aarch64_build_builtin_va_list (void)
9642 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9644 /* Create the type. */
9645 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
9646 /* Give it the required name. */
9647 va_list_name = build_decl (BUILTINS_LOCATION,
9649 get_identifier ("__va_list"),
9651 DECL_ARTIFICIAL (va_list_name) = 1;
9652 TYPE_NAME (va_list_type) = va_list_name;
9653 TYPE_STUB_DECL (va_list_type) = va_list_name;
9655 /* Create the fields. */
9656 f_stack = build_decl (BUILTINS_LOCATION,
9657 FIELD_DECL, get_identifier ("__stack"),
9659 f_grtop = build_decl (BUILTINS_LOCATION,
9660 FIELD_DECL, get_identifier ("__gr_top"),
9662 f_vrtop = build_decl (BUILTINS_LOCATION,
9663 FIELD_DECL, get_identifier ("__vr_top"),
9665 f_groff = build_decl (BUILTINS_LOCATION,
9666 FIELD_DECL, get_identifier ("__gr_offs"),
9668 f_vroff = build_decl (BUILTINS_LOCATION,
9669 FIELD_DECL, get_identifier ("__vr_offs"),
9672 /* Tell tree-stdarg pass about our internal offset fields.
9673 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
9674 purposes, to identify whether the code is updating the va_list internal
9675 offset fields in an irregular way. */
9676 va_list_gpr_counter_field = f_groff;
9677 va_list_fpr_counter_field = f_vroff;
9679 DECL_ARTIFICIAL (f_stack) = 1;
9680 DECL_ARTIFICIAL (f_grtop) = 1;
9681 DECL_ARTIFICIAL (f_vrtop) = 1;
9682 DECL_ARTIFICIAL (f_groff) = 1;
9683 DECL_ARTIFICIAL (f_vroff) = 1;
9685 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
9686 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
9687 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
9688 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
9689 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
9691 TYPE_FIELDS (va_list_type) = f_stack;
9692 DECL_CHAIN (f_stack) = f_grtop;
9693 DECL_CHAIN (f_grtop) = f_vrtop;
9694 DECL_CHAIN (f_vrtop) = f_groff;
9695 DECL_CHAIN (f_groff) = f_vroff;
9697 /* Compute its layout. */
9698 layout_type (va_list_type);
9700 return va_list_type;
9703 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
9705 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
9707 const CUMULATIVE_ARGS *cum;
9708 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9709 tree stack, grtop, vrtop, groff, vroff;
9711 int gr_save_area_size = cfun->va_list_gpr_size;
9712 int vr_save_area_size = cfun->va_list_fpr_size;
9715 cum = &crtl->args.info;
9716 if (cfun->va_list_gpr_size)
9717 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
9718 cfun->va_list_gpr_size);
9719 if (cfun->va_list_fpr_size)
9720 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
9721 * UNITS_PER_VREG, cfun->va_list_fpr_size);
9725 gcc_assert (cum->aapcs_nvrn == 0);
9726 vr_save_area_size = 0;
9729 f_stack = TYPE_FIELDS (va_list_type_node);
9730 f_grtop = DECL_CHAIN (f_stack);
9731 f_vrtop = DECL_CHAIN (f_grtop);
9732 f_groff = DECL_CHAIN (f_vrtop);
9733 f_vroff = DECL_CHAIN (f_groff);
9735 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
9737 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
9739 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
9741 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
9743 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
9746 /* Emit code to initialize STACK, which points to the next varargs stack
9747 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
9748 by named arguments. STACK is 8-byte aligned. */
9749 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
9750 if (cum->aapcs_stack_size > 0)
9751 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
9752 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
9753 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9755 /* Emit code to initialize GRTOP, the top of the GR save area.
9756 virtual_incoming_args_rtx should have been 16 byte aligned. */
9757 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
9758 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
9759 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9761 /* Emit code to initialize VRTOP, the top of the VR save area.
9762 This address is gr_save_area_bytes below GRTOP, rounded
9763 down to the next 16-byte boundary. */
9764 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
9765 vr_offset = ROUND_UP (gr_save_area_size,
9766 STACK_BOUNDARY / BITS_PER_UNIT);
9769 t = fold_build_pointer_plus_hwi (t, -vr_offset);
9770 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
9771 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9773 /* Emit code to initialize GROFF, the offset from GRTOP of the
9774 next GPR argument. */
9775 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
9776 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
9777 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9779 /* Likewise emit code to initialize VROFF, the offset from FTOP
9780 of the next VR argument. */
9781 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
9782 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
9783 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9786 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
9789 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
9790 gimple_seq *post_p ATTRIBUTE_UNUSED)
9794 bool is_ha; /* is HFA or HVA. */
9795 bool dw_align; /* double-word align. */
9796 machine_mode ag_mode = VOIDmode;
9800 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9801 tree stack, f_top, f_off, off, arg, roundup, on_stack;
9802 HOST_WIDE_INT size, rsize, adjust, align;
9803 tree t, u, cond1, cond2;
9805 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9807 type = build_pointer_type (type);
9809 mode = TYPE_MODE (type);
9811 f_stack = TYPE_FIELDS (va_list_type_node);
9812 f_grtop = DECL_CHAIN (f_stack);
9813 f_vrtop = DECL_CHAIN (f_grtop);
9814 f_groff = DECL_CHAIN (f_vrtop);
9815 f_vroff = DECL_CHAIN (f_groff);
9817 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
9818 f_stack, NULL_TREE);
9819 size = int_size_in_bytes (type);
9820 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
9824 if (aarch64_vfp_is_call_or_return_candidate (mode,
9830 /* TYPE passed in fp/simd registers. */
9832 aarch64_err_no_fpadvsimd (mode, "varargs");
9834 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
9835 unshare_expr (valist), f_vrtop, NULL_TREE);
9836 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
9837 unshare_expr (valist), f_vroff, NULL_TREE);
9839 rsize = nregs * UNITS_PER_VREG;
9843 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
9844 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
9846 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
9847 && size < UNITS_PER_VREG)
9849 adjust = UNITS_PER_VREG - size;
9854 /* TYPE passed in general registers. */
9855 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
9856 unshare_expr (valist), f_grtop, NULL_TREE);
9857 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
9858 unshare_expr (valist), f_groff, NULL_TREE);
9859 rsize = ROUND_UP (size, UNITS_PER_WORD);
9860 nregs = rsize / UNITS_PER_WORD;
9865 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9866 && size < UNITS_PER_WORD)
9868 adjust = UNITS_PER_WORD - size;
9872 /* Get a local temporary for the field value. */
9873 off = get_initialized_tmp_var (f_off, pre_p, NULL);
9875 /* Emit code to branch if off >= 0. */
9876 t = build2 (GE_EXPR, boolean_type_node, off,
9877 build_int_cst (TREE_TYPE (off), 0));
9878 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
9882 /* Emit: offs = (offs + 15) & -16. */
9883 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9884 build_int_cst (TREE_TYPE (off), 15));
9885 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
9886 build_int_cst (TREE_TYPE (off), -16));
9887 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
9892 /* Update ap.__[g|v]r_offs */
9893 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9894 build_int_cst (TREE_TYPE (off), rsize));
9895 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
9899 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9901 /* [cond2] if (ap.__[g|v]r_offs > 0) */
9902 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
9903 build_int_cst (TREE_TYPE (f_off), 0));
9904 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
9906 /* String up: make sure the assignment happens before the use. */
9907 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
9908 COND_EXPR_ELSE (cond1) = t;
9910 /* Prepare the trees handling the argument that is passed on the stack;
9911 the top-level node is stored in ON_STACK. */
9912 arg = get_initialized_tmp_var (stack, pre_p, NULL);
9915 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
9916 t = fold_convert (intDI_type_node, arg);
9917 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9918 build_int_cst (TREE_TYPE (t), 15));
9919 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9920 build_int_cst (TREE_TYPE (t), -16));
9921 t = fold_convert (TREE_TYPE (arg), t);
9922 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
9926 /* Advance ap.__stack */
9927 t = fold_convert (intDI_type_node, arg);
9928 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9929 build_int_cst (TREE_TYPE (t), size + 7));
9930 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9931 build_int_cst (TREE_TYPE (t), -8));
9932 t = fold_convert (TREE_TYPE (arg), t);
9933 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
9934 /* String up roundup and advance. */
9936 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9937 /* String up with arg */
9938 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
9939 /* Big-endianness related address adjustment. */
9940 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9941 && size < UNITS_PER_WORD)
9943 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
9944 size_int (UNITS_PER_WORD - size));
9945 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
9948 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
9949 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
9951 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
9954 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
9955 build_int_cst (TREE_TYPE (off), adjust));
9957 t = fold_convert (sizetype, t);
9958 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
9962 /* type ha; // treat as "struct {ftype field[n];}"
9963 ... [computing offs]
9964 for (i = 0; i <nregs; ++i, offs += 16)
9965 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
9968 tree tmp_ha, field_t, field_ptr_t;
9970 /* Declare a local variable. */
9971 tmp_ha = create_tmp_var_raw (type, "ha");
9972 gimple_add_tmp_var (tmp_ha);
9974 /* Establish the base type. */
9978 field_t = float_type_node;
9979 field_ptr_t = float_ptr_type_node;
9982 field_t = double_type_node;
9983 field_ptr_t = double_ptr_type_node;
9986 field_t = long_double_type_node;
9987 field_ptr_t = long_double_ptr_type_node;
9990 field_t = aarch64_fp16_type_node;
9991 field_ptr_t = aarch64_fp16_ptr_type_node;
9996 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
9997 field_t = build_vector_type_for_mode (innertype, ag_mode);
9998 field_ptr_t = build_pointer_type (field_t);
10005 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
10006 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10008 t = fold_convert (field_ptr_t, addr);
10009 t = build2 (MODIFY_EXPR, field_t,
10010 build1 (INDIRECT_REF, field_t, tmp_ha),
10011 build1 (INDIRECT_REF, field_t, t));
10013 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10014 for (i = 1; i < nregs; ++i)
10016 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10017 u = fold_convert (field_ptr_t, addr);
10018 u = build2 (MODIFY_EXPR, field_t,
10019 build2 (MEM_REF, field_t, tmp_ha,
10020 build_int_cst (field_ptr_t,
10022 int_size_in_bytes (field_t)))),
10023 build1 (INDIRECT_REF, field_t, u));
10024 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10027 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10028 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10031 COND_EXPR_ELSE (cond2) = t;
10032 addr = fold_convert (build_pointer_type (type), cond1);
10033 addr = build_va_arg_indirect_ref (addr);
10036 addr = build_va_arg_indirect_ref (addr);
10041 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10044 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10045 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10048 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10049 CUMULATIVE_ARGS local_cum;
10050 int gr_saved = cfun->va_list_gpr_size;
10051 int vr_saved = cfun->va_list_fpr_size;
10053 /* The caller has advanced CUM up to, but not beyond, the last named
10054 argument. Advance a local copy of CUM past the last "real" named
10055 argument, to find out how many registers are left over. */
10057 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
10059 /* Find out how many registers we need to save.
10060 Honor tree-stdarg analysis results. */
10061 if (cfun->va_list_gpr_size)
10062 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10063 cfun->va_list_gpr_size / UNITS_PER_WORD);
10064 if (cfun->va_list_fpr_size)
10065 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10066 cfun->va_list_fpr_size / UNITS_PER_VREG);
10070 gcc_assert (local_cum.aapcs_nvrn == 0);
10080 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10081 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10082 - gr_saved * UNITS_PER_WORD);
10083 mem = gen_frame_mem (BLKmode, ptr);
10084 set_mem_alias_set (mem, get_varargs_alias_set ());
10086 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10091 /* We can't use move_block_from_reg, because it will use
10092 the wrong mode, storing D regs only. */
10093 machine_mode mode = TImode;
10094 int off, i, vr_start;
10096 /* Set OFF to the offset from virtual_incoming_args_rtx of
10097 the first vector register. The VR save area lies below
10098 the GR one, and is aligned to 16 bytes. */
10099 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10100 STACK_BOUNDARY / BITS_PER_UNIT);
10101 off -= vr_saved * UNITS_PER_VREG;
10103 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10104 for (i = 0; i < vr_saved; ++i)
10108 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10109 mem = gen_frame_mem (mode, ptr);
10110 set_mem_alias_set (mem, get_varargs_alias_set ());
10111 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10112 off += UNITS_PER_VREG;
10117 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10118 any complication of having crtl->args.pretend_args_size changed. */
10119 cfun->machine->frame.saved_varargs_size
10120 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10121 STACK_BOUNDARY / BITS_PER_UNIT)
10122 + vr_saved * UNITS_PER_VREG);
10126 aarch64_conditional_register_usage (void)
10131 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10134 call_used_regs[i] = 1;
10139 /* Walk down the type tree of TYPE counting consecutive base elements.
10140 If *MODEP is VOIDmode, then set it to the first valid floating point
10141 type. If a non-floating point type is found, or if a floating point
10142 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10143 otherwise return the count in the sub-tree. */
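/* Illustrative example only:

     struct { double x; double y; };

   yields a count of 2 with *MODEP set to DFmode, whereas

     struct { double x; float y; };

   yields -1 because the element modes differ.  */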
10145 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10148 HOST_WIDE_INT size;
10150 switch (TREE_CODE (type))
10153 mode = TYPE_MODE (type);
10154 if (mode != DFmode && mode != SFmode
10155 && mode != TFmode && mode != HFmode)
10158 if (*modep == VOIDmode)
10161 if (*modep == mode)
10167 mode = TYPE_MODE (TREE_TYPE (type));
10168 if (mode != DFmode && mode != SFmode
10169 && mode != TFmode && mode != HFmode)
10172 if (*modep == VOIDmode)
10175 if (*modep == mode)
10181 /* Use V2SImode and V4SImode as representatives of all 64-bit
10182 and 128-bit vector types. */
10183 size = int_size_in_bytes (type);
10196 if (*modep == VOIDmode)
10199 /* Vector modes are considered to be opaque: two vectors are
10200 equivalent for the purposes of being homogeneous aggregates
10201 if they are the same size. */
10202 if (*modep == mode)
10210 tree index = TYPE_DOMAIN (type);
10212 /* Can't handle incomplete types nor sizes that are not
10214 if (!COMPLETE_TYPE_P (type)
10215 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10218 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10221 || !TYPE_MAX_VALUE (index)
10222 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10223 || !TYPE_MIN_VALUE (index)
10224 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10228 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10229 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10231 /* There must be no padding. */
10232 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10244 /* Can't handle incomplete types nor sizes that are not
10246 if (!COMPLETE_TYPE_P (type)
10247 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10250 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10252 if (TREE_CODE (field) != FIELD_DECL)
10255 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10258 count += sub_count;
10261 /* There must be no padding. */
10262 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10269 case QUAL_UNION_TYPE:
10271 /* These aren't very interesting except in a degenerate case. */
10276 /* Can't handle incomplete types nor sizes that are not
10278 if (!COMPLETE_TYPE_P (type)
10279 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10282 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10284 if (TREE_CODE (field) != FIELD_DECL)
10287 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10290 count = count > sub_count ? count : sub_count;
10293 /* There must be no padding. */
10294 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10307 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10308 type as described in AAPCS64 \S 4.1.2.
10310 See the comment above aarch64_composite_type_p for the notes on MODE. */
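/* Illustrative example only: a 16-byte int32x4_t (V4SImode) or an 8-byte
   V8QImode value is a short vector; scalars and aggregates of other sizes
   are not.  */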
10313 aarch64_short_vector_p (const_tree type,
10316 HOST_WIDE_INT size = -1;
10318 if (type && TREE_CODE (type) == VECTOR_TYPE)
10319 size = int_size_in_bytes (type);
10320 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10321 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10322 size = GET_MODE_SIZE (mode);
10324 return (size == 8 || size == 16);
10327 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10328 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10329 array types. The C99 floating-point complex types are also considered
10330 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10331 types, which are GCC extensions and out of the scope of AAPCS64, are
10332 treated as composite types here as well.
10334 Note that MODE itself is not sufficient in determining whether a type
10335 is such a composite type or not. This is because
10336 stor-layout.c:compute_record_mode may have already changed the MODE
10337 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10338 structure with only one field may have its MODE set to the mode of the
10339 field. Also an integer mode whose size matches the size of the
10340 RECORD_TYPE type may be used to substitute the original mode
10341 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10342 solely relied on. */
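/* Illustrative example only: struct { float f; } may have been given
   SFmode by compute_record_mode, yet it is still a composite type for
   AAPCS64 purposes, which is why TYPE is consulted before MODE below.  */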
10345 aarch64_composite_type_p (const_tree type,
10348 if (aarch64_short_vector_p (type, mode))
10351 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10354 if (mode == BLKmode
10355 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10356 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10362 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10363 shall be passed or returned in simd/fp register(s) (providing these
10364 parameter passing registers are available).
10366 Upon successful return, *COUNT returns the number of needed registers,
10367 *BASE_MODE returns the mode of the individual register and when IS_HA
10368 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10369 floating-point aggregate or a homogeneous short-vector aggregate. */
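/* Illustrative example only:

     struct { float a; float b; float c; };

   is a homogeneous floating-point aggregate, so this returns true with
   *COUNT == 3, *BASE_MODE == SFmode and, when requested, *IS_HA == true.  */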
10372 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10374 machine_mode *base_mode,
10378 machine_mode new_mode = VOIDmode;
10379 bool composite_p = aarch64_composite_type_p (type, mode);
10381 if (is_ha != NULL) *is_ha = false;
10383 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10384 || aarch64_short_vector_p (type, mode))
10389 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10391 if (is_ha != NULL) *is_ha = true;
10393 new_mode = GET_MODE_INNER (mode);
10395 else if (type && composite_p)
10397 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10399 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10401 if (is_ha != NULL) *is_ha = true;
10410 *base_mode = new_mode;
10414 /* Implement TARGET_STRUCT_VALUE_RTX. */
10417 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10418 int incoming ATTRIBUTE_UNUSED)
10420 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10423 /* Implements target hook vector_mode_supported_p. */
10425 aarch64_vector_mode_supported_p (machine_mode mode)
10428 && (mode == V4SImode || mode == V8HImode
10429 || mode == V16QImode || mode == V2DImode
10430 || mode == V2SImode || mode == V4HImode
10431 || mode == V8QImode || mode == V2SFmode
10432 || mode == V4SFmode || mode == V2DFmode
10433 || mode == V4HFmode || mode == V8HFmode
10434 || mode == V1DFmode))
10440 /* Return appropriate SIMD container
10441 for MODE within a vector of WIDTH bits. */
10442 static machine_mode
10443 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10445 gcc_assert (width == 64 || width == 128);
10484 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10485 static machine_mode
10486 aarch64_preferred_simd_mode (machine_mode mode)
10488 return aarch64_simd_container_mode (mode, 128);
10491 /* Return the bitmask of possible vector sizes for the vectorizer
10492 to iterate over. */
10493 static unsigned int
10494 aarch64_autovectorize_vector_sizes (void)
10499 /* Implement TARGET_MANGLE_TYPE. */
10501 static const char *
10502 aarch64_mangle_type (const_tree type)
10504 /* The AArch64 ABI documents say that "__va_list" has to be
10505 mangled as if it is in the "std" namespace.
10506 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10507 return "St9__va_list";
10509 /* Half-precision float. */
10510 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10513 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10515 if (TYPE_NAME (type) != NULL)
10516 return aarch64_mangle_builtin_type (type);
10518 /* Use the default mangling. */
10523 /* Return true if the rtx_insn contains a MEM RTX somewhere
10527 has_memory_op (rtx_insn *mem_insn)
10529 subrtx_iterator::array_type array;
10530 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
10537 /* Find the first rtx_insn before insn that will generate an assembly
10541 aarch64_prev_real_insn (rtx_insn *insn)
10548 insn = prev_real_insn (insn);
10550 while (insn && recog_memoized (insn) < 0);
10556 is_madd_op (enum attr_type t1)
10559 /* A number of these may be AArch32 only. */
10560 enum attr_type mlatypes[] = {
10561 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10562 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10563 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10566 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10568 if (t1 == mlatypes[i])
10575 /* Check if there is a register dependency between a load and the insn
10576 for which we hold recog_data. */
10579 dep_between_memop_and_curr (rtx memop)
10584 gcc_assert (GET_CODE (memop) == SET);
10586 if (!REG_P (SET_DEST (memop)))
10589 load_reg = SET_DEST (memop);
10590 for (opno = 1; opno < recog_data.n_operands; opno++)
10592 rtx operand = recog_data.operand[opno];
10593 if (REG_P (operand)
10594 && reg_overlap_mentioned_p (load_reg, operand))
10602 /* When working around the Cortex-A53 erratum 835769,
10603 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10604 instruction and has a preceding memory instruction such that a NOP
10605 should be inserted between them. */
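/* Illustrative example only (assumed instruction sequence):

     ldr  x1, [x2]
     nop                          // emitted by aarch64_final_prescan_insn
     madd x0, x3, x4, x5

   The nop separates the memory operation from the 64-bit multiply-accumulate,
   breaking the sequence described by erratum 835769.  */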
10608 aarch64_madd_needs_nop (rtx_insn* insn)
10610 enum attr_type attr_type;
10614 if (!TARGET_FIX_ERR_A53_835769)
10617 if (!INSN_P (insn) || recog_memoized (insn) < 0)
10620 attr_type = get_attr_type (insn);
10621 if (!is_madd_op (attr_type))
10624 prev = aarch64_prev_real_insn (insn);
10625 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10626 Restore recog state to INSN to avoid state corruption. */
10627 extract_constrain_insn_cached (insn);
10629 if (!prev || !has_memory_op (prev))
10632 body = single_set (prev);
10634 /* If the previous insn is a memory op and there is no dependency between
10635 it and the DImode madd, emit a NOP between them. If body is NULL then we
10636 have a complex memory operation, probably a load/store pair.
10637 Be conservative for now and emit a NOP. */
10638 if (GET_MODE (recog_data.operand[0]) == DImode
10639 && (!body || !dep_between_memop_and_curr (body)))
10647 /* Implement FINAL_PRESCAN_INSN. */
10650 aarch64_final_prescan_insn (rtx_insn *insn)
10652 if (aarch64_madd_needs_nop (insn))
10653 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
10657 /* Return the equivalent letter for size. */
10659 sizetochar (int size)
10663 case 64: return 'd';
10664 case 32: return 's';
10665 case 16: return 'h';
10666 case 8 : return 'b';
10667 default: gcc_unreachable ();
10671 /* Return true iff x is a uniform vector of floating-point
10672 constants, and the constant can be represented in
10673 quarter-precision form. Note, as aarch64_float_const_representable
10674 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
10676 aarch64_vect_float_const_representable_p (rtx x)
10679 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10680 && const_vec_duplicate_p (x, &elt)
10681 && aarch64_float_const_representable_p (elt));
10684 /* Return true if OP is a valid SIMD immediate for MODE, false otherwise. */
10686 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
10687 struct simd_immediate_info *info)
10689 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
10691 for (i = 0; i < idx; i += (STRIDE)) \
10696 immtype = (CLASS); \
10697 elsize = (ELSIZE); \
10698 eshift = (SHIFT); \
10703 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
10704 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
10705 unsigned char bytes[16];
10706 int immtype = -1, matches;
10707 unsigned int invmask = inverse ? 0xff : 0;
10710 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10712 if (! (aarch64_simd_imm_zero_p (op, mode)
10713 || aarch64_vect_float_const_representable_p (op)))
10718 info->value = CONST_VECTOR_ELT (op, 0);
10719 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
10727 /* Splat vector constant out into a byte vector. */
10728 for (i = 0; i < n_elts; i++)
10730 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
10731 it must be laid out in the vector register in reverse order. */
10732 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
10733 unsigned HOST_WIDE_INT elpart;
10735 gcc_assert (CONST_INT_P (el));
10736 elpart = INTVAL (el);
10738 for (unsigned int byte = 0; byte < innersize; byte++)
10740 bytes[idx++] = (elpart & 0xff) ^ invmask;
10741 elpart >>= BITS_PER_UNIT;
10746 /* Sanity check. */
10747 gcc_assert (idx == GET_MODE_SIZE (mode));
10751 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
10752 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
10754 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10755 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10757 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
10758 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10760 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
10761 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
10763 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
10765 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
10767 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
10768 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
10770 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10771 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10773 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
10774 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10776 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
10777 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
10779 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
10781 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
10783 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10784 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10786 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10787 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10789 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
10790 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10792 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
10793 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10795 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
10797 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
10798 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
10807 info->element_width = elsize;
10808 info->mvn = emvn != 0;
10809 info->shift = eshift;
10811 unsigned HOST_WIDE_INT imm = 0;
10813 if (immtype >= 12 && immtype <= 15)
10816 /* Un-invert bytes of recognized vector, if necessary. */
10818 for (i = 0; i < idx; i++)
10819 bytes[i] ^= invmask;
10823 /* FIXME: Broken on 32-bit H_W_I hosts. */
10824 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
10826 for (i = 0; i < 8; i++)
10827 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
10828 << (i * BITS_PER_UNIT);
10831 info->value = GEN_INT (imm);
10835 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
10836 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
10838 /* Construct 'abcdefgh' because the assembler cannot handle
10839 generic constants. */
10842 imm = (imm >> info->shift) & 0xff;
10843 info->value = GEN_INT (imm);
10851 /* Check whether immediate shift constants are within range. */
10853 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
10855 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
10857 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
10859 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
10862 /* Return true if X is a uniform vector where all elements
10863 are either the floating-point constant 0.0 or the
10864 integer constant 0. */
10866 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
10868 return x == CONST0_RTX (mode);
10872 /* Return the bitmask CONST_INT to select the bits required by a zero extract
10873 operation of width WIDTH at bit position POS. */
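/* For example, WIDTH == 8 and POS == 16 give ((1 << 8) - 1) << 16,
   i.e. the mask 0x00ff0000.  */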
10876 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
10878 gcc_assert (CONST_INT_P (width));
10879 gcc_assert (CONST_INT_P (pos));
10881 unsigned HOST_WIDE_INT mask
10882 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
10883 return GEN_INT (mask << UINTVAL (pos));
10887 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
10889 HOST_WIDE_INT imm = INTVAL (x);
10892 for (i = 0; i < 8; i++)
10894 unsigned int byte = imm & 0xff;
10895 if (byte != 0xff && byte != 0)
10904 aarch64_mov_operand_p (rtx x, machine_mode mode)
10906 if (GET_CODE (x) == HIGH
10907 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10910 if (CONST_INT_P (x))
10913 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
10916 return aarch64_classify_symbolic_expression (x)
10917 == SYMBOL_TINY_ABSOLUTE;
10920 /* Return a CONST_VECTOR of MODE in which every element is VAL. */
10922 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
10924 int nunits = GET_MODE_NUNITS (mode);
10925 rtvec v = rtvec_alloc (nunits);
10928 for (i = 0; i < nunits; i++)
10929 RTVEC_ELT (v, i) = GEN_INT (val);
10931 return gen_rtx_CONST_VECTOR (mode, v);
10934 /* Check OP is a legal scalar immediate for the MOVI instruction. */
10937 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
10939 machine_mode vmode;
10941 gcc_assert (!VECTOR_MODE_P (mode));
10942 vmode = aarch64_preferred_simd_mode (mode);
10943 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
10944 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
10947 /* Construct and return a PARALLEL RTX vector with elements numbering the
10948 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
10949 the vector - from the perspective of the architecture. This does not
10950 line up with GCC's perspective on lane numbers, so we end up with
10951 different masks depending on our target endian-ness. The diagram
10952 below may help. We must draw the distinction when building masks
10953 which select one half of the vector. An instruction selecting
10954 architectural low-lanes for a big-endian target, must be described using
10955 a mask selecting GCC high-lanes.
10957 Big-Endian Little-Endian
10959 GCC 0 1 2 3 3 2 1 0
10960 | x | x | x | x | | x | x | x | x |
10961 Architecture 3 2 1 0 3 2 1 0
10963 Low Mask: { 2, 3 } { 0, 1 }
10964 High Mask: { 0, 1 } { 2, 3 }
10968 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
10970 int nunits = GET_MODE_NUNITS (mode);
10971 rtvec v = rtvec_alloc (nunits / 2);
10972 int high_base = nunits / 2;
10978 if (BYTES_BIG_ENDIAN)
10979 base = high ? low_base : high_base;
10981 base = high ? high_base : low_base;
10983 for (i = 0; i < nunits / 2; i++)
10984 RTVEC_ELT (v, i) = GEN_INT (base + i);
10986 t1 = gen_rtx_PARALLEL (mode, v);
10990 /* Check OP for validity as a PARALLEL RTX vector with elements
10991 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
10992 from the perspective of the architecture. See the diagram above
10993 aarch64_simd_vect_par_cnst_half for more details. */
10996 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
10999 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11000 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11001 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11004 if (!VECTOR_MODE_P (mode))
11007 if (count_op != count_ideal)
11010 for (i = 0; i < count_ideal; i++)
11012 rtx elt_op = XVECEXP (op, 0, i);
11013 rtx elt_ideal = XVECEXP (ideal, 0, i);
11015 if (!CONST_INT_P (elt_op)
11016 || INTVAL (elt_ideal) != INTVAL (elt_op))
11022 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11023 HIGH (exclusive). */
11025 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11028 HOST_WIDE_INT lane;
11029 gcc_assert (CONST_INT_P (operand));
11030 lane = INTVAL (operand);
11032 if (lane < low || lane >= high)
11035 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11037 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11041 /* Return TRUE if OP is a valid vector addressing mode. */
11043 aarch64_simd_mem_operand_p (rtx op)
11045 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11046 || REG_P (XEXP (op, 0)));
11049 /* Emit a register copy from operand to operand, taking care not to
11050 early-clobber source registers in the process.
11052 COUNT is the number of components into which the copy needs to be
11055 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
11056 unsigned int count)
11059 int rdest = REGNO (operands[0]);
11060 int rsrc = REGNO (operands[1]);
11062 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11064 for (i = 0; i < count; i++)
11065 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11066 gen_rtx_REG (mode, rsrc + i));
11068 for (i = 0; i < count; i++)
11069 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11070 gen_rtx_REG (mode, rsrc + count - i - 1));
11073 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11074 one of VSTRUCT modes: OI, CI, or XI. */
11076 aarch64_simd_attr_length_rglist (enum machine_mode mode)
11078 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11081 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11082 alignment of a vector to 128 bits. */
11083 static HOST_WIDE_INT
11084 aarch64_simd_vector_alignment (const_tree type)
11086 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11087 return MIN (align, 128);
11090 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11092 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11097 /* We guarantee alignment for vectors up to 128-bits. */
11098 if (tree_int_cst_compare (TYPE_SIZE (type),
11099 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11102 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11106 /* If VALS is a vector constant that can be loaded into a register
11107 using DUP, generate instructions to do so and return an RTX to
11108 assign to the register. Otherwise return NULL_RTX. */
11110 aarch64_simd_dup_constant (rtx vals)
11112 machine_mode mode = GET_MODE (vals);
11113 machine_mode inner_mode = GET_MODE_INNER (mode);
11116 if (!const_vec_duplicate_p (vals, &x))
11119 /* We can load this constant by using DUP and a constant in a
11120 single ARM register. This will be cheaper than a vector
11122 x = copy_to_mode_reg (inner_mode, x);
11123 return gen_rtx_VEC_DUPLICATE (mode, x);
11127 /* Generate code to load VALS, which is a PARALLEL containing only
11128 constants (for vec_init) or CONST_VECTOR, efficiently into a
11129 register. Returns an RTX to copy into the register, or NULL_RTX
11130 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11132 aarch64_simd_make_constant (rtx vals)
11134 machine_mode mode = GET_MODE (vals);
11136 rtx const_vec = NULL_RTX;
11137 int n_elts = GET_MODE_NUNITS (mode);
11141 if (GET_CODE (vals) == CONST_VECTOR)
11143 else if (GET_CODE (vals) == PARALLEL)
11145 /* A CONST_VECTOR must contain only CONST_INTs and
11146 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11147 Only store valid constants in a CONST_VECTOR. */
11148 for (i = 0; i < n_elts; ++i)
11150 rtx x = XVECEXP (vals, 0, i);
11151 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11154 if (n_const == n_elts)
11155 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11158 gcc_unreachable ();
11160 if (const_vec != NULL_RTX
11161 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11162 /* Load using MOVI/MVNI. */
11164 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11165 /* Loaded using DUP. */
11167 else if (const_vec != NULL_RTX)
11168 /* Load from constant pool. We can not take advantage of single-cycle
11169 LD1 because we need a PC-relative addressing mode. */
11172 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11173 We can not construct an initializer. */
11177 /* Expand a vector initialisation sequence, such that TARGET is
11178 initialised to contain VALS. */
11181 aarch64_expand_vector_init (rtx target, rtx vals)
11183 machine_mode mode = GET_MODE (target);
11184 machine_mode inner_mode = GET_MODE_INNER (mode);
11185 /* The number of vector elements. */
11186 int n_elts = GET_MODE_NUNITS (mode);
11187 /* The number of vector elements which are not constant. */
11189 rtx any_const = NULL_RTX;
11190 /* The first element of vals. */
11191 rtx v0 = XVECEXP (vals, 0, 0);
11192 bool all_same = true;
11194 /* Count the number of variable elements to initialise. */
11195 for (int i = 0; i < n_elts; ++i)
11197 rtx x = XVECEXP (vals, 0, i);
11198 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11203 all_same &= rtx_equal_p (x, v0);
11206 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11207 how best to handle this. */
11210 rtx constant = aarch64_simd_make_constant (vals);
11211 if (constant != NULL_RTX)
11213 emit_move_insn (target, constant);
11218 /* Splat a single non-constant element if we can. */
11221 rtx x = copy_to_mode_reg (inner_mode, v0);
11222 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11226 /* Initialise a vector which is part-variable. We want to first try
11227 to build those lanes which are constant in the most efficient way we
11229 if (n_var != n_elts)
11231 rtx copy = copy_rtx (vals);
11233 /* Load constant part of vector. We really don't care what goes into the
11234 parts we will overwrite, but we're more likely to be able to load the
11235 constant efficiently if it has fewer, larger, repeating parts
11236 (see aarch64_simd_valid_immediate). */
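/* Illustrative example only: for { x, 1, 2, 3 } with variable x, the
   loop below substitutes a nearby constant into the variable lane so
   that something like { 2, 1, 2, 3 } can be loaded as a constant first;
   the variable lane is then overwritten by the vec_set sequence below.  */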
11237 for (int i = 0; i < n_elts; i++)
11239 rtx x = XVECEXP (vals, 0, i);
11240 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11242 rtx subst = any_const;
11243 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11245 /* Look in the copied vector, as more elements are const. */
11246 rtx test = XVECEXP (copy, 0, i ^ bit);
11247 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11253 XVECEXP (copy, 0, i) = subst;
11255 aarch64_expand_vector_init (target, copy);
11258 /* Insert the variable lanes directly. */
11260 enum insn_code icode = optab_handler (vec_set_optab, mode);
11261 gcc_assert (icode != CODE_FOR_nothing);
11263 for (int i = 0; i < n_elts; i++)
11265 rtx x = XVECEXP (vals, 0, i);
11266 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11268 x = copy_to_mode_reg (inner_mode, x);
11269 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
11273 static unsigned HOST_WIDE_INT
11274 aarch64_shift_truncation_mask (machine_mode mode)
11277 (!SHIFT_COUNT_TRUNCATED
11278 || aarch64_vector_mode_supported_p (mode)
11279 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11282 /* Select a format to encode pointers in exception handling data. */
11284 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11287 switch (aarch64_cmodel)
11289 case AARCH64_CMODEL_TINY:
11290 case AARCH64_CMODEL_TINY_PIC:
11291 case AARCH64_CMODEL_SMALL:
11292 case AARCH64_CMODEL_SMALL_PIC:
11293 case AARCH64_CMODEL_SMALL_SPIC:
11294 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11296 type = DW_EH_PE_sdata4;
11299 /* No assumptions here. 8-byte relocs required. */
11300 type = DW_EH_PE_sdata8;
11303 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11306 /* The last .arch and .tune assembly strings that we printed. */
11307 static std::string aarch64_last_printed_arch_string;
11308 static std::string aarch64_last_printed_tune_string;
11310 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11311 by the function fndecl. */
11314 aarch64_declare_function_name (FILE *stream, const char* name,
11317 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11319 struct cl_target_option *targ_options;
11321 targ_options = TREE_TARGET_OPTION (target_parts);
11323 targ_options = TREE_TARGET_OPTION (target_option_current_node);
11324 gcc_assert (targ_options);
11326 const struct processor *this_arch
11327 = aarch64_get_arch (targ_options->x_explicit_arch);
11329 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11330 std::string extension
11331 = aarch64_get_extension_string_for_isa_flags (isa_flags,
11333 /* Only update the assembler .arch string if it is distinct from the last
11334 such string we printed. */
11335 std::string to_print = this_arch->name + extension;
11336 if (to_print != aarch64_last_printed_arch_string)
11338 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11339 aarch64_last_printed_arch_string = to_print;
11342 /* Print the cpu name we're tuning for in the comments; it might be
11343 useful to readers of the generated asm. Do it only when it changes
11344 from function to function and verbose assembly is requested. */
11345 const struct processor *this_tune
11346 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11348 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11350 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11352 aarch64_last_printed_tune_string = this_tune->name;
11355 /* Don't forget the type directive for ELF. */
11356 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11357 ASM_OUTPUT_LABEL (stream, name);
11360 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11363 aarch64_start_file (void)
11365 struct cl_target_option *default_options
11366 = TREE_TARGET_OPTION (target_option_default_node);
11368 const struct processor *default_arch
11369 = aarch64_get_arch (default_options->x_explicit_arch);
11370 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11371 std::string extension
11372 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11373 default_arch->flags);
11375 aarch64_last_printed_arch_string = default_arch->name + extension;
11376 aarch64_last_printed_tune_string = "";
11377 asm_fprintf (asm_out_file, "\t.arch %s\n",
11378 aarch64_last_printed_arch_string.c_str ());
11380 default_file_start ();
11383 /* Emit load exclusive. */
11386 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11387 rtx mem, rtx model_rtx)
11389 rtx (*gen) (rtx, rtx, rtx);
11393 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11394 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11395 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11396 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11398 gcc_unreachable ();
11401 emit_insn (gen (rval, mem, model_rtx));
11404 /* Emit store exclusive. */
11407 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11408 rtx rval, rtx mem, rtx model_rtx)
11410 rtx (*gen) (rtx, rtx, rtx, rtx);
11414 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11415 case HImode: gen = gen_aarch64_store_exclusivehi; break;
11416 case SImode: gen = gen_aarch64_store_exclusivesi; break;
11417 case DImode: gen = gen_aarch64_store_exclusivedi; break;
11419 gcc_unreachable ();
11422 emit_insn (gen (bval, rval, mem, model_rtx));
11425 /* Emit jump pattern INSN and mark it as unlikely to be taken. */
11428 aarch64_emit_unlikely_jump (rtx insn)
11430 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11432 insn = emit_jump_insn (insn);
11433 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
11436 /* Expand a compare and swap pattern. */
11439 aarch64_expand_compare_and_swap (rtx operands[])
11441 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11442 machine_mode mode, cmp_mode;
11443 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11446 const gen_cas_fn split_cas[] =
11448 gen_aarch64_compare_and_swapqi,
11449 gen_aarch64_compare_and_swaphi,
11450 gen_aarch64_compare_and_swapsi,
11451 gen_aarch64_compare_and_swapdi
11453 const gen_cas_fn atomic_cas[] =
11455 gen_aarch64_compare_and_swapqi_lse,
11456 gen_aarch64_compare_and_swaphi_lse,
11457 gen_aarch64_compare_and_swapsi_lse,
11458 gen_aarch64_compare_and_swapdi_lse
11461 bval = operands[0];
11462 rval = operands[1];
11464 oldval = operands[3];
11465 newval = operands[4];
11466 is_weak = operands[5];
11467 mod_s = operands[6];
11468 mod_f = operands[7];
11469 mode = GET_MODE (mem);
11472 /* Normally the succ memory model must be stronger than fail, but in the
11473 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11474 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11476 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11477 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11478 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11484 /* For short modes, we're going to perform the comparison in SImode,
11485 so do the zero-extension now. */
11487 rval = gen_reg_rtx (SImode);
11488 oldval = convert_modes (SImode, mode, oldval, true);
11489 /* Fall through. */
11493 /* Force the value into a register if needed. */
11494 if (!aarch64_plus_operand (oldval, mode))
11495 oldval = force_reg (cmp_mode, oldval);
11499 gcc_unreachable ();
11504 case QImode: idx = 0; break;
11505 case HImode: idx = 1; break;
11506 case SImode: idx = 2; break;
11507 case DImode: idx = 3; break;
11509 gcc_unreachable ();
11512 gen = atomic_cas[idx];
11514 gen = split_cas[idx];
11516 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11518 if (mode == QImode || mode == HImode)
11519 emit_move_insn (operands[1], gen_lowpart (mode, rval));
11521 x = gen_rtx_REG (CCmode, CC_REGNUM);
11522 x = gen_rtx_EQ (SImode, x, const0_rtx);
11523 emit_insn (gen_rtx_SET (bval, x));
11526 /* Test whether the target supports using an atomic load-operate instruction.
11527 CODE is the operation and AFTER is TRUE if the data in memory after the
11528 operation should be returned and FALSE if the data before the operation
11529 should be returned. Returns FALSE if the operation isn't supported by the
11533 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11552 /* Emit a barrier appropriate for memory model MODEL at the end of a
11553 sequence implementing an atomic operation. */
11556 aarch64_emit_post_barrier (enum memmodel model)
11558 const enum memmodel base_model = memmodel_base (model);
11560 if (is_mm_sync (model)
11561 && (base_model == MEMMODEL_ACQUIRE
11562 || base_model == MEMMODEL_ACQ_REL
11563 || base_model == MEMMODEL_SEQ_CST))
11565 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
11569 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
11570 for the data in memory. EXPECTED is the value expected to be in memory.
11571 DESIRED is the value to store to memory. MEM is the memory location. MODEL
11572 is the memory ordering to use. */
11575 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11576 rtx expected, rtx desired,
11579 rtx (*gen) (rtx, rtx, rtx, rtx);
11582 mode = GET_MODE (mem);
11586 case QImode: gen = gen_aarch64_atomic_casqi; break;
11587 case HImode: gen = gen_aarch64_atomic_cashi; break;
11588 case SImode: gen = gen_aarch64_atomic_cassi; break;
11589 case DImode: gen = gen_aarch64_atomic_casdi; break;
11591 gcc_unreachable ();
11594 /* Move the expected value into the CAS destination register. */
11595 emit_insn (gen_rtx_SET (rval, expected));
11597 /* Emit the CAS. */
11598 emit_insn (gen (rval, mem, desired, model));
11600 /* Compare the expected value with the value loaded by the CAS, to establish
11601 whether the swap was made. */
11602 aarch64_gen_compare_reg (EQ, rval, expected);
11605 /* Split a compare and swap pattern. */
11608 aarch64_split_compare_and_swap (rtx operands[])
11610 rtx rval, mem, oldval, newval, scratch;
11613 rtx_code_label *label1, *label2;
11615 enum memmodel model;
11618 rval = operands[0];
11620 oldval = operands[2];
11621 newval = operands[3];
11622 is_weak = (operands[4] != const0_rtx);
11623 model_rtx = operands[5];
11624 scratch = operands[7];
11625 mode = GET_MODE (mem);
11626 model = memmodel_from_int (INTVAL (model_rtx));
11631 label1 = gen_label_rtx ();
11632 emit_label (label1);
11634 label2 = gen_label_rtx ();
11636 /* The initial load can be relaxed for a __sync operation since a final
11637 barrier will be emitted to stop code hoisting. */
11638 if (is_mm_sync (model))
11639 aarch64_emit_load_exclusive (mode, rval, mem,
11640 GEN_INT (MEMMODEL_RELAXED));
11642 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
11644 cond = aarch64_gen_compare_reg (NE, rval, oldval);
11645 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11646 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11647 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11648 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11650 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
11654 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
11655 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11656 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
11657 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11661 cond = gen_rtx_REG (CCmode, CC_REGNUM);
11662 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
11663 emit_insn (gen_rtx_SET (cond, x));
11666 emit_label (label2);
11668 /* Emit any final barrier needed for a __sync operation. */
11669 if (is_mm_sync (model))
11670 aarch64_emit_post_barrier (model);
11673 /* Emit a BIC instruction. */
11676 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
11678 rtx shift_rtx = GEN_INT (shift);
11679 rtx (*gen) (rtx, rtx, rtx, rtx);
11683 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
11684 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
11686 gcc_unreachable ();
11689 emit_insn (gen (dst, s2, shift_rtx, s1));
11692 /* Emit an atomic swap. */
11695 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
11696 rtx mem, rtx model)
11698 rtx (*gen) (rtx, rtx, rtx, rtx);
11702 case QImode: gen = gen_aarch64_atomic_swpqi; break;
11703 case HImode: gen = gen_aarch64_atomic_swphi; break;
11704 case SImode: gen = gen_aarch64_atomic_swpsi; break;
11705 case DImode: gen = gen_aarch64_atomic_swpdi; break;
11707 gcc_unreachable ();
11710 emit_insn (gen (dst, mem, value, model));
11713 /* Operations supported by aarch64_emit_atomic_load_op. */
11715 enum aarch64_atomic_load_op_code
11717 AARCH64_LDOP_PLUS, /* A + B */
11718 AARCH64_LDOP_XOR, /* A ^ B */
11719 AARCH64_LDOP_OR, /* A | B */
11720 AARCH64_LDOP_BIC /* A & ~B */
11723 /* Emit an atomic load-operate. */
11726 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
11727 machine_mode mode, rtx dst, rtx src,
11728 rtx mem, rtx model)
11730 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
11731 const aarch64_atomic_load_op_fn plus[] =
11733 gen_aarch64_atomic_loadaddqi,
11734 gen_aarch64_atomic_loadaddhi,
11735 gen_aarch64_atomic_loadaddsi,
11736 gen_aarch64_atomic_loadadddi
11738 const aarch64_atomic_load_op_fn eor[] =
11740 gen_aarch64_atomic_loadeorqi,
11741 gen_aarch64_atomic_loadeorhi,
11742 gen_aarch64_atomic_loadeorsi,
11743 gen_aarch64_atomic_loadeordi
11745 const aarch64_atomic_load_op_fn ior[] =
11747 gen_aarch64_atomic_loadsetqi,
11748 gen_aarch64_atomic_loadsethi,
11749 gen_aarch64_atomic_loadsetsi,
11750 gen_aarch64_atomic_loadsetdi
11752 const aarch64_atomic_load_op_fn bic[] =
11754 gen_aarch64_atomic_loadclrqi,
11755 gen_aarch64_atomic_loadclrhi,
11756 gen_aarch64_atomic_loadclrsi,
11757 gen_aarch64_atomic_loadclrdi
11759 aarch64_atomic_load_op_fn gen;
11764 case QImode: idx = 0; break;
11765 case HImode: idx = 1; break;
11766 case SImode: idx = 2; break;
11767 case DImode: idx = 3; break;
11769 gcc_unreachable ();
11774 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
11775 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
11776 case AARCH64_LDOP_OR: gen = ior[idx]; break;
11777 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
11779 gcc_unreachable ();
11782 emit_insn (gen (dst, mem, src, model));
11785 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
11786 location to store the data read from memory. OUT_RESULT is the location to
11787 store the result of the operation. MEM is the memory location to read and
11788 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
11789 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
11793 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
11794 rtx mem, rtx value, rtx model_rtx)
11796 machine_mode mode = GET_MODE (mem);
11797 machine_mode wmode = (mode == DImode ? DImode : SImode);
11798 const bool short_mode = (mode < SImode);
11799 aarch64_atomic_load_op_code ldop_code;
11804 out_data = gen_lowpart (mode, out_data);
11807 out_result = gen_lowpart (mode, out_result);
11809 /* Make sure the value is in a register, putting it into a destination
11810 register if it needs to be manipulated. */
11811 if (!register_operand (value, mode)
11812 || code == AND || code == MINUS)
11814 src = out_result ? out_result : out_data;
11815 emit_move_insn (src, gen_lowpart (mode, value));
11819 gcc_assert (register_operand (src, mode));
11821 /* Preprocess the data for the operation as necessary. If the operation is
11822 a SET then emit a swap instruction and finish. */
11826 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
11830 /* Negate the value and treat it as a PLUS. */
11834 /* Resize the value if necessary. */
11836 src = gen_lowpart (wmode, src);
11838 neg_src = gen_rtx_NEG (wmode, src);
11839 emit_insn (gen_rtx_SET (src, neg_src));
11842 src = gen_lowpart (mode, src);
11844 /* Fall-through. */
11846 ldop_code = AARCH64_LDOP_PLUS;
11850 ldop_code = AARCH64_LDOP_OR;
11854 ldop_code = AARCH64_LDOP_XOR;
11861 /* Resize the value if necessary. */
11863 src = gen_lowpart (wmode, src);
11865 not_src = gen_rtx_NOT (wmode, src);
11866 emit_insn (gen_rtx_SET (src, not_src));
11869 src = gen_lowpart (mode, src);
11871 ldop_code = AARCH64_LDOP_BIC;
11875 /* The operation can't be done with atomic instructions. */
11876 gcc_unreachable ();
11879 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
11881 /* If necessary, calculate the data in memory after the update by redoing the
11882 operation from values in registers. */
11888 src = gen_lowpart (wmode, src);
11889 out_data = gen_lowpart (wmode, out_data);
11890 out_result = gen_lowpart (wmode, out_result);
11899 x = gen_rtx_PLUS (wmode, out_data, src);
11902 x = gen_rtx_IOR (wmode, out_data, src);
11905 x = gen_rtx_XOR (wmode, out_data, src);
11908 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
11911 gcc_unreachable ();
11914 emit_set_insn (out_result, x);
11919 /* Split an atomic operation. */
11922 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
11923 rtx value, rtx model_rtx, rtx cond)
11925 machine_mode mode = GET_MODE (mem);
11926 machine_mode wmode = (mode == DImode ? DImode : SImode);
11927 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
11928 const bool is_sync = is_mm_sync (model);
11929 rtx_code_label *label;
11932 /* Split the atomic operation into a sequence. */
11933 label = gen_label_rtx ();
11934 emit_label (label);
11937 new_out = gen_lowpart (wmode, new_out);
11939 old_out = gen_lowpart (wmode, old_out);
11942 value = simplify_gen_subreg (wmode, value, mode, 0);
11944 /* The initial load can be relaxed for a __sync operation since a final
11945 barrier will be emitted to stop code hoisting. */
11947 aarch64_emit_load_exclusive (mode, old_out, mem,
11948 GEN_INT (MEMMODEL_RELAXED));
11950 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
11959 x = gen_rtx_AND (wmode, old_out, value);
11960 emit_insn (gen_rtx_SET (new_out, x));
11961 x = gen_rtx_NOT (wmode, new_out);
11962 emit_insn (gen_rtx_SET (new_out, x));
11966 if (CONST_INT_P (value))
11968 value = GEN_INT (-INTVAL (value));
11971 /* Fall through. */
11974 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
11975 emit_insn (gen_rtx_SET (new_out, x));
11979 aarch64_emit_store_exclusive (mode, cond, mem,
11980 gen_lowpart (mode, new_out), model_rtx);
11982 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11983 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11984 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
11985 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11987 /* Emit any final barrier needed for a __sync operation. */
11989 aarch64_emit_post_barrier (model);
11993 aarch64_init_libfuncs (void)
11995 /* Half-precision float operations. The compiler handles all operations
11996 with NULL libfuncs by converting to SFmode. */
11999 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12000 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12003 set_optab_libfunc (add_optab, HFmode, NULL);
12004 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12005 set_optab_libfunc (smul_optab, HFmode, NULL);
12006 set_optab_libfunc (neg_optab, HFmode, NULL);
12007 set_optab_libfunc (sub_optab, HFmode, NULL);
12010 set_optab_libfunc (eq_optab, HFmode, NULL);
12011 set_optab_libfunc (ne_optab, HFmode, NULL);
12012 set_optab_libfunc (lt_optab, HFmode, NULL);
12013 set_optab_libfunc (le_optab, HFmode, NULL);
12014 set_optab_libfunc (ge_optab, HFmode, NULL);
12015 set_optab_libfunc (gt_optab, HFmode, NULL);
12016 set_optab_libfunc (unord_optab, HFmode, NULL);
12019 /* Target hook for c_mode_for_suffix. */
12020 static machine_mode
12021 aarch64_c_mode_for_suffix (char suffix)
12029 /* We can only represent floating point constants which will fit in
12030 "quarter-precision" values. These values are characterised by
12031 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
12034 (-1)^s * (n/16) * 2^r
12037 's' is the sign bit.
12038 'n' is an integer in the range 16 <= n <= 31.
12039 'r' is an integer in the range -3 <= r <= 4. */
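/* For example, 1.0 is encoded as (16/16) * 2^0; the smallest non-zero
   magnitude is (16/16) * 2^-3 == 0.125 and the largest is
   (31/16) * 2^4 == 31.0.  */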
12041 /* Return true iff X can be represented by a quarter-precision
12042 floating point immediate operand. Note, we cannot represent 0.0. */
12044 aarch64_float_const_representable_p (rtx x)
12046 /* This represents our current view of how many bits
12047 make up the mantissa. */
12048 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12050 unsigned HOST_WIDE_INT mantissa, mask;
12051 REAL_VALUE_TYPE r, m;
12054 if (!CONST_DOUBLE_P (x))
12057 /* We don't support HFmode constants yet. */
12058 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12061 r = *CONST_DOUBLE_REAL_VALUE (x);
12063 /* We cannot represent infinities, NaNs or +/-zero. We won't
12064 know if we have +zero until we analyse the mantissa, but we
12065 can reject the other invalid values. */
12066 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12067 || REAL_VALUE_MINUS_ZERO (r))
12070 /* Extract exponent. */
12071 r = real_value_abs (&r);
12072 exponent = REAL_EXP (&r);
12074 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12075 highest (sign) bit, with a fixed binary point at bit point_pos.
12076 m1 holds the low part of the mantissa, m2 the high part.
12077 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12078 bits for the mantissa, this can fail (low bits will be lost). */
12079 real_ldexp (&m, &r, point_pos - exponent);
12080 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12082 /* If the low part of the mantissa has bits set we cannot represent
12084 if (w.elt (0) != 0)
12086 /* We have rejected the lower HOST_WIDE_INT, so update our
12087 understanding of how many bits lie in the mantissa and
12088 look only at the high HOST_WIDE_INT. */
12089 mantissa = w.elt (1);
12090 point_pos -= HOST_BITS_PER_WIDE_INT;
12092 /* We can only represent values with a mantissa of the form 1.xxxx. */
12093 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12094 if ((mantissa & mask) != 0)
12097 /* Having filtered unrepresentable values, we may now remove all
12098 but the highest 5 bits. */
12099 mantissa >>= point_pos - 5;
12101 /* We cannot represent the value 0.0, so reject it. This is handled
12106 /* Then, as bit 4 is always set, we can mask it off, leaving
12107 the mantissa in the range [0, 15]. */
12108 mantissa &= ~(1 << 4);
12109 gcc_assert (mantissa <= 15);
12111 /* GCC internally does not use IEEE754-like encoding (where normalized
12112 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
12113 Our mantissa values are shifted 4 places to the left relative to
12114 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12115 by 5 places to correct for GCC's representation. */
12116 exponent = 5 - exponent;
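  /* For example, for 1.0 REAL_EXP returns 1 (1.0 == 0.5 * 2^1), giving an
     adjusted exponent of 4; for 0.125 it returns -2, giving 7; for 32.0 it
     returns 6, giving -1, which the test below rejects.  */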
12118 return (exponent >= 0 && exponent <= 7);
12122 aarch64_output_simd_mov_immediate (rtx const_vector,
12127 static char templ[40];
12128 const char *mnemonic;
12129 const char *shift_op;
12130 unsigned int lane_count = 0;
12133 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12135 /* This will return true to show const_vector is legal for use as either
12136 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
12137 also update INFO to show how the immediate should be generated. */
12138 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12139 gcc_assert (is_valid);
12141 element_char = sizetochar (info.element_width);
12142 lane_count = width / info.element_width;
12144 mode = GET_MODE_INNER (mode);
12145 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12147 gcc_assert (info.shift == 0 && ! info.mvn);
12148 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12149 move immediate path. */
12150 if (aarch64_float_const_zero_rtx_p (info.value))
12151 info.value = GEN_INT (0);
12154 const unsigned int buf_size = 20;
12155 char float_buf[buf_size] = {'\0'};
12156 real_to_decimal_for_mode (float_buf,
12157 CONST_DOUBLE_REAL_VALUE (info.value),
12158 buf_size, buf_size, 1, mode);
12160 if (lane_count == 1)
12161 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12163 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12164 lane_count, element_char, float_buf);
12169 mnemonic = info.mvn ? "mvni" : "movi";
12170 shift_op = info.msl ? "msl" : "lsl";
12172 gcc_assert (CONST_INT_P (info.value));
12173 if (lane_count == 1)
12174 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12175 mnemonic, UINTVAL (info.value));
12176 else if (info.shift)
12177 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12178 ", %s %d", mnemonic, lane_count, element_char,
12179 UINTVAL (info.value), shift_op, info.shift);
12181 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12182 mnemonic, lane_count, element_char, UINTVAL (info.value));
12187 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12190 machine_mode vmode;
12192 gcc_assert (!VECTOR_MODE_P (mode));
12193 vmode = aarch64_simd_container_mode (mode, 64);
12194 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12195 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12198 /* Split operands into moves from op[1] + op[2] into op[0]. */
12201 aarch64_split_combinev16qi (rtx operands[3])
12203 unsigned int dest = REGNO (operands[0]);
12204 unsigned int src1 = REGNO (operands[1]);
12205 unsigned int src2 = REGNO (operands[2]);
12206 machine_mode halfmode = GET_MODE (operands[1]);
12207 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12208 rtx destlo, desthi;
12210 gcc_assert (halfmode == V16QImode);
12212 if (src1 == dest && src2 == dest + halfregs)
12214 /* No-op move. Can't split to nothing; emit something. */
12215 emit_note (NOTE_INSN_DELETED);
12219 /* Preserve register attributes for variable tracking. */
12220 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12221 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12222 GET_MODE_SIZE (halfmode));
12224 /* Special case of reversed high/low parts. */
12225 if (reg_overlap_mentioned_p (operands[2], destlo)
12226 && reg_overlap_mentioned_p (operands[1], desthi))
12228 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12229 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12230 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12232 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12234 /* Try to avoid unnecessary moves if part of the result
12235 is in the right place already. */
12237 emit_move_insn (destlo, operands[1]);
12238 if (src2 != dest + halfregs)
12239 emit_move_insn (desthi, operands[2]);
12243 if (src2 != dest + halfregs)
12244 emit_move_insn (desthi, operands[2]);
12246 emit_move_insn (destlo, operands[1]);
12250 /* vec_perm support. */
12252 #define MAX_VECT_LEN 16
12254 struct expand_vec_perm_d
12256 rtx target, op0, op1;
12257 unsigned char perm[MAX_VECT_LEN];
12258 machine_mode vmode;
12259 unsigned char nelt;
12264 /* Generate a variable permutation. */
12267 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12269 machine_mode vmode = GET_MODE (target);
12270 bool one_vector_p = rtx_equal_p (op0, op1);
12272 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12273 gcc_checking_assert (GET_MODE (op0) == vmode);
12274 gcc_checking_assert (GET_MODE (op1) == vmode);
12275 gcc_checking_assert (GET_MODE (sel) == vmode);
12276 gcc_checking_assert (TARGET_SIMD);
12280 if (vmode == V8QImode)
12282 /* Expand the argument to a V16QI mode by duplicating it. */
12283 rtx pair = gen_reg_rtx (V16QImode);
12284 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12285 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12289 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12296 if (vmode == V8QImode)
12298 pair = gen_reg_rtx (V16QImode);
12299 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12300 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12304 pair = gen_reg_rtx (OImode);
12305 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12306 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12312 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12314 machine_mode vmode = GET_MODE (target);
12315 unsigned int nelt = GET_MODE_NUNITS (vmode);
12316 bool one_vector_p = rtx_equal_p (op0, op1);
12319 /* The TBL instruction does not use a modulo index, so we must take care
12320 of that ourselves. */
12321 mask = aarch64_simd_gen_const_vector_dup (vmode,
12322 one_vector_p ? nelt - 1 : 2 * nelt - 1);
12323 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12325 /* For big-endian, we also need to reverse the index within the vector
12326 (but not which vector). */
12327 if (BYTES_BIG_ENDIAN)
12329 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12331 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12332 sel = expand_simple_binop (vmode, XOR, sel, mask,
12333 NULL, 0, OPTAB_LIB_WIDEN);
12335 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
12338 /* Recognize patterns suitable for the TRN instructions. */
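/* For example, with V4SImode (nelt == 4) the TRN1 pattern corresponds to the
   permutation { 0, 4, 2, 6 } and TRN2 to { 1, 5, 3, 7 }, using the
   little-endian numbering checked below.  */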
12340 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12342 unsigned int i, odd, mask, nelt = d->nelt;
12343 rtx out, in0, in1, x;
12344 rtx (*gen) (rtx, rtx, rtx);
12345 machine_mode vmode = d->vmode;
12347 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12350 /* Note that these are little-endian tests.
12351 We correct for big-endian later. */
12352 if (d->perm[0] == 0)
12354 else if (d->perm[0] == 1)
12358 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12360 for (i = 0; i < nelt; i += 2)
12362 if (d->perm[i] != i + odd)
12364 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12374 if (BYTES_BIG_ENDIAN)
12376 x = in0, in0 = in1, in1 = x;
12385 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12386 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12387 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12388 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12389 case V4SImode: gen = gen_aarch64_trn2v4si; break;
12390 case V2SImode: gen = gen_aarch64_trn2v2si; break;
12391 case V2DImode: gen = gen_aarch64_trn2v2di; break;
12392 case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
12393 case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
12394 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12395 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12396 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12405 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12406 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12407 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12408 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12409 case V4SImode: gen = gen_aarch64_trn1v4si; break;
12410 case V2SImode: gen = gen_aarch64_trn1v2si; break;
12411 case V2DImode: gen = gen_aarch64_trn1v2di; break;
12412 case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
12413 case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
12414 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12415 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12416 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12422 emit_insn (gen (out, in0, in1));
12426 /* Recognize patterns suitable for the UZP instructions. */
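/* For example, with V4SImode (nelt == 4) UZP1 selects the even-numbered
   elements { 0, 2, 4, 6 } and UZP2 the odd-numbered elements { 1, 3, 5, 7 },
   using the little-endian numbering checked below.  */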
12428 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12430 unsigned int i, odd, mask, nelt = d->nelt;
12431 rtx out, in0, in1, x;
12432 rtx (*gen) (rtx, rtx, rtx);
12433 machine_mode vmode = d->vmode;
12435 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12438 /* Note that these are little-endian tests.
12439 We correct for big-endian later. */
12440 if (d->perm[0] == 0)
12442 else if (d->perm[0] == 1)
12446 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12448 for (i = 0; i < nelt; i++)
12450 unsigned elt = (i * 2 + odd) & mask;
12451 if (d->perm[i] != elt)
12461 if (BYTES_BIG_ENDIAN)
12463 x = in0, in0 = in1, in1 = x;
12472 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12473 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12474 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12475 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12476 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12477 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12478 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12479 case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
12480 case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
12481 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12482 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12483 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12492 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12493 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12494 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12495 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12496 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12497 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12498 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12499 case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
12500 case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
12501 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12502 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12503 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12509 emit_insn (gen (out, in0, in1));
12513 /* Recognize patterns suitable for the ZIP instructions. */
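/* For example, with V4SImode (nelt == 4) ZIP1 interleaves the low halves of
   the two inputs, { 0, 4, 1, 5 }, and ZIP2 the high halves, { 2, 6, 3, 7 },
   using the little-endian numbering checked below.  */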
12515 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12517 unsigned int i, high, mask, nelt = d->nelt;
12518 rtx out, in0, in1, x;
12519 rtx (*gen) (rtx, rtx, rtx);
12520 machine_mode vmode = d->vmode;
12522 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12525 /* Note that these are little-endian tests.
12526 We correct for big-endian later. */
12528 if (d->perm[0] == high)
12531 else if (d->perm[0] == 0)
12535 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12537 for (i = 0; i < nelt / 2; i++)
12539 unsigned elt = (i + high) & mask;
12540 if (d->perm[i * 2] != elt)
12542 elt = (elt + nelt) & mask;
12543 if (d->perm[i * 2 + 1] != elt)
12553 if (BYTES_BIG_ENDIAN)
12555 x = in0, in0 = in1, in1 = x;
12564 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12565 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12566 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12567 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12568 case V4SImode: gen = gen_aarch64_zip2v4si; break;
12569 case V2SImode: gen = gen_aarch64_zip2v2si; break;
12570 case V2DImode: gen = gen_aarch64_zip2v2di; break;
12571 case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
12572 case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
12573 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12574 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12575 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12584 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
12585 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
12586 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
12587 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
12588 case V4SImode: gen = gen_aarch64_zip1v4si; break;
12589 case V2SImode: gen = gen_aarch64_zip1v2si; break;
12590 case V2DImode: gen = gen_aarch64_zip1v2di; break;
12591 case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
12592 case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
12593 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
12594 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
12595 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
12601 emit_insn (gen (out, in0, in1));
12605 /* Recognize patterns for the EXT insn. */
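/* For example, with V4SImode the two-operand permutation { 1, 2, 3, 4 }
   (the top three elements of the first vector followed by the first element
   of the second) matches EXT with location 1.  */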
12608 aarch64_evpc_ext (struct expand_vec_perm_d *d)
12610 unsigned int i, nelt = d->nelt;
12611 rtx (*gen) (rtx, rtx, rtx, rtx);
12614 unsigned int location = d->perm[0]; /* Always < nelt. */
12616 /* Check if the extracted indices are increasing by one. */
12617 for (i = 1; i < nelt; i++)
12619 unsigned int required = location + i;
12620 if (d->one_vector_p)
12622 /* We'll pass the same vector in twice, so allow indices to wrap. */
12623 required &= (nelt - 1);
12625 if (d->perm[i] != required)
12631 case V16QImode: gen = gen_aarch64_extv16qi; break;
12632 case V8QImode: gen = gen_aarch64_extv8qi; break;
12633 case V4HImode: gen = gen_aarch64_extv4hi; break;
12634 case V8HImode: gen = gen_aarch64_extv8hi; break;
12635 case V2SImode: gen = gen_aarch64_extv2si; break;
12636 case V4SImode: gen = gen_aarch64_extv4si; break;
12637 case V4HFmode: gen = gen_aarch64_extv4hf; break;
12638 case V8HFmode: gen = gen_aarch64_extv8hf; break;
12639 case V2SFmode: gen = gen_aarch64_extv2sf; break;
12640 case V4SFmode: gen = gen_aarch64_extv4sf; break;
12641 case V2DImode: gen = gen_aarch64_extv2di; break;
12642 case V2DFmode: gen = gen_aarch64_extv2df; break;
12651 /* The case where (location == 0) is a no-op for both big- and little-endian,
12652 and is removed by the mid-end at optimization levels -O1 and higher. */
12654 if (BYTES_BIG_ENDIAN && (location != 0))
12656 /* After setup, we want the high elements of the first vector (stored
12657 at the LSB end of the register), and the low elements of the second
12658 vector (stored at the MSB end of the register). So swap. */
12659 std::swap (d->op0, d->op1);
12660 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
12661 location = nelt - location;
12664 offset = GEN_INT (location);
12665 emit_insn (gen (d->target, d->op0, d->op1, offset));
12669 /* Recognize patterns for the REV insns. */
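/* For example, with V4SImode the permutation { 1, 0, 3, 2 } has diff == 1
   and swaps adjacent 32-bit elements within each 64-bit doubleword, which
   maps onto REV64.  */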
12672 aarch64_evpc_rev (struct expand_vec_perm_d *d)
12674 unsigned int i, j, diff, nelt = d->nelt;
12675 rtx (*gen) (rtx, rtx);
12677 if (!d->one_vector_p)
12686 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
12687 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
12695 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
12696 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
12697 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
12698 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
12706 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
12707 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
12708 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
12709 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
12710 case V4SImode: gen = gen_aarch64_rev64v4si; break;
12711 case V2SImode: gen = gen_aarch64_rev64v2si; break;
12712 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
12713 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
12714 case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
12715 case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
12724 for (i = 0; i < nelt ; i += diff + 1)
12725 for (j = 0; j <= diff; j += 1)
12727 /* This is guaranteed to be true as the value of diff
12728 is 7, 3 or 1 and we should have enough elements in the
12729 queue to generate this. Getting a vector mask with a
12730 value of diff other than these values implies that
12731 something is wrong by the time we get here. */
12732 gcc_assert (i + j < nelt);
12733 if (d->perm[i + j] != i + diff - j)
12741 emit_insn (gen (d->target, d->op0));
12746 aarch64_evpc_dup (struct expand_vec_perm_d *d)
12748 rtx (*gen) (rtx, rtx, rtx);
12749 rtx out = d->target;
12751 machine_mode vmode = d->vmode;
12752 unsigned int i, elt, nelt = d->nelt;
12756 for (i = 1; i < nelt; i++)
12758 if (elt != d->perm[i])
12762 /* The generic preparation in aarch64_expand_vec_perm_const_1
12763 swaps the operand order and the permute indices if it finds
12764 d->perm[0] to be in the second operand. Thus, we can always
12765 use d->op0 and need not do any extra arithmetic to get the
12766 correct lane number. */
12768 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
12772 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
12773 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
12774 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
12775 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
12776 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
12777 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
12778 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
12779 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
12780 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
12781 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
12782 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
12783 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
12788 emit_insn (gen (out, in0, lane));
12793 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
12795 rtx rperm[MAX_VECT_LEN], sel;
12796 machine_mode vmode = d->vmode;
12797 unsigned int i, nelt = d->nelt;
12802 /* Generic code will try constant permutation twice: once with the
12803 original mode and again with the elements lowered to QImode.
12804 So wait and don't do the selector expansion ourselves. */
12805 if (vmode != V8QImode && vmode != V16QImode)
12808 for (i = 0; i < nelt; ++i)
12810 int nunits = GET_MODE_NUNITS (vmode);
12812 /* If big-endian and two vectors we end up with a weird mixed-endian
12813 mode on NEON. Reverse the index within each word but not the word
12815 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
12818 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
12819 sel = force_reg (vmode, sel);
12821 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
12826 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
12828 /* The pattern matching functions above are written to look for a small
12829 number to begin the sequence (0, 1, N/2). If we begin with an index
12830 from the second operand, we can swap the operands. */
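/* For example, the V4SImode permutation { 4, 0, 5, 1 } becomes
   { 0, 4, 1, 5 } (ZIP1) once the operands are swapped and each index is
   XORed with nelt.  */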
12831 if (d->perm[0] >= d->nelt)
12833 unsigned i, nelt = d->nelt;
12835 gcc_assert (nelt == (nelt & -nelt));
12836 for (i = 0; i < nelt; ++i)
12837 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
12839 std::swap (d->op0, d->op1);
12844 if (aarch64_evpc_rev (d))
12846 else if (aarch64_evpc_ext (d))
12848 else if (aarch64_evpc_dup (d))
12850 else if (aarch64_evpc_zip (d))
12852 else if (aarch64_evpc_uzp (d))
12854 else if (aarch64_evpc_trn (d))
12856 return aarch64_evpc_tbl (d);
12861 /* Expand a vec_perm_const pattern. */
12864 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
12866 struct expand_vec_perm_d d;
12867 int i, nelt, which;
12873 d.vmode = GET_MODE (target);
12874 gcc_assert (VECTOR_MODE_P (d.vmode));
12875 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12876 d.testing_p = false;
12878 for (i = which = 0; i < nelt; ++i)
12880 rtx e = XVECEXP (sel, 0, i);
12881 int ei = INTVAL (e) & (2 * nelt - 1);
12882 which |= (ei < nelt ? 1 : 2);
12889 gcc_unreachable ();
12892 d.one_vector_p = false;
12893 if (!rtx_equal_p (op0, op1))
12896 /* The elements of PERM do not suggest that only the first operand
12897 is used, but both operands are identical. Allow easier matching
12898 of the permutation by folding the permutation into the single
12900 /* Fall Through. */
12902 for (i = 0; i < nelt; ++i)
12903 d.perm[i] &= nelt - 1;
12905 d.one_vector_p = true;
12910 d.one_vector_p = true;
12914 return aarch64_expand_vec_perm_const_1 (&d);
12918 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
12919 const unsigned char *sel)
12921 struct expand_vec_perm_d d;
12922 unsigned int i, nelt, which;
12926 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12927 d.testing_p = true;
12928 memcpy (d.perm, sel, nelt);
12930 /* Calculate whether all elements are in one vector. */
12931 for (i = which = 0; i < nelt; ++i)
12933 unsigned char e = d.perm[i];
12934 gcc_assert (e < 2 * nelt);
12935 which |= (e < nelt ? 1 : 2);
12938 /* If all elements are from the second vector, reindex as if from the
12941 for (i = 0; i < nelt; ++i)
12944 /* Check whether the mask can be applied to a single vector. */
12945 d.one_vector_p = (which != 3);
12947 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
12948 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
12949 if (!d.one_vector_p)
12950 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
12953 ret = aarch64_expand_vec_perm_const_1 (&d);
12960 aarch64_reverse_mask (enum machine_mode mode)
12962 /* We have to reverse each vector because we don't have
12963 a permuted load that can reverse-load according to ABI rules. */
12965 rtvec v = rtvec_alloc (16);
12967 int nunits = GET_MODE_NUNITS (mode);
12968 int usize = GET_MODE_UNIT_SIZE (mode);
12970 gcc_assert (BYTES_BIG_ENDIAN);
12971 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
12973 for (i = 0; i < nunits; i++)
12974 for (j = 0; j < usize; j++)
12975 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
12976 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
12977 return force_reg (V16QImode, mask);
12980 /* Implement MODES_TIEABLE_P. In principle we should always return true.
12981 However, due to issues with register allocation it is preferable to avoid
12982 tying integer scalar and FP scalar modes. Executing integer operations
12983 in general registers is better than treating them as scalar vector
12984 operations. This reduces latency and avoids redundant int<->FP moves.
12985 So tie modes if they are either the same class, or vector modes with
12986 other vector modes, vector structs or any scalar mode.
12990 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
12992 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
12995 /* We specifically want to allow elements of "structure" modes to
12996 be tieable to the structure. This more general condition allows
12997 other rarer situations too. */
12998 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13001 /* Also allow any scalar modes with vectors. */
13002 if (aarch64_vector_mode_supported_p (mode1)
13003 || aarch64_vector_mode_supported_p (mode2))
13009 /* Return a new RTX holding the result of moving POINTER forward by
13013 aarch64_move_pointer (rtx pointer, int amount)
13015 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13017 return adjust_automodify_address (pointer, GET_MODE (pointer),
13021 /* Return a new RTX holding the result of moving POINTER forward by the
13022 size of the mode it points to. */
13025 aarch64_progress_pointer (rtx pointer)
13027 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13029 return aarch64_move_pointer (pointer, amount);
13032 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13036 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13039 rtx reg = gen_reg_rtx (mode);
13041 /* "Cast" the pointers to the correct mode. */
13042 *src = adjust_address (*src, mode, 0);
13043 *dst = adjust_address (*dst, mode, 0);
13044 /* Emit the memcpy. */
13045 emit_move_insn (reg, *src);
13046 emit_move_insn (*dst, reg);
13047 /* Move the pointers forward. */
13048 *src = aarch64_progress_pointer (*src);
13049 *dst = aarch64_progress_pointer (*dst);
13052 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13053 we succeed, otherwise return false. */
13056 aarch64_expand_movmem (rtx *operands)
13059 rtx dst = operands[0];
13060 rtx src = operands[1];
13062 bool speed_p = !optimize_function_for_size_p (cfun);
13064 /* When optimizing for size, give a better estimate of the length of a
13065 memcpy call, but use the default otherwise. */
13066 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13068 /* We can't do anything smart if the amount to copy is not constant. */
13069 if (!CONST_INT_P (operands[2]))
13072 n = UINTVAL (operands[2]);
13074 /* Try to keep the number of instructions low. For cases below 16 bytes we
13075 need to make at most two moves. For cases above 16 bytes it will be one
13076 move for each 16 byte chunk, then at most two additional moves. */
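  /* For example, a 35 byte copy is estimated below as 35/16 + 2 == 4 moves
     (two 16 byte chunks plus an overlapping tail), which is within the
     limit of 15/2 == 7 when optimizing for speed.  */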
13077 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13080 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13081 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13083 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13084 src = adjust_automodify_address (src, VOIDmode, base, 0);
13086 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13092 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13097 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13102 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13103 4-byte chunk, partially overlapping with the previously copied chunk. */
13106 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13112 src = aarch64_move_pointer (src, move);
13113 dst = aarch64_move_pointer (dst, move);
13114 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13119 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13120 them, then (if applicable) an 8-byte chunk. */
13125 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13130 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13135 /* Finish the final bytes of the copy. We can always do this in one
13136 instruction. We either copy the exact amount we need, or partially
13137 overlap with the previous chunk we copied and copy 8-bytes. */
13141 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13143 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13145 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13150 src = aarch64_move_pointer (src, -1);
13151 dst = aarch64_move_pointer (dst, -1);
13152 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13158 src = aarch64_move_pointer (src, move);
13159 dst = aarch64_move_pointer (dst, move);
13160 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13167 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
13169 static unsigned HOST_WIDE_INT
13170 aarch64_asan_shadow_offset (void)
13172 return (HOST_WIDE_INT_1 << 36);
13176 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13177 unsigned int align,
13178 enum by_pieces_operation op,
13181 /* STORE_BY_PIECES can be used when copying a constant string, but
13182 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13183 For now we always fail this and let the move_by_pieces code copy
13184 the string from read-only memory. */
13185 if (op == STORE_BY_PIECES)
13188 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13192 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
13193 int code, tree treeop0, tree treeop1)
13195 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13197 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13199 struct expand_operand ops[4];
13202 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13204 op_mode = GET_MODE (op0);
13205 if (op_mode == VOIDmode)
13206 op_mode = GET_MODE (op1);
13214 icode = CODE_FOR_cmpsi;
13219 icode = CODE_FOR_cmpdi;
13224 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13225 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13230 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13231 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13239 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13240 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13246 *prep_seq = get_insns ();
13249 create_fixed_operand (&ops[0], op0);
13250 create_fixed_operand (&ops[1], op1);
13253 if (!maybe_expand_insn (icode, 2, ops))
13258 *gen_seq = get_insns ();
13261 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13262 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13266 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
13267 tree treeop0, tree treeop1, int bit_code)
13269 rtx op0, op1, target;
13270 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13271 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13273 struct expand_operand ops[6];
13276 push_to_sequence ((rtx_insn*) *prep_seq);
13277 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13279 op_mode = GET_MODE (op0);
13280 if (op_mode == VOIDmode)
13281 op_mode = GET_MODE (op1);
13289 icode = CODE_FOR_ccmpsi;
13294 icode = CODE_FOR_ccmpdi;
13299 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13300 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13305 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13306 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13314 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13315 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13321 *prep_seq = get_insns ();
13324 target = gen_rtx_REG (cc_mode, CC_REGNUM);
13325 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13327 if (bit_code != AND)
13329 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13330 GET_MODE (XEXP (prev, 0))),
13331 VOIDmode, XEXP (prev, 0), const0_rtx);
13332 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13335 create_fixed_operand (&ops[0], XEXP (prev, 0));
13336 create_fixed_operand (&ops[1], target);
13337 create_fixed_operand (&ops[2], op0);
13338 create_fixed_operand (&ops[3], op1);
13339 create_fixed_operand (&ops[4], prev);
13340 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13342 push_to_sequence ((rtx_insn*) *gen_seq);
13343 if (!maybe_expand_insn (icode, 6, ops))
13349 *gen_seq = get_insns ();
13352 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
13355 #undef TARGET_GEN_CCMP_FIRST
13356 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13358 #undef TARGET_GEN_CCMP_NEXT
13359 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13361 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if the target supports
13362 instruction fusion of some sort. */
13365 aarch64_macro_fusion_p (void)
13367 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13371 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13372 should be kept together during scheduling. */
13375 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13378 rtx prev_set = single_set (prev);
13379 rtx curr_set = single_set (curr);
13380 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
13381 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13383 if (!aarch64_macro_fusion_p ())
13387 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOV_MOVK))
13389 /* We are trying to match:
13390 prev (mov) == (set (reg r0) (const_int imm16))
13391 curr (movk) == (set (zero_extract (reg r0)
13394 (const_int imm16_1)) */
13396 set_dest = SET_DEST (curr_set);
13398 if (GET_CODE (set_dest) == ZERO_EXTRACT
13399 && CONST_INT_P (SET_SRC (curr_set))
13400 && CONST_INT_P (SET_SRC (prev_set))
13401 && CONST_INT_P (XEXP (set_dest, 2))
13402 && INTVAL (XEXP (set_dest, 2)) == 16
13403 && REG_P (XEXP (set_dest, 0))
13404 && REG_P (SET_DEST (prev_set))
13405 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13412 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_ADD))
13415 /* We're trying to match:
13416 prev (adrp) == (set (reg r1)
13417 (high (symbol_ref ("SYM"))))
13418 curr (add) == (set (reg r0)
13420 (symbol_ref ("SYM"))))
13421 Note that r0 need not necessarily be the same as r1, especially
13422 during pre-regalloc scheduling. */
13424 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13425 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13427 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13428 && REG_P (XEXP (SET_SRC (curr_set), 0))
13429 && REGNO (XEXP (SET_SRC (curr_set), 0))
13430 == REGNO (SET_DEST (prev_set))
13431 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13432 XEXP (SET_SRC (curr_set), 1)))
13438 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOVK_MOVK))
13441 /* We're trying to match:
13442 prev (movk) == (set (zero_extract (reg r0)
13445 (const_int imm16_1))
13446 curr (movk) == (set (zero_extract (reg r0)
13449 (const_int imm16_2)) */
13451 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13452 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13453 && REG_P (XEXP (SET_DEST (prev_set), 0))
13454 && REG_P (XEXP (SET_DEST (curr_set), 0))
13455 && REGNO (XEXP (SET_DEST (prev_set), 0))
13456 == REGNO (XEXP (SET_DEST (curr_set), 0))
13457 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13458 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13459 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13460 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13461 && CONST_INT_P (SET_SRC (prev_set))
13462 && CONST_INT_P (SET_SRC (curr_set)))
13467 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_LDR))
13469 /* We're trying to match:
13470 prev (adrp) == (set (reg r0)
13471 (high (symbol_ref ("SYM"))))
13472 curr (ldr) == (set (reg r1)
13473 (mem (lo_sum (reg r0)
13474 (symbol_ref ("SYM")))))
13476 curr (ldr) == (set (reg r1)
13479 (symbol_ref ("SYM")))))) */
13480 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13481 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13483 rtx curr_src = SET_SRC (curr_set);
13485 if (GET_CODE (curr_src) == ZERO_EXTEND)
13486 curr_src = XEXP (curr_src, 0);
13488 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13489 && REG_P (XEXP (XEXP (curr_src, 0), 0))
13490 && REGNO (XEXP (XEXP (curr_src, 0), 0))
13491 == REGNO (SET_DEST (prev_set))
13492 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13493 XEXP (SET_SRC (prev_set), 0)))
13498 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_AES_AESMC)
13499 && aarch_crypto_can_dual_issue (prev, curr))
13502 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_CMP_BRANCH)
13503 && any_condjump_p (curr))
13505 enum attr_type prev_type = get_attr_type (prev);
13507 /* FIXME: this misses some instructions which are considered simple
13508 arithmetic for ThunderX. Simple shifts are missed here. */
13509 if (prev_type == TYPE_ALUS_SREG
13510 || prev_type == TYPE_ALUS_IMM
13511 || prev_type == TYPE_LOGICS_REG
13512 || prev_type == TYPE_LOGICS_IMM)
13519 /* Return true iff the instruction fusion described by OP is enabled. */
13522 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
13524 return (aarch64_tune_params.fusible_ops & op) != 0;
13527 /* If MEM is in the form of [base+offset], extract the two parts
13528 of the address and store them in BASE and OFFSET; otherwise return false
13529 after clearing BASE and OFFSET. */
13532 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13536 gcc_assert (MEM_P (mem));
13538 addr = XEXP (mem, 0);
13543 *offset = const0_rtx;
13547 if (GET_CODE (addr) == PLUS
13548 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
13550 *base = XEXP (addr, 0);
13551 *offset = XEXP (addr, 1);
13556 *offset = NULL_RTX;
13561 /* Types for scheduling fusion. */
13562 enum sched_fusion_type
13564 SCHED_FUSION_NONE = 0,
13565 SCHED_FUSION_LD_SIGN_EXTEND,
13566 SCHED_FUSION_LD_ZERO_EXTEND,
13572 /* If INSN is a load or store with an address in the form of [base+offset],
13573 extract the two parts and store them in BASE and OFFSET. Return the
13574 scheduling fusion type of INSN. */
13576 static enum sched_fusion_type
13577 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
13580 enum sched_fusion_type fusion = SCHED_FUSION_LD;
13582 gcc_assert (INSN_P (insn));
13583 x = PATTERN (insn);
13584 if (GET_CODE (x) != SET)
13585 return SCHED_FUSION_NONE;
13588 dest = SET_DEST (x);
13590 machine_mode dest_mode = GET_MODE (dest);
13592 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
13593 return SCHED_FUSION_NONE;
13595 if (GET_CODE (src) == SIGN_EXTEND)
13597 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
13598 src = XEXP (src, 0);
13599 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13600 return SCHED_FUSION_NONE;
13602 else if (GET_CODE (src) == ZERO_EXTEND)
13604 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
13605 src = XEXP (src, 0);
13606 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13607 return SCHED_FUSION_NONE;
13610 if (GET_CODE (src) == MEM && REG_P (dest))
13611 extract_base_offset_in_addr (src, base, offset);
13612 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
13614 fusion = SCHED_FUSION_ST;
13615 extract_base_offset_in_addr (dest, base, offset);
13618 return SCHED_FUSION_NONE;
13620 if (*base == NULL_RTX || *offset == NULL_RTX)
13621 fusion = SCHED_FUSION_NONE;
13626 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
13628 Currently we only support fusing ldr and str instructions, so FUSION_PRI
13629 and PRI are only calculated for these instructions. For other instructions,
13630 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
13631 types of instruction fusion can be added by returning different priorities.
13633 It's important that irrelevant instructions get the largest FUSION_PRI. */
13636 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
13637 int *fusion_pri, int *pri)
13641 enum sched_fusion_type fusion;
13643 gcc_assert (INSN_P (insn));
13646 fusion = fusion_load_store (insn, &base, &offset);
13647 if (fusion == SCHED_FUSION_NONE)
13654 /* Set FUSION_PRI according to fusion type and base register. */
13655 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
13657 /* Calculate PRI. */
13660 /* The INSN with the smaller offset goes first. */
13661 off_val = (int)(INTVAL (offset));
13663 tmp -= (off_val & 0xfffff);
13665 tmp += ((- off_val) & 0xfffff);
13671 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
13672 Adjust priority of sha1h instructions so they are scheduled before
13673 other SHA1 instructions. */
13676 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
13678 rtx x = PATTERN (insn);
13680 if (GET_CODE (x) == SET)
13684 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
13685 return priority + 10;
13691 /* Given OPERANDS of consecutive load/store, check if we can merge
13692 them into ldp/stp. LOAD is true if they are load instructions.
13693 MODE is the mode of memory operands. */
13696 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
13697 enum machine_mode mode)
13699 HOST_WIDE_INT offval_1, offval_2, msize;
13700 enum reg_class rclass_1, rclass_2;
13701 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
13705 mem_1 = operands[1];
13706 mem_2 = operands[3];
13707 reg_1 = operands[0];
13708 reg_2 = operands[2];
13709 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
13710 if (REGNO (reg_1) == REGNO (reg_2))
13715 mem_1 = operands[0];
13716 mem_2 = operands[2];
13717 reg_1 = operands[1];
13718 reg_2 = operands[3];
13721 /* The mems cannot be volatile. */
13722 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
13725 /* If we have SImode and slow unaligned ldp,
13726 check that the alignment is at least 8 bytes. */
13728 && (aarch64_tune_params.extra_tuning_flags
13729 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
13731 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
13734 /* Check if the addresses are in the form of [base+offset]. */
13735 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13736 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13738 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13739 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13742 /* Check if the bases are the same. */
13743 if (!rtx_equal_p (base_1, base_2))
13746 offval_1 = INTVAL (offset_1);
13747 offval_2 = INTVAL (offset_2);
13748 msize = GET_MODE_SIZE (mode);
13749 /* Check if the offsets are consecutive. */
13750 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
13753 /* Check if the addresses are clobbered by the load. */
13756 if (reg_mentioned_p (reg_1, mem_1))
13759 /* In increasing order, the last load can clobber the address. */
13760 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
13764 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13765 rclass_1 = FP_REGS;
13767 rclass_1 = GENERAL_REGS;
13769 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13770 rclass_2 = FP_REGS;
13772 rclass_2 = GENERAL_REGS;
13774 /* Check if the registers are of the same class. */
13775 if (rclass_1 != rclass_2)
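/* For example (an illustrative sketch, not from the original source): the
   two loads
     ldr w0, [x3, 8]
     ldr w1, [x3, 12]
   pass all of the checks above (non-volatile, same base, consecutive SImode
   offsets, both registers in GENERAL_REGS), so the peephole patterns may
   merge them into
     ldp w0, w1, [x3, 8]  */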
13781 /* Given OPERANDS of consecutive load/store, check if we can merge
13782 them into ldp/stp by adjusting the offset. LOAD is true if they
13783 are load instructions. MODE is the mode of memory operands.
13785 Given the following consecutive stores:
13787 str w1, [xb, 0x100]
13788 str w1, [xb, 0x104]
13789 str w1, [xb, 0x108]
13790 str w1, [xb, 0x10c]
13792 Though the offsets are out of the range supported by stp, we can
13793 still pair them after adjusting the offset, as follows:
13795 add scratch, xb, 0x100
13796 stp w1, w1, [scratch]
13797 stp w1, w1, [scratch, 0x8]
13799 The peephole patterns detecting this opportunity should guarantee
13800 that the scratch register is available. */
13803 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
13804 enum machine_mode mode)
13806 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
13807 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
13808 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
13809 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
13813 reg_1 = operands[0];
13814 mem_1 = operands[1];
13815 reg_2 = operands[2];
13816 mem_2 = operands[3];
13817 reg_3 = operands[4];
13818 mem_3 = operands[5];
13819 reg_4 = operands[6];
13820 mem_4 = operands[7];
13821 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
13822 && REG_P (reg_3) && REG_P (reg_4));
13823 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
13828 mem_1 = operands[0];
13829 reg_1 = operands[1];
13830 mem_2 = operands[2];
13831 reg_2 = operands[3];
13832 mem_3 = operands[4];
13833 reg_3 = operands[5];
13834 mem_4 = operands[6];
13835 reg_4 = operands[7];
13837 /* Skip if the memory operand is by itself valid for ldp/stp. */
13838 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
13841 /* The mems cannot be volatile. */
13842 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
13843 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
13846 /* Check if the addresses are in the form of [base+offset]. */
13847 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13848 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13850 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13851 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13853 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
13854 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
13856 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
13857 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
13860 /* Check if the bases are the same. */
13861 if (!rtx_equal_p (base_1, base_2)
13862 || !rtx_equal_p (base_2, base_3)
13863 || !rtx_equal_p (base_3, base_4))
13866 offval_1 = INTVAL (offset_1);
13867 offval_2 = INTVAL (offset_2);
13868 offval_3 = INTVAL (offset_3);
13869 offval_4 = INTVAL (offset_4);
13870 msize = GET_MODE_SIZE (mode);
13871 /* Check if the offsets are consecutive. */
13872 if ((offval_1 != (offval_2 + msize)
13873 || offval_1 != (offval_3 + msize * 2)
13874 || offval_1 != (offval_4 + msize * 3))
13875 && (offval_4 != (offval_3 + msize)
13876 || offval_4 != (offval_2 + msize * 2)
13877 || offval_4 != (offval_1 + msize * 3)))
13880 /* Check if the addresses are clobbered by the load. */
13883 if (reg_mentioned_p (reg_1, mem_1)
13884 || reg_mentioned_p (reg_2, mem_2)
13885 || reg_mentioned_p (reg_3, mem_3))
13888 /* In increasing order, the last load can clobber the address. */
13889 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
13893 /* If we have SImode and slow unaligned ldp,
13894 check that the alignment is at least 8 bytes. */
13896 && (aarch64_tune_params.extra_tuning_flags
13897 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
13899 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
13902 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13903 rclass_1 = FP_REGS;
13905 rclass_1 = GENERAL_REGS;
13907 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13908 rclass_2 = FP_REGS;
13910 rclass_2 = GENERAL_REGS;
13912 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
13913 rclass_3 = FP_REGS;
13915 rclass_3 = GENERAL_REGS;
13917 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
13918 rclass_4 = FP_REGS;
13920 rclass_4 = GENERAL_REGS;
13922 /* Check if the registers are of the same class. */
13923 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
13929 /* Given OPERANDS of consecutive load/store, this function pairs them
13930 into ldp/stp after adjusting the offset. It depends on the fact
13931 that addresses of load/store instructions are in increasing order.
13932 MODE is the mode of memory operands. CODE is the rtl operator
13933 which should be applied to all memory operands; it is SIGN_EXTEND,
13934 ZERO_EXTEND or UNKNOWN. */
13937 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
13938 enum machine_mode mode, RTX_CODE code)
13940 rtx base, offset, t1, t2;
13941 rtx mem_1, mem_2, mem_3, mem_4;
13942 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
13946 mem_1 = operands[1];
13947 mem_2 = operands[3];
13948 mem_3 = operands[5];
13949 mem_4 = operands[7];
13953 mem_1 = operands[0];
13954 mem_2 = operands[2];
13955 mem_3 = operands[4];
13956 mem_4 = operands[6];
13957 gcc_assert (code == UNKNOWN);
13960 extract_base_offset_in_addr (mem_1, &base, &offset);
13961 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
13963 /* Adjust the offset so that it fits in an ldp/stp instruction. */
13964 msize = GET_MODE_SIZE (mode);
13965 stp_off_limit = msize * 0x40;
13966 off_val = INTVAL (offset);
13967 abs_off = (off_val < 0) ? -off_val : off_val;
13968 new_off = abs_off % stp_off_limit;
13969 adj_off = abs_off - new_off;
13971 /* Further adjust to make sure all offsets are OK. */
13972 if ((new_off + msize * 2) >= stp_off_limit)
13974 adj_off += stp_off_limit;
13975 new_off -= stp_off_limit;
13978 /* Make sure the adjustment can be done with ADD/SUB instructions. */
13979 if (adj_off >= 0x1000)
13984 adj_off = -adj_off;
13985 new_off = -new_off;
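/* A worked example with illustrative numbers (a sketch, not from the
   original source): for four SImode stores at offsets 0x104..0x110, msize
   is 4 and stp_off_limit is 0x100, so abs_off is 0x104, new_off becomes 4
   and adj_off becomes 0x100.  The scratch register is set to base + 0x100
   and the stores are emitted as two stp instructions at offsets 4 and 12
   from the scratch register.  */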
13988 /* Create new memory references. */
13989 mem_1 = change_address (mem_1, VOIDmode,
13990 plus_constant (DImode, operands[8], new_off));
13992 /* Check if the adjusted address is OK for ldp/stp. */
13993 if (!aarch64_mem_pair_operand (mem_1, mode))
13996 msize = GET_MODE_SIZE (mode);
13997 mem_2 = change_address (mem_2, VOIDmode,
13998 plus_constant (DImode,
14001 mem_3 = change_address (mem_3, VOIDmode,
14002 plus_constant (DImode,
14004 new_off + msize * 2));
14005 mem_4 = change_address (mem_4, VOIDmode,
14006 plus_constant (DImode,
14008 new_off + msize * 3));
14010 if (code == ZERO_EXTEND)
14012 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14013 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14014 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14015 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14017 else if (code == SIGN_EXTEND)
14019 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14020 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14021 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14022 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14027 operands[1] = mem_1;
14028 operands[3] = mem_2;
14029 operands[5] = mem_3;
14030 operands[7] = mem_4;
14034 operands[0] = mem_1;
14035 operands[2] = mem_2;
14036 operands[4] = mem_3;
14037 operands[6] = mem_4;
14040 /* Emit the adjusting instruction. */
14041 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
14042 /* Emit ldp/stp instructions. */
14043 t1 = gen_rtx_SET (operands[0], operands[1]);
14044 t2 = gen_rtx_SET (operands[2], operands[3]);
14045 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14046 t1 = gen_rtx_SET (operands[4], operands[5]);
14047 t2 = gen_rtx_SET (operands[6], operands[7]);
14048 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14052 /* Return 1 if a pseudo register should be created and used to hold
14053 the GOT address for PIC code. */
14056 aarch64_use_pseudo_pic_reg (void)
14058 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14061 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
14064 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14066 switch (XINT (x, 1))
14068 case UNSPEC_GOTSMALLPIC:
14069 case UNSPEC_GOTSMALLPIC28K:
14070 case UNSPEC_GOTTINYPIC:
14076 return default_unspec_may_trap_p (x, flags);
14080 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
14081 return the log2 of that value. Otherwise return -1. */
14084 aarch64_fpconst_pow_of_2 (rtx x)
14086 const REAL_VALUE_TYPE *r;
14088 if (!CONST_DOUBLE_P (x))
14091 r = CONST_DOUBLE_REAL_VALUE (x);
14093 if (REAL_VALUE_NEGATIVE (*r)
14094 || REAL_VALUE_ISNAN (*r)
14095 || REAL_VALUE_ISINF (*r)
14096 || !real_isinteger (r, DFmode))
14099 return exact_log2 (real_to_integer (r));
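/* For example (an illustrative sketch, not from the original source):
   a CONST_DOUBLE of 4.0 yields 2 and 1.0 yields 0, while 0.75, -2.0,
   NaNs and infinities all yield -1.  */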
14102 /* If X is a vector of equal CONST_DOUBLE values and that value is
14103 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
14106 aarch64_vec_fpconst_pow_of_2 (rtx x)
14108 if (GET_CODE (x) != CONST_VECTOR)
14111 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14114 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14118 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
14119 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
14125 /* Implement TARGET_PROMOTED_TYPE to promote __fp16 to float. */
14127 aarch64_promoted_type (const_tree t)
14129 if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
14130 return float_type_node;
14134 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
14137 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
14138 optimization_type opt_type)
14143 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
14150 #undef TARGET_ADDRESS_COST
14151 #define TARGET_ADDRESS_COST aarch64_address_cost
14153 /* This hook determines whether unnamed bitfields affect the alignment
14154 of the containing structure. The hook returns true if the structure
14155 should inherit the alignment requirements of an unnamed bitfield's
14157 #undef TARGET_ALIGN_ANON_BITFIELD
14158 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
14160 #undef TARGET_ASM_ALIGNED_DI_OP
14161 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14163 #undef TARGET_ASM_ALIGNED_HI_OP
14164 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14166 #undef TARGET_ASM_ALIGNED_SI_OP
14167 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14169 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14170 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
14171 hook_bool_const_tree_hwi_hwi_const_tree_true
14173 #undef TARGET_ASM_FILE_START
14174 #define TARGET_ASM_FILE_START aarch64_start_file
14176 #undef TARGET_ASM_OUTPUT_MI_THUNK
14177 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14179 #undef TARGET_ASM_SELECT_RTX_SECTION
14180 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14182 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14183 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14185 #undef TARGET_BUILD_BUILTIN_VA_LIST
14186 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14188 #undef TARGET_CALLEE_COPIES
14189 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14191 #undef TARGET_CAN_ELIMINATE
14192 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14194 #undef TARGET_CAN_INLINE_P
14195 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14197 #undef TARGET_CANNOT_FORCE_CONST_MEM
14198 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14200 #undef TARGET_CASE_VALUES_THRESHOLD
14201 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14203 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14204 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14206 /* Only the least significant bit is used for initialization guard
14208 #undef TARGET_CXX_GUARD_MASK_BIT
14209 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14211 #undef TARGET_C_MODE_FOR_SUFFIX
14212 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14214 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14215 #undef TARGET_DEFAULT_TARGET_FLAGS
14216 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14219 #undef TARGET_CLASS_MAX_NREGS
14220 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14222 #undef TARGET_BUILTIN_DECL
14223 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14225 #undef TARGET_BUILTIN_RECIPROCAL
14226 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14228 #undef TARGET_EXPAND_BUILTIN
14229 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14231 #undef TARGET_EXPAND_BUILTIN_VA_START
14232 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14234 #undef TARGET_FOLD_BUILTIN
14235 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14237 #undef TARGET_FUNCTION_ARG
14238 #define TARGET_FUNCTION_ARG aarch64_function_arg
14240 #undef TARGET_FUNCTION_ARG_ADVANCE
14241 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14243 #undef TARGET_FUNCTION_ARG_BOUNDARY
14244 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14246 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14247 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14249 #undef TARGET_FUNCTION_VALUE
14250 #define TARGET_FUNCTION_VALUE aarch64_function_value
14252 #undef TARGET_FUNCTION_VALUE_REGNO_P
14253 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14255 #undef TARGET_FRAME_POINTER_REQUIRED
14256 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14258 #undef TARGET_GIMPLE_FOLD_BUILTIN
14259 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14261 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14262 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14264 #undef TARGET_INIT_BUILTINS
14265 #define TARGET_INIT_BUILTINS aarch64_init_builtins
14267 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
14268 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
14269 aarch64_ira_change_pseudo_allocno_class
14271 #undef TARGET_LEGITIMATE_ADDRESS_P
14272 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14274 #undef TARGET_LEGITIMATE_CONSTANT_P
14275 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14277 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
14278 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
14279 aarch64_legitimize_address_displacement
14281 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14282 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14284 #undef TARGET_LRA_P
14285 #define TARGET_LRA_P hook_bool_void_true
14287 #undef TARGET_MANGLE_TYPE
14288 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14290 #undef TARGET_MEMORY_MOVE_COST
14291 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14293 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14294 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14296 #undef TARGET_MUST_PASS_IN_STACK
14297 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14299 /* This target hook should return true if accesses to volatile bitfields
14300 should use the narrowest mode possible. It should return false if these
14301 accesses should use the bitfield container type. */
14302 #undef TARGET_NARROW_VOLATILE_BITFIELD
14303 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
14305 #undef TARGET_OPTION_OVERRIDE
14306 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14308 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14309 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14310 aarch64_override_options_after_change
14312 #undef TARGET_OPTION_SAVE
14313 #define TARGET_OPTION_SAVE aarch64_option_save
14315 #undef TARGET_OPTION_RESTORE
14316 #define TARGET_OPTION_RESTORE aarch64_option_restore
14318 #undef TARGET_OPTION_PRINT
14319 #define TARGET_OPTION_PRINT aarch64_option_print
14321 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14322 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14324 #undef TARGET_SET_CURRENT_FUNCTION
14325 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14327 #undef TARGET_PASS_BY_REFERENCE
14328 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14330 #undef TARGET_PREFERRED_RELOAD_CLASS
14331 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
14333 #undef TARGET_SCHED_REASSOCIATION_WIDTH
14334 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
14336 #undef TARGET_PROMOTED_TYPE
14337 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
14339 #undef TARGET_SECONDARY_RELOAD
14340 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
14342 #undef TARGET_SHIFT_TRUNCATION_MASK
14343 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
14345 #undef TARGET_SETUP_INCOMING_VARARGS
14346 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
14348 #undef TARGET_STRUCT_VALUE_RTX
14349 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
14351 #undef TARGET_REGISTER_MOVE_COST
14352 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
14354 #undef TARGET_RETURN_IN_MEMORY
14355 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
14357 #undef TARGET_RETURN_IN_MSB
14358 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
14360 #undef TARGET_RTX_COSTS
14361 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
14363 #undef TARGET_SCHED_ISSUE_RATE
14364 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
14366 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
14367 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
14368 aarch64_sched_first_cycle_multipass_dfa_lookahead
14370 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
14371 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
14372 aarch64_first_cycle_multipass_dfa_lookahead_guard
14374 #undef TARGET_TRAMPOLINE_INIT
14375 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
14377 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
14378 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
14380 #undef TARGET_VECTOR_MODE_SUPPORTED_P
14381 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
14383 #undef TARGET_ARRAY_MODE_SUPPORTED_P
14384 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
14386 #undef TARGET_VECTORIZE_ADD_STMT_COST
14387 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
14389 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
14390 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
14391 aarch64_builtin_vectorization_cost
14393 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
14394 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
14396 #undef TARGET_VECTORIZE_BUILTINS
14397 #define TARGET_VECTORIZE_BUILTINS
14399 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
14400 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
14401 aarch64_builtin_vectorized_function
14403 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
14404 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
14405 aarch64_autovectorize_vector_sizes
14407 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
14408 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
14409 aarch64_atomic_assign_expand_fenv
14411 /* Section anchor support. */
14413 #undef TARGET_MIN_ANCHOR_OFFSET
14414 #define TARGET_MIN_ANCHOR_OFFSET -256
14416 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
14417 byte offset; we can do much more for larger data types, but have no way
14418 to determine the size of the access. We assume accesses are aligned. */
14419 #undef TARGET_MAX_ANCHOR_OFFSET
14420 #define TARGET_MAX_ANCHOR_OFFSET 4095
14422 #undef TARGET_VECTOR_ALIGNMENT
14423 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
14425 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
14426 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
14427 aarch64_simd_vector_alignment_reachable
14429 /* vec_perm support. */
14431 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
14432 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
14433 aarch64_vectorize_vec_perm_const_ok
14435 #undef TARGET_INIT_LIBFUNCS
14436 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
14438 #undef TARGET_FIXED_CONDITION_CODE_REGS
14439 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
14441 #undef TARGET_FLAGS_REGNUM
14442 #define TARGET_FLAGS_REGNUM CC_REGNUM
14444 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
14445 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
14447 #undef TARGET_ASAN_SHADOW_OFFSET
14448 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
14450 #undef TARGET_LEGITIMIZE_ADDRESS
14451 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
14453 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
14454 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
14455 aarch64_use_by_pieces_infrastructure_p
14457 #undef TARGET_CAN_USE_DOLOOP_P
14458 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
14460 #undef TARGET_SCHED_ADJUST_PRIORITY
14461 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
14463 #undef TARGET_SCHED_MACRO_FUSION_P
14464 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
14466 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
14467 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
14469 #undef TARGET_SCHED_FUSION_PRIORITY
14470 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
14472 #undef TARGET_UNSPEC_MAY_TRAP_P
14473 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
14475 #undef TARGET_USE_PSEUDO_PIC_REG
14476 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
14478 #undef TARGET_PRINT_OPERAND
14479 #define TARGET_PRINT_OPERAND aarch64_print_operand
14481 #undef TARGET_PRINT_OPERAND_ADDRESS
14482 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
14484 #undef TARGET_OPTAB_SUPPORTED_P
14485 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
14487 #undef TARGET_OMIT_STRUCT_RETURN_REG
14488 #define TARGET_OMIT_STRUCT_RETURN_REG true
14490 struct gcc_target targetm = TARGET_INITIALIZER;
14492 #include "gt-aarch64.h"