1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2016 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_STRING
24 #include "coretypes.h"
35 #include "stringpool.h"
40 #include "diagnostic.h"
41 #include "insn-attr.h"
43 #include "fold-const.h"
44 #include "stor-layout.h"
52 #include "langhooks.h"
57 #include "gimple-iterator.h"
58 #include "tree-vectorizer.h"
59 #include "aarch64-cost-tables.h"
63 #include "tm-constrs.h"
64 #include "sched-int.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
68 /* This file should be included last. */
69 #include "target-def.h"
71 /* Defined for convenience. */
72 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
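/* For example, this evaluates to 8 for LP64 and to 4 for ILP32.  */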
74 /* Classifies an address.
77 A simple base register plus immediate offset.
80 A base register indexed by immediate offset with writeback.
83 A base register indexed by (optionally scaled) register.
86 A base register indexed by (optionally scaled) zero-extended register.
89 A base register indexed by (optionally scaled) sign-extended register.
92 A LO_SUM rtx with a base register and "LO12" symbol relocation.
95 A constant symbolic address, in pc-relative literal pool. */
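/* Illustrative assembly forms for these classes, in the same order (these
   examples are an addition, not part of the original description):
       ldr x0, [x1, #16]               base plus immediate offset
       ldr x0, [x1, #16]!              base plus offset with writeback
       ldr x0, [x1, x2, lsl #3]        (optionally scaled) register index
       ldr x0, [x1, w2, uxtw #3]       zero-extended register index
       ldr x0, [x1, w2, sxtw #3]       sign-extended register index
       ldr x0, [x1, #:lo12:sym]        LO_SUM following an adrp
       ldr x0, .Lpool                  pc-relative literal load  */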
97 enum aarch64_address_type {
107 struct aarch64_address_info {
108 enum aarch64_address_type type;
112 enum aarch64_symbol_type symbol_type;
115 struct simd_immediate_info
124 /* The current code model. */
125 enum aarch64_code_model aarch64_cmodel;
128 #undef TARGET_HAVE_TLS
129 #define TARGET_HAVE_TLS 1
132 static bool aarch64_composite_type_p (const_tree, machine_mode);
133 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
135 machine_mode *, int *,
137 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
138 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
139 static void aarch64_override_options_after_change (void);
140 static bool aarch64_vector_mode_supported_p (machine_mode);
141 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
142 const unsigned char *sel);
143 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
145 /* Major revision number of the ARM Architecture implemented by the target. */
146 unsigned aarch64_architecture_version;
148 /* The processor for which instructions should be scheduled. */
149 enum aarch64_processor aarch64_tune = cortexa53;
151 /* Mask to specify which instruction scheduling options should be used. */
152 unsigned long aarch64_tune_flags = 0;
154 /* Global flag for PC relative loads. */
155 bool aarch64_pcrelative_literal_loads;
157 /* Support for command line parsing of boolean flags in the tuning
159 struct aarch64_flag_desc
165 #define AARCH64_FUSION_PAIR(name, internal_name) \
166 { name, AARCH64_FUSE_##internal_name },
167 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
169 { "none", AARCH64_FUSE_NOTHING },
170 #include "aarch64-fusion-pairs.def"
171 { "all", AARCH64_FUSE_ALL },
172 { NULL, AARCH64_FUSE_NOTHING }
174 #undef AARCH64_FUSION_PAIR
176 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
177 { name, AARCH64_EXTRA_TUNE_##internal_name },
178 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
180 { "none", AARCH64_EXTRA_TUNE_NONE },
181 #include "aarch64-tuning-flags.def"
182 { "all", AARCH64_EXTRA_TUNE_ALL },
183 { NULL, AARCH64_EXTRA_TUNE_NONE }
185 #undef AARCH64_EXTRA_TUNING_OPTION
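/* These two tables back the fine-grained tuning override support below:
   the "fuse" and "tune" components of the -moverride option accept exactly
   the strings listed here ("none", "all", and the names pulled in from the
   .def files).  For instance, something along the lines of
   -moverride=fuse=<name> selects a fusion pair by name (this is only a
   sketch; see the parsing code further down for the exact syntax).  */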
187 /* Tuning parameters. */
189 static const struct cpu_addrcost_table generic_addrcost_table =
199 0, /* register_offset */
200 0, /* register_sextend */
201 0, /* register_zextend */
205 static const struct cpu_addrcost_table cortexa57_addrcost_table =
215 0, /* register_offset */
216 0, /* register_sextend */
217 0, /* register_zextend */
221 static const struct cpu_addrcost_table exynosm1_addrcost_table =
231 1, /* register_offset */
232 1, /* register_sextend */
233 2, /* register_zextend */
237 static const struct cpu_addrcost_table xgene1_addrcost_table =
247 0, /* register_offset */
248 1, /* register_sextend */
249 1, /* register_zextend */
253 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
263 0, /* register_offset */
264 0, /* register_sextend */
265 0, /* register_zextend */
269 static const struct cpu_addrcost_table vulcan_addrcost_table =
279 2, /* register_offset */
280 3, /* register_sextend */
281 3, /* register_zextend */
285 static const struct cpu_regmove_cost generic_regmove_cost =
288 /* Avoid the use of slow int<->fp moves for spilling by setting
289 their cost higher than memmov_cost. */
295 static const struct cpu_regmove_cost cortexa57_regmove_cost =
298 /* Avoid the use of slow int<->fp moves for spilling by setting
299 their cost higher than memmov_cost. */
305 static const struct cpu_regmove_cost cortexa53_regmove_cost =
308 /* Avoid the use of slow int<->fp moves for spilling by setting
309 their cost higher than memmov_cost. */
315 static const struct cpu_regmove_cost exynosm1_regmove_cost =
318 /* Avoid the use of slow int<->fp moves for spilling by setting
319 their cost higher than memmov_cost (actually 4 and 9). */
325 static const struct cpu_regmove_cost thunderx_regmove_cost =
333 static const struct cpu_regmove_cost xgene1_regmove_cost =
336 /* Avoid the use of slow int<->fp moves for spilling by setting
337 their cost higher than memmov_cost. */
343 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
346 /* Avoid the use of int<->fp moves for spilling. */
352 static const struct cpu_regmove_cost vulcan_regmove_cost =
355 /* Avoid the use of int<->fp moves for spilling. */
361 /* Generic costs for vector insn classes. */
362 static const struct cpu_vector_cost generic_vector_cost =
364 1, /* scalar_stmt_cost */
365 1, /* scalar_load_cost */
366 1, /* scalar_store_cost */
367 1, /* vec_stmt_cost */
368 2, /* vec_permute_cost */
369 1, /* vec_to_scalar_cost */
370 1, /* scalar_to_vec_cost */
371 1, /* vec_align_load_cost */
372 1, /* vec_unalign_load_cost */
373 1, /* vec_unalign_store_cost */
374 1, /* vec_store_cost */
375 3, /* cond_taken_branch_cost */
376 1 /* cond_not_taken_branch_cost */
379 /* ThunderX costs for vector insn classes. */
380 static const struct cpu_vector_cost thunderx_vector_cost =
382 1, /* scalar_stmt_cost */
383 3, /* scalar_load_cost */
384 1, /* scalar_store_cost */
385 4, /* vec_stmt_cost */
386 4, /* vec_permute_cost */
387 2, /* vec_to_scalar_cost */
388 2, /* scalar_to_vec_cost */
389 3, /* vec_align_load_cost */
390 10, /* vec_unalign_load_cost */
391 10, /* vec_unalign_store_cost */
392 1, /* vec_store_cost */
393 3, /* cond_taken_branch_cost */
394 3 /* cond_not_taken_branch_cost */
397 /* Generic costs for vector insn classes. */
398 static const struct cpu_vector_cost cortexa57_vector_cost =
400 1, /* scalar_stmt_cost */
401 4, /* scalar_load_cost */
402 1, /* scalar_store_cost */
403 2, /* vec_stmt_cost */
404 3, /* vec_permute_cost */
405 8, /* vec_to_scalar_cost */
406 8, /* scalar_to_vec_cost */
407 4, /* vec_align_load_cost */
408 4, /* vec_unalign_load_cost */
409 1, /* vec_unalign_store_cost */
410 1, /* vec_store_cost */
411 1, /* cond_taken_branch_cost */
412 1 /* cond_not_taken_branch_cost */
415 static const struct cpu_vector_cost exynosm1_vector_cost =
417 1, /* scalar_stmt_cost */
418 5, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 3, /* vec_stmt_cost */
421 3, /* vec_permute_cost */
422 3, /* vec_to_scalar_cost */
423 3, /* scalar_to_vec_cost */
424 5, /* vec_align_load_cost */
425 5, /* vec_unalign_load_cost */
426 1, /* vec_unalign_store_cost */
427 1, /* vec_store_cost */
428 1, /* cond_taken_branch_cost */
429 1 /* cond_not_taken_branch_cost */
432 /* Generic costs for vector insn classes. */
433 static const struct cpu_vector_cost xgene1_vector_cost =
435 1, /* scalar_stmt_cost */
436 5, /* scalar_load_cost */
437 1, /* scalar_store_cost */
438 2, /* vec_stmt_cost */
439 2, /* vec_permute_cost */
440 4, /* vec_to_scalar_cost */
441 4, /* scalar_to_vec_cost */
442 10, /* vec_align_load_cost */
443 10, /* vec_unalign_load_cost */
444 2, /* vec_unalign_store_cost */
445 2, /* vec_store_cost */
446 2, /* cond_taken_branch_cost */
447 1 /* cond_not_taken_branch_cost */
450 /* Costs for vector insn classes for Vulcan. */
451 static const struct cpu_vector_cost vulcan_vector_cost =
453 6, /* scalar_stmt_cost */
454 4, /* scalar_load_cost */
455 1, /* scalar_store_cost */
456 6, /* vec_stmt_cost */
457 3, /* vec_permute_cost */
458 6, /* vec_to_scalar_cost */
459 5, /* scalar_to_vec_cost */
460 8, /* vec_align_load_cost */
461 8, /* vec_unalign_load_cost */
462 4, /* vec_unalign_store_cost */
463 4, /* vec_store_cost */
464 2, /* cond_taken_branch_cost */
465 1 /* cond_not_taken_branch_cost */
468 /* Generic costs for branch instructions. */
469 static const struct cpu_branch_cost generic_branch_cost =
471 2, /* Predictable. */
472 2 /* Unpredictable. */
475 /* Branch costs for Cortex-A57. */
476 static const struct cpu_branch_cost cortexa57_branch_cost =
478 1, /* Predictable. */
479 3 /* Unpredictable. */
482 /* Branch costs for Vulcan. */
483 static const struct cpu_branch_cost vulcan_branch_cost =
485 1, /* Predictable. */
486 3 /* Unpredictable. */
489 /* Generic approximation modes. */
490 static const cpu_approx_modes generic_approx_modes =
492 AARCH64_APPROX_NONE, /* division */
493 AARCH64_APPROX_NONE, /* sqrt */
494 AARCH64_APPROX_NONE /* recip_sqrt */
497 /* Approximation modes for Exynos M1. */
498 static const cpu_approx_modes exynosm1_approx_modes =
500 AARCH64_APPROX_NONE, /* division */
501 AARCH64_APPROX_ALL, /* sqrt */
502 AARCH64_APPROX_ALL /* recip_sqrt */
505 /* Approximation modes for X-Gene 1. */
506 static const cpu_approx_modes xgene1_approx_modes =
508 AARCH64_APPROX_NONE, /* division */
509 AARCH64_APPROX_NONE, /* sqrt */
510 AARCH64_APPROX_ALL /* recip_sqrt */
513 static const struct tune_params generic_tunings =
515 &cortexa57_extra_costs,
516 &generic_addrcost_table,
517 &generic_regmove_cost,
518 &generic_vector_cost,
519 &generic_branch_cost,
520 &generic_approx_modes,
523 AARCH64_FUSE_NOTHING, /* fusible_ops */
524 8, /* function_align. */
527 2, /* int_reassoc_width. */
528 4, /* fp_reassoc_width. */
529 1, /* vec_reassoc_width. */
530 2, /* min_div_recip_mul_sf. */
531 2, /* min_div_recip_mul_df. */
532 0, /* max_case_values. */
533 0, /* cache_line_size. */
534 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
535 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
538 static const struct tune_params cortexa35_tunings =
540 &cortexa53_extra_costs,
541 &generic_addrcost_table,
542 &cortexa53_regmove_cost,
543 &generic_vector_cost,
544 &cortexa57_branch_cost,
545 &generic_approx_modes,
548 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
549 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
550 16, /* function_align. */
553 2, /* int_reassoc_width. */
554 4, /* fp_reassoc_width. */
555 1, /* vec_reassoc_width. */
556 2, /* min_div_recip_mul_sf. */
557 2, /* min_div_recip_mul_df. */
558 0, /* max_case_values. */
559 0, /* cache_line_size. */
560 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
561 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
564 static const struct tune_params cortexa53_tunings =
566 &cortexa53_extra_costs,
567 &generic_addrcost_table,
568 &cortexa53_regmove_cost,
569 &generic_vector_cost,
570 &cortexa57_branch_cost,
571 &generic_approx_modes,
574 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
575 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
576 16, /* function_align. */
579 2, /* int_reassoc_width. */
580 4, /* fp_reassoc_width. */
581 1, /* vec_reassoc_width. */
582 2, /* min_div_recip_mul_sf. */
583 2, /* min_div_recip_mul_df. */
584 0, /* max_case_values. */
585 0, /* cache_line_size. */
586 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
587 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
590 static const struct tune_params cortexa57_tunings =
592 &cortexa57_extra_costs,
593 &cortexa57_addrcost_table,
594 &cortexa57_regmove_cost,
595 &cortexa57_vector_cost,
596 &cortexa57_branch_cost,
597 &generic_approx_modes,
600 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
601 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
602 16, /* function_align. */
605 2, /* int_reassoc_width. */
606 4, /* fp_reassoc_width. */
607 1, /* vec_reassoc_width. */
608 2, /* min_div_recip_mul_sf. */
609 2, /* min_div_recip_mul_df. */
610 0, /* max_case_values. */
611 0, /* cache_line_size. */
612 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
613 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
616 static const struct tune_params cortexa72_tunings =
618 &cortexa57_extra_costs,
619 &cortexa57_addrcost_table,
620 &cortexa57_regmove_cost,
621 &cortexa57_vector_cost,
622 &cortexa57_branch_cost,
623 &generic_approx_modes,
626 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
627 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
628 16, /* function_align. */
631 2, /* int_reassoc_width. */
632 4, /* fp_reassoc_width. */
633 1, /* vec_reassoc_width. */
634 2, /* min_div_recip_mul_sf. */
635 2, /* min_div_recip_mul_df. */
636 0, /* max_case_values. */
637 0, /* cache_line_size. */
638 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
639 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
642 static const struct tune_params cortexa73_tunings =
644 &cortexa57_extra_costs,
645 &cortexa57_addrcost_table,
646 &cortexa57_regmove_cost,
647 &cortexa57_vector_cost,
648 &cortexa57_branch_cost,
649 &generic_approx_modes,
650 4, /* memmov_cost. */
652 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
653 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
654 16, /* function_align. */
657 2, /* int_reassoc_width. */
658 4, /* fp_reassoc_width. */
659 1, /* vec_reassoc_width. */
660 2, /* min_div_recip_mul_sf. */
661 2, /* min_div_recip_mul_df. */
662 0, /* max_case_values. */
663 0, /* cache_line_size. */
664 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
665 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
668 static const struct tune_params exynosm1_tunings =
670 &exynosm1_extra_costs,
671 &exynosm1_addrcost_table,
672 &exynosm1_regmove_cost,
673 &exynosm1_vector_cost,
674 &generic_branch_cost,
675 &exynosm1_approx_modes,
678 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
679 4, /* function_align. */
682 2, /* int_reassoc_width. */
683 4, /* fp_reassoc_width. */
684 1, /* vec_reassoc_width. */
685 2, /* min_div_recip_mul_sf. */
686 2, /* min_div_recip_mul_df. */
687 48, /* max_case_values. */
688 64, /* cache_line_size. */
689 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
690 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
693 static const struct tune_params thunderx_tunings =
695 &thunderx_extra_costs,
696 &generic_addrcost_table,
697 &thunderx_regmove_cost,
698 &thunderx_vector_cost,
699 &generic_branch_cost,
700 &generic_approx_modes,
703 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
704 8, /* function_align. */
707 2, /* int_reassoc_width. */
708 4, /* fp_reassoc_width. */
709 1, /* vec_reassoc_width. */
710 2, /* min_div_recip_mul_sf. */
711 2, /* min_div_recip_mul_df. */
712 0, /* max_case_values. */
713 0, /* cache_line_size. */
714 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
715 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags. */
718 static const struct tune_params xgene1_tunings =
721 &xgene1_addrcost_table,
722 &xgene1_regmove_cost,
724 &generic_branch_cost,
725 &xgene1_approx_modes,
728 AARCH64_FUSE_NOTHING, /* fusible_ops */
729 16, /* function_align. */
731 16, /* loop_align. */
732 2, /* int_reassoc_width. */
733 4, /* fp_reassoc_width. */
734 1, /* vec_reassoc_width. */
735 2, /* min_div_recip_mul_sf. */
736 2, /* min_div_recip_mul_df. */
737 0, /* max_case_values. */
738 0, /* cache_line_size. */
739 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
743 static const struct tune_params qdf24xx_tunings =
745 &qdf24xx_extra_costs,
746 &qdf24xx_addrcost_table,
747 &qdf24xx_regmove_cost,
748 &generic_vector_cost,
749 &generic_branch_cost,
750 &generic_approx_modes,
753 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
754 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
755 16, /* function_align. */
757 16, /* loop_align. */
758 2, /* int_reassoc_width. */
759 4, /* fp_reassoc_width. */
760 1, /* vec_reassoc_width. */
761 2, /* min_div_recip_mul_sf. */
762 2, /* min_div_recip_mul_df. */
763 0, /* max_case_values. */
764 64, /* cache_line_size. */
765 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
766 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
769 static const struct tune_params vulcan_tunings =
772 &vulcan_addrcost_table,
773 &vulcan_regmove_cost,
776 &generic_approx_modes,
777 4, /* memmov_cost. */
779 AARCH64_FUSE_NOTHING, /* fusible_ops. */
780 16, /* function_align. */
782 16, /* loop_align. */
783 3, /* int_reassoc_width. */
784 2, /* fp_reassoc_width. */
785 2, /* vec_reassoc_width. */
786 2, /* min_div_recip_mul_sf. */
787 2, /* min_div_recip_mul_df. */
788 0, /* max_case_values. */
789 64, /* cache_line_size. */
790 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
791 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
794 /* Support for fine-grained override of the tuning structures. */
795 struct aarch64_tuning_override_function
798 void (*parse_override)(const char*, struct tune_params*);
801 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
802 static void aarch64_parse_tune_string (const char*, struct tune_params*);
804 static const struct aarch64_tuning_override_function
805 aarch64_tuning_override_functions[] =
807 { "fuse", aarch64_parse_fuse_string },
808 { "tune", aarch64_parse_tune_string },
812 /* A processor implementing AArch64. */
815 const char *const name;
816 enum aarch64_processor ident;
817 enum aarch64_processor sched_core;
818 enum aarch64_arch arch;
819 unsigned architecture_version;
820 const unsigned long flags;
821 const struct tune_params *const tune;
824 /* Architectures implementing AArch64. */
825 static const struct processor all_architectures[] =
827 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
828 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
829 #include "aarch64-arches.def"
831 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
834 /* Processor cores implementing AArch64. */
835 static const struct processor all_cores[] =
837 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
838 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
839 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
840 FLAGS, &COSTS##_tunings},
841 #include "aarch64-cores.def"
843 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
844 AARCH64_FL_FOR_ARCH8, &generic_tunings},
845 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
849 /* Target specification. These are populated by the -march, -mtune, -mcpu
850 handling code or by target attributes. */
851 static const struct processor *selected_arch;
852 static const struct processor *selected_cpu;
853 static const struct processor *selected_tune;
855 /* The current tuning set. */
856 struct tune_params aarch64_tune_params = generic_tunings;
858 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
860 /* An ISA extension in the co-processor and main instruction set space. */
861 struct aarch64_option_extension
863 const char *const name;
864 const unsigned long flags_on;
865 const unsigned long flags_off;
868 typedef enum aarch64_cond_code
870 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
871 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
872 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
876 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
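/* For example, AARCH64_EQ (0) inverts to AARCH64_NE (1) and AARCH64_GE (10)
   to AARCH64_LT (11): flipping the low bit pairs each condition with its
   logical inverse.  */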
878 /* The condition codes of the processor, and the inverse function. */
879 static const char * const aarch64_condition_codes[] =
881 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
882 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
885 /* Generate code to enable conditional branches in functions over 1 MiB. */
887 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
888 const char * branch_format)
890 rtx_code_label * tmp_label = gen_label_rtx ();
893 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
894 CODE_LABEL_NUMBER (tmp_label));
895 const char *label_ptr = targetm.strip_name_encoding (label_buf);
896 rtx dest_label = operands[pos_label];
897 operands[pos_label] = tmp_label;
899 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
900 output_asm_insn (buffer, operands);
902 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
903 operands[pos_label] = dest_label;
904 output_asm_insn (buffer, operands);
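/* An illustrative sketch of the sequence this produces (the caller is
   expected to pass an already-inverted conditional branch as BRANCH_FORMAT):

       <inverted conditional branch>   <local label>
       b       <original destination>
   <local label>:                                  */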
909 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
911 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
912 if (TARGET_GENERAL_REGS_ONLY)
913 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
915 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
918 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
919 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
920 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
921 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
922 cost (in this case the best class is the lowest cost one). Using ALL_REGS
923 irrespective of its cost results in bad allocations with many redundant
924 int<->FP moves which are expensive on various cores.
925 To avoid this we don't allow ALL_REGS as the allocno class, but force a
926 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
927 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
928 Otherwise set the allocno class depending on the mode.
929 The result of this is that it is no longer inefficient to have a higher
930 memory move cost than the register move cost.
934 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
935 reg_class_t best_class)
937 enum machine_mode mode;
939 if (allocno_class != ALL_REGS)
940 return allocno_class;
942 if (best_class != ALL_REGS)
945 mode = PSEUDO_REGNO_MODE (regno);
946 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
950 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
952 if (GET_MODE_UNIT_SIZE (mode) == 4)
953 return aarch64_tune_params.min_div_recip_mul_sf;
954 return aarch64_tune_params.min_div_recip_mul_df;
958 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
959 enum machine_mode mode)
961 if (VECTOR_MODE_P (mode))
962 return aarch64_tune_params.vec_reassoc_width;
963 if (INTEGRAL_MODE_P (mode))
964 return aarch64_tune_params.int_reassoc_width;
965 if (FLOAT_MODE_P (mode))
966 return aarch64_tune_params.fp_reassoc_width;
970 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
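/* Concretely, x0-x30 map to DWARF numbers 0-30, the stack pointer to 31 and
   v0-v31 to 64-95, following the AArch64 DWARF register numbering.  */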
972 aarch64_dbx_register_number (unsigned regno)
974 if (GP_REGNUM_P (regno))
975 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
976 else if (regno == SP_REGNUM)
977 return AARCH64_DWARF_SP;
978 else if (FP_REGNUM_P (regno))
979 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
981 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
982 equivalent DWARF register. */
983 return DWARF_FRAME_REGISTERS;
986 /* Return TRUE if MODE is any of the large INT modes. */
988 aarch64_vect_struct_mode_p (machine_mode mode)
990 return mode == OImode || mode == CImode || mode == XImode;
993 /* Return TRUE if MODE is any of the vector modes. */
995 aarch64_vector_mode_p (machine_mode mode)
997 return aarch64_vector_mode_supported_p (mode)
998 || aarch64_vect_struct_mode_p (mode);
1001 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1003 aarch64_array_mode_supported_p (machine_mode mode,
1004 unsigned HOST_WIDE_INT nelems)
1007 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1008 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1009 && (nelems >= 2 && nelems <= 4))
1015 /* Implement HARD_REGNO_NREGS. */
1018 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1020 switch (aarch64_regno_regclass (regno))
1024 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1026 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1031 /* Implement HARD_REGNO_MODE_OK. */
1034 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1036 if (GET_MODE_CLASS (mode) == MODE_CC)
1037 return regno == CC_REGNUM;
1039 if (regno == SP_REGNUM)
1040 /* The purpose of comparing with ptr_mode is to support the
1041 global register variable associated with the stack pointer
1042 register via the syntax of asm ("wsp") in ILP32. */
1043 return mode == Pmode || mode == ptr_mode;
1045 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1046 return mode == Pmode;
1048 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1051 if (FP_REGNUM_P (regno))
1053 if (aarch64_vect_struct_mode_p (mode))
1055 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1063 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1065 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1068 /* Handle modes that fit within single registers. */
1069 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1071 if (GET_MODE_SIZE (mode) >= 4)
1076 /* Fall back to generic for multi-reg and very large modes. */
1078 return choose_hard_reg_mode (regno, nregs, false);
1081 /* Return true if calls to DECL should be treated as
1082 long-calls (i.e. called via a register). */
1084 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1089 /* Return true if calls to symbol-ref SYM should be treated as
1090 long-calls (i.e. called via a register). */
1092 aarch64_is_long_call_p (rtx sym)
1094 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1097 /* Return true if calls to symbol-ref SYM should not go through
1101 aarch64_is_noplt_call_p (rtx sym)
1103 const_tree decl = SYMBOL_REF_DECL (sym);
1108 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1109 && !targetm.binds_local_p (decl))
1115 /* Return true if the offsets to a zero/sign-extract operation
1116 represent an expression that matches an extend operation. The
1117 operands represent the parameters from
1119 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1121 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1124 HOST_WIDE_INT mult_val, extract_val;
1126 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1129 mult_val = INTVAL (mult_imm);
1130 extract_val = INTVAL (extract_imm);
1133 && extract_val < GET_MODE_BITSIZE (mode)
1134 && exact_log2 (extract_val & ~7) > 0
1135 && (extract_val & 7) <= 4
1136 && mult_val == (1 << (extract_val & 7)))
1142 /* Emit an insn that's a simple single-set. Both the operands must be
1143 known to be valid. */
1145 emit_set_insn (rtx x, rtx y)
1147 return emit_insn (gen_rtx_SET (x, y));
1150 /* X and Y are two things to compare using CODE. Emit the compare insn and
1151 return the rtx for register 0 in the proper mode. */
1153 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1155 machine_mode mode = SELECT_CC_MODE (code, x, y);
1156 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1158 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1162 /* Build the SYMBOL_REF for __tls_get_addr. */
1164 static GTY(()) rtx tls_get_addr_libfunc;
1167 aarch64_tls_get_addr (void)
1169 if (!tls_get_addr_libfunc)
1170 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1171 return tls_get_addr_libfunc;
1174 /* Return the TLS model to use for ADDR. */
1176 static enum tls_model
1177 tls_symbolic_operand_type (rtx addr)
1179 enum tls_model tls_kind = TLS_MODEL_NONE;
1182 if (GET_CODE (addr) == CONST)
1184 split_const (addr, &sym, &addend);
1185 if (GET_CODE (sym) == SYMBOL_REF)
1186 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1188 else if (GET_CODE (addr) == SYMBOL_REF)
1189 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1194 /* We'll allow lo_sum's in addresses in our legitimate addresses
1195 so that combine would take care of combining addresses where
1196 necessary, but for generation purposes, we'll generate the address
1199 tmp = hi (symbol_ref); adrp x1, foo
1200 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1204 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1205 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1209 Load TLS symbol, depending on TLS mechanism and TLS access model.
1211 Global Dynamic - Traditional TLS:
1212 adrp tmp, :tlsgd:imm
1213 add dest, tmp, #:tlsgd_lo12:imm
1216 Global Dynamic - TLS Descriptors:
1217 adrp dest, :tlsdesc:imm
1218 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1219 add dest, dest, #:tlsdesc_lo12:imm
1226 adrp tmp, :gottprel:imm
1227 ldr dest, [tmp, #:gottprel_lo12:imm]
1232 add t0, tp, #:tprel_hi12:imm, lsl #12
1233 add t0, t0, #:tprel_lo12_nc:imm
1237 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1238 enum aarch64_symbol_type type)
1242 case SYMBOL_SMALL_ABSOLUTE:
1244 /* In ILP32, the mode of dest can be either SImode or DImode. */
1246 machine_mode mode = GET_MODE (dest);
1248 gcc_assert (mode == Pmode || mode == ptr_mode);
1250 if (can_create_pseudo_p ())
1251 tmp_reg = gen_reg_rtx (mode);
1253 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1254 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1258 case SYMBOL_TINY_ABSOLUTE:
1259 emit_insn (gen_rtx_SET (dest, imm));
1262 case SYMBOL_SMALL_GOT_28K:
1264 machine_mode mode = GET_MODE (dest);
1265 rtx gp_rtx = pic_offset_table_rtx;
1269 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1270 here before rtl expansion. Tree IVOPT will generate rtl patterns to
1271 decide rtx costs, in which case pic_offset_table_rtx is not
1272 initialized. In that case there is no need to generate the first adrp
1273 instruction as the final cost for global variable access is
1277 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we are
1278 using the page base as the GOT base, the first page may be wasted;
1279 in the worst scenario there is only 28K of space for the GOT).
1281 The generated instruction sequence for accessing a global variable
1284 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1286 Only one instruction is needed. But we must initialize
1287 pic_offset_table_rtx properly. We generate an initialization insn for
1288 every global access, and allow CSE to remove the redundant ones.
1290 The final instruction sequence will look like the following
1291 for multiple global variable accesses.
1293 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1295 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1296 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1297 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1300 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1301 crtl->uses_pic_offset_table = 1;
1302 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1304 if (mode != GET_MODE (gp_rtx))
1305 gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
1308 if (mode == ptr_mode)
1311 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1313 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1315 mem = XVECEXP (SET_SRC (insn), 0, 0);
1319 gcc_assert (mode == Pmode);
1321 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1322 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1325 /* The operand is expected to be a MEM. Whenever the related insn
1326 pattern changes, the above code which calculates mem should be
1328 gcc_assert (GET_CODE (mem) == MEM);
1329 MEM_READONLY_P (mem) = 1;
1330 MEM_NOTRAP_P (mem) = 1;
1335 case SYMBOL_SMALL_GOT_4G:
1337 /* In ILP32, the mode of dest can be either SImode or DImode,
1338 while the got entry is always of SImode size. The mode of
1339 dest depends on how dest is used: if dest is assigned to a
1340 pointer (e.g. in the memory), it has SImode; it may have
1341 DImode if dest is dereferenced to access the memory.
1342 This is why we have to handle three different ldr_got_small
1343 patterns here (two patterns for ILP32). */
1348 machine_mode mode = GET_MODE (dest);
1350 if (can_create_pseudo_p ())
1351 tmp_reg = gen_reg_rtx (mode);
1353 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1354 if (mode == ptr_mode)
1357 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1359 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1361 mem = XVECEXP (SET_SRC (insn), 0, 0);
1365 gcc_assert (mode == Pmode);
1367 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1368 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1371 gcc_assert (GET_CODE (mem) == MEM);
1372 MEM_READONLY_P (mem) = 1;
1373 MEM_NOTRAP_P (mem) = 1;
1378 case SYMBOL_SMALL_TLSGD:
1381 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
1384 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
1385 insns = get_insns ();
1388 RTL_CONST_CALL_P (insns) = 1;
1389 emit_libcall_block (insns, dest, result, imm);
1393 case SYMBOL_SMALL_TLSDESC:
1395 machine_mode mode = GET_MODE (dest);
1396 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1399 gcc_assert (mode == Pmode || mode == ptr_mode);
1401 /* In ILP32, the got entry is always of SImode size. Unlike
1402 small GOT, the dest is fixed at reg 0. */
1404 emit_insn (gen_tlsdesc_small_si (imm));
1406 emit_insn (gen_tlsdesc_small_di (imm));
1407 tp = aarch64_load_tp (NULL);
1410 tp = gen_lowpart (mode, tp);
1412 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1413 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1417 case SYMBOL_SMALL_TLSIE:
1419 /* In ILP32, the mode of dest can be either SImode or DImode,
1420 while the got entry is always of SImode size. The mode of
1421 dest depends on how dest is used: if dest is assigned to a
1422 pointer (e.g. in the memory), it has SImode; it may have
1423 DImode if dest is dereferenced to access the memory.
1424 This is why we have to handle three different tlsie_small
1425 patterns here (two patterns for ILP32). */
1426 machine_mode mode = GET_MODE (dest);
1427 rtx tmp_reg = gen_reg_rtx (mode);
1428 rtx tp = aarch64_load_tp (NULL);
1430 if (mode == ptr_mode)
1433 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1436 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1437 tp = gen_lowpart (mode, tp);
1442 gcc_assert (mode == Pmode);
1443 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1446 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1447 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1451 case SYMBOL_TLSLE12:
1452 case SYMBOL_TLSLE24:
1453 case SYMBOL_TLSLE32:
1454 case SYMBOL_TLSLE48:
1456 machine_mode mode = GET_MODE (dest);
1457 rtx tp = aarch64_load_tp (NULL);
1460 tp = gen_lowpart (mode, tp);
1464 case SYMBOL_TLSLE12:
1465 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1468 case SYMBOL_TLSLE24:
1469 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1472 case SYMBOL_TLSLE32:
1473 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1475 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1478 case SYMBOL_TLSLE48:
1479 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1481 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1488 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1492 case SYMBOL_TINY_GOT:
1493 emit_insn (gen_ldr_got_tiny (dest, imm));
1496 case SYMBOL_TINY_TLSIE:
1498 machine_mode mode = GET_MODE (dest);
1499 rtx tp = aarch64_load_tp (NULL);
1501 if (mode == ptr_mode)
1504 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1507 tp = gen_lowpart (mode, tp);
1508 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1513 gcc_assert (mode == Pmode);
1514 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1517 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1526 /* Emit a move from SRC to DEST. Assume that the move expanders can
1527 handle all moves if !can_create_pseudo_p (). The distinction is
1528 important because, unlike emit_move_insn, the move expanders know
1529 how to force Pmode objects into the constant pool even when the
1530 constant pool address is not itself legitimate. */
1532 aarch64_emit_move (rtx dest, rtx src)
1534 return (can_create_pseudo_p ()
1535 ? emit_move_insn (dest, src)
1536 : emit_move_insn_1 (dest, src));
1539 /* Split a 128-bit move operation into two 64-bit move operations,
1540 taking care to handle partial overlap of register to register
1541 copies. Special cases are needed when moving between GP regs and
1542 FP regs. SRC can be a register, constant or memory; DST a register
1543 or memory. If either operand is memory it must not have any side
1546 aarch64_split_128bit_move (rtx dst, rtx src)
1551 machine_mode mode = GET_MODE (dst);
1553 gcc_assert (mode == TImode || mode == TFmode);
1554 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1555 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1557 if (REG_P (dst) && REG_P (src))
1559 int src_regno = REGNO (src);
1560 int dst_regno = REGNO (dst);
1562 /* Handle FP <-> GP regs. */
1563 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1565 src_lo = gen_lowpart (word_mode, src);
1566 src_hi = gen_highpart (word_mode, src);
1570 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1571 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1575 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1576 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1580 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1582 dst_lo = gen_lowpart (word_mode, dst);
1583 dst_hi = gen_highpart (word_mode, dst);
1587 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1588 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1592 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1593 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1599 dst_lo = gen_lowpart (word_mode, dst);
1600 dst_hi = gen_highpart (word_mode, dst);
1601 src_lo = gen_lowpart (word_mode, src);
1602 src_hi = gen_highpart_mode (word_mode, mode, src);
1604 /* At most one pairing may overlap. */
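/* For example (illustrative): when copying a TImode value from the pair
   (x0,x1) into (x1,x2), dst_lo (x1) overlaps src_hi (x1), so the high
   halves must be moved first to avoid clobbering the source.  */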
1605 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1607 aarch64_emit_move (dst_hi, src_hi);
1608 aarch64_emit_move (dst_lo, src_lo);
1612 aarch64_emit_move (dst_lo, src_lo);
1613 aarch64_emit_move (dst_hi, src_hi);
1618 aarch64_split_128bit_move_p (rtx dst, rtx src)
1620 return (! REG_P (src)
1621 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1624 /* Split a complex SIMD combine. */
1627 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1629 machine_mode src_mode = GET_MODE (src1);
1630 machine_mode dst_mode = GET_MODE (dst);
1632 gcc_assert (VECTOR_MODE_P (dst_mode));
1634 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1636 rtx (*gen) (rtx, rtx, rtx);
1641 gen = gen_aarch64_simd_combinev8qi;
1644 gen = gen_aarch64_simd_combinev4hi;
1647 gen = gen_aarch64_simd_combinev2si;
1650 gen = gen_aarch64_simd_combinev4hf;
1653 gen = gen_aarch64_simd_combinev2sf;
1656 gen = gen_aarch64_simd_combinedi;
1659 gen = gen_aarch64_simd_combinedf;
1665 emit_insn (gen (dst, src1, src2));
1670 /* Split a complex SIMD move. */
1673 aarch64_split_simd_move (rtx dst, rtx src)
1675 machine_mode src_mode = GET_MODE (src);
1676 machine_mode dst_mode = GET_MODE (dst);
1678 gcc_assert (VECTOR_MODE_P (dst_mode));
1680 if (REG_P (dst) && REG_P (src))
1682 rtx (*gen) (rtx, rtx);
1684 gcc_assert (VECTOR_MODE_P (src_mode));
1689 gen = gen_aarch64_split_simd_movv16qi;
1692 gen = gen_aarch64_split_simd_movv8hi;
1695 gen = gen_aarch64_split_simd_movv4si;
1698 gen = gen_aarch64_split_simd_movv2di;
1701 gen = gen_aarch64_split_simd_movv8hf;
1704 gen = gen_aarch64_split_simd_movv4sf;
1707 gen = gen_aarch64_split_simd_movv2df;
1713 emit_insn (gen (dst, src));
1719 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1720 machine_mode ymode, rtx y)
1722 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1723 gcc_assert (r != NULL);
1724 return rtx_equal_p (x, r);
1729 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1731 if (can_create_pseudo_p ())
1732 return force_reg (mode, value);
1735 x = aarch64_emit_move (x, value);
1742 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1744 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1747 /* Load the full offset into a register. This
1748 might be improvable in the future. */
1749 high = GEN_INT (offset);
1751 high = aarch64_force_temporary (mode, temp, high);
1752 reg = aarch64_force_temporary (mode, temp,
1753 gen_rtx_PLUS (mode, high, reg));
1755 return plus_constant (mode, reg, offset);
1759 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1763 unsigned HOST_WIDE_INT val, val2, mask;
1764 int one_match, zero_match;
1769 if (aarch64_move_imm (val, mode))
1772 emit_insn (gen_rtx_SET (dest, imm));
1776 if ((val >> 32) == 0 || mode == SImode)
1780 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1782 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1783 GEN_INT ((val >> 16) & 0xffff)));
1785 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1786 GEN_INT ((val >> 16) & 0xffff)));
1791 /* Remaining cases are all for DImode. */
1794 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1795 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1796 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1797 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1799 if (zero_match != 2 && one_match != 2)
1801 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1802 For a 64-bit bitmask try whether changing 16 bits to all ones or
1803 zeroes creates a valid bitmask. To check any repeated bitmask,
1804 try using 16 bits from the other 32-bit half of val. */
1806 for (i = 0; i < 64; i += 16, mask <<= 16)
1809 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1812 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1814 val2 = val2 & ~mask;
1815 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1816 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1823 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1824 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1825 GEN_INT ((val >> i) & 0xffff)));
1831 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1832 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1833 otherwise skip zero bits. */
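/* Worked example (illustrative): val = 0x1234000000005678 has two all-zero
   16-bit chunks, so this path emits
       movz    dest, #0x5678
       movk    dest, #0x1234, lsl #48
   for a total of two instructions.  */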
1837 val2 = one_match > zero_match ? ~val : val;
1838 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1841 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1842 ? (val | ~(mask << i))
1843 : (val & (mask << i)))));
1844 for (i += 16; i < 64; i += 16)
1846 if ((val2 & (mask << i)) == 0)
1849 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1850 GEN_INT ((val >> i) & 0xffff)));
1859 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1861 machine_mode mode = GET_MODE (dest);
1863 gcc_assert (mode == SImode || mode == DImode);
1865 /* Check on what type of symbol it is. */
1866 if (GET_CODE (imm) == SYMBOL_REF
1867 || GET_CODE (imm) == LABEL_REF
1868 || GET_CODE (imm) == CONST)
1870 rtx mem, base, offset;
1871 enum aarch64_symbol_type sty;
1873 /* If we have (const (plus symbol offset)), separate out the offset
1874 before we start classifying the symbol. */
1875 split_const (imm, &base, &offset);
1877 sty = aarch64_classify_symbol (base, offset);
1880 case SYMBOL_FORCE_TO_MEM:
1881 if (offset != const0_rtx
1882 && targetm.cannot_force_const_mem (mode, imm))
1884 gcc_assert (can_create_pseudo_p ());
1885 base = aarch64_force_temporary (mode, dest, base);
1886 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1887 aarch64_emit_move (dest, base);
1891 mem = force_const_mem (ptr_mode, imm);
1894 /* If we aren't generating PC relative literals, then
1895 we need to expand the literal pool access carefully.
1896 This is something that needs to be done in a number
1897 of places, so could well live as a separate function. */
1898 if (!aarch64_pcrelative_literal_loads)
1900 gcc_assert (can_create_pseudo_p ());
1901 base = gen_reg_rtx (ptr_mode);
1902 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1903 mem = gen_rtx_MEM (ptr_mode, base);
1906 if (mode != ptr_mode)
1907 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1909 emit_insn (gen_rtx_SET (dest, mem));
1913 case SYMBOL_SMALL_TLSGD:
1914 case SYMBOL_SMALL_TLSDESC:
1915 case SYMBOL_SMALL_TLSIE:
1916 case SYMBOL_SMALL_GOT_28K:
1917 case SYMBOL_SMALL_GOT_4G:
1918 case SYMBOL_TINY_GOT:
1919 case SYMBOL_TINY_TLSIE:
1920 if (offset != const0_rtx)
1922 gcc_assert (can_create_pseudo_p ());
1923 base = aarch64_force_temporary (mode, dest, base);
1924 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1925 aarch64_emit_move (dest, base);
1930 case SYMBOL_SMALL_ABSOLUTE:
1931 case SYMBOL_TINY_ABSOLUTE:
1932 case SYMBOL_TLSLE12:
1933 case SYMBOL_TLSLE24:
1934 case SYMBOL_TLSLE32:
1935 case SYMBOL_TLSLE48:
1936 aarch64_load_symref_appropriately (dest, imm, sty);
1944 if (!CONST_INT_P (imm))
1946 if (GET_CODE (imm) == HIGH)
1947 emit_insn (gen_rtx_SET (dest, imm));
1950 rtx mem = force_const_mem (mode, imm);
1952 emit_insn (gen_rtx_SET (dest, mem));
1958 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1961 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
1962 temporary value if necessary. FRAME_RELATED_P should be true if
1963 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
1964 to the generated instructions. If SCRATCHREG is known to hold
1965 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
1968 Since this function may be used to adjust the stack pointer, we must
1969 ensure that it cannot cause transient stack deallocation (for example
1970 by first incrementing SP and then decrementing when adjusting by a
1971 large immediate). */
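/* For example (illustrative), lowering SP by 0x101008 takes the two-addition
   path below and is emitted as
       sub     sp, sp, #8
       sub     sp, sp, #0x101000
   so SP only ever moves towards its final value and stack memory is never
   transiently deallocated.  */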
1974 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
1975 HOST_WIDE_INT delta, bool frame_related_p,
1978 HOST_WIDE_INT mdelta = abs_hwi (delta);
1979 rtx this_rtx = gen_rtx_REG (mode, regnum);
1985 /* Single instruction adjustment. */
1986 if (aarch64_uimm12_shift (mdelta))
1988 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
1989 RTX_FRAME_RELATED_P (insn) = frame_related_p;
1993 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
1994 Only do this if mdelta is not a 16-bit move as adjusting using a move
1996 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
1998 HOST_WIDE_INT low_off = mdelta & 0xfff;
2000 low_off = delta < 0 ? -low_off : low_off;
2001 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2002 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2003 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2004 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2008 /* Emit a move immediate if required and an addition/subtraction. */
2009 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2011 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2012 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2013 : gen_add2_insn (this_rtx, scratch_rtx));
2014 if (frame_related_p)
2016 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2017 rtx adj = plus_constant (mode, this_rtx, delta);
2018 add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2023 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2024 HOST_WIDE_INT delta)
2026 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2030 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2032 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2033 true, emit_move_imm);
2037 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2039 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2040 frame_related_p, true);
2044 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2045 tree exp ATTRIBUTE_UNUSED)
2047 /* Currently, always true. */
2051 /* Implement TARGET_PASS_BY_REFERENCE. */
2054 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2057 bool named ATTRIBUTE_UNUSED)
2060 machine_mode dummymode;
2063 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2064 size = (mode == BLKmode && type)
2065 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2067 /* Aggregates are passed by reference based on their size. */
2068 if (type && AGGREGATE_TYPE_P (type))
2070 size = int_size_in_bytes (type);
2073 /* Variable sized arguments are always passed by reference. */
2077 /* Can this be a candidate to be passed in fp/simd register(s)? */
2078 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2083 /* Arguments which are variable sized or larger than 2 registers are
2084 passed by reference unless they are a homogeneous floating point
2086 return size > 2 * UNITS_PER_WORD;
2089 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2091 aarch64_return_in_msb (const_tree valtype)
2093 machine_mode dummy_mode;
2096 /* Never happens in little-endian mode. */
2097 if (!BYTES_BIG_ENDIAN)
2100 /* Only composite types smaller than or equal to 16 bytes can
2101 be potentially returned in registers. */
2102 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2103 || int_size_in_bytes (valtype) <= 0
2104 || int_size_in_bytes (valtype) > 16)
2107 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2108 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2109 is always passed/returned in the least significant bits of fp/simd
2111 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2112 &dummy_mode, &dummy_int, NULL))
2118 /* Implement TARGET_FUNCTION_VALUE.
2119 Define how to find the value returned by a function. */
2122 aarch64_function_value (const_tree type, const_tree func,
2123 bool outgoing ATTRIBUTE_UNUSED)
2128 machine_mode ag_mode;
2130 mode = TYPE_MODE (type);
2131 if (INTEGRAL_TYPE_P (type))
2132 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2134 if (aarch64_return_in_msb (type))
2136 HOST_WIDE_INT size = int_size_in_bytes (type);
2138 if (size % UNITS_PER_WORD != 0)
2140 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2141 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2145 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2146 &ag_mode, &count, NULL))
2148 if (!aarch64_composite_type_p (type, mode))
2150 gcc_assert (count == 1 && mode == ag_mode);
2151 return gen_rtx_REG (mode, V0_REGNUM);
2158 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2159 for (i = 0; i < count; i++)
2161 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2162 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2163 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2164 XVECEXP (par, 0, i) = tmp;
2170 return gen_rtx_REG (mode, R0_REGNUM);
2173 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2174 Return true if REGNO is the number of a hard register in which the values
2175 of called function may come back. */
2178 aarch64_function_value_regno_p (const unsigned int regno)
2180 /* Maximum of 16 bytes can be returned in the general registers. Examples
2181 of 16-byte return values are: 128-bit integers and 16-byte small
2182 structures (excluding homogeneous floating-point aggregates). */
2183 if (regno == R0_REGNUM || regno == R1_REGNUM)
2186 /* Up to four fp/simd registers can return a function value, e.g. a
2187 homogeneous floating-point aggregate having four members. */
2188 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2189 return TARGET_FLOAT;
2194 /* Implement TARGET_RETURN_IN_MEMORY.
2196 If the type T of the result of a function is such that
2198 would require that arg be passed as a value in a register (or set of
2199 registers) according to the parameter passing rules, then the result
2200 is returned in the same registers as would be used for such an
2204 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2207 machine_mode ag_mode;
2210 if (!AGGREGATE_TYPE_P (type)
2211 && TREE_CODE (type) != COMPLEX_TYPE
2212 && TREE_CODE (type) != VECTOR_TYPE)
2213 /* Simple scalar types always returned in registers. */
2216 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2223 /* Types larger than 2 registers returned in memory. */
2224 size = int_size_in_bytes (type);
2225 return (size < 0 || size > 2 * UNITS_PER_WORD);
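/* Illustrative consequences under AAPCS64: a 16-byte plain struct comes back
   in x0/x1 and an HFA of four doubles in d0-d3, while a 24-byte struct is
   returned in memory through the indirect result register x8.  */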
2229 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2230 const_tree type, int *nregs)
2232 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2233 return aarch64_vfp_is_call_or_return_candidate (mode,
2235 &pcum->aapcs_vfp_rmode,
2240 /* Given MODE and TYPE of a function argument, return the alignment in
2241 bits. The idea is to suppress any stronger alignment requested by
2242 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2243 This is a helper function for local use only. */
2246 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2249 return GET_MODE_ALIGNMENT (mode);
2251 if (integer_zerop (TYPE_SIZE (type)))
2254 gcc_assert (TYPE_MODE (type) == mode);
2256 if (!AGGREGATE_TYPE_P (type))
2257 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2259 if (TREE_CODE (type) == ARRAY_TYPE)
2260 return TYPE_ALIGN (TREE_TYPE (type));
2262 unsigned int alignment = 0;
2263 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2264 if (TREE_CODE (field) == FIELD_DECL)
2265 alignment = std::max (alignment, DECL_ALIGN (field));
2270 /* Layout a function argument according to the AAPCS64 rules. The rule
2271 numbers refer to the rule numbers in the AAPCS64. */
2274 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2276 bool named ATTRIBUTE_UNUSED)
2278 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2279 int ncrn, nvrn, nregs;
2280 bool allocate_ncrn, allocate_nvrn;
2283 /* We need to do this once per argument. */
2284 if (pcum->aapcs_arg_processed)
2287 pcum->aapcs_arg_processed = true;
2289 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2291 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2294 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2295 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2300 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2301 The following code thus handles passing by SIMD/FP registers first. */
2303 nvrn = pcum->aapcs_nvrn;
2305 /* C1 - C5 for floating point, homogeneous floating point aggregates (HFA)
2306 and homogeneous short-vector aggregates (HVA). */
2310 aarch64_err_no_fpadvsimd (mode, "argument");
2312 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2314 pcum->aapcs_nextnvrn = nvrn + nregs;
2315 if (!aarch64_composite_type_p (type, mode))
2317 gcc_assert (nregs == 1);
2318 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2324 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2325 for (i = 0; i < nregs; i++)
2327 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2328 V0_REGNUM + nvrn + i);
2329 tmp = gen_rtx_EXPR_LIST
2331 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2332 XVECEXP (par, 0, i) = tmp;
2334 pcum->aapcs_reg = par;
2340 /* C.3 NSRN is set to 8. */
2341 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2346 ncrn = pcum->aapcs_ncrn;
2347 nregs = size / UNITS_PER_WORD;
2349 /* C6 - C9, though the sign and zero extension semantics are
2350 handled elsewhere. This is the case where the argument fits
2351 entirely in general registers. */
2352 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2355 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2357 /* C.8 if the argument has an alignment of 16 then the NGRN is
2358 rounded up to the next even number. */
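/* For example (illustrative), a 16-byte aligned __int128 passed after a
   single int argument skips x1 and is allocated to the even-numbered
   register pair x2/x3.  */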
2361 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2362 comparison is there because for > 16 * BITS_PER_UNIT
2363 alignment nregs should be > 2 and therefore it should be
2364 passed by reference rather than value. */
2365 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2368 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2371 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2372 A reg is still generated for it, but the caller should be smart
2373 enough not to use it. */
2374 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2375 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2381 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2382 for (i = 0; i < nregs; i++)
2384 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2385 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2386 GEN_INT (i * UNITS_PER_WORD));
2387 XVECEXP (par, 0, i) = tmp;
2389 pcum->aapcs_reg = par;
2392 pcum->aapcs_nextncrn = ncrn + nregs;
2397 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2399 /* The argument is passed on the stack; record the needed number of words for
2400 this argument and align the total size if necessary. */
2402 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2404 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2405 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2406 16 / UNITS_PER_WORD);
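/* A rough illustration of the rules above (hypothetical arguments, not
   taken from the AAPCS64 text): a homogeneous aggregate of two doubles
   uses two consecutive FP registers, expressed as a PARALLEL of DFmode
   registers at byte offsets 0 and 8; a 12-byte integer struct is rounded
   up to 16 bytes and takes two consecutive general registers; and a
   16-byte-aligned argument first rounds NGRN up to an even register
   number, as rule C.8 above requires.  */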
2410 /* Implement TARGET_FUNCTION_ARG. */
2413 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2414 const_tree type, bool named)
2416 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2417 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2419 if (mode == VOIDmode)
2422 aarch64_layout_arg (pcum_v, mode, type, named);
2423 return pcum->aapcs_reg;
2427 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2428 const_tree fntype ATTRIBUTE_UNUSED,
2429 rtx libname ATTRIBUTE_UNUSED,
2430 const_tree fndecl ATTRIBUTE_UNUSED,
2431 unsigned n_named ATTRIBUTE_UNUSED)
2433 pcum->aapcs_ncrn = 0;
2434 pcum->aapcs_nvrn = 0;
2435 pcum->aapcs_nextncrn = 0;
2436 pcum->aapcs_nextnvrn = 0;
2437 pcum->pcs_variant = ARM_PCS_AAPCS64;
2438 pcum->aapcs_reg = NULL_RTX;
2439 pcum->aapcs_arg_processed = false;
2440 pcum->aapcs_stack_words = 0;
2441 pcum->aapcs_stack_size = 0;
2444 && fndecl && TREE_PUBLIC (fndecl)
2445 && fntype && fntype != error_mark_node)
2447 const_tree type = TREE_TYPE (fntype);
2448 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2449 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2450 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2451 &mode, &nregs, NULL))
2452 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2458 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2463 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2464 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2466 aarch64_layout_arg (pcum_v, mode, type, named);
2467 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2468 != (pcum->aapcs_stack_words != 0));
2469 pcum->aapcs_arg_processed = false;
2470 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2471 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2472 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2473 pcum->aapcs_stack_words = 0;
2474 pcum->aapcs_reg = NULL_RTX;
2479 aarch64_function_arg_regno_p (unsigned regno)
2481 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2482 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2485 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2486 PARM_BOUNDARY bits of alignment, but will be given anything up
2487 to STACK_BOUNDARY bits if the type requires it. This makes sure
2488 that both before and after the layout of each argument, the Next
2489 Stacked Argument Address (NSAA) will have a minimum alignment of
2493 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2495 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2496 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
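/* For illustration only (assuming the usual aarch64 values of
   PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128, which are defined
   elsewhere): a char argument with a natural alignment of 8 bits is
   raised to 64 bits, a 16-byte-aligned type keeps its 128 bits, and
   anything stronger is capped at 128 bits.  */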
2499 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2501 Return true if an argument passed on the stack should be padded upwards,
2502 i.e. if the least-significant byte of the stack slot has useful data.
2504 Small aggregate types are placed in the lowest memory address.
2506 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2509 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2511 /* On little-endian targets, the least significant byte of every stack
2512 argument is passed at the lowest byte address of the stack slot. */
2513 if (!BYTES_BIG_ENDIAN)
2516 /* Otherwise, integral, floating-point and pointer types are padded downward:
2517 the least significant byte of a stack argument is passed at the highest
2518 byte address of the stack slot. */
2520 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2521 || POINTER_TYPE_P (type))
2522 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2525 /* Everything else padded upward, i.e. data in first byte of stack slot. */
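/* As a concrete (illustrative) example: on a big-endian target an int
   passed on the stack is padded downward, so its 4 data bytes occupy
   the highest addresses of the 8-byte slot, whereas a 3-byte struct is
   padded upward and starts at the lowest address of its slot.  */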
2529 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2531 It specifies padding for the last (and possibly the only)
2532 element of a block move between registers and memory.  Assuming
2533 the block is in memory, padding upward means that the last
2534 element is padded after its most significant byte, while with
2535 downward padding the last element is padded at its least
2536 significant byte side.
2538 Small aggregates and small complex types are always padded
2541 We don't need to worry about homogeneous floating-point or
2542 short-vector aggregates; their move is not affected by the
2543 padding direction determined here. Regardless of endianness,
2544 each element of such an aggregate is put in the least
2545 significant bits of a fp/simd register.
2547 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2548 register has useful data, and return the opposite if the most
2549 significant byte does. */
2552 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2553 bool first ATTRIBUTE_UNUSED)
2556 /* Small composite types are always padded upward. */
2557 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2559 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2560 : GET_MODE_SIZE (mode));
2561 if (size < 2 * UNITS_PER_WORD)
2565 /* Otherwise, use the default padding. */
2566 return !BYTES_BIG_ENDIAN;
2570 aarch64_libgcc_cmp_return_mode (void)
2575 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2577 /* We use the 12-bit shifted immediate arithmetic instructions, so values
2578 must be a multiple of (1 << 12), i.e. 4096. */
2579 #define ARITH_FACTOR 4096
2581 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2582 #error Cannot use simple address calculation for stack probing
2585 /* The pair of scratch registers used for stack probing. */
2586 #define PROBE_STACK_FIRST_REG 9
2587 #define PROBE_STACK_SECOND_REG 10
2589 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2590 inclusive. These are offsets from the current stack pointer. */
2593 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2595 rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2597 /* See the same assertion on PROBE_INTERVAL above. */
2598 gcc_assert ((first % ARITH_FACTOR) == 0);
2600 /* See if we have a constant small number of probes to generate. If so,
2601 that's the easy case. */
2602 if (size <= PROBE_INTERVAL)
2604 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2606 emit_set_insn (reg1,
2607 plus_constant (ptr_mode,
2608 stack_pointer_rtx, -(first + base)));
2609 emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2612 /* The run-time loop is made up of 8 insns in the generic case while the
2613 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
2614 else if (size <= 4 * PROBE_INTERVAL)
2616 HOST_WIDE_INT i, rem;
2618 emit_set_insn (reg1,
2619 plus_constant (ptr_mode,
2621 -(first + PROBE_INTERVAL)));
2622 emit_stack_probe (reg1);
2624 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2625 it exceeds SIZE. If only two probes are needed, this will not
2626 generate any code. Then probe at FIRST + SIZE. */
2627 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2629 emit_set_insn (reg1,
2630 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2631 emit_stack_probe (reg1);
2634 rem = size - (i - PROBE_INTERVAL);
2637 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2639 emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2640 emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2643 emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2646 /* Otherwise, do the same as above, but in a loop. Note that we must be
2647 extra careful with variables wrapping around because we might be at
2648 the very top (or the very bottom) of the address space and we have
2649 to be able to handle this case properly; in particular, we use an
2650 equality test for the loop condition. */
2653 rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2655 /* Step 1: round SIZE to the previous multiple of the interval. */
2657 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2660 /* Step 2: compute initial and final value of the loop counter. */
2662 /* TEST_ADDR = SP + FIRST. */
2663 emit_set_insn (reg1,
2664 plus_constant (ptr_mode, stack_pointer_rtx, -first));
2666 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2667 emit_set_insn (reg2,
2668 plus_constant (ptr_mode, stack_pointer_rtx,
2669 -(first + rounded_size)));
2676 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2679 while (TEST_ADDR != LAST_ADDR)
2681 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2682 until it is equal to ROUNDED_SIZE. */
2684 if (ptr_mode == DImode)
2685 emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2687 emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2690 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2691 that SIZE is equal to ROUNDED_SIZE. */
2693 if (size != rounded_size)
2695 HOST_WIDE_INT rem = size - rounded_size;
2699 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2701 emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2702 emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2705 emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2709 /* Make sure nothing is scheduled before we are done. */
2710 emit_insn (gen_blockage ());
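/* As a rough sketch of what the simple (size <= PROBE_INTERVAL) case
   above emits (illustrative only; the actual immediates depend on FIRST,
   SIZE and PROBE_INTERVAL):

	sub	x9, sp, #(first + ROUND_UP (size, ARITH_FACTOR))
	str	xzr, [x9, #(ROUND_UP (size, ARITH_FACTOR) - size)]

   i.e. one address computation into the first scratch register followed
   by a single probe store of xzr at FIRST + SIZE.  */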
2713 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2714 absolute addresses. */
2717 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2719 static int labelno = 0;
2723 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2726 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2728 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2730 xops[1] = GEN_INT (PROBE_INTERVAL);
2731 output_asm_insn ("sub\t%0, %0, %1", xops);
2733 /* Probe at TEST_ADDR. */
2734 output_asm_insn ("str\txzr, [%0]", xops);
2736 /* Test if TEST_ADDR == LAST_ADDR. */
2738 output_asm_insn ("cmp\t%0, %1", xops);
2741 fputs ("\tb.ne\t", asm_out_file);
2742 assemble_name_raw (asm_out_file, loop_lab);
2743 fputc ('\n', asm_out_file);
2749 aarch64_frame_pointer_required (void)
2751 /* In aarch64_override_options_after_change
2752 flag_omit_leaf_frame_pointer turns off the frame pointer by
2753 default. Turn it back on now if we've not got a leaf
2755 if (flag_omit_leaf_frame_pointer
2756 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2759 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2760 if (crtl->calls_eh_return)
2766 /* Mark the registers that need to be saved by the callee and calculate
2767 the size of the callee-saved registers area and frame record (both FP
2768 and LR may be omitted). */
2770 aarch64_layout_frame (void)
2772 HOST_WIDE_INT offset = 0;
2773 int regno, last_fp_reg = INVALID_REGNUM;
2775 if (reload_completed && cfun->machine->frame.laid_out)
2778 #define SLOT_NOT_REQUIRED (-2)
2779 #define SLOT_REQUIRED (-1)
2781 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2782 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2784 /* First mark all the registers that really need to be saved... */
2785 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2786 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2788 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2789 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2791 /* ... that includes the eh data registers (if needed)... */
2792 if (crtl->calls_eh_return)
2793 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2794 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2797 /* ... and any callee saved register that dataflow says is live. */
2798 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2799 if (df_regs_ever_live_p (regno)
2800 && (regno == R30_REGNUM
2801 || !call_used_regs[regno]))
2802 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2804 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2805 if (df_regs_ever_live_p (regno)
2806 && !call_used_regs[regno])
2808 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2809 last_fp_reg = regno;
2812 if (frame_pointer_needed)
2814 /* FP and LR are placed in the linkage record. */
2815 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2816 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2817 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2818 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2819 offset += 2 * UNITS_PER_WORD;
2822 /* Now assign stack slots for them. */
2823 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2824 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2826 cfun->machine->frame.reg_offset[regno] = offset;
2827 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2828 cfun->machine->frame.wb_candidate1 = regno;
2829 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2830 cfun->machine->frame.wb_candidate2 = regno;
2831 offset += UNITS_PER_WORD;
2834 HOST_WIDE_INT max_int_offset = offset;
2835 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2836 bool has_align_gap = offset != max_int_offset;
2838 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2839 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2841 /* If there is an alignment gap between integer and fp callee-saves,
2842 allocate the last fp register to it if possible. */
2843 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2845 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2849 cfun->machine->frame.reg_offset[regno] = offset;
2850 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2851 cfun->machine->frame.wb_candidate1 = regno;
2852 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2853 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2854 cfun->machine->frame.wb_candidate2 = regno;
2855 offset += UNITS_PER_WORD;
2858 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2860 cfun->machine->frame.saved_regs_size = offset;
2862 HOST_WIDE_INT varargs_and_saved_regs_size
2863 = offset + cfun->machine->frame.saved_varargs_size;
2865 cfun->machine->frame.hard_fp_offset
2866 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2867 STACK_BOUNDARY / BITS_PER_UNIT);
2869 cfun->machine->frame.frame_size
2870 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2871 + crtl->outgoing_args_size,
2872 STACK_BOUNDARY / BITS_PER_UNIT);
2874 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2876 cfun->machine->frame.initial_adjust = 0;
2877 cfun->machine->frame.final_adjust = 0;
2878 cfun->machine->frame.callee_adjust = 0;
2879 cfun->machine->frame.callee_offset = 0;
2881 HOST_WIDE_INT max_push_offset = 0;
2882 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2883 max_push_offset = 512;
2884 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2885 max_push_offset = 256;
2887 if (cfun->machine->frame.frame_size < max_push_offset
2888 && crtl->outgoing_args_size == 0)
2890 /* Simple, small frame with no outgoing arguments:
2891 stp reg1, reg2, [sp, -frame_size]!
2892 stp reg3, reg4, [sp, 16] */
2893 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2895 else if ((crtl->outgoing_args_size
2896 + cfun->machine->frame.saved_regs_size < 512)
2897 && !(cfun->calls_alloca
2898 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2900 /* Frame with small outgoing arguments:
2901 sub sp, sp, frame_size
2902 stp reg1, reg2, [sp, outgoing_args_size]
2903 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2904 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2905 cfun->machine->frame.callee_offset
2906 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2908 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
2910 /* Frame with large outgoing arguments but a small local area:
2911 stp reg1, reg2, [sp, -hard_fp_offset]!
2912 stp reg3, reg4, [sp, 16]
2913 sub sp, sp, outgoing_args_size */
2914 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
2915 cfun->machine->frame.final_adjust
2916 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2918 else if (!frame_pointer_needed
2919 && varargs_and_saved_regs_size < max_push_offset)
2921 /* Frame with large local area and outgoing arguments (this pushes the
2922 callee-saves first, followed by the locals and outgoing area):
2923 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2924 stp reg3, reg4, [sp, 16]
2925 sub sp, sp, frame_size - varargs_and_saved_regs_size */
2926 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
2927 cfun->machine->frame.final_adjust
2928 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2929 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
2930 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
2934 /* Frame with large local area and outgoing arguments using frame pointer:
2935 sub sp, sp, hard_fp_offset
2936 stp x29, x30, [sp, 0]
2938 stp reg3, reg4, [sp, 16]
2939 sub sp, sp, outgoing_args_size */
2940 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
2941 cfun->machine->frame.final_adjust
2942 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
2945 cfun->machine->frame.laid_out = true;
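/* A worked example (hypothetical function, rough numbers): with a frame
   pointer, saves of x29/x30 and d8, 16 bytes of locals and no outgoing
   arguments, the code above computes reg_offset[x29] = 0,
   reg_offset[x30] = 8, reg_offset[d8] = 16, saved_regs_size = 32,
   hard_fp_offset = 48 and frame_size = 48.  Since the frame is smaller
   than max_push_offset and there are no outgoing arguments, the first
   case applies: callee_adjust = 48 with no separate initial or final
   adjustment.  */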
2948 /* Return true if the register REGNO is saved on entry to
2949 the current function. */
2952 aarch64_register_saved_on_entry (int regno)
2954 return cfun->machine->frame.reg_offset[regno] >= 0;
2957 /* Return the next register up from REGNO up to LIMIT for the callee
2961 aarch64_next_callee_save (unsigned regno, unsigned limit)
2963 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2968 /* Push the register number REGNO of mode MODE to the stack with write-back
2969 adjusting the stack by ADJUSTMENT. */
2972 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2973 HOST_WIDE_INT adjustment)
2975 rtx base_rtx = stack_pointer_rtx;
2978 reg = gen_rtx_REG (mode, regno);
2979 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2980 plus_constant (Pmode, base_rtx, -adjustment));
2981 mem = gen_rtx_MEM (mode, mem);
2983 insn = emit_move_insn (mem, reg);
2984 RTX_FRAME_RELATED_P (insn) = 1;
2987 /* Generate and return an instruction to store the pair of registers
2988 REG and REG2 of mode MODE to location BASE with write-back adjusting
2989 the stack location BASE by ADJUSTMENT. */
2992 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2993 HOST_WIDE_INT adjustment)
2998 return gen_storewb_pairdi_di (base, base, reg, reg2,
2999 GEN_INT (-adjustment),
3000 GEN_INT (UNITS_PER_WORD - adjustment));
3002 return gen_storewb_pairdf_di (base, base, reg, reg2,
3003 GEN_INT (-adjustment),
3004 GEN_INT (UNITS_PER_WORD - adjustment));
3010 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3011 stack pointer by ADJUSTMENT. */
3014 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3017 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3019 if (regno2 == INVALID_REGNUM)
3020 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3022 rtx reg1 = gen_rtx_REG (mode, regno1);
3023 rtx reg2 = gen_rtx_REG (mode, regno2);
3025 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3027 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3028 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3029 RTX_FRAME_RELATED_P (insn) = 1;
3032 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
3033 adjusting it by ADJUSTMENT afterwards. */
3036 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3037 HOST_WIDE_INT adjustment)
3042 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3043 GEN_INT (UNITS_PER_WORD));
3045 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3046 GEN_INT (UNITS_PER_WORD));
3052 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3053 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3057 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3060 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3061 rtx reg1 = gen_rtx_REG (mode, regno1);
3063 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3065 if (regno2 == INVALID_REGNUM)
3067 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3068 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3069 emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3073 rtx reg2 = gen_rtx_REG (mode, regno2);
3074 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3075 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3080 /* Generate and return a store pair instruction of mode MODE to store
3081 register REG1 to MEM1 and register REG2 to MEM2. */
3084 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3090 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3093 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3100 /* Generate and return a load pair instruction of mode MODE to load register
3101 REG1 from MEM1 and register REG2 from MEM2. */
3104 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3110 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3113 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3120 /* Emit code to save the callee-saved registers from register number START
3121 to LIMIT to the stack at the location starting at offset START_OFFSET,
3122 skipping any write-back candidates if SKIP_WB is true. */
3125 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3126 unsigned start, unsigned limit, bool skip_wb)
3129 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3130 ? gen_frame_mem : gen_rtx_MEM);
3134 for (regno = aarch64_next_callee_save (start, limit);
3136 regno = aarch64_next_callee_save (regno + 1, limit))
3139 HOST_WIDE_INT offset;
3142 && (regno == cfun->machine->frame.wb_candidate1
3143 || regno == cfun->machine->frame.wb_candidate2))
3146 reg = gen_rtx_REG (mode, regno);
3147 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3148 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3151 regno2 = aarch64_next_callee_save (regno + 1, limit);
3154 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3155 == cfun->machine->frame.reg_offset[regno2]))
3158 rtx reg2 = gen_rtx_REG (mode, regno2);
3161 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3162 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3164 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3167 /* The first part of a frame-related parallel insn is
3168 always assumed to be relevant to the frame
3169 calculations; subsequent parts are only
3170 frame-related if explicitly marked. */
3171 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3175 insn = emit_move_insn (mem, reg);
3177 RTX_FRAME_RELATED_P (insn) = 1;
3181 /* Emit code to restore the callee registers of mode MODE from register
3182 number START up to and including LIMIT. Restore from the stack offset
3183 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3184 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3187 aarch64_restore_callee_saves (machine_mode mode,
3188 HOST_WIDE_INT start_offset, unsigned start,
3189 unsigned limit, bool skip_wb, rtx *cfi_ops)
3191 rtx base_rtx = stack_pointer_rtx;
3192 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3193 ? gen_frame_mem : gen_rtx_MEM);
3196 HOST_WIDE_INT offset;
3198 for (regno = aarch64_next_callee_save (start, limit);
3200 regno = aarch64_next_callee_save (regno + 1, limit))
3205 && (regno == cfun->machine->frame.wb_candidate1
3206 || regno == cfun->machine->frame.wb_candidate2))
3209 reg = gen_rtx_REG (mode, regno);
3210 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3211 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3213 regno2 = aarch64_next_callee_save (regno + 1, limit);
3216 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3217 == cfun->machine->frame.reg_offset[regno2]))
3219 rtx reg2 = gen_rtx_REG (mode, regno2);
3222 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3223 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3224 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3226 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3230 emit_move_insn (reg, mem);
3231 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3235 /* AArch64 stack frames generated by this compiler look like:
3237 +-------------------------------+
3239 | incoming stack arguments |
3241 +-------------------------------+
3242 | | <-- incoming stack pointer (aligned)
3243 | callee-allocated save area |
3244 | for register varargs |
3246 +-------------------------------+
3247 | local variables | <-- frame_pointer_rtx
3249 +-------------------------------+
3251 +-------------------------------+ |
3252 | callee-saved registers | | frame.saved_regs_size
3253 +-------------------------------+ |
3255 +-------------------------------+ |
3256 | FP' | / <- hard_frame_pointer_rtx (aligned)
3257 +-------------------------------+
3258 | dynamic allocation |
3259 +-------------------------------+
3261 +-------------------------------+
3262 | outgoing stack arguments | <-- arg_pointer
3264 +-------------------------------+
3265 | | <-- stack_pointer_rtx (aligned)
3267 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3268 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3271 /* Generate the prologue instructions for entry into a function.
3272 Establish the stack frame by decreasing the stack pointer with a
3273 properly calculated size and, if necessary, create a frame record
3274 filled with the values of LR and previous frame pointer. The
3275 current FP is also set up if it is in use. */
3278 aarch64_expand_prologue (void)
3280 aarch64_layout_frame ();
3282 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3283 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3284 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3285 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3286 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3287 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3288 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3291 if (flag_stack_usage_info)
3292 current_function_static_stack_size = frame_size;
3294 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3296 if (crtl->is_leaf && !cfun->calls_alloca)
3298 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3299 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3300 frame_size - STACK_CHECK_PROTECT);
3302 else if (frame_size > 0)
3303 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3306 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3308 if (callee_adjust != 0)
3309 aarch64_push_regs (reg1, reg2, callee_adjust);
3311 if (frame_pointer_needed)
3313 if (callee_adjust == 0)
3314 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3316 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3318 GEN_INT (callee_offset)));
3319 RTX_FRAME_RELATED_P (insn) = 1;
3320 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3323 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3324 callee_adjust != 0 || frame_pointer_needed);
3325 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3326 callee_adjust != 0 || frame_pointer_needed);
3327 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
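/* Continuing the small-frame example given with aarch64_layout_frame
   above, the prologue emitted here would look roughly like (illustrative
   only, ignoring the scheduling tie):

	stp	x29, x30, [sp, -48]!
	add	x29, sp, 0
	str	d8, [sp, 16]

   i.e. the callee_adjust push with writeback, the frame pointer set-up
   at callee_offset, then the remaining callee saves.  */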
3330 /* Return TRUE if we can use a simple_return insn.
3332 This function checks whether the callee-saved stack is empty, which
3333 means no restore actions are needed.  The pro_and_epilogue pass will use
3334 this to check whether the shrink-wrapping optimization is feasible. */
3337 aarch64_use_return_insn_p (void)
3339 if (!reload_completed)
3345 aarch64_layout_frame ();
3347 return cfun->machine->frame.frame_size == 0;
3350 /* Generate the epilogue instructions for returning from a function.
3351 This is almost exactly the reverse of the prologue sequence, except
3352 that we need to insert barriers to avoid scheduling loads that read
3353 from a deallocated stack, and we optimize the unwind records by
3354 emitting them all together if possible. */
3356 aarch64_expand_epilogue (bool for_sibcall)
3358 aarch64_layout_frame ();
3360 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3361 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3362 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3363 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3364 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3365 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3369 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3370 bool need_barrier_p = (get_frame_size ()
3371 + cfun->machine->frame.saved_varargs_size) != 0;
3373 /* Emit a barrier to prevent loads from a deallocated stack. */
3374 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3375 || crtl->calls_eh_return)
3377 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3378 need_barrier_p = false;
3381 /* Restore the stack pointer from the frame pointer if it may not
3382 be the same as the stack pointer. */
3383 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3385 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3386 hard_frame_pointer_rtx,
3387 GEN_INT (-callee_offset)));
3388 /* If writeback is used when restoring callee-saves, the CFA
3389 is restored on the instruction doing the writeback. */
3390 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3393 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3395 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3396 callee_adjust != 0, &cfi_ops);
3397 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3398 callee_adjust != 0, &cfi_ops);
3401 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3403 if (callee_adjust != 0)
3404 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3406 if (callee_adjust != 0 || initial_adjust > 65536)
3408 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3409 insn = get_last_insn ();
3410 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3411 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3412 RTX_FRAME_RELATED_P (insn) = 1;
3416 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3420 /* Emit delayed restores and reset the CFA to be SP. */
3421 insn = get_last_insn ();
3422 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3423 REG_NOTES (insn) = cfi_ops;
3424 RTX_FRAME_RELATED_P (insn) = 1;
3427 /* Stack adjustment for exception handler. */
3428 if (crtl->calls_eh_return)
3430 /* We need to unwind the stack by the offset computed by
3431 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3432 to be SP; letting the CFA move during this adjustment
3433 is just as correct as retaining the CFA from the body
3434 of the function. Therefore, do nothing special. */
3435 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3438 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3440 emit_jump_insn (ret_rtx);
3443 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3444 normally or return to a previous frame after unwinding.
3446 An EH return uses a single shared return sequence. The epilogue is
3447 exactly like a normal epilogue except that it has an extra input
3448 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3449 that must be applied after the frame has been destroyed. An extra label
3450 is inserted before the epilogue which initializes this register to zero,
3451 and this is the entry point for a normal return.
3453 An actual EH return updates the return address, initializes the stack
3454 adjustment and jumps directly into the epilogue (bypassing the zeroing
3455 of the adjustment). Since the return address is typically saved on the
3456 stack when a function makes a call, the saved LR must be updated outside
3459 This poses problems as the store is generated well before the epilogue,
3460 so the offset of LR is not known yet. Also optimizations will remove the
3461 store as it appears dead, even after the epilogue is generated (as the
3462 base or offset for loading LR is different in many cases).
3464 To avoid these problems this implementation forces the frame pointer
3465 in eh_return functions so that the location of LR is fixed and known early.
3466 It also marks the store volatile, so no optimization is permitted to
3467 remove the store. */
3469 aarch64_eh_return_handler_rtx (void)
3471 rtx tmp = gen_frame_mem (Pmode,
3472 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3474 /* Mark the store volatile, so no optimization is permitted to remove it. */
3475 MEM_VOLATILE_P (tmp) = true;
3479 /* Output code to add DELTA to the first argument, and then jump
3480 to FUNCTION. Used for C++ multiple inheritance. */
3482 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3483 HOST_WIDE_INT delta,
3484 HOST_WIDE_INT vcall_offset,
3487 /* The this pointer is always in x0. Note that this differs from
3488 Arm, where the this pointer may be bumped to r1 if r0 is required
3489 to return a pointer to an aggregate. On AArch64 a result value
3490 pointer will be in x8. */
3491 int this_regno = R0_REGNUM;
3492 rtx this_rtx, temp0, temp1, addr, funexp;
3495 reload_completed = 1;
3496 emit_note (NOTE_INSN_PROLOGUE_END);
3498 if (vcall_offset == 0)
3499 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3502 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3504 this_rtx = gen_rtx_REG (Pmode, this_regno);
3505 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3506 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3511 if (delta >= -256 && delta < 256)
3512 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3513 plus_constant (Pmode, this_rtx, delta));
3515 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3518 if (Pmode == ptr_mode)
3519 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3521 aarch64_emit_move (temp0,
3522 gen_rtx_ZERO_EXTEND (Pmode,
3523 gen_rtx_MEM (ptr_mode, addr)));
3525 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3526 addr = plus_constant (Pmode, temp0, vcall_offset);
3529 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3531 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3534 if (Pmode == ptr_mode)
3535 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3537 aarch64_emit_move (temp1,
3538 gen_rtx_SIGN_EXTEND (Pmode,
3539 gen_rtx_MEM (ptr_mode, addr)));
3541 emit_insn (gen_add2_insn (this_rtx, temp1));
3544 /* Generate a tail call to the target function. */
3545 if (!TREE_USED (function))
3547 assemble_external (function);
3548 TREE_USED (function) = 1;
3550 funexp = XEXP (DECL_RTL (function), 0);
3551 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3552 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3553 SIBLING_CALL_P (insn) = 1;
3555 insn = get_insns ();
3556 shorten_branches (insn);
3557 final_start_function (insn, file, 1);
3558 final (insn, file, 1);
3559 final_end_function ();
3561 /* Stop pretending to be a post-reload pass. */
3562 reload_completed = 0;
3566 aarch64_tls_referenced_p (rtx x)
3568 if (!TARGET_HAVE_TLS)
3570 subrtx_iterator::array_type array;
3571 FOR_EACH_SUBRTX (iter, array, x, ALL)
3573 const_rtx x = *iter;
3574 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3576 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3577 TLS offsets, not real symbol references. */
3578 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3579 iter.skip_subrtxes ();
3585 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3586 a left shift of 0 or 12 bits. */
3588 aarch64_uimm12_shift (HOST_WIDE_INT val)
3590 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3591 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3596 /* Return true if val is an immediate that can be loaded into a
3597 register by a MOVZ instruction. */
3599 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3601 if (GET_MODE_SIZE (mode) > 4)
3603 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3604 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3609 /* Ignore sign extension. */
3610 val &= (HOST_WIDE_INT) 0xffffffff;
3612 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3613 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
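/* For example (illustrative values): 0x12340000 has all of its set bits
   within a single 16-bit field at position 16 and so can be loaded with
   one MOVZ, while 0x12340001 spans two 16-bit fields and cannot.  */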
3616 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3618 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3620 0x0000000100000001ull,
3621 0x0001000100010001ull,
3622 0x0101010101010101ull,
3623 0x1111111111111111ull,
3624 0x5555555555555555ull,
3628 /* Return true if val is a valid bitmask immediate. */
3631 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3633 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3636 /* Check for a single sequence of one bits and return quickly if so.
3637 The special cases of all ones and all zeroes return false. */
3638 val = (unsigned HOST_WIDE_INT) val_in;
3639 tmp = val + (val & -val);
3641 if (tmp == (tmp & -tmp))
3642 return (val + 1) > 1;
3644 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3646 val = (val << 32) | (val & 0xffffffff);
3648 /* Invert if the immediate doesn't start with a zero bit - this means we
3649 only need to search for sequences of one bits. */
3653 /* Find the first set bit and set tmp to val with the first sequence of one
3654 bits removed. Return success if there is a single sequence of ones. */
3655 first_one = val & -val;
3656 tmp = val & (val + first_one);
3661 /* Find the next set bit and compute the difference in bit position. */
3662 next_one = tmp & -tmp;
3663 bits = clz_hwi (first_one) - clz_hwi (next_one);
3666 /* Check the bit position difference is a power of 2, and that the first
3667 sequence of one bits fits within 'bits' bits. */
3668 if ((mask >> bits) != 0 || bits != (bits & -bits))
3671 /* Check the sequence of one bits is repeated 64/bits times. */
3672 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
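/* A worked example of the check above (illustrative): for
   val = 0x00ff00ff00ff00ff the low bit is set, so the value is first
   inverted to 0xff00ff00ff00ff00.  The first run of ones starts at bit 8
   and the next at bit 24, so bits = 16 (a power of two) and the first
   run, mask = 0xff00, fits within those 16 bits.  Multiplying mask by
   bitmask_imm_mul[1] (0x0001000100010001) reproduces the inverted value,
   so the original constant is a valid bitmask immediate.  */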
3676 /* Return true if val is an immediate that can be loaded into a
3677 register in a single instruction. */
3679 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3681 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3683 return aarch64_bitmask_imm (val, mode);
3687 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3691 if (GET_CODE (x) == HIGH)
3694 split_const (x, &base, &offset);
3695 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3697 if (aarch64_classify_symbol (base, offset)
3698 != SYMBOL_FORCE_TO_MEM)
3701 /* Avoid generating a 64-bit relocation in ILP32; leave
3702 it to aarch64_expand_mov_immediate to handle properly. */
3703 return mode != ptr_mode;
3706 return aarch64_tls_referenced_p (x);
3709 /* Implement TARGET_CASE_VALUES_THRESHOLD.
3710 The expansion for a table switch is quite expensive due to the number
3711 of instructions, the table lookup and the hard-to-predict indirect jump.
3712 When optimizing for speed with -O3 enabled, use the per-core tuning if
3713 set; otherwise use tables for more than 16 cases as a tradeoff between
3714 size and performance.  When optimizing for size, use the default setting. */
3717 aarch64_case_values_threshold (void)
3719 /* Use the specified limit for the number of cases before using jump
3720 tables at higher optimization levels. */
3722 && selected_cpu->tune->max_case_values != 0)
3723 return selected_cpu->tune->max_case_values;
3725 return optimize_size ? default_case_values_threshold () : 17;
3728 /* Return true if register REGNO is a valid index register.
3729 STRICT_P is true if REG_OK_STRICT is in effect. */
3732 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3734 if (!HARD_REGISTER_NUM_P (regno))
3742 regno = reg_renumber[regno];
3744 return GP_REGNUM_P (regno);
3747 /* Return true if register REGNO is a valid base register for mode MODE.
3748 STRICT_P is true if REG_OK_STRICT is in effect. */
3751 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3753 if (!HARD_REGISTER_NUM_P (regno))
3761 regno = reg_renumber[regno];
3764 /* The fake registers will be eliminated to either the stack or
3765 hard frame pointer, both of which are usually valid base registers.
3766 Reload deals with the cases where the eliminated form isn't valid. */
3767 return (GP_REGNUM_P (regno)
3768 || regno == SP_REGNUM
3769 || regno == FRAME_POINTER_REGNUM
3770 || regno == ARG_POINTER_REGNUM);
3773 /* Return true if X is a valid base register for mode MODE.
3774 STRICT_P is true if REG_OK_STRICT is in effect. */
3777 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3779 if (!strict_p && GET_CODE (x) == SUBREG)
3782 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3785 /* Return true if address offset is a valid index. If it is, fill in INFO
3786 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3789 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3790 machine_mode mode, bool strict_p)
3792 enum aarch64_address_type type;
3797 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3798 && GET_MODE (x) == Pmode)
3800 type = ADDRESS_REG_REG;
3804 /* (sign_extend:DI (reg:SI)) */
3805 else if ((GET_CODE (x) == SIGN_EXTEND
3806 || GET_CODE (x) == ZERO_EXTEND)
3807 && GET_MODE (x) == DImode
3808 && GET_MODE (XEXP (x, 0)) == SImode)
3810 type = (GET_CODE (x) == SIGN_EXTEND)
3811 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3812 index = XEXP (x, 0);
3815 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3816 else if (GET_CODE (x) == MULT
3817 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3818 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3819 && GET_MODE (XEXP (x, 0)) == DImode
3820 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3821 && CONST_INT_P (XEXP (x, 1)))
3823 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3824 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3825 index = XEXP (XEXP (x, 0), 0);
3826 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3828 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3829 else if (GET_CODE (x) == ASHIFT
3830 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3831 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3832 && GET_MODE (XEXP (x, 0)) == DImode
3833 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3834 && CONST_INT_P (XEXP (x, 1)))
3836 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3837 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3838 index = XEXP (XEXP (x, 0), 0);
3839 shift = INTVAL (XEXP (x, 1));
3841 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3842 else if ((GET_CODE (x) == SIGN_EXTRACT
3843 || GET_CODE (x) == ZERO_EXTRACT)
3844 && GET_MODE (x) == DImode
3845 && GET_CODE (XEXP (x, 0)) == MULT
3846 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3847 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3849 type = (GET_CODE (x) == SIGN_EXTRACT)
3850 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3851 index = XEXP (XEXP (x, 0), 0);
3852 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3853 if (INTVAL (XEXP (x, 1)) != 32 + shift
3854 || INTVAL (XEXP (x, 2)) != 0)
3857 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3858 (const_int 0xffffffff<<shift)) */
3859 else if (GET_CODE (x) == AND
3860 && GET_MODE (x) == DImode
3861 && GET_CODE (XEXP (x, 0)) == MULT
3862 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3863 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3864 && CONST_INT_P (XEXP (x, 1)))
3866 type = ADDRESS_REG_UXTW;
3867 index = XEXP (XEXP (x, 0), 0);
3868 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3869 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3872 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3873 else if ((GET_CODE (x) == SIGN_EXTRACT
3874 || GET_CODE (x) == ZERO_EXTRACT)
3875 && GET_MODE (x) == DImode
3876 && GET_CODE (XEXP (x, 0)) == ASHIFT
3877 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3878 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3880 type = (GET_CODE (x) == SIGN_EXTRACT)
3881 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3882 index = XEXP (XEXP (x, 0), 0);
3883 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3884 if (INTVAL (XEXP (x, 1)) != 32 + shift
3885 || INTVAL (XEXP (x, 2)) != 0)
3888 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3889 (const_int 0xffffffff<<shift)) */
3890 else if (GET_CODE (x) == AND
3891 && GET_MODE (x) == DImode
3892 && GET_CODE (XEXP (x, 0)) == ASHIFT
3893 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3894 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3895 && CONST_INT_P (XEXP (x, 1)))
3897 type = ADDRESS_REG_UXTW;
3898 index = XEXP (XEXP (x, 0), 0);
3899 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3900 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3903 /* (mult:P (reg:P) (const_int scale)) */
3904 else if (GET_CODE (x) == MULT
3905 && GET_MODE (x) == Pmode
3906 && GET_MODE (XEXP (x, 0)) == Pmode
3907 && CONST_INT_P (XEXP (x, 1)))
3909 type = ADDRESS_REG_REG;
3910 index = XEXP (x, 0);
3911 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3913 /* (ashift:P (reg:P) (const_int shift)) */
3914 else if (GET_CODE (x) == ASHIFT
3915 && GET_MODE (x) == Pmode
3916 && GET_MODE (XEXP (x, 0)) == Pmode
3917 && CONST_INT_P (XEXP (x, 1)))
3919 type = ADDRESS_REG_REG;
3920 index = XEXP (x, 0);
3921 shift = INTVAL (XEXP (x, 1));
3926 if (GET_CODE (index) == SUBREG)
3927 index = SUBREG_REG (index);
3930 (shift > 0 && shift <= 3
3931 && (1 << shift) == GET_MODE_SIZE (mode)))
3933 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3936 info->offset = index;
3937 info->shift = shift;
3945 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3947 return (offset >= -64 * GET_MODE_SIZE (mode)
3948 && offset < 64 * GET_MODE_SIZE (mode)
3949 && offset % GET_MODE_SIZE (mode) == 0);
3953 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3954 HOST_WIDE_INT offset)
3956 return offset >= -256 && offset < 256;
3960 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3963 && offset < 4096 * GET_MODE_SIZE (mode)
3964 && offset % GET_MODE_SIZE (mode) == 0);
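/* For a DImode (8-byte) access, the three helpers above accept,
   illustratively: a 7-bit signed scaled offset in [-512, 504] in steps
   of 8, a 9-bit signed unscaled offset in [-256, 255], and a 12-bit
   unsigned scaled offset in [0, 32760] in steps of 8.  */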
3967 /* Return true if MODE is one of the modes for which we
3968 support LDP/STP operations. */
3971 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
3973 return mode == SImode || mode == DImode
3974 || mode == SFmode || mode == DFmode
3975 || (aarch64_vector_mode_supported_p (mode)
3976 && GET_MODE_SIZE (mode) == 8);
3979 /* Return true if REGNO is a virtual pointer register, or an eliminable
3980 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
3981 include stack_pointer or hard_frame_pointer. */
3983 virt_or_elim_regno_p (unsigned regno)
3985 return ((regno >= FIRST_VIRTUAL_REGISTER
3986 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
3987 || regno == FRAME_POINTER_REGNUM
3988 || regno == ARG_POINTER_REGNUM);
3991 /* Return true if X is a valid address for machine mode MODE. If it is,
3992 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3993 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3996 aarch64_classify_address (struct aarch64_address_info *info,
3997 rtx x, machine_mode mode,
3998 RTX_CODE outer_code, bool strict_p)
4000 enum rtx_code code = GET_CODE (x);
4003 /* On BE, we use load/store pair for all large int mode load/stores. */
4004 bool load_store_pair_p = (outer_code == PARALLEL
4005 || (BYTES_BIG_ENDIAN
4006 && aarch64_vect_struct_mode_p (mode)));
4008 bool allow_reg_index_p =
4010 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4011 && !aarch64_vect_struct_mode_p (mode);
4013 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4015 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4016 && (code != POST_INC && code != REG))
4023 info->type = ADDRESS_REG_IMM;
4025 info->offset = const0_rtx;
4026 return aarch64_base_register_rtx_p (x, strict_p);
4034 && virt_or_elim_regno_p (REGNO (op0))
4035 && CONST_INT_P (op1))
4037 info->type = ADDRESS_REG_IMM;
4044 if (GET_MODE_SIZE (mode) != 0
4045 && CONST_INT_P (op1)
4046 && aarch64_base_register_rtx_p (op0, strict_p))
4048 HOST_WIDE_INT offset = INTVAL (op1);
4050 info->type = ADDRESS_REG_IMM;
4054 /* TImode and TFmode values are allowed in both pairs of X
4055 registers and individual Q registers. The available
4057 X,X: 7-bit signed scaled offset
4058 Q: 9-bit signed offset
4059 We conservatively require an offset representable in either mode.
4060 When performing the check for pairs of X registers, i.e. LDP/STP,
4061 pass down DImode since that is the natural size of the LDP/STP
4062 instruction memory accesses. */
4063 if (mode == TImode || mode == TFmode)
4064 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4065 && offset_9bit_signed_unscaled_p (mode, offset));
4067 /* A 7-bit offset check because OImode will emit an ldp/stp
4068 instruction (only big endian will get here).
4069 For ldp/stp instructions, the offset is scaled for the size of a
4070 single element of the pair. */
4072 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4074 /* Three 9/12-bit offset checks because CImode will emit three
4075 ldr/str instructions (only big endian will get here). */
4077 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4078 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4079 || offset_12bit_unsigned_scaled_p (V16QImode,
4082 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4083 instructions (only big endian will get here). */
4085 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4086 && aarch64_offset_7bit_signed_scaled_p (TImode,
4089 if (load_store_pair_p)
4090 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4091 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4093 return (offset_9bit_signed_unscaled_p (mode, offset)
4094 || offset_12bit_unsigned_scaled_p (mode, offset));
4097 if (allow_reg_index_p)
4099 /* Look for base + (scaled/extended) index register. */
4100 if (aarch64_base_register_rtx_p (op0, strict_p)
4101 && aarch64_classify_index (info, op1, mode, strict_p))
4106 if (aarch64_base_register_rtx_p (op1, strict_p)
4107 && aarch64_classify_index (info, op0, mode, strict_p))
4120 info->type = ADDRESS_REG_WB;
4121 info->base = XEXP (x, 0);
4122 info->offset = NULL_RTX;
4123 return aarch64_base_register_rtx_p (info->base, strict_p);
4127 info->type = ADDRESS_REG_WB;
4128 info->base = XEXP (x, 0);
4129 if (GET_CODE (XEXP (x, 1)) == PLUS
4130 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4131 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4132 && aarch64_base_register_rtx_p (info->base, strict_p))
4134 HOST_WIDE_INT offset;
4135 info->offset = XEXP (XEXP (x, 1), 1);
4136 offset = INTVAL (info->offset);
4138 /* TImode and TFmode values are allowed in both pairs of X
4139 registers and individual Q registers. The available
4141 X,X: 7-bit signed scaled offset
4142 Q: 9-bit signed offset
4143 We conservatively require an offset representable in either mode.
4145 if (mode == TImode || mode == TFmode)
4146 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4147 && offset_9bit_signed_unscaled_p (mode, offset));
4149 if (load_store_pair_p)
4150 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4151 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4153 return offset_9bit_signed_unscaled_p (mode, offset);
4160 /* load literal: pc-relative constant pool entry. Only supported
4161 for SI mode or larger. */
4162 info->type = ADDRESS_SYMBOLIC;
4164 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4168 split_const (x, &sym, &addend);
4169 return ((GET_CODE (sym) == LABEL_REF
4170 || (GET_CODE (sym) == SYMBOL_REF
4171 && CONSTANT_POOL_ADDRESS_P (sym)
4172 && aarch64_pcrelative_literal_loads)));
4177 info->type = ADDRESS_LO_SUM;
4178 info->base = XEXP (x, 0);
4179 info->offset = XEXP (x, 1);
4180 if (allow_reg_index_p
4181 && aarch64_base_register_rtx_p (info->base, strict_p))
4184 split_const (info->offset, &sym, &offs);
4185 if (GET_CODE (sym) == SYMBOL_REF
4186 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4188 /* The symbol and offset must be aligned to the access size. */
4190 unsigned int ref_size;
4192 if (CONSTANT_POOL_ADDRESS_P (sym))
4193 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4194 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4196 tree exp = SYMBOL_REF_DECL (sym);
4197 align = TYPE_ALIGN (TREE_TYPE (exp));
4198 align = CONSTANT_ALIGNMENT (exp, align);
4200 else if (SYMBOL_REF_DECL (sym))
4201 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4202 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4203 && SYMBOL_REF_BLOCK (sym) != NULL)
4204 align = SYMBOL_REF_BLOCK (sym)->alignment;
4206 align = BITS_PER_UNIT;
4208 ref_size = GET_MODE_SIZE (mode);
4210 ref_size = GET_MODE_SIZE (DImode);
4212 return ((INTVAL (offs) & (ref_size - 1)) == 0
4213 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4224 aarch64_symbolic_address_p (rtx x)
4228 split_const (x, &x, &offset);
4229 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4232 /* Classify the base of symbolic expression X. */
4234 enum aarch64_symbol_type
4235 aarch64_classify_symbolic_expression (rtx x)
4239 split_const (x, &x, &offset);
4240 return aarch64_classify_symbol (x, offset);
4244 /* Return TRUE if X is a legitimate address for accessing memory in
4247 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4249 struct aarch64_address_info addr;
4251 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4254 /* Return TRUE if X is a legitimate address for accessing memory in
4255 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4258 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4259 RTX_CODE outer_code, bool strict_p)
4261 struct aarch64_address_info addr;
4263 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4266 /* Split an out-of-range address displacement into a base and offset.
4267 Use a 4KB range for 1- and 2-byte accesses and a 16KB range otherwise,
4268 to increase opportunities for sharing the base address across accesses of different sizes.
4269 For TI/TFmode and unaligned accesses use a 256-byte range. */
4271 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4273 HOST_WIDE_INT mask = GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3fff;
4275 if (mode == TImode || mode == TFmode ||
4276 (INTVAL (*disp) & (GET_MODE_SIZE (mode) - 1)) != 0)
4279 *off = GEN_INT (INTVAL (*disp) & ~mask);
4280 *disp = GEN_INT (INTVAL (*disp) & mask);
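/* For example (an illustrative SImode access): a displacement of 0x10010
   is word-aligned, so the 16KB mask 0x3fff applies and the split leaves
   an anchor of 0x10000 in *OFF with a residual displacement of 0x10 in
   *DISP.  */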
4284 /* Return TRUE if rtx X is the immediate constant 0.0. */
4286 aarch64_float_const_zero_rtx_p (rtx x)
4288 if (GET_MODE (x) == VOIDmode)
4291 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4292 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4293 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4296 /* Return the fixed registers used for condition codes. */
4299 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4302 *p2 = INVALID_REGNUM;
4306 /* Emit call insn with PAT and do aarch64-specific handling. */
4309 aarch64_emit_call_insn (rtx pat)
4311 rtx insn = emit_call_insn (pat);
4313 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4314 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4315 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4319 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4321 /* All floating point compares return CCFP if it is an equality
4322 comparison, and CCFPE otherwise. */
4323 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4350 /* Equality comparisons of short modes against zero can be performed
4351 using the TST instruction with the appropriate bitmask. */
4352 if (y == const0_rtx && REG_P (x)
4353 && (code == EQ || code == NE)
4354 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4357 /* Similarly, comparisons of zero_extends from shorter modes can
4358 be performed using an ANDS with an immediate mask. */
4359 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4360 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4361 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4362 && (code == EQ || code == NE))
4365 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4367 && (code == EQ || code == NE || code == LT || code == GE)
4368 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4369 || GET_CODE (x) == NEG
4370 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4371 && CONST_INT_P (XEXP (x, 2)))))
4374 /* A compare with a shifted operand. Because of canonicalization,
4375 the comparison will have to be swapped when we emit the assembly
4377 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4378 && (REG_P (y) || GET_CODE (y) == SUBREG)
4379 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4380 || GET_CODE (x) == LSHIFTRT
4381 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4384 /* Similarly for a negated operand, but we can only do this for
4386 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4387 && (REG_P (y) || GET_CODE (y) == SUBREG)
4388 && (code == EQ || code == NE)
4389 && GET_CODE (x) == NEG)
4392 /* A test for unsigned overflow. */
4393 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4395 && GET_CODE (x) == PLUS
4396 && GET_CODE (y) == ZERO_EXTEND)
4399 /* For everything else, return CCmode. */
4404 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4407 aarch64_get_condition_code (rtx x)
4409 machine_mode mode = GET_MODE (XEXP (x, 0));
4410 enum rtx_code comp_code = GET_CODE (x);
4412 if (GET_MODE_CLASS (mode) != MODE_CC)
4413 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4414 return aarch64_get_condition_code_1 (mode, comp_code);
4418 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4426 case GE: return AARCH64_GE;
4427 case GT: return AARCH64_GT;
4428 case LE: return AARCH64_LS;
4429 case LT: return AARCH64_MI;
4430 case NE: return AARCH64_NE;
4431 case EQ: return AARCH64_EQ;
4432 case ORDERED: return AARCH64_VC;
4433 case UNORDERED: return AARCH64_VS;
4434 case UNLT: return AARCH64_LT;
4435 case UNLE: return AARCH64_LE;
4436 case UNGT: return AARCH64_HI;
4437 case UNGE: return AARCH64_PL;
4445 case NE: return AARCH64_NE;
4446 case EQ: return AARCH64_EQ;
4447 case GE: return AARCH64_GE;
4448 case GT: return AARCH64_GT;
4449 case LE: return AARCH64_LE;
4450 case LT: return AARCH64_LT;
4451 case GEU: return AARCH64_CS;
4452 case GTU: return AARCH64_HI;
4453 case LEU: return AARCH64_LS;
4454 case LTU: return AARCH64_CC;
4462 case NE: return AARCH64_NE;
4463 case EQ: return AARCH64_EQ;
4464 case GE: return AARCH64_LE;
4465 case GT: return AARCH64_LT;
4466 case LE: return AARCH64_GE;
4467 case LT: return AARCH64_GT;
4468 case GEU: return AARCH64_LS;
4469 case GTU: return AARCH64_CC;
4470 case LEU: return AARCH64_CS;
4471 case LTU: return AARCH64_HI;
4479 case NE: return AARCH64_NE;
4480 case EQ: return AARCH64_EQ;
4481 case GE: return AARCH64_PL;
4482 case LT: return AARCH64_MI;
4490 case NE: return AARCH64_NE;
4491 case EQ: return AARCH64_EQ;
4499 case NE: return AARCH64_CS;
4500 case EQ: return AARCH64_CC;
4514 aarch64_const_vec_all_same_in_range_p (rtx x,
4515 HOST_WIDE_INT minval,
4516 HOST_WIDE_INT maxval)
4518 HOST_WIDE_INT firstval;
4521 if (GET_CODE (x) != CONST_VECTOR
4522 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4525 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4526 if (firstval < minval || firstval > maxval)
4529 count = CONST_VECTOR_NUNITS (x);
4530 for (i = 1; i < count; i++)
4531 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4538 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4540 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4545 #define AARCH64_CC_V 1
4546 #define AARCH64_CC_C (1 << 1)
4547 #define AARCH64_CC_Z (1 << 2)
4548 #define AARCH64_CC_N (1 << 3)
4550 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
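/* Each entry below gives flag settings under which the annotated condition
   is false; e.g. the NE entry is AARCH64_CC_Z, since with Z set an NE test
   fails.  These are the values used as the NZCV immediate of a conditional
   compare (ccmp) instruction.  */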
4551 static const int aarch64_nzcv_codes[] =
4553 0, /* EQ, Z == 1. */
4554 AARCH64_CC_Z, /* NE, Z == 0. */
4555 0, /* CS, C == 1. */
4556 AARCH64_CC_C, /* CC, C == 0. */
4557 0, /* MI, N == 1. */
4558 AARCH64_CC_N, /* PL, N == 0. */
4559 0, /* VS, V == 1. */
4560 AARCH64_CC_V, /* VC, V == 0. */
4561 0, /* HI, C == 1 && Z == 0. */
4562 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4563 AARCH64_CC_V, /* GE, N == V. */
4564 0, /* LT, N != V. */
4565 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4566 0, /* LE, !(Z == 0 && N == V). */
4572 aarch64_print_operand (FILE *f, rtx x, int code)
4576 /* An integer or symbol address without a preceding # sign. */
4578 switch (GET_CODE (x))
4581 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4585 output_addr_const (f, x);
4589 if (GET_CODE (XEXP (x, 0)) == PLUS
4590 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4592 output_addr_const (f, x);
4598 output_operand_lossage ("Unsupported operand for code '%c'", code);
4603 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4607 if (!CONST_INT_P (x)
4608 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4610 output_operand_lossage ("invalid operand for '%%%c'", code);
4626 output_operand_lossage ("invalid operand for '%%%c'", code);
4636 /* Print N such that 2^N == X. */
4637 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4639 output_operand_lossage ("invalid operand for '%%%c'", code);
4643 asm_fprintf (f, "%d", n);
4648 /* Print the number of non-zero bits in X (a const_int). */
4649 if (!CONST_INT_P (x))
4651 output_operand_lossage ("invalid operand for '%%%c'", code);
4655 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4659 if (x == const0_rtx)
4661 asm_fprintf (f, "xzr");
4664 /* Print the higher numbered register of a pair (TImode) of regs. */
4665 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4667 output_operand_lossage ("invalid operand for '%%%c'", code);
4671 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4678 /* Print a condition (eq, ne, etc) or its inverse. */
4680 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
4681 if (x == const_true_rtx)
4688 if (!COMPARISON_P (x))
4690 output_operand_lossage ("invalid operand for '%%%c'", code);
4694 cond_code = aarch64_get_condition_code (x);
4695 gcc_assert (cond_code >= 0);
4697 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
4698 fputs (aarch64_condition_codes[cond_code], f);
4707 /* Print a scalar FP/SIMD register name. */
4708 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4710 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4713 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4720 /* Print the first FP/SIMD register name in a list. */
4721 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4723 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4726 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4730 /* Print a scalar FP/SIMD register name + 1. */
4731 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4733 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4736 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4740 /* Print bottom 16 bits of integer constant in hex. */
4741 if (!CONST_INT_P (x))
4743 output_operand_lossage ("invalid operand for '%%%c'", code);
4746 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4751 /* Print a general register name or the zero register (32-bit or
4754 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4756 asm_fprintf (f, "%czr", code);
4760 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4762 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4766 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4768 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4775 /* Print a normal operand, if it's a general register, then we
4779 output_operand_lossage ("missing operand");
4783 switch (GET_CODE (x))
4786 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4790 output_address (GET_MODE (x), XEXP (x, 0));
4796 output_addr_const (asm_out_file, x);
4800 asm_fprintf (f, "%wd", INTVAL (x));
4804 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4807 aarch64_const_vec_all_same_in_range_p (x,
4809 HOST_WIDE_INT_MAX));
4810 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4812 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4821 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
4822 be getting CONST_DOUBLEs holding integers. */
4823 gcc_assert (GET_MODE (x) != VOIDmode);
4824 if (aarch64_float_const_zero_rtx_p (x))
4829 else if (aarch64_float_const_representable_p (x))
4832 char float_buf[buf_size] = {'\0'};
4833 real_to_decimal_for_mode (float_buf,
4834 CONST_DOUBLE_REAL_VALUE (x),
4837 asm_fprintf (asm_out_file, "%s", float_buf);
4841 output_operand_lossage ("invalid constant");
4844 output_operand_lossage ("invalid operand");
4850 if (GET_CODE (x) == HIGH)
4853 switch (aarch64_classify_symbolic_expression (x))
4855 case SYMBOL_SMALL_GOT_4G:
4856 asm_fprintf (asm_out_file, ":got:");
4859 case SYMBOL_SMALL_TLSGD:
4860 asm_fprintf (asm_out_file, ":tlsgd:");
4863 case SYMBOL_SMALL_TLSDESC:
4864 asm_fprintf (asm_out_file, ":tlsdesc:");
4867 case SYMBOL_SMALL_TLSIE:
4868 asm_fprintf (asm_out_file, ":gottprel:");
4871 case SYMBOL_TLSLE24:
4872 asm_fprintf (asm_out_file, ":tprel:");
4875 case SYMBOL_TINY_GOT:
4882 output_addr_const (asm_out_file, x);
4886 switch (aarch64_classify_symbolic_expression (x))
4888 case SYMBOL_SMALL_GOT_4G:
4889 asm_fprintf (asm_out_file, ":lo12:");
4892 case SYMBOL_SMALL_TLSGD:
4893 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4896 case SYMBOL_SMALL_TLSDESC:
4897 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4900 case SYMBOL_SMALL_TLSIE:
4901 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4904 case SYMBOL_TLSLE12:
4905 asm_fprintf (asm_out_file, ":tprel_lo12:");
4908 case SYMBOL_TLSLE24:
4909 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4912 case SYMBOL_TINY_GOT:
4913 asm_fprintf (asm_out_file, ":got:");
4916 case SYMBOL_TINY_TLSIE:
4917 asm_fprintf (asm_out_file, ":gottprel:");
4923 output_addr_const (asm_out_file, x);
4928 switch (aarch64_classify_symbolic_expression (x))
4930 case SYMBOL_TLSLE24:
4931 asm_fprintf (asm_out_file, ":tprel_hi12:");
4936 output_addr_const (asm_out_file, x);
4941 HOST_WIDE_INT cond_code;
4944 if (!CONST_INT_P (x))
4946 output_operand_lossage ("invalid operand for '%%%c'", code);
4950 cond_code = INTVAL (x);
4951 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
4952 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
4957 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4963 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
4965 struct aarch64_address_info addr;
4967 if (aarch64_classify_address (&addr, x, mode, MEM, true))
4970 case ADDRESS_REG_IMM:
4971 if (addr.offset == const0_rtx)
4972 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4974 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4975 INTVAL (addr.offset));
4978 case ADDRESS_REG_REG:
4979 if (addr.shift == 0)
4980 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4981 reg_names [REGNO (addr.offset)]);
4983 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4984 reg_names [REGNO (addr.offset)], addr.shift);
4987 case ADDRESS_REG_UXTW:
4988 if (addr.shift == 0)
4989 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4990 REGNO (addr.offset) - R0_REGNUM);
4992 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4993 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4996 case ADDRESS_REG_SXTW:
4997 if (addr.shift == 0)
4998 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4999 REGNO (addr.offset) - R0_REGNUM);
5001 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5002 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5005 case ADDRESS_REG_WB:
5006 switch (GET_CODE (x))
5009 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5010 GET_MODE_SIZE (mode));
5013 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5014 GET_MODE_SIZE (mode));
5017 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5018 GET_MODE_SIZE (mode));
5021 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5022 GET_MODE_SIZE (mode));
5025 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5026 INTVAL (addr.offset));
5029 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5030 INTVAL (addr.offset));
5037 case ADDRESS_LO_SUM:
5038 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5039 output_addr_const (f, addr.offset);
5040 asm_fprintf (f, "]");
5043 case ADDRESS_SYMBOLIC:
5047 output_addr_const (f, x);
5051 aarch64_label_mentioned_p (rtx x)
5056 if (GET_CODE (x) == LABEL_REF)
5059 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5060 referencing instruction, but they are constant offsets, not
5062 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5065 fmt = GET_RTX_FORMAT (GET_CODE (x));
5066 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5072 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5073 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5076 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5083 /* Implement REGNO_REG_CLASS. */
5086 aarch64_regno_regclass (unsigned regno)
5088 if (GP_REGNUM_P (regno))
5089 return GENERAL_REGS;
5091 if (regno == SP_REGNUM)
5094 if (regno == FRAME_POINTER_REGNUM
5095 || regno == ARG_POINTER_REGNUM)
5096 return POINTER_REGS;
5098 if (FP_REGNUM_P (regno))
5099 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5105 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5107 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5108 where mask is selected by alignment and size of the offset.
5109 We try to pick as large a range for the offset as possible to
5110 maximize the chance of a CSE. However, for aligned addresses
5111 we limit the range to 4k so that structures with different sized
5112 elements are likely to use the same base. We need to be careful
5113 not to split a CONST for some forms of address expression, otherwise
5114 it will generate sub-optimal code. */
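/* Illustrative example: for a word-sized access at X + 0x13004 the aligned
   case below picks base_offset == 0x13000, so we materialise
   Y = X + 0x13000 once and address [Y, #4]; neighbouring offsets from the
   same object can then reuse the base Y.  */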
5116 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5118 rtx base = XEXP (x, 0);
5119 rtx offset_rtx = XEXP (x, 1);
5120 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5122 if (GET_CODE (base) == PLUS)
5124 rtx op0 = XEXP (base, 0);
5125 rtx op1 = XEXP (base, 1);
5127 /* Force any scaling into a temp for CSE. */
5128 op0 = force_reg (Pmode, op0);
5129 op1 = force_reg (Pmode, op1);
5131 /* Let the pointer register be in op0. */
5132 if (REG_POINTER (op1))
5133 std::swap (op0, op1);
5135 /* If the pointer is virtual or frame related, then we know that
5136 virtual register instantiation or register elimination is going
5137 to apply a second constant. We want the two constants folded
5138 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5139 if (virt_or_elim_regno_p (REGNO (op0)))
5141 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5142 NULL_RTX, true, OPTAB_DIRECT);
5143 return gen_rtx_PLUS (Pmode, base, op1);
5146 /* Otherwise, in order to encourage CSE (and thence loop strength
5147 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5148 base = expand_binop (Pmode, add_optab, op0, op1,
5149 NULL_RTX, true, OPTAB_DIRECT);
5150 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5153 /* Does it look like we'll need a load/store-pair operation? */
5154 HOST_WIDE_INT base_offset;
5155 if (GET_MODE_SIZE (mode) > 16
5157 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
5158 & ~((128 * GET_MODE_SIZE (mode)) - 1));
5159 /* For offsets that aren't a multiple of the access size, the limit is
5161 else if (offset & (GET_MODE_SIZE (mode) - 1))
5162 base_offset = (offset + 0x100) & ~0x1ff;
5164 base_offset = offset & ~0xfff;
5166 if (base_offset != 0)
5168 base = plus_constant (Pmode, base, base_offset);
5169 base = force_operand (base, NULL_RTX);
5170 return plus_constant (Pmode, base, offset - base_offset);
5177 /* Return the reload icode required for a constant pool in mode. */
5178 static enum insn_code
5179 aarch64_constant_pool_reload_icode (machine_mode mode)
5184 return CODE_FOR_aarch64_reload_movcpsfdi;
5187 return CODE_FOR_aarch64_reload_movcpdfdi;
5190 return CODE_FOR_aarch64_reload_movcptfdi;
5193 return CODE_FOR_aarch64_reload_movcpv8qidi;
5196 return CODE_FOR_aarch64_reload_movcpv16qidi;
5199 return CODE_FOR_aarch64_reload_movcpv4hidi;
5202 return CODE_FOR_aarch64_reload_movcpv8hidi;
5205 return CODE_FOR_aarch64_reload_movcpv2sidi;
5208 return CODE_FOR_aarch64_reload_movcpv4sidi;
5211 return CODE_FOR_aarch64_reload_movcpv2didi;
5214 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5223 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5226 secondary_reload_info *sri)
5229 /* If we have to disable direct literal pool loads and stores because the
5230 function is too big, then we need a scratch register. */
5231 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5232 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5233 || targetm.vector_mode_supported_p (GET_MODE (x)))
5234 && !aarch64_pcrelative_literal_loads)
5236 sri->icode = aarch64_constant_pool_reload_icode (mode);
5240 /* Without the TARGET_SIMD instructions we cannot move a Q register
5241 to a Q register directly. We need a scratch. */
5242 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5243 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5244 && reg_class_subset_p (rclass, FP_REGS))
5247 sri->icode = CODE_FOR_aarch64_reload_movtf;
5248 else if (mode == TImode)
5249 sri->icode = CODE_FOR_aarch64_reload_movti;
5253 /* A TFmode or TImode memory access should be handled via FP_REGS
5254 because AArch64 has richer addressing modes for LDR/STR instructions
5255 than LDP/STP instructions. */
5256 if (TARGET_FLOAT && rclass == GENERAL_REGS
5257 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5260 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5261 return GENERAL_REGS;
5267 aarch64_can_eliminate (const int from, const int to)
5269 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5270 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5272 if (frame_pointer_needed)
5274 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5276 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5278 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5279 && !cfun->calls_alloca)
5281 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5288 /* If we decided that we didn't need a leaf frame pointer but then used
5289 LR in the function, then we'll want a frame pointer after all, so
5290 prevent this elimination to ensure a frame pointer is used. */
5291 if (to == STACK_POINTER_REGNUM
5292 && flag_omit_leaf_frame_pointer
5293 && df_regs_ever_live_p (LR_REGNUM))
5301 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5303 aarch64_layout_frame ();
5305 if (to == HARD_FRAME_POINTER_REGNUM)
5307 if (from == ARG_POINTER_REGNUM)
5308 return cfun->machine->frame.hard_fp_offset;
5310 if (from == FRAME_POINTER_REGNUM)
5311 return cfun->machine->frame.hard_fp_offset
5312 - cfun->machine->frame.locals_offset;
5315 if (to == STACK_POINTER_REGNUM)
5317 if (from == FRAME_POINTER_REGNUM)
5318 return cfun->machine->frame.frame_size
5319 - cfun->machine->frame.locals_offset;
5322 return cfun->machine->frame.frame_size;
5325 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5329 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5333 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5338 aarch64_asm_trampoline_template (FILE *f)
5342 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5343 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5347 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5348 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5350 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5351 assemble_aligned_integer (4, const0_rtx);
5352 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5353 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5357 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5359 rtx fnaddr, mem, a_tramp;
5360 const int tramp_code_sz = 16;
5362 /* Don't need to copy the trailing D-words, we fill those in below. */
5363 emit_block_move (m_tramp, assemble_trampoline_template (),
5364 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5365 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5366 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5367 if (GET_MODE (fnaddr) != ptr_mode)
5368 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5369 emit_move_insn (mem, fnaddr);
5371 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5372 emit_move_insn (mem, chain_value);
5374 /* XXX We should really define a "clear_cache" pattern and use
5375 gen_clear_cache(). */
5376 a_tramp = XEXP (m_tramp, 0);
5377 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5378 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5379 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5383 static unsigned char
5384 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5388 case CALLER_SAVE_REGS:
5395 aarch64_vector_mode_p (mode)
5396 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5397 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
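/* For example, V4SImode (16 bytes, a vector mode) needs a single vector
   register here, whereas TImode (also 16 bytes but not a vector mode)
   counts as two word-sized registers.  */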
5411 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5413 if (regclass == POINTER_REGS)
5414 return GENERAL_REGS;
5416 if (regclass == STACK_REG)
5419 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5425 /* If it's an integer immediate that MOVI can't handle, then
5426 FP_REGS is not an option, so we return NO_REGS instead. */
5427 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5428 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5431 /* Register elimination can result in a request for
5432 SP+constant->FP_REGS. We cannot support such operations which
5433 use SP as source and an FP_REG as destination, so reject out of hand.
5435 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5437 rtx lhs = XEXP (x, 0);
5439 /* Look through a possible SUBREG introduced by ILP32. */
5440 if (GET_CODE (lhs) == SUBREG)
5441 lhs = SUBREG_REG (lhs);
5443 gcc_assert (REG_P (lhs));
5444 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5453 aarch64_asm_output_labelref (FILE* f, const char *name)
5455 asm_fprintf (f, "%U%s", name);
5459 aarch64_elf_asm_constructor (rtx symbol, int priority)
5461 if (priority == DEFAULT_INIT_PRIORITY)
5462 default_ctor_section_asm_out_constructor (symbol, priority);
5466 /* While priority is known to be in range [0, 65535], so 18 bytes
5467 would be enough, the compiler might not know that. To avoid
5468 -Wformat-truncation false positive, use a larger size. */
5470 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5471 s = get_section (buf, SECTION_WRITE, NULL);
5472 switch_to_section (s);
5473 assemble_align (POINTER_SIZE);
5474 assemble_aligned_integer (POINTER_BYTES, symbol);
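/* For example, a constructor with priority 101 is placed in the section
   ".init_array.00101" (the "%.5u" above zero-pads the priority to five
   digits).  */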
5479 aarch64_elf_asm_destructor (rtx symbol, int priority)
5481 if (priority == DEFAULT_INIT_PRIORITY)
5482 default_dtor_section_asm_out_destructor (symbol, priority);
5486 /* While priority is known to be in range [0, 65535], so 18 bytes
5487 would be enough, the compiler might not know that. To avoid
5488 -Wformat-truncation false positive, use a larger size. */
5490 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5491 s = get_section (buf, SECTION_WRITE, NULL);
5492 switch_to_section (s);
5493 assemble_align (POINTER_SIZE);
5494 assemble_aligned_integer (POINTER_BYTES, symbol);
5499 aarch64_output_casesi (rtx *operands)
5503 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5505 static const char *const patterns[4][2] =
5508 "ldrb\t%w3, [%0,%w1,uxtw]",
5509 "add\t%3, %4, %w3, sxtb #2"
5512 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5513 "add\t%3, %4, %w3, sxth #2"
5516 "ldr\t%w3, [%0,%w1,uxtw #2]",
5517 "add\t%3, %4, %w3, sxtw #2"
5519 /* We assume that DImode is only generated when not optimizing and
5520 that we don't really need 64-bit address offsets. That would
5521 imply an object file with 8GB of code in a single function! */
5523 "ldr\t%w3, [%0,%w1,uxtw #2]",
5524 "add\t%3, %4, %w3, sxtw #2"
5528 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5530 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5532 gcc_assert (index >= 0 && index <= 3);
5534 /* Need to implement table size reduction, by changing the code below. */
5535 output_asm_insn (patterns[index][0], operands);
5536 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5537 snprintf (buf, sizeof (buf),
5538 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5539 output_asm_insn (buf, operands);
5540 output_asm_insn (patterns[index][1], operands);
5541 output_asm_insn ("br\t%3", operands);
5542 assemble_label (asm_out_file, label);
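/* Illustrative output for a halfword-entry table (index == 1), with the
   operand placeholders shown as concrete registers, is roughly:
	ldrh	w3, [x0, w1, uxtw #1]
	adr	x4, .LrtxN
	add	x3, x4, w3, sxth #2
	br	x3
   .LrtxN:
   where .LrtxN, emitted just after the br, is the base against which the
   table entries are relative.  */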
5547 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5548 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5552 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5554 if (shift >= 0 && shift <= 3)
5557 for (size = 8; size <= 32; size *= 2)
5559 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5560 if (mask == bits << shift)
5567 /* Constant pools are per function only when PC relative
5568 literal loads are true or we are in the large memory
5572 aarch64_can_use_per_function_literal_pools_p (void)
5574 return (aarch64_pcrelative_literal_loads
5575 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5579 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5581 /* Fixme:: In an ideal world this would work similarly
5582 to the logic in aarch64_select_rtx_section, but this
5583 breaks bootstrap in gccgo. For now we work around
5584 this by returning false here. */
5588 /* Select appropriate section for constants depending
5589 on where we place literal pools. */
5592 aarch64_select_rtx_section (machine_mode mode,
5594 unsigned HOST_WIDE_INT align)
5596 if (aarch64_can_use_per_function_literal_pools_p ())
5597 return function_section (current_function_decl);
5599 return default_elf_select_rtx_section (mode, x, align);
5602 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
5604 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
5605 HOST_WIDE_INT offset)
5607 /* When using per-function literal pools, we must ensure that any code
5608 section is aligned to the minimal instruction length, lest we get
5609 errors from the assembler re "unaligned instructions". */
5610 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
5611 ASM_OUTPUT_ALIGN (f, 2);
5616 /* Helper function for rtx cost calculation. Strip a shift expression
5617 from X. Returns the inner operand if successful, or the original
5618 expression on failure. */
5620 aarch64_strip_shift (rtx x)
5624 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5625 we can convert both to ROR during final output. */
5626 if ((GET_CODE (op) == ASHIFT
5627 || GET_CODE (op) == ASHIFTRT
5628 || GET_CODE (op) == LSHIFTRT
5629 || GET_CODE (op) == ROTATERT
5630 || GET_CODE (op) == ROTATE)
5631 && CONST_INT_P (XEXP (op, 1)))
5632 return XEXP (op, 0);
5634 if (GET_CODE (op) == MULT
5635 && CONST_INT_P (XEXP (op, 1))
5636 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5637 return XEXP (op, 0);
5642 /* Helper function for rtx cost calculation. Strip an extend
5643 expression from X. Returns the inner operand if successful, or the
5644 original expression on failure. We deal with a number of possible
5645 canonicalization variations here. */
5647 aarch64_strip_extend (rtx x)
5651 /* Zero and sign extraction of a widened value. */
5652 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5653 && XEXP (op, 2) == const0_rtx
5654 && GET_CODE (XEXP (op, 0)) == MULT
5655 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5657 return XEXP (XEXP (op, 0), 0);
5659 /* It can also be represented (for zero-extend) as an AND with an
5661 if (GET_CODE (op) == AND
5662 && GET_CODE (XEXP (op, 0)) == MULT
5663 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5664 && CONST_INT_P (XEXP (op, 1))
5665 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5666 INTVAL (XEXP (op, 1))) != 0)
5667 return XEXP (XEXP (op, 0), 0);
5669 /* Now handle extended register, as this may also have an optional
5670 left shift by 1..4. */
5671 if (GET_CODE (op) == ASHIFT
5672 && CONST_INT_P (XEXP (op, 1))
5673 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5676 if (GET_CODE (op) == ZERO_EXTEND
5677 || GET_CODE (op) == SIGN_EXTEND)
5686 /* Return true iff CODE is a shift supported in combination
5687 with arithmetic instructions. */
5690 aarch64_shift_p (enum rtx_code code)
5692 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5695 /* Helper function for rtx cost calculation. Calculate the cost of
5696 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5697 Return the calculated cost of the expression, recursing manually into
5698 operands where needed. */
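/* For example, (plus (mult (reg) (const_int 4)) (reg)) is costed below as
   an add with a shifted operand (LSL #2) rather than as a multiply-add,
   since the power-of-two multiply is canonicalized as a shift.  */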
5701 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5704 const struct cpu_cost_table *extra_cost
5705 = aarch64_tune_params.insn_extra_cost;
5707 bool compound_p = (outer == PLUS || outer == MINUS);
5708 machine_mode mode = GET_MODE (x);
5710 gcc_checking_assert (code == MULT);
5715 if (VECTOR_MODE_P (mode))
5716 mode = GET_MODE_INNER (mode);
5718 /* Integer multiply/fma. */
5719 if (GET_MODE_CLASS (mode) == MODE_INT)
5721 /* The multiply will be canonicalized as a shift, cost it as such. */
5722 if (aarch64_shift_p (GET_CODE (x))
5723 || (CONST_INT_P (op1)
5724 && exact_log2 (INTVAL (op1)) > 0))
5726 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5727 || GET_CODE (op0) == SIGN_EXTEND;
5733 /* ARITH + shift-by-register. */
5734 cost += extra_cost->alu.arith_shift_reg;
5736 /* ARITH + extended register. We don't have a cost field
5737 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5738 cost += extra_cost->alu.extend_arith;
5740 /* ARITH + shift-by-immediate. */
5741 cost += extra_cost->alu.arith_shift;
5744 /* LSL (immediate). */
5745 cost += extra_cost->alu.shift;
5748 /* Strip extends as we will have costed them in the case above. */
5750 op0 = aarch64_strip_extend (op0);
5752 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
5757 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5758 compound and let the below cases handle it. After all, MNEG is a
5759 special-case alias of MSUB. */
5760 if (GET_CODE (op0) == NEG)
5762 op0 = XEXP (op0, 0);
5766 /* Integer multiplies or FMAs have zero/sign extending variants. */
5767 if ((GET_CODE (op0) == ZERO_EXTEND
5768 && GET_CODE (op1) == ZERO_EXTEND)
5769 || (GET_CODE (op0) == SIGN_EXTEND
5770 && GET_CODE (op1) == SIGN_EXTEND))
5772 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
5773 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
5778 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5779 cost += extra_cost->mult[0].extend_add;
5781 /* MUL/SMULL/UMULL. */
5782 cost += extra_cost->mult[0].extend;
5788 /* This is either an integer multiply or a MADD. In both cases
5789 we want to recurse and cost the operands. */
5790 cost += rtx_cost (op0, mode, MULT, 0, speed);
5791 cost += rtx_cost (op1, mode, MULT, 1, speed);
5797 cost += extra_cost->mult[mode == DImode].add;
5800 cost += extra_cost->mult[mode == DImode].simple;
5809 /* Floating-point FMA/FMUL can also support negations of the
5810 operands, unless the rounding mode is upward or downward in
5811 which case FNMUL is different from FMUL with operand negation. */
5812 bool neg0 = GET_CODE (op0) == NEG;
5813 bool neg1 = GET_CODE (op1) == NEG;
5814 if (compound_p || !flag_rounding_math || (neg0 && neg1))
5817 op0 = XEXP (op0, 0);
5819 op1 = XEXP (op1, 0);
5823 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5824 cost += extra_cost->fp[mode == DFmode].fma;
5827 cost += extra_cost->fp[mode == DFmode].mult;
5830 cost += rtx_cost (op0, mode, MULT, 0, speed);
5831 cost += rtx_cost (op1, mode, MULT, 1, speed);
5837 aarch64_address_cost (rtx x,
5839 addr_space_t as ATTRIBUTE_UNUSED,
5842 enum rtx_code c = GET_CODE (x);
5843 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5844 struct aarch64_address_info info;
5848 if (!aarch64_classify_address (&info, x, mode, c, false))
5850 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5852 /* This is a CONST or SYMBOL ref which will be split
5853 in a different way depending on the code model in use.
5854 Cost it through the generic infrastructure. */
5855 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
5856 /* Divide through by the cost of one instruction to
5857 bring it to the same units as the address costs. */
5858 cost_symbol_ref /= COSTS_N_INSNS (1);
5859 /* The cost is then the cost of preparing the address,
5860 followed by an immediate (possibly 0) offset. */
5861 return cost_symbol_ref + addr_cost->imm_offset;
5865 /* This is most likely a jump table from a case
5867 return addr_cost->register_offset;
5873 case ADDRESS_LO_SUM:
5874 case ADDRESS_SYMBOLIC:
5875 case ADDRESS_REG_IMM:
5876 cost += addr_cost->imm_offset;
5879 case ADDRESS_REG_WB:
5880 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5881 cost += addr_cost->pre_modify;
5882 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5883 cost += addr_cost->post_modify;
5889 case ADDRESS_REG_REG:
5890 cost += addr_cost->register_offset;
5893 case ADDRESS_REG_SXTW:
5894 cost += addr_cost->register_sextend;
5897 case ADDRESS_REG_UXTW:
5898 cost += addr_cost->register_zextend;
5908 /* For the sake of calculating the cost of the shifted register
5909 component, we can treat same sized modes in the same way. */
5910 switch (GET_MODE_BITSIZE (mode))
5913 cost += addr_cost->addr_scale_costs.hi;
5917 cost += addr_cost->addr_scale_costs.si;
5921 cost += addr_cost->addr_scale_costs.di;
5924 /* We can't tell, or this is a 128-bit vector. */
5926 cost += addr_cost->addr_scale_costs.ti;
5934 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5935 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5939 aarch64_branch_cost (bool speed_p, bool predictable_p)
5941 /* When optimizing for speed, use the cost of unpredictable branches. */
5942 const struct cpu_branch_cost *branch_costs =
5943 aarch64_tune_params.branch_costs;
5945 if (!speed_p || predictable_p)
5946 return branch_costs->predictable;
5948 return branch_costs->unpredictable;
5951 /* Return true if the RTX X in mode MODE is a zero or sign extract
5952 usable in an ADD or SUB (extended register) instruction. */
5954 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5956 /* Catch add with a sign extract.
5957 This is add_<optab><mode>_multp2. */
5958 if (GET_CODE (x) == SIGN_EXTRACT
5959 || GET_CODE (x) == ZERO_EXTRACT)
5961 rtx op0 = XEXP (x, 0);
5962 rtx op1 = XEXP (x, 1);
5963 rtx op2 = XEXP (x, 2);
5965 if (GET_CODE (op0) == MULT
5966 && CONST_INT_P (op1)
5967 && op2 == const0_rtx
5968 && CONST_INT_P (XEXP (op0, 1))
5969 && aarch64_is_extend_from_extract (mode,
5976 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
5978 else if (GET_CODE (x) == SIGN_EXTEND
5979 || GET_CODE (x) == ZERO_EXTEND)
5980 return REG_P (XEXP (x, 0));
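/* For example, a (zero_extend:DI (reg:SI)) operand of a DImode PLUS
   satisfies the second case above and corresponds to the extended-register
   form ADD Xd, Xn, Wm, uxtw.  */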
5986 aarch64_frint_unspec_p (unsigned int u)
6004 /* Return true iff X is an rtx that will match an extr instruction
6005 i.e. as described in the *extr<mode>5_insn family of patterns.
6006 OP0 and OP1 will be set to the operands of the shifts involved
6007 on success and will be NULL_RTX otherwise. */
6010 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6013 machine_mode mode = GET_MODE (x);
6015 *res_op0 = NULL_RTX;
6016 *res_op1 = NULL_RTX;
6018 if (GET_CODE (x) != IOR)
6024 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6025 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6027 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6028 if (GET_CODE (op1) == ASHIFT)
6029 std::swap (op0, op1);
6031 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6034 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6035 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6037 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6038 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6040 *res_op0 = XEXP (op0, 0);
6041 *res_op1 = XEXP (op1, 0);
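/* Illustrative DImode example: (ior (ashift a (const_int 48))
   (lshiftrt b (const_int 16))) matches because 48 + 16 == 64, giving
   *res_op0 == a and *res_op1 == b, i.e. roughly EXTR Xd, Xa, Xb, #16.  */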
6049 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6050 storing it in *COST. Result is true if the total cost of the operation
6051 has now been calculated. */
6053 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6057 enum rtx_code cmpcode;
6059 if (COMPARISON_P (op0))
6061 inner = XEXP (op0, 0);
6062 comparator = XEXP (op0, 1);
6063 cmpcode = GET_CODE (op0);
6068 comparator = const0_rtx;
6072 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6074 /* Conditional branch. */
6075 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6079 if (cmpcode == NE || cmpcode == EQ)
6081 if (comparator == const0_rtx)
6083 /* TBZ/TBNZ/CBZ/CBNZ. */
6084 if (GET_CODE (inner) == ZERO_EXTRACT)
6086 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6087 ZERO_EXTRACT, 0, speed);
6090 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6095 else if (cmpcode == LT || cmpcode == GE)
6098 if (comparator == const0_rtx)
6103 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6106 if (GET_CODE (op1) == COMPARE)
6108 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6109 if (XEXP (op1, 1) == const0_rtx)
6113 machine_mode mode = GET_MODE (XEXP (op1, 0));
6114 const struct cpu_cost_table *extra_cost
6115 = aarch64_tune_params.insn_extra_cost;
6117 if (GET_MODE_CLASS (mode) == MODE_INT)
6118 *cost += extra_cost->alu.arith;
6120 *cost += extra_cost->fp[mode == DFmode].compare;
6125 /* It's a conditional operation based on the status flags,
6126 so it must be some flavor of CSEL. */
6128 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6129 if (GET_CODE (op1) == NEG
6130 || GET_CODE (op1) == NOT
6131 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6132 op1 = XEXP (op1, 0);
6133 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6135 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6136 op1 = XEXP (op1, 0);
6137 op2 = XEXP (op2, 0);
6140 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6141 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6145 /* We don't know what this is, cost all operands. */
6149 /* Check whether X is a bitfield operation of the form shift + extend that
6150 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6151 operand to which the bitfield operation is applied. Otherwise return
6155 aarch64_extend_bitfield_pattern_p (rtx x)
6157 rtx_code outer_code = GET_CODE (x);
6158 machine_mode outer_mode = GET_MODE (x);
6160 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6161 && outer_mode != SImode && outer_mode != DImode)
6164 rtx inner = XEXP (x, 0);
6165 rtx_code inner_code = GET_CODE (inner);
6166 machine_mode inner_mode = GET_MODE (inner);
6172 if (CONST_INT_P (XEXP (inner, 1))
6173 && (inner_mode == QImode || inner_mode == HImode))
6174 op = XEXP (inner, 0);
6177 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6178 && (inner_mode == QImode || inner_mode == HImode))
6179 op = XEXP (inner, 0);
6182 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6183 && (inner_mode == QImode || inner_mode == HImode))
6184 op = XEXP (inner, 0);
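/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI) (const_int 4)))
   satisfies the ZERO_EXTEND test above and returns the HImode register;
   such a shift + extend combination maps to a UBFX-style extract.  */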
6193 /* Return true if the mask and a shift amount from an RTX of the form
6194 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6195 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6198 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6200 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6201 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6202 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6203 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
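/* Worked example (illustrative): in SImode, mask == 0x00ffff00 with
   shft_amnt == 8 is accepted: (mask >> 8) + 1 == 0x10000 is a power of two
   and the low 8 bits of the mask are clear, so (x << 8) & 0x00ffff00 can
   become UBFIZ Wd, Wn, #8, #16.  */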
6206 /* Calculate the cost of calculating X, storing it in *COST. Result
6207 is true if the total cost of the operation has now been calculated. */
6209 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6210 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6213 const struct cpu_cost_table *extra_cost
6214 = aarch64_tune_params.insn_extra_cost;
6215 int code = GET_CODE (x);
6217 /* By default, assume that everything has equivalent cost to the
6218 cheapest instruction. Any additional costs are applied as a delta
6219 above this default. */
6220 *cost = COSTS_N_INSNS (1);
6225 /* The cost depends entirely on the operands to SET. */
6230 switch (GET_CODE (op0))
6235 rtx address = XEXP (op0, 0);
6236 if (VECTOR_MODE_P (mode))
6237 *cost += extra_cost->ldst.storev;
6238 else if (GET_MODE_CLASS (mode) == MODE_INT)
6239 *cost += extra_cost->ldst.store;
6240 else if (mode == SFmode)
6241 *cost += extra_cost->ldst.storef;
6242 else if (mode == DFmode)
6243 *cost += extra_cost->ldst.stored;
6246 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6250 *cost += rtx_cost (op1, mode, SET, 1, speed);
6254 if (! REG_P (SUBREG_REG (op0)))
6255 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6259 /* The cost is one per vector-register copied. */
6260 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6262 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6263 / GET_MODE_SIZE (V4SImode);
6264 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6266 /* const0_rtx is in general free, but we will use an
6267 instruction to set a register to 0. */
6268 else if (REG_P (op1) || op1 == const0_rtx)
6270 /* The cost is 1 per register copied. */
6271 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6273 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6276 /* Cost is just the cost of the RHS of the set. */
6277 *cost += rtx_cost (op1, mode, SET, 1, speed);
6282 /* Bit-field insertion. Strip any redundant widening of
6283 the RHS to meet the width of the target. */
6284 if (GET_CODE (op1) == SUBREG)
6285 op1 = SUBREG_REG (op1);
6286 if ((GET_CODE (op1) == ZERO_EXTEND
6287 || GET_CODE (op1) == SIGN_EXTEND)
6288 && CONST_INT_P (XEXP (op0, 1))
6289 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6290 >= INTVAL (XEXP (op0, 1))))
6291 op1 = XEXP (op1, 0);
6293 if (CONST_INT_P (op1))
6295 /* MOV immediate is assumed to always be cheap. */
6296 *cost = COSTS_N_INSNS (1);
6302 *cost += extra_cost->alu.bfi;
6303 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6309 /* We can't make sense of this, assume default cost. */
6310 *cost = COSTS_N_INSNS (1);
6316 /* If an instruction can incorporate a constant within the
6317 instruction, the instruction's expression avoids calling
6318 rtx_cost() on the constant. If rtx_cost() is called on a
6319 constant, then it is usually because the constant must be
6320 moved into a register by one or more instructions.
6322 The exception is constant 0, which can be expressed
6323 as XZR/WZR and is therefore free. The exception to this is
6324 if we have (set (reg) (const0_rtx)) in which case we must cost
6325 the move. However, we can catch that when we cost the SET, so
6326 we don't need to consider that here. */
6327 if (x == const0_rtx)
6331 /* To an approximation, building any other constant is
6332 proportionally expensive to the number of instructions
6333 required to build that constant. This is true whether we
6334 are compiling for SPEED or otherwise. */
6335 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6336 (NULL_RTX, x, false, mode));
6343 /* mov[df,sf]_aarch64. */
6344 if (aarch64_float_const_representable_p (x))
6345 /* FMOV (scalar immediate). */
6346 *cost += extra_cost->fp[mode == DFmode].fpconst;
6347 else if (!aarch64_float_const_zero_rtx_p (x))
6349 /* This will be a load from memory. */
6351 *cost += extra_cost->ldst.loadd;
6353 *cost += extra_cost->ldst.loadf;
6356 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6357 or MOV v0.s[0], wzr - neither of which is modeled by the
6358 cost tables. Just use the default cost. */
6368 /* For loads we want the base cost of a load, plus an
6369 approximation for the additional cost of the addressing
6371 rtx address = XEXP (x, 0);
6372 if (VECTOR_MODE_P (mode))
6373 *cost += extra_cost->ldst.loadv;
6374 else if (GET_MODE_CLASS (mode) == MODE_INT)
6375 *cost += extra_cost->ldst.load;
6376 else if (mode == SFmode)
6377 *cost += extra_cost->ldst.loadf;
6378 else if (mode == DFmode)
6379 *cost += extra_cost->ldst.loadd;
6382 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6391 if (VECTOR_MODE_P (mode))
6396 *cost += extra_cost->vect.alu;
6401 if (GET_MODE_CLASS (mode) == MODE_INT)
6403 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6404 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6407 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6411 /* Cost this as SUB wzr, X. */
6412 op0 = CONST0_RTX (mode);
6417 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6419 /* Support (neg(fma...)) as a single instruction only if
6420 sign of zeros is unimportant. This matches the decision
6421 making in aarch64.md. */
6422 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6425 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6428 if (GET_CODE (op0) == MULT)
6431 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6436 *cost += extra_cost->fp[mode == DFmode].neg;
6446 if (VECTOR_MODE_P (mode))
6447 *cost += extra_cost->vect.alu;
6449 *cost += extra_cost->alu.clz;
6458 if (op1 == const0_rtx
6459 && GET_CODE (op0) == AND)
6462 mode = GET_MODE (op0);
6466 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6468 /* TODO: A write to the CC flags possibly costs extra, this
6469 needs encoding in the cost tables. */
6471 mode = GET_MODE (op0);
6473 if (GET_CODE (op0) == AND)
6479 if (GET_CODE (op0) == PLUS)
6481 /* ADDS (and CMN alias). */
6486 if (GET_CODE (op0) == MINUS)
6493 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6494 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6495 && CONST_INT_P (XEXP (op0, 2)))
6497 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6498 Handle it here directly rather than going to cost_logic
6499 since we know the immediate generated for the TST is valid
6500 so we can avoid creating an intermediate rtx for it only
6501 for costing purposes. */
6503 *cost += extra_cost->alu.logical;
6505 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6506 ZERO_EXTRACT, 0, speed);
6510 if (GET_CODE (op1) == NEG)
6514 *cost += extra_cost->alu.arith;
6516 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6517 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6523 Compare can freely swap the order of operands, and
6524 canonicalization puts the more complex operation first.
6525 But the integer MINUS logic expects the shift/extend
6526 operation in op1. */
6528 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6536 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6540 *cost += extra_cost->fp[mode == DFmode].compare;
6542 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6544 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6545 /* FCMP supports constant 0.0 for no extra cost. */
6551 if (VECTOR_MODE_P (mode))
6553 /* Vector compare. */
6555 *cost += extra_cost->vect.alu;
6557 if (aarch64_float_const_zero_rtx_p (op1))
6559 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6573 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6575 /* Detect valid immediates. */
6576 if ((GET_MODE_CLASS (mode) == MODE_INT
6577 || (GET_MODE_CLASS (mode) == MODE_CC
6578 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6579 && CONST_INT_P (op1)
6580 && aarch64_uimm12_shift (INTVAL (op1)))
6583 /* SUB(S) (immediate). */
6584 *cost += extra_cost->alu.arith;
6588 /* Look for SUB (extended register). */
6589 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6592 *cost += extra_cost->alu.extend_arith;
6594 op1 = aarch64_strip_extend (op1);
6595 *cost += rtx_cost (op1, VOIDmode,
6596 (enum rtx_code) GET_CODE (op1), 0, speed);
6600 rtx new_op1 = aarch64_strip_extend (op1);
6602 /* Cost this as an FMA-alike operation. */
6603 if ((GET_CODE (new_op1) == MULT
6604 || aarch64_shift_p (GET_CODE (new_op1)))
6607 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6608 (enum rtx_code) code,
6613 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6617 if (VECTOR_MODE_P (mode))
6620 *cost += extra_cost->vect.alu;
6622 else if (GET_MODE_CLASS (mode) == MODE_INT)
6625 *cost += extra_cost->alu.arith;
6627 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6630 *cost += extra_cost->fp[mode == DFmode].addsub;
6644 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6645 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6648 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6649 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6653 if (GET_MODE_CLASS (mode) == MODE_INT
6654 && CONST_INT_P (op1)
6655 && aarch64_uimm12_shift (INTVAL (op1)))
6657 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6660 /* ADD (immediate). */
6661 *cost += extra_cost->alu.arith;
6665 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6667 /* Look for ADD (extended register). */
6668 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6671 *cost += extra_cost->alu.extend_arith;
6673 op0 = aarch64_strip_extend (op0);
6674 *cost += rtx_cost (op0, VOIDmode,
6675 (enum rtx_code) GET_CODE (op0), 0, speed);
6679 /* Strip any extend, leave shifts behind as we will
6680 cost them through mult_cost. */
6681 new_op0 = aarch64_strip_extend (op0);
6683 if (GET_CODE (new_op0) == MULT
6684 || aarch64_shift_p (GET_CODE (new_op0)))
6686 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6691 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6695 if (VECTOR_MODE_P (mode))
6698 *cost += extra_cost->vect.alu;
6700 else if (GET_MODE_CLASS (mode) == MODE_INT)
6703 *cost += extra_cost->alu.arith;
6705 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6708 *cost += extra_cost->fp[mode == DFmode].addsub;
6715 *cost = COSTS_N_INSNS (1);
6719 if (VECTOR_MODE_P (mode))
6720 *cost += extra_cost->vect.alu;
6722 *cost += extra_cost->alu.rev;
6727 if (aarch_rev16_p (x))
6729 *cost = COSTS_N_INSNS (1);
6733 if (VECTOR_MODE_P (mode))
6734 *cost += extra_cost->vect.alu;
6736 *cost += extra_cost->alu.rev;
6741 if (aarch64_extr_rtx_p (x, &op0, &op1))
6743 *cost += rtx_cost (op0, mode, IOR, 0, speed);
6744 *cost += rtx_cost (op1, mode, IOR, 1, speed);
6746 *cost += extra_cost->alu.shift;
6757 if (VECTOR_MODE_P (mode))
6760 *cost += extra_cost->vect.alu;
6765 && GET_CODE (op0) == MULT
6766 && CONST_INT_P (XEXP (op0, 1))
6767 && CONST_INT_P (op1)
6768 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6771 /* This is a UBFM/SBFM. */
6772 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
6774 *cost += extra_cost->alu.bfx;
6778 if (GET_MODE_CLASS (mode) == MODE_INT)
6780 if (CONST_INT_P (op1))
6782 /* We have a mask + shift version of a UBFIZ
6783 i.e. the *andim_ashift<mode>_bfiz pattern. */
6784 if (GET_CODE (op0) == ASHIFT
6785 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
6788 *cost += rtx_cost (XEXP (op0, 0), mode,
6789 (enum rtx_code) code, 0, speed);
6791 *cost += extra_cost->alu.bfx;
6795 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
6797 /* We possibly get the immediate for free, this is not
6799 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6801 *cost += extra_cost->alu.logical;
6810 /* Handle ORN, EON, or BIC. */
6811 if (GET_CODE (op0) == NOT)
6812 op0 = XEXP (op0, 0);
6814 new_op0 = aarch64_strip_shift (op0);
6816 /* If we had a shift on op0 then this is a logical-shift-
6817 by-register/immediate operation. Otherwise, this is just
6818 a logical operation. */
6823 /* Shift by immediate. */
6824 if (CONST_INT_P (XEXP (op0, 1)))
6825 *cost += extra_cost->alu.log_shift;
6827 *cost += extra_cost->alu.log_shift_reg;
6830 *cost += extra_cost->alu.logical;
6833 /* In both cases we want to cost both operands. */
6834 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
6835 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
6844 op0 = aarch64_strip_shift (x);
6846 if (VECTOR_MODE_P (mode))
6849 *cost += extra_cost->vect.alu;
6853 /* MVN-shifted-reg. */
6856 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6859 *cost += extra_cost->alu.log_shift;
6863 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6864 Handle the second form here taking care that 'a' in the above can
6866 else if (GET_CODE (op0) == XOR)
6868 rtx newop0 = XEXP (op0, 0);
6869 rtx newop1 = XEXP (op0, 1);
6870 rtx op0_stripped = aarch64_strip_shift (newop0);
6872 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
6873 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6877 if (op0_stripped != newop0)
6878 *cost += extra_cost->alu.log_shift;
6880 *cost += extra_cost->alu.logical;
6887 *cost += extra_cost->alu.logical;
6894 /* If a value is written in SI mode, then zero extended to DI
6895 mode, the operation will in general be free as a write to
6896 a 'w' register implicitly zeroes the upper bits of an 'x'
6897 register. However, if this is
6899 (set (reg) (zero_extend (reg)))
6901 we must cost the explicit register move. */
6903 && GET_MODE (op0) == SImode
6906 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
6908 /* If OP_COST is non-zero, then the cost of the zero extend
6909 is effectively the cost of the inner operation. Otherwise
6910 we have a MOV instruction and we take the cost from the MOV
6911 itself. This is true independently of whether we are
6912 optimizing for space or time. */
6918 else if (MEM_P (op0))
6920 /* All loads can zero extend to any size for free. */
6921 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
6925 op0 = aarch64_extend_bitfield_pattern_p (x);
6928 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
6930 *cost += extra_cost->alu.bfx;
6936 if (VECTOR_MODE_P (mode))
6939 *cost += extra_cost->vect.alu;
6943 /* We generate an AND instead of UXTB/UXTH. */
6944 *cost += extra_cost->alu.logical;
6950 if (MEM_P (XEXP (x, 0)))
6955 rtx address = XEXP (XEXP (x, 0), 0);
6956 *cost += extra_cost->ldst.load_sign_extend;
6959 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6965 op0 = aarch64_extend_bitfield_pattern_p (x);
6968 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
6970 *cost += extra_cost->alu.bfx;
6976 if (VECTOR_MODE_P (mode))
6977 *cost += extra_cost->vect.alu;
6979 *cost += extra_cost->alu.extend;
6987 if (CONST_INT_P (op1))
6991 if (VECTOR_MODE_P (mode))
6993 /* Vector shift (immediate). */
6994 *cost += extra_cost->vect.alu;
6998 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
7000 *cost += extra_cost->alu.shift;
7004 /* We can incorporate zero/sign extend for free. */
7005 if (GET_CODE (op0) == ZERO_EXTEND
7006 || GET_CODE (op0) == SIGN_EXTEND)
7007 op0 = XEXP (op0, 0);
7009 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7016 if (VECTOR_MODE_P (mode))
7018 /* Vector shift (register). */
7019 *cost += extra_cost->vect.alu;
7024 *cost += extra_cost->alu.shift_reg;
7027 return false; /* All arguments need to be in registers. */
7037 if (CONST_INT_P (op1))
7039 /* ASR (immediate) and friends. */
7042 if (VECTOR_MODE_P (mode))
7043 *cost += extra_cost->vect.alu;
7045 *cost += extra_cost->alu.shift;
7048 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7054 /* ASR (register) and friends. */
7057 if (VECTOR_MODE_P (mode))
7058 *cost += extra_cost->vect.alu;
7060 *cost += extra_cost->alu.shift_reg;
7062 return false; /* All arguments need to be in registers. */
7067 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7068 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7072 *cost += extra_cost->ldst.load;
7074 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7075 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7077 /* ADRP, followed by ADD. */
7078 *cost += COSTS_N_INSNS (1);
7080 *cost += 2 * extra_cost->alu.arith;
7082 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7083 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7087 *cost += extra_cost->alu.arith;
7092 /* One extra load instruction, after accessing the GOT. */
7093 *cost += COSTS_N_INSNS (1);
7095 *cost += extra_cost->ldst.load;
7101 /* ADRP/ADD (immediate). */
7103 *cost += extra_cost->alu.arith;
7111 if (VECTOR_MODE_P (mode))
7112 *cost += extra_cost->vect.alu;
7114 *cost += extra_cost->alu.bfx;
7117 /* We can trust that the immediates used will be correct (there
7118 are no by-register forms), so we need only cost op0. */
7119 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7123 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7124 /* aarch64_rtx_mult_cost always handles recursion to its
7129 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7130 ANDs and a CSNEG. Assume here that a CSNEG costs the same as
7131 an unconditional negate. This case should only ever be reached through
7132 the set_smod_pow2_cheap check in expmed.c. */
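/* As a rough sketch of such an expansion, x % 4 becomes something like:

       negs  w1, w0            ; w1 = -x, flags set from -x
       and   w0, w0, 3         ; x & 3
       and   w1, w1, 3         ; (-x) & 3
       csneg w0, w0, w1, mi    ; x > 0 ? x & 3 : -((-x) & 3)

   hence the baseline of four instructions below.  */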
7133 if (CONST_INT_P (XEXP (x, 1))
7134 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7135 && (mode == SImode || mode == DImode))
7137 /* We expand to 4 instructions. Reset the baseline. */
7138 *cost = COSTS_N_INSNS (4);
7141 *cost += 2 * extra_cost->alu.logical
7142 + 2 * extra_cost->alu.arith;
7151 if (VECTOR_MODE_P (mode))
7152 *cost += extra_cost->vect.alu;
7153 else if (GET_MODE_CLASS (mode) == MODE_INT)
7154 *cost += (extra_cost->mult[mode == DImode].add
7155 + extra_cost->mult[mode == DImode].idiv);
7156 else if (mode == DFmode)
7157 *cost += (extra_cost->fp[1].mult
7158 + extra_cost->fp[1].div);
7159 else if (mode == SFmode)
7160 *cost += (extra_cost->fp[0].mult
7161 + extra_cost->fp[0].div);
7163 return false; /* All arguments need to be in registers. */
7170 if (VECTOR_MODE_P (mode))
7171 *cost += extra_cost->vect.alu;
7172 else if (GET_MODE_CLASS (mode) == MODE_INT)
7173 /* There is no integer SQRT, so only DIV and UDIV can get
7175 *cost += extra_cost->mult[mode == DImode].idiv;
7177 *cost += extra_cost->fp[mode == DFmode].div;
7179 return false; /* All arguments need to be in registers. */
7182 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7183 XEXP (x, 2), cost, speed);
7196 return false; /* All arguments must be in registers. */
7205 if (VECTOR_MODE_P (mode))
7206 *cost += extra_cost->vect.alu;
7208 *cost += extra_cost->fp[mode == DFmode].fma;
7211 /* FMSUB, FNMADD, and FNMSUB are free. */
7212 if (GET_CODE (op0) == NEG)
7213 op0 = XEXP (op0, 0);
7215 if (GET_CODE (op2) == NEG)
7216 op2 = XEXP (op2, 0);
7218 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7219 and the by-element operand as operand 0. */
7220 if (GET_CODE (op1) == NEG)
7221 op1 = XEXP (op1, 0);
7223 /* Catch vector-by-element operations. The by-element operand can
7224 either be (vec_duplicate (vec_select (x))) or just
7225 (vec_select (x)), depending on whether we are multiplying by
7226 a vector or a scalar.
7228 Canonicalization is not very good in these cases: FMA4 will put the
7229 by-element operand as operand 0, FNMA4 will have it as operand 1. */
7230 if (GET_CODE (op0) == VEC_DUPLICATE)
7231 op0 = XEXP (op0, 0);
7232 else if (GET_CODE (op1) == VEC_DUPLICATE)
7233 op1 = XEXP (op1, 0);
7235 if (GET_CODE (op0) == VEC_SELECT)
7236 op0 = XEXP (op0, 0);
7237 else if (GET_CODE (op1) == VEC_SELECT)
7238 op1 = XEXP (op1, 0);
7240 /* If the remaining parameters are not registers,
7241 get the cost to put them into registers. */
7242 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7243 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7244 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7248 case UNSIGNED_FLOAT:
7250 *cost += extra_cost->fp[mode == DFmode].fromint;
7256 if (VECTOR_MODE_P (mode))
7258 /* Vector truncate. */
7259 *cost += extra_cost->vect.alu;
7262 *cost += extra_cost->fp[mode == DFmode].widen;
7266 case FLOAT_TRUNCATE:
7269 if (VECTOR_MODE_P (mode))
7271 /* Vector conversion. */
7272 *cost += extra_cost->vect.alu;
7275 *cost += extra_cost->fp[mode == DFmode].narrow;
7282 /* Strip the rounding part. They will all be implemented
7283 by the fcvt* family of instructions anyway. */
7284 if (GET_CODE (x) == UNSPEC)
7286 unsigned int uns_code = XINT (x, 1);
7288 if (uns_code == UNSPEC_FRINTA
7289 || uns_code == UNSPEC_FRINTM
7290 || uns_code == UNSPEC_FRINTN
7291 || uns_code == UNSPEC_FRINTP
7292 || uns_code == UNSPEC_FRINTZ)
7293 x = XVECEXP (x, 0, 0);
7298 if (VECTOR_MODE_P (mode))
7299 *cost += extra_cost->vect.alu;
7301 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7304 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7305 fixed-point fcvt. */
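/* For example, (fix:SI (mult:SF (reg:SF) (const 4.0))) maps to
   FCVTZS Wd, Sn, #2 (a conversion with two fractional bits), so only
   the register operand of the multiply needs to be costed below.  */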
7306 if (GET_CODE (x) == MULT
7307 && ((VECTOR_MODE_P (mode)
7308 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7309 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7311 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7316 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7320 if (VECTOR_MODE_P (mode))
7324 *cost += extra_cost->vect.alu;
7326 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7330 /* FABD, which is analogous to FADD. */
7331 if (GET_CODE (op0) == MINUS)
7333 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7334 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7336 *cost += extra_cost->fp[mode == DFmode].addsub;
7340 /* Simple FABS is analogous to FNEG. */
7342 *cost += extra_cost->fp[mode == DFmode].neg;
7346 /* Integer ABS will either be split into
7347 two arithmetic instructions, or will be an ABS
7348 (scalar), which we don't model. */
7349 *cost = COSTS_N_INSNS (2);
7351 *cost += 2 * extra_cost->alu.arith;
7359 if (VECTOR_MODE_P (mode))
7360 *cost += extra_cost->vect.alu;
7363 /* FMAXNM/FMINNM/FMAX/FMIN.
7364 TODO: This may not be accurate for all implementations, but
7365 we do not model this in the cost tables. */
7366 *cost += extra_cost->fp[mode == DFmode].addsub;
7372 /* The floating point round to integer frint* instructions. */
7373 if (aarch64_frint_unspec_p (XINT (x, 1)))
7376 *cost += extra_cost->fp[mode == DFmode].roundint;
7381 if (XINT (x, 1) == UNSPEC_RBIT)
7384 *cost += extra_cost->alu.rev;
7392 /* Decompose <su>muldi3_highpart. */
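/* The pattern being matched is, in full:

     (truncate:DI
       (lshiftrt:TI
         (mult:TI (ANY_EXTEND:TI (reg:DI)) (ANY_EXTEND:TI (reg:DI)))
         (const_int 64)))

   i.e. the high half of a 64x64->128-bit multiply.  */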
7393 if (/* (truncate:DI */
7396 && GET_MODE (XEXP (x, 0)) == TImode
7397 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7399 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7400 /* (ANY_EXTEND:TI (reg:DI))
7401 (ANY_EXTEND:TI (reg:DI))) */
7402 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7403 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7404 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7405 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7406 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7407 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7408 /* (const_int 64) */
7409 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7410 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7414 *cost += extra_cost->mult[mode == DImode].extend;
7415 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7416 mode, MULT, 0, speed);
7417 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7418 mode, MULT, 1, speed);
7427 if (dump_file && (dump_flags & TDF_DETAILS))
7429 "\nFailed to cost RTX. Assuming default cost.\n");
7434 /* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
7435 calculated for X. This cost is stored in *COST. Returns true
7436 if the total cost of X was calculated. */
7438 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7439 int param, int *cost, bool speed)
7441 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7443 if (dump_file && (dump_flags & TDF_DETAILS))
7445 print_rtl_single (dump_file, x);
7446 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7447 speed ? "Hot" : "Cold",
7448 *cost, result ? "final" : "partial");
7455 aarch64_register_move_cost (machine_mode mode,
7456 reg_class_t from_i, reg_class_t to_i)
7458 enum reg_class from = (enum reg_class) from_i;
7459 enum reg_class to = (enum reg_class) to_i;
7460 const struct cpu_regmove_cost *regmove_cost
7461 = aarch64_tune_params.regmove_cost;
7463 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7464 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7467 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7468 from = GENERAL_REGS;
7470 /* Moving between GPRs and the stack costs the same as GP2GP. */
7471 if ((from == GENERAL_REGS && to == STACK_REG)
7472 || (to == GENERAL_REGS && from == STACK_REG))
7473 return regmove_cost->GP2GP;
7475 /* To/from the stack register, we move via the GPRs. */
7476 if (to == STACK_REG || from == STACK_REG)
7477 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7478 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7480 if (GET_MODE_SIZE (mode) == 16)
7482 /* 128-bit operations on general registers require 2 instructions. */
7483 if (from == GENERAL_REGS && to == GENERAL_REGS)
7484 return regmove_cost->GP2GP * 2;
7485 else if (from == GENERAL_REGS)
7486 return regmove_cost->GP2FP * 2;
7487 else if (to == GENERAL_REGS)
7488 return regmove_cost->FP2GP * 2;
7490 /* When AdvSIMD instructions are disabled it is not possible to move
7491 a 128-bit value directly between Q registers. This is handled in
7492 secondary reload. A general register is used as a scratch to move
7493 the upper DI value and the lower DI value is moved directly,
7494 hence the cost is the sum of three moves. */
7496 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7498 return regmove_cost->FP2FP;
7501 if (from == GENERAL_REGS && to == GENERAL_REGS)
7502 return regmove_cost->GP2GP;
7503 else if (from == GENERAL_REGS)
7504 return regmove_cost->GP2FP;
7505 else if (to == GENERAL_REGS)
7506 return regmove_cost->FP2GP;
7508 return regmove_cost->FP2FP;
7512 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7513 reg_class_t rclass ATTRIBUTE_UNUSED,
7514 bool in ATTRIBUTE_UNUSED)
7516 return aarch64_tune_params.memmov_cost;
7519 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7520 to optimize 1.0/sqrt. */
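/* When this returns true the compiler may replace, e.g., 1.0f / sqrtf (x)
   with an FRSQRTE estimate refined by FRSQRTS Newton-Raphson steps
   (see aarch64_emit_approx_sqrt below).  */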
7523 use_rsqrt_p (machine_mode mode)
7525 return (!flag_trapping_math
7526 && flag_unsafe_math_optimizations
7527 && ((aarch64_tune_params.approx_modes->recip_sqrt
7528 & AARCH64_APPROX_MODE (mode))
7529 || flag_mrecip_low_precision_sqrt));
7532 /* Function to decide when to use the approximate reciprocal square root
7536 aarch64_builtin_reciprocal (tree fndecl)
7538 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
7540 if (!use_rsqrt_p (mode))
7542 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7545 typedef rtx (*rsqrte_type) (rtx, rtx);
7547 /* Select reciprocal square root initial estimate insn depending on machine
7551 get_rsqrte_type (machine_mode mode)
7555 case DFmode: return gen_aarch64_rsqrtedf;
7556 case SFmode: return gen_aarch64_rsqrtesf;
7557 case V2DFmode: return gen_aarch64_rsqrtev2df;
7558 case V2SFmode: return gen_aarch64_rsqrtev2sf;
7559 case V4SFmode: return gen_aarch64_rsqrtev4sf;
7560 default: gcc_unreachable ();
7564 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7566 /* Select reciprocal square root series step insn depending on machine mode. */
7569 get_rsqrts_type (machine_mode mode)
7573 case DFmode: return gen_aarch64_rsqrtsdf;
7574 case SFmode: return gen_aarch64_rsqrtssf;
7575 case V2DFmode: return gen_aarch64_rsqrtsv2df;
7576 case V2SFmode: return gen_aarch64_rsqrtsv2sf;
7577 case V4SFmode: return gen_aarch64_rsqrtsv4sf;
7578 default: gcc_unreachable ();
7582 /* Emit instruction sequence to compute either the approximate square root
7583 or its approximate reciprocal, depending on the flag RECP, and return
7584 whether the sequence was emitted or not. */
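/* The method: start from the FRSQRTE estimate x0 ~= 1/sqrt(a) and apply
   Newton-Raphson steps x_{n+1} = x_n * (3 - a * x_n^2) / 2, where FRSQRTS
   computes (3 - a*b) / 2.  For the non-reciprocal case the result is then
   multiplied by a, since sqrt(a) = a * (1/sqrt(a)), with a zero input
   masked off so that sqrt(0) stays 0.  */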
7587 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
7589 machine_mode mode = GET_MODE (dst);
7591 if (GET_MODE_INNER (mode) == HFmode)
7594 machine_mode mmsk = mode_for_vector
7595 (int_mode_for_mode (GET_MODE_INNER (mode)),
7596 GET_MODE_NUNITS (mode));
7597 bool use_approx_sqrt_p = (!recp
7598 && (flag_mlow_precision_sqrt
7599 || (aarch64_tune_params.approx_modes->sqrt
7600 & AARCH64_APPROX_MODE (mode))));
7601 bool use_approx_rsqrt_p = (recp
7602 && (flag_mrecip_low_precision_sqrt
7603 || (aarch64_tune_params.approx_modes->recip_sqrt
7604 & AARCH64_APPROX_MODE (mode))));
7606 if (!flag_finite_math_only
7607 || flag_trapping_math
7608 || !flag_unsafe_math_optimizations
7609 || !(use_approx_sqrt_p || use_approx_rsqrt_p)
7610 || optimize_function_for_size_p (cfun))
7613 rtx xmsk = gen_reg_rtx (mmsk);
7615 /* When calculating the approximate square root, compare the argument with
7616 0.0 and create a mask. */
7617 emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
7618 CONST0_RTX (mode)))));
7620 /* Estimate the approximate reciprocal square root. */
7621 rtx xdst = gen_reg_rtx (mode);
7622 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
7624 /* Iterate over the series twice for SF and thrice for DF. */
7625 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
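/* Each step roughly doubles the number of correct bits in the estimate
   (FRSQRTE gives roughly 8), so two steps cover the 24-bit SFmode
   significand and three the 53-bit DFmode one.  */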
7627 /* Optionally iterate over the series once less for faster performance
7628 while sacrificing accuracy. */
7629 if ((recp && flag_mrecip_low_precision_sqrt)
7630 || (!recp && flag_mlow_precision_sqrt))
7633 /* Iterate over the series to calculate the approximate reciprocal square
7635 rtx x1 = gen_reg_rtx (mode);
7636 while (iterations--)
7638 rtx x2 = gen_reg_rtx (mode);
7639 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
7641 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
7644 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
7649 /* Qualify the approximate reciprocal square root when the argument is
7650 0.0 by squashing the intermediate result to 0.0. */
7651 rtx xtmp = gen_reg_rtx (mmsk);
7652 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
7653 gen_rtx_SUBREG (mmsk, xdst, 0)));
7654 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
7656 /* Calculate the approximate square root. */
7657 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
7660 /* Finalize the approximation. */
7661 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
7666 typedef rtx (*recpe_type) (rtx, rtx);
7668 /* Select reciprocal initial estimate insn depending on machine mode. */
7671 get_recpe_type (machine_mode mode)
7675 case SFmode: return (gen_aarch64_frecpesf);
7676 case V2SFmode: return (gen_aarch64_frecpev2sf);
7677 case V4SFmode: return (gen_aarch64_frecpev4sf);
7678 case DFmode: return (gen_aarch64_frecpedf);
7679 case V2DFmode: return (gen_aarch64_frecpev2df);
7680 default: gcc_unreachable ();
7684 typedef rtx (*recps_type) (rtx, rtx, rtx);
7686 /* Select reciprocal series step insn depending on machine mode. */
7689 get_recps_type (machine_mode mode)
7693 case SFmode: return (gen_aarch64_frecpssf);
7694 case V2SFmode: return (gen_aarch64_frecpsv2sf);
7695 case V4SFmode: return (gen_aarch64_frecpsv4sf);
7696 case DFmode: return (gen_aarch64_frecpsdf);
7697 case V2DFmode: return (gen_aarch64_frecpsv2df);
7698 default: gcc_unreachable ();
7702 /* Emit the instruction sequence to compute the approximation for the division
7703 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
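/* The method mirrors aarch64_emit_approx_sqrt: start from the FRECPE
   estimate x0 ~= 1/den and apply Newton-Raphson steps
   x_{n+1} = x_n * (2 - den * x_n), where FRECPS computes (2 - a*b);
   the quotient is then obtained as num * (1/den).  */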
7706 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
7708 machine_mode mode = GET_MODE (quo);
7710 if (GET_MODE_INNER (mode) == HFmode)
7713 bool use_approx_division_p = (flag_mlow_precision_div
7714 || (aarch64_tune_params.approx_modes->division
7715 & AARCH64_APPROX_MODE (mode)));
7717 if (!flag_finite_math_only
7718 || flag_trapping_math
7719 || !flag_unsafe_math_optimizations
7720 || optimize_function_for_size_p (cfun)
7721 || !use_approx_division_p)
7724 /* Estimate the approximate reciprocal. */
7725 rtx xrcp = gen_reg_rtx (mode);
7726 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
7728 /* Iterate over the series twice for SF and thrice for DF. */
7729 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7731 /* Optionally iterate over the series once less for faster performance,
7732 while sacrificing accuracy. */
7733 if (flag_mlow_precision_div)
7736 /* Iterate over the series to calculate the approximate reciprocal. */
7737 rtx xtmp = gen_reg_rtx (mode);
7738 while (iterations--)
7740 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
7743 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
7746 if (num != CONST1_RTX (mode))
7748 /* As the approximate reciprocal of DEN is already calculated, only
7749 calculate the approximate division when NUM is not 1.0. */
7750 rtx xnum = force_reg (mode, num);
7751 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
7754 /* Finalize the approximation. */
7755 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
7759 /* Return the number of instructions that can be issued per cycle. */
7761 aarch64_sched_issue_rate (void)
7763 return aarch64_tune_params.issue_rate;
7767 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
7769 int issue_rate = aarch64_sched_issue_rate ();
7771 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
7775 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
7776 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
7777 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
7780 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
7783 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
7787 /* Vectorizer cost model target hooks. */
7789 /* Implement targetm.vectorize.builtin_vectorization_cost. */
7791 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
7793 int misalign ATTRIBUTE_UNUSED)
7797 switch (type_of_cost)
7800 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
7803 return aarch64_tune_params.vec_costs->scalar_load_cost;
7806 return aarch64_tune_params.vec_costs->scalar_store_cost;
7809 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7812 return aarch64_tune_params.vec_costs->vec_align_load_cost;
7815 return aarch64_tune_params.vec_costs->vec_store_cost;
7818 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7821 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7823 case unaligned_load:
7824 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7826 case unaligned_store:
7827 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7829 case cond_branch_taken:
7830 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7832 case cond_branch_not_taken:
7833 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7836 return aarch64_tune_params.vec_costs->vec_permute_cost;
7838 case vec_promote_demote:
7839 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7842 elements = TYPE_VECTOR_SUBPARTS (vectype);
7843 return elements / 2 + 1;
7850 /* Implement targetm.vectorize.add_stmt_cost. */
7852 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7853 struct _stmt_vec_info *stmt_info, int misalign,
7854 enum vect_cost_model_location where)
7856 unsigned *cost = (unsigned *) data;
7857 unsigned retval = 0;
7859 if (flag_vect_cost_model)
7861 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7863 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7865 /* Statements in an inner loop relative to the loop being
7866 vectorized are weighted more heavily. The value here is
7867 arbitrary and could potentially be improved with analysis. */
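/* For example, a statement costed at 1 by the hook above counts as 50
   when it sits in a loop nested inside the loop being vectorized.  */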
7868 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7869 count *= 50; /* FIXME */
7871 retval = (unsigned) (count * stmt_cost);
7872 cost[where] += retval;
7878 static void initialize_aarch64_code_model (struct gcc_options *);
7880 /* Parse the TO_PARSE string and put the architecture struct that it
7881 selects into RES and the architectural features into ISA_FLAGS.
7882 Return an aarch64_parse_opt_result describing the parse result.
7883 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
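/* For example, for -march=armv8-a+crc the architecture name is matched
   up to the first '+' and the remainder ("+crc") is handed to
   aarch64_parse_extension.  */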
7885 static enum aarch64_parse_opt_result
7886 aarch64_parse_arch (const char *to_parse, const struct processor **res,
7887 unsigned long *isa_flags)
7890 const struct processor *arch;
7891 char *str = (char *) alloca (strlen (to_parse) + 1);
7894 strcpy (str, to_parse);
7896 ext = strchr (str, '+');
7904 return AARCH64_PARSE_MISSING_ARG;
7907 /* Loop through the list of supported ARCHes to find a match. */
7908 for (arch = all_architectures; arch->name != NULL; arch++)
7910 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7912 unsigned long isa_temp = arch->flags;
7916 /* TO_PARSE string contains at least one extension. */
7917 enum aarch64_parse_opt_result ext_res
7918 = aarch64_parse_extension (ext, &isa_temp);
7920 if (ext_res != AARCH64_PARSE_OK)
7923 /* Extension parsing was successful. Confirm the result
7924 arch and ISA flags. */
7926 *isa_flags = isa_temp;
7927 return AARCH64_PARSE_OK;
7931 /* ARCH name not found in list. */
7932 return AARCH64_PARSE_INVALID_ARG;
7935 /* Parse the TO_PARSE string and put the CPU it selects into RES and its
7936 architecture flags into ISA_FLAGS. Return an aarch64_parse_opt_result
7937 describing the parse result. If there is an error parsing, RES and
7938 ISA_FLAGS are left unchanged. */
7940 static enum aarch64_parse_opt_result
7941 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
7942 unsigned long *isa_flags)
7945 const struct processor *cpu;
7946 char *str = (char *) alloca (strlen (to_parse) + 1);
7949 strcpy (str, to_parse);
7951 ext = strchr (str, '+');
7959 return AARCH64_PARSE_MISSING_ARG;
7962 /* Loop through the list of supported CPUs to find a match. */
7963 for (cpu = all_cores; cpu->name != NULL; cpu++)
7965 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7967 unsigned long isa_temp = cpu->flags;
7972 /* TO_PARSE string contains at least one extension. */
7973 enum aarch64_parse_opt_result ext_res
7974 = aarch64_parse_extension (ext, &isa_temp);
7976 if (ext_res != AARCH64_PARSE_OK)
7979 /* Extension parsing was successful. Confirm the result
7980 cpu and ISA flags. */
7982 *isa_flags = isa_temp;
7983 return AARCH64_PARSE_OK;
7987 /* CPU name not found in list. */
7988 return AARCH64_PARSE_INVALID_ARG;
7991 /* Parse the TO_PARSE string and put the cpu it selects into RES.
7992 Return an aarch64_parse_opt_result describing the parse result.
7993 If the parsing fails, RES does not change. */
7995 static enum aarch64_parse_opt_result
7996 aarch64_parse_tune (const char *to_parse, const struct processor **res)
7998 const struct processor *cpu;
7999 char *str = (char *) alloca (strlen (to_parse) + 1);
8001 strcpy (str, to_parse);
8003 /* Loop through the list of supported CPUs to find a match. */
8004 for (cpu = all_cores; cpu->name != NULL; cpu++)
8006 if (strcmp (cpu->name, str) == 0)
8009 return AARCH64_PARSE_OK;
8013 /* CPU name not found in list. */
8014 return AARCH64_PARSE_INVALID_ARG;
8017 /* Parse TOKEN, which has length LENGTH, to see if it is an option
8018 described in FLAG. If it is, return the index bit for that fusion type.
8019 If not, error (printing OPTION_NAME) and return zero. */
8022 aarch64_parse_one_option_token (const char *token,
8024 const struct aarch64_flag_desc *flag,
8025 const char *option_name)
8027 for (; flag->name != NULL; flag++)
8029 if (length == strlen (flag->name)
8030 && !strncmp (flag->name, token, length))
8034 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8038 /* Parse OPTION which is a comma-separated list of flags to enable.
8039 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8040 default state we inherit from the CPU tuning structures. OPTION_NAME
8041 gives the top-level option we are parsing in the -moverride string,
8042 for use in error messages. */
8045 aarch64_parse_boolean_options (const char *option,
8046 const struct aarch64_flag_desc *flags,
8047 unsigned int initial_state,
8048 const char *option_name)
8050 const char separator = '.';
8051 const char* specs = option;
8052 const char* ntoken = option;
8053 unsigned int found_flags = initial_state;
8055 while ((ntoken = strchr (specs, separator)))
8057 size_t token_length = ntoken - specs;
8058 unsigned token_ops = aarch64_parse_one_option_token (specs,
8062 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8063 in the token stream, reset the supported operations. So:
8065 adrp+add.cmp+branch.none.adrp+add
8067 would have the result of turning on only adrp+add fusion. */
8071 found_flags |= token_ops;
8075 /* We ended with a trailing separator; report the ill-formed string. */
8078 error ("%s string ill-formed\n", option_name);
8082 /* We still have one more token to parse. */
8083 size_t token_length = strlen (specs);
8084 unsigned token_ops = aarch64_parse_one_option_token (specs,
8091 found_flags |= token_ops;
8095 /* Support for overriding instruction fusion. */
8098 aarch64_parse_fuse_string (const char *fuse_string,
8099 struct tune_params *tune)
8101 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8102 aarch64_fusible_pairs,
8107 /* Support for overriding other tuning flags. */
8110 aarch64_parse_tune_string (const char *tune_string,
8111 struct tune_params *tune)
8113 tune->extra_tuning_flags
8114 = aarch64_parse_boolean_options (tune_string,
8115 aarch64_tuning_flags,
8116 tune->extra_tuning_flags,
8120 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8121 we understand. If it is, extract the option string and hand it off to
8122 the appropriate function. */
8125 aarch64_parse_one_override_token (const char* token,
8127 struct tune_params *tune)
8129 const struct aarch64_tuning_override_function *fn
8130 = aarch64_tuning_override_functions;
8132 const char *option_part = strchr (token, '=');
8135 error ("tuning string missing in option (%s)", token);
8139 /* Get the length of the option name. */
8140 length = option_part - token;
8141 /* Skip the '=' to get to the option string. */
8144 for (; fn->name != NULL; fn++)
8146 if (!strncmp (fn->name, token, length))
8148 fn->parse_override (option_part, tune);
8153 error ("unknown tuning option (%s)", token);
8157 /* Validate the requested TLS size, clamping it to what the code model allows. */
8160 initialize_aarch64_tls_size (struct gcc_options *opts)
8162 if (aarch64_tls_size == 0)
8163 aarch64_tls_size = 24;
8165 switch (opts->x_aarch64_cmodel_var)
8167 case AARCH64_CMODEL_TINY:
8168 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
8169 needs two instructions to address, so we clamp the size to 24. */
8170 if (aarch64_tls_size > 24)
8171 aarch64_tls_size = 24;
8173 case AARCH64_CMODEL_SMALL:
8174 /* The maximum TLS size allowed under small is 4G. */
8175 if (aarch64_tls_size > 32)
8176 aarch64_tls_size = 32;
8178 case AARCH64_CMODEL_LARGE:
8179 /* The maximum TLS size allowed under large is 16E.
8180 FIXME: 16E needs a 64-bit offset; we only support a 48-bit offset now. */
8181 if (aarch64_tls_size > 48)
8182 aarch64_tls_size = 48;
8191 /* Parse STRING looking for options in the format:
8192 string :: option:string
8193 option :: name=substring
8195 substring :: defined by option. */
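/* That is, -moverride takes a ':'-separated list of name=value pairs,
   e.g. -moverride=<name1>=<value1>:<name2>=<value2>, and each pair is
   handed to the matching entry in aarch64_tuning_override_functions.  */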
8198 aarch64_parse_override_string (const char* input_string,
8199 struct tune_params* tune)
8201 const char separator = ':';
8202 size_t string_length = strlen (input_string) + 1;
8203 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8204 char *string = string_root;
8205 strncpy (string, input_string, string_length);
8206 string[string_length - 1] = '\0';
8208 char* ntoken = string;
8210 while ((ntoken = strchr (string, separator)))
8212 size_t token_length = ntoken - string;
8213 /* Make this substring look like a string. */
8215 aarch64_parse_one_override_token (string, token_length, tune);
8219 /* One last option to parse. */
8220 aarch64_parse_one_override_token (string, strlen (string), tune);
8226 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8228 /* The logic here is that if we are disabling all frame pointer generation
8229 then we do not need to disable leaf frame pointer generation as a
8230 separate operation. But if we are *only* disabling leaf frame pointer
8231 generation then we set flag_omit_frame_pointer to true, but in
8232 aarch64_frame_pointer_required we return false only for leaf functions.
8234 PR 70044: We have to be careful about being called multiple times for the
8235 same function. Once we have decided to set flag_omit_frame_pointer just
8236 so that we can omit leaf frame pointers, we must then not interpret a
8237 second call as meaning that all frame pointer generation should be
8238 omitted. We do this by setting flag_omit_frame_pointer to a special,
8240 if (opts->x_flag_omit_frame_pointer == 2)
8241 opts->x_flag_omit_frame_pointer = 0;
8243 if (opts->x_flag_omit_frame_pointer)
8244 opts->x_flag_omit_leaf_frame_pointer = false;
8245 else if (opts->x_flag_omit_leaf_frame_pointer)
8246 opts->x_flag_omit_frame_pointer = 2;
8248 /* If not optimizing for size, set the default
8249 alignment to what the target wants. */
8250 if (!opts->x_optimize_size)
8252 if (opts->x_align_loops <= 0)
8253 opts->x_align_loops = aarch64_tune_params.loop_align;
8254 if (opts->x_align_jumps <= 0)
8255 opts->x_align_jumps = aarch64_tune_params.jump_align;
8256 if (opts->x_align_functions <= 0)
8257 opts->x_align_functions = aarch64_tune_params.function_align;
8260 /* We default to no pc-relative literal loads. */
8262 aarch64_pcrelative_literal_loads = false;
8264 /* If -mpc-relative-literal-loads is set on the command line, this
8265 implies that the user asked for PC relative literal loads. */
8266 if (opts->x_pcrelative_literal_loads == 1)
8267 aarch64_pcrelative_literal_loads = true;
8269 /* This is PR70113. When building the Linux kernel with
8270 CONFIG_ARM64_ERRATUM_843419, support for relocations
8271 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8272 removed from the kernel to avoid loading objects with possibly
8273 offending sequences. Without -mpc-relative-literal-loads we would
8274 generate such relocations, preventing the kernel build from
8276 if (opts->x_pcrelative_literal_loads == 2
8277 && TARGET_FIX_ERR_A53_843419)
8278 aarch64_pcrelative_literal_loads = true;
8280 /* In the tiny memory model it makes no sense to disallow PC relative
8281 literal pool loads. */
8282 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8283 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8284 aarch64_pcrelative_literal_loads = true;
8286 /* When enabling the lower precision Newton series for the square root, also
8287 enable it for the reciprocal square root, since the latter is an
8288 intermediary step for the former. */
8289 if (flag_mlow_precision_sqrt)
8290 flag_mrecip_low_precision_sqrt = true;
8293 /* 'Unpack' the internal tuning structs and update the options
8294 in OPTS. The caller must have set up selected_tune and selected_arch
8295 as all the other target-specific codegen decisions are
8296 derived from them. */
8299 aarch64_override_options_internal (struct gcc_options *opts)
8301 aarch64_tune_flags = selected_tune->flags;
8302 aarch64_tune = selected_tune->sched_core;
8303 /* Make a copy of the tuning parameters attached to the core, which
8304 we may later overwrite. */
8305 aarch64_tune_params = *(selected_tune->tune);
8306 aarch64_architecture_version = selected_arch->architecture_version;
8308 if (opts->x_aarch64_override_tune_string)
8309 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8310 &aarch64_tune_params);
8312 /* This target defaults to strict volatile bitfields. */
8313 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8314 opts->x_flag_strict_volatile_bitfields = 1;
8316 initialize_aarch64_code_model (opts);
8317 initialize_aarch64_tls_size (opts);
8319 int queue_depth = 0;
8320 switch (aarch64_tune_params.autoprefetcher_model)
8322 case tune_params::AUTOPREFETCHER_OFF:
8325 case tune_params::AUTOPREFETCHER_WEAK:
8328 case tune_params::AUTOPREFETCHER_STRONG:
8329 queue_depth = max_insn_queue_index + 1;
8335 /* We don't mind passing in global_options_set here as we don't use
8336 the *options_set structs anyway. */
8337 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8339 opts->x_param_values,
8340 global_options_set.x_param_values);
8342 /* Set the L1 cache line size. */
8343 if (selected_cpu->tune->cache_line_size != 0)
8344 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8345 selected_cpu->tune->cache_line_size,
8346 opts->x_param_values,
8347 global_options_set.x_param_values);
8349 aarch64_override_options_after_change_1 (opts);
8352 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8353 specified in STR and throw errors if appropriate. Put the results,
8354 if they are valid, in RES and ISA_FLAGS. Return whether the option is
8358 aarch64_validate_mcpu (const char *str, const struct processor **res,
8359 unsigned long *isa_flags)
8361 enum aarch64_parse_opt_result parse_res
8362 = aarch64_parse_cpu (str, res, isa_flags);
8364 if (parse_res == AARCH64_PARSE_OK)
8369 case AARCH64_PARSE_MISSING_ARG:
8370 error ("missing cpu name in -mcpu=%qs", str);
8372 case AARCH64_PARSE_INVALID_ARG:
8373 error ("unknown value %qs for -mcpu", str);
8375 case AARCH64_PARSE_INVALID_FEATURE:
8376 error ("invalid feature modifier in -mcpu=%qs", str);
8385 /* Validate a command-line -march option. Parse the arch and extensions
8386 (if any) specified in STR and throw errors if appropriate. Put the
8387 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8391 aarch64_validate_march (const char *str, const struct processor **res,
8392 unsigned long *isa_flags)
8394 enum aarch64_parse_opt_result parse_res
8395 = aarch64_parse_arch (str, res, isa_flags);
8397 if (parse_res == AARCH64_PARSE_OK)
8402 case AARCH64_PARSE_MISSING_ARG:
8403 error ("missing arch name in -march=%qs", str);
8405 case AARCH64_PARSE_INVALID_ARG:
8406 error ("unknown value %qs for -march", str);
8408 case AARCH64_PARSE_INVALID_FEATURE:
8409 error ("invalid feature modifier in -march=%qs", str);
8418 /* Validate a command-line -mtune option. Parse the cpu
8419 specified in STR and throw errors if appropriate. Put the
8420 result, if it is valid, in RES. Return whether the option is
8424 aarch64_validate_mtune (const char *str, const struct processor **res)
8426 enum aarch64_parse_opt_result parse_res
8427 = aarch64_parse_tune (str, res);
8429 if (parse_res == AARCH64_PARSE_OK)
8434 case AARCH64_PARSE_MISSING_ARG:
8435 error ("missing cpu name in -mtune=%qs", str);
8437 case AARCH64_PARSE_INVALID_ARG:
8438 error ("unknown value %qs for -mtune", str);
8446 /* Return the CPU corresponding to the enum CPU.
8447 If it doesn't specify a cpu, return the default. */
8449 static const struct processor *
8450 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8452 if (cpu != aarch64_none)
8453 return &all_cores[cpu];
8455 /* The & 0x3f is to extract the bottom 6 bits that encode the
8456 default cpu as selected by the --with-cpu GCC configure option
8458 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8459 flags mechanism should be reworked to make it more sane. */
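/* The remaining upper bits of TARGET_CPU_DEFAULT hold the default ISA
   flags; aarch64_override_options extracts them with a right shift by 6.  */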
8460 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8463 /* Return the architecture corresponding to the enum ARCH.
8464 If it doesn't specify a valid architecture, return the default. */
8466 static const struct processor *
8467 aarch64_get_arch (enum aarch64_arch arch)
8469 if (arch != aarch64_no_arch)
8470 return &all_architectures[arch];
8472 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8474 return &all_architectures[cpu->arch];
8477 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
8478 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8479 tuning structs. In particular it must set selected_tune and
8480 aarch64_isa_flags that define the available ISA features and tuning
8481 decisions. It must also set selected_arch as this will be used to
8482 output the .arch asm tags for each function. */
8485 aarch64_override_options (void)
8487 unsigned long cpu_isa = 0;
8488 unsigned long arch_isa = 0;
8489 aarch64_isa_flags = 0;
8491 bool valid_cpu = true;
8492 bool valid_tune = true;
8493 bool valid_arch = true;
8495 selected_cpu = NULL;
8496 selected_arch = NULL;
8497 selected_tune = NULL;
8499 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8500 If either of -march or -mtune is given, they override their
8501 respective component of -mcpu. */
8502 if (aarch64_cpu_string)
8503 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8506 if (aarch64_arch_string)
8507 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8510 if (aarch64_tune_string)
8511 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8513 /* If the user did not specify a processor, choose the default
8514 one for them. This will be the CPU set during configuration using
8515 --with-cpu, otherwise it is "generic". */
8520 selected_cpu = &all_cores[selected_arch->ident];
8521 aarch64_isa_flags = arch_isa;
8522 explicit_arch = selected_arch->arch;
8526 /* Get default configure-time CPU. */
8527 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8528 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8532 explicit_tune_core = selected_tune->ident;
8534 /* If both -mcpu and -march are specified check that they are architecturally
8535 compatible, warn if they're not and prefer the -march ISA flags. */
8536 else if (selected_arch)
8538 if (selected_arch->arch != selected_cpu->arch)
8540 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8541 all_architectures[selected_cpu->arch].name,
8542 selected_arch->name);
8544 aarch64_isa_flags = arch_isa;
8545 explicit_arch = selected_arch->arch;
8546 explicit_tune_core = selected_tune ? selected_tune->ident
8547 : selected_cpu->ident;
8551 /* -mcpu but no -march. */
8552 aarch64_isa_flags = cpu_isa;
8553 explicit_tune_core = selected_tune ? selected_tune->ident
8554 : selected_cpu->ident;
8555 gcc_assert (selected_cpu);
8556 selected_arch = &all_architectures[selected_cpu->arch];
8557 explicit_arch = selected_arch->arch;
8560 /* Set the arch as well, as we will need it when outputting
8561 the .arch directive in assembly. */
8564 gcc_assert (selected_cpu);
8565 selected_arch = &all_architectures[selected_cpu->arch];
8569 selected_tune = selected_cpu;
8571 #ifndef HAVE_AS_MABI_OPTION
8572 /* The compiler may have been configured with 2.23.* binutils, which does
8573 not have support for ILP32. */
8575 error ("Assembler does not support -mabi=ilp32");
8578 /* Make sure we properly set up the explicit options. */
8579 if ((aarch64_cpu_string && valid_cpu)
8580 || (aarch64_tune_string && valid_tune))
8581 gcc_assert (explicit_tune_core != aarch64_none);
8583 if ((aarch64_cpu_string && valid_cpu)
8584 || (aarch64_arch_string && valid_arch))
8585 gcc_assert (explicit_arch != aarch64_no_arch);
8587 /* The pass to insert speculation tracking runs before
8588 shrink-wrapping and the latter does not know how to update the
8589 tracking status. So disable it in this case. */
8590 if (aarch64_track_speculation)
8591 flag_shrink_wrap = 0;
8593 aarch64_override_options_internal (&global_options);
8595 /* Save these options as the default ones in case we push and pop them later
8596 while processing functions with potential target attributes. */
8597 target_option_default_node = target_option_current_node
8598 = build_target_option_node (&global_options);
8601 /* Implement targetm.override_options_after_change. */
8604 aarch64_override_options_after_change (void)
8606 aarch64_override_options_after_change_1 (&global_options);
8609 static struct machine_function *
8610 aarch64_init_machine_status (void)
8612 struct machine_function *machine;
8613 machine = ggc_cleared_alloc<machine_function> ();
8618 aarch64_init_expanders (void)
8620 init_machine_status = aarch64_init_machine_status;
8623 /* Set the effective code model (aarch64_cmodel) from the selected code model and PIC settings. */
8625 initialize_aarch64_code_model (struct gcc_options *opts)
8627 if (opts->x_flag_pic)
8629 switch (opts->x_aarch64_cmodel_var)
8631 case AARCH64_CMODEL_TINY:
8632 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8634 case AARCH64_CMODEL_SMALL:
8635 #ifdef HAVE_AS_SMALL_PIC_RELOCS
8636 aarch64_cmodel = (flag_pic == 2
8637 ? AARCH64_CMODEL_SMALL_PIC
8638 : AARCH64_CMODEL_SMALL_SPIC);
8640 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
8643 case AARCH64_CMODEL_LARGE:
8644 sorry ("code model %qs with -f%s", "large",
8645 opts->x_flag_pic > 1 ? "PIC" : "pic");
8652 aarch64_cmodel = opts->x_aarch64_cmodel_var;
8655 /* Implement TARGET_OPTION_SAVE. */
8658 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
8660 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
8663 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
8664 using the information saved in PTR. */
8667 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
8669 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
8670 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8671 opts->x_explicit_arch = ptr->x_explicit_arch;
8672 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
8673 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
8675 aarch64_override_options_internal (opts);
8678 /* Implement TARGET_OPTION_PRINT. */
8681 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
8683 const struct processor *cpu
8684 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8685 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
8686 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
8687 std::string extension
8688 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
8690 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
8691 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
8692 arch->name, extension.c_str ());
8695 static GTY(()) tree aarch64_previous_fndecl;
8698 aarch64_reset_previous_fndecl (void)
8700 aarch64_previous_fndecl = NULL;
8703 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
8704 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
8705 make sure optab availability predicates are recomputed when necessary. */
8708 aarch64_save_restore_target_globals (tree new_tree)
8710 if (TREE_TARGET_GLOBALS (new_tree))
8711 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8712 else if (new_tree == target_option_default_node)
8713 restore_target_globals (&default_target_globals);
8715 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
8718 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
8719 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
8720 of the function, if such exists. This function may be called multiple
8721 times on a single function so use aarch64_previous_fndecl to avoid
8722 setting up identical state. */
8725 aarch64_set_current_function (tree fndecl)
8727 if (!fndecl || fndecl == aarch64_previous_fndecl)
8730 tree old_tree = (aarch64_previous_fndecl
8731 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
8734 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
8736 /* If current function has no attributes but the previous one did,
8737 use the default node. */
8738 if (!new_tree && old_tree)
8739 new_tree = target_option_default_node;
8741 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
8742 the default have been handled by aarch64_save_restore_target_globals from
8743 aarch64_pragma_target_parse. */
8744 if (old_tree == new_tree)
8747 aarch64_previous_fndecl = fndecl;
8749 /* First set the target options. */
8750 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
8752 aarch64_save_restore_target_globals (new_tree);
8755 /* Enum describing the various ways we can handle attributes.
8756 In many cases we can reuse the generic option handling machinery. */
8758 enum aarch64_attr_opt_type
8760 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
8761 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
8762 aarch64_attr_enum, /* Attribute sets an enum variable. */
8763 aarch64_attr_custom /* Attribute requires a custom handling function. */
8766 /* All the information needed to handle a target attribute.
8767 NAME is the name of the attribute.
8768 ATTR_TYPE specifies the type of behavior of the attribute as described
8769 in the definition of enum aarch64_attr_opt_type.
8770 ALLOW_NEG is true if the attribute supports a "no-" form.
8771 HANDLER is the function that takes the attribute string and whether
8772 it is a pragma or attribute and handles the option. It is needed only
8773 when the ATTR_TYPE is aarch64_attr_custom.
8774 OPT_NUM is the enum specifying the option that the attribute modifies.
8775 This is needed for attributes that mirror the behavior of a command-line
8776 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
8777 aarch64_attr_enum. */
8779 struct aarch64_attribute_info
8782 enum aarch64_attr_opt_type attr_type;
8784 bool (*handler) (const char *, const char *);
8785 enum opt_code opt_num;
8788 /* Handle the ARCH_STR argument to the arch= target attribute.
8789 PRAGMA_OR_ATTR is used in potential error messages. */
8792 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
8794 const struct processor *tmp_arch = NULL;
8795 enum aarch64_parse_opt_result parse_res
8796 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
8798 if (parse_res == AARCH64_PARSE_OK)
8800 gcc_assert (tmp_arch);
8801 selected_arch = tmp_arch;
8802 explicit_arch = selected_arch->arch;
8808 case AARCH64_PARSE_MISSING_ARG:
8809 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
8811 case AARCH64_PARSE_INVALID_ARG:
8812 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
8814 case AARCH64_PARSE_INVALID_FEATURE:
8815 error ("invalid feature modifier %qs for 'arch' target %s",
8816 str, pragma_or_attr);
8825 /* Handle the argument CPU_STR to the cpu= target attribute.
8826 PRAGMA_OR_ATTR is used in potential error messages. */
8829 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
8831 const struct processor *tmp_cpu = NULL;
8832 enum aarch64_parse_opt_result parse_res
8833 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
8835 if (parse_res == AARCH64_PARSE_OK)
8837 gcc_assert (tmp_cpu);
8838 selected_tune = tmp_cpu;
8839 explicit_tune_core = selected_tune->ident;
8841 selected_arch = &all_architectures[tmp_cpu->arch];
8842 explicit_arch = selected_arch->arch;
8848 case AARCH64_PARSE_MISSING_ARG:
8849 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
8851 case AARCH64_PARSE_INVALID_ARG:
8852 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
8854 case AARCH64_PARSE_INVALID_FEATURE:
8855 error ("invalid feature modifier %qs for 'cpu' target %s",
8856 str, pragma_or_attr);
8865 /* Handle the argument STR to the tune= target attribute.
8866 PRAGMA_OR_ATTR is used in potential error messages. */
8869 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
8871 const struct processor *tmp_tune = NULL;
8872 enum aarch64_parse_opt_result parse_res
8873 = aarch64_parse_tune (str, &tmp_tune);
8875 if (parse_res == AARCH64_PARSE_OK)
8877 gcc_assert (tmp_tune);
8878 selected_tune = tmp_tune;
8879 explicit_tune_core = selected_tune->ident;
8885 case AARCH64_PARSE_INVALID_ARG:
8886 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
8895 /* Parse an architecture extensions target attribute string specified in STR.
8896 For example "+fp+nosimd". Show any errors if needed. Return TRUE
8897 if successful. Update aarch64_isa_flags to reflect the ISA features
8899 PRAGMA_OR_ATTR is used in potential error messages. */
8902 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
8904 enum aarch64_parse_opt_result parse_res;
8905 unsigned long isa_flags = aarch64_isa_flags;
8907 /* We allow "+nothing" in the beginning to clear out all architectural
8908 features if the user wants to handpick specific features. */
8909 if (strncmp ("+nothing", str, 8) == 0)
8915 parse_res = aarch64_parse_extension (str, &isa_flags);
8917 if (parse_res == AARCH64_PARSE_OK)
8919 aarch64_isa_flags = isa_flags;
8925 case AARCH64_PARSE_MISSING_ARG:
8926 error ("missing feature modifier in target %s %qs",
8927 pragma_or_attr, str);
8930 case AARCH64_PARSE_INVALID_FEATURE:
8931 error ("invalid feature modifier in target %s %qs",
8932 pragma_or_attr, str);
8942 /* The target attributes that we support. On top of these we also support just
8943 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
8944 handled explicitly in aarch64_process_one_target_attr. */
8946 static const struct aarch64_attribute_info aarch64_attributes[] =
8948 { "general-regs-only", aarch64_attr_mask, false, NULL,
8949 OPT_mgeneral_regs_only },
8950 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
8951 OPT_mfix_cortex_a53_835769 },
8952 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
8953 OPT_mfix_cortex_a53_843419 },
8954 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
8955 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
8956 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
8957 OPT_momit_leaf_frame_pointer },
8958 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
8959 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
8961 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
8962 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
8964 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
8967 /* Parse ARG_STR which contains the definition of one target attribute.
8968 Show appropriate errors if any or return true if the attribute is valid.
8969 PRAGMA_OR_ATTR holds the string to use in error messages about whether
8970 we're processing a target attribute or pragma. */
8973 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
8975 bool invert = false;
8977 size_t len = strlen (arg_str);
8981 error ("malformed target %s", pragma_or_attr);
8985 char *str_to_check = (char *) alloca (len + 1);
8986 strcpy (str_to_check, arg_str);
8988 /* Skip leading whitespace. */
8989 while (*str_to_check == ' ' || *str_to_check == '\t')
8992 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
8993 It is easier to detect and handle it explicitly here rather than going
8994 through the machinery for the rest of the target attributes in this
8996 if (*str_to_check == '+')
8997 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
8999 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9004 char *arg = strchr (str_to_check, '=');
9006 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9007 and point ARG to "foo". */
9013 const struct aarch64_attribute_info *p_attr;
9015 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9017 /* If the names don't match up, or the user has given an argument
9018 to an attribute that doesn't accept one, or didn't give an argument
9019 to an attribute that expects one, fail to match. */
9020 if (strcmp (str_to_check, p_attr->name) != 0)
9024 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9025 || p_attr->attr_type == aarch64_attr_enum;
9027 if (attr_need_arg_p ^ (arg != NULL))
9029 error ("target %s %qs does not accept an argument",
9030 pragma_or_attr, str_to_check);
9034 /* If the name matches but the attribute does not allow "no-" versions
9035 then we can't match. */
9036 if (invert && !p_attr->allow_neg)
9038 error ("target %s %qs does not allow a negated form",
9039 pragma_or_attr, str_to_check);
9043 switch (p_attr->attr_type)
9045 /* Has a custom handler registered.
9046 For example, cpu=, arch=, tune=. */
9047 case aarch64_attr_custom:
9048 gcc_assert (p_attr->handler);
9049 if (!p_attr->handler (arg, pragma_or_attr))
9053 /* Either set or unset a boolean option. */
9054 case aarch64_attr_bool:
9056 struct cl_decoded_option decoded;
9058 generate_option (p_attr->opt_num, NULL, !invert,
9059 CL_TARGET, &decoded);
9060 aarch64_handle_option (&global_options, &global_options_set,
9061 &decoded, input_location);
9064 /* Set or unset a bit in the target_flags. aarch64_handle_option
9065 should know what mask to apply given the option number. */
9066 case aarch64_attr_mask:
9068 struct cl_decoded_option decoded;
9069 /* We only need to specify the option number.
9070 aarch64_handle_option will know which mask to apply. */
9071 decoded.opt_index = p_attr->opt_num;
9072 decoded.value = !invert;
9073 aarch64_handle_option (&global_options, &global_options_set,
9074 &decoded, input_location);
9077 /* Use the option setting machinery to set an option to an enum. */
9078 case aarch64_attr_enum:
9083 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9087 set_option (&global_options, NULL, p_attr->opt_num, value,
9088 NULL, DK_UNSPECIFIED, input_location,
9093 error ("target %s %s=%s is not valid",
9094 pragma_or_attr, str_to_check, arg);
9103 /* If we reached here we either have found an attribute and validated
9104 it or didn't match any. If we matched an attribute but its arguments
9105 were malformed we will have returned false already. */
9109 /* Count how many times the character C appears in
9110 NULL-terminated string STR. */
9113 num_occurences_in_str (char c, char *str)
9115 unsigned int res = 0;
9116 while (*str != '\0')
9127 /* Parse the tree in ARGS that contains the target attribute information
9128 and update the global target options space. PRAGMA_OR_ATTR is a string
9129 to be used in error messages, specifying whether this is processing
9130 a target attribute or a target pragma. */
9133 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9135 if (TREE_CODE (args) == TREE_LIST)
9139 tree head = TREE_VALUE (args);
9142 if (!aarch64_process_target_attr (head, pragma_or_attr))
9145 args = TREE_CHAIN (args);
9150 /* We expect to find a string to parse. */
9151 gcc_assert (TREE_CODE (args) == STRING_CST);
9153 size_t len = strlen (TREE_STRING_POINTER (args));
9154 char *str_to_check = (char *) alloca (len + 1);
9155 strcpy (str_to_check, TREE_STRING_POINTER (args));
9159 error ("malformed target %s value", pragma_or_attr);
9163 /* Used to catch empty attributes between commas, i.e.
9164 attribute ((target ("attr1,,attr2"))). */
9165 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9167 /* Handle multiple target attributes separated by ','. */
9168 char *token = strtok (str_to_check, ",");
9170 unsigned int num_attrs = 0;
9174 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9176 error ("target %s %qs is invalid", pragma_or_attr, token);
9180 token = strtok (NULL, ",");
9183 if (num_attrs != num_commas + 1)
9185 error ("malformed target %s list %qs",
9186 pragma_or_attr, TREE_STRING_POINTER (args));
9193 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9194 process attribute ((target ("..."))). */
9197 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9199 struct cl_target_option cur_target;
9202 tree new_target, new_optimize;
9203 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9205 /* If what we're processing is the current pragma string then the
9206 target option node is already stored in target_option_current_node
9207 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9208 having to re-parse the string. This is especially useful to keep
9209 arm_neon.h compile times down since that header contains a lot
9210 of intrinsics enclosed in pragmas. */
9211 if (!existing_target && args == current_target_pragma)
9213 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9216 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9218 old_optimize = build_optimization_node (&global_options);
9219 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9221 /* If the function changed the optimization levels as well as setting
9222 target options, start with the optimizations specified. */
9223 if (func_optimize && func_optimize != old_optimize)
9224 cl_optimization_restore (&global_options,
9225 TREE_OPTIMIZATION (func_optimize));
9227 /* Save the current target options to restore at the end. */
9228 cl_target_option_save (&cur_target, &global_options);
9230 /* If fndecl already has some target attributes applied to it, unpack
9231 them so that we add this attribute on top of them, rather than
9232 overwriting them. */
9233 if (existing_target)
9235 struct cl_target_option *existing_options
9236 = TREE_TARGET_OPTION (existing_target);
9238 if (existing_options)
9239 cl_target_option_restore (&global_options, existing_options);
9242 cl_target_option_restore (&global_options,
9243 TREE_TARGET_OPTION (target_option_current_node));
9246 ret = aarch64_process_target_attr (args, "attribute");
9248 /* Set up any additional state. */
9251 aarch64_override_options_internal (&global_options);
9252 /* Initialize SIMD builtins if we haven't already.
9253 Set current_target_pragma to NULL for the duration so that
9254 the builtin initialization code doesn't try to tag the functions
9255 being built with the attributes specified by any current pragma, thus
9256 going into an infinite recursion. */
9259 tree saved_current_target_pragma = current_target_pragma;
9260 current_target_pragma = NULL;
9261 aarch64_init_simd_builtins ();
9262 current_target_pragma = saved_current_target_pragma;
9264 new_target = build_target_option_node (&global_options);
9269 new_optimize = build_optimization_node (&global_options);
9273 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9275 if (old_optimize != new_optimize)
9276 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9279 cl_target_option_restore (&global_options, &cur_target);
9281 if (old_optimize != new_optimize)
9282 cl_optimization_restore (&global_options,
9283 TREE_OPTIMIZATION (old_optimize));
9287 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9288 tri-bool options (yes, no, don't care) and the default value is
9289 DEF, determine whether inlining should be allowed. */
9292 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9293 int dont_care, int def)
9295 /* If the callee doesn't care, always allow inlining. */
9296 if (callee == dont_care)
9299 /* If the caller doesn't care, always allow inlining. */
9300 if (caller == dont_care)
9303 /* Otherwise, allow inlining if either the callee and caller values
9304 agree, or if the callee is using the default value. */
9305 return (callee == caller || callee == def);
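/* A sketch of the behaviour above, with DONT_CARE being the "don't care"
   encoding (the callers below pass 2) and DEF the configured default:

     callee == DONT_CARE          -> inlining allowed
     caller == DONT_CARE          -> inlining allowed
     callee == caller             -> inlining allowed
     callee == DEF                -> inlining allowed
     anything else                -> inlining rejected  */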
9308 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9309 to inline CALLEE into CALLER based on target-specific info.
9310 Make sure that the caller and callee have compatible architectural
9311 features. Then go through the other possible target attributes
9312 and see if they can block inlining. Try not to reject always_inline
9313 callees unless they are incompatible architecturally. */
9316 aarch64_can_inline_p (tree caller, tree callee)
9318 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9319 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9321 /* If callee has no option attributes, then it is ok to inline. */
9325 struct cl_target_option *caller_opts
9326 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9327 : target_option_default_node);
9329 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9332 /* Callee's ISA flags should be a subset of the caller's. */
9333 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9334 != callee_opts->x_aarch64_isa_flags)
9337 /* Allow non-strict aligned functions inlining into strict
9339 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9340 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9341 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9342 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9345 bool always_inline = lookup_attribute ("always_inline",
9346 DECL_ATTRIBUTES (callee));
9348 /* If the architectural features match up and the callee is always_inline
9349 then the other attributes don't matter. */
9353 if (caller_opts->x_aarch64_cmodel_var
9354 != callee_opts->x_aarch64_cmodel_var)
9357 if (caller_opts->x_aarch64_tls_dialect
9358 != callee_opts->x_aarch64_tls_dialect)
9362 /* Honour explicit requests to work around errata. */
9362 if (!aarch64_tribools_ok_for_inlining_p (
9363 caller_opts->x_aarch64_fix_a53_err835769,
9364 callee_opts->x_aarch64_fix_a53_err835769,
9365 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9368 if (!aarch64_tribools_ok_for_inlining_p (
9369 caller_opts->x_aarch64_fix_a53_err843419,
9370 callee_opts->x_aarch64_fix_a53_err843419,
9371 2, TARGET_FIX_ERR_A53_843419))
9374 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9375 caller and callee and they don't match up, reject inlining. */
9376 if (!aarch64_tribools_ok_for_inlining_p (
9377 caller_opts->x_flag_omit_leaf_frame_pointer,
9378 callee_opts->x_flag_omit_leaf_frame_pointer,
9382 /* If the callee has specific tuning overrides, respect them. */
9383 if (callee_opts->x_aarch64_override_tune_string != NULL
9384 && caller_opts->x_aarch64_override_tune_string == NULL)
9387 /* If the user specified tuning override strings for the
9388 caller and callee and they don't match up, reject inlining.
9389 We just do a string compare here; we don't analyze the meaning
9390 of the string, as it would be too costly for little gain. */
9391 if (callee_opts->x_aarch64_override_tune_string
9392 && caller_opts->x_aarch64_override_tune_string
9393 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9394 caller_opts->x_aarch64_override_tune_string) != 0))
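/* An illustrative example (assuming otherwise default options): a callee
   declared with __attribute__ ((target ("+crypto"))) cannot be inlined
   into a caller built without the crypto extension, since the callee's ISA
   flags are then not a subset of the caller's.  Building the caller with
   -march=armv8-a+crypto, or giving it an equivalent target attribute,
   makes the subset check above succeed.  */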
9400 /* Return true if SYMBOL_REF X binds locally. */
9403 aarch64_symbol_binds_local_p (const_rtx x)
9405 return (SYMBOL_REF_DECL (x)
9406 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9407 : SYMBOL_REF_LOCAL_P (x));
9410 /* Return true if SYMBOL_REF X is thread-local. */
9412 aarch64_tls_symbol_p (rtx x)
9414 if (! TARGET_HAVE_TLS)
9417 if (GET_CODE (x) != SYMBOL_REF)
9420 return SYMBOL_REF_TLS_MODEL (x) != 0;
9423 /* Classify a TLS symbol into one of the TLS kinds. */
9424 enum aarch64_symbol_type
9425 aarch64_classify_tls_symbol (rtx x)
9427 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9431 case TLS_MODEL_GLOBAL_DYNAMIC:
9432 case TLS_MODEL_LOCAL_DYNAMIC:
9433 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9435 case TLS_MODEL_INITIAL_EXEC:
9436 switch (aarch64_cmodel)
9438 case AARCH64_CMODEL_TINY:
9439 case AARCH64_CMODEL_TINY_PIC:
9440 return SYMBOL_TINY_TLSIE;
9442 return SYMBOL_SMALL_TLSIE;
9445 case TLS_MODEL_LOCAL_EXEC:
9446 if (aarch64_tls_size == 12)
9447 return SYMBOL_TLSLE12;
9448 else if (aarch64_tls_size == 24)
9449 return SYMBOL_TLSLE24;
9450 else if (aarch64_tls_size == 32)
9451 return SYMBOL_TLSLE32;
9452 else if (aarch64_tls_size == 48)
9453 return SYMBOL_TLSLE48;
9457 case TLS_MODEL_EMULATED:
9458 case TLS_MODEL_NONE:
9459 return SYMBOL_FORCE_TO_MEM;
9466 /* Return the method that should be used to access SYMBOL_REF or
9469 enum aarch64_symbol_type
9470 aarch64_classify_symbol (rtx x, rtx offset)
9472 if (GET_CODE (x) == LABEL_REF)
9474 switch (aarch64_cmodel)
9476 case AARCH64_CMODEL_LARGE:
9477 return SYMBOL_FORCE_TO_MEM;
9479 case AARCH64_CMODEL_TINY_PIC:
9480 case AARCH64_CMODEL_TINY:
9481 return SYMBOL_TINY_ABSOLUTE;
9483 case AARCH64_CMODEL_SMALL_SPIC:
9484 case AARCH64_CMODEL_SMALL_PIC:
9485 case AARCH64_CMODEL_SMALL:
9486 return SYMBOL_SMALL_ABSOLUTE;
9493 if (GET_CODE (x) == SYMBOL_REF)
9495 if (aarch64_tls_symbol_p (x))
9496 return aarch64_classify_tls_symbol (x);
9498 switch (aarch64_cmodel)
9500 case AARCH64_CMODEL_TINY:
9501 /* When we retrieve a symbol + offset address, we have to make sure
9502 the offset does not cause overflow of the final address. But
9503 we have no way of knowing the address of the symbol at compile time,
9504 so we can't accurately say whether the distance between the PC and
9505 symbol + offset is outside the addressable range of +/-1M in the
9506 TINY code model. So we rely on images not being greater than
9507 1M, cap the offset at 1M, and anything beyond that will have to
9508 be loaded using an alternative mechanism. Furthermore, if the
9509 symbol is a weak reference to something that isn't known to
9510 resolve to a symbol in this module, then force it to memory. */
9511 if ((SYMBOL_REF_WEAK (x)
9512 && !aarch64_symbol_binds_local_p (x))
9513 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9514 return SYMBOL_FORCE_TO_MEM;
9515 return SYMBOL_TINY_ABSOLUTE;
9517 case AARCH64_CMODEL_SMALL:
9518 /* Same reasoning as the tiny code model, but the offset cap here is
9520 if ((SYMBOL_REF_WEAK (x)
9521 && !aarch64_symbol_binds_local_p (x))
9522 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9523 HOST_WIDE_INT_C (4294967264)))
9524 return SYMBOL_FORCE_TO_MEM;
9525 return SYMBOL_SMALL_ABSOLUTE;
9527 case AARCH64_CMODEL_TINY_PIC:
9528 if (!aarch64_symbol_binds_local_p (x))
9529 return SYMBOL_TINY_GOT;
9530 return SYMBOL_TINY_ABSOLUTE;
9532 case AARCH64_CMODEL_SMALL_SPIC:
9533 case AARCH64_CMODEL_SMALL_PIC:
9534 if (!aarch64_symbol_binds_local_p (x))
9535 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9536 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9537 return SYMBOL_SMALL_ABSOLUTE;
9539 case AARCH64_CMODEL_LARGE:
9540 /* This is alright even in PIC code as the constant
9541 pool reference is always PC relative and within
9542 the same translation unit. */
9543 if (CONSTANT_POOL_ADDRESS_P (x))
9544 return SYMBOL_SMALL_ABSOLUTE;
9546 return SYMBOL_FORCE_TO_MEM;
9553 /* By default push everything into the constant pool. */
9554 return SYMBOL_FORCE_TO_MEM;
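/* As a rough illustration of what these classifications mean for code
   generation under the small code model (the exact instruction sequences
   are produced elsewhere and depend on assembler relocation support):

     SYMBOL_SMALL_ABSOLUTE   adrp x0, sym; add x0, x0, :lo12:sym
     SYMBOL_SMALL_GOT_4G     adrp x0, :got:sym; ldr x0, [x0, :got_lo12:sym]
     SYMBOL_FORCE_TO_MEM     the address is loaded from the literal pool.  */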
9558 aarch64_constant_address_p (rtx x)
9560 return (CONSTANT_P (x) && memory_address_p (DImode, x));
9564 aarch64_legitimate_pic_operand_p (rtx x)
9566 if (GET_CODE (x) == SYMBOL_REF
9567 || (GET_CODE (x) == CONST
9568 && GET_CODE (XEXP (x, 0)) == PLUS
9569 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9575 /* Return true if X holds either a quarter-precision floating-point
9576 constant or floating-point +0.0. */
9578 aarch64_valid_floating_const (machine_mode mode, rtx x)
9580 if (!CONST_DOUBLE_P (x))
9583 if (aarch64_float_const_zero_rtx_p (x))
9586 /* Only 0.0 (handled above) can be moved to a TFmode register; other constants must be in SFmode or DFmode. */
9587 if (!(mode == SFmode || mode == DFmode))
9590 return aarch64_float_const_representable_p (x);
9594 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9596 /* Do not allow vector struct mode constants. We could support
9597 0 and -1 easily, but they need support in aarch64-simd.md. */
9598 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9601 /* This could probably go away because
9602 we now decompose CONST_INTs according to expand_mov_immediate. */
9603 if ((GET_CODE (x) == CONST_VECTOR
9604 && aarch64_simd_valid_immediate (x, mode, false, NULL))
9605 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9606 return !targetm.cannot_force_const_mem (mode, x);
9608 if (GET_CODE (x) == HIGH
9609 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9612 return aarch64_constant_address_p (x);
9616 aarch64_load_tp (rtx target)
9619 || GET_MODE (target) != Pmode
9620 || !register_operand (target, Pmode))
9621 target = gen_reg_rtx (Pmode);
9623 /* Can return in any reg. */
9624 emit_insn (gen_aarch64_load_tp_hard (target));
9628 /* On AAPCS systems, this is the "struct __va_list". */
9629 static GTY(()) tree va_list_type;
9631 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
9632 Return the type to use as __builtin_va_list.
9634 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
9646 aarch64_build_builtin_va_list (void)
9649 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9651 /* Create the type. */
9652 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
9653 /* Give it the required name. */
9654 va_list_name = build_decl (BUILTINS_LOCATION,
9656 get_identifier ("__va_list"),
9658 DECL_ARTIFICIAL (va_list_name) = 1;
9659 TYPE_NAME (va_list_type) = va_list_name;
9660 TYPE_STUB_DECL (va_list_type) = va_list_name;
9662 /* Create the fields. */
9663 f_stack = build_decl (BUILTINS_LOCATION,
9664 FIELD_DECL, get_identifier ("__stack"),
9666 f_grtop = build_decl (BUILTINS_LOCATION,
9667 FIELD_DECL, get_identifier ("__gr_top"),
9669 f_vrtop = build_decl (BUILTINS_LOCATION,
9670 FIELD_DECL, get_identifier ("__vr_top"),
9672 f_groff = build_decl (BUILTINS_LOCATION,
9673 FIELD_DECL, get_identifier ("__gr_offs"),
9675 f_vroff = build_decl (BUILTINS_LOCATION,
9676 FIELD_DECL, get_identifier ("__vr_offs"),
9679 /* Tell tree-stdarg pass about our internal offset fields.
9680 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
9681 purposes, to identify whether the code is updating va_list internal
9682 offset fields in an irregular way. */
9683 va_list_gpr_counter_field = f_groff;
9684 va_list_fpr_counter_field = f_vroff;
9686 DECL_ARTIFICIAL (f_stack) = 1;
9687 DECL_ARTIFICIAL (f_grtop) = 1;
9688 DECL_ARTIFICIAL (f_vrtop) = 1;
9689 DECL_ARTIFICIAL (f_groff) = 1;
9690 DECL_ARTIFICIAL (f_vroff) = 1;
9692 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
9693 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
9694 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
9695 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
9696 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
9698 TYPE_FIELDS (va_list_type) = f_stack;
9699 DECL_CHAIN (f_stack) = f_grtop;
9700 DECL_CHAIN (f_grtop) = f_vrtop;
9701 DECL_CHAIN (f_vrtop) = f_groff;
9702 DECL_CHAIN (f_groff) = f_vroff;
9704 /* Compute its layout. */
9705 layout_type (va_list_type);
9707 return va_list_type;
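/* A sketch of the AAPCS64 "__va_list" record built above, with the field
   types following AAPCS64 \S 7.1.4 and the field order matching the
   DECL_CHAIN calls:

     struct __va_list
     {
       void *__stack;     address of the next stacked argument
       void *__gr_top;    one past the end of the GP register save area
       void *__vr_top;    one past the end of the FP/SIMD register save area
       int __gr_offs;     negative offset from __gr_top to the next GP arg
       int __vr_offs;     negative offset from __vr_top to the next VR arg
     };  */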
9710 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
9712 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
9714 const CUMULATIVE_ARGS *cum;
9715 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9716 tree stack, grtop, vrtop, groff, vroff;
9718 int gr_save_area_size = cfun->va_list_gpr_size;
9719 int vr_save_area_size = cfun->va_list_fpr_size;
9722 cum = &crtl->args.info;
9723 if (cfun->va_list_gpr_size)
9724 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
9725 cfun->va_list_gpr_size);
9726 if (cfun->va_list_fpr_size)
9727 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
9728 * UNITS_PER_VREG, cfun->va_list_fpr_size);
9732 gcc_assert (cum->aapcs_nvrn == 0);
9733 vr_save_area_size = 0;
9736 f_stack = TYPE_FIELDS (va_list_type_node);
9737 f_grtop = DECL_CHAIN (f_stack);
9738 f_vrtop = DECL_CHAIN (f_grtop);
9739 f_groff = DECL_CHAIN (f_vrtop);
9740 f_vroff = DECL_CHAIN (f_groff);
9742 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
9744 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
9746 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
9748 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
9750 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
9753 /* Emit code to initialize STACK, which points to the next varargs stack
9754 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
9755 by named arguments. STACK is 8-byte aligned. */
9756 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
9757 if (cum->aapcs_stack_size > 0)
9758 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
9759 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
9760 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9762 /* Emit code to initialize GRTOP, the top of the GR save area.
9763 virtual_incoming_args_rtx should have been 16-byte aligned. */
9764 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
9765 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
9766 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9768 /* Emit code to initialize VRTOP, the top of the VR save area.
9769 This address is gr_save_area_bytes below GRTOP, rounded
9770 down to the next 16-byte boundary. */
9771 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
9772 vr_offset = ROUND_UP (gr_save_area_size,
9773 STACK_BOUNDARY / BITS_PER_UNIT);
9776 t = fold_build_pointer_plus_hwi (t, -vr_offset);
9777 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
9778 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9780 /* Emit code to initialize GROFF, the offset from GRTOP of the
9781 next GPR argument. */
9782 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
9783 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
9784 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9786 /* Likewise emit code to initialize VROFF, the offset from VRTOP
9787 of the next VR argument. */
9788 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
9789 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
9790 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
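/* Putting the pieces above together, va_start (ap) initialises the fields
   roughly as follows (a sketch; the arithmetic is really performed on
   virtual_incoming_args_rtx at expand time):

     ap.__stack   = incoming_args + aapcs_stack_size * UNITS_PER_WORD;
     ap.__gr_top  = incoming_args;
     ap.__vr_top  = incoming_args - ROUND_UP (gr_save_area_size, 16);
     ap.__gr_offs = -gr_save_area_size;
     ap.__vr_offs = -vr_save_area_size;  */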
9793 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
9796 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
9797 gimple_seq *post_p ATTRIBUTE_UNUSED)
9801 bool is_ha; /* is HFA or HVA. */
9802 bool dw_align; /* double-word align. */
9803 machine_mode ag_mode = VOIDmode;
9807 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9808 tree stack, f_top, f_off, off, arg, roundup, on_stack;
9809 HOST_WIDE_INT size, rsize, adjust, align;
9810 tree t, u, cond1, cond2;
9812 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9814 type = build_pointer_type (type);
9816 mode = TYPE_MODE (type);
9818 f_stack = TYPE_FIELDS (va_list_type_node);
9819 f_grtop = DECL_CHAIN (f_stack);
9820 f_vrtop = DECL_CHAIN (f_grtop);
9821 f_groff = DECL_CHAIN (f_vrtop);
9822 f_vroff = DECL_CHAIN (f_groff);
9824 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
9825 f_stack, NULL_TREE);
9826 size = int_size_in_bytes (type);
9827 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
9831 if (aarch64_vfp_is_call_or_return_candidate (mode,
9837 /* TYPE passed in fp/simd registers. */
9839 aarch64_err_no_fpadvsimd (mode, "varargs");
9841 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
9842 unshare_expr (valist), f_vrtop, NULL_TREE);
9843 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
9844 unshare_expr (valist), f_vroff, NULL_TREE);
9846 rsize = nregs * UNITS_PER_VREG;
9850 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
9851 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
9853 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
9854 && size < UNITS_PER_VREG)
9856 adjust = UNITS_PER_VREG - size;
9861 /* TYPE passed in general registers. */
9862 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
9863 unshare_expr (valist), f_grtop, NULL_TREE);
9864 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
9865 unshare_expr (valist), f_groff, NULL_TREE);
9866 rsize = ROUND_UP (size, UNITS_PER_WORD);
9867 nregs = rsize / UNITS_PER_WORD;
9872 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9873 && size < UNITS_PER_WORD)
9875 adjust = UNITS_PER_WORD - size;
9879 /* Get a local temporary for the field value. */
9880 off = get_initialized_tmp_var (f_off, pre_p, NULL);
9882 /* Emit code to branch if off >= 0. */
9883 t = build2 (GE_EXPR, boolean_type_node, off,
9884 build_int_cst (TREE_TYPE (off), 0));
9885 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
9889 /* Emit: offs = (offs + 15) & -16. */
9890 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9891 build_int_cst (TREE_TYPE (off), 15));
9892 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
9893 build_int_cst (TREE_TYPE (off), -16));
9894 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
9899 /* Update ap.__[g|v]r_offs */
9900 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9901 build_int_cst (TREE_TYPE (off), rsize));
9902 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
9906 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9908 /* [cond2] if (ap.__[g|v]r_offs > 0) */
9909 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
9910 build_int_cst (TREE_TYPE (f_off), 0));
9911 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
9913 /* String up: make sure the assignment happens before the use. */
9914 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
9915 COND_EXPR_ELSE (cond1) = t;
9917 /* Prepare the trees handling the argument that is passed on the stack;
9918 the top-level node will be stored in ON_STACK. */
9919 arg = get_initialized_tmp_var (stack, pre_p, NULL);
9922 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
9923 t = fold_convert (intDI_type_node, arg);
9924 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9925 build_int_cst (TREE_TYPE (t), 15));
9926 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9927 build_int_cst (TREE_TYPE (t), -16));
9928 t = fold_convert (TREE_TYPE (arg), t);
9929 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
9933 /* Advance ap.__stack */
9934 t = fold_convert (intDI_type_node, arg);
9935 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9936 build_int_cst (TREE_TYPE (t), size + 7));
9937 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9938 build_int_cst (TREE_TYPE (t), -8));
9939 t = fold_convert (TREE_TYPE (arg), t);
9940 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
9941 /* String up roundup and advance. */
9943 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9944 /* String up with arg */
9945 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
9946 /* Big-endianness related address adjustment. */
9947 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9948 && size < UNITS_PER_WORD)
9950 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
9951 size_int (UNITS_PER_WORD - size));
9952 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
9955 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
9956 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
9958 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
9961 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
9962 build_int_cst (TREE_TYPE (off), adjust));
9964 t = fold_convert (sizetype, t);
9965 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
9969 /* type ha; // treat as "struct {ftype field[n];}"
9970 ... [computing offs]
9971 for (i = 0; i < nregs; ++i, offs += 16)
9972 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
9975 tree tmp_ha, field_t, field_ptr_t;
9977 /* Declare a local variable. */
9978 tmp_ha = create_tmp_var_raw (type, "ha");
9979 gimple_add_tmp_var (tmp_ha);
9981 /* Establish the base type. */
9985 field_t = float_type_node;
9986 field_ptr_t = float_ptr_type_node;
9989 field_t = double_type_node;
9990 field_ptr_t = double_ptr_type_node;
9993 field_t = long_double_type_node;
9994 field_ptr_t = long_double_ptr_type_node;
9997 field_t = aarch64_fp16_type_node;
9998 field_ptr_t = aarch64_fp16_ptr_type_node;
10003 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10004 field_t = build_vector_type_for_mode (innertype, ag_mode);
10005 field_ptr_t = build_pointer_type (field_t);
10012 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
10013 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10015 t = fold_convert (field_ptr_t, addr);
10016 t = build2 (MODIFY_EXPR, field_t,
10017 build1 (INDIRECT_REF, field_t, tmp_ha),
10018 build1 (INDIRECT_REF, field_t, t));
10020 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10021 for (i = 1; i < nregs; ++i)
10023 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10024 u = fold_convert (field_ptr_t, addr);
10025 u = build2 (MODIFY_EXPR, field_t,
10026 build2 (MEM_REF, field_t, tmp_ha,
10027 build_int_cst (field_ptr_t,
10029 int_size_in_bytes (field_t)))),
10030 build1 (INDIRECT_REF, field_t, u));
10031 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10034 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10035 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10038 COND_EXPR_ELSE (cond2) = t;
10039 addr = fold_convert (build_pointer_type (type), cond1);
10040 addr = build_va_arg_indirect_ref (addr);
10043 addr = build_va_arg_indirect_ref (addr);
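/* Viewed at the source level, the tree built above behaves roughly like
   the sketch below for an argument taken from the GP registers (the
   FP/SIMD path is analogous, using __vr_top/__vr_offs and 16-byte units,
   with the extra homogeneous-aggregate copy shown further up; the
   double-word alignment round-up of OFF is omitted here):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;
     ap.__gr_offs = off + rsize;
     if (ap.__gr_offs > 0)
       goto on_stack;
     addr = ap.__gr_top + off;
     goto done;
   on_stack:
     addr = ap.__stack;
     ap.__stack = (ap.__stack + size + 7) & -8;
   done:
     result = *(type *) addr;  */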
10048 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10051 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10052 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10055 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10056 CUMULATIVE_ARGS local_cum;
10057 int gr_saved = cfun->va_list_gpr_size;
10058 int vr_saved = cfun->va_list_fpr_size;
10060 /* The caller has advanced CUM up to, but not beyond, the last named
10061 argument. Advance a local copy of CUM past the last "real" named
10062 argument, to find out how many registers are left over. */
10064 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10066 /* Find out how many registers we need to save.
10067 Honor tree-stdarg analysis results. */
10068 if (cfun->va_list_gpr_size)
10069 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10070 cfun->va_list_gpr_size / UNITS_PER_WORD);
10071 if (cfun->va_list_fpr_size)
10072 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10073 cfun->va_list_fpr_size / UNITS_PER_VREG);
10077 gcc_assert (local_cum.aapcs_nvrn == 0);
10087 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10088 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10089 - gr_saved * UNITS_PER_WORD);
10090 mem = gen_frame_mem (BLKmode, ptr);
10091 set_mem_alias_set (mem, get_varargs_alias_set ());
10093 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10098 /* We can't use move_block_from_reg, because it will use
10099 the wrong mode, storing D regs only. */
10100 machine_mode mode = TImode;
10101 int off, i, vr_start;
10103 /* Set OFF to the offset from virtual_incoming_args_rtx of
10104 the first vector register. The VR save area lies below
10105 the GR one, and is aligned to 16 bytes. */
10106 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10107 STACK_BOUNDARY / BITS_PER_UNIT);
10108 off -= vr_saved * UNITS_PER_VREG;
10110 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10111 for (i = 0; i < vr_saved; ++i)
10115 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10116 mem = gen_frame_mem (mode, ptr);
10117 set_mem_alias_set (mem, get_varargs_alias_set ());
10118 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10119 off += UNITS_PER_VREG;
10124 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10125 any complication of having crtl->args.pretend_args_size changed. */
10126 cfun->machine->frame.saved_varargs_size
10127 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10128 STACK_BOUNDARY / BITS_PER_UNIT)
10129 + vr_saved * UNITS_PER_VREG);
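/* A sketch of the resulting register save area for anonymous arguments
   (higher addresses first):

     virtual_incoming_args_rtx
       GR save area: gr_saved * UNITS_PER_WORD bytes
       padding up to a 16-byte boundary, if needed
       VR save area: vr_saved * UNITS_PER_VREG bytes

   saved_varargs_size recorded above is the total size of this block, with
   the GR part rounded up to STACK_BOUNDARY / BITS_PER_UNIT.  */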
10133 aarch64_conditional_register_usage (void)
10138 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10141 call_used_regs[i] = 1;
10145 /* When tracking speculation, we need a couple of call-clobbered registers
10146 to track the speculation state. It would be nice to just use
10147 IP0 and IP1, but currently there are numerous places that just
10148 assume these registers are free for other uses (e.g. pointer
10149 authentication). */
10150 if (aarch64_track_speculation)
10152 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
10153 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
10154 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
10155 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
10159 /* Walk down the type tree of TYPE counting consecutive base elements.
10160 If *MODEP is VOIDmode, then set it to the first valid floating point
10161 type. If a non-floating point type is found, or if a floating point
10162 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10163 otherwise return the count in the sub-tree. */
10165 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10168 HOST_WIDE_INT size;
10170 switch (TREE_CODE (type))
10173 mode = TYPE_MODE (type);
10174 if (mode != DFmode && mode != SFmode
10175 && mode != TFmode && mode != HFmode)
10178 if (*modep == VOIDmode)
10181 if (*modep == mode)
10187 mode = TYPE_MODE (TREE_TYPE (type));
10188 if (mode != DFmode && mode != SFmode
10189 && mode != TFmode && mode != HFmode)
10192 if (*modep == VOIDmode)
10195 if (*modep == mode)
10201 /* Use V2SImode and V4SImode as representatives of all 64-bit
10202 and 128-bit vector types. */
10203 size = int_size_in_bytes (type);
10216 if (*modep == VOIDmode)
10219 /* Vector modes are considered to be opaque: two vectors are
10220 equivalent for the purposes of being homogeneous aggregates
10221 if they are the same size. */
10222 if (*modep == mode)
10230 tree index = TYPE_DOMAIN (type);
10232 /* Can't handle incomplete types or sizes that are not
10234 if (!COMPLETE_TYPE_P (type)
10235 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10238 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10241 || !TYPE_MAX_VALUE (index)
10242 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10243 || !TYPE_MIN_VALUE (index)
10244 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10248 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10249 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10251 /* There must be no padding. */
10252 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10264 /* Can't handle incomplete types or sizes that are not
10266 if (!COMPLETE_TYPE_P (type)
10267 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10270 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10272 if (TREE_CODE (field) != FIELD_DECL)
10275 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10278 count += sub_count;
10281 /* There must be no padding. */
10282 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10289 case QUAL_UNION_TYPE:
10291 /* These aren't very interesting except in a degenerate case. */
10296 /* Can't handle incomplete types or sizes that are not
10298 if (!COMPLETE_TYPE_P (type)
10299 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10302 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10304 if (TREE_CODE (field) != FIELD_DECL)
10307 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10310 count = count > sub_count ? count : sub_count;
10313 /* There must be no padding. */
10314 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10327 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10328 type as described in AAPCS64 \S 4.1.2.
10330 See the comment above aarch64_composite_type_p for the notes on MODE. */
10333 aarch64_short_vector_p (const_tree type,
10336 HOST_WIDE_INT size = -1;
10338 if (type && TREE_CODE (type) == VECTOR_TYPE)
10339 size = int_size_in_bytes (type);
10340 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10341 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10342 size = GET_MODE_SIZE (mode);
10344 return (size == 8 || size == 16);
10347 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10348 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10349 array types. The C99 floating-point complex types are also considered
10350 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10351 types, which are GCC extensions and out of the scope of AAPCS64, are
10352 treated as composite types here as well.
10354 Note that MODE itself is not sufficient in determining whether a type
10355 is such a composite type or not. This is because
10356 stor-layout.c:compute_record_mode may have already changed the MODE
10357 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10358 structure with only one field may have its MODE set to the mode of the
10359 field. Also an integer mode whose size matches the size of the
10360 RECORD_TYPE type may be used to substitute the original mode
10361 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10362 solely relied on. */
10365 aarch64_composite_type_p (const_tree type,
10368 if (aarch64_short_vector_p (type, mode))
10371 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10374 if (mode == BLKmode
10375 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10376 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10382 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10383 shall be passed or returned in simd/fp register(s) (providing these
10384 parameter passing registers are available).
10386 Upon successful return, *COUNT returns the number of needed registers,
10387 *BASE_MODE returns the mode of the individual register, and when IS_HA
10388 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10389 floating-point aggregate or a homogeneous short-vector aggregate. */
10392 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10394 machine_mode *base_mode,
10398 machine_mode new_mode = VOIDmode;
10399 bool composite_p = aarch64_composite_type_p (type, mode);
10401 if (is_ha != NULL) *is_ha = false;
10403 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10404 || aarch64_short_vector_p (type, mode))
10409 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10411 if (is_ha != NULL) *is_ha = true;
10413 new_mode = GET_MODE_INNER (mode);
10415 else if (type && composite_p)
10417 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10419 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10421 if (is_ha != NULL) *is_ha = true;
10430 *base_mode = new_mode;
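/* A worked example (a sketch, assuming no attributes alter the layout):
   for

     struct point { float x, y, z; };

   aapcs_vfp_sub_candidate returns 3 with *modep == SFmode, so the struct
   is a homogeneous floating-point aggregate: *count is 3, *base_mode is
   SFmode and *is_ha is set, meaning the argument is passed in three
   consecutive FP/SIMD registers when enough of them are available.  */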
10434 /* Implement TARGET_STRUCT_VALUE_RTX. */
10437 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10438 int incoming ATTRIBUTE_UNUSED)
10440 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10443 /* Implements target hook vector_mode_supported_p. */
10445 aarch64_vector_mode_supported_p (machine_mode mode)
10448 && (mode == V4SImode || mode == V8HImode
10449 || mode == V16QImode || mode == V2DImode
10450 || mode == V2SImode || mode == V4HImode
10451 || mode == V8QImode || mode == V2SFmode
10452 || mode == V4SFmode || mode == V2DFmode
10453 || mode == V4HFmode || mode == V8HFmode
10454 || mode == V1DFmode))
10460 /* Return appropriate SIMD container
10461 for MODE within a vector of WIDTH bits. */
10462 static machine_mode
10463 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10465 gcc_assert (width == 64 || width == 128);
10504 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10505 static machine_mode
10506 aarch64_preferred_simd_mode (machine_mode mode)
10508 return aarch64_simd_container_mode (mode, 128);
10511 /* Return the bitmask of possible vector sizes for the vectorizer
10512 to iterate over. */
10513 static unsigned int
10514 aarch64_autovectorize_vector_sizes (void)
10519 /* Implement TARGET_MANGLE_TYPE. */
10521 static const char *
10522 aarch64_mangle_type (const_tree type)
10524 /* The AArch64 ABI documents say that "__va_list" has to be
10525 mangled as if it were in the "std" namespace. */
10526 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10527 return "St9__va_list";
10529 /* Half-precision float. */
10530 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10533 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10535 if (TYPE_NAME (type) != NULL)
10536 return aarch64_mangle_builtin_type (type);
10538 /* Use the default mangling. */
10543 /* Return true if the rtx_insn contains a MEM RTX somewhere
10547 has_memory_op (rtx_insn *mem_insn)
10549 subrtx_iterator::array_type array;
10550 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
10557 /* Find the first rtx_insn before insn that will generate an assembly
10561 aarch64_prev_real_insn (rtx_insn *insn)
10568 insn = prev_real_insn (insn);
10570 while (insn && recog_memoized (insn) < 0);
10576 is_madd_op (enum attr_type t1)
10579 /* A number of these may be AArch32 only. */
10580 enum attr_type mlatypes[] = {
10581 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10582 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10583 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10586 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10588 if (t1 == mlatypes[i])
10595 /* Check if there is a register dependency between a load and the insn
10596 for which we hold recog_data. */
10599 dep_between_memop_and_curr (rtx memop)
10604 gcc_assert (GET_CODE (memop) == SET);
10606 if (!REG_P (SET_DEST (memop)))
10609 load_reg = SET_DEST (memop);
10610 for (opno = 1; opno < recog_data.n_operands; opno++)
10612 rtx operand = recog_data.operand[opno];
10613 if (REG_P (operand)
10614 && reg_overlap_mentioned_p (load_reg, operand))
10622 /* When working around the Cortex-A53 erratum 835769,
10623 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10624 instruction and has a preceding memory instruction such that a NOP
10625 should be inserted between them. */
10628 aarch64_madd_needs_nop (rtx_insn* insn)
10630 enum attr_type attr_type;
10634 if (!TARGET_FIX_ERR_A53_835769)
10637 if (!INSN_P (insn) || recog_memoized (insn) < 0)
10640 attr_type = get_attr_type (insn);
10641 if (!is_madd_op (attr_type))
10644 prev = aarch64_prev_real_insn (insn);
10645 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10646 Restore recog state to INSN to avoid state corruption. */
10647 extract_constrain_insn_cached (insn);
10649 if (!prev || !has_memory_op (prev))
10652 body = single_set (prev);
10654 /* If the previous insn is a memory op and there is no dependency between
10655 it and the DImode madd, emit a NOP between them. If body is NULL then we
10656 have a complex memory operation, probably a load/store pair.
10657 Be conservative for now and emit a NOP. */
10658 if (GET_MODE (recog_data.operand[0]) == DImode
10659 && (!body || !dep_between_memop_and_curr (body)))
10667 /* Implement FINAL_PRESCAN_INSN. */
10670 aarch64_final_prescan_insn (rtx_insn *insn)
10672 if (aarch64_madd_needs_nop (insn))
10673 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
10677 /* Return the equivalent letter for size. */
10679 sizetochar (int size)
10683 case 64: return 'd';
10684 case 32: return 's';
10685 case 16: return 'h';
10686 case 8 : return 'b';
10687 default: gcc_unreachable ();
10691 /* Return true iff x is a uniform vector of floating-point
10692 constants, and the constant can be represented in
10693 quarter-precision form. Note that, as aarch64_float_const_representable_p
10694 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
10696 aarch64_vect_float_const_representable_p (rtx x)
10699 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10700 && const_vec_duplicate_p (x, &elt)
10701 && aarch64_float_const_representable_p (elt));
10704 /* Return true if OP is a valid SIMD immediate for MODE, false otherwise; when INFO is nonnull, describe a valid immediate in *INFO. */
10706 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
10707 struct simd_immediate_info *info)
10709 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
10711 for (i = 0; i < idx; i += (STRIDE)) \
10716 immtype = (CLASS); \
10717 elsize = (ELSIZE); \
10718 eshift = (SHIFT); \
10723 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
10724 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
10725 unsigned char bytes[16];
10726 int immtype = -1, matches;
10727 unsigned int invmask = inverse ? 0xff : 0;
10730 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10732 if (! (aarch64_simd_imm_zero_p (op, mode)
10733 || aarch64_vect_float_const_representable_p (op)))
10738 info->value = CONST_VECTOR_ELT (op, 0);
10739 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
10747 /* Splat vector constant out into a byte vector. */
10748 for (i = 0; i < n_elts; i++)
10750 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
10751 it must be laid out in the vector register in reverse order. */
10752 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
10753 unsigned HOST_WIDE_INT elpart;
10755 gcc_assert (CONST_INT_P (el));
10756 elpart = INTVAL (el);
10758 for (unsigned int byte = 0; byte < innersize; byte++)
10760 bytes[idx++] = (elpart & 0xff) ^ invmask;
10761 elpart >>= BITS_PER_UNIT;
10766 /* Sanity check. */
10767 gcc_assert (idx == GET_MODE_SIZE (mode));
10771 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
10772 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
10774 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10775 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10777 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
10778 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10780 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
10781 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
10783 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
10785 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
10787 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
10788 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
10790 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10791 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10793 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
10794 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10796 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
10797 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
10799 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
10801 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
10803 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10804 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10806 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10807 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10809 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
10810 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10812 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
10813 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10815 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
10817 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
10818 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
10827 info->element_width = elsize;
10828 info->mvn = emvn != 0;
10829 info->shift = eshift;
10831 unsigned HOST_WIDE_INT imm = 0;
10833 if (immtype >= 12 && immtype <= 15)
10836 /* Un-invert bytes of recognized vector, if necessary. */
10838 for (i = 0; i < idx; i++)
10839 bytes[i] ^= invmask;
10843 /* FIXME: Broken on 32-bit H_W_I hosts. */
10844 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
10846 for (i = 0; i < 8; i++)
10847 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
10848 << (i * BITS_PER_UNIT);
10851 info->value = GEN_INT (imm);
10855 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
10856 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
10858 /* Construct 'abcdefgh' because the assembler cannot handle
10859 generic constants. */
10862 imm = (imm >> info->shift) & 0xff;
10863 info->value = GEN_INT (imm);
10871 /* Check if immediate shift constants are within range. */
10873 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
10875 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
10877 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
10879 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
10882 /* Return true if X is a uniform vector where all elements
10883 are either the floating-point constant 0.0 or the
10884 integer constant 0. */
10886 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
10888 return x == CONST0_RTX (mode);
10892 /* Return the bitmask CONST_INT to select the bits required by a zero extract
10893 operation of width WIDTH at bit position POS. */
10896 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
10898 gcc_assert (CONST_INT_P (width));
10899 gcc_assert (CONST_INT_P (pos));
10901 unsigned HOST_WIDE_INT mask
10902 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
10903 return GEN_INT (mask << UINTVAL (pos));
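/* For example, WIDTH == 8 and POS == 16 gives
   ((1 << 8) - 1) << 16 == 0xff0000, i.e. the mask selecting bits 16-23.  */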
10907 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
10909 HOST_WIDE_INT imm = INTVAL (x);
10912 for (i = 0; i < 8; i++)
10914 unsigned int byte = imm & 0xff;
10915 if (byte != 0xff && byte != 0)
10924 aarch64_mov_operand_p (rtx x, machine_mode mode)
10926 if (GET_CODE (x) == HIGH
10927 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10930 if (CONST_INT_P (x))
10933 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
10936 return aarch64_classify_symbolic_expression (x)
10937 == SYMBOL_TINY_ABSOLUTE;
10940 /* Return a const_int vector of VAL. */
10942 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
10944 int nunits = GET_MODE_NUNITS (mode);
10945 rtvec v = rtvec_alloc (nunits);
10948 for (i = 0; i < nunits; i++)
10949 RTVEC_ELT (v, i) = GEN_INT (val);
10951 return gen_rtx_CONST_VECTOR (mode, v);
10954 /* Check OP is a legal scalar immediate for the MOVI instruction. */
10957 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
10959 machine_mode vmode;
10961 gcc_assert (!VECTOR_MODE_P (mode));
10962 vmode = aarch64_preferred_simd_mode (mode);
10963 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
10964 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
10967 /* Construct and return a PARALLEL RTX vector with elements numbering the
10968 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
10969 the vector - from the perspective of the architecture. This does not
10970 line up with GCC's perspective on lane numbers, so we end up with
10971 different masks depending on our target endian-ness. The diagram
10972 below may help. We must draw the distinction when building masks
10973 which select one half of the vector. An instruction selecting
10974 architectural low-lanes for a big-endian target, must be described using
10975 a mask selecting GCC high-lanes.
10977 Big-Endian Little-Endian
10979 GCC 0 1 2 3 3 2 1 0
10980 | x | x | x | x | | x | x | x | x |
10981 Architecture 3 2 1 0 3 2 1 0
10983 Low Mask: { 2, 3 } { 0, 1 }
10984 High Mask: { 0, 1 } { 2, 3 }
10988 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
10990 int nunits = GET_MODE_NUNITS (mode);
10991 rtvec v = rtvec_alloc (nunits / 2);
10992 int high_base = nunits / 2;
10998 if (BYTES_BIG_ENDIAN)
10999 base = high ? low_base : high_base;
11001 base = high ? high_base : low_base;
11003 for (i = 0; i < nunits / 2; i++)
11004 RTVEC_ELT (v, i) = GEN_INT (base + i);
11006 t1 = gen_rtx_PARALLEL (mode, v);
11010 /* Check OP for validity as a PARALLEL RTX vector with elements
11011 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11012 from the perspective of the architecture. See the diagram above
11013 aarch64_simd_vect_par_cnst_half for more details. */
11016 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11019 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11020 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11021 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11024 if (!VECTOR_MODE_P (mode))
11027 if (count_op != count_ideal)
11030 for (i = 0; i < count_ideal; i++)
11032 rtx elt_op = XVECEXP (op, 0, i);
11033 rtx elt_ideal = XVECEXP (ideal, 0, i);
11035 if (!CONST_INT_P (elt_op)
11036 || INTVAL (elt_ideal) != INTVAL (elt_op))
11042 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11043 HIGH (exclusive). */
11045 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11048 HOST_WIDE_INT lane;
11049 gcc_assert (CONST_INT_P (operand));
11050 lane = INTVAL (operand);
11052 if (lane < low || lane >= high)
11055 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11057 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11061 /* Return TRUE if OP is a valid vector addressing mode. */
11063 aarch64_simd_mem_operand_p (rtx op)
11065 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11066 || REG_P (XEXP (op, 0)));
11069 /* Emit a register copy from operand to operand, taking care not to
11070 early-clobber source registers in the process.
11072 COUNT is the number of components into which the copy needs to be
11075 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
11076 unsigned int count)
11079 int rdest = REGNO (operands[0]);
11080 int rsrc = REGNO (operands[1]);
11082 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11084 for (i = 0; i < count; i++)
11085 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11086 gen_rtx_REG (mode, rsrc + i));
11088 for (i = 0; i < count; i++)
11089 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11090 gen_rtx_REG (mode, rsrc + count - i - 1));
11093 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11094 one of VSTRUCT modes: OI, CI, or XI. */
11096 aarch64_simd_attr_length_rglist (enum machine_mode mode)
11098 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11101 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11102 alignment of a vector to 128 bits. */
11103 static HOST_WIDE_INT
11104 aarch64_simd_vector_alignment (const_tree type)
11106 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11107 return MIN (align, 128);
11110 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11112 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11117 /* We guarantee alignment for vectors up to 128-bits. */
11118 if (tree_int_cst_compare (TYPE_SIZE (type),
11119 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11122 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11126 /* If VALS is a vector constant that can be loaded into a register
11127 using DUP, generate instructions to do so and return an RTX to
11128 assign to the register. Otherwise return NULL_RTX. */
11130 aarch64_simd_dup_constant (rtx vals)
11132 machine_mode mode = GET_MODE (vals);
11133 machine_mode inner_mode = GET_MODE_INNER (mode);
11136 if (!const_vec_duplicate_p (vals, &x))
11139 /* We can load this constant by using DUP and a constant in a
11140 single ARM register. This will be cheaper than a vector
11142 x = copy_to_mode_reg (inner_mode, x);
11143 return gen_rtx_VEC_DUPLICATE (mode, x);
11147 /* Generate code to load VALS, which is a PARALLEL containing only
11148 constants (for vec_init) or CONST_VECTOR, efficiently into a
11149 register. Returns an RTX to copy into the register, or NULL_RTX
11150 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
11152 aarch64_simd_make_constant (rtx vals)
11154 machine_mode mode = GET_MODE (vals);
11156 rtx const_vec = NULL_RTX;
11157 int n_elts = GET_MODE_NUNITS (mode);
11161 if (GET_CODE (vals) == CONST_VECTOR)
11163 else if (GET_CODE (vals) == PARALLEL)
11165 /* A CONST_VECTOR must contain only CONST_INTs and
11166 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11167 Only store valid constants in a CONST_VECTOR. */
11168 for (i = 0; i < n_elts; ++i)
11170 rtx x = XVECEXP (vals, 0, i);
11171 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11174 if (n_const == n_elts)
11175 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11178 gcc_unreachable ();
11180 if (const_vec != NULL_RTX
11181 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11182 /* Load using MOVI/MVNI. */
11184 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11185 /* Loaded using DUP. */
11187 else if (const_vec != NULL_RTX)
11188 /* Load from constant pool. We cannot take advantage of single-cycle
11189 LD1 because we need a PC-relative addressing mode. */
11192 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11193 We cannot construct an initializer. */
11197 /* Expand a vector initialisation sequence, such that TARGET is
11198 initialised to contain VALS. */
11201 aarch64_expand_vector_init (rtx target, rtx vals)
11203 machine_mode mode = GET_MODE (target);
11204 machine_mode inner_mode = GET_MODE_INNER (mode);
11205 /* The number of vector elements. */
11206 int n_elts = GET_MODE_NUNITS (mode);
11207 /* The number of vector elements which are not constant. */
11209 rtx any_const = NULL_RTX;
11210 /* The first element of vals. */
11211 rtx v0 = XVECEXP (vals, 0, 0);
11212 bool all_same = true;
11214 /* Count the number of variable elements to initialise. */
11215 for (int i = 0; i < n_elts; ++i)
11217 rtx x = XVECEXP (vals, 0, i);
11218 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11223 all_same &= rtx_equal_p (x, v0);
11226 /* No variable elements; hand off to aarch64_simd_make_constant, which knows
11227 how best to handle this. */
11230 rtx constant = aarch64_simd_make_constant (vals);
11231 if (constant != NULL_RTX)
11233 emit_move_insn (target, constant);
11238 /* Splat a single non-constant element if we can. */
11241 rtx x = copy_to_mode_reg (inner_mode, v0);
11242 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11246 /* Initialise a vector which is part-variable. We want to first try
11247 to build those lanes which are constant in the most efficient way we
11249 if (n_var != n_elts)
11251 rtx copy = copy_rtx (vals);
11253 /* Load constant part of vector. We really don't care what goes into the
11254 parts we will overwrite, but we're more likely to be able to load the
11255 constant efficiently if it has fewer, larger, repeating parts
11256 (see aarch64_simd_valid_immediate). */
11257 for (int i = 0; i < n_elts; i++)
11259 rtx x = XVECEXP (vals, 0, i);
11260 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11262 rtx subst = any_const;
11263 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11265 /* Look in the copied vector, as more elements are const. */
11266 rtx test = XVECEXP (copy, 0, i ^ bit);
11267 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11273 XVECEXP (copy, 0, i) = subst;
11275 aarch64_expand_vector_init (target, copy);
11278 /* Insert the variable lanes directly. */
11280 enum insn_code icode = optab_handler (vec_set_optab, mode);
11281 gcc_assert (icode != CODE_FOR_nothing);
11283 for (int i = 0; i < n_elts; i++)
11285 rtx x = XVECEXP (vals, 0, i);
11286 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11288 x = copy_to_mode_reg (inner_mode, x);
11289 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
11293 static unsigned HOST_WIDE_INT
11294 aarch64_shift_truncation_mask (machine_mode mode)
11297 (!SHIFT_COUNT_TRUNCATED
11298 || aarch64_vector_mode_supported_p (mode)
11299 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11302 /* Select a format to encode pointers in exception handling data. */
11304 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11307 switch (aarch64_cmodel)
11309 case AARCH64_CMODEL_TINY:
11310 case AARCH64_CMODEL_TINY_PIC:
11311 case AARCH64_CMODEL_SMALL:
11312 case AARCH64_CMODEL_SMALL_PIC:
11313 case AARCH64_CMODEL_SMALL_SPIC:
11314 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11316 type = DW_EH_PE_sdata4;
11319 /* No assumptions here. 8-byte relocs required. */
11320 type = DW_EH_PE_sdata8;
11323 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11326 /* The last .arch and .tune assembly strings that we printed. */
11327 static std::string aarch64_last_printed_arch_string;
11328 static std::string aarch64_last_printed_tune_string;
11330 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11331 by the function fndecl. */
11334 aarch64_declare_function_name (FILE *stream, const char* name,
11337 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11339 struct cl_target_option *targ_options;
11341 targ_options = TREE_TARGET_OPTION (target_parts);
11343 targ_options = TREE_TARGET_OPTION (target_option_current_node);
11344 gcc_assert (targ_options);
11346 const struct processor *this_arch
11347 = aarch64_get_arch (targ_options->x_explicit_arch);
11349 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11350 std::string extension
11351 = aarch64_get_extension_string_for_isa_flags (isa_flags,
11353 /* Only update the assembler .arch string if it is distinct from the last
11354 such string we printed. */
11355 std::string to_print = this_arch->name + extension;
11356 if (to_print != aarch64_last_printed_arch_string)
11358 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11359 aarch64_last_printed_arch_string = to_print;
11362 /* Print the cpu name we're tuning for in the comments; it might be
11363 useful to readers of the generated asm. Do it only when it changes
11364 from function to function and verbose assembly is requested. */
11365 const struct processor *this_tune
11366 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11368 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11370 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11372 aarch64_last_printed_tune_string = this_tune->name;
11375 /* Don't forget the type directive for ELF. */
11376 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11377 ASM_OUTPUT_LABEL (stream, name);
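/* Illustrative output only (the exact directives come from the target
   macros): for a unit compiled with -march=armv8-a containing one function
   that carries __attribute__ ((target ("arch=armv8-a+crc"))), the assembly
   around that function would switch architecture roughly as:

       .arch armv8-a+crc
       .type foo, %function
     foo:

   and the .arch directive is only re-emitted when the computed string
   differs from aarch64_last_printed_arch_string.  "foo" is a placeholder
   name.  */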
11380 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11383 aarch64_start_file (void)
11385 struct cl_target_option *default_options
11386 = TREE_TARGET_OPTION (target_option_default_node);
11388 const struct processor *default_arch
11389 = aarch64_get_arch (default_options->x_explicit_arch);
11390 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11391 std::string extension
11392 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11393 default_arch->flags);
11395 aarch64_last_printed_arch_string = default_arch->name + extension;
11396 aarch64_last_printed_tune_string = "";
11397 asm_fprintf (asm_out_file, "\t.arch %s\n",
11398 aarch64_last_printed_arch_string.c_str ());
11400 default_file_start ();
11403 /* Emit load exclusive. */
11406 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11407 rtx mem, rtx model_rtx)
11409 rtx (*gen) (rtx, rtx, rtx);
11413 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11414 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11415 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11416 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11418 gcc_unreachable ();
11421 emit_insn (gen (rval, mem, model_rtx));
11424 /* Emit store exclusive. */
11427 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11428 rtx rval, rtx mem, rtx model_rtx)
11430 rtx (*gen) (rtx, rtx, rtx, rtx);
11434 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11435 case HImode: gen = gen_aarch64_store_exclusivehi; break;
11436 case SImode: gen = gen_aarch64_store_exclusivesi; break;
11437 case DImode: gen = gen_aarch64_store_exclusivedi; break;
11439 gcc_unreachable ();
11442 emit_insn (gen (bval, rval, mem, model_rtx));
11445 /* Mark the previous jump instruction as unlikely. */
11448 aarch64_emit_unlikely_jump (rtx insn)
11450 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11452 insn = emit_jump_insn (insn);
11453 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
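/* For reference, REG_BR_PROB_BASE is 10000, so VERY_UNLIKELY above is 99,
   i.e. the jump is annotated as taken with probability just under 1%.  */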
11456 /* Expand a compare and swap pattern. */
11459 aarch64_expand_compare_and_swap (rtx operands[])
11461 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11462 machine_mode mode, cmp_mode;
11463 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11466 const gen_cas_fn split_cas[] =
11468 gen_aarch64_compare_and_swapqi,
11469 gen_aarch64_compare_and_swaphi,
11470 gen_aarch64_compare_and_swapsi,
11471 gen_aarch64_compare_and_swapdi
11473 const gen_cas_fn atomic_cas[] =
11475 gen_aarch64_compare_and_swapqi_lse,
11476 gen_aarch64_compare_and_swaphi_lse,
11477 gen_aarch64_compare_and_swapsi_lse,
11478 gen_aarch64_compare_and_swapdi_lse
11481 bval = operands[0];
11482 rval = operands[1];
11484 oldval = operands[3];
11485 newval = operands[4];
11486 is_weak = operands[5];
11487 mod_s = operands[6];
11488 mod_f = operands[7];
11489 mode = GET_MODE (mem);
11492 /* Normally the succ memory model must be stronger than fail, but in the
11493 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11494 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11496 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11497 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11498 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11504 /* For short modes, we're going to perform the comparison in SImode,
11505 so do the zero-extension now. */
11507 rval = gen_reg_rtx (SImode);
11508 oldval = convert_modes (SImode, mode, oldval, true);
11509 /* Fall through. */
11513 /* Force the value into a register if needed. */
11514 if (!aarch64_plus_operand (oldval, mode))
11515 oldval = force_reg (cmp_mode, oldval);
11519 gcc_unreachable ();
11524 case QImode: idx = 0; break;
11525 case HImode: idx = 1; break;
11526 case SImode: idx = 2; break;
11527 case DImode: idx = 3; break;
11529 gcc_unreachable ();
11532 gen = atomic_cas[idx];
11534 gen = split_cas[idx];
11536 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11538 if (mode == QImode || mode == HImode)
11539 emit_move_insn (operands[1], gen_lowpart (mode, rval));
11541 x = gen_rtx_REG (CCmode, CC_REGNUM);
11542 x = gen_rtx_EQ (SImode, x, const0_rtx);
11543 emit_insn (gen_rtx_SET (bval, x));
11546 /* Test whether the target supports using an atomic load-operate instruction.
11547 CODE is the operation and AFTER is TRUE if the data in memory after the
11548 operation should be returned and FALSE if the data before the operation
11549 should be returned.  Returns FALSE if the operation isn't supported by the
11550 architecture.  */
11553 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11572 /* Emit a barrier that is appropriate for memory model MODEL, at the end of a
11573 sequence implementing an atomic operation. */
11576 aarch64_emit_post_barrier (enum memmodel model)
11578 const enum memmodel base_model = memmodel_base (model);
11580 if (is_mm_sync (model)
11581 && (base_model == MEMMODEL_ACQUIRE
11582 || base_model == MEMMODEL_ACQ_REL
11583 || base_model == MEMMODEL_SEQ_CST))
11585 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
11589 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
11590 for the data in memory. EXPECTED is the value expected to be in memory.
11591 DESIRED is the value to store to memory. MEM is the memory location. MODEL
11592 is the memory ordering to use. */
11595 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11596 rtx expected, rtx desired,
11599 rtx (*gen) (rtx, rtx, rtx, rtx);
11602 mode = GET_MODE (mem);
11606 case QImode: gen = gen_aarch64_atomic_casqi; break;
11607 case HImode: gen = gen_aarch64_atomic_cashi; break;
11608 case SImode: gen = gen_aarch64_atomic_cassi; break;
11609 case DImode: gen = gen_aarch64_atomic_casdi; break;
11611 gcc_unreachable ();
11614 /* Move the expected value into the CAS destination register. */
11615 emit_insn (gen_rtx_SET (rval, expected));
11617 /* Emit the CAS. */
11618 emit_insn (gen (rval, mem, desired, model));
11620 /* Compare the expected value with the value loaded by the CAS, to establish
11621 whether the swap was made. */
11622 aarch64_gen_compare_reg (EQ, rval, expected);
11625 /* Split a compare and swap pattern. */
11628 aarch64_split_compare_and_swap (rtx operands[])
11630 rtx rval, mem, oldval, newval, scratch;
11633 rtx_code_label *label1, *label2;
11635 enum memmodel model;
11638 rval = operands[0];
11640 oldval = operands[2];
11641 newval = operands[3];
11642 is_weak = (operands[4] != const0_rtx);
11643 model_rtx = operands[5];
11644 scratch = operands[7];
11645 mode = GET_MODE (mem);
11646 model = memmodel_from_int (INTVAL (model_rtx));
11648 /* When OLDVAL is zero and we want the strong version we can emit a tighter
11649 loop:
11650 .label1:
11651 LD[A]XR rval, [mem]
11652 CBNZ rval, .label2
11653 ST[L]XR scratch, newval, [mem]
11654 CBNZ scratch, .label1
11655 .label2:
11656 CMP rval, 0.  */
11657 bool strong_zero_p = !is_weak && oldval == const0_rtx;
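/* Rationale: with a zero OLDVAL the loaded value can be tested directly with
   CBNZ, so the separate compare against OLDVAL and its conditional branch are
   not needed inside the loop; the explicit CMP emitted after label2 merely
   recreates the condition flags for the caller.  */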
11662 label1 = gen_label_rtx ();
11663 emit_label (label1);
11665 label2 = gen_label_rtx ();
11667 /* The initial load can be relaxed for a __sync operation since a final
11668 barrier will be emitted to stop code hoisting. */
11669 if (is_mm_sync (model))
11670 aarch64_emit_load_exclusive (mode, rval, mem,
11671 GEN_INT (MEMMODEL_RELAXED));
11673 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
11677 if (aarch64_track_speculation)
11679 /* Emit an explicit compare instruction, so that we can correctly
11680 track the condition codes. */
11681 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
11682 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
11685 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
11687 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11688 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11689 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11693 cond = aarch64_gen_compare_reg (NE, rval, oldval);
11694 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11695 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11696 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11697 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11700 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
11704 if (aarch64_track_speculation)
11706 /* Emit an explicit compare instruction, so that we can correctly
11707 track the condition codes. */
11708 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
11709 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
11712 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
11714 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11715 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
11716 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11720 cond = gen_rtx_REG (CCmode, CC_REGNUM);
11721 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
11722 emit_insn (gen_rtx_SET (cond, x));
11725 emit_label (label2);
11726 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
11727 to set the condition flags. If this is not used it will be removed by
11731 cond = gen_rtx_REG (CCmode, CC_REGNUM);
11732 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
11733 emit_insn (gen_rtx_SET (cond, x));
11735 /* Emit any final barrier needed for a __sync operation. */
11736 if (is_mm_sync (model))
11737 aarch64_emit_post_barrier (model);
11740 /* Emit a BIC instruction. */
11743 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
11745 rtx shift_rtx = GEN_INT (shift);
11746 rtx (*gen) (rtx, rtx, rtx, rtx);
11750 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
11751 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
11753 gcc_unreachable ();
11756 emit_insn (gen (dst, s2, shift_rtx, s1));
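/* A sketch of the effect, assuming the and_one_cmpl_lshr patterns compute
   DST = S1 & ~(S2 >> SHIFT): aarch64_emit_bic (SImode, d, a, b, 0) therefore
   emits d = a & ~b, which is how the A & ~B result of the atomic CLR
   operations below is reconstructed in registers.  */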
11759 /* Emit an atomic swap. */
11762 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
11763 rtx mem, rtx model)
11765 rtx (*gen) (rtx, rtx, rtx, rtx);
11769 case QImode: gen = gen_aarch64_atomic_swpqi; break;
11770 case HImode: gen = gen_aarch64_atomic_swphi; break;
11771 case SImode: gen = gen_aarch64_atomic_swpsi; break;
11772 case DImode: gen = gen_aarch64_atomic_swpdi; break;
11774 gcc_unreachable ();
11777 emit_insn (gen (dst, mem, value, model));
11780 /* Operations supported by aarch64_emit_atomic_load_op. */
11782 enum aarch64_atomic_load_op_code
11784 AARCH64_LDOP_PLUS, /* A + B */
11785 AARCH64_LDOP_XOR, /* A ^ B */
11786 AARCH64_LDOP_OR, /* A | B */
11787 AARCH64_LDOP_BIC /* A & ~B */
11790 /* Emit an atomic load-operate. */
11793 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
11794 machine_mode mode, rtx dst, rtx src,
11795 rtx mem, rtx model)
11797 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
11798 const aarch64_atomic_load_op_fn plus[] =
11800 gen_aarch64_atomic_loadaddqi,
11801 gen_aarch64_atomic_loadaddhi,
11802 gen_aarch64_atomic_loadaddsi,
11803 gen_aarch64_atomic_loadadddi
11805 const aarch64_atomic_load_op_fn eor[] =
11807 gen_aarch64_atomic_loadeorqi,
11808 gen_aarch64_atomic_loadeorhi,
11809 gen_aarch64_atomic_loadeorsi,
11810 gen_aarch64_atomic_loadeordi
11812 const aarch64_atomic_load_op_fn ior[] =
11814 gen_aarch64_atomic_loadsetqi,
11815 gen_aarch64_atomic_loadsethi,
11816 gen_aarch64_atomic_loadsetsi,
11817 gen_aarch64_atomic_loadsetdi
11819 const aarch64_atomic_load_op_fn bic[] =
11821 gen_aarch64_atomic_loadclrqi,
11822 gen_aarch64_atomic_loadclrhi,
11823 gen_aarch64_atomic_loadclrsi,
11824 gen_aarch64_atomic_loadclrdi
11826 aarch64_atomic_load_op_fn gen;
11831 case QImode: idx = 0; break;
11832 case HImode: idx = 1; break;
11833 case SImode: idx = 2; break;
11834 case DImode: idx = 3; break;
11836 gcc_unreachable ();
11841 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
11842 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
11843 case AARCH64_LDOP_OR: gen = ior[idx]; break;
11844 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
11846 gcc_unreachable ();
11849 emit_insn (gen (dst, mem, src, model));
11852 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
11853 location to store the data read from memory. OUT_RESULT is the location to
11854 store the result of the operation. MEM is the memory location to read and
11855 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
11856 operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
11857 be NULL.  */
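/* Roughly, SET becomes a plain SWP, MINUS is negated and handled as PLUS
   (LDADD), and AND is inverted and handled as BIC (LDCLR), since
   A & V == A & ~(~V); see the preprocessing of SRC below.  */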
11860 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
11861 rtx mem, rtx value, rtx model_rtx)
11863 machine_mode mode = GET_MODE (mem);
11864 machine_mode wmode = (mode == DImode ? DImode : SImode);
11865 const bool short_mode = (mode < SImode);
11866 aarch64_atomic_load_op_code ldop_code;
11871 out_data = gen_lowpart (mode, out_data);
11874 out_result = gen_lowpart (mode, out_result);
11876 /* Make sure the value is in a register, putting it into a destination
11877 register if it needs to be manipulated. */
11878 if (!register_operand (value, mode)
11879 || code == AND || code == MINUS)
11881 src = out_result ? out_result : out_data;
11882 emit_move_insn (src, gen_lowpart (mode, value));
11886 gcc_assert (register_operand (src, mode));
11888 /* Preprocess the data for the operation as necessary. If the operation is
11889 a SET then emit a swap instruction and finish. */
11893 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
11897 /* Negate the value and treat it as a PLUS. */
11901 /* Resize the value if necessary. */
11903 src = gen_lowpart (wmode, src);
11905 neg_src = gen_rtx_NEG (wmode, src);
11906 emit_insn (gen_rtx_SET (src, neg_src));
11909 src = gen_lowpart (mode, src);
11911 /* Fall-through. */
11913 ldop_code = AARCH64_LDOP_PLUS;
11917 ldop_code = AARCH64_LDOP_OR;
11921 ldop_code = AARCH64_LDOP_XOR;
11928 /* Resize the value if necessary. */
11930 src = gen_lowpart (wmode, src);
11932 not_src = gen_rtx_NOT (wmode, src);
11933 emit_insn (gen_rtx_SET (src, not_src));
11936 src = gen_lowpart (mode, src);
11938 ldop_code = AARCH64_LDOP_BIC;
11942 /* The operation can't be done with atomic instructions. */
11943 gcc_unreachable ();
11946 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
11948 /* If necessary, calculate the data in memory after the update by redoing the
11949 operation from values in registers. */
11955 src = gen_lowpart (wmode, src);
11956 out_data = gen_lowpart (wmode, out_data);
11957 out_result = gen_lowpart (wmode, out_result);
11966 x = gen_rtx_PLUS (wmode, out_data, src);
11969 x = gen_rtx_IOR (wmode, out_data, src);
11972 x = gen_rtx_XOR (wmode, out_data, src);
11975 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
11978 gcc_unreachable ();
11981 emit_set_insn (out_result, x);
11986 /* Split an atomic operation. */
11989 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
11990 rtx value, rtx model_rtx, rtx cond)
11992 machine_mode mode = GET_MODE (mem);
11993 machine_mode wmode = (mode == DImode ? DImode : SImode);
11994 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
11995 const bool is_sync = is_mm_sync (model);
11996 rtx_code_label *label;
11999 /* Split the atomic operation into a sequence. */
12000 label = gen_label_rtx ();
12001 emit_label (label);
12004 new_out = gen_lowpart (wmode, new_out);
12006 old_out = gen_lowpart (wmode, old_out);
12009 value = simplify_gen_subreg (wmode, value, mode, 0);
12011 /* The initial load can be relaxed for a __sync operation since a final
12012 barrier will be emitted to stop code hoisting. */
12014 aarch64_emit_load_exclusive (mode, old_out, mem,
12015 GEN_INT (MEMMODEL_RELAXED));
12017 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12026 x = gen_rtx_AND (wmode, old_out, value);
12027 emit_insn (gen_rtx_SET (new_out, x));
12028 x = gen_rtx_NOT (wmode, new_out);
12029 emit_insn (gen_rtx_SET (new_out, x));
12033 if (CONST_INT_P (value))
12035 value = GEN_INT (-INTVAL (value));
12038 /* Fall through. */
12041 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12042 emit_insn (gen_rtx_SET (new_out, x));
12046 aarch64_emit_store_exclusive (mode, cond, mem,
12047 gen_lowpart (mode, new_out), model_rtx);
12049 if (aarch64_track_speculation)
12051 /* Emit an explicit compare instruction, so that we can correctly
12052 track the condition codes. */
12053 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
12054 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
12057 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12059 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12060 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12061 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12063 /* Emit any final barrier needed for a __sync operation. */
12065 aarch64_emit_post_barrier (model);
12069 aarch64_init_libfuncs (void)
12071 /* Half-precision float operations. The compiler handles all operations
12072 with NULL libfuncs by converting to SFmode. */
12075 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12076 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12079 set_optab_libfunc (add_optab, HFmode, NULL);
12080 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12081 set_optab_libfunc (smul_optab, HFmode, NULL);
12082 set_optab_libfunc (neg_optab, HFmode, NULL);
12083 set_optab_libfunc (sub_optab, HFmode, NULL);
12086 set_optab_libfunc (eq_optab, HFmode, NULL);
12087 set_optab_libfunc (ne_optab, HFmode, NULL);
12088 set_optab_libfunc (lt_optab, HFmode, NULL);
12089 set_optab_libfunc (le_optab, HFmode, NULL);
12090 set_optab_libfunc (ge_optab, HFmode, NULL);
12091 set_optab_libfunc (gt_optab, HFmode, NULL);
12092 set_optab_libfunc (unord_optab, HFmode, NULL);
12095 /* Target hook for c_mode_for_suffix. */
12096 static machine_mode
12097 aarch64_c_mode_for_suffix (char suffix)
12105 /* We can only represent floating point constants which will fit in
12106 "quarter-precision" values. These values are characterised by
12107 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
12108 by:
12110 (-1)^s * (n/16) * 2^r
12112 where:
12113 's' is the sign bit.
12114 'n' is an integer in the range 16 <= n <= 31.
12115 'r' is an integer in the range -3 <= r <= 4. */
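/* Worked examples: s = 0, n = 20, r = 0 gives 20/16 = 1.25, and the
   representable magnitudes range from (16/16) * 2^-3 = 0.125 up to
   (31/16) * 2^4 = 31.0.  */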
12117 /* Return true iff X can be represented by a quarter-precision
12118 floating point immediate operand.  Note, we cannot represent 0.0.  */
12120 aarch64_float_const_representable_p (rtx x)
12122 /* This represents our current view of how many bits
12123 make up the mantissa. */
12124 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12126 unsigned HOST_WIDE_INT mantissa, mask;
12127 REAL_VALUE_TYPE r, m;
12130 if (!CONST_DOUBLE_P (x))
12133 /* We don't support HFmode constants yet. */
12134 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12137 r = *CONST_DOUBLE_REAL_VALUE (x);
12139 /* We cannot represent infinities, NaNs or +/-zero. We won't
12140 know if we have +zero until we analyse the mantissa, but we
12141 can reject the other invalid values. */
12142 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12143 || REAL_VALUE_MINUS_ZERO (r))
12146 /* Extract exponent. */
12147 r = real_value_abs (&r);
12148 exponent = REAL_EXP (&r);
12150 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12151 highest (sign) bit, with a fixed binary point at bit point_pos.
12152 m1 holds the low part of the mantissa, m2 the high part.
12153 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12154 bits for the mantissa, this can fail (low bits will be lost). */
12155 real_ldexp (&m, &r, point_pos - exponent);
12156 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12158 /* If the low part of the mantissa has bits set we cannot represent
12159 the value.  */
12160 if (w.elt (0) != 0)
12162 /* We have rejected the lower HOST_WIDE_INT, so update our
12163 understanding of how many bits lie in the mantissa and
12164 look only at the high HOST_WIDE_INT. */
12165 mantissa = w.elt (1);
12166 point_pos -= HOST_BITS_PER_WIDE_INT;
12168 /* We can only represent values with a mantissa of the form 1.xxxx. */
12169 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12170 if ((mantissa & mask) != 0)
12173 /* Having filtered unrepresentable values, we may now remove all
12174 but the highest 5 bits. */
12175 mantissa >>= point_pos - 5;
12177 /* We cannot represent the value 0.0, so reject it.  This is handled
12178 elsewhere.  */
12182 /* Then, as bit 4 is always set, we can mask it off, leaving
12183 the mantissa in the range [0, 15]. */
12184 mantissa &= ~(1 << 4);
12185 gcc_assert (mantissa <= 15);
12187 /* GCC internally does not use IEEE754-like encoding (where normalized
12188 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
12189 Our mantissa values are shifted 4 places to the left relative to
12190 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12191 by 5 places to correct for GCC's representation. */
12192 exponent = 5 - exponent;
12194 return (exponent >= 0 && exponent <= 7);
12198 aarch64_output_simd_mov_immediate (rtx const_vector,
12203 static char templ[40];
12204 const char *mnemonic;
12205 const char *shift_op;
12206 unsigned int lane_count = 0;
12209 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12211 /* This will return true to show const_vector is legal for use as either
12212 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
12213 also update INFO to show how the immediate should be generated. */
12214 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12215 gcc_assert (is_valid);
12217 element_char = sizetochar (info.element_width);
12218 lane_count = width / info.element_width;
12220 mode = GET_MODE_INNER (mode);
12221 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12223 gcc_assert (info.shift == 0 && ! info.mvn);
12224 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12225 move immediate path. */
12226 if (aarch64_float_const_zero_rtx_p (info.value))
12227 info.value = GEN_INT (0);
12230 const unsigned int buf_size = 20;
12231 char float_buf[buf_size] = {'\0'};
12232 real_to_decimal_for_mode (float_buf,
12233 CONST_DOUBLE_REAL_VALUE (info.value),
12234 buf_size, buf_size, 1, mode);
12236 if (lane_count == 1)
12237 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12239 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12240 lane_count, element_char, float_buf);
12245 mnemonic = info.mvn ? "mvni" : "movi";
12246 shift_op = info.msl ? "msl" : "lsl";
12248 gcc_assert (CONST_INT_P (info.value));
12249 if (lane_count == 1)
12250 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12251 mnemonic, UINTVAL (info.value));
12252 else if (info.shift)
12253 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12254 ", %s %d", mnemonic, lane_count, element_char,
12255 UINTVAL (info.value), shift_op, info.shift);
12257 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12258 mnemonic, lane_count, element_char, UINTVAL (info.value));
12263 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12266 machine_mode vmode;
12268 gcc_assert (!VECTOR_MODE_P (mode));
12269 vmode = aarch64_simd_container_mode (mode, 64);
12270 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12271 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12274 /* Split operands into moves from op[1] + op[2] into op[0]. */
12277 aarch64_split_combinev16qi (rtx operands[3])
12279 unsigned int dest = REGNO (operands[0]);
12280 unsigned int src1 = REGNO (operands[1]);
12281 unsigned int src2 = REGNO (operands[2]);
12282 machine_mode halfmode = GET_MODE (operands[1]);
12283 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12284 rtx destlo, desthi;
12286 gcc_assert (halfmode == V16QImode);
12288 if (src1 == dest && src2 == dest + halfregs)
12290 /* No-op move. Can't split to nothing; emit something. */
12291 emit_note (NOTE_INSN_DELETED);
12295 /* Preserve register attributes for variable tracking. */
12296 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12297 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12298 GET_MODE_SIZE (halfmode));
12300 /* Special case of reversed high/low parts. */
12301 if (reg_overlap_mentioned_p (operands[2], destlo)
12302 && reg_overlap_mentioned_p (operands[1], desthi))
12304 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12305 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12306 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12308 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12310 /* Try to avoid unnecessary moves if part of the result
12311 is in the right place already. */
12313 emit_move_insn (destlo, operands[1]);
12314 if (src2 != dest + halfregs)
12315 emit_move_insn (desthi, operands[2]);
12319 if (src2 != dest + halfregs)
12320 emit_move_insn (desthi, operands[2]);
12322 emit_move_insn (destlo, operands[1]);
12326 /* vec_perm support. */
12328 #define MAX_VECT_LEN 16
12330 struct expand_vec_perm_d
12332 rtx target, op0, op1;
12333 unsigned char perm[MAX_VECT_LEN];
12334 machine_mode vmode;
12335 unsigned char nelt;
12340 /* Generate a variable permutation. */
12343 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12345 machine_mode vmode = GET_MODE (target);
12346 bool one_vector_p = rtx_equal_p (op0, op1);
12348 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12349 gcc_checking_assert (GET_MODE (op0) == vmode);
12350 gcc_checking_assert (GET_MODE (op1) == vmode);
12351 gcc_checking_assert (GET_MODE (sel) == vmode);
12352 gcc_checking_assert (TARGET_SIMD);
12356 if (vmode == V8QImode)
12358 /* Expand the argument to a V16QI mode by duplicating it. */
12359 rtx pair = gen_reg_rtx (V16QImode);
12360 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12361 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12365 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12372 if (vmode == V8QImode)
12374 pair = gen_reg_rtx (V16QImode);
12375 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12376 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12380 pair = gen_reg_rtx (OImode);
12381 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12382 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12388 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12390 machine_mode vmode = GET_MODE (target);
12391 unsigned int nelt = GET_MODE_NUNITS (vmode);
12392 bool one_vector_p = rtx_equal_p (op0, op1);
12395 /* The TBL instruction does not use a modulo index, so we must take care
12396 of that ourselves. */
12397 mask = aarch64_simd_gen_const_vector_dup (vmode,
12398 one_vector_p ? nelt - 1 : 2 * nelt - 1);
12399 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12401 /* For big-endian, we also need to reverse the index within the vector
12402 (but not which vector). */
12403 if (BYTES_BIG_ENDIAN)
12405 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12407 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12408 sel = expand_simple_binop (vmode, XOR, sel, mask,
12409 NULL, 0, OPTAB_LIB_WIDEN);
12411 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
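/* For instance, with two V16QImode operands the mask above is 31, so a
   selector element of 37 is reduced to 5; with a single operand the mask is
   15, giving the modulo behaviour that TBL itself does not provide.  */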
12414 /* Recognize patterns suitable for the TRN instructions. */
12416 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12418 unsigned int i, odd, mask, nelt = d->nelt;
12419 rtx out, in0, in1, x;
12420 rtx (*gen) (rtx, rtx, rtx);
12421 machine_mode vmode = d->vmode;
12423 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12426 /* Note that these are little-endian tests.
12427 We correct for big-endian later. */
12428 if (d->perm[0] == 0)
12430 else if (d->perm[0] == 1)
12434 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12436 for (i = 0; i < nelt; i += 2)
12438 if (d->perm[i] != i + odd)
12440 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12450 if (BYTES_BIG_ENDIAN)
12452 x = in0, in0 = in1, in1 = x;
12461 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12462 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12463 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12464 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12465 case V4SImode: gen = gen_aarch64_trn2v4si; break;
12466 case V2SImode: gen = gen_aarch64_trn2v2si; break;
12467 case V2DImode: gen = gen_aarch64_trn2v2di; break;
12468 case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
12469 case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
12470 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12471 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12472 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12481 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12482 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12483 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12484 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12485 case V4SImode: gen = gen_aarch64_trn1v4si; break;
12486 case V2SImode: gen = gen_aarch64_trn1v2si; break;
12487 case V2DImode: gen = gen_aarch64_trn1v2di; break;
12488 case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
12489 case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
12490 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12491 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12492 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12498 emit_insn (gen (out, in0, in1));
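/* As a concrete case, for V4SImode the permutation {0, 4, 2, 6} (even lanes
   of both inputs) is matched here as TRN1 and {1, 5, 3, 7} as TRN2.  */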
12502 /* Recognize patterns suitable for the UZP instructions. */
12504 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12506 unsigned int i, odd, mask, nelt = d->nelt;
12507 rtx out, in0, in1, x;
12508 rtx (*gen) (rtx, rtx, rtx);
12509 machine_mode vmode = d->vmode;
12511 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12514 /* Note that these are little-endian tests.
12515 We correct for big-endian later. */
12516 if (d->perm[0] == 0)
12518 else if (d->perm[0] == 1)
12522 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12524 for (i = 0; i < nelt; i++)
12526 unsigned elt = (i * 2 + odd) & mask;
12527 if (d->perm[i] != elt)
12537 if (BYTES_BIG_ENDIAN)
12539 x = in0, in0 = in1, in1 = x;
12548 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12549 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12550 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12551 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12552 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12553 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12554 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12555 case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
12556 case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
12557 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12558 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12559 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12568 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12569 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12570 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12571 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12572 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12573 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12574 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12575 case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
12576 case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
12577 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12578 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12579 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12585 emit_insn (gen (out, in0, in1));
12589 /* Recognize patterns suitable for the ZIP instructions. */
12591 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12593 unsigned int i, high, mask, nelt = d->nelt;
12594 rtx out, in0, in1, x;
12595 rtx (*gen) (rtx, rtx, rtx);
12596 machine_mode vmode = d->vmode;
12598 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12601 /* Note that these are little-endian tests.
12602 We correct for big-endian later. */
12604 if (d->perm[0] == high)
12607 else if (d->perm[0] == 0)
12611 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12613 for (i = 0; i < nelt / 2; i++)
12615 unsigned elt = (i + high) & mask;
12616 if (d->perm[i * 2] != elt)
12618 elt = (elt + nelt) & mask;
12619 if (d->perm[i * 2 + 1] != elt)
12629 if (BYTES_BIG_ENDIAN)
12631 x = in0, in0 = in1, in1 = x;
12640 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12641 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12642 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12643 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12644 case V4SImode: gen = gen_aarch64_zip2v4si; break;
12645 case V2SImode: gen = gen_aarch64_zip2v2si; break;
12646 case V2DImode: gen = gen_aarch64_zip2v2di; break;
12647 case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
12648 case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
12649 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12650 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12651 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12660 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
12661 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
12662 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
12663 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
12664 case V4SImode: gen = gen_aarch64_zip1v4si; break;
12665 case V2SImode: gen = gen_aarch64_zip1v2si; break;
12666 case V2DImode: gen = gen_aarch64_zip1v2di; break;
12667 case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
12668 case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
12669 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
12670 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
12671 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
12677 emit_insn (gen (out, in0, in1));
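/* For example, on V4SImode the permutation {0, 4, 1, 5} (interleave the low
   halves of the inputs) is matched as ZIP1 and {2, 6, 3, 7} (interleave the
   high halves) as ZIP2.  */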
12681 /* Recognize patterns for the EXT insn. */
12684 aarch64_evpc_ext (struct expand_vec_perm_d *d)
12686 unsigned int i, nelt = d->nelt;
12687 rtx (*gen) (rtx, rtx, rtx, rtx);
12690 unsigned int location = d->perm[0]; /* Always < nelt. */
12692 /* Check if the extracted indices are increasing by one. */
12693 for (i = 1; i < nelt; i++)
12695 unsigned int required = location + i;
12696 if (d->one_vector_p)
12698 /* We'll pass the same vector in twice, so allow indices to wrap. */
12699 required &= (nelt - 1);
12701 if (d->perm[i] != required)
12707 case V16QImode: gen = gen_aarch64_extv16qi; break;
12708 case V8QImode: gen = gen_aarch64_extv8qi; break;
12709 case V4HImode: gen = gen_aarch64_extv4hi; break;
12710 case V8HImode: gen = gen_aarch64_extv8hi; break;
12711 case V2SImode: gen = gen_aarch64_extv2si; break;
12712 case V4SImode: gen = gen_aarch64_extv4si; break;
12713 case V4HFmode: gen = gen_aarch64_extv4hf; break;
12714 case V8HFmode: gen = gen_aarch64_extv8hf; break;
12715 case V2SFmode: gen = gen_aarch64_extv2sf; break;
12716 case V4SFmode: gen = gen_aarch64_extv4sf; break;
12717 case V2DImode: gen = gen_aarch64_extv2di; break;
12718 case V2DFmode: gen = gen_aarch64_extv2df; break;
12727 /* The case where (location == 0) is a no-op for both big- and little-endian,
12728 and is removed by the mid-end at optimization levels -O1 and higher. */
12730 if (BYTES_BIG_ENDIAN && (location != 0))
12732 /* After setup, we want the high elements of the first vector (stored
12733 at the LSB end of the register), and the low elements of the second
12734 vector (stored at the MSB end of the register). So swap. */
12735 std::swap (d->op0, d->op1);
12736 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
12737 location = nelt - location;
12740 offset = GEN_INT (location);
12741 emit_insn (gen (d->target, d->op0, d->op1, offset));
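/* E.g. for V8QImode the permutation {3, 4, 5, 6, 7, 8, 9, 10} extracts eight
   consecutive bytes of the concatenated operands starting at element 3 and is
   emitted as EXT with offset 3 (flipped to nelt - location on big-endian, as
   above).  */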
12745 /* Recognize patterns for the REV insns. */
12748 aarch64_evpc_rev (struct expand_vec_perm_d *d)
12750 unsigned int i, j, diff, nelt = d->nelt;
12751 rtx (*gen) (rtx, rtx);
12753 if (!d->one_vector_p)
12762 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
12763 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
12771 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
12772 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
12773 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
12774 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
12782 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
12783 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
12784 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
12785 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
12786 case V4SImode: gen = gen_aarch64_rev64v4si; break;
12787 case V2SImode: gen = gen_aarch64_rev64v2si; break;
12788 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
12789 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
12790 case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
12791 case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
12800 for (i = 0; i < nelt ; i += diff + 1)
12801 for (j = 0; j <= diff; j += 1)
12803 /* This is guaranteed to be true as the value of diff
12804 is 7, 3, 1 and we should have enough elements in the
12805 queue to generate this. Getting a vector mask with a
12806 value of diff other than these values implies that
12807 something is wrong by the time we get here. */
12808 gcc_assert (i + j < nelt);
12809 if (d->perm[i + j] != i + diff - j)
12817 emit_insn (gen (d->target, d->op0));
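/* For instance, the V8QImode permutations {1, 0, 3, 2, 5, 4, 7, 6},
   {3, 2, 1, 0, 7, 6, 5, 4} and {7, 6, 5, 4, 3, 2, 1, 0} correspond to diff
   values of 1, 3 and 7 and are emitted as REV16, REV32 and REV64
   respectively.  */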
12822 aarch64_evpc_dup (struct expand_vec_perm_d *d)
12824 rtx (*gen) (rtx, rtx, rtx);
12825 rtx out = d->target;
12827 machine_mode vmode = d->vmode;
12828 unsigned int i, elt, nelt = d->nelt;
12832 for (i = 1; i < nelt; i++)
12834 if (elt != d->perm[i])
12838 /* The generic preparation in aarch64_expand_vec_perm_const_1
12839 swaps the operand order and the permute indices if it finds
12840 d->perm[0] to be in the second operand. Thus, we can always
12841 use d->op0 and need not do any extra arithmetic to get the
12842 correct lane number. */
12844 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
12848 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
12849 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
12850 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
12851 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
12852 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
12853 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
12854 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
12855 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
12856 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
12857 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
12858 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
12859 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
12864 emit_insn (gen (out, in0, lane));
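/* E.g. a V4SImode permutation of {2, 2, 2, 2} is matched here and emitted as
   a DUP of lane 2 of the first operand.  */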
12869 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
12871 rtx rperm[MAX_VECT_LEN], sel;
12872 machine_mode vmode = d->vmode;
12873 unsigned int i, nelt = d->nelt;
12878 /* Generic code will try constant permutation twice. Once with the
12879 original mode and again with the elements lowered to QImode.
12880 So wait and don't do the selector expansion ourselves. */
12881 if (vmode != V8QImode && vmode != V16QImode)
12884 for (i = 0; i < nelt; ++i)
12886 int nunits = GET_MODE_NUNITS (vmode);
12888 /* If big-endian and two vectors we end up with a weird mixed-endian
12889 mode on NEON.  Reverse the index within each word but not the word
12890 itself.  */
12891 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
12894 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
12895 sel = force_reg (vmode, sel);
12897 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
12902 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
12904 /* The pattern matching functions above are written to look for a small
12905 number to begin the sequence (0, 1, N/2). If we begin with an index
12906 from the second operand, we can swap the operands. */
12907 if (d->perm[0] >= d->nelt)
12909 unsigned i, nelt = d->nelt;
12911 gcc_assert (nelt == (nelt & -nelt));
12912 for (i = 0; i < nelt; ++i)
12913 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
12915 std::swap (d->op0, d->op1);
12920 if (aarch64_evpc_rev (d))
12922 else if (aarch64_evpc_ext (d))
12924 else if (aarch64_evpc_dup (d))
12926 else if (aarch64_evpc_zip (d))
12928 else if (aarch64_evpc_uzp (d))
12930 else if (aarch64_evpc_trn (d))
12932 return aarch64_evpc_tbl (d);
12937 /* Expand a vec_perm_const pattern. */
12940 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
12942 struct expand_vec_perm_d d;
12943 int i, nelt, which;
12949 d.vmode = GET_MODE (target);
12950 gcc_assert (VECTOR_MODE_P (d.vmode));
12951 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12952 d.testing_p = false;
12954 for (i = which = 0; i < nelt; ++i)
12956 rtx e = XVECEXP (sel, 0, i);
12957 int ei = INTVAL (e) & (2 * nelt - 1);
12958 which |= (ei < nelt ? 1 : 2);
12965 gcc_unreachable ();
12968 d.one_vector_p = false;
12969 if (!rtx_equal_p (op0, op1))
12972 /* The elements of PERM do not suggest that only the first operand
12973 is used, but both operands are identical. Allow easier matching
12974 of the permutation by folding the permutation into the single
12975 operand itself.  */
12976 /* Fall Through. */
12978 for (i = 0; i < nelt; ++i)
12979 d.perm[i] &= nelt - 1;
12981 d.one_vector_p = true;
12986 d.one_vector_p = true;
12990 return aarch64_expand_vec_perm_const_1 (&d);
12994 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
12995 const unsigned char *sel)
12997 struct expand_vec_perm_d d;
12998 unsigned int i, nelt, which;
13002 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13003 d.testing_p = true;
13004 memcpy (d.perm, sel, nelt);
13006 /* Calculate whether all elements are in one vector. */
13007 for (i = which = 0; i < nelt; ++i)
13009 unsigned char e = d.perm[i];
13010 gcc_assert (e < 2 * nelt);
13011 which |= (e < nelt ? 1 : 2);
13014 /* If all elements are from the second vector, reindex as if from the
13015 first vector.  */
13017 for (i = 0; i < nelt; ++i)
13020 /* Check whether the mask can be applied to a single vector. */
13021 d.one_vector_p = (which != 3);
13023 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13024 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13025 if (!d.one_vector_p)
13026 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13029 ret = aarch64_expand_vec_perm_const_1 (&d);
13036 aarch64_reverse_mask (enum machine_mode mode)
13038 /* We have to reverse each vector because we don't have
13039 a permuted load that can reverse-load according to ABI rules. */
13041 rtvec v = rtvec_alloc (16);
13043 int nunits = GET_MODE_NUNITS (mode);
13044 int usize = GET_MODE_UNIT_SIZE (mode);
13046 gcc_assert (BYTES_BIG_ENDIAN);
13047 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13049 for (i = 0; i < nunits; i++)
13050 for (j = 0; j < usize; j++)
13051 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13052 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13053 return force_reg (V16QImode, mask);
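/* For V4SImode this builds the byte selector {3, 2, 1, 0, 7, 6, 5, 4,
   11, 10, 9, 8, 15, 14, 13, 12}: each lane's bytes are reversed while the
   lane order itself is preserved.  */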
13056 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13057 However, due to issues with register allocation it is preferable to avoid
13058 tying integer scalar and FP scalar modes.  Executing integer operations
13059 in general registers is better than treating them as scalar vector
13060 operations. This reduces latency and avoids redundant int<->FP moves.
13061 So tie modes if they are either the same class, or vector modes with
13062 other vector modes, vector structs or any scalar mode.
13066 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13068 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13071 /* We specifically want to allow elements of "structure" modes to
13072 be tieable to the structure. This more general condition allows
13073 other rarer situations too. */
13074 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13077 /* Also allow any scalar modes with vectors. */
13078 if (aarch64_vector_mode_supported_p (mode1)
13079 || aarch64_vector_mode_supported_p (mode2))
13085 /* Return a new RTX holding the result of moving POINTER forward by
13089 aarch64_move_pointer (rtx pointer, int amount)
13091 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13093 return adjust_automodify_address (pointer, GET_MODE (pointer),
13097 /* Return a new RTX holding the result of moving POINTER forward by the
13098 size of the mode it points to. */
13101 aarch64_progress_pointer (rtx pointer)
13103 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13105 return aarch64_move_pointer (pointer, amount);
13108 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13112 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13115 rtx reg = gen_reg_rtx (mode);
13117 /* "Cast" the pointers to the correct mode. */
13118 *src = adjust_address (*src, mode, 0);
13119 *dst = adjust_address (*dst, mode, 0);
13120 /* Emit the memcpy. */
13121 emit_move_insn (reg, *src);
13122 emit_move_insn (*dst, reg);
13123 /* Move the pointers forward. */
13124 *src = aarch64_progress_pointer (*src);
13125 *dst = aarch64_progress_pointer (*dst);
13128 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13129 we succeed, otherwise return false. */
13132 aarch64_expand_movmem (rtx *operands)
13135 rtx dst = operands[0];
13136 rtx src = operands[1];
13138 bool speed_p = !optimize_function_for_size_p (cfun);
13140 /* When optimizing for size, give a better estimate of the length of a
13141 memcpy call, but use the default otherwise. */
13142 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13144 /* We can't do anything smart if the amount to copy is not constant. */
13145 if (!CONST_INT_P (operands[2]))
13148 n = UINTVAL (operands[2]);
13150 /* Try to keep the number of instructions low. For cases below 16 bytes we
13151 need to make at most two moves. For cases above 16 bytes it will be one
13152 move for each 16 byte chunk, then at most two additional moves.  */
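/* For example, a 35-byte constant copy is estimated at 35/16 + 2 = 4 moves,
   comfortably under the speed-path limit of 15 / 2 = 7, so it is expanded
   inline rather than left as a memcpy call.  */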
13153 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13156 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13157 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13159 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13160 src = adjust_automodify_address (src, VOIDmode, base, 0);
13162 /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13163 1-byte chunk.  */
13168 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13173 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13178 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13179 4-byte chunk, partially overlapping with the previously copied chunk. */
13182 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13188 src = aarch64_move_pointer (src, move);
13189 dst = aarch64_move_pointer (dst, move);
13190 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13195 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13196 them, then (if applicable) an 8-byte chunk. */
13201 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13206 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13211 /* Finish the final bytes of the copy. We can always do this in one
13212 instruction. We either copy the exact amount we need, or partially
13213 overlap with the previous chunk we copied and copy 8-bytes. */
13217 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13219 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13221 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13226 src = aarch64_move_pointer (src, -1);
13227 dst = aarch64_move_pointer (dst, -1);
13228 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13234 src = aarch64_move_pointer (src, move);
13235 dst = aarch64_move_pointer (dst, move);
13236 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13243 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
13245 static unsigned HOST_WIDE_INT
13246 aarch64_asan_shadow_offset (void)
13248 return (HOST_WIDE_INT_1 << 36);
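/* Under the usual AArch64 ASan mapping (shadow scale of 3) this places the
   shadow at shadow_addr = (addr >> 3) + (1 << 36).  */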
13252 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13253 unsigned int align,
13254 enum by_pieces_operation op,
13257 /* STORE_BY_PIECES can be used when copying a constant string, but
13258 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13259 For now we always fail this and let the move_by_pieces code copy
13260 the string from read-only memory. */
13261 if (op == STORE_BY_PIECES)
13264 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13268 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
13269 int code, tree treeop0, tree treeop1)
13271 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13273 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13275 struct expand_operand ops[4];
13278 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13280 op_mode = GET_MODE (op0);
13281 if (op_mode == VOIDmode)
13282 op_mode = GET_MODE (op1);
13290 icode = CODE_FOR_cmpsi;
13295 icode = CODE_FOR_cmpdi;
13300 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13301 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13306 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13307 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13315 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13316 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13322 *prep_seq = get_insns ();
13325 create_fixed_operand (&ops[0], op0);
13326 create_fixed_operand (&ops[1], op1);
13329 if (!maybe_expand_insn (icode, 2, ops))
13334 *gen_seq = get_insns ();
13337 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13338 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13342 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
13343 tree treeop0, tree treeop1, int bit_code)
13345 rtx op0, op1, target;
13346 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13347 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13349 struct expand_operand ops[6];
13352 push_to_sequence ((rtx_insn*) *prep_seq);
13353 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13355 op_mode = GET_MODE (op0);
13356 if (op_mode == VOIDmode)
13357 op_mode = GET_MODE (op1);
13365 icode = CODE_FOR_ccmpsi;
13370 icode = CODE_FOR_ccmpdi;
13375 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13376 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13381 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13382 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13390 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13391 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13397 *prep_seq = get_insns ();
13400 target = gen_rtx_REG (cc_mode, CC_REGNUM);
13401 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13403 if (bit_code != AND)
13405 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13406 GET_MODE (XEXP (prev, 0))),
13407 VOIDmode, XEXP (prev, 0), const0_rtx);
13408 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13411 create_fixed_operand (&ops[0], XEXP (prev, 0));
13412 create_fixed_operand (&ops[1], target);
13413 create_fixed_operand (&ops[2], op0);
13414 create_fixed_operand (&ops[3], op1);
13415 create_fixed_operand (&ops[4], prev);
13416 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13418 push_to_sequence ((rtx_insn*) *gen_seq);
13419 if (!maybe_expand_insn (icode, 6, ops))
13425 *gen_seq = get_insns ();
13428 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
13431 #undef TARGET_GEN_CCMP_FIRST
13432 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13434 #undef TARGET_GEN_CCMP_NEXT
13435 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13437 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
13438 instruction fusion of some sort. */
13441 aarch64_macro_fusion_p (void)
13443 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13447 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13448 should be kept together during scheduling. */
13451 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13454 rtx prev_set = single_set (prev);
13455 rtx curr_set = single_set (curr);
13456 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
13457 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13459 if (!aarch64_macro_fusion_p ())
13463 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOV_MOVK))
13465 /* We are trying to match:
13466 prev (mov) == (set (reg r0) (const_int imm16))
13467 curr (movk) == (set (zero_extract (reg r0)
13470 (const_int imm16_1)) */
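      /* In assembly terms this is the usual immediate-building pair, e.g.
		mov	x0, #0x1234
		movk	x0, #0x5678, lsl #16  */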
13472 set_dest = SET_DEST (curr_set);
13474 if (GET_CODE (set_dest) == ZERO_EXTRACT
13475 && CONST_INT_P (SET_SRC (curr_set))
13476 && CONST_INT_P (SET_SRC (prev_set))
13477 && CONST_INT_P (XEXP (set_dest, 2))
13478 && INTVAL (XEXP (set_dest, 2)) == 16
13479 && REG_P (XEXP (set_dest, 0))
13480 && REG_P (SET_DEST (prev_set))
13481 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13488 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_ADD))
13491 /* We're trying to match:
13492 prev (adrp) == (set (reg r1)
13493 (high (symbol_ref ("SYM"))))
13494 curr (add) == (set (reg r0)
13496 (symbol_ref ("SYM"))))
13497 Note that r0 need not necessarily be the same as r1, especially
13498 during pre-regalloc scheduling. */
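      /* In assembly terms:
		adrp	x1, SYM
		add	x0, x1, :lo12:SYM  */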
13500 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13501 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13503 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13504 && REG_P (XEXP (SET_SRC (curr_set), 0))
13505 && REGNO (XEXP (SET_SRC (curr_set), 0))
13506 == REGNO (SET_DEST (prev_set))
13507 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13508 XEXP (SET_SRC (curr_set), 1)))
13514 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOVK_MOVK))
13517 /* We're trying to match:
13518 prev (movk) == (set (zero_extract (reg r0)
13521 (const_int imm16_1))
13522 curr (movk) == (set (zero_extract (reg r0)
13525 (const_int imm16_2)) */
13527 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13528 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13529 && REG_P (XEXP (SET_DEST (prev_set), 0))
13530 && REG_P (XEXP (SET_DEST (curr_set), 0))
13531 && REGNO (XEXP (SET_DEST (prev_set), 0))
13532 == REGNO (XEXP (SET_DEST (curr_set), 0))
13533 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13534 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13535 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13536 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13537 && CONST_INT_P (SET_SRC (prev_set))
13538 && CONST_INT_P (SET_SRC (curr_set)))
13543 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_LDR))
13545 /* We're trying to match:
13546 prev (adrp) == (set (reg r0)
13547 (high (symbol_ref ("SYM"))))
13548 curr (ldr) == (set (reg r1)
13549 (mem (lo_sum (reg r0)
13550 (symbol_ref ("SYM")))))
13552 curr (ldr) == (set (reg r1)
13555 (symbol_ref ("SYM")))))) */
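      /* In assembly terms:
		adrp	x0, SYM
		ldr	x1, [x0, #:lo12:SYM]  */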
13556 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13557 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13559 rtx curr_src = SET_SRC (curr_set);
13561 if (GET_CODE (curr_src) == ZERO_EXTEND)
13562 curr_src = XEXP (curr_src, 0);
13564 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13565 && REG_P (XEXP (XEXP (curr_src, 0), 0))
13566 && REGNO (XEXP (XEXP (curr_src, 0), 0))
13567 == REGNO (SET_DEST (prev_set))
13568 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13569 XEXP (SET_SRC (prev_set), 0)))
13574 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_AES_AESMC)
13575 && aarch_crypto_can_dual_issue (prev, curr))
13578 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_CMP_BRANCH)
13579 && any_condjump_p (curr))
13581 enum attr_type prev_type = get_attr_type (prev);
13583 /* FIXME: this misses some instructions which are considered simple
13584 arithmetic for ThunderX.  Simple shifts are missed here.  */
13585 if (prev_type == TYPE_ALUS_SREG
13586 || prev_type == TYPE_ALUS_IMM
13587 || prev_type == TYPE_LOGICS_REG
13588 || prev_type == TYPE_LOGICS_IMM)
13595 /* Return true iff the instruction fusion described by OP is enabled. */
13598 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
13600 return (aarch64_tune_params.fusible_ops & op) != 0;
13603 /* If MEM is in the form of [base+offset], extract the two parts of the
13604 address and store them in BASE and OFFSET; otherwise return false
13605 after clearing BASE and OFFSET.  */
13608 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13612 gcc_assert (MEM_P (mem));
13614 addr = XEXP (mem, 0);
13619 *offset = const0_rtx;
13623 if (GET_CODE (addr) == PLUS
13624 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
13626 *base = XEXP (addr, 0);
13627 *offset = XEXP (addr, 1);
13632 *offset = NULL_RTX;
13637 /* Types for scheduling fusion. */
13638 enum sched_fusion_type
13640 SCHED_FUSION_NONE = 0,
13641 SCHED_FUSION_LD_SIGN_EXTEND,
13642 SCHED_FUSION_LD_ZERO_EXTEND,
13648 /* If INSN is a load or store with an address in the form of [base+offset],
13649 extract the two parts and store them in BASE and OFFSET. Return the
13650 scheduling fusion type of this INSN. */
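/* Illustrative examples (assumed, matching the checks below):
     ldrsw x0, [x1, 8]    -> SCHED_FUSION_LD_SIGN_EXTEND
     ldr   w0, [x1, 8]    -> SCHED_FUSION_LD
     str   wzr, [x1, 8]   -> SCHED_FUSION_ST
   anything else yields SCHED_FUSION_NONE.  */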
13652 static enum sched_fusion_type
13653 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
13656 enum sched_fusion_type fusion = SCHED_FUSION_LD;
13658 gcc_assert (INSN_P (insn));
13659 x = PATTERN (insn);
13660 if (GET_CODE (x) != SET)
13661 return SCHED_FUSION_NONE;
13664 dest = SET_DEST (x);
13666 machine_mode dest_mode = GET_MODE (dest);
13668 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
13669 return SCHED_FUSION_NONE;
13671 if (GET_CODE (src) == SIGN_EXTEND)
13673 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
13674 src = XEXP (src, 0);
13675 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13676 return SCHED_FUSION_NONE;
13678 else if (GET_CODE (src) == ZERO_EXTEND)
13680 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
13681 src = XEXP (src, 0);
13682 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13683 return SCHED_FUSION_NONE;
13686 if (GET_CODE (src) == MEM && REG_P (dest))
13687 extract_base_offset_in_addr (src, base, offset);
13688 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
13690 fusion = SCHED_FUSION_ST;
13691 extract_base_offset_in_addr (dest, base, offset);
13694 return SCHED_FUSION_NONE;
13696 if (*base == NULL_RTX || *offset == NULL_RTX)
13697 fusion = SCHED_FUSION_NONE;
13702 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
13704 Currently we only support fusing ldr and str instructions, so FUSION_PRI
13705 and PRI are only calculated for these instructions. For other instructions,
13706 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
13707 types of instruction fusion can be added by returning different priorities.
13709 It's important that irrelevant instructions get the largest FUSION_PRI. */
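/* Rough intuition (an assumption based on the code below): loads or stores
   of the same fusion type from the same base register share one FUSION_PRI,
   and within that group the insn with the smaller offset receives the larger
   PRI, so the scheduler tends to place candidate pairs next to each other
   with the lower address first.  */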
13712 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
13713 int *fusion_pri, int *pri)
13717 enum sched_fusion_type fusion;
13719 gcc_assert (INSN_P (insn));
13722 fusion = fusion_load_store (insn, &base, &offset);
13723 if (fusion == SCHED_FUSION_NONE)
13730 /* Set FUSION_PRI according to fusion type and base register. */
13731 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
13733 /* Calculate PRI. */
13736 /* INSN with smaller offset goes first. */
13737 off_val = (int)(INTVAL (offset));
13739 tmp -= (off_val & 0xfffff);
13741 tmp += ((- off_val) & 0xfffff);
13747 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
13748 Adjust priority of sha1h instructions so they are scheduled before
13749 other SHA1 instructions. */
13752 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
13754 rtx x = PATTERN (insn);
13756 if (GET_CODE (x) == SET)
13760 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
13761 return priority + 10;
13767 /* Given OPERANDS of consecutive load/store, check if we can merge
13768 them into ldp/stp. LOAD is true if they are load instructions.
13769 MODE is the mode of memory operands. */
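/* An assumed example of a sequence that passes these checks (SImode):
     ldr  w0, [x2, #8]
     ldr  w1, [x2, #12]
   same base register, consecutive offsets, and distinct destination
   registers of the same class, so the pair can be emitted as
     ldp  w0, w1, [x2, #8]  */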
13772 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
13773 enum machine_mode mode)
13775 HOST_WIDE_INT offval_1, offval_2, msize;
13776 enum reg_class rclass_1, rclass_2;
13777 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
13781 mem_1 = operands[1];
13782 mem_2 = operands[3];
13783 reg_1 = operands[0];
13784 reg_2 = operands[2];
13785 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
13786 if (REGNO (reg_1) == REGNO (reg_2))
13791 mem_1 = operands[0];
13792 mem_2 = operands[2];
13793 reg_1 = operands[1];
13794 reg_2 = operands[3];
13797 /* The mems cannot be volatile. */
13798 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
13801 /* If we have SImode and slow unaligned ldp,
13802 check that the alignment is at least 8 bytes. */
13804 && (aarch64_tune_params.extra_tuning_flags
13805 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
13807 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
13810 /* Check if the addresses are in the form of [base+offset]. */
13811 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13812 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13814 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13815 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13818 /* Check if the bases are the same. */
13819 if (!rtx_equal_p (base_1, base_2))
13822 offval_1 = INTVAL (offset_1);
13823 offval_2 = INTVAL (offset_2);
13824 msize = GET_MODE_SIZE (mode);
13825 /* Check if the offsets are consecutive. */
13826 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
13829 /* Check if the addresses are clobbered by the load. */
13832 if (reg_mentioned_p (reg_1, mem_1))
13835 /* In increasing order, the last load can clobber the address. */
13836 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
13840 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13841 rclass_1 = FP_REGS;
13843 rclass_1 = GENERAL_REGS;
13845 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13846 rclass_2 = FP_REGS;
13848 rclass_2 = GENERAL_REGS;
13850 /* Check if the registers are of the same class. */
13851 if (rclass_1 != rclass_2)
13857 /* Given OPERANDS of consecutive load/store, check if we can merge
13858 them into ldp/stp by adjusting the offset. LOAD is true if they
13859 are load instructions. MODE is the mode of memory operands.
13861 Given below consecutive stores:
13863 str w1, [xb, 0x100]
13864 str w1, [xb, 0x104]
13865 str w1, [xb, 0x108]
13866 str w1, [xb, 0x10c]
13868 Though the offsets are out of the range supported by stp, we can
13869 still pair them after adjusting the offset, like:
13871 add scratch, xb, 0x100
13872 stp w1, w1, [scratch]
13873 stp w1, w1, [scratch, 0x8]
13875 The peephole patterns detecting this opportunity should guarantee
13876 the scratch register is available. */
13879 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
13880 enum machine_mode mode)
13882 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
13883 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
13884 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
13885 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
13889 reg_1 = operands[0];
13890 mem_1 = operands[1];
13891 reg_2 = operands[2];
13892 mem_2 = operands[3];
13893 reg_3 = operands[4];
13894 mem_3 = operands[5];
13895 reg_4 = operands[6];
13896 mem_4 = operands[7];
13897 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
13898 && REG_P (reg_3) && REG_P (reg_4));
13899 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
13904 mem_1 = operands[0];
13905 reg_1 = operands[1];
13906 mem_2 = operands[2];
13907 reg_2 = operands[3];
13908 mem_3 = operands[4];
13909 reg_3 = operands[5];
13910 mem_4 = operands[6];
13911 reg_4 = operands[7];
13913 /* Skip if the memory operand is by itself valid for ldp/stp. */
13914 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
13917 /* The mems cannot be volatile. */
13918 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
13919 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
13922 /* Check if the addresses are in the form of [base+offset]. */
13923 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13924 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13926 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13927 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13929 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
13930 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
13932 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
13933 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
13936 /* Check if the bases are the same. */
13937 if (!rtx_equal_p (base_1, base_2)
13938 || !rtx_equal_p (base_2, base_3)
13939 || !rtx_equal_p (base_3, base_4))
13942 offval_1 = INTVAL (offset_1);
13943 offval_2 = INTVAL (offset_2);
13944 offval_3 = INTVAL (offset_3);
13945 offval_4 = INTVAL (offset_4);
13946 msize = GET_MODE_SIZE (mode);
13947 /* Check if the offsets are consecutive. */
13948 if ((offval_1 != (offval_2 + msize)
13949 || offval_1 != (offval_3 + msize * 2)
13950 || offval_1 != (offval_4 + msize * 3))
13951 && (offval_4 != (offval_3 + msize)
13952 || offval_4 != (offval_2 + msize * 2)
13953 || offval_4 != (offval_1 + msize * 3)))
13956 /* Check if the addresses are clobbered by the load. */
13959 if (reg_mentioned_p (reg_1, mem_1)
13960 || reg_mentioned_p (reg_2, mem_2)
13961 || reg_mentioned_p (reg_3, mem_3))
13964 /* In increasing order, the last load can clobber the address. */
13965 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
13969 /* If we have SImode and slow unaligned ldp,
13970 check that the alignment is at least 8 bytes. */
13972 && (aarch64_tune_params.extra_tuning_flags
13973 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
13975 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
13978 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13979 rclass_1 = FP_REGS;
13981 rclass_1 = GENERAL_REGS;
13983 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13984 rclass_2 = FP_REGS;
13986 rclass_2 = GENERAL_REGS;
13988 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
13989 rclass_3 = FP_REGS;
13991 rclass_3 = GENERAL_REGS;
13993 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
13994 rclass_4 = FP_REGS;
13996 rclass_4 = GENERAL_REGS;
13998 /* Check if the registers are of the same class. */
13999 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14005 /* Given OPERANDS of consecutive load/store, this function pairs them
14006 into ldp/stp after adjusting the offset. It depends on the fact
14007 that addresses of load/store instructions are in increasing order.
14008 MODE is the mode of memory operands. CODE is the rtl operator
14009 which should be applied to all memory operands; it is SIGN_EXTEND,
14010 ZERO_EXTEND or UNKNOWN. */
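/* Worked example (a sketch, assuming SImode so msize == 4 and that the
   scratch register is operands[8]): stp_off_limit = 4 * 0x40 = 0x100.
   For an original offset of 0x104 we get abs_off = 0x104, new_off = 0x4 and
   adj_off = 0x100, so the scratch register is set to base + 0x100 and the
   two paired instructions use offsets 0x4 and 0xc from it.  */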
14013 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14014 enum machine_mode mode, RTX_CODE code)
14016 rtx base, offset, t1, t2;
14017 rtx mem_1, mem_2, mem_3, mem_4;
14018 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14022 mem_1 = operands[1];
14023 mem_2 = operands[3];
14024 mem_3 = operands[5];
14025 mem_4 = operands[7];
14029 mem_1 = operands[0];
14030 mem_2 = operands[2];
14031 mem_3 = operands[4];
14032 mem_4 = operands[6];
14033 gcc_assert (code == UNKNOWN);
14036 extract_base_offset_in_addr (mem_1, &base, &offset);
14037 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14039 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
14040 msize = GET_MODE_SIZE (mode);
14041 stp_off_limit = msize * 0x40;
14042 off_val = INTVAL (offset);
14043 abs_off = (off_val < 0) ? -off_val : off_val;
14044 new_off = abs_off % stp_off_limit;
14045 adj_off = abs_off - new_off;
14047 /* Further adjust to make sure all offsets are OK. */
14048 if ((new_off + msize * 2) >= stp_off_limit)
14050 adj_off += stp_off_limit;
14051 new_off -= stp_off_limit;
14054 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14055 if (adj_off >= 0x1000)
14060 adj_off = -adj_off;
14061 new_off = -new_off;
14064 /* Create new memory references. */
14065 mem_1 = change_address (mem_1, VOIDmode,
14066 plus_constant (DImode, operands[8], new_off));
14068 /* Check if the adjusted address is OK for ldp/stp. */
14069 if (!aarch64_mem_pair_operand (mem_1, mode))
14072 msize = GET_MODE_SIZE (mode);
14073 mem_2 = change_address (mem_2, VOIDmode,
14074 plus_constant (DImode,
14077 mem_3 = change_address (mem_3, VOIDmode,
14078 plus_constant (DImode,
14080 new_off + msize * 2));
14081 mem_4 = change_address (mem_4, VOIDmode,
14082 plus_constant (DImode,
14084 new_off + msize * 3));
14086 if (code == ZERO_EXTEND)
14088 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14089 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14090 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14091 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14093 else if (code == SIGN_EXTEND)
14095 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14096 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14097 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14098 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14103 operands[1] = mem_1;
14104 operands[3] = mem_2;
14105 operands[5] = mem_3;
14106 operands[7] = mem_4;
14110 operands[0] = mem_1;
14111 operands[2] = mem_2;
14112 operands[4] = mem_3;
14113 operands[6] = mem_4;
14116 /* Emit adjusting instruction. */
14117 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
14118 /* Emit ldp/stp instructions. */
14119 t1 = gen_rtx_SET (operands[0], operands[1]);
14120 t2 = gen_rtx_SET (operands[2], operands[3]);
14121 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14122 t1 = gen_rtx_SET (operands[4], operands[5]);
14123 t2 = gen_rtx_SET (operands[6], operands[7]);
14124 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14128 /* Return true if a pseudo register should be created and used to hold
14129 the GOT address for PIC code. */
14132 aarch64_use_pseudo_pic_reg (void)
14134 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14137 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
14140 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14142 switch (XINT (x, 1))
14144 case UNSPEC_GOTSMALLPIC:
14145 case UNSPEC_GOTSMALLPIC28K:
14146 case UNSPEC_GOTTINYPIC:
14152 return default_unspec_may_trap_p (x, flags);
14156 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
14157 return the log2 of that value. Otherwise return -1. */
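/* For instance (illustrative): 1.0 -> 0, 4.0 -> 2; 0.5, 3.0 and -2.0 -> -1.  */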
14160 aarch64_fpconst_pow_of_2 (rtx x)
14162 const REAL_VALUE_TYPE *r;
14164 if (!CONST_DOUBLE_P (x))
14167 r = CONST_DOUBLE_REAL_VALUE (x);
14169 if (REAL_VALUE_NEGATIVE (*r)
14170 || REAL_VALUE_ISNAN (*r)
14171 || REAL_VALUE_ISINF (*r)
14172 || !real_isinteger (r, DFmode))
14175 return exact_log2 (real_to_integer (r));
14178 /* If X is a vector of equal CONST_DOUBLE values and that value is
14179 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
14182 aarch64_vec_fpconst_pow_of_2 (rtx x)
14184 if (GET_CODE (x) != CONST_VECTOR)
14187 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14190 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14194 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
14195 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
14201 /* Implement TARGET_PROMOTED_TYPE to promote __fp16 to float. */
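/* For example (an assumed illustration): with this hook in effect, an
   expression such as
     __fp16 a, b;  float f = a + b;
   is evaluated in float rather than in __fp16.  */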
14203 aarch64_promoted_type (const_tree t)
14205 if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
14206 return float_type_node;
14210 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
14213 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
14214 optimization_type opt_type)
14219 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
14226 /* Override the default target speculation_safe_value. */
14228 aarch64_speculation_safe_value (machine_mode mode,
14229 rtx result, rtx val, rtx failval)
14231 /* Maybe we should warn if falling back to hard barriers. They are
14232 likely to be noticeably more expensive than the alternative below. */
14233 if (!aarch64_track_speculation)
14234 return default_speculation_safe_value (mode, result, val, failval);
14237 val = copy_to_mode_reg (mode, val);
14239 if (!aarch64_reg_or_zero (failval, mode))
14240 failval = copy_to_mode_reg (mode, failval);
14245 emit_insn (gen_despeculate_copyqi (result, val, failval));
14248 emit_insn (gen_despeculate_copyhi (result, val, failval));
14251 emit_insn (gen_despeculate_copysi (result, val, failval));
14254 emit_insn (gen_despeculate_copydi (result, val, failval));
14257 emit_insn (gen_despeculate_copyti (result, val, failval));
14260 gcc_unreachable ();
14265 #undef TARGET_ADDRESS_COST
14266 #define TARGET_ADDRESS_COST aarch64_address_cost
14268 /* This hook determines whether unnamed bitfields affect the alignment
14269 of the containing structure. The hook returns true if the structure
14270 should inherit the alignment requirements of an unnamed bitfield's type. */
14272 #undef TARGET_ALIGN_ANON_BITFIELD
14273 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
14275 #undef TARGET_ASM_ALIGNED_DI_OP
14276 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14278 #undef TARGET_ASM_ALIGNED_HI_OP
14279 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14281 #undef TARGET_ASM_ALIGNED_SI_OP
14282 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14284 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14285 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
14286 hook_bool_const_tree_hwi_hwi_const_tree_true
14288 #undef TARGET_ASM_FILE_START
14289 #define TARGET_ASM_FILE_START aarch64_start_file
14291 #undef TARGET_ASM_OUTPUT_MI_THUNK
14292 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14294 #undef TARGET_ASM_SELECT_RTX_SECTION
14295 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14297 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14298 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14300 #undef TARGET_BUILD_BUILTIN_VA_LIST
14301 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14303 #undef TARGET_CALLEE_COPIES
14304 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14306 #undef TARGET_CAN_ELIMINATE
14307 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14309 #undef TARGET_CAN_INLINE_P
14310 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14312 #undef TARGET_CANNOT_FORCE_CONST_MEM
14313 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14315 #undef TARGET_CASE_VALUES_THRESHOLD
14316 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14318 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14319 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14321 /* Only the least significant bit is used for initialization guard variables. */
14323 #undef TARGET_CXX_GUARD_MASK_BIT
14324 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14326 #undef TARGET_C_MODE_FOR_SUFFIX
14327 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14329 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14330 #undef TARGET_DEFAULT_TARGET_FLAGS
14331 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14334 #undef TARGET_CLASS_MAX_NREGS
14335 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14337 #undef TARGET_BUILTIN_DECL
14338 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14340 #undef TARGET_BUILTIN_RECIPROCAL
14341 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14343 #undef TARGET_EXPAND_BUILTIN
14344 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14346 #undef TARGET_EXPAND_BUILTIN_VA_START
14347 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14349 #undef TARGET_FOLD_BUILTIN
14350 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14352 #undef TARGET_FUNCTION_ARG
14353 #define TARGET_FUNCTION_ARG aarch64_function_arg
14355 #undef TARGET_FUNCTION_ARG_ADVANCE
14356 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14358 #undef TARGET_FUNCTION_ARG_BOUNDARY
14359 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14361 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14362 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14364 #undef TARGET_FUNCTION_VALUE
14365 #define TARGET_FUNCTION_VALUE aarch64_function_value
14367 #undef TARGET_FUNCTION_VALUE_REGNO_P
14368 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14370 #undef TARGET_FRAME_POINTER_REQUIRED
14371 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14373 #undef TARGET_GIMPLE_FOLD_BUILTIN
14374 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14376 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14377 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14379 #undef TARGET_INIT_BUILTINS
14380 #define TARGET_INIT_BUILTINS aarch64_init_builtins
14382 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
14383 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
14384 aarch64_ira_change_pseudo_allocno_class
14386 #undef TARGET_LEGITIMATE_ADDRESS_P
14387 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14389 #undef TARGET_LEGITIMATE_CONSTANT_P
14390 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14392 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
14393 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
14394 aarch64_legitimize_address_displacement
14396 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14397 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14399 #undef TARGET_LRA_P
14400 #define TARGET_LRA_P hook_bool_void_true
14402 #undef TARGET_MANGLE_TYPE
14403 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14405 #undef TARGET_MEMORY_MOVE_COST
14406 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14408 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14409 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14411 #undef TARGET_MUST_PASS_IN_STACK
14412 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14414 /* This target hook should return true if accesses to volatile bitfields
14415 should use the narrowest mode possible. It should return false if these
14416 accesses should use the bitfield container type. */
14417 #undef TARGET_NARROW_VOLATILE_BITFIELD
14418 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
14420 #undef TARGET_OPTION_OVERRIDE
14421 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14423 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14424 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14425 aarch64_override_options_after_change
14427 #undef TARGET_OPTION_SAVE
14428 #define TARGET_OPTION_SAVE aarch64_option_save
14430 #undef TARGET_OPTION_RESTORE
14431 #define TARGET_OPTION_RESTORE aarch64_option_restore
14433 #undef TARGET_OPTION_PRINT
14434 #define TARGET_OPTION_PRINT aarch64_option_print
14436 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14437 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14439 #undef TARGET_SET_CURRENT_FUNCTION
14440 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14442 #undef TARGET_PASS_BY_REFERENCE
14443 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14445 #undef TARGET_PREFERRED_RELOAD_CLASS
14446 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
14448 #undef TARGET_SCHED_REASSOCIATION_WIDTH
14449 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
14451 #undef TARGET_PROMOTED_TYPE
14452 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
14454 #undef TARGET_SECONDARY_RELOAD
14455 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
14457 #undef TARGET_SHIFT_TRUNCATION_MASK
14458 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
14460 #undef TARGET_SETUP_INCOMING_VARARGS
14461 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
14463 #undef TARGET_STRUCT_VALUE_RTX
14464 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
14466 #undef TARGET_REGISTER_MOVE_COST
14467 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
14469 #undef TARGET_RETURN_IN_MEMORY
14470 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
14472 #undef TARGET_RETURN_IN_MSB
14473 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
14475 #undef TARGET_RTX_COSTS
14476 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
14478 #undef TARGET_SCHED_ISSUE_RATE
14479 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
14481 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
14482 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
14483 aarch64_sched_first_cycle_multipass_dfa_lookahead
14485 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
14486 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
14487 aarch64_first_cycle_multipass_dfa_lookahead_guard
14489 #undef TARGET_TRAMPOLINE_INIT
14490 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
14492 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
14493 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
14495 #undef TARGET_VECTOR_MODE_SUPPORTED_P
14496 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
14498 #undef TARGET_ARRAY_MODE_SUPPORTED_P
14499 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
14501 #undef TARGET_VECTORIZE_ADD_STMT_COST
14502 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
14504 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
14505 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
14506 aarch64_builtin_vectorization_cost
14508 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
14509 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
14511 #undef TARGET_VECTORIZE_BUILTINS
14512 #define TARGET_VECTORIZE_BUILTINS
14514 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
14515 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
14516 aarch64_builtin_vectorized_function
14518 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
14519 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
14520 aarch64_autovectorize_vector_sizes
14522 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
14523 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
14524 aarch64_atomic_assign_expand_fenv
14526 /* Section anchor support. */
14528 #undef TARGET_MIN_ANCHOR_OFFSET
14529 #define TARGET_MIN_ANCHOR_OFFSET -256
14531 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
14532 byte offset; we can do much more for larger data types, but have no way
14533 to determine the size of the access. We assume accesses are aligned. */
14534 #undef TARGET_MAX_ANCHOR_OFFSET
14535 #define TARGET_MAX_ANCHOR_OFFSET 4095
14537 #undef TARGET_VECTOR_ALIGNMENT
14538 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
14540 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
14541 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
14542 aarch64_simd_vector_alignment_reachable
14544 /* vec_perm support. */
14546 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
14547 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
14548 aarch64_vectorize_vec_perm_const_ok
14550 #undef TARGET_INIT_LIBFUNCS
14551 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
14553 #undef TARGET_FIXED_CONDITION_CODE_REGS
14554 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
14556 #undef TARGET_FLAGS_REGNUM
14557 #define TARGET_FLAGS_REGNUM CC_REGNUM
14559 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
14560 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
14562 #undef TARGET_ASAN_SHADOW_OFFSET
14563 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
14565 #undef TARGET_LEGITIMIZE_ADDRESS
14566 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
14568 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
14569 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
14570 aarch64_use_by_pieces_infrastructure_p
14572 #undef TARGET_CAN_USE_DOLOOP_P
14573 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
14575 #undef TARGET_SCHED_ADJUST_PRIORITY
14576 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
14578 #undef TARGET_SCHED_MACRO_FUSION_P
14579 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
14581 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
14582 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
14584 #undef TARGET_SCHED_FUSION_PRIORITY
14585 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
14587 #undef TARGET_UNSPEC_MAY_TRAP_P
14588 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
14590 #undef TARGET_USE_PSEUDO_PIC_REG
14591 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
14593 #undef TARGET_PRINT_OPERAND
14594 #define TARGET_PRINT_OPERAND aarch64_print_operand
14596 #undef TARGET_PRINT_OPERAND_ADDRESS
14597 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
14599 #undef TARGET_OPTAB_SUPPORTED_P
14600 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
14602 #undef TARGET_SPECULATION_SAFE_VALUE
14603 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
14605 #undef TARGET_OMIT_STRUCT_RETURN_REG
14606 #define TARGET_OMIT_STRUCT_RETURN_REG true
14608 struct gcc_target targetm = TARGET_INITIALIZER;
14610 #include "gt-aarch64.h"