From: Richard Earnshaw
Date: Tue, 7 Jun 2022 11:09:47 +0000 (+0100)
Subject: arm: Improve code generation for BFI and BFC [PR105090]
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=2005b9b888eeac078f2524b1521885f4b5453894;p=platform%2Fupstream%2Fgcc.git

arm: Improve code generation for BFI and BFC [PR105090]

This patch, in response to PR105090, makes some general improvements
to the code generation when BFI and BFC instructions are available.

Firstly, we handle more cases where the RTL does not generate an INSV
operation because there is no tie between the input and the output,
but we nevertheless need to emit BFI later on; we handle this by
requiring the register allocator to tie the operands.

Secondly, we handle some cases where we were previously emitting BFC,
but an AND with an immediate would be better; we do this by converting
all BFC patterns into AND using a split pattern.

And finally, we handle some cases where we would previously emit
multiple BIC operations to clear a value, but could instead use a
single BFC instruction.

BFC and BFI express the mask as a pair of values: one for the number
of bits to clear and another for the location of the least significant
bit.  We handle these with a single new output modifier letter that
causes both values to be printed; we use an 'inverted' value so that
it can be used directly with the constant used in an AND rtl
construct.  We've run out of 'new' letters, so to do this we re-use
one of the long-obsolete Maverick output modifiers.
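An illustrative sketch (added for review, not part of the original
log): the source-level idiom that maps onto the BFI RTL shape handled
below.  The function name and field layout are invented for the
example.

  /* Insert y into bits 8..15 of x.  The RTL is
     (ior (and (reg x) (const_int ~0xff00))
	  (and (ashift (reg y) (const_int 8)) (const_int 0xff00))),
     the shape arm_bfi_p recognizes; on Thumb-2, with this patch, one
     would expect a single "bfi" once the output is tied to x.  */
  unsigned int
  insert_byte (unsigned int x, unsigned int y)
  {
    return (x & ~0x0000ff00u) | ((y << 8) & 0x0000ff00u);
  }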
gcc/ChangeLog:

	PR target/105090
	* config/arm/arm.cc (arm_bfi_1_p): New function.
	(arm_bfi_p): New function.
	(arm_rtx_costs_internal): Add costs for BFI idioms.
	(arm_print_operand [case 'V']): Format output for BFI/BFC masks.
	* config/arm/constraints.md (Dj): New constraint.
	* config/arm/arm.md (arm_andsi3_insn): Add alternative to use BFC.
	(insv_zero): Convert to an insn with a split.
	(*bfi, *bfi_alt1, *bfi_alt2, *bfi_alt3): New patterns.
---

diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 7ecf7b7..2a76c7b 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -10201,6 +10201,61 @@ arm_mem_costs (rtx x, const struct cpu_cost_table *extra_cost,
   return true;
 }
 
+/* Helper for arm_bfi_p.  */
+static bool
+arm_bfi_1_p (rtx op0, rtx op1, rtx *sub0, rtx *sub1)
+{
+  unsigned HOST_WIDE_INT const1;
+  unsigned HOST_WIDE_INT const2 = 0;
+
+  if (!CONST_INT_P (XEXP (op0, 1)))
+    return false;
+
+  const1 = XUINT (XEXP (op0, 1), 0);
+  if (!CONST_INT_P (XEXP (op1, 1))
+      || ~XUINT (XEXP (op1, 1), 0) != const1)
+    return false;
+
+  if (GET_CODE (XEXP (op0, 0)) == ASHIFT
+      && CONST_INT_P (XEXP (XEXP (op0, 0), 1)))
+    {
+      const2 = XUINT (XEXP (XEXP (op0, 0), 1), 0);
+      *sub0 = XEXP (XEXP (op0, 0), 0);
+    }
+  else
+    *sub0 = XEXP (op0, 0);
+
+  if (const2 >= GET_MODE_BITSIZE (GET_MODE (op0)))
+    return false;
+
+  *sub1 = XEXP (op1, 0);
+  return exact_log2 (const1 + (HOST_WIDE_INT_1U << const2)) >= 0;
+}
+
+/* Recognize a BFI idiom.  Helper for arm_rtx_costs_internal.  The
+   format looks something like:
+
+   (IOR (AND (reg1) (~const1))
+	(AND (ASHIFT (reg2) (const2))
+	     (const1)))
+
+   where const1 is a consecutive sequence of 1-bits with the
+   least-significant non-zero bit starting at bit position const2.  If
+   const2 is zero, then the shift will not appear at all, due to
+   canonicalization.  The two arms of the IOR expression may be
+   flipped.  */
+static bool
+arm_bfi_p (rtx x, rtx *sub0, rtx *sub1)
+{
+  if (GET_CODE (x) != IOR)
+    return false;
+  if (GET_CODE (XEXP (x, 0)) != AND
+      || GET_CODE (XEXP (x, 1)) != AND)
+    return false;
+  return (arm_bfi_1_p (XEXP (x, 0), XEXP (x, 1), sub0, sub1)
+	  || arm_bfi_1_p (XEXP (x, 1), XEXP (x, 0), sub1, sub0));
+}
+
 /* RTX costs.  Make an estimate of the cost of executing the operation
    X, which is contained within an operation with code OUTER_CODE.
    SPEED_P indicates whether the cost desired is the performance cost,
@@ -10959,14 +11014,28 @@ arm_rtx_costs_internal (rtx x, enum rtx_code code, enum rtx_code outer_code,
 	*cost = LIBCALL_COST (2);
       return false;
     case IOR:
-      if (mode == SImode && arm_arch6 && aarch_rev16_p (x))
-	{
-	  if (speed_p)
-	    *cost += extra_cost->alu.rev;
+      {
+	rtx sub0, sub1;
+	if (mode == SImode && arm_arch6 && aarch_rev16_p (x))
+	  {
+	    if (speed_p)
+	      *cost += extra_cost->alu.rev;
 
-	  return true;
-	}
-      /* Fall through.  */
+	    return true;
+	  }
+	else if (mode == SImode && arm_arch_thumb2
+		 && arm_bfi_p (x, &sub0, &sub1))
+	  {
+	    *cost += rtx_cost (sub0, mode, ZERO_EXTRACT, 1, speed_p);
+	    *cost += rtx_cost (sub1, mode, ZERO_EXTRACT, 0, speed_p);
+	    if (speed_p)
+	      *cost += extra_cost->alu.bfi;
+
+	    return true;
+	  }
+      }
+
+      /* Fall through.  */
     case AND: case XOR:
       if (mode == SImode)
 	{
@@ -23780,8 +23849,8 @@ arm_print_condition (FILE *stream)
 /* Globally reserved letters: acln
    Puncutation letters currently used: @_|?().!#
    Lower case letters currently used: bcdefhimpqtvwxyz
-   Upper case letters currently used: ABCDEFGHIJKLMNOPQRSTU
-   Letters previously used, but now deprecated/obsolete: sVWXYZ.
+   Upper case letters currently used: ABCDEFGHIJKLMNOPQRSTUV
+   Letters previously used, but now deprecated/obsolete: sWXYZ.
 
    Note that the global reservation for 'c' is only for
    CONSTANT_ADDRESS_P.
@@ -23797,7 +23866,10 @@ arm_print_condition (FILE *stream)
    If CODE is 'N' then X is a floating point operand that must be negated
    before output.
    If CODE is 'B' then output a bitwise inverted value of X (a const int).
-   If X is a REG and CODE is `M', output a ldm/stm style multi-reg.  */
+   If X is a REG and CODE is `M', output a ldm/stm style multi-reg.
+   If CODE is 'V', then the operand must be a CONST_INT representing
+   the bits to preserve in the modified register (Rd) of a BFI or BFC
+   instruction: print out both the width and lsb (shift) fields.  */
 static void
 arm_print_operand (FILE *stream, rtx x, int code)
 {
@@ -24106,8 +24178,27 @@ arm_print_operand (FILE *stream, rtx x, int code)
 	     stream);
       return;
 
-    case 's':
     case 'V':
+      {
+	/* Output the LSB (shift) and width for a bitmask instruction
+	   based on a literal mask.  The LSB is printed first,
+	   followed by the width.
+
+	   Eg. For 0b1...1110001, the result is #1, #3.  */
+	if (!CONST_INT_P (x))
+	  {
+	    output_operand_lossage ("invalid operand for code '%c'", code);
+	    return;
+	  }
+
+	unsigned HOST_WIDE_INT val = ~XUINT (x, 0);
+	int lsb = exact_log2 (val & -val);
+	asm_fprintf (stream, "#%d, #%d", lsb,
+		     (exact_log2 (val + (val & -val)) - lsb));
+      }
+      return;
+
+    case 's':
     case 'W':
     case 'X':
     case 'Y':
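A standalone sketch of the arithmetic behind the new 'V' output
modifier (a hypothetical harness, not code from the patch;
print_bfx_operands is an invented name and uint64_t stands in for
unsigned HOST_WIDE_INT):

  #include <stdio.h>
  #include <stdint.h>

  /* From the AND-style constant (the bits to keep), derive the
     "#<lsb>, #<width>" pair that BFC/BFI expect, as the 'V' case in
     arm_print_operand does.  Assumes at least one bit is cleared.  */
  static void
  print_bfx_operands (uint32_t keep_mask)
  {
    uint64_t val = ~keep_mask & 0xffffffffu;  /* contiguous run to clear */
    uint64_t lowbit = val & -val;	      /* least significant 1-bit */
    int lsb = __builtin_ctzll (val);	      /* == exact_log2 (val & -val) */
    int width = __builtin_ctzll (val + lowbit) - lsb;
    printf ("#%d, #%d\n", lsb, width);
  }

  int
  main (void)
  {
    print_bfx_operands (0xfffffff1u);  /* 0b1...1110001 prints "#1, #3" */
    return 0;
  }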
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 60468f6..69bf343 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -3002,30 +3002,36 @@
 ; ??? Check split length for Thumb-2
 (define_insn_and_split "*arm_andsi3_insn"
-  [(set (match_operand:SI 0 "s_register_operand" "=r,l,r,r,r")
-	(and:SI (match_operand:SI 1 "s_register_operand" "%r,0,r,r,r")
-		(match_operand:SI 2 "reg_or_int_operand" "I,l,K,r,?n")))]
+  [(set (match_operand:SI 0 "s_register_operand" "=r,l,r,r,r,r")
+	(and:SI (match_operand:SI 1 "s_register_operand" "%r,0,r,r,0,r")
+		(match_operand:SI 2 "reg_or_int_operand" "I,l,K,r,Dj,?n")))]
   "TARGET_32BIT"
   "@
   and%?\\t%0, %1, %2
   and%?\\t%0, %1, %2
   bic%?\\t%0, %1, #%B2
   and%?\\t%0, %1, %2
+  bfc%?\\t%0, %V2
   #"
   "TARGET_32BIT
    && CONST_INT_P (operands[2])
    && !(const_ok_for_arm (INTVAL (operands[2]))
-	|| const_ok_for_arm (~INTVAL (operands[2])))"
+	|| const_ok_for_arm (~INTVAL (operands[2]))
+	|| (arm_arch_thumb2
+	    && satisfies_constraint_Dj (operands[2])
+	    && (rtx_equal_p (operands[0], operands[1])
+		|| !reload_completed)))"
   [(clobber (const_int 0))]
   "
-  arm_split_constant  (AND, SImode, curr_insn,
+  arm_split_constant (AND, SImode, curr_insn,
 		      INTVAL (operands[2]), operands[0], operands[1], 0);
   DONE;
   "
-  [(set_attr "length" "4,4,4,4,16")
+  [(set_attr "length" "4,4,4,4,4,16")
   (set_attr "predicable" "yes")
-   (set_attr "predicable_short_it" "no,yes,no,no,no")
-   (set_attr "type" "logic_imm,logic_imm,logic_reg,logic_reg,logic_imm")]
+   (set_attr "predicable_short_it" "no,yes,no,no,no,no")
+   (set_attr "arch" "*,*,*,*,v6t2,*")
+   (set_attr "type" "logic_imm,logic_imm,logic_reg,logic_reg,bfm,logic_imm")]
 )
 
 (define_insn "*andsi3_compare0"
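How the new "Dj" alternative pays off (an illustration, not from the
patch; the expected assembly is an assumption based on the alternative
above):

  /* Clear bits 3..20 of x.  Neither 0xffe00007 nor its complement is
     a valid ARM/Thumb-2 immediate, so this previously needed a
     constant load (or several BICs); with the Dj alternative and a
     tied output it can be a single "bfc r0, #3, #18".  */
  unsigned int
  clear_field (unsigned int x)
  {
    return x & 0xffe00007u;
  }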
@@ -3471,13 +3477,25 @@
 }"
 )
 
-(define_insn "insv_zero"
+(define_insn_and_split "insv_zero"
   [(set (zero_extract:SI (match_operand:SI 0 "s_register_operand" "+r")
                          (match_operand:SI 1 "const_int_M_operand" "M")
                          (match_operand:SI 2 "const_int_M_operand" "M"))
         (const_int 0))]
   "arm_arch_thumb2"
   "bfc%?\t%0, %2, %1"
+  ""
+  [(set (match_dup 0) (and:SI (match_dup 0) (match_dup 1)))]
+  {
+    /* Convert back to a normal AND operation, so that we can take advantage
+       of BIC and AND when appropriate; we'll still emit BFC if that's the
+       right thing to do.  */
+    unsigned HOST_WIDE_INT width = UINTVAL (operands[1]);
+    unsigned HOST_WIDE_INT lsb = UINTVAL (operands[2]);
+    unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1U << width) - 1;
+
+    operands[1] = gen_int_mode (~(mask << lsb), SImode);
+  }
   [(set_attr "length" "4")
   (set_attr "predicable" "yes")
   (set_attr "type" "bfm")]
@@ -3495,6 +3513,76 @@
   (set_attr "type" "bfm")]
 )
 
+(define_insn "*bfi"
+  [(set (match_operand:SI 0 "s_register_operand" "=r")
+	(ior:SI (and:SI (match_operand:SI 1 "s_register_operand" "0")
+			(match_operand 2 "const_int_operand" "Dj"))
+		(and:SI (ashift:SI
+			 (match_operand:SI 3 "s_register_operand" "r")
+			 (match_operand 4 "const_int_operand" "i"))
+			(match_operand 5 "const_int_operand" "i"))))]
+  "arm_arch_thumb2
+   && UINTVAL (operands[4]) < 32
+   && UINTVAL (operands[2]) == ~UINTVAL (operands[5])
+   && (exact_log2 (UINTVAL (operands[5])
+		   + (HOST_WIDE_INT_1U << UINTVAL (operands[4])))
+       >= 0)"
+  "bfi%?\t%0, %3, %V2"
+  [(set_attr "length" "4")
+   (set_attr "predicable" "yes")
+   (set_attr "type" "bfm")]
+)
+
+(define_insn "*bfi_alt1"
+  [(set (match_operand:SI 0 "s_register_operand" "=r")
+	(ior:SI (and:SI (ashift:SI
+			 (match_operand:SI 3 "s_register_operand" "r")
+			 (match_operand 4 "const_int_operand" "i"))
+			(match_operand 5 "const_int_operand" "i"))
+		(and:SI (match_operand:SI 1 "s_register_operand" "0")
+			(match_operand 2 "const_int_operand" "Dj"))))]
+  "arm_arch_thumb2
+   && UINTVAL (operands[4]) < 32
+   && UINTVAL (operands[2]) == ~UINTVAL (operands[5])
+   && (exact_log2 (UINTVAL (operands[5])
+		   + (HOST_WIDE_INT_1U << UINTVAL (operands[4])))
+       >= 0)"
+  "bfi%?\t%0, %3, %V2"
+  [(set_attr "length" "4")
+   (set_attr "predicable" "yes")
+   (set_attr "type" "bfm")]
+)
+
+(define_insn "*bfi_alt2"
+  [(set (match_operand:SI 0 "s_register_operand" "=r")
+	(ior:SI (and:SI (match_operand:SI 1 "s_register_operand" "0")
+			(match_operand 2 "const_int_operand" "i"))
+		(and:SI (match_operand:SI 3 "s_register_operand" "r")
+			(match_operand 4 "const_int_operand" "i"))))]
+  "arm_arch_thumb2
+   && UINTVAL (operands[2]) == ~UINTVAL (operands[4])
+   && exact_log2 (UINTVAL (operands[4]) + 1) >= 0"
+  "bfi%?\t%0, %3, %V2"
+  [(set_attr "length" "4")
+   (set_attr "predicable" "yes")
+   (set_attr "type" "bfm")]
+)
+
+(define_insn "*bfi_alt3"
+  [(set (match_operand:SI 0 "s_register_operand" "=r")
+	(ior:SI (and:SI (match_operand:SI 3 "s_register_operand" "r")
+			(match_operand 4 "const_int_operand" "i"))
+		(and:SI (match_operand:SI 1 "s_register_operand" "0")
+			(match_operand 2 "const_int_operand" "i"))))]
+  "arm_arch_thumb2
+   && UINTVAL (operands[2]) == ~UINTVAL (operands[4])
+   && exact_log2 (UINTVAL (operands[4]) + 1) >= 0"
+  "bfi%?\t%0, %3, %V2"
+  [(set_attr "length" "4")
+   (set_attr "predicable" "yes")
+   (set_attr "type" "bfm")]
+)
+
 (define_insn "andsi_notsi_si"
   [(set (match_operand:SI 0 "s_register_operand" "=r")
	 (and:SI (not:SI (match_operand:SI 2 "s_register_operand" "r"))
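The arithmetic in the new insv_zero split, as a self-contained helper
(hypothetical name; it mirrors the computation of operands[1] above):

  #include <stdint.h>

  /* Turn BFC's (width, lsb) operand pair back into the equivalent AND
     mask, e.g. width 8, lsb 16 gives 0xff00ffff.  The 64-bit
     intermediate mirrors HOST_WIDE_INT_1U and keeps width == 32 well
     defined.  */
  static uint32_t
  bfc_to_and_mask (unsigned int width, unsigned int lsb)
  {
    uint64_t mask = (UINT64_C (1) << width) - 1;
    return (uint32_t) ~(mask << lsb);
  }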
diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md
index 2b411b0..e5a36d2 100644
--- a/gcc/config/arm/constraints.md
+++ b/gcc/config/arm/constraints.md
@@ -32,7 +32,7 @@
 ;; The following multi-letter normal constraints have been used:
 ;; in ARM/Thumb-2 state: Da, Db, Dc, Dd, Dn, DN, Dm, Dl, DL, Do, Dv, Dy, Di,
-;;			 Ds, Dt, Dp, Dz, Tu, Te
+;;			 Dj, Ds, Dt, Dp, Dz, Tu, Te
 ;; in Thumb-1 state: Pa, Pb, Pc, Pd, Pe
 ;; in Thumb-2 state: Ha, Pj, PJ, Ps, Pt, Pu, Pv, Pw, Px, Py, Pz, Rd, Rf, Rb, Ra,
 ;;		     Rg, Ri
@@ -354,6 +354,14 @@
   (and (match_code "const_double,const_int")
	(match_test "TARGET_32BIT && arm_const_double_by_immediates (op)")))
 
+(define_constraint "Dj"
+ "@internal
+  In cores with the v6t2 ISA, a constant with exactly one consecutive
+  string of zero bits."
+ (and (match_code "const_int")
+      (match_test "arm_arch_thumb2
+		   && exact_log2 (~ival + (~ival & -~ival)) >= 0")))
+
 (define_constraint "Dm"
  "@internal
  In ARM/Thumb-2 state a const_vector which can be loaded with a Neon vmov
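For completeness, the "Dj" test in plain C (a hypothetical checker;
the constraint proper operates on the sign-extended ival of a
const_int, which uint32_t only approximates):

  #include <stdbool.h>
  #include <stdint.h>

  /* True iff ~c is a single contiguous run of 1-bits, i.e. c clears
     exactly one consecutive string of bits and so is a BFC candidate.
     Mirrors exact_log2 (~ival + (~ival & -~ival)) >= 0.  */
  static bool
  is_dj_constant (uint32_t c)
  {
    uint64_t inv = ~c & 0xffffffffu;
    if (inv == 0)
      return false;  /* all-ones keeps every bit; exact_log2 (0) < 0 */
    uint64_t sum = inv + (inv & -inv);
    return (sum & (sum - 1)) == 0;  /* power of two <=> single run */
  }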