From 4003e470a7ff8d8cfc94d28f193033fd53cbd1cf Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Wed, 28 Dec 2022 19:30:17 +0000 Subject: [PATCH] Provide zero_extend versions/variants of several patterns on x86. The middle-end doesn't have a preferred canonical form for expressing zero-extension, sometimes using an AND, sometimes pairs of SHIFTs, and sometimes using zero_extend. Pending changes to RTL simplification will/may alter some of these representations, so a few additional patterns are required to recognize these alternate representations and avoid any testsuite regressions. As an example, *popcountsi2_zext is currently represented as: [(set (match_operand:DI 0 "register_operand" "=r") (and:DI (subreg:DI (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm")) 0) (const_int 63))) (clobber (reg:CC FLAGS_REG))] this patch adds an alternate/equivalent pattern that matches: [(set (match_operand:DI 0 "register_operand" "=r") (zero_extend:DI (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))) (clobber (reg:CC FLAGS_REG))] Another example is *popcounthi2 which is currently represented as: [(set (match_operand:SI 0 "register_operand") (popcount:SI (zero_extend:SI (match_operand:HI 1 "nonimmediate_operand")))) (clobber (reg:CC FLAGS_REG))] this patch adds an alternate/equivalent pattern that matches: [(set (match_operand:SI 0 "register_operand") (zero_extend:SI (popcount:HI (match_operand:HI 1 "nonimmediate_operand")))) (clobber (reg:CC FLAGS_REG))] The contents of the machine description definitions remain the same. it's just the expected RTL is slightly different but equivalent. Providing both forms makes the backend more robust to middle-end changes [and possibly catches some missed optimizations]. 2022-12-28 Roger Sayle gcc/ChangeLog * config/i386/i386.md (*clzsi2_lzcnt_zext_2): define_insn_and_split to match ZERO_EXTEND form of *clzsi2_lzcnt_zext. (*clzsi2_lzcnt_zext_2_falsedep): Likewise, new define_insn to match ZERO_EXTEND form of *clzsi2_lzcnt_zext_falsedep. (*bmi2_bzhi_zero_extendsidi_5): Likewise, new define_insn to match ZERO_EXTEND form of *bmi2_bzhi_zero_extendsidi. (*popcountsi2_zext_2): Likewise, new define_insn_and_split to match ZERO_EXTEND form of *popcountsi2_zext. (*popcountsi2_zext_2_falsedep): Likewise, new define_insn to match ZERO_EXTEND form of *popcountsi2_zext_falsedep. (*popcounthi2_2): Likewise, new define_insn_and_split to match ZERO_EXTEND form of *popcounthi2. (define_peephole2): ZERO_EXTEND variant of HImode popcount&1 using parity flag peephole2. --- gcc/config/i386/i386.md | 151 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 0626752..ca40c4f 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -17419,6 +17419,42 @@ (set_attr "type" "bitmanip") (set_attr "mode" "SI")]) +(define_insn_and_split "*clzsi2_lzcnt_zext_2" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_LZCNT && TARGET_64BIT" + "lzcnt{l}\t{%1, %k0|%k0, %1}" + "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed + && optimize_function_for_speed_p (cfun) + && !reg_mentioned_p (operands[0], operands[1])" + [(parallel + [(set (match_dup 0) + (zero_extend:DI (clz:SI (match_dup 1)))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP) + (clobber (reg:CC FLAGS_REG))])] + "ix86_expand_clear (operands[0]);" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + +; False dependency happens when destination is only updated by tzcnt, +; lzcnt or popcnt. There is no false dependency when destination is +; also used in source. +(define_insn "*clzsi2_lzcnt_zext_2_falsedep" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (clz:SI (match_operand:SWI48 1 "nonimmediate_operand" "rm")))) + (unspec [(match_operand:DI 2 "register_operand" "0")] + UNSPEC_INSN_FALSE_DEP) + (clobber (reg:CC FLAGS_REG))] + "TARGET_LZCNT" + "lzcnt{l}\t{%1, %k0|%k0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + (define_int_iterator LT_ZCNT [(UNSPEC_TZCNT "TARGET_BMI") (UNSPEC_LZCNT "TARGET_LZCNT")]) @@ -17737,6 +17773,22 @@ (set_attr "prefix" "vex") (set_attr "mode" "DI")]) +(define_insn "*bmi2_bzhi_zero_extendsidi_5" + [(set (match_operand:DI 0 "register_operand" "=r") + (and:DI + (zero_extend:DI + (plus:SI + (ashift:SI (const_int 1) + (match_operand:QI 2 "register_operand" "r")) + (const_int -1))) + (match_operand:DI 1 "nonimmediate_operand" "rm"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && TARGET_BMI2" + "bzhi\t{%q2, %q1, %q0|%q0, %q1, %q2}" + [(set_attr "type" "bitmanip") + (set_attr "prefix" "vex") + (set_attr "mode" "DI")]) + (define_insn "bmi2_pdep_3" [(set (match_operand:SWI48 0 "register_operand" "=r") (unspec:SWI48 [(match_operand:SWI48 1 "register_operand" "r") @@ -17999,6 +18051,54 @@ (set_attr "type" "bitmanip") (set_attr "mode" "SI")]) +(define_insn_and_split "*popcountsi2_zext_2" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_POPCNT && TARGET_64BIT" +{ +#if TARGET_MACHO + return "popcnt\t{%1, %k0|%k0, %1}"; +#else + return "popcnt{l}\t{%1, %k0|%k0, %1}"; +#endif +} + "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed + && optimize_function_for_speed_p (cfun) + && !reg_mentioned_p (operands[0], operands[1])" + [(parallel + [(set (match_dup 0) + (zero_extend:DI (popcount:SI (match_dup 1)))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP) + (clobber (reg:CC FLAGS_REG))])] + "ix86_expand_clear (operands[0]);" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + +; False dependency happens when destination is only updated by tzcnt, +; lzcnt or popcnt. There is no false dependency when destination is +; also used in source. +(define_insn "*popcountsi2_zext_2_falsedep" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))) + (unspec [(match_operand:DI 2 "register_operand" "0")] + UNSPEC_INSN_FALSE_DEP) + (clobber (reg:CC FLAGS_REG))] + "TARGET_POPCNT && TARGET_64BIT" +{ +#if TARGET_MACHO + return "popcnt\t{%1, %k0|%k0, %1}"; +#else + return "popcnt{l}\t{%1, %k0|%k0, %1}"; +#endif +} + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + (define_insn_and_split "*popcounthi2_1" [(set (match_operand:SI 0 "register_operand") (popcount:SI @@ -18017,6 +18117,24 @@ DONE; }) +(define_insn_and_split "*popcounthi2_2" + [(set (match_operand:SI 0 "register_operand") + (zero_extend:SI + (popcount:HI (match_operand:HI 1 "nonimmediate_operand")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_POPCNT + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(const_int 0)] +{ + rtx tmp = gen_reg_rtx (HImode); + + emit_insn (gen_popcounthi2 (tmp, operands[1])); + emit_insn (gen_zero_extendhisi2 (operands[0], tmp)); + DONE; +}) + (define_insn "popcounthi2" [(set (match_operand:HI 0 "register_operand" "=r") (popcount:HI @@ -18336,6 +18454,39 @@ PUT_CODE (operands[5], GET_CODE (operands[5]) == EQ ? UNORDERED : ORDERED); }) +;; Eliminate HImode popcount&1 using parity flag (variant 2) +(define_peephole2 + [(match_scratch:HI 0 "Q") + (parallel [(set (match_operand:HI 1 "register_operand") + (popcount:HI + (match_operand:HI 2 "nonimmediate_operand"))) + (clobber (reg:CC FLAGS_REG))]) + (set (reg:CCZ FLAGS_REG) + (compare:CCZ (and:QI (match_operand:QI 3 "register_operand") + (const_int 1)) + (const_int 0))) + (set (pc) (if_then_else (match_operator 4 "bt_comparison_operator" + [(reg:CCZ FLAGS_REG) + (const_int 0)]) + (label_ref (match_operand 5)) + (pc)))] + "REGNO (operands[1]) == REGNO (operands[3]) + && peep2_reg_dead_p (2, operands[1]) + && peep2_reg_dead_p (2, operands[3]) + && peep2_regno_dead_p (3, FLAGS_REG)" + [(set (match_dup 0) (match_dup 2)) + (parallel [(set (reg:CC FLAGS_REG) + (unspec:CC [(match_dup 0)] UNSPEC_PARITY)) + (clobber (match_dup 0))]) + (set (pc) (if_then_else (match_op_dup 4 [(reg:CC FLAGS_REG) + (const_int 0)]) + (label_ref (match_dup 5)) + (pc)))] +{ + operands[4] = shallow_copy_rtx (operands[4]); + PUT_CODE (operands[4], GET_CODE (operands[4]) == EQ ? UNORDERED : ORDERED); +}) + ;; Thread-local storage patterns for ELF. ;; -- 2.7.4