(UNSPEC_SCAS 20)
(UNSPEC_SIN 21)
(UNSPEC_COS 22)
- (UNSPEC_BSF 23)
(UNSPEC_FNSTSW 24)
(UNSPEC_SAHF 25)
(UNSPEC_FSTCW 26)
[(set_attr "type" "leave")])
\f
+;; Expander for SImode ffs.  It emits one parallel holding the ffs set,
+;; an SImode scratch clobber and a clobber of the flags register (hard
+;; reg 17), with no preparation code.  The "*ffs_cmove" and
+;; "*ffs_no_cmove" patterns match this shape and split it into real
+;; instructions after reload.
(define_expand "ffssi2"
- [(set (match_operand:SI 0 "nonimmediate_operand" "")
- (ffs:SI (match_operand:SI 1 "nonimmediate_operand" "")))]
+ [(parallel
+ [(set (match_operand:SI 0 "register_operand" "")
+ (ffs:SI (match_operand:SI 1 "nonimmediate_operand" "")))
+ (clobber (match_scratch:SI 2 ""))
+ (clobber (reg:CC 17))])]
""
-{
- rtx out = gen_reg_rtx (SImode), tmp = gen_reg_rtx (SImode);
- rtx in = operands[1];
+ "")
- if (TARGET_CMOVE)
- {
- emit_move_insn (tmp, constm1_rtx);
- emit_insn (gen_ffssi_1 (out, in));
- emit_insn (gen_rtx_SET (VOIDmode, out,
- gen_rtx_IF_THEN_ELSE (SImode,
- gen_rtx_EQ (VOIDmode, gen_rtx_REG (CCZmode, FLAGS_REG),
- const0_rtx),
- tmp,
- out)));
- emit_insn (gen_addsi3 (out, out, const1_rtx));
- emit_move_insn (operands[0], out);
- }
+;; ffs for targets with conditional moves (TARGET_CMOVE).  The output
+;; template is "#", so the insn is always split once reload has completed:
+;;   scratch = -1
+;;   flags = compare (op1, 0); op0 = ctz (op1)   (the "*ffssi_1" bsf form)
+;;   op0 = (flags say equal) ? scratch : op0     (conditional move)
+;;   op0 = op0 + 1
+;; A zero input therefore yields -1 + 1 = 0; any other input yields
+;; ctz + 1.  The scratch is early-clobbered ("=&r") because the split
+;; writes it before operand 1 is read.
+(define_insn_and_split "*ffs_cmove"
+ [(set (match_operand:SI 0 "register_operand" "=r")
+ (ffs:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))
+ (clobber (match_scratch:SI 2 "=&r"))
+ (clobber (reg:CC 17))]
+ "TARGET_CMOVE"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 2) (const_int -1))
+ (parallel [(set (reg:CCZ 17) (compare:CCZ (match_dup 1) (const_int 0)))
+ (set (match_dup 0) (ctz:SI (match_dup 1)))])
+ (set (match_dup 0) (if_then_else:SI
+ (eq (reg:CCZ 17) (const_int 0))
+ (match_dup 2)
+ (match_dup 0)))
+ (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 1)))
+ (clobber (reg:CC 17))])]
+ "")
- /* Pentium bsf instruction is extremely slow. The following code is
- recommended by the Intel Optimizing Manual as a reasonable replacement:
- TEST EAX,EAX
- JZ SHORT BS2
- XOR ECX,ECX
- MOV DWORD PTR [TEMP+4],ECX
- SUB ECX,EAX
- AND EAX,ECX
- MOV DWORD PTR [TEMP],EAX
- FILD QWORD PTR [TEMP]
- FSTP QWORD PTR [TEMP]
- WAIT ; WAIT only needed for compatibility with
- ; earlier processors
- MOV ECX, DWORD PTR [TEMP+4]
- SHR ECX,20
- SUB ECX,3FFH
- TEST EAX,EAX ; clear zero flag
- BS2:
- Following piece of code expand ffs to similar beast.
- */
-
- else if (TARGET_PENTIUM && !optimize_size && TARGET_80387)
- {
- rtx label = gen_label_rtx ();
- rtx lo, hi;
- rtx mem = assign_386_stack_local (DImode, 0);
- rtx fptmp = gen_reg_rtx (DFmode);
- split_di (&mem, 1, &lo, &hi);
-
- emit_move_insn (out, const0_rtx);
-
- emit_cmp_and_jump_insns (in, const0_rtx, EQ, 0, SImode, 1, label);
-
- emit_move_insn (hi, out);
- emit_insn (gen_subsi3 (out, out, in));
- emit_insn (gen_andsi3 (out, out, in));
- emit_move_insn (lo, out);
- emit_insn (gen_floatdidf2 (fptmp,mem));
- emit_move_insn (gen_rtx_MEM (DFmode, XEXP (mem, 0)), fptmp);
- emit_move_insn (out, hi);
- emit_insn (gen_lshrsi3 (out, out, GEN_INT (20)));
- emit_insn (gen_subsi3 (out, out, GEN_INT (0x3ff - 1)));
-
- emit_label (label);
- LABEL_NUSES (label) = 1;
-
- emit_move_insn (operands[0], out);
- }
- else
- {
- emit_move_insn (tmp, const0_rtx);
- emit_insn (gen_ffssi_1 (out, in));
- emit_insn (gen_rtx_SET (VOIDmode,
- gen_rtx_STRICT_LOW_PART (VOIDmode, gen_lowpart (QImode, tmp)),
- gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG),
- const0_rtx)));
- emit_insn (gen_negsi2 (tmp, tmp));
- emit_insn (gen_iorsi3 (out, out, tmp));
- emit_insn (gen_addsi3 (out, out, const1_rtx));
- emit_move_insn (operands[0], out);
- }
- DONE;
+;; ffs for targets without conditional moves.  The output template is
+;; "#", so the insn is always split once reload has completed:
+;;   scratch = 0                                  (clobbers the flags)
+;;   flags = compare (op1, 0); op0 = ctz (op1)    (the "*ffssi_1" bsf form)
+;;   low byte of scratch = (flags say equal)      (1 when op1 was zero)
+;;   scratch = -scratch                           (-1 when op1 was zero, else 0)
+;;   op0 = op0 | scratch                          (forces -1 for a zero input)
+;;   op0 = op0 + 1
+;; A zero input therefore yields 0; any other input yields ctz + 1.
+;;
+;; The scratch must use the "q" constraint, not "r": operand 3 is its
+;; QImode low part and is written through strict_low_part, which needs a
+;; register that has a byte sub-register.  In 32-bit mode class "r" also
+;; contains registers without one.  It stays early-clobbered because it
+;; is written before operand 1 is read.
+(define_insn_and_split "*ffs_no_cmove"
+ [(set (match_operand:SI 0 "nonimmediate_operand" "=r")
+ (ffs:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))
+ (clobber (match_scratch:SI 2 "=&q"))
+ (clobber (reg:CC 17))]
+ ""
+ "#"
+ "reload_completed"
+ [(parallel [(set (match_dup 2) (const_int 0))
+ (clobber (reg:CC 17))])
+ (parallel [(set (reg:CCZ 17) (compare:CCZ (match_dup 1) (const_int 0)))
+ (set (match_dup 0) (ctz:SI (match_dup 1)))])
+ (set (strict_low_part (match_dup 3))
+ (eq:QI (reg:CCZ 17) (const_int 0)))
+ (parallel [(set (match_dup 2) (neg:SI (match_dup 2)))
+ (clobber (reg:CC 17))])
+ (parallel [(set (match_dup 0) (ior:SI (match_dup 0) (match_dup 2)))
+ (clobber (reg:CC 17))])
+ (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 1)))
+ (clobber (reg:CC 17))])]
+{
+ operands[3] = gen_lowpart (QImode, operands[2]);
})
-(define_insn "ffssi_1"
+;; bsf with its flags result exposed: sets the CCZ flags (reg 17) from
+;; comparing operand 1 with zero and stores ctz (operand 1) in operand 0.
+;; The ffs split sequences generate exactly this parallel so that they can
+;; test the flags for a zero input afterwards.
+(define_insn "*ffssi_1"
[(set (reg:CCZ 17)
- (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
+ (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
(const_int 0)))
(set (match_operand:SI 0 "register_operand" "=r")
- (unspec:SI [(match_dup 1)] UNSPEC_BSF))]
+ (ctz:SI (match_dup 1)))]
+ ""
+ "bsf{l}\t{%1, %0|%0, %1}"
+ [(set_attr "prefix_0f" "1")
+ (set_attr "ppro_uops" "few")])
+
+;; Count trailing zeros of an SImode operand with bsf.  Unlike "*ffssi_1"
+;; the flags are not a result here; reg 17 is only clobbered.
+(define_insn "ctzsi2"
+ [(set (match_operand:SI 0 "register_operand" "=r")
+ (ctz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))
+ (clobber (reg:CC 17))]
""
"bsf{l}\t{%1, %0|%0, %1}"
[(set_attr "prefix_0f" "1")
(set_attr "ppro_uops" "few")])
-;; ffshi2 is not useful -- 4 word prefix ops are needed, which is larger
-;; and slower than the two-byte movzx insn needed to do the work in SImode.
+;; Count leading zeros of an SImode operand in two steps:
+;;   op0 = 31 - clz (op1)     (the shape matched by the "*bsr" insn)
+;;   op0 = op0 ^ 31           (for a value in 0..31 this equals 31 - op0,
+;;                             so op0 ends up holding clz (op1))
+;; Both steps clobber the flags register (reg 17).
+(define_expand "clzsi2"
+ [(parallel
+ [(set (match_operand:SI 0 "register_operand" "")
+ (minus:SI (const_int 31)
+ (clz:SI (match_operand:SI 1 "nonimmediate_operand" ""))))
+ (clobber (reg:CC 17))])
+ (parallel
+ [(set (match_dup 0) (xor:SI (match_dup 0) (const_int 31)))
+ (clobber (reg:CC 17))])]
+ ""
+ "")
+
+;; bsr, described in RTL as 31 - clz (operand 1).  The flags register
+;; (reg 17) is clobbered.  The "clzsi2" expander emits this shape as its
+;; first step.
+(define_insn "*bsr"
+ [(set (match_operand:SI 0 "register_operand" "=r")
+ (minus:SI (const_int 31)
+ (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))))
+ (clobber (reg:CC 17))]
+ ""
+ "bsr{l}\t{%1, %0|%0, %1}"
+ [(set_attr "prefix_0f" "1")
+ (set_attr "ppro_uops" "few")])
\f
;; Thread-local storage patterns for ELF.
;;