* config/bfin/lib1funcs.asm (___umulsi3_highpart, ___smulsi3_highpart):
author bernds <bernds@138bc75d-0d04-0410-961f-82ee72b054a4>
Thu, 12 Apr 2007 13:39:35 +0000 (13:39 +0000)
committer bernds <bernds@138bc75d-0d04-0410-961f-82ee72b054a4>
Thu, 12 Apr 2007 13:39:35 +0000 (13:39 +0000)
Use a more efficient implementation.
* config/bfin/bfin.md (umulsi3_highpart, smulsi3_highpart): Emit
inline sequences when not optimizing for size.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@123748 138bc75d-0d04-0410-961f-82ee72b054a4

gcc/ChangeLog
gcc/config/bfin/bfin.md
gcc/config/bfin/lib1funcs.asm

index 07f8a5b..ef344f0 100644 (file)
        (flag_macinit1hi): Tighten constraints.
        (flag_mul_macv2hi_parts_acconly): New pattern.
 
+       * config/bfin/lib1funcs.asm (___umulsi3_highpart, ___smulsi3_highpart):
+       Use a more efficient implementation.
+       * config/bfin/bfin.md (umulsi3_highpart, smulsi3_highpart): Emit
+       inline sequences when not optimizing for size.
+
        2007-02-11  Jie Zhang  <jie.zhang@analog.com>
        * config/bfin/bfin.opt (msim): New option.
        (mcpu=): New option.
index e1eeaa6..ed0da5a 100644 (file)
   [(set_attr "type" "mult")])
 
 (define_expand "umulsi3_highpart"
-  [(set (match_operand:SI 0 "register_operand" "")
-       (truncate:SI
-        (lshiftrt:DI
-         (mult:DI (zero_extend:DI
-                   (match_operand:SI 1 "nonimmediate_operand" ""))
-                  (zero_extend:DI
-                   (match_operand:SI 2 "register_operand" "")))
-         (const_int 32))))]
-  ""
-{
-  rtx umulsi3_highpart_libfunc
-    = init_one_libfunc ("__umulsi3_highpart");
+  [(parallel
+    [(set (match_operand:SI 0 "register_operand" "")
+         (truncate:SI
+          (lshiftrt:DI
+           (mult:DI (zero_extend:DI
+                     (match_operand:SI 1 "nonimmediate_operand" ""))
+                    (zero_extend:DI
+                     (match_operand:SI 2 "register_operand" "")))
+           (const_int 32))))
+     (clobber (reg:PDI REG_A0))
+     (clobber (reg:PDI REG_A1))])]
+  ""
+{
+  if (!optimize_size)
+    {
+      rtx a1reg = gen_rtx_REG (PDImode, REG_A1);
+      rtx a0reg = gen_rtx_REG (PDImode, REG_A0);
+      emit_insn (gen_flag_macinit1hi (a1reg,
+                                     gen_lowpart (HImode, operands[1]),
+                                     gen_lowpart (HImode, operands[2]),
+                                     GEN_INT (MACFLAG_FU)));
+      emit_insn (gen_lshrpdi3 (a1reg, a1reg, GEN_INT (16)));
+      emit_insn (gen_flag_mul_macv2hi_parts_acconly (a0reg, a1reg,
+                                                    gen_lowpart (V2HImode, operands[1]),
+                                                    gen_lowpart (V2HImode, operands[2]),
+                                                    const1_rtx, const1_rtx,
+                                                    const1_rtx, const0_rtx, a1reg,
+                                                    const0_rtx, GEN_INT (MACFLAG_FU),
+                                                    GEN_INT (MACFLAG_FU)));
+      emit_insn (gen_flag_machi_parts_acconly (a1reg,
+                                              gen_lowpart (V2HImode, operands[2]),
+                                              gen_lowpart (V2HImode, operands[1]),
+                                              const1_rtx, const0_rtx,
+                                              a1reg, const0_rtx, GEN_INT (MACFLAG_FU)));
+      emit_insn (gen_lshrpdi3 (a1reg, a1reg, GEN_INT (16)));
+      emit_insn (gen_sum_of_accumulators (operands[0], a0reg, a0reg, a1reg));
+    }
+  else
+    {
+      rtx umulsi3_highpart_libfunc
+       = init_one_libfunc ("__umulsi3_highpart");
 
-  emit_library_call_value (umulsi3_highpart_libfunc,
-                          operands[0], LCT_NORMAL, SImode,
-                          2, operands[1], SImode, operands[2], SImode);
+      emit_library_call_value (umulsi3_highpart_libfunc,
+                              operands[0], LCT_NORMAL, SImode,
+                              2, operands[1], SImode, operands[2], SImode);
+    }
   DONE;
 })
 
 (define_expand "smulsi3_highpart"
-  [(set (match_operand:SI 0 "register_operand" "")
-       (truncate:SI
-        (lshiftrt:DI
-         (mult:DI (sign_extend:DI
-                   (match_operand:SI 1 "nonimmediate_operand" ""))
-                  (sign_extend:DI
-                   (match_operand:SI 2 "register_operand" "")))
-         (const_int 32))))]
-  ""
-{
-  rtx smulsi3_highpart_libfunc
-    = init_one_libfunc ("__smulsi3_highpart");
+  [(parallel
+    [(set (match_operand:SI 0 "register_operand" "")
+         (truncate:SI
+          (lshiftrt:DI
+           (mult:DI (sign_extend:DI
+                     (match_operand:SI 1 "nonimmediate_operand" ""))
+                    (sign_extend:DI
+                     (match_operand:SI 2 "register_operand" "")))
+           (const_int 32))))
+     (clobber (reg:PDI REG_A0))
+     (clobber (reg:PDI REG_A1))])]
+  ""
+{
+  if (!optimize_size)
+    {
+      rtx a1reg = gen_rtx_REG (PDImode, REG_A1);
+      rtx a0reg = gen_rtx_REG (PDImode, REG_A0);
+      emit_insn (gen_flag_macinit1hi (a1reg,
+                                     gen_lowpart (HImode, operands[1]),
+                                     gen_lowpart (HImode, operands[2]),
+                                     GEN_INT (MACFLAG_FU)));
+      emit_insn (gen_lshrpdi3 (a1reg, a1reg, GEN_INT (16)));
+      emit_insn (gen_flag_mul_macv2hi_parts_acconly (a0reg, a1reg,
+                                                    gen_lowpart (V2HImode, operands[1]),
+                                                    gen_lowpart (V2HImode, operands[2]),
+                                                    const1_rtx, const1_rtx,
+                                                    const1_rtx, const0_rtx, a1reg,
+                                                    const0_rtx, GEN_INT (MACFLAG_IS),
+                                                    GEN_INT (MACFLAG_IS_M)));
+      emit_insn (gen_flag_machi_parts_acconly (a1reg,
+                                              gen_lowpart (V2HImode, operands[2]),
+                                              gen_lowpart (V2HImode, operands[1]),
+                                              const1_rtx, const0_rtx,
+                                              a1reg, const0_rtx, GEN_INT (MACFLAG_IS_M)));
+      emit_insn (gen_ashrpdi3 (a1reg, a1reg, GEN_INT (16)));
+      emit_insn (gen_sum_of_accumulators (operands[0], a0reg, a0reg, a1reg));
+    }
+  else
+    {
+      rtx smulsi3_highpart_libfunc
+       = init_one_libfunc ("__smulsi3_highpart");
 
-  emit_library_call_value (smulsi3_highpart_libfunc,
-                          operands[0], LCT_NORMAL, SImode,
-                          2, operands[1], SImode, operands[2], SImode);
+      emit_library_call_value (smulsi3_highpart_libfunc,
+                              operands[0], LCT_NORMAL, SImode,
+                              2, operands[1], SImode, operands[2], SImode);
+    }
   DONE;
 })
 
index 1d2db9b..fe4c3d5 100644 (file)
@@ -123,17 +123,12 @@ ___umodsi3:
 .type ___umulsi3_highpart, STT_FUNC;
 
 ___umulsi3_highpart:
-       R2 = R1.H * R0.H, R3 = R1.L * R0.H (FU);
-       R0 = R1.L * R0.L, R1 = R1.H * R0.L (FU);
-       R0 >>= 16;
-       /* Unsigned multiplication has the nice property that we can
-          ignore carry on this first addition.  */
-       R0 = R0 + R3;
-       R0 = R0 + R1;
-       cc = ac0;
-       R1 = cc;
-       R1 = PACK(R1.l,R0.h);
-       R0 = R1 + R2;
+       A1 = R1.L * R0.L (FU);
+       A1 = A1 >> 16;
+       A0 = R1.H * R0.H, A1 += R1.L * R0.H (FU);
+       A1 += R0.L * R1.H (FU);
+       A1 = A1 >> 16;
+       R0 = (A0 += A1);
        RTS;
 #endif
 
@@ -143,24 +138,11 @@ ___umulsi3_highpart:
 .type ___smulsi3_highpart, STT_FUNC;
 
 ___smulsi3_highpart:
-       R2 = R1.L * R0.L (FU);
-       R3 = R1.H * R0.L (IS,M);
-       R0 = R0.H * R1.H, R1 = R0.H * R1.L (IS,M);
-
-       R1.L = R2.H + R1.L;
-       cc = ac0;
-       R2 = cc;
-
-       R1.L = R1.L + R3.L;
-       cc = ac0;
-       R1 >>>= 16;
-       R3 >>>= 16;
-       R1 = R1 + R3;
-       R1 = R1 + R2;
-       R2 = cc;
-       R1 = R1 + R2;
-
-       R0 = R0 + R1;
+       A1 = R1.L * R0.L (FU);
+       A1 = A1 >> 16;
+       A0 = R0.H * R1.H, A1 += R0.H * R1.L (IS,M);
+       A1 += R1.H * R0.L (IS,M);
+       A1 = A1 >>> 16;
+       R0 = (A0 += A1);
        RTS;
 #endif
-