From d8911f269faa235572b80a8a25b842b09df6e151 Mon Sep 17 00:00:00 2001
From: Wim Taymans
Date: Fri, 28 Aug 2009 12:43:43 +0200
Subject: [PATCH] utils: optimize for x86_64 with some inline asm

64bit x86 has native 64x64->128 bit multiply that we can use with some
inline assembler to speed up large multiplications.

Use bsr to find the number of leading zeros more efficiently.
---
 gst/gstutils.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/gst/gstutils.c b/gst/gstutils.c
index fddf871..fa78e11 100644
--- a/gst/gstutils.c
+++ b/gst/gstutils.c
@@ -204,6 +204,16 @@ typedef union
   } l;
 } GstUInt64;
 
+#if defined (__x86_64__) && defined (__GNUC__)
+static void
+gst_util_uint64_mul_uint64 (GstUInt64 * c1, GstUInt64 * c0, guint64 arg1,
+    guint64 arg2)
+{
+  __asm__ __volatile__ ("mul %3":"=a" (c0->ll), "=d" (c1->ll)
+      :"a" (arg1), "g" (arg2)
+      );
+}
+#else /* defined (__x86_64__) */
 /* multiply two 64-bit unsigned ints into a 128-bit unsigned int. the high
  * and low 64 bits of the product are placed in c1 and c0 respectively.
  * this operation cannot overflow. */
@@ -246,8 +256,21 @@ gst_util_uint64_mul_uint64 (GstUInt64 * c1, GstUInt64 * c0, guint64 arg1,
    * the high words of a1 and b0 to b1, the result is c1. */
   c1->ll = (guint64) v.l.high * n.l.high + c1->l.high + a1.l.high + b0.l.high;
 }
+#endif /* defined (__x86_64__) */
 
 /* count leading zeros */
+#if defined (__x86_64__) && defined (__GNUC__)
+static guint
+gst_util_clz (guint32 val)
+{
+  guint s;
+
+  __asm__ __volatile__ ("bsrl %0, %0 \n\t"
+      "xor $31, %0 \n\t":"=r" (s):"0" (val)
+      );
+  return s;
+}
+#else /* defined (__x86_64__) */
 static guint
 gst_util_clz (guint32 val)
 {
@@ -266,6 +289,7 @@ gst_util_clz (guint32 val)
 
   return s;
 }
+#endif /* defined (__x86_64__) */
 
 /* based on Hacker's Delight p152 */
 static guint64
@@ -329,6 +353,21 @@ gst_util_div128_64 (GstUInt64 c1, GstUInt64 c0, guint64 denom)
 /* multiply a 64-bit unsigned int by a 32-bit unsigned int into a 96-bit
  * unsigned int. the high 64 bits and low 32 bits of the product are
  * placed in c1 and c0 respectively. this operation cannot overflow. */
+#if defined (__x86_64__) && defined (__GNUC__)
+static void
+gst_util_uint64_mul_uint32 (GstUInt64 * c1, GstUInt64 * c0, guint64 arg1,
+    guint32 arg2)
+{
+  __asm__ __volatile__ ("mul %%rcx \n\t"
+      "mov %%rax, %%rcx \n\t"
+      "shl $32, %%rdx \n\t"
+      "shr $32, %%rcx \n\t"
+      "or %%rcx, %%rdx \n\t"
+      "and $0xffffffff, %%eax \n\t":"=a" (c0->ll), "=d" (c1->ll)
+      :"a" (arg1), "c" ((guint64) arg2)
+      );
+}
+#else /* defined (__x86_64__) */
 static void
 gst_util_uint64_mul_uint32 (GstUInt64 * c1, GstUInt64 * c0, guint64 arg1,
     guint32 arg2)
@@ -341,6 +380,7 @@ gst_util_uint64_mul_uint32 (GstUInt64 * c1, GstUInt64 * c0, guint64 arg1,
   c1->ll = (guint64) a.l.high * arg2 + c0->l.high;
   c0->l.high = 0;
 }
+#endif /* defined (__x86_64__) */
 
 /* divide a 96-bit unsigned int by a 32-bit unsigned int when we know the
  * quotient fits into 64 bits. the high 64 bits and low 32 bits of the
-- 
2.7.4
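
For reference, a minimal standalone sketch of the 64x64->128-bit multiply the
patch relies on, under the same __x86_64__/__GNUC__ assumption. The
mul_64x64_128 name and the test harness are illustrative only and not part of
the patch; the sketch uses an "rm" constraint for the second factor (mul cannot
take an immediate operand) and its fallback is a simplified form of the 32-bit
split done by the portable C path.

#include <stdint.h>
#include <stdio.h>

/* 128-bit product of two 64-bit values: on x86-64 a single "mul" leaves the
 * result in rdx:rax, which maps directly onto the c1/c0 split in the patch. */
static void
mul_64x64_128 (uint64_t * hi, uint64_t * lo, uint64_t a, uint64_t b)
{
#if defined (__x86_64__) && defined (__GNUC__)
  __asm__ ("mulq %3" : "=a" (*lo), "=d" (*hi) : "a" (a), "rm" (b));
#else
  /* simplified equivalent of the 32-bit split used by the portable C path */
  uint64_t a0 = (uint32_t) a, a1 = a >> 32;
  uint64_t b0 = (uint32_t) b, b1 = b >> 32;
  uint64_t t = a0 * b0;
  uint64_t u = a1 * b0 + (t >> 32);
  uint64_t v = a0 * b1 + (uint32_t) u;

  *lo = (v << 32) | (uint32_t) t;
  *hi = a1 * b1 + (u >> 32) + (v >> 32);
#endif
}

int
main (void)
{
  uint64_t hi, lo;

  mul_64x64_128 (&hi, &lo, 0xffffffffffffffffULL, 0xffffffffffffffffULL);
  /* (2^64 - 1)^2: expect fffffffffffffffe 0000000000000001 */
  printf ("%016llx %016llx\n", (unsigned long long) hi, (unsigned long long) lo);
  return 0;
}

On current GCC for x86-64 the same single mul can usually also be obtained
without inline asm by widening one operand to unsigned __int128 before
multiplying.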
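
The leading-zero count follows the same pattern; a sketch, again assuming
x86-64 and GCC, with clz32 as an illustrative name. bsr yields the index of the
highest set bit, so xor with 31 converts that index into a leading-zero count;
like the patch's version, it requires a non-zero input, since bsr leaves its
destination undefined when the source is zero.

#include <stdint.h>
#include <stdio.h>

/* count leading zeros of a non-zero 32-bit value */
static unsigned int
clz32 (uint32_t val)
{
#if defined (__x86_64__) && defined (__GNUC__)
  unsigned int s = val;

  /* bsr finds the highest set bit; index 31 means 0 leading zeros */
  __asm__ ("bsrl %0, %0\n\t"
      "xorl $31, %0" : "+r" (s));
  return s;
#else
  /* straightforward fallback: shift until the top bit is set */
  unsigned int s = 0;

  while (!(val & 0x80000000u)) {
    val <<= 1;
    s++;
  }
  return s;
#endif
}

int
main (void)
{
  /* expect 31 16 0 */
  printf ("%u %u %u\n", clz32 (1), clz32 (0x8000), clz32 (0x80000000u));
  return 0;
}

GCC's __builtin_clz provides the same operation (with the same
undefined-for-zero caveat) without inline asm.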
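
The 64x32 case is one more mul followed by a re-split of rdx:rax so that the
high result holds bits 32..95 of the product and the low result holds bits
0..31. A sketch under the same assumptions (mul_64x32_96 and the harness are
illustrative); because the shuffle reuses rcx as scratch, the sketch ties it as
a read-write operand rather than an input-only "c" constraint.

#include <stdint.h>
#include <stdio.h>

/* 96-bit product of a 64-bit and a 32-bit value: *hi gets bits 32..95,
 * *lo gets bits 0..31, matching the c1/c0 layout used by the patch */
static void
mul_64x32_96 (uint64_t * hi, uint64_t * lo, uint64_t a, uint32_t b)
{
#if defined (__x86_64__) && defined (__GNUC__)
  uint64_t scratch = b;           /* lives in rcx; clobbered by the shuffle */

  __asm__ ("mulq %%rcx\n\t"       /* rdx:rax = a * b                 */
      "movq %%rax, %%rcx\n\t"     /* save the low 64 bits            */
      "shlq $32, %%rdx\n\t"       /* bits 64..95 -> rdx[63:32]       */
      "shrq $32, %%rcx\n\t"       /* bits 32..63 -> rcx[31:0]        */
      "orq %%rcx, %%rdx\n\t"      /* rdx = product >> 32             */
      "andl $0xffffffff, %%eax"   /* rax = product & 0xffffffff      */
      : "=a" (*lo), "=d" (*hi), "+c" (scratch)
      : "a" (a));
#else
  /* portable equivalent using a 32-bit split of the 64-bit factor */
  uint64_t t = (uint32_t) a * (uint64_t) b;

  *hi = (a >> 32) * (uint64_t) b + (t >> 32);
  *lo = (uint32_t) t;
#endif
}

int
main (void)
{
  uint64_t hi, lo;

  mul_64x32_96 (&hi, &lo, 0xffffffffffffffffULL, 0xffffffffu);
  /* (2^64 - 1) * (2^32 - 1): expect fffffffeffffffff 00000001 */
  printf ("%016llx %08llx\n", (unsigned long long) hi, (unsigned long long) lo);
  return 0;
}

The high-64/low-32 split is what the subsequent 96-bit-by-32-bit division in
gstutils.c expects to consume.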