svolume_mmx: optimize some more
author Wim Taymans <wim.taymans@collabora.co.uk>
Sat, 12 Sep 2009 10:02:59 +0000 (12:02 +0200)
committer Wim Taymans <wim@metal.(none)>
Wed, 16 Sep 2009 15:14:12 +0000 (17:14 +0200)
We can reorder the algorithm a bit like we do for sse so that we
don't need the constants and masking instructions. Saves 2 instructions
for the mmx code.

src/pulsecore/svolume_mmx.c

index 74918e7..170f01d 100644 (file)
       " punpcklwd %%mm4, "#s"        \n\t" /* .. |    0  |   p0  | */                   \
       " pcmpgtw "#v", %%mm4          \n\t" /* .. |    0  | s(vl) | */                   \
       " pand "#s", %%mm4             \n\t" /* .. |    0  |  (p0) |  (vl >> 15) & p */   \
-      " movq %%mm6, %%mm5            \n\t" /* .. |  ffff |   0   | */                   \
-      " pand "#v", %%mm5             \n\t" /* .. |   vh  |   0   | */                   \
-      " por %%mm5, %%mm4             \n\t" /* .. |   vh  |  (p0) | */                   \
-      " pmulhw "#s", "#v"            \n\t" /* .. |    0  | vl*p0 | */                   \
-      " paddw %%mm4, "#v"            \n\t" /* .. |   vh  | vl*p0 | vh + sign correct */ \
-      " pslld $16, "#s"              \n\t" /* .. |   p0  |    0  | */                   \
-      " por %%mm7, "#s"              \n\t" /* .. |   p0  |    1  | */                   \
-      " pmaddwd "#s", "#v"           \n\t" /* .. |    p0 * v0    | */                   \
+      " movq "#s", %%mm5             \n\t"                                              \
+      " pmulhw "#v", "#s"            \n\t" /* .. |    0  | vl*p0 | */                   \
+      " paddw %%mm4, "#s"            \n\t" /* .. |    0  | vl*p0 | + sign correct */    \
+      " psrld $16, "#v"              \n\t" /* .. |    0  |   vh  | */                   \
+      " pmaddwd %%mm5, "#v"          \n\t" /* .. |    p0 * vh    | */                   \
+      " paddd "#s", "#v"             \n\t" /* .. |    p0 * v0    | */                   \
       " packssdw "#v", "#v"          \n\t" /* .. | p1*v1 | p0*v0 | */
 
 /* approximately advances %3 = (%3 + a) % b. This function requires that
@@ -105,10 +103,6 @@ pa_volume_s16ne_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsi
     __asm__ __volatile__ (
         " xor %3, %3                    \n\t"
         " sar $1, %2                    \n\t" /* length /= sizeof (int16_t) */
-        " pcmpeqw %%mm6, %%mm6          \n\t" /* .. |  ffff |  ffff | */
-        " pcmpeqw %%mm7, %%mm7          \n\t" /* .. |  ffff |  ffff | */
-        " pslld  $16, %%mm6             \n\t" /* .. |  ffff |     0 | */
-        " psrld  $31, %%mm7             \n\t" /* .. |     0 |     1 | */
 
         " test $1, %2                   \n\t" /* check for odd samples */
         " je 2f                         \n\t"
@@ -239,7 +233,7 @@ pa_volume_s16re_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsi
     );
 }
 
-#undef RUN_TEST
+#define RUN_TEST
 
 #ifdef RUN_TEST
 #define CHANNELS 2