fix neon blend ops (thanks nash): replace vshrn.u16 with vqrshrn.u16 so the >>8 narrowing rounds to nearest instead of truncating (rounding down) on blend
authorraster <raster@7cbeb6ba-43b4-40fd-8cce-4c39aea84d33>
Wed, 1 Dec 2010 09:46:49 +0000 (09:46 +0000)
committerraster <raster@7cbeb6ba-43b4-40fd-8cce-4c39aea84d33>
Wed, 1 Dec 2010 09:46:49 +0000 (09:46 +0000)
git-svn-id: http://svn.enlightenment.org/svn/e/trunk/evas@55115 7cbeb6ba-43b4-40fd-8cce-4c39aea84d33

src/lib/engines/common/evas_op_blend/op_blend_color_neon.c
src/lib/engines/common/evas_op_blend/op_blend_mask_color_neon.c
src/lib/engines/common/evas_op_blend/op_blend_pixel_color_neon.c
src/lib/engines/common/evas_op_blend/op_blend_pixel_i386.c
src/lib/engines/common/evas_op_blend/op_blend_pixel_neon.c

index 0483811..fd0192a 100644 (file)
@@ -28,7 +28,7 @@ _op_blend_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m __UNUSED__, DATA32 c, DATA32
                "vld1.32        d0[0], [%[d]]           \n\t"
                // Only touch d1
                "vmull.u8       q0, d0, d14             \n\t"
-               "vshrn.u16      d0, q0, #8              \n\t"
+               "vqrshrn.u16    d0, q0, #8              \n\t"
                "vadd.u8        d0, d12, d0             \n\t"
                "vst1.32        d0[0], [%[d]]           \n\t"
 
@@ -47,7 +47,7 @@ _op_blend_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m __UNUSED__, DATA32 c, DATA32
        AP "dualloopint:                                        \n\t"
                "vldr.32        d0, [%[d]]              \n\t"
                "vmull.u8       q1, d0, d14             \n\t"
-               "vshrn.u16      d0, q1, #8              \n\t"
+               "vqrshrn.u16    d0, q1, #8              \n\t"
                "vqadd.u8       d0, d0, d12             \n\t"
 
                "vstm           %[d]!, {d0}             \n\t"
@@ -70,10 +70,10 @@ _op_blend_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m __UNUSED__, DATA32 c, DATA32
                "vmull.u8       q4, d2, d14             \n\t"
                "vmull.u8       q5, d3, d15             \n\t"
 
-               "vshrn.u16      d0, q2, #8              \n\t"
-               "vshrn.u16      d1, q3, #8              \n\t"
-               "vshrn.u16      d2, q4, #8              \n\t"
-               "vshrn.u16      d3, q5, #8              \n\t"
+               "vqrshrn.u16    d0, q2, #8              \n\t"
+               "vqrshrn.u16    d1, q3, #8              \n\t"
+               "vqrshrn.u16    d2, q4, #8              \n\t"
+               "vqrshrn.u16    d3, q5, #8              \n\t"
 
                "vqadd.u8       q0, q6, q0              \n\t"
                "vqadd.u8       q1, q6, q1              \n\t"
@@ -95,7 +95,7 @@ _op_blend_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m __UNUSED__, DATA32 c, DATA32
        AP "dualloop2int:                                       \n\t"
                "vldr.64        d0, [%[d]]              \n\t"
                "vmull.u8       q1, d0, d14             \n\t"
-               "vshrn.u16      d0, q1, #8              \n\t"
+               "vqrshrn.u16    d0, q1, #8              \n\t"
                "vqadd.u8       d0, d0, d12             \n\t"
 
                "vstr.64        d0, [%[d]]              \n\t"
@@ -111,7 +111,7 @@ _op_blend_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m __UNUSED__, DATA32 c, DATA32
        AP "singleloop2:                                        \n\t"
                "vld1.32        d0[0], [%[d]]           \n\t"
                "vmull.u8       q1, d0, d14             \n\t"
-               "vshrn.u16      d0, q1, #8              \n\t"
+               "vqrshrn.u16    d0, q1, #8              \n\t"
                "vqadd.u8       d0, d0, d12             \n\t"
 
                "vst1.32        d0[0], [%[d]]           \n\t"
index 9738970..a97a20a 100644 (file)
@@ -40,12 +40,12 @@ _op_blend_mas_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int
        "       vld1.32         d4[0],  [%[d]]                  \n\t"
        "       vdup.u8         d0,     d0[0]                   \n\t"
        "       vmull.u8        q4,     d0, d30                 \n\t"
-       "       vshrn.u16       d12,    q4, #8                  \n\t"
+       "       vqrshrn.u16     d12,    q4, #8                  \n\t"
        "       vmvn.u16        d14,    d12                     \n\t"
        "       vshr.u32        d16,    d14, #24                \n\t"
        "       vmul.u32        d16,    d16, d28                \n\t"
        "       vmull.u8        q7,     d16, d4                 \n\t"
-       "       vshrn.u16       d0,     q7, #8                  \n\t"
+       "       vqrshrn.u16     d0,     q7, #8                  \n\t"
        "       vqadd.u8        d0,     d0, d12                 \n\t"
        "       vst1.32         d0[0],  [%[d]]!                 \n\t"
 
@@ -64,12 +64,12 @@ _op_blend_mas_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int
        "       vmovl.u8        q0,     d0                      \n\t"
        "       vmul.u32        q0,     q14                     \n\t"
        "       vmull.u8        q4,     d0, d30                 \n\t"
-       "       vshrn.u16       d12,    q4, #8                  \n\t"
+       "       vqrshrn.u16     d12,    q4, #8                  \n\t"
        "       vmvn.u16        d14,    d12                     \n\t"
        "       vshr.u32        d16,    d14, #24                \n\t"
        "       vmul.u32        d16,    d16, d28                \n\t"
        "       vmull.u8        q7,     d16, d4                 \n\t"
-       "       vshrn.u16       d0,     q7, #8                  \n\t"
+       "       vqrshrn.u16     d0,     q7, #8                  \n\t"
        "       vqadd.u8        q0,     q0, q6                  \n\t"
        "       vstm            %[d]!,  {d0}                    \n\t"
 
@@ -104,8 +104,8 @@ _op_blend_mas_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int
        "       vmull.u8        q5,     d1, d31                 \n\t"
 
        // Shorten
-       "       vshrn.u16       d12,    q4, #8                  \n\t"
-       "       vshrn.u16       d13,    q5, #8                  \n\t"
+       "       vqrshrn.u16     d12,    q4, #8                  \n\t"
+       "       vqrshrn.u16     d13,    q5, #8                  \n\t"
 
        // extract negated alpha
        "       vmvn.u16        q7,     q6                      \n\t"
@@ -116,8 +116,8 @@ _op_blend_mas_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int
        "       vmull.u8        q7,     d16, d4                 \n\t"
        "       vmull.u8        q8,     d17, d5                 \n\t"
 
-       "       vshrn.u16       d0,     q7, #8                  \n\t"
-       "       vshrn.u16       d1,     q8, #8                  \n\t"
+       "       vqrshrn.u16     d0,     q7, #8                  \n\t"
+       "       vqrshrn.u16     d1,     q8, #8                  \n\t"
 
        // Add
        "       vqadd.u8        q0,     q0, q6                  \n\t"
@@ -151,12 +151,12 @@ _op_blend_mas_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int
        "       vmovl.u8        q0,     d0                      \n\t"
        "       vmul.u32        q0,     q14                     \n\t"
        "       vmull.u8        q4,     d0, d30                 \n\t"
-       "       vshrn.u16       d12,    q4, #8                  \n\t"
+       "       vqrshrn.u16     d12,    q4, #8                  \n\t"
        "       vmvn.u16        d14,    d12                     \n\t"
        "       vshr.u32        d16,    d14, #24                \n\t"
        "       vmul.u32        d16,    d16, d28                \n\t"
        "       vmull.u8        q7,     d16, d4                 \n\t"
-       "       vshrn.u16       d0,     q7, #8                  \n\t"
+       "       vqrshrn.u16     d0,     q7, #8                  \n\t"
        "       vqadd.u8        q0,     q0, q6                  \n\t"
        "       vstm            %[d]!,  {d0}                    \n\t"
 
@@ -168,12 +168,12 @@ _op_blend_mas_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int
        "       vld1.32         d4[0],  [%[d]]                  \n\t"
        "       vdup.u8         d0,     d0[0]                   \n\t"
        "       vmull.u8        q4,     d0, d30                 \n\t"
-       "       vshrn.u16       d12,    q4, #8                  \n\t"
+       "       vqrshrn.u16     d12,    q4, #8                  \n\t"
        "       vmvn.u16        d14,    d12                     \n\t"
        "       vshr.u32        d16,    d14, #24                \n\t"
        "       vmul.u32        d16,    d16, d28                \n\t"
        "       vmull.u8        q7,     d16, d4                 \n\t"
-       "       vshrn.u16       d0,     q7, #8                  \n\t"
+       "       vqrshrn.u16     d0,     q7, #8                  \n\t"
        "       vqadd.u8        q0,     q0, q6                  \n\t"
        "       vst1.32         d0[0],  [%[d]]!                 \n\t"
 
index a44b128..a25fcd0 100644 (file)
@@ -23,7 +23,7 @@ _op_blend_p_c_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l
                //  Mulitply s * c (= sc)
                "vmull.u8       q4,     d0,d14          \n\t"
                // sc in d8
-               "vshrn.u16      d4,     q4, #8          \n\t"
+               "vqrshrn.u16    d4,     q4, #8          \n\t"
 
                // sca in d9
                "vmvn.u32       d6,     d4              \n\t"
@@ -33,7 +33,7 @@ _op_blend_p_c_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l
 
                /* d * alpha */
                "vmull.u8       q4,     d6, d2          \n\t"
-               "vshrn.u16      d0,     q4, #8          \n\t"
+               "vqrshrn.u16    d0,     q4, #8          \n\t"
 
                "vqadd.u8       d2,     d0, d4          \n\t"
 
@@ -57,7 +57,7 @@ _op_blend_p_c_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l
                //  Mulitply s * c (= sc)
                "vmull.u8       q4,     d0,d14          \n\t"
                // sc in d8
-               "vshrn.u16      d4,     q4, #8          \n\t"
+               "vqrshrn.u16    d4,     q4, #8          \n\t"
 
                // sca in d9
                "vmvn.u32       d6,     d4              \n\t"
@@ -67,7 +67,7 @@ _op_blend_p_c_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l
 
                /* d * alpha */
                "vmull.u8       q4,     d6, d2          \n\t"
-               "vshrn.u16      d0,     q4, #8          \n\t"
+               "vqrshrn.u16    d0,     q4, #8          \n\t"
 
                "vqadd.u8       d2,     d0, d4          \n\t"
 
@@ -90,8 +90,8 @@ _op_blend_p_c_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l
                "vmull.u8       q5,     d1,d14  \n\t"
 
                // Get sc & sc alpha
-               "vshrn.u16      d4,     q4, #8          \n\t"
-               "vshrn.u16      d5,     q5, #8          \n\t"
+               "vqrshrn.u16    d4,     q4, #8          \n\t"
+               "vqrshrn.u16    d5,     q5, #8          \n\t"
                        // sc is now in q2, 8bpp
                // Shift out, then spread alpha for q2
                "vmvn.u32       q3,     q2              \n\t"
@@ -102,8 +102,8 @@ _op_blend_p_c_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l
                "vmull.u8       q4,     d6,d2           \n\t"
                "vmull.u8       q5,     d7,d3           \n\t"
 
-               "vshrn.u16      d0,     q4, #8          \n\t"
-               "vshrn.u16      d1,     q5, #8          \n\t"
+               "vqrshrn.u16    d0,     q4, #8          \n\t"
+               "vqrshrn.u16    d1,     q5, #8          \n\t"
 
                "vqadd.u8       q1,     q0, q2          \n\t"
 
@@ -131,7 +131,7 @@ _op_blend_p_c_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l
                //  Mulitply s * c (= sc)
                "vmull.u8       q4,     d0,d14          \n\t"
                // sc in d8
-               "vshrn.u16      d4,     q4, #8          \n\t"
+               "vqrshrn.u16    d4,     q4, #8          \n\t"
 
                // sca in d9
                // XXX: I can probably squash one of these 3
@@ -141,7 +141,7 @@ _op_blend_p_c_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l
 
                /* d * alpha */
                "vmull.u8       q4,     d6, d2          \n\t"
-               "vshrn.u16      d0,     q4, #8          \n\t"
+               "vqrshrn.u16    d0,     q4, #8          \n\t"
 
                "vqadd.u8       d2,     d0, d4          \n\t"
 
@@ -160,7 +160,7 @@ _op_blend_p_c_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l
                //  Mulitply s * c (= sc)
                "vmull.u8       q4,     d0,d14          \n\t"
                // sc in d8
-               "vshrn.u16      d4,     q4, #8          \n\t"
+               "vqrshrn.u16    d4,     q4, #8          \n\t"
 
                // sca in d6
                "vmvn.u32       d6,     d4              \n\t"
@@ -169,7 +169,7 @@ _op_blend_p_c_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l
 
                /* d * alpha */
                "vmull.u8       q4,     d6, d2          \n\t"
-               "vshrn.u16      d0,     q4, #8          \n\t"
+               "vqrshrn.u16    d0,     q4, #8          \n\t"
 
                "vqadd.u8       d2,     d0, d4          \n\t"
 
index 7b561aa..9b5abe6 100644 (file)
@@ -24,6 +24,8 @@ _op_blend_p_dp_mmx(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c __UNUSED__, DATA32 *
 
 static void
 _op_blend_pas_dp_mmx(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c __UNUSED__, DATA32 *d, int l) {
+   _op_blend_p_dp_mmx(s, m, c, d, l);
+   return;
    DATA32 *e = d + l;
    pxor_r2r(mm0, mm0);
    MOV_A2R(ALPHA_256, mm6)
index b640b03..19fe98b 100644 (file)
@@ -30,7 +30,7 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
                "vmul.u32       d8,     d16, d8         \n\t"
 
                "vmull.u8       q6,     d4,d8           \n\t"
-               "vshrn.u16      d8,     q6, #8          \n\t"
+               "vqrshrn.u16    d8,     q6, #8          \n\t"
                // Add to 's'
                "vqadd.u8       q2,     q4,q0           \n\t"
 
@@ -61,7 +61,7 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
                "vmul.u32       d8,     d16, d8         \n\t"
 
                "vmull.u8       q6,     d4,d8           \n\t"
-               "vshrn.u16      d8,     q6, #8          \n\t"
+               "vqrshrn.u16    d8,     q6, #8          \n\t"
                // Add to 's'
                "vqadd.u8       d4,     d8,d0           \n\t"
                "vstr           d4,     [%[d]]          \n\t"
@@ -91,8 +91,8 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
                "vmull.u8       q2,     d5,d9           \n\t"
 
                // Shift & narrow it
-               "vshrn.u16      d8,     q6, #8          \n\t"
-               "vshrn.u16      d9,     q2, #8          \n\t"
+               "vqrshrn.u16    d8,     q6, #8          \n\t"
+               "vqrshrn.u16    d9,     q2, #8          \n\t"
 
                // Add to s
                "vqadd.u8       q2,     q4,q0           \n\t"
@@ -134,10 +134,10 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
                "cmp     %[tmp], %[d]\n\t"
 
                // Shift & narrow it
-               "vshrn.u16      d8,     q6, #8          \n\t"
-               "vshrn.u16      d9,     q2, #8          \n\t"
-                       "vshrn.u16      d10,    q7, #8  \n\t"
-                       "vshrn.u16      d11,    q3, #8  \n\t"
+               "vqrshrn.u16    d8,     q6, #8          \n\t"
+               "vqrshrn.u16    d9,     q2, #8          \n\t"
+                       "vqrshrn.u16    d10,    q7, #8  \n\t"
+                       "vqrshrn.u16    d11,    q3, #8  \n\t"
 
 
                // Add to s
@@ -174,7 +174,7 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
                "vmul.u32       d8,     d16, d8         \n\t"
 
                "vmull.u8       q6,     d4,d8           \n\t"
-               "vshrn.u16      d8,     q6, #8          \n\t"
+               "vqrshrn.u16    d8,     q6, #8          \n\t"
                // Add to 's'
                "vqadd.u8       d4,     d8,d0           \n\t"
 
@@ -198,7 +198,7 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
                "vmul.u32       d8,     d8, d16         \n\t"
 
                "vmull.u8       q6,     d8,d4           \n\t"
-               "vshrn.u16      d8,     q6, #8          \n\t"
+               "vqrshrn.u16    d8,     q6, #8          \n\t"
                // Add to 's'
                "vqadd.u8       d0,     d0,d8           \n\t"
                "vst1.32        d0[0],  [%[d]]          \n\t"
@@ -251,7 +251,7 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
                // Multiply out
                "vmull.u8       q6,     d8, d4                  \n\t"
 
-               "vshrn.u16      d8,     q6, #8                  \n\t"
+               "vqrshrn.u16    d8,     q6, #8                  \n\t"
 
                // Add to s
                "vqadd.u8       d0,     d0,d8                   \n\t"
@@ -282,7 +282,7 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
                // Multiply out
                "vmull.u8       q6,     d8, d4                  \n\t"
 
-               "vshrn.u16      d8,     q6, #8                  \n\t"
+               "vqrshrn.u16    d8,     q6, #8                  \n\t"
 
                // Add to s
                "vqadd.u8       d0,     d0,d8                   \n\t"
@@ -325,10 +325,10 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
 
                "add    %[pl], %[d], #32\n\t"
 
-               "vshrn.u16      d8,     q6, #8                  \n\t"
-                       "vshrn.u16      d10,    q7, #8          \n\t"
-               "vshrn.u16      d9,     q2, #8                  \n\t"
-                       "vshrn.u16      d11,    q3, #8          \n\t"
+               "vqrshrn.u16    d8,     q6, #8                  \n\t"
+                       "vqrshrn.u16    d10,    q7, #8          \n\t"
+               "vqrshrn.u16    d9,     q2, #8                  \n\t"
+                       "vqrshrn.u16    d11,    q3, #8          \n\t"
                "pld    [%[pl]]\n\t"
 
                "cmp            %[tmp], %[pl]                   \n\t"
@@ -364,7 +364,7 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
                // Multiply out
                "vmull.u8       q6,     d8, d4                  \n\t"
 
-               "vshrn.u16      d8,     q6, #8                  \n\t"
+               "vqrshrn.u16    d8,     q6, #8                  \n\t"
 
                // Add to s
                "vqadd.u8       d0,     d0,d8                   \n\t"
@@ -392,7 +392,7 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
                // Multiply out
                "vmull.u8       q6,     d8, d4                  \n\t"
 
-               "vshrn.u16      d8,     q6, #8                  \n\t"
+               "vqrshrn.u16    d8,     q6, #8                  \n\t"
 
                // Add to s
                "vqadd.u8       d0,     d0,d8                   \n\t"