Evas: Neon: More .fpu neon flags
author nash <nash@7cbeb6ba-43b4-40fd-8cce-4c39aea84d33>
Mon, 6 Dec 2010 04:57:54 +0000 (04:57 +0000)
committer nash <nash@7cbeb6ba-43b4-40fd-8cce-4c39aea84d33>
Mon, 6 Dec 2010 04:57:54 +0000 (04:57 +0000)
Also clean up some ugly code.

git-svn-id: svn+ssh://svn.enlightenment.org/var/svn/e/trunk/evas@55314 7cbeb6ba-43b4-40fd-8cce-4c39aea84d33

src/lib/engines/common/evas_blit_main.c
src/lib/engines/common/evas_convert_rgb_32.c
src/lib/engines/common/evas_op_blend/op_blend_mask_color_neon.c
src/lib/engines/common/evas_op_blend/op_blend_pixel_color_neon.c
src/lib/engines/common/evas_op_blend/op_blend_pixel_neon.c
src/lib/engines/common/evas_op_copy/op_copy_pixel_neon.c

index 9322ffe..4c077b3 100644 (file)
@@ -136,6 +136,7 @@ evas_common_copy_pixels_rev_neon(DATA32 *src, DATA32 *dst, int len)
    uint32_t *tmp = (void *)37;
 #define AP     "evas_common_copy_rev_pixels_neon_"
    asm volatile (
+               ".fpu neon                              \n\t"
                // Can we do 32 byte?
                "andS           %[tmp], %[d], $0x1f     \n\t"
                "beq            "AP"quadstart           \n\t"
@@ -334,6 +335,7 @@ evas_common_copy_pixels_neon(DATA32 *src, DATA32 *dst, int len){
    e = dst + len;
 #define AP     "evas_common_copy_pixels_neon_"
    asm volatile (
+               ".fpu neon                              \n\t"
                // Can we do 32 byte?
                "andS           %[tmp], %[d], $0x1f     \n\t"
                "beq            "AP"quadstart           \n\t"
index aba2c4a..41dac6f 100644 (file)
@@ -102,6 +102,7 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_90 (DATA32 *src, DATA8 *dst, int
    } else {
 #define AP     "convert_rgba32_rot_90_"
        asm volatile (
+       ".fpu neon                                              \n\t"
        "       mov             %[s1],  %[src]                  \n\t"
        "       add             %[s1],  %[h],lsl #2             \n\t"
        "       sub             %[s1],  #8                      \n\t"
index 46929f3..f5eb480 100644 (file)
@@ -25,7 +25,7 @@ _op_blend_mas_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int
 
 #define AP "blend_mas_c_dp_"
      asm volatile (
-       ".fpu neon                                      \n\t"
+       ".fpu neon                                              \n\t"
        "       vdup.i32        q15, %[c]                       \n\t"
        "       vmov.i8         q14,    #1                      \n\t"
 
@@ -206,7 +206,7 @@ _op_blend_mas_can_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, i
 
 #define AP     "_blend_mas_can_dp_neon_"
      asm volatile (
-       ".fpu neon                                      \n\t"
+               ".fpu neon                              \n\t"
                "vdup.u32       q9,     %[c]            \n\t"
                "vmov.i8        q15,    #1              \n\t"
                "vmov.i8        q14,    #0              \n\t"
index 51925c3..a57052c 100644 (file)
@@ -6,10 +6,10 @@ static void
 _op_blend_p_c_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l) {
 #define AP     "blend_p_c_dp_"
    asm volatile (
-       ".fpu neon                                      \n\t"
+               ".fpu neon                              \n\t"
                // Load 'c'
-               "vdup.u32       q7, %[c]                        \n\t"
-               "vmov.i8        q6, #1                          \n\t"
+               "vdup.u32       q7, %[c]                \n\t"
+               "vmov.i8        q6, #1                  \n\t"
 
                // Choose a loop
                "andS           %[tmp], %[d], $0xf      \n\t"
index 5fcae8d..cba9c66 100644 (file)
@@ -7,18 +7,18 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
   asm volatile (
        ".fpu neon                                      \n\t"
        //** init
-       "vmov.i8        q8,     $0x1            \n\t"
+               "vmov.i8        q8,     $0x1            \n\t"
 
        AP "loopchoose:                                 \n\t"
                // If aligned already - straight to octs
-               "andS           %[tmp], %[d],$0x1f              \n\t"
-               "beq            "AP"octloops                    \n\t"
+               "andS           %[tmp], %[d],$0x1f      \n\t"
+               "beq            "AP"octloops            \n\t"
 
-               "andS           %[tmp], %[d],$0xf               \n\t"
-               "beq            "AP"quadloops                   \n\t"
+               "andS           %[tmp], %[d],$0xf       \n\t"
+               "beq            "AP"quadloops           \n\t"
 
-               "andS           %[tmp], %[d],$0x4               \n\t"
-               "beq            "AP"dualloop                    \n\t"
+               "andS           %[tmp], %[d],$0x4       \n\t"
+               "beq            "AP"dualloop            \n\t"
 
        // Only ever executes once, fall through to dual
        AP "singleloop:                                 \n\t"
@@ -106,7 +106,7 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
                "cmp            %[tmp], #32             \n\t"
                "ble            "AP"loopout             \n\t"
 
-               "sub            %[tmp],%[e],#64 \n\t"
+               "sub            %[tmp],%[e],#64         \n\t"
 
 
        AP "octloopint:\n\t"
@@ -151,12 +151,8 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
                 "bhi    "AP"octloopint\n\t"
 
        AP "loopout:                                    \n\t"
-//"sub %[tmp], %[d], #4\n\t"
-//"vmov.i16    d0, $0xff00 \n\t"
-//"vst1.32     d0[0],  [%[tmp]]                \n\t"
-
-               "cmp            %[d], %[e]\n\t"
-                "beq           "AP"done\n\t"
+               "cmp            %[d], %[e]              \n\t"
+                "beq           "AP"done                \n\t"
                "sub            %[tmp],%[e], %[d]       \n\t"
                "cmp            %[tmp],$0x04            \n\t"
                "ble            "AP"singleloop2         \n\t"
@@ -183,7 +179,7 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
                "add            %[d],   #8              \n\t"
 
                "cmp            %[tmp], %[d]            \n\t"
-               "bhi            "AP"dualloop2int                \n\t"
+               "bhi            "AP"dualloop2int        \n\t"
 
                // Single ??
                "cmp            %[e], %[d]              \n\t"
@@ -227,11 +223,11 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
 #define AP "blend_pas_dp_"
    DATA32 *e = d + l,*tmp  = e + 32,*pl=(void*)912;
       asm volatile (
-        ".fpu neon                                     \n\t"
+        ".fpu neon                                             \n\t"
                "vmov.i8        q8,     #1                      \n\t"
        AP"loopchoose:                                          \n\t"
                // If aliged - go as fast we can
-               "andS   %[tmp], %[d],   #31             \n\t"
+               "andS   %[tmp], %[d],   #31                     \n\t"
                "beq    "AP"quadstart                           \n\t"
 
                // See if we can at least do our double loop
@@ -312,12 +308,12 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
                        "vshr.u32       q5,     q5,$0x18        \n\t"
 
                // Prepare to preload
-               "add    %[pl], %[s], #32\n\t"
+               "add    %[pl], %[s], #32                        \n\t"
 
                // Mulitply into all fields
                "vmul.u32       q4,     q4, q8                  \n\t"
                        "vmul.u32       q5,     q5, q8          \n\t"
-               "pld    [%[pl]]\n\t"
+               "pld    [%[pl]]                                 \n\t"
 
                // Multiply out
                "vmull.u8       q6,     d8, d4                  \n\t"
@@ -325,13 +321,13 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
                "vmull.u8       q2,     d9, d5                  \n\t"
                        "vmull.u8       q3,     d11, d7         \n\t"
 
-               "add    %[pl], %[d], #32\n\t"
+               "add    %[pl], %[d], #32                        \n\t"
 
                "vqrshrn.u16    d8,     q6, #8                  \n\t"
                        "vqrshrn.u16    d10,    q7, #8          \n\t"
                "vqrshrn.u16    d9,     q2, #8                  \n\t"
                        "vqrshrn.u16    d11,    q3, #8          \n\t"
-               "pld    [%[pl]]\n\t"
+               "pld    [%[pl]]                                 \n\t"
 
                "cmp            %[tmp], %[pl]                   \n\t"
                // Add to s
@@ -350,11 +346,11 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
                "cmp            %[tmp],$0x04                    \n\t"
                "beq            "AP"singleloop2                 \n\t"
 
-               "sub            %[tmp],%[e],$0x7        \n\t"
+               "sub            %[tmp],%[e],$0x7                \n\t"
 
        AP"dualloop2:                                           \n\t"
-               "vldm   %[s]!,  {d0)                            \n\t"
-               "vldm   %[d],   {d4}                            \n\t"
+               "vldm           %[s]!,  {d0)                    \n\t"
+               "vldm           %[d],   {d4}                    \n\t"
 
                // Subtract from 255 (ie negate) and extract alpha channel
                "vmvn.u8        d8,     d0                      \n\t"
@@ -377,8 +373,8 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
                "bhi            "AP"dualloop2                   \n\t"
 
                // Single ??
-               "cmp            %[e], %[d]              \n\t"
-               "beq            "AP"done                \n\t"
+               "cmp            %[e], %[d]                      \n\t"
+               "beq            "AP"done                        \n\t"
 
        AP "singleloop2:                                        \n\t"
                "vld1.32        d0[0], [%[s]]                   \n\t"
index bc3748d..b55f7a3 100644 (file)
@@ -6,7 +6,7 @@ _op_copy_p_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c __UNUSED__, DATA32 *
    uint32_t *e;
    e = d + l;
 //#ifdef NEON_INSTRINSICS_OK
-#if 1
+#if 0
    // odd this is faster than the below asm... :(
    e -= 15;
    uint32x4_t col1, col2, col3, col4; 
@@ -35,7 +35,7 @@ _op_copy_p_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c __UNUSED__, DATA32 *
         s3 = s + 8;
         s4 = s + 12;
         asm volatile (
-       ".fpu neon                                      \n\t"
+       ".fpu neon                              \n\t"
                       "asmloop2:\n\t"
                       "cmp %[e], %[d]\n\t"
                       "vld1.32 {d16-d17}, [%[s]]!\n\t"