forked from enlightenment/efl
solve neon rotation issue by moving to the tiled rotator
the tiled rotator is faster no matter what. this will fix D8099 by moving to tiled rotation and nuking the neon code and we end up being faster anyway in all cases. @fix
This commit is contained in:
parent
f234a2b6c7
commit
4758f06e63
20
configure.ac
20
configure.ac
|
@ -2538,19 +2538,6 @@ AC_ARG_ENABLE([pixman-image-scale-sample],
|
|||
],
|
||||
[have_pixman_image_scale_sample="no"])
|
||||
|
||||
# Tile rotate
|
||||
AC_ARG_ENABLE([tile-rotate],
|
||||
[AS_HELP_STRING([--enable-tile-rotate],[Enable tiled rotate algorithm. @<:@default=disabled@:>@])],
|
||||
[
|
||||
if test "x${enableval}" = "xyes" ; then
|
||||
have_tile_rotate="yes"
|
||||
CFOPT_WARNING="xyes"
|
||||
else
|
||||
have_tile_rotate="no"
|
||||
fi
|
||||
],
|
||||
[have_tile_rotate="no"])
|
||||
|
||||
# Ecore Buffer
|
||||
AC_ARG_ENABLE([ecore-buffer],
|
||||
[AS_HELP_STRING([--enable-ecore-buffer],[enable ecore-buffer. @<:@default=disabled@:>@])],
|
||||
|
@ -2984,13 +2971,6 @@ AC_CHECK_LIB([m], [lround],
|
|||
|
||||
### Configuration
|
||||
|
||||
## Tile rotation
|
||||
|
||||
if test "x${have_tile_rotate}" = "xyes" ; then
|
||||
AC_DEFINE(TILE_ROTATE, 1, [Enable tiled rotate algorithm])
|
||||
fi
|
||||
|
||||
|
||||
## dither options
|
||||
|
||||
AC_ARG_WITH([evas-dither-mask],
|
||||
|
|
|
@ -1,9 +1,13 @@
|
|||
#include "evas_common_private.h"
|
||||
#include "evas_convert_rgb_32.h"
|
||||
#ifdef BUILD_NEON
|
||||
#include <arm_neon.h>
|
||||
# include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
// tiled rotate is faster in every case i've tested, so just use this
|
||||
// by default.
|
||||
#define TILE_ROTATE 1
|
||||
|
||||
void
|
||||
evas_common_convert_rgba_to_32bpp_rgb_8888 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
|
||||
{
|
||||
|
@ -19,9 +23,9 @@ evas_common_convert_rgba_to_32bpp_rgb_8888 (DATA32 *src, DATA8 *dst, int src_jum
|
|||
|
||||
for (y = 0; y < h; y++)
|
||||
{
|
||||
func(src_ptr, dst_ptr, w);
|
||||
src_ptr += w + src_jump;
|
||||
dst_ptr += w + dst_jump;
|
||||
func(src_ptr, dst_ptr, w);
|
||||
src_ptr += w + src_jump;
|
||||
dst_ptr += w + dst_jump;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
@ -44,234 +48,205 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_180 (DATA32 *src, DATA8 *dst, int
|
|||
}
|
||||
|
||||
#ifdef TILE_ROTATE
|
||||
#ifdef BUILD_NEON
|
||||
#define ROT90_QUAD_COPY_LOOP(pix_type) \
|
||||
if (evas_common_cpu_has_feature(CPU_FEATURE_NEON)) \
|
||||
{ \
|
||||
if((w%4) == 0) \
|
||||
{ \
|
||||
int klght = 4 * src_stride; \
|
||||
for(y = 0; y < h; y++) \
|
||||
{ \
|
||||
const pix_type *s = &(src[(h - y - 1)]); \
|
||||
pix_type *d = &(dst[(dst_stride * y)]); \
|
||||
pix_type *ptr1 = s; \
|
||||
pix_type *ptr2 = ptr1 + src_stride; \
|
||||
pix_type *ptr3 = ptr2 + src_stride; \
|
||||
pix_type *ptr4 = ptr3 + src_stride; \
|
||||
for(x = 0; x < w; x+=4) \
|
||||
{ \
|
||||
pix_type s_array[4] = {*ptr1, *ptr2, *ptr3, *ptr4}; \
|
||||
vst1q_s32(d, vld1q_s32(s_array)); \
|
||||
d += 4; \
|
||||
ptr1 += klght; \
|
||||
ptr2 += klght; \
|
||||
ptr3 += klght; \
|
||||
ptr4 += klght; \
|
||||
} \
|
||||
} \
|
||||
# ifdef BUILD_NEON
|
||||
# define ROT90_QUAD_COPY_LOOP(pix_type) \
|
||||
if (evas_common_cpu_has_feature(CPU_FEATURE_NEON)) { \
|
||||
if ((w % 4) == 0) { \
|
||||
int klght = 4 * src_stride; \
|
||||
for (y = 0; y < h; y++) { \
|
||||
const pix_type *s = &(src[h - y - 1]); \
|
||||
pix_type *d = &(dst[dst_stride * y]); \
|
||||
pix_type *ptr1 = s; \
|
||||
pix_type *ptr2 = ptr1 + src_stride; \
|
||||
pix_type *ptr3 = ptr2 + src_stride; \
|
||||
pix_type *ptr4 = ptr3 + src_stride; \
|
||||
for(x = 0; x < w; x += 4) { \
|
||||
pix_type s_array[4] = { *ptr1, *ptr2, *ptr3, *ptr4 }; \
|
||||
vst1q_s32(d, vld1q_s32(s_array)); \
|
||||
d += 4; \
|
||||
ptr1 += klght; \
|
||||
ptr2 += klght; \
|
||||
ptr3 += klght; \
|
||||
ptr4 += klght; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for (y = 0; y < h; y++) \
|
||||
{ \
|
||||
const pix_type *s = &(src[(h - y - 1)]); \
|
||||
pix_type *d = &(dst[(dst_stride * y)]); \
|
||||
for (x = 0; x < w; x++) \
|
||||
{ \
|
||||
*d++ = *s; \
|
||||
s += src_stride; \
|
||||
} \
|
||||
} \
|
||||
else { \
|
||||
for (y = 0; y < h; y++) { \
|
||||
const pix_type *s = &(src[h - y - 1]); \
|
||||
pix_type *d = &(dst[dst_stride * y]); \
|
||||
for (x = 0; x < w; x++) { \
|
||||
*d++ = *s; \
|
||||
s += src_stride; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else
|
||||
#define ROT270_QUAD_COPY_LOOP(pix_type) \
|
||||
if (evas_common_cpu_has_feature(CPU_FEATURE_NEON)) \
|
||||
{ \
|
||||
if((w%4) == 0) \
|
||||
{ \
|
||||
int klght = 4 * src_stride; \
|
||||
for(y = 0; y < h; y++) \
|
||||
{ \
|
||||
const pix_type *s = &(src[(src_stride * (w - 1)) + y]); \
|
||||
pix_type *d = &(dst[(dst_stride * y)]); \
|
||||
pix_type *ptr1 = s; \
|
||||
pix_type *ptr2 = ptr1 + src_stride; \
|
||||
pix_type *ptr3 = ptr2 + src_stride; \
|
||||
pix_type *ptr4 = ptr3 + src_stride; \
|
||||
for(x = 0; x < w; x+=4) \
|
||||
{ \
|
||||
pix_type s_array[4] = {*ptr1, *ptr2, *ptr3, *ptr4}; \
|
||||
vst1q_s32(d, vld1q_s32(s_array)); \
|
||||
d += 4; \
|
||||
ptr1 += klght; \
|
||||
ptr2 += klght; \
|
||||
ptr3 += klght; \
|
||||
ptr4 += klght; \
|
||||
} \
|
||||
} \
|
||||
# define ROT270_QUAD_COPY_LOOP(pix_type) \
|
||||
if (evas_common_cpu_has_feature(CPU_FEATURE_NEON)) { \
|
||||
if ((w % 4) == 0) { \
|
||||
int klght = 4 * src_stride; \
|
||||
for (y = 0; y < h; y++) { \
|
||||
const pix_type *s = &(src[(src_stride * (w - 1)) + y]); \
|
||||
pix_type *d = &(dst[dst_stride * y]); \
|
||||
pix_type *ptr1 = s; \
|
||||
pix_type *ptr2 = ptr1 + src_stride; \
|
||||
pix_type *ptr3 = ptr2 + src_stride; \
|
||||
pix_type *ptr4 = ptr3 + src_stride; \
|
||||
for(x = 0; x < w; x+=4) { \
|
||||
pix_type s_array[4] = { *ptr1, *ptr2, *ptr3, *ptr4 }; \
|
||||
vst1q_s32(d, vld1q_s32(s_array)); \
|
||||
d += 4; \
|
||||
ptr1 += klght; \
|
||||
ptr2 += klght; \
|
||||
ptr3 += klght; \
|
||||
ptr4 += klght; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for (y = 0; y < h; y++) \
|
||||
{ \
|
||||
const pix_type *s = &(src[(src_stride * (w - 1)) + y]); \
|
||||
pix_type *d = &(dst[(dst_stride * y)]); \
|
||||
for (x = 0; x < w; x++) \
|
||||
{ \
|
||||
*d++ = *s; \
|
||||
s += src_stride; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else
|
||||
#else
|
||||
#define ROT90_QUAD_COPY_LOOP(pix_type)
|
||||
#define ROT270_QUAD_COPY_LOOP(pix_type)
|
||||
#endif
|
||||
#define FAST_SIMPLE_ROTATE(suffix, pix_type) \
|
||||
static void \
|
||||
blt_rotated_90_trivial_##suffix(pix_type * restrict dst, \
|
||||
int dst_stride, \
|
||||
const pix_type * restrict src, \
|
||||
int src_stride, \
|
||||
int w, \
|
||||
int h) \
|
||||
{ \
|
||||
int x, y; \
|
||||
ROT90_QUAD_COPY_LOOP(pix_type) \
|
||||
{ \
|
||||
for (y = 0; y < h; y++) \
|
||||
{ \
|
||||
const pix_type *s = &(src[(h - y - 1)]); \
|
||||
pix_type *d = &(dst[(dst_stride * y)]); \
|
||||
for (x = 0; x < w; x++) \
|
||||
{ \
|
||||
*d++ = *s; \
|
||||
s += src_stride; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
static void \
|
||||
blt_rotated_270_trivial_##suffix(pix_type * restrict dst, \
|
||||
int dst_stride, \
|
||||
const pix_type * restrict src, \
|
||||
int src_stride, \
|
||||
int w, \
|
||||
int h) \
|
||||
{ \
|
||||
int x, y; \
|
||||
ROT270_QUAD_COPY_LOOP(pix_type) \
|
||||
{ \
|
||||
for(y = 0; y < h; y++) \
|
||||
{ \
|
||||
else { \
|
||||
for (y = 0; y < h; y++) { \
|
||||
const pix_type *s = &(src[(src_stride * (w - 1)) + y]); \
|
||||
pix_type *d = &(dst[(dst_stride * y)]); \
|
||||
for (x = 0; x < w; x++) \
|
||||
{ \
|
||||
pix_type *d = &(dst[dst_stride * y]); \
|
||||
for (x = 0; x < w; x++) { \
|
||||
*d++ = *s; \
|
||||
s -= src_stride; \
|
||||
s += src_stride; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else
|
||||
# else
|
||||
# define ROT90_QUAD_COPY_LOOP(pix_type)
|
||||
# define ROT270_QUAD_COPY_LOOP(pix_type)
|
||||
# endif
|
||||
|
||||
# define FAST_SIMPLE_ROTATE(suffix, pix_type) \
|
||||
static void \
|
||||
blt_rotated_90_##suffix(pix_type * restrict dst, \
|
||||
int dst_stride, \
|
||||
blt_rotated_90_trivial_##suffix(pix_type * restrict dst, \
|
||||
int dst_stride, \
|
||||
const pix_type * restrict src, \
|
||||
int src_stride, \
|
||||
int w, \
|
||||
int h) \
|
||||
{ \
|
||||
int x, y; \
|
||||
ROT90_QUAD_COPY_LOOP(pix_type) { \
|
||||
for (y = 0; y < h; y++) { \
|
||||
const pix_type *s = &(src[h - y - 1]); \
|
||||
pix_type *d = &(dst[dst_stride * y]); \
|
||||
for (x = 0; x < w; x++) { \
|
||||
*d++ = *s; \
|
||||
s += src_stride; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
static void \
|
||||
blt_rotated_270_trivial_##suffix(pix_type * restrict dst, \
|
||||
int dst_stride, \
|
||||
const pix_type * restrict src, \
|
||||
int src_stride, \
|
||||
int w, \
|
||||
int h) \
|
||||
{ \
|
||||
int x, y; \
|
||||
ROT270_QUAD_COPY_LOOP(pix_type) { \
|
||||
for (y = 0; y < h; y++) { \
|
||||
const pix_type *s = &(src[(src_stride * (w - 1)) + y]); \
|
||||
pix_type *d = &(dst[dst_stride * y]); \
|
||||
for (x = 0; x < w; x++) { \
|
||||
*d++ = *s; \
|
||||
s -= src_stride; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
static void \
|
||||
blt_rotated_90_##suffix(pix_type * restrict dst, \
|
||||
int dst_stride, \
|
||||
const pix_type * restrict src, \
|
||||
int src_stride, \
|
||||
int w, \
|
||||
int h) \
|
||||
int src_stride, \
|
||||
int w, \
|
||||
int h) \
|
||||
{ \
|
||||
int x, leading_pixels = 0, trailing_pixels = 0; \
|
||||
const int TILE_SIZE = TILE_CACHE_LINE_SIZE / sizeof(pix_type); \
|
||||
if ((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) \
|
||||
{ \
|
||||
leading_pixels = TILE_SIZE - \
|
||||
(((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
|
||||
if (leading_pixels > w) \
|
||||
leading_pixels = w; \
|
||||
blt_rotated_90_trivial_##suffix(dst, \
|
||||
dst_stride, \
|
||||
src, \
|
||||
src_stride, \
|
||||
leading_pixels, \
|
||||
h); \
|
||||
dst += leading_pixels; \
|
||||
src += leading_pixels * src_stride; \
|
||||
w -= leading_pixels; \
|
||||
} \
|
||||
if ((uintptr_t)(dst + w) & (TILE_CACHE_LINE_SIZE - 1)) \
|
||||
{ \
|
||||
trailing_pixels = (((uintptr_t)(dst + w) & \
|
||||
(TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
|
||||
if (trailing_pixels > w) \
|
||||
trailing_pixels = w; \
|
||||
w -= trailing_pixels; \
|
||||
} \
|
||||
for (x = 0; x < w; x += TILE_SIZE) \
|
||||
{ \
|
||||
blt_rotated_90_trivial_##suffix(dst + x, \
|
||||
dst_stride, \
|
||||
&(src[(src_stride * x)]), \
|
||||
src_stride, \
|
||||
TILE_SIZE, \
|
||||
h); \
|
||||
} \
|
||||
if ((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) { \
|
||||
leading_pixels = TILE_SIZE - \
|
||||
(((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
|
||||
if (leading_pixels > w) leading_pixels = w; \
|
||||
blt_rotated_90_trivial_##suffix(dst, \
|
||||
dst_stride, \
|
||||
src, \
|
||||
src_stride, \
|
||||
leading_pixels, \
|
||||
h); \
|
||||
dst += leading_pixels; \
|
||||
src += leading_pixels * src_stride; \
|
||||
w -= leading_pixels; \
|
||||
} \
|
||||
if ((uintptr_t)(dst + w) & (TILE_CACHE_LINE_SIZE - 1)) { \
|
||||
trailing_pixels = (((uintptr_t)(dst + w) & \
|
||||
(TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
|
||||
if (trailing_pixels > w) trailing_pixels = w; \
|
||||
w -= trailing_pixels; \
|
||||
} \
|
||||
for (x = 0; x < w; x += TILE_SIZE) { \
|
||||
blt_rotated_90_trivial_##suffix(dst + x, \
|
||||
dst_stride, \
|
||||
&(src[src_stride * x]), \
|
||||
src_stride, \
|
||||
TILE_SIZE, \
|
||||
h); \
|
||||
} \
|
||||
if (trailing_pixels) \
|
||||
blt_rotated_90_trivial_##suffix(dst + w, \
|
||||
dst_stride, \
|
||||
&(src[(w * src_stride)]), \
|
||||
&(src[src_stride * w]), \
|
||||
src_stride, \
|
||||
trailing_pixels, \
|
||||
h); \
|
||||
} \
|
||||
static void \
|
||||
blt_rotated_270_##suffix(pix_type * restrict dst, \
|
||||
int dst_stride, \
|
||||
blt_rotated_270_##suffix(pix_type * restrict dst, \
|
||||
int dst_stride, \
|
||||
const pix_type * restrict src, \
|
||||
int src_stride, \
|
||||
int w, \
|
||||
int h) \
|
||||
int src_stride, \
|
||||
int w, \
|
||||
int h) \
|
||||
{ \
|
||||
int x, leading_pixels = 0, trailing_pixels = 0; \
|
||||
const int TILE_SIZE = TILE_CACHE_LINE_SIZE / sizeof(pix_type); \
|
||||
if ((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) \
|
||||
{ \
|
||||
leading_pixels = TILE_SIZE - \
|
||||
(((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
|
||||
if (leading_pixels > w) \
|
||||
leading_pixels = w; \
|
||||
blt_rotated_270_trivial_##suffix(dst, \
|
||||
dst_stride, \
|
||||
&(src[(src_stride * (w - leading_pixels))]), \
|
||||
src_stride, \
|
||||
leading_pixels, \
|
||||
h); \
|
||||
dst += leading_pixels; \
|
||||
w -= leading_pixels; \
|
||||
} \
|
||||
if ((uintptr_t)(dst + w) & (TILE_CACHE_LINE_SIZE - 1)) \
|
||||
{ \
|
||||
trailing_pixels = (((uintptr_t)(dst + w) & \
|
||||
(TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
|
||||
if (trailing_pixels > w) \
|
||||
trailing_pixels = w; \
|
||||
w -= trailing_pixels; \
|
||||
src += trailing_pixels * src_stride; \
|
||||
} \
|
||||
for (x = 0; x < w; x += TILE_SIZE) \
|
||||
{ \
|
||||
blt_rotated_270_trivial_##suffix(dst + x, \
|
||||
dst_stride, \
|
||||
&(src[(src_stride * (w - x - TILE_SIZE))]), \
|
||||
src_stride, \
|
||||
TILE_SIZE, \
|
||||
h); \
|
||||
} \
|
||||
if ((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) { \
|
||||
leading_pixels = TILE_SIZE - \
|
||||
(((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
|
||||
if (leading_pixels > w) leading_pixels = w; \
|
||||
blt_rotated_270_trivial_##suffix(dst, \
|
||||
dst_stride, \
|
||||
&(src[src_stride * (w - leading_pixels)]), \
|
||||
src_stride, \
|
||||
leading_pixels, \
|
||||
h); \
|
||||
dst += leading_pixels; \
|
||||
w -= leading_pixels; \
|
||||
} \
|
||||
if ((uintptr_t)(dst + w) & (TILE_CACHE_LINE_SIZE - 1)) { \
|
||||
trailing_pixels = (((uintptr_t)(dst + w) & \
|
||||
(TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
|
||||
if (trailing_pixels > w) trailing_pixels = w; \
|
||||
w -= trailing_pixels; \
|
||||
src += trailing_pixels * src_stride; \
|
||||
} \
|
||||
for (x = 0; x < w; x += TILE_SIZE) { \
|
||||
blt_rotated_270_trivial_##suffix(dst + x, \
|
||||
dst_stride, \
|
||||
&(src[src_stride * (w - x - TILE_SIZE)]), \
|
||||
src_stride, \
|
||||
TILE_SIZE, \
|
||||
h); \
|
||||
} \
|
||||
if (trailing_pixels) \
|
||||
blt_rotated_270_trivial_##suffix(dst + w, \
|
||||
dst_stride, \
|
||||
|
@ -288,12 +263,13 @@ void
|
|||
evas_common_convert_rgba_to_32bpp_rgb_8888_rot_270 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
|
||||
{
|
||||
#ifdef TILE_ROTATE
|
||||
blt_rotated_270_8888((DATA8 *)dst, dst_jump+w, (const DATA8 *)src, src_jump+h, w, h) ;
|
||||
blt_rotated_270_8888((DATA32 *)dst, dst_jump + w,
|
||||
src, src_jump + h,
|
||||
w, h);
|
||||
#else
|
||||
DATA32 *src_ptr;
|
||||
DATA32 *dst_ptr;
|
||||
DATA32 *src_ptr *dst_ptr;
|
||||
int x, y;
|
||||
|
||||
|
||||
dst_ptr = (DATA32 *)dst;
|
||||
|
||||
CONVERT_LOOP_START_ROT_270();
|
||||
|
@ -305,15 +281,32 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_270 (DATA32 *src, DATA8 *dst, int
|
|||
return;
|
||||
}
|
||||
|
||||
/* speed measuring code - enable when optimizing to compare
|
||||
#include <time.h>
|
||||
static double
|
||||
get_time(void)
|
||||
{
|
||||
struct timespec t;
|
||||
|
||||
clock_gettime(CLOCK_MONOTONIC, &t);
|
||||
return (double)t.tv_sec + (((double)t.tv_nsec) / 1000000000.0);
|
||||
}
|
||||
*/
|
||||
|
||||
void
|
||||
evas_common_convert_rgba_to_32bpp_rgb_8888_rot_90 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
|
||||
{
|
||||
/*
|
||||
static double tt = 0.0;
|
||||
static unsigned long long pt = 0;
|
||||
double t0 = get_time();
|
||||
*/
|
||||
#ifdef TILE_ROTATE
|
||||
blt_rotated_90_8888((DATA8 *)dst, dst_jump+w, (const DATA8 *)src, src_jump+h, w, h) ;
|
||||
blt_rotated_90_8888((DATA32 *)dst, dst_jump + w,
|
||||
src, src_jump + h,
|
||||
w, h);
|
||||
#else
|
||||
# ifndef BUILD_NEON
|
||||
DATA32 *src_ptr;
|
||||
DATA32 *dst_ptr;
|
||||
DATA32 *src_ptr, *dst_ptr;
|
||||
int x, y;
|
||||
|
||||
dst_ptr = (DATA32 *)dst;
|
||||
|
@ -322,117 +315,19 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_90 (DATA32 *src, DATA8 *dst, int
|
|||
*dst_ptr = *src_ptr;
|
||||
|
||||
CONVERT_LOOP_END_ROT_90();
|
||||
# elif defined BUILD_NEON_INTRINSICS
|
||||
DATA32 *src_ptr;
|
||||
DATA32 *dst_ptr;
|
||||
int x, y;
|
||||
|
||||
dst_ptr = (DATA32 *)dst;
|
||||
CONVERT_LOOP_START_ROT_90();
|
||||
|
||||
*dst_ptr = *src_ptr;
|
||||
|
||||
CONVERT_LOOP_END_ROT_90();
|
||||
# else
|
||||
if ((w & 1) || (h & 1))
|
||||
{
|
||||
/* Rarely (if ever) taken: so slow path is fine */
|
||||
DATA32 *src_ptr;
|
||||
DATA32 *dst_ptr;
|
||||
int x, y;
|
||||
|
||||
dst_ptr = (DATA32 *)dst;
|
||||
CONVERT_LOOP_START_ROT_90();
|
||||
|
||||
*dst_ptr = *src_ptr;
|
||||
|
||||
CONVERT_LOOP_END_ROT_90();
|
||||
}
|
||||
else
|
||||
{
|
||||
# define AP "convert_rgba32_rot_90_"
|
||||
asm volatile (
|
||||
".fpu neon \n\t"
|
||||
" mov %[s1], %[src] \n\t"
|
||||
" add %[s1], %[s1], %[h],lsl #2 \n\t"
|
||||
" sub %[s1], #8 \n\t"
|
||||
|
||||
" mov %[s2], %[src] \n\t"
|
||||
" add %[s2], %[s2], %[h], lsl #3 \n\t"
|
||||
" add %[s2], %[s2], %[sjmp], lsr #1 \n\t"
|
||||
" sub %[s2], #8 \n\t"
|
||||
|
||||
" mov %[d1], %[dst] \n\t"
|
||||
|
||||
" add %[d2], %[d1], %[djmp] \n\t"
|
||||
" add %[d2], %[d2], %[w], lsl #2 \n\t"
|
||||
|
||||
" mov %[sadv], %[h], lsl #3 \n\t"
|
||||
" add %[sadv], %[sadv], %[sjmp], lsl #1\n\t"
|
||||
|
||||
" mov %[y], #0 \n\t"
|
||||
" mov %[x], #0 \n\t"
|
||||
AP"loop: \n\t"
|
||||
" vld1.u32 d0, [%[s1]] \n\t"
|
||||
" vld1.u32 d1, [%[s2]] \n\t"
|
||||
" add %[x], #2 \n\t"
|
||||
" add %[s1], %[sadv] \n\t"
|
||||
" add %[s2], %[sadv] \n\t"
|
||||
" vtrn.u32 d0, d1 \n\t"
|
||||
" cmp %[x], %[w] \n\t"
|
||||
" vst1.u32 d1, [%[d1]]! \n\t"
|
||||
" vst1.u32 d0, [%[d2]]! \n\t"
|
||||
" blt "AP"loop \n\t"
|
||||
|
||||
" mov %[x], #0 \n\t"
|
||||
" add %[d1], %[djmp] \n\t"
|
||||
" add %[d1], %[d1], %[w], lsl #2 \n\t"
|
||||
" add %[d2], %[djmp] \n\t"
|
||||
" add %[d2], %[d2], %[w], lsl #2 \n\t"
|
||||
|
||||
" mov %[s1], %[src] \n\t"
|
||||
" add %[s1], %[s1], %[h], lsl #2 \n\t"
|
||||
" sub %[s1], %[s1], %[y], lsl #2 \n\t"
|
||||
" sub %[s1], #16 \n\t"
|
||||
|
||||
" add %[s2], %[s1], %[h], lsl #2 \n\t"
|
||||
" add %[s2], %[s2], %[sjmp], lsl #2 \n\t"
|
||||
|
||||
" add %[y], #2 \n\t"
|
||||
|
||||
" cmp %[y], %[h] \n\t"
|
||||
" blt "AP"loop \n\t"
|
||||
|
||||
: // Out
|
||||
: [s1] "r" (1),
|
||||
[s2] "r" (11),
|
||||
[d1] "r" (2),
|
||||
[d2] "r" (12),
|
||||
[src] "r" (src),
|
||||
[dst] "r" (dst),
|
||||
[x] "r" (3),
|
||||
[y] "r" (4),
|
||||
[w] "r" (w),
|
||||
[h] "r" (h),
|
||||
[sadv] "r" (5),
|
||||
[sjmp] "r" (src_jump * 4),
|
||||
[djmp] "r" (dst_jump * 4 * 2)
|
||||
: "d0", "d1", "memory", "cc"// Clobber
|
||||
|
||||
|
||||
);
|
||||
}
|
||||
# undef AP
|
||||
# endif
|
||||
#endif
|
||||
return;
|
||||
/*
|
||||
double t1 = get_time();
|
||||
tt += t1 - t0;
|
||||
pt += (w * h);
|
||||
printf("%1.2f mpix/sec (%1.9f @ %1.9f)\n", (double)pt / (tt * 1000000), tt, t1);
|
||||
*/
|
||||
}
|
||||
|
||||
void
|
||||
evas_common_convert_rgba_to_32bpp_rgbx_8888 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
|
||||
{
|
||||
DATA32 *src_ptr;
|
||||
DATA32 *dst_ptr;
|
||||
DATA32 *src_ptr, *dst_ptr;
|
||||
int x, y;
|
||||
|
||||
dst_ptr = (DATA32 *)dst;
|
||||
|
@ -449,8 +344,7 @@ evas_common_convert_rgba_to_32bpp_rgbx_8888 (DATA32 *src, DATA8 *dst, int src_ju
|
|||
void
|
||||
evas_common_convert_rgba_to_32bpp_rgbx_8888_rot_180 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
|
||||
{
|
||||
DATA32 *src_ptr;
|
||||
DATA32 *dst_ptr;
|
||||
DATA32 *src_ptr, *dst_ptr;
|
||||
int x, y;
|
||||
|
||||
dst_ptr = (DATA32 *)dst;
|
||||
|
@ -467,8 +361,7 @@ evas_common_convert_rgba_to_32bpp_rgbx_8888_rot_180 (DATA32 *src, DATA8 *dst, in
|
|||
void
|
||||
evas_common_convert_rgba_to_32bpp_rgbx_8888_rot_270 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
|
||||
{
|
||||
DATA32 *src_ptr;
|
||||
DATA32 *dst_ptr;
|
||||
DATA32 *src_ptr, *dst_ptr;
|
||||
int x, y;
|
||||
|
||||
dst_ptr = (DATA32 *)dst;
|
||||
|
@ -485,8 +378,7 @@ evas_common_convert_rgba_to_32bpp_rgbx_8888_rot_270 (DATA32 *src, DATA8 *dst, in
|
|||
void
|
||||
evas_common_convert_rgba_to_32bpp_rgbx_8888_rot_90 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
|
||||
{
|
||||
DATA32 *src_ptr;
|
||||
DATA32 *dst_ptr;
|
||||
DATA32 *src_ptr, *dst_ptr;
|
||||
int x, y;
|
||||
|
||||
dst_ptr = (DATA32 *)dst;
|
||||
|
@ -503,8 +395,7 @@ evas_common_convert_rgba_to_32bpp_rgbx_8888_rot_90 (DATA32 *src, DATA8 *dst, int
|
|||
void
|
||||
evas_common_convert_rgba_to_32bpp_bgr_8888 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
|
||||
{
|
||||
DATA32 *src_ptr;
|
||||
DATA32 *dst_ptr;
|
||||
DATA32 *src_ptr, *dst_ptr;
|
||||
int x, y;
|
||||
|
||||
dst_ptr = (DATA32 *)dst;
|
||||
|
@ -520,8 +411,7 @@ evas_common_convert_rgba_to_32bpp_bgr_8888 (DATA32 *src, DATA8 *dst, int src_jum
|
|||
void
|
||||
evas_common_convert_rgba_to_32bpp_bgr_8888_rot_180 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
|
||||
{
|
||||
DATA32 *src_ptr;
|
||||
DATA32 *dst_ptr;
|
||||
DATA32 *src_ptr, *dst_ptr;
|
||||
int x, y;
|
||||
|
||||
dst_ptr = (DATA32 *)dst;
|
||||
|
@ -537,8 +427,7 @@ evas_common_convert_rgba_to_32bpp_bgr_8888_rot_180 (DATA32 *src, DATA8 *dst, int
|
|||
void
|
||||
evas_common_convert_rgba_to_32bpp_bgr_8888_rot_270 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
|
||||
{
|
||||
DATA32 *src_ptr;
|
||||
DATA32 *dst_ptr;
|
||||
DATA32 *src_ptr, *dst_ptr;
|
||||
int x, y;
|
||||
|
||||
dst_ptr = (DATA32 *)dst;
|
||||
|
@ -554,8 +443,7 @@ evas_common_convert_rgba_to_32bpp_bgr_8888_rot_270 (DATA32 *src, DATA8 *dst, int
|
|||
void
|
||||
evas_common_convert_rgba_to_32bpp_bgr_8888_rot_90 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
|
||||
{
|
||||
DATA32 *src_ptr;
|
||||
DATA32 *dst_ptr;
|
||||
DATA32 *src_ptr, *dst_ptr;
|
||||
int x, y;
|
||||
|
||||
dst_ptr = (DATA32 *)dst;
|
||||
|
@ -571,8 +459,7 @@ evas_common_convert_rgba_to_32bpp_bgr_8888_rot_90 (DATA32 *src, DATA8 *dst, int
|
|||
void
|
||||
evas_common_convert_rgba_to_32bpp_bgrx_8888 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
|
||||
{
|
||||
DATA32 *src_ptr;
|
||||
DATA32 *dst_ptr;
|
||||
DATA32 *src_ptr, *dst_ptr;
|
||||
int x, y;
|
||||
|
||||
dst_ptr = (DATA32 *)dst;
|
||||
|
@ -588,8 +475,7 @@ evas_common_convert_rgba_to_32bpp_bgrx_8888 (DATA32 *src, DATA8 *dst, int src_ju
|
|||
void
|
||||
evas_common_convert_rgba_to_32bpp_bgrx_8888_rot_180 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
|
||||
{
|
||||
DATA32 *src_ptr;
|
||||
DATA32 *dst_ptr;
|
||||
DATA32 *src_ptr, *dst_ptr;
|
||||
int x, y;
|
||||
|
||||
dst_ptr = (DATA32 *)dst;
|
||||
|
@ -605,8 +491,7 @@ evas_common_convert_rgba_to_32bpp_bgrx_8888_rot_180 (DATA32 *src, DATA8 *dst, in
|
|||
void
|
||||
evas_common_convert_rgba_to_32bpp_bgrx_8888_rot_270 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
|
||||
{
|
||||
DATA32 *src_ptr;
|
||||
DATA32 *dst_ptr;
|
||||
DATA32 *src_ptr, *dst_ptr;
|
||||
int x, y;
|
||||
|
||||
dst_ptr = (DATA32 *)dst;
|
||||
|
@ -622,8 +507,7 @@ evas_common_convert_rgba_to_32bpp_bgrx_8888_rot_270 (DATA32 *src, DATA8 *dst, in
|
|||
void
|
||||
evas_common_convert_rgba_to_32bpp_bgrx_8888_rot_90 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
|
||||
{
|
||||
DATA32 *src_ptr;
|
||||
DATA32 *dst_ptr;
|
||||
DATA32 *src_ptr, *dst_ptr;
|
||||
int x, y;
|
||||
|
||||
dst_ptr = (DATA32 *)dst;
|
||||
|
@ -639,8 +523,7 @@ evas_common_convert_rgba_to_32bpp_bgrx_8888_rot_90 (DATA32 *src, DATA8 *dst, int
|
|||
void
|
||||
evas_common_convert_rgba_to_32bpp_rgb_666(DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
|
||||
{
|
||||
DATA32 *src_ptr;
|
||||
DATA32 *dst_ptr;
|
||||
DATA32 *src_ptr, *dst_ptr;
|
||||
int x, y;
|
||||
|
||||
dst_ptr = (DATA32 *)dst;
|
||||
|
|
Loading…
Reference in New Issue