From 8e17290f1903d6ec51af517e4ab67d6e959a1843 Mon Sep 17 00:00:00 2001 From: "Snacker (Vladimir)" Date: Fri, 28 Feb 2014 07:04:52 +0900 Subject: [PATCH] @feature - Apply NEON intrinsics improvement to rotation --- configure.ac | 3 + src/Makefile_Evas.am | 16 ++- src/lib/evas/common/evas_convert_rgb_32.c | 153 ++++++++++++++++++---- 3 files changed, 142 insertions(+), 30 deletions(-) diff --git a/configure.ac b/configure.ac index a0baf94d65..de6222b3b9 100644 --- a/configure.ac +++ b/configure.ac @@ -453,6 +453,7 @@ build_cpu_neon="no" SSE3_CFLAGS="" ALTIVEC_CFLAGS="" +NEON_CFLAGS="" case $host_cpu in i*86|x86_64|amd64) @@ -524,6 +525,7 @@ case $host_cpu in AC_MSG_RESULT([yes]) AC_DEFINE([BUILD_NEON], [1], [Build NEON Code]) build_cpu_neon="yes" + NEON_CFLAGS="-mfpu=neon" ], [ AC_MSG_RESULT([no]) @@ -535,6 +537,7 @@ esac AC_SUBST([ALTIVEC_CFLAGS]) AC_SUBST([SSE3_CFLAGS]) +AC_SUBST([NEON_CFLAGS]) #### Checks for linker characteristics diff --git a/src/Makefile_Evas.am b/src/Makefile_Evas.am index 644391b056..1fa48564ef 100644 --- a/src/Makefile_Evas.am +++ b/src/Makefile_Evas.am @@ -138,7 +138,6 @@ lib/evas/common/evas_convert_gry_8.c \ lib/evas/common/evas_convert_main.c \ lib/evas/common/evas_convert_rgb_16.c \ lib/evas/common/evas_convert_rgb_24.c \ -lib/evas/common/evas_convert_rgb_32.c \ lib/evas/common/evas_convert_rgb_8.c \ lib/evas/common/evas_convert_grypal_6.c \ lib/evas/common/evas_convert_yuv.c \ @@ -230,13 +229,28 @@ $(lib_evas_libevas_la_CPPFLAGS) \ lib_evas_common_libevas_op_blend_sse3_la_LIBADD = @EVAS_LIBS@ lib_evas_common_libevas_op_blend_sse3_la_DEPENDENCIES = @EVAS_INTERNAL_LIBS@ +# maybe neon, maybe not +noinst_LTLIBRARIES += lib/evas/common/libevas_convert_rgb_32.la + +lib_evas_common_libevas_convert_rgb_32_la_SOURCES = \ +lib/evas/common/evas_convert_rgb_32.c + +lib_evas_common_libevas_convert_rgb_32_la_CPPFLAGS = -I$(top_builddir)/src/lib/efl \ +$(lib_evas_libevas_la_CPPFLAGS) \ +@NEON_CFLAGS@ + +lib_evas_common_libevas_convert_rgb_32_la_LIBADD 
= @EVAS_LIBS@ +lib_evas_common_libevas_convert_rgb_32_la_DEPENDENCIES = @EVAS_INTERNAL_LIBS@ + lib_evas_libevas_la_CXXFLAGS = lib_evas_libevas_la_LIBADD = \ lib/evas/common/libevas_op_blend_sse3.la \ +lib/evas/common/libevas_convert_rgb_32.la \ @EVAS_LIBS@ lib_evas_libevas_la_DEPENDENCIES = \ lib/evas/common/libevas_op_blend_sse3.la \ +lib/evas/common/libevas_convert_rgb_32.la \ @EVAS_INTERNAL_LIBS@ lib_evas_libevas_la_LDFLAGS = @EFL_LTLIBRARY_FLAGS@ diff --git a/src/lib/evas/common/evas_convert_rgb_32.c b/src/lib/evas/common/evas_convert_rgb_32.c index 11c47e26b0..aae9d37e12 100644 --- a/src/lib/evas/common/evas_convert_rgb_32.c +++ b/src/lib/evas/common/evas_convert_rgb_32.c @@ -1,5 +1,8 @@ #include "evas_common_private.h" #include "evas_convert_rgb_32.h" +#ifdef BUILD_NEON +#include <arm_neon.h> +#endif void evas_common_convert_rgba_to_32bpp_rgb_8888 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED) @@ -41,51 +44,143 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_180 (DATA32 *src, DATA8 *dst, int } #ifdef TILE_ROTATE +#ifdef BUILD_NEON +#define ROT90_QUAD_COPY_LOOP \ + if (evas_common_cpu_has_feature(CPU_FEATURE_NEON)) \ + { \ + if((w%4) == 0) \ + { \ + int klght = 4 * src_stride; \ + for(y = 0; y < h; y++) \ + { \ + const pix_type *s = &(src[(h - y - 1)]); \ + pix_type *d = &(dst[(dst_stride * y)]); \ + pix_type *ptr1 = s; \ + pix_type *ptr2 = ptr1 + src_stride; \ + pix_type *ptr3 = ptr2 + src_stride; \ + pix_type *ptr4 = ptr3 + src_stride; \ + for(x = 0; x < w; x+=4) \ + { \ + pix_type s_array[4] = {*ptr1, *ptr2, *ptr3, *ptr4}; \ + vst1q_s32(d, vld1q_s32(s_array)); \ + d += 4; \ + ptr1 += klght; \ + ptr2 += klght; \ + ptr3 += klght; \ + ptr4 += klght; \ + } \ + } \ + } \ + else \ + { \ + for (y = 0; y < h; y++) \ + { \ + const pix_type *s = &(src[(h - y - 1)]); \ + pix_type *d = &(dst[(dst_stride * y)]); \ + for (x = 0; x < w; x++) \ + { \ + *d++ = *s; \ + s += src_stride; \ + } \ 
+ } \ + } \ + } \ + else +#define ROT270_QUAD_COPY_LOOP \ + if (evas_common_cpu_has_feature(CPU_FEATURE_NEON)) \ + { \ + if((w%4) == 0) \ + { \ + int klght = 4 * src_stride; \ + for(y = 0; y < h; y++) \ + { \ + const pix_type *s = &(src[(src_stride * (w - 1)) + y]); \ + pix_type *d = &(dst[(dst_stride * y)]); \ + pix_type *ptr1 = s; \ + pix_type *ptr2 = ptr1 + src_stride; \ + pix_type *ptr3 = ptr2 + src_stride; \ + pix_type *ptr4 = ptr3 + src_stride; \ + for(x = 0; x < w; x+=4) \ + { \ + pix_type s_array[4] = {*ptr1, *ptr2, *ptr3, *ptr4}; \ + vst1q_s32(d, vld1q_s32(s_array)); \ + d += 4; \ + ptr1 += klght; \ + ptr2 += klght; \ + ptr3 += klght; \ + ptr4 += klght; \ + } \ + } \ + } \ + else \ + { \ + for (y = 0; y < h; y++) \ + { \ + const pix_type *s = &(src[(src_stride * (w - 1)) + y]); \ + pix_type *d = &(dst[(dst_stride * y)]); \ + for (x = 0; x < w; x++) \ + { \ + *d++ = *s; \ + s += src_stride; \ + } \ + } \ + } \ + } \ + else +#else +#define ROT90_QUAD_COPY_LOOP +#define ROT270_QUAD_COPY_LOOP +#endif #define FAST_SIMPLE_ROTATE(suffix, pix_type) \ static void \ - blt_rotated_90_trivial_##suffix(pix_type *dst, \ + blt_rotated_90_trivial_##suffix(pix_type * restrict dst, \ int dst_stride, \ - const pix_type *src, \ + const pix_type * restrict src, \ int src_stride, \ int w, \ int h) \ { \ int x, y; \ - for (y = 0; y < h; y++) \ - { \ - const pix_type *s = src + (h - y - 1); \ - pix_type *d = dst + (dst_stride * y); \ - for (x = 0; x < w; x++) \ - { \ - *d++ = *s; \ - s += src_stride; \ - } \ - } \ + ROT90_QUAD_COPY_LOOP \ + { \ + for (y = 0; y < h; y++) \ + { \ + const pix_type *s = &(src[(h - y - 1)]); \ + pix_type *d = &(dst[(dst_stride * y)]); \ + for (x = 0; x < w; x++) \ + { \ + *d++ = *s; \ + s += src_stride; \ + } \ + } \ + } \ } \ static void \ - blt_rotated_270_trivial_##suffix(pix_type *dst, \ + blt_rotated_270_trivial_##suffix(pix_type * restrict dst, \ int dst_stride, \ - const pix_type *src, \ + const pix_type * restrict src, \ int src_stride, \ int w, \ 
int h) \ { \ int x, y; \ - for (y = 0; y < h; y++) \ + ROT270_QUAD_COPY_LOOP \ + { \ + for(y = 0; y < h; y++) \ { \ - const pix_type *s = src + (src_stride * (w - 1)) + y; \ - pix_type *d = dst + (dst_stride * y); \ + const pix_type *s = &(src[(src_stride * (w - 1)) + y]); \ + pix_type *d = &(dst[(dst_stride * y)]); \ for (x = 0; x < w; x++) \ - { \ - *d++ = *s; \ - s -= src_stride; \ - } \ + { \ + *d++ = *s; \ + s -= src_stride; \ + } \ } \ + } \ } \ static void \ - blt_rotated_90_##suffix(pix_type *dst, \ + blt_rotated_90_##suffix(pix_type * restrict dst, \ int dst_stride, \ - const pix_type *src, \ + const pix_type * restrict src, \ int src_stride, \ int w, \ int h) \ @@ -120,7 +215,7 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_180 (DATA32 *src, DATA8 *dst, int { \ blt_rotated_90_trivial_##suffix(dst + x, \ dst_stride, \ - src + (src_stride * x), \ + &(src[(src_stride * x)]), \ src_stride, \ TILE_SIZE, \ h); \ @@ -128,15 +223,15 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_180 (DATA32 *src, DATA8 *dst, int if (trailing_pixels) \ blt_rotated_90_trivial_##suffix(dst + w, \ dst_stride, \ - src + (w * src_stride), \ + &(src[(w * src_stride)]), \ src_stride, \ trailing_pixels, \ h); \ } \ static void \ - blt_rotated_270_##suffix(pix_type *dst, \ + blt_rotated_270_##suffix(pix_type * restrict dst, \ int dst_stride, \ - const pix_type *src, \ + const pix_type * restrict src, \ int src_stride, \ int w, \ int h) \ @@ -151,7 +246,7 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_180 (DATA32 *src, DATA8 *dst, int leading_pixels = w; \ blt_rotated_270_trivial_##suffix(dst, \ dst_stride, \ - src + (src_stride * (w - leading_pixels)), \ + &(src[(src_stride * (w - leading_pixels))]), \ src_stride, \ leading_pixels, \ h); \ @@ -171,7 +266,7 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_180 (DATA32 *src, DATA8 *dst, int { \ blt_rotated_270_trivial_##suffix(dst + x, \ dst_stride, \ - src + (src_stride * (w - x - TILE_SIZE)), \ + &(src[(src_stride * (w - x - 
TILE_SIZE))]), \ src_stride, \ TILE_SIZE, \ h); \