evas: enable NEON-optimized code for aarch64.

Summary: Add new define, BUILD_NEON_INTRINSICS to control whether NEON inline code or NEON intrinsics should be built. GCC NEON intrinsics can be built both for armv7 and armv8. However NEON inline code can be built only for armv7. @feature Reviewers: raster, stefan_schmidt, cedric Subscribers: cedric, stefan_schmidt Projects: #efl Differential Revision: https://phab.enlightenment.org/D2309 Signed-off-by: Cedric BAIL <cedric@osg.samsung.com>
2015-04-15 17:21:33 +02:00 · 2015-04-15 17:21:33 +02:00 · 71eec44ccc
parent 88b30ef28c
commit 71eec44ccc
8 changed files with 145 additions and 3 deletions
--- a/configure.ac
+++ b/configure.ac
@ -576,6 +576,21 @@ case $host_cpu in
       CFLAGS="${CFLAGS_save}"
    fi
    ;;
+  aarch64*)
+    if test "x${want_neon}" = "xyes"; then
+       build_cpu_neon="yes"
+       AC_MSG_CHECKING([whether to use NEON instructions])
+       AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <arm_neon.h>]], [[volatile uint32x4_t test = vdupq_n_u32(0x1);]])],[
+           AC_MSG_RESULT([yes])
+           AC_DEFINE([BUILD_NEON], [1], [Build NEON Code])
+           AC_DEFINE([BUILD_NEON_INTRINSICS], [1], [Build NEON Intrinsics])
+           build_cpu_neon="yes"
+       	 ],[
+	   AC_MSG_RESULT([no])
+           build_cpu_neon="no"
+       	 ])
+    fi
+    ;;
 esac

 AC_SUBST([ALTIVEC_CFLAGS])
@ -4741,6 +4756,9 @@ case $host_cpu in
  arm*)
    EFL_ADD_FEATURE([cpu], [neon], [${build_cpu_neon}])
    ;;
+  aarch64*)
+    EFL_ADD_FEATURE([cpu], [neon], [${build_cpu_neon}])
+    ;;
 esac

 if test "${have_linux}" = "yes"; then
--- a/src/lib/evas/common/evas_blit_main.c
+++ b/src/lib/evas/common/evas_blit_main.c
@ -132,6 +132,9 @@ evas_common_copy_rev_pixels_c(DATA32 *src, DATA32 *dst, int len)
 static void
 evas_common_copy_pixels_rev_neon(DATA32 *src, DATA32 *dst, int len)
 {
+#ifdef BUILD_NEON_INTRINSICS
+evas_common_copy_pixels_rev_c(src, dst, len);
+#else
   uint32_t *tmp = (void *)37;
 #define AP	"evas_common_copy_rev_pixels_neon_"
   asm volatile (
@ -228,6 +231,7 @@ evas_common_copy_pixels_rev_neon(DATA32 *src, DATA32 *dst, int len)
   );
 #undef AP

+#endif
 }
 #endif

@ -324,6 +328,9 @@ evas_common_copy_pixels_mmx2(DATA32 *src, DATA32 *dst, int len)
 #ifdef BUILD_NEON
 static void
 evas_common_copy_pixels_neon(DATA32 *src, DATA32 *dst, int len){
+#ifdef BUILD_NEON_INTRINSICS
+evas_common_copy_pixels_c(src, dst, len);
+#else
   uint32_t *e,*tmp = (void *)37;
   e = dst + len;
 #define AP	"evas_common_copy_pixels_neon_"
@ -410,6 +417,7 @@ evas_common_copy_pixels_neon(DATA32 *src, DATA32 *dst, int len){
   );
 #undef AP

+#endif
 }
 #endif /* BUILD_NEON */

--- a/src/lib/evas/common/evas_cpu.c
+++ b/src/lib/evas/common/evas_cpu.c
@ -2,6 +2,11 @@
 #ifdef BUILD_MMX
 #include "evas_mmx.h"
 #endif
+#ifdef BUILD_NEON
+#ifdef BUILD_NEON_INTRINSICS
+#include <arm_neon.h>
+#endif
+#endif
 #if defined BUILD_SSE3
 #include <immintrin.h>
 #endif
@ -92,6 +97,9 @@ evas_common_cpu_neon_test(void)
 {
 //#if defined(__ARM_ARCH__) && (__ARM_ARCH__ >= 70)
 #ifdef BUILD_NEON
+#ifdef BUILD_NEON_INTRINSICS
+   volatile uint32x4_t temp = vdupq_n_u32(0x1);
+#else
   asm volatile (
 		".fpu neon	     \n\t"
                 "vqadd.u8 d0, d1, d0\n"
@ -101,6 +109,7 @@ evas_common_cpu_neon_test(void)
                 "d0", "d1"
                 );
 #endif
+#endif
 //#endif
 }

--- a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c
@ -3,6 +3,14 @@
 #ifdef BUILD_NEON
 static void
 _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
+#ifdef BUILD_NEON_INTRINSICS
+    DATA32 *e, a = 256 - (c >> 24);
+    UNROLL8_PLD_WHILE(d, l, e,
+                      {
+                         *d = c + MUL_256(a, *d);
+                         d++;
+                      });
+#else
 	DATA32 *e, *tmp = 0;
 #define AP	"B_C_DP"
   asm volatile (
@ -142,7 +150,7 @@ _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA3

 	);
 #undef AP
-
+#endif
 }

 #define _op_blend_caa_dp_neon _op_blend_c_dp_neon
--- a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
@ -19,6 +19,30 @@
 #ifdef BUILD_NEON
 static void
 _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
+#ifdef BUILD_NEON_INTRINSICS
+   DATA32 *e;
+   int alpha = 256 - (c >> 24);
+   UNROLL8_PLD_WHILE(d, l, e,
+                     {
+                        DATA32 a = *m;
+                        switch(a)
+                          {
+                          case 0:
+                             break;
+                          case 255:
+                             *d = c + MUL_256(alpha, *d);
+                             break;
+                          default:
+                               {
+                                  DATA32 mc = MUL_SYM(a, c);
+                                  a = 256 - (mc >> 24);
+                                  *d = mc + MUL_256(a, *d);
+                               }
+                             break;
+                          }
+                        m++;  d++;
+                     });
+#else
   DATA32 *e = d + l;

   // everything we can do only once per cycle
@ -142,12 +166,34 @@ _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, in
                                "q10", "q15", "q14", "memory"
        );
    }
+#endif
 }
 #endif

 #ifdef BUILD_NEON
 static void
 _op_blend_mas_can_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
+#ifdef BUILD_NEON_INTRINSICS
+   DATA32 *e;
+   int alpha;
+   UNROLL8_PLD_WHILE(d, l, e,
+                     {
+                        alpha = *m;
+                        switch(alpha)
+                          {
+                          case 0:
+                             break;
+                          case 255:
+                             *d = c;
+                             break;
+                          default:
+                             alpha++;
+                             *d = INTERP_256(alpha, c, *d);
+                             break;
+                          }
+                        m++;  d++;
+                     });
+#else
   DATA32 *e,*tmp;
   int alpha;

@ -372,6 +418,7 @@ _op_blend_mas_can_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d,

     );
 #undef AP
+#endif
 }
 #endif

--- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c
@ -7,7 +7,18 @@
 * reads, then two writes, a miss on read is 'just' two reads */
 static void
 _op_blend_p_c_dp_neon(DATA32 * __restrict s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 * __restrict d, int l) {
-
+#ifdef BUILD_NEON_INTRINSICS
+   DATA32 *e;
+   int alpha;
+   UNROLL8_PLD_WHILE(d, l, e,
+                     {
+                        DATA32 sc = MUL4_SYM(c, *s);
+                        alpha = 256 - (sc >> 24);
+                        *d = sc + MUL_256(alpha, *d);
+                        d++;
+                        s++;
+                     });
+#else
 #define AP	"blend_p_c_dp_"
   asm volatile (
      ".fpu neon\n\t"
@ -92,6 +103,7 @@ _op_blend_p_c_dp_neon(DATA32 * __restrict s, DATA8 *m EINA_UNUSED, DATA32 c, DAT
      : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "memory"
   );
 #undef AP
+#endif
 }

 static void
--- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c
@ -3,6 +3,16 @@
 #ifdef BUILD_NEON
 static void
 _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
+#ifdef BUILD_NEON_INTRINSICS
+   DATA32 *e;
+   int alpha;
+   UNROLL8_PLD_WHILE(d, l, e,
+                     {
+                        alpha = 256 - (*s >> 24);
+                        *d = *s++ + MUL_256(alpha, *d);
+                        d++;
+                     });
+#else
 #define AP "blend_p_dp_"
  asm volatile (
 	".fpu neon					\n\t"
@ -238,11 +248,31 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
          : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q8","memory" // clobbered
   );
 #undef AP
-
+#endif
 }

 static void
 _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
+#ifdef BUILD_NEON_INTRINSICS
+   DATA32 *e;
+   int alpha;
+   UNROLL8_PLD_WHILE(d, l, e,
+                     {
+                        switch (*s & 0xff000000)
+                          {
+                          case 0:
+                             break;
+                          case 0xff000000:
+                             *d = *s;
+                             break;
+                          default:
+                             alpha = 256 - (*s >> 24);
+                             *d = *s + MUL_256(alpha, *d);
+                             break;
+                          }
+                        s++;  d++;
+                     });
+#else
 #define AP "blend_pas_dp_"
   DATA32 *e = d + l,*tmp  = e + 32,*pl=(void*)912;
      asm volatile (
@ -447,6 +477,7 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
 		 "q0","q1","q2","q3","q4","q5","q6","q7","q8","memory"
      );
 #undef AP
+#endif
 }

 #define _op_blend_pan_dp_neon NULL
--- a/src/lib/evas/common/evas_op_copy/op_copy_color_neon.c
+++ b/src/lib/evas/common/evas_op_copy/op_copy_color_neon.c
@ -3,6 +3,14 @@
 #ifdef BUILD_NEON
 static void
 _op_copy_c_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
+#ifdef BUILD_NEON_INTRINSICS
+   DATA32 *e;
+   UNROLL8_PLD_WHILE(d, l, e,
+                     {
+                        *d = c;
+                        d++;
+                     });
+#else
 #define AP "COPY_C_DP_"
   uint32_t *e = d + l,*tmp;
   asm volatile (
@ -85,6 +93,7 @@ _op_copy_c_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {


   );
+#endif
 }

 #define _op_copy_cn_dp_neon _op_copy_c_dp_neon