From 71eec44ccc9ab43e728ba986fadce6c6cfd2ff7c Mon Sep 17 00:00:00 2001 From: Yury Usishchev Date: Wed, 15 Apr 2015 17:21:33 +0200 Subject: [PATCH] evas: enable NEON-optimized code for aarch64. Summary: Add new define, BUILD_NEON_INTRINSICS to control whether NEON inline code or NEON intrinsics should be built. GCC NEON intrinsics can be built both for armv7 and armv8. However NEON inline code can be built only for armv7. @feature Reviewers: raster, stefan_schmidt, cedric Subscribers: cedric, stefan_schmidt Projects: #efl Differential Revision: https://phab.enlightenment.org/D2309 Signed-off-by: Cedric BAIL --- configure.ac | 18 +++++++ src/lib/evas/common/evas_blit_main.c | 8 ++++ src/lib/evas/common/evas_cpu.c | 9 ++++ .../evas_op_blend/op_blend_color_neon.c | 10 +++- .../evas_op_blend/op_blend_mask_color_neon.c | 47 +++++++++++++++++++ .../evas_op_blend/op_blend_pixel_color_neon.c | 14 +++++- .../evas_op_blend/op_blend_pixel_neon.c | 33 ++++++++++++- .../common/evas_op_copy/op_copy_color_neon.c | 9 ++++ 8 files changed, 145 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index 9eed98ce9b..63cc54ddce 100644 --- a/configure.ac +++ b/configure.ac @@ -576,6 +576,21 @@ case $host_cpu in CFLAGS="${CFLAGS_save}" fi ;; + aarch64*) + if test "x${want_neon}" = "xyes"; then + build_cpu_neon="yes" + AC_MSG_CHECKING([whether to use NEON instructions]) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]], [[volatile uint32x4_t test = vdupq_n_u32(0x1);]])],[ + AC_MSG_RESULT([yes]) + AC_DEFINE([BUILD_NEON], [1], [Build NEON Code]) + AC_DEFINE([BUILD_NEON_INTRINSICS], [1], [Build NEON Intrinsics]) + build_cpu_neon="yes" + ],[ + AC_MSG_RESULT([no]) + build_cpu_neon="no" + ]) + fi + ;; esac AC_SUBST([ALTIVEC_CFLAGS]) @@ -4741,6 +4756,9 @@ case $host_cpu in arm*) EFL_ADD_FEATURE([cpu], [neon], [${build_cpu_neon}]) ;; + aarch64*) + EFL_ADD_FEATURE([cpu], [neon], [${build_cpu_neon}]) + ;; esac if test "${have_linux}" = "yes"; then diff --git a/src/lib/evas/common/evas_blit_main.c b/src/lib/evas/common/evas_blit_main.c index 7f8faa18a4..4da4034742 100644 --- a/src/lib/evas/common/evas_blit_main.c +++ b/src/lib/evas/common/evas_blit_main.c @@ -132,6 +132,9 @@ evas_common_copy_rev_pixels_c(DATA32 *src, DATA32 *dst, int len) static void evas_common_copy_pixels_rev_neon(DATA32 *src, DATA32 *dst, int len) { +#ifdef BUILD_NEON_INTRINSICS +evas_common_copy_pixels_rev_c(src, dst, len); +#else uint32_t *tmp = (void *)37; #define AP "evas_common_copy_rev_pixels_neon_" asm volatile ( @@ -228,6 +231,7 @@ evas_common_copy_pixels_rev_neon(DATA32 *src, DATA32 *dst, int len) ); #undef AP +#endif } #endif @@ -324,6 +328,9 @@ evas_common_copy_pixels_mmx2(DATA32 *src, DATA32 *dst, int len) #ifdef BUILD_NEON static void evas_common_copy_pixels_neon(DATA32 *src, DATA32 *dst, int len){ +#ifdef BUILD_NEON_INTRINSICS +evas_common_copy_pixels_c(src, dst, len); +#else uint32_t *e,*tmp = (void *)37; e = dst + len; #define AP "evas_common_copy_pixels_neon_" @@ -410,6 +417,7 @@ evas_common_copy_pixels_neon(DATA32 *src, DATA32 *dst, int len){ ); #undef AP +#endif } #endif /* BUILD_NEON */ diff --git a/src/lib/evas/common/evas_cpu.c b/src/lib/evas/common/evas_cpu.c index 41390989d8..0f83258806 100644 --- a/src/lib/evas/common/evas_cpu.c +++ b/src/lib/evas/common/evas_cpu.c @@ -2,6 +2,11 @@ #ifdef BUILD_MMX #include "evas_mmx.h" #endif +#ifdef BUILD_NEON +#ifdef BUILD_NEON_INTRINSICS +#include +#endif +#endif #if defined BUILD_SSE3 #include #endif @@ -92,6 +97,9 @@ evas_common_cpu_neon_test(void) { //#if defined(__ARM_ARCH__) && (__ARM_ARCH__ >= 70) #ifdef BUILD_NEON +#ifdef BUILD_NEON_INTRINSICS + volatile uint32x4_t temp = vdupq_n_u32(0x1); +#else asm volatile ( ".fpu neon \n\t" "vqadd.u8 d0, d1, d0\n" @@ -101,6 +109,7 @@ evas_common_cpu_neon_test(void) "d0", "d1" ); #endif +#endif //#endif } diff --git a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c index 9e94298cc6..2bf14c1f7c 100644 --- a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c +++ b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c @@ -3,6 +3,14 @@ #ifdef BUILD_NEON static void _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) { +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e, a = 256 - (c >> 24); + UNROLL8_PLD_WHILE(d, l, e, + { + *d = c + MUL_256(a, *d); + d++; + }); +#else DATA32 *e, *tmp = 0; #define AP "B_C_DP" asm volatile ( @@ -142,7 +150,7 @@ _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA3 ); #undef AP - +#endif } #define _op_blend_caa_dp_neon _op_blend_c_dp_neon diff --git a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c index 99f4b38625..dbeb0638b3 100644 --- a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c +++ b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c @@ -19,6 +19,30 @@ #ifdef BUILD_NEON static void _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) { +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e; + int alpha = 256 - (c >> 24); + UNROLL8_PLD_WHILE(d, l, e, + { + DATA32 a = *m; + switch(a) + { + case 0: + break; + case 255: + *d = c + MUL_256(alpha, *d); + break; + default: + { + DATA32 mc = MUL_SYM(a, c); + a = 256 - (mc >> 24); + *d = mc + MUL_256(a, *d); + } + break; + } + m++; d++; + }); +#else DATA32 *e = d + l; // everything we can do only once per cycle @@ -142,12 +166,34 @@ _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, in "q10", "q15", "q14", "memory" ); } +#endif } #endif #ifdef BUILD_NEON static void _op_blend_mas_can_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) { +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e; + int alpha; + UNROLL8_PLD_WHILE(d, l, e, + { + alpha = *m; + switch(alpha) + { + case 0: + break; + case 255: + *d = c; + break; + default: + alpha++; + *d = INTERP_256(alpha, c, *d); + break; + } + m++; d++; + }); +#else DATA32 *e,*tmp; int alpha; @@ -372,6 +418,7 @@ _op_blend_mas_can_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, ); #undef AP +#endif } #endif diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c index d6b3a733dd..c47ec7c47f 100644 --- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c +++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c @@ -7,7 +7,18 @@ * reads, then two writes, a miss on read is 'just' two reads */ static void _op_blend_p_c_dp_neon(DATA32 * __restrict s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 * __restrict d, int l) { - +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e; + int alpha; + UNROLL8_PLD_WHILE(d, l, e, + { + DATA32 sc = MUL4_SYM(c, *s); + alpha = 256 - (sc >> 24); + *d = sc + MUL_256(alpha, *d); + d++; + s++; + }); +#else #define AP "blend_p_c_dp_" asm volatile ( ".fpu neon\n\t" @@ -92,6 +103,7 @@ _op_blend_p_c_dp_neon(DATA32 * __restrict s, DATA8 *m EINA_UNUSED, DATA32 c, DAT : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "memory" ); #undef AP +#endif } static void diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c index 4b9993b42b..3c32790c81 100644 --- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c +++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c @@ -3,6 +3,16 @@ #ifdef BUILD_NEON static void _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e; + int alpha; + UNROLL8_PLD_WHILE(d, l, e, + { + alpha = 256 - (*s >> 24); + *d = *s++ + MUL_256(alpha, *d); + d++; + }); +#else #define AP "blend_p_dp_" asm volatile ( ".fpu neon \n\t" @@ -238,11 +248,31 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q8","memory" // clobbered ); #undef AP - +#endif } static void _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e; + int alpha; + UNROLL8_PLD_WHILE(d, l, e, + { + switch (*s & 0xff000000) + { + case 0: + break; + case 0xff000000: + *d = *s; + break; + default: + alpha = 256 - (*s >> 24); + *d = *s + MUL_256(alpha, *d); + break; + } + s++; d++; + }); +#else #define AP "blend_pas_dp_" DATA32 *e = d + l,*tmp = e + 32,*pl=(void*)912; asm volatile ( @@ -447,6 +477,7 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { "q0","q1","q2","q3","q4","q5","q6","q7","q8","memory" ); #undef AP +#endif } #define _op_blend_pan_dp_neon NULL diff --git a/src/lib/evas/common/evas_op_copy/op_copy_color_neon.c b/src/lib/evas/common/evas_op_copy/op_copy_color_neon.c index 96310cdf3a..009bd750ea 100644 --- a/src/lib/evas/common/evas_op_copy/op_copy_color_neon.c +++ b/src/lib/evas/common/evas_op_copy/op_copy_color_neon.c @@ -3,6 +3,14 @@ #ifdef BUILD_NEON static void _op_copy_c_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e; + UNROLL8_PLD_WHILE(d, l, e, + { + *d = c; + d++; + }); +#else #define AP "COPY_C_DP_" uint32_t *e = d + l,*tmp; asm volatile ( @@ -85,6 +93,7 @@ _op_copy_c_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { ); +#endif } #define _op_copy_cn_dp_neon _op_copy_c_dp_neon