diff --git a/configure.ac b/configure.ac index 9eed98ce9b..63cc54ddce 100644 --- a/configure.ac +++ b/configure.ac @@ -576,6 +576,21 @@ case $host_cpu in CFLAGS="${CFLAGS_save}" fi ;; + aarch64*) + if test "x${want_neon}" = "xyes"; then + build_cpu_neon="yes" + AC_MSG_CHECKING([whether to use NEON instructions]) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]], [[volatile uint32x4_t test = vdupq_n_u32(0x1);]])],[ + AC_MSG_RESULT([yes]) + AC_DEFINE([BUILD_NEON], [1], [Build NEON Code]) + AC_DEFINE([BUILD_NEON_INTRINSICS], [1], [Build NEON Intrinsics]) + build_cpu_neon="yes" + ],[ + AC_MSG_RESULT([no]) + build_cpu_neon="no" + ]) + fi + ;; esac AC_SUBST([ALTIVEC_CFLAGS]) @@ -4741,6 +4756,9 @@ case $host_cpu in arm*) EFL_ADD_FEATURE([cpu], [neon], [${build_cpu_neon}]) ;; + aarch64*) + EFL_ADD_FEATURE([cpu], [neon], [${build_cpu_neon}]) + ;; esac if test "${have_linux}" = "yes"; then diff --git a/src/lib/evas/common/evas_blit_main.c b/src/lib/evas/common/evas_blit_main.c index 7f8faa18a4..4da4034742 100644 --- a/src/lib/evas/common/evas_blit_main.c +++ b/src/lib/evas/common/evas_blit_main.c @@ -132,6 +132,9 @@ evas_common_copy_rev_pixels_c(DATA32 *src, DATA32 *dst, int len) static void evas_common_copy_pixels_rev_neon(DATA32 *src, DATA32 *dst, int len) { +#ifdef BUILD_NEON_INTRINSICS +evas_common_copy_pixels_rev_c(src, dst, len); +#else uint32_t *tmp = (void *)37; #define AP "evas_common_copy_rev_pixels_neon_" asm volatile ( @@ -228,6 +231,7 @@ evas_common_copy_pixels_rev_neon(DATA32 *src, DATA32 *dst, int len) ); #undef AP +#endif } #endif @@ -324,6 +328,9 @@ evas_common_copy_pixels_mmx2(DATA32 *src, DATA32 *dst, int len) #ifdef BUILD_NEON static void evas_common_copy_pixels_neon(DATA32 *src, DATA32 *dst, int len){ +#ifdef BUILD_NEON_INTRINSICS +evas_common_copy_pixels_c(src, dst, len); +#else uint32_t *e,*tmp = (void *)37; e = dst + len; #define AP "evas_common_copy_pixels_neon_" @@ -410,6 +417,7 @@ evas_common_copy_pixels_neon(DATA32 *src, DATA32 *dst, int len){ ); #undef AP +#endif } #endif /* BUILD_NEON */ diff --git a/src/lib/evas/common/evas_cpu.c b/src/lib/evas/common/evas_cpu.c index 41390989d8..0f83258806 100644 --- a/src/lib/evas/common/evas_cpu.c +++ b/src/lib/evas/common/evas_cpu.c @@ -2,6 +2,11 @@ #ifdef BUILD_MMX #include "evas_mmx.h" #endif +#ifdef BUILD_NEON +#ifdef BUILD_NEON_INTRINSICS +#include +#endif +#endif #if defined BUILD_SSE3 #include #endif @@ -92,6 +97,9 @@ evas_common_cpu_neon_test(void) { //#if defined(__ARM_ARCH__) && (__ARM_ARCH__ >= 70) #ifdef BUILD_NEON +#ifdef BUILD_NEON_INTRINSICS + volatile uint32x4_t temp = vdupq_n_u32(0x1); +#else asm volatile ( ".fpu neon \n\t" "vqadd.u8 d0, d1, d0\n" @@ -101,6 +109,7 @@ evas_common_cpu_neon_test(void) "d0", "d1" ); #endif +#endif //#endif } diff --git a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c index 9e94298cc6..2bf14c1f7c 100644 --- a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c +++ b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c @@ -3,6 +3,14 @@ #ifdef BUILD_NEON static void _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) { +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e, a = 256 - (c >> 24); + UNROLL8_PLD_WHILE(d, l, e, + { + *d = c + MUL_256(a, *d); + d++; + }); +#else DATA32 *e, *tmp = 0; #define AP "B_C_DP" asm volatile ( @@ -142,7 +150,7 @@ _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA3 ); #undef AP - +#endif } #define _op_blend_caa_dp_neon _op_blend_c_dp_neon diff --git a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c index 99f4b38625..dbeb0638b3 100644 --- a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c +++ b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c @@ -19,6 +19,30 @@ #ifdef BUILD_NEON static void _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) { +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e; + int alpha = 256 - (c >> 24); + UNROLL8_PLD_WHILE(d, l, e, + { + DATA32 a = *m; + switch(a) + { + case 0: + break; + case 255: + *d = c + MUL_256(alpha, *d); + break; + default: + { + DATA32 mc = MUL_SYM(a, c); + a = 256 - (mc >> 24); + *d = mc + MUL_256(a, *d); + } + break; + } + m++; d++; + }); +#else DATA32 *e = d + l; // everything we can do only once per cycle @@ -142,12 +166,34 @@ _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, in "q10", "q15", "q14", "memory" ); } +#endif } #endif #ifdef BUILD_NEON static void _op_blend_mas_can_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) { +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e; + int alpha; + UNROLL8_PLD_WHILE(d, l, e, + { + alpha = *m; + switch(alpha) + { + case 0: + break; + case 255: + *d = c; + break; + default: + alpha++; + *d = INTERP_256(alpha, c, *d); + break; + } + m++; d++; + }); +#else DATA32 *e,*tmp; int alpha; @@ -372,6 +418,7 @@ _op_blend_mas_can_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, ); #undef AP +#endif } #endif diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c index d6b3a733dd..c47ec7c47f 100644 --- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c +++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c @@ -7,7 +7,18 @@ * reads, then two writes, a miss on read is 'just' two reads */ static void _op_blend_p_c_dp_neon(DATA32 * __restrict s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 * __restrict d, int l) { - +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e; + int alpha; + UNROLL8_PLD_WHILE(d, l, e, + { + DATA32 sc = MUL4_SYM(c, *s); + alpha = 256 - (sc >> 24); + *d = sc + MUL_256(alpha, *d); + d++; + s++; + }); +#else #define AP "blend_p_c_dp_" asm volatile ( ".fpu neon\n\t" @@ -92,6 +103,7 @@ _op_blend_p_c_dp_neon(DATA32 * __restrict s, DATA8 *m EINA_UNUSED, DATA32 c, DAT : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "memory" ); #undef AP +#endif } static void diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c index 4b9993b42b..3c32790c81 100644 --- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c +++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c @@ -3,6 +3,16 @@ #ifdef BUILD_NEON static void _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e; + int alpha; + UNROLL8_PLD_WHILE(d, l, e, + { + alpha = 256 - (*s >> 24); + *d = *s++ + MUL_256(alpha, *d); + d++; + }); +#else #define AP "blend_p_dp_" asm volatile ( ".fpu neon \n\t" @@ -238,11 +248,31 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q8","memory" // clobbered ); #undef AP - +#endif } static void _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e; + int alpha; + UNROLL8_PLD_WHILE(d, l, e, + { + switch (*s & 0xff000000) + { + case 0: + break; + case 0xff000000: + *d = *s; + break; + default: + alpha = 256 - (*s >> 24); + *d = *s + MUL_256(alpha, *d); + break; + } + s++; d++; + }); +#else #define AP "blend_pas_dp_" DATA32 *e = d + l,*tmp = e + 32,*pl=(void*)912; asm volatile ( @@ -447,6 +477,7 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { "q0","q1","q2","q3","q4","q5","q6","q7","q8","memory" ); #undef AP +#endif } #define _op_blend_pan_dp_neon NULL diff --git a/src/lib/evas/common/evas_op_copy/op_copy_color_neon.c b/src/lib/evas/common/evas_op_copy/op_copy_color_neon.c index 96310cdf3a..009bd750ea 100644 --- a/src/lib/evas/common/evas_op_copy/op_copy_color_neon.c +++ b/src/lib/evas/common/evas_op_copy/op_copy_color_neon.c @@ -3,6 +3,14 @@ #ifdef BUILD_NEON static void _op_copy_c_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { +#ifdef BUILD_NEON_INTRINSICS + DATA32 *e; + UNROLL8_PLD_WHILE(d, l, e, + { + *d = c; + d++; + }); +#else #define AP "COPY_C_DP_" uint32_t *e = d + l,*tmp; asm volatile ( @@ -85,6 +93,7 @@ _op_copy_c_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { ); +#endif } #define _op_copy_cn_dp_neon _op_copy_c_dp_neon