evas: enable NEON-optimized code for aarch64.

Summary:
Add new define, BUILD_NEON_INTRINSICS to control whether NEON inline code or
NEON intrinsics should be built.

GCC NEON intrinsics can be built both for armv7 and armv8. However NEON inline
code can be built only for armv7.

@feature

Reviewers: raster, stefan_schmidt, cedric

Subscribers: cedric, stefan_schmidt

Projects: #efl

Differential Revision: https://phab.enlightenment.org/D2309

Signed-off-by: Cedric BAIL <cedric@osg.samsung.com>
This commit is contained in:
Yury Usishchev 2015-04-15 17:21:33 +02:00 committed by Cedric BAIL
parent 88b30ef28c
commit 71eec44ccc
8 changed files with 145 additions and 3 deletions

View File

@ -576,6 +576,21 @@ case $host_cpu in
CFLAGS="${CFLAGS_save}"
fi
;;
aarch64*)
if test "x${want_neon}" = "xyes"; then
build_cpu_neon="yes"
AC_MSG_CHECKING([whether to use NEON instructions])
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <arm_neon.h>]], [[volatile uint32x4_t test = vdupq_n_u32(0x1);]])],[
AC_MSG_RESULT([yes])
AC_DEFINE([BUILD_NEON], [1], [Build NEON Code])
AC_DEFINE([BUILD_NEON_INTRINSICS], [1], [Build NEON Intrinsics])
build_cpu_neon="yes"
],[
AC_MSG_RESULT([no])
build_cpu_neon="no"
])
fi
;;
esac
AC_SUBST([ALTIVEC_CFLAGS])
@ -4741,6 +4756,9 @@ case $host_cpu in
arm*)
EFL_ADD_FEATURE([cpu], [neon], [${build_cpu_neon}])
;;
aarch64*)
EFL_ADD_FEATURE([cpu], [neon], [${build_cpu_neon}])
;;
esac
if test "${have_linux}" = "yes"; then

View File

@ -132,6 +132,9 @@ evas_common_copy_rev_pixels_c(DATA32 *src, DATA32 *dst, int len)
static void
evas_common_copy_pixels_rev_neon(DATA32 *src, DATA32 *dst, int len)
{
#ifdef BUILD_NEON_INTRINSICS
evas_common_copy_pixels_rev_c(src, dst, len);
#else
uint32_t *tmp = (void *)37;
#define AP "evas_common_copy_rev_pixels_neon_"
asm volatile (
@ -228,6 +231,7 @@ evas_common_copy_pixels_rev_neon(DATA32 *src, DATA32 *dst, int len)
);
#undef AP
#endif
}
#endif
@ -324,6 +328,9 @@ evas_common_copy_pixels_mmx2(DATA32 *src, DATA32 *dst, int len)
#ifdef BUILD_NEON
static void
evas_common_copy_pixels_neon(DATA32 *src, DATA32 *dst, int len){
#ifdef BUILD_NEON_INTRINSICS
evas_common_copy_pixels_c(src, dst, len);
#else
uint32_t *e,*tmp = (void *)37;
e = dst + len;
#define AP "evas_common_copy_pixels_neon_"
@ -410,6 +417,7 @@ evas_common_copy_pixels_neon(DATA32 *src, DATA32 *dst, int len){
);
#undef AP
#endif
}
#endif /* BUILD_NEON */

View File

@ -2,6 +2,11 @@
#ifdef BUILD_MMX
#include "evas_mmx.h"
#endif
#ifdef BUILD_NEON
#ifdef BUILD_NEON_INTRINSICS
#include <arm_neon.h>
#endif
#endif
#if defined BUILD_SSE3
#include <immintrin.h>
#endif
@ -92,6 +97,9 @@ evas_common_cpu_neon_test(void)
{
//#if defined(__ARM_ARCH__) && (__ARM_ARCH__ >= 70)
#ifdef BUILD_NEON
#ifdef BUILD_NEON_INTRINSICS
volatile uint32x4_t temp = vdupq_n_u32(0x1);
#else
asm volatile (
".fpu neon \n\t"
"vqadd.u8 d0, d1, d0\n"
@ -101,6 +109,7 @@ evas_common_cpu_neon_test(void)
"d0", "d1"
);
#endif
#endif
//#endif
}

View File

@ -3,6 +3,14 @@
#ifdef BUILD_NEON
static void
_op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
#ifdef BUILD_NEON_INTRINSICS
DATA32 *e, a = 256 - (c >> 24);
UNROLL8_PLD_WHILE(d, l, e,
{
*d = c + MUL_256(a, *d);
d++;
});
#else
DATA32 *e, *tmp = 0;
#define AP "B_C_DP"
asm volatile (
@ -142,7 +150,7 @@ _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA3
);
#undef AP
#endif
}
#define _op_blend_caa_dp_neon _op_blend_c_dp_neon

View File

@ -19,6 +19,30 @@
#ifdef BUILD_NEON
static void
_op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
#ifdef BUILD_NEON_INTRINSICS
DATA32 *e;
int alpha = 256 - (c >> 24);
UNROLL8_PLD_WHILE(d, l, e,
{
DATA32 a = *m;
switch(a)
{
case 0:
break;
case 255:
*d = c + MUL_256(alpha, *d);
break;
default:
{
DATA32 mc = MUL_SYM(a, c);
a = 256 - (mc >> 24);
*d = mc + MUL_256(a, *d);
}
break;
}
m++; d++;
});
#else
DATA32 *e = d + l;
// everything we can do only once per cycle
@ -142,12 +166,34 @@ _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, in
"q10", "q15", "q14", "memory"
);
}
#endif
}
#endif
#ifdef BUILD_NEON
static void
_op_blend_mas_can_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
#ifdef BUILD_NEON_INTRINSICS
DATA32 *e;
int alpha;
UNROLL8_PLD_WHILE(d, l, e,
{
alpha = *m;
switch(alpha)
{
case 0:
break;
case 255:
*d = c;
break;
default:
alpha++;
*d = INTERP_256(alpha, c, *d);
break;
}
m++; d++;
});
#else
DATA32 *e,*tmp;
int alpha;
@ -372,6 +418,7 @@ _op_blend_mas_can_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d,
);
#undef AP
#endif
}
#endif

View File

@ -7,7 +7,18 @@
* reads, then two writes, a miss on read is 'just' two reads */
static void
_op_blend_p_c_dp_neon(DATA32 * __restrict s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 * __restrict d, int l) {
#ifdef BUILD_NEON_INTRINSICS
DATA32 *e;
int alpha;
UNROLL8_PLD_WHILE(d, l, e,
{
DATA32 sc = MUL4_SYM(c, *s);
alpha = 256 - (sc >> 24);
*d = sc + MUL_256(alpha, *d);
d++;
s++;
});
#else
#define AP "blend_p_c_dp_"
asm volatile (
".fpu neon\n\t"
@ -92,6 +103,7 @@ _op_blend_p_c_dp_neon(DATA32 * __restrict s, DATA8 *m EINA_UNUSED, DATA32 c, DAT
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "memory"
);
#undef AP
#endif
}
static void

View File

@ -3,6 +3,16 @@
#ifdef BUILD_NEON
static void
_op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
#ifdef BUILD_NEON_INTRINSICS
DATA32 *e;
int alpha;
UNROLL8_PLD_WHILE(d, l, e,
{
alpha = 256 - (*s >> 24);
*d = *s++ + MUL_256(alpha, *d);
d++;
});
#else
#define AP "blend_p_dp_"
asm volatile (
".fpu neon \n\t"
@ -238,11 +248,31 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
: "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q8","memory" // clobbered
);
#undef AP
#endif
}
static void
_op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
#ifdef BUILD_NEON_INTRINSICS
DATA32 *e;
int alpha;
UNROLL8_PLD_WHILE(d, l, e,
{
switch (*s & 0xff000000)
{
case 0:
break;
case 0xff000000:
*d = *s;
break;
default:
alpha = 256 - (*s >> 24);
*d = *s + MUL_256(alpha, *d);
break;
}
s++; d++;
});
#else
#define AP "blend_pas_dp_"
DATA32 *e = d + l,*tmp = e + 32,*pl=(void*)912;
asm volatile (
@ -447,6 +477,7 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
"q0","q1","q2","q3","q4","q5","q6","q7","q8","memory"
);
#undef AP
#endif
}
#define _op_blend_pan_dp_neon NULL

View File

@ -3,6 +3,14 @@
#ifdef BUILD_NEON
static void
_op_copy_c_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
#ifdef BUILD_NEON_INTRINSICS
DATA32 *e;
UNROLL8_PLD_WHILE(d, l, e,
{
*d = c;
d++;
});
#else
#define AP "COPY_C_DP_"
uint32_t *e = d + l,*tmp;
asm volatile (
@ -85,6 +93,7 @@ _op_copy_c_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
);
#endif
}
#define _op_copy_cn_dp_neon _op_copy_c_dp_neon