forked from enlightenment/efl
evas: enable NEON-optimized code for aarch64.
Summary: Add new define, BUILD_NEON_INTRINSICS to control whether NEON inline code or NEON intrinsics should be built. GCC NEON intrinsics can be built both for armv7 and armv8. However NEON inline code can be built only for armv7. @feature Reviewers: raster, stefan_schmidt, cedric Subscribers: cedric, stefan_schmidt Projects: #efl Differential Revision: https://phab.enlightenment.org/D2309 Signed-off-by: Cedric BAIL <cedric@osg.samsung.com>
This commit is contained in:
parent
88b30ef28c
commit
71eec44ccc
18
configure.ac
18
configure.ac
|
@ -576,6 +576,21 @@ case $host_cpu in
|
|||
CFLAGS="${CFLAGS_save}"
|
||||
fi
|
||||
;;
|
||||
aarch64*)
|
||||
if test "x${want_neon}" = "xyes"; then
|
||||
build_cpu_neon="yes"
|
||||
AC_MSG_CHECKING([whether to use NEON instructions])
|
||||
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <arm_neon.h>]], [[volatile uint32x4_t test = vdupq_n_u32(0x1);]])],[
|
||||
AC_MSG_RESULT([yes])
|
||||
AC_DEFINE([BUILD_NEON], [1], [Build NEON Code])
|
||||
AC_DEFINE([BUILD_NEON_INTRINSICS], [1], [Build NEON Intrinsics])
|
||||
build_cpu_neon="yes"
|
||||
],[
|
||||
AC_MSG_RESULT([no])
|
||||
build_cpu_neon="no"
|
||||
])
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
|
||||
AC_SUBST([ALTIVEC_CFLAGS])
|
||||
|
@ -4741,6 +4756,9 @@ case $host_cpu in
|
|||
arm*)
|
||||
EFL_ADD_FEATURE([cpu], [neon], [${build_cpu_neon}])
|
||||
;;
|
||||
aarch64*)
|
||||
EFL_ADD_FEATURE([cpu], [neon], [${build_cpu_neon}])
|
||||
;;
|
||||
esac
|
||||
|
||||
if test "${have_linux}" = "yes"; then
|
||||
|
|
|
@ -132,6 +132,9 @@ evas_common_copy_rev_pixels_c(DATA32 *src, DATA32 *dst, int len)
|
|||
static void
|
||||
evas_common_copy_pixels_rev_neon(DATA32 *src, DATA32 *dst, int len)
|
||||
{
|
||||
#ifdef BUILD_NEON_INTRINSICS
|
||||
evas_common_copy_pixels_rev_c(src, dst, len);
|
||||
#else
|
||||
uint32_t *tmp = (void *)37;
|
||||
#define AP "evas_common_copy_rev_pixels_neon_"
|
||||
asm volatile (
|
||||
|
@ -228,6 +231,7 @@ evas_common_copy_pixels_rev_neon(DATA32 *src, DATA32 *dst, int len)
|
|||
);
|
||||
#undef AP
|
||||
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -324,6 +328,9 @@ evas_common_copy_pixels_mmx2(DATA32 *src, DATA32 *dst, int len)
|
|||
#ifdef BUILD_NEON
|
||||
static void
|
||||
evas_common_copy_pixels_neon(DATA32 *src, DATA32 *dst, int len){
|
||||
#ifdef BUILD_NEON_INTRINSICS
|
||||
evas_common_copy_pixels_c(src, dst, len);
|
||||
#else
|
||||
uint32_t *e,*tmp = (void *)37;
|
||||
e = dst + len;
|
||||
#define AP "evas_common_copy_pixels_neon_"
|
||||
|
@ -410,6 +417,7 @@ evas_common_copy_pixels_neon(DATA32 *src, DATA32 *dst, int len){
|
|||
);
|
||||
#undef AP
|
||||
|
||||
#endif
|
||||
}
|
||||
#endif /* BUILD_NEON */
|
||||
|
||||
|
|
|
@ -2,6 +2,11 @@
|
|||
#ifdef BUILD_MMX
|
||||
#include "evas_mmx.h"
|
||||
#endif
|
||||
#ifdef BUILD_NEON
|
||||
#ifdef BUILD_NEON_INTRINSICS
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
#endif
|
||||
#if defined BUILD_SSE3
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
@ -92,6 +97,9 @@ evas_common_cpu_neon_test(void)
|
|||
{
|
||||
//#if defined(__ARM_ARCH__) && (__ARM_ARCH__ >= 70)
|
||||
#ifdef BUILD_NEON
|
||||
#ifdef BUILD_NEON_INTRINSICS
|
||||
volatile uint32x4_t temp = vdupq_n_u32(0x1);
|
||||
#else
|
||||
asm volatile (
|
||||
".fpu neon \n\t"
|
||||
"vqadd.u8 d0, d1, d0\n"
|
||||
|
@ -101,6 +109,7 @@ evas_common_cpu_neon_test(void)
|
|||
"d0", "d1"
|
||||
);
|
||||
#endif
|
||||
#endif
|
||||
//#endif
|
||||
}
|
||||
|
||||
|
|
|
@ -3,6 +3,14 @@
|
|||
#ifdef BUILD_NEON
|
||||
static void
|
||||
_op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
|
||||
#ifdef BUILD_NEON_INTRINSICS
|
||||
DATA32 *e, a = 256 - (c >> 24);
|
||||
UNROLL8_PLD_WHILE(d, l, e,
|
||||
{
|
||||
*d = c + MUL_256(a, *d);
|
||||
d++;
|
||||
});
|
||||
#else
|
||||
DATA32 *e, *tmp = 0;
|
||||
#define AP "B_C_DP"
|
||||
asm volatile (
|
||||
|
@ -142,7 +150,7 @@ _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA3
|
|||
|
||||
);
|
||||
#undef AP
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
#define _op_blend_caa_dp_neon _op_blend_c_dp_neon
|
||||
|
|
|
@ -19,6 +19,30 @@
|
|||
#ifdef BUILD_NEON
|
||||
static void
|
||||
_op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
|
||||
#ifdef BUILD_NEON_INTRINSICS
|
||||
DATA32 *e;
|
||||
int alpha = 256 - (c >> 24);
|
||||
UNROLL8_PLD_WHILE(d, l, e,
|
||||
{
|
||||
DATA32 a = *m;
|
||||
switch(a)
|
||||
{
|
||||
case 0:
|
||||
break;
|
||||
case 255:
|
||||
*d = c + MUL_256(alpha, *d);
|
||||
break;
|
||||
default:
|
||||
{
|
||||
DATA32 mc = MUL_SYM(a, c);
|
||||
a = 256 - (mc >> 24);
|
||||
*d = mc + MUL_256(a, *d);
|
||||
}
|
||||
break;
|
||||
}
|
||||
m++; d++;
|
||||
});
|
||||
#else
|
||||
DATA32 *e = d + l;
|
||||
|
||||
// everything we can do only once per cycle
|
||||
|
@ -142,12 +166,34 @@ _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, in
|
|||
"q10", "q15", "q14", "memory"
|
||||
);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef BUILD_NEON
|
||||
static void
|
||||
_op_blend_mas_can_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
|
||||
#ifdef BUILD_NEON_INTRINSICS
|
||||
DATA32 *e;
|
||||
int alpha;
|
||||
UNROLL8_PLD_WHILE(d, l, e,
|
||||
{
|
||||
alpha = *m;
|
||||
switch(alpha)
|
||||
{
|
||||
case 0:
|
||||
break;
|
||||
case 255:
|
||||
*d = c;
|
||||
break;
|
||||
default:
|
||||
alpha++;
|
||||
*d = INTERP_256(alpha, c, *d);
|
||||
break;
|
||||
}
|
||||
m++; d++;
|
||||
});
|
||||
#else
|
||||
DATA32 *e,*tmp;
|
||||
int alpha;
|
||||
|
||||
|
@ -372,6 +418,7 @@ _op_blend_mas_can_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d,
|
|||
|
||||
);
|
||||
#undef AP
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -7,7 +7,18 @@
|
|||
* reads, then two writes, a miss on read is 'just' two reads */
|
||||
static void
|
||||
_op_blend_p_c_dp_neon(DATA32 * __restrict s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 * __restrict d, int l) {
|
||||
|
||||
#ifdef BUILD_NEON_INTRINSICS
|
||||
DATA32 *e;
|
||||
int alpha;
|
||||
UNROLL8_PLD_WHILE(d, l, e,
|
||||
{
|
||||
DATA32 sc = MUL4_SYM(c, *s);
|
||||
alpha = 256 - (sc >> 24);
|
||||
*d = sc + MUL_256(alpha, *d);
|
||||
d++;
|
||||
s++;
|
||||
});
|
||||
#else
|
||||
#define AP "blend_p_c_dp_"
|
||||
asm volatile (
|
||||
".fpu neon\n\t"
|
||||
|
@ -92,6 +103,7 @@ _op_blend_p_c_dp_neon(DATA32 * __restrict s, DATA8 *m EINA_UNUSED, DATA32 c, DAT
|
|||
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "memory"
|
||||
);
|
||||
#undef AP
|
||||
#endif
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
|
@ -3,6 +3,16 @@
|
|||
#ifdef BUILD_NEON
|
||||
static void
|
||||
_op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
|
||||
#ifdef BUILD_NEON_INTRINSICS
|
||||
DATA32 *e;
|
||||
int alpha;
|
||||
UNROLL8_PLD_WHILE(d, l, e,
|
||||
{
|
||||
alpha = 256 - (*s >> 24);
|
||||
*d = *s++ + MUL_256(alpha, *d);
|
||||
d++;
|
||||
});
|
||||
#else
|
||||
#define AP "blend_p_dp_"
|
||||
asm volatile (
|
||||
".fpu neon \n\t"
|
||||
|
@ -238,11 +248,31 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
|
|||
: "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q8","memory" // clobbered
|
||||
);
|
||||
#undef AP
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
static void
|
||||
_op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
|
||||
#ifdef BUILD_NEON_INTRINSICS
|
||||
DATA32 *e;
|
||||
int alpha;
|
||||
UNROLL8_PLD_WHILE(d, l, e,
|
||||
{
|
||||
switch (*s & 0xff000000)
|
||||
{
|
||||
case 0:
|
||||
break;
|
||||
case 0xff000000:
|
||||
*d = *s;
|
||||
break;
|
||||
default:
|
||||
alpha = 256 - (*s >> 24);
|
||||
*d = *s + MUL_256(alpha, *d);
|
||||
break;
|
||||
}
|
||||
s++; d++;
|
||||
});
|
||||
#else
|
||||
#define AP "blend_pas_dp_"
|
||||
DATA32 *e = d + l,*tmp = e + 32,*pl=(void*)912;
|
||||
asm volatile (
|
||||
|
@ -447,6 +477,7 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
|
|||
"q0","q1","q2","q3","q4","q5","q6","q7","q8","memory"
|
||||
);
|
||||
#undef AP
|
||||
#endif
|
||||
}
|
||||
|
||||
#define _op_blend_pan_dp_neon NULL
|
||||
|
|
|
@ -3,6 +3,14 @@
|
|||
#ifdef BUILD_NEON
|
||||
static void
|
||||
_op_copy_c_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
|
||||
#ifdef BUILD_NEON_INTRINSICS
|
||||
DATA32 *e;
|
||||
UNROLL8_PLD_WHILE(d, l, e,
|
||||
{
|
||||
*d = c;
|
||||
d++;
|
||||
});
|
||||
#else
|
||||
#define AP "COPY_C_DP_"
|
||||
uint32_t *e = d + l,*tmp;
|
||||
asm volatile (
|
||||
|
@ -85,6 +93,7 @@ _op_copy_c_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
|
|||
|
||||
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
#define _op_copy_cn_dp_neon _op_copy_c_dp_neon
|
||||
|
|
Loading…
Reference in New Issue