Add neon for upscaling and map routines in evas.

This commit is contained in:
Yury Usischev 2013-08-02 18:06:55 +09:00 committed by Carsten Haitzler (Rasterman)
parent a3165bff15
commit bd6de4ba8c
8 changed files with 220 additions and 7 deletions

View File

@ -125,6 +125,7 @@ Patryk Kaczmarek <patryk.k@samsung.com>
Zbigniew Kosinski <z.kosinski@samsung.com>
Paulo Cavalcanti <paulo.cavalcanti@linux.intel.com>
Jean-Philippe Andre <jp.andre@samsung.com>
Yury Usischev <y.usishchev@samsung.com>
Ecore

View File

@ -1,3 +1,7 @@
2013-08-02 Yury Usischev
* Add neon optimizations for several scaling/map routines in evas
2013-08-02 Cedric Bail
* Evas: change mapping policy for image loader (RANDOM during header,

1
NEWS
View File

@ -201,6 +201,7 @@ Improvements:
- Use eo array of callbacks to reduce callbacks memory footprint of Evas_Object_Box and Evas_Object_Table.
- Optimized path for when map use the same color for all corner.
- Asynchronous preload of GL texture.
- Add neon assembly for upscaling and map routines
* Ecore_Con:
- Rebase dns.c against upstream
* Edje:

View File

@ -19,6 +19,9 @@
#ifdef SCALE_USING_MMX
pxor_r2r(mm0, mm0);
MOV_A2R(ALPHA_255, mm5)
#elif defined SCALE_USING_NEON
FPU_NEON;
VMOV_I2R_NEON(q2, #255);
#endif
line = &(spans[y - ystart]);

View File

@ -1,13 +1,27 @@
#ifdef SMOOTH
{
# ifdef SCALE_USING_MMX
# ifdef COLMUL
# ifdef COLSAME
# ifdef COLMUL
# ifdef COLSAME
MOV_P2R(c1, mm7, mm0); // col
# endif
# endif
# endif
while (ww > 0)
# endif
# ifdef SCALE_USING_NEON
# ifdef COLMUL
# ifndef COLBLACK
// this part can be done here as c1 and c2 are constants in the cycle
FPU_NEON;
VMOV_M2R_NEON(d18, c1);
VEOR_NEON(q8);
VMOV_M2R_NEON(d19, c2);
VZIP_NEON(q9, q8);
VMOV_R2R_NEON(d19, d16);
// here we have c1 and c2 spread through q9 register
# endif
# endif
# endif
while (ww > 0)
{
# ifdef COLBLACK
*d = 0xff000000; // col
@ -77,6 +91,41 @@
# endif
# endif
MOV_R2P(mm1, *d, mm0);
# elif defined SCALE_USING_NEON
// not sure if we need this condition, but it doesn't affect the result
if (val1 | val2 | val3 | val4)
{
FPU_NEON;
# ifdef COLMUL
// initialize alpha for interpolation of c1 and c2
VDUP_NEON(d15, cv >> 16);
// copy c1 and c2 as algorithm will overwrite it
VMOV_R2R_NEON(q6, q9);
cv += cd; // col
# endif
VMOV_M2R_NEON(d8, val1);
VEOR_NEON(q0);
VMOV_M2R_NEON(d9, val3);
VMOV_M2R_NEON(d10, val2);
VEOR_NEON(q1);
VMOV_M2R_NEON(d11, val4);
VDUP_NEON(q3, ru);
VDUP_NEON(d14, rv);
VZIP_NEON(q4, q0);
VZIP_NEON(q5, q1);
VMOV_R2R_NEON(d9, d0);
VMOV_R2R_NEON(d11, d2);
// by this point we have all required data in right registers
INTERP_256_NEON(q3, q5, q4, q2); // interpolate val1,val2 and val3,val4
VSWP_NEON(d9, d12); // move result of val3,val4 interpolation (and c1 if COLMUL is defined) for next step
INTERP_256_NEON(q7, q6, q4, q2); // second stage of interpolation, also here c1 and c2 are interpolated
# ifdef COLMUL
MUL4_SYM_NEON(d8, d9, d4); // do required multiplication
# endif
VMOV_R2M_NEON(q4, d8, d); // save result to d
}
else
*d = val1;
# else
val1 = INTERP_256(ru, val2, val1);
val3 = INTERP_256(ru, val4, val3);
@ -102,10 +151,23 @@
}
#else
{
# ifdef SCALE_USING_NEON
# ifdef COLMUL
# ifndef COLBLACK
// c1 and c2 are constants inside the cycle
FPU_NEON;
VMOV_M2R_NEON(d10, c1);
VEOR_NEON(q0);
VMOV_M2R_NEON(d11, c2);
VZIP_NEON(q5, q0);
VMOV_R2R_NEON(d11, d0);
# endif
# endif
# endif
while (ww > 0)
{
# ifdef COLMUL
# ifndef COLBLACK
# ifndef COLBLACK
DATA32 val1;
# ifdef COLSAME
# else
@ -121,11 +183,27 @@
# ifdef COLMUL
val1 = *s; // col
# ifdef COLSAME
# ifdef SCALE_USING_NEON
*d = MUL4_SYM(c1, val1);
# else
# else
*d = MUL4_SYM(c1, val1); // XXX: do this in neon
# endif
# else
# ifdef SCALE_USING_NEON
FPU_NEON;
VMOV_M2R_NEON(d12, val1);
VMOV_R2R_NEON(q4, q5);
VEOR_NEON(q1);
VDUP_NEON(d15, cv >> 16);
VZIP_NEON(q6, q1);
INTERP_256_NEON(d15, d9, d8, d4); // interpolate c1 and c2
MUL4_SYM_NEON(d8, d12, d4); // multiply
VMOV_R2M_NEON(q4, d8, d); // save result
# else
cval = INTERP_256((cv >> 16), c2, c1); // col
*d = MUL4_SYM(cval, val1);
cv += cd; // col
# endif
# endif
# else
*d = *s;

View File

@ -97,6 +97,15 @@ scale_calc_a_points(int *p, int s, int d, int c, int cc)
# include "evas_scale_smooth_scaler.c"
#endif
#ifdef BUILD_NEON
# undef SCALE_FUNC
# undef SCALE_USING_NEON
# define SCALE_USING_NEON
# define SCALE_FUNC evas_common_scale_rgba_in_to_out_clip_smooth_neon
# include "evas_scale_smooth_scaler.c"
# undef SCALE_USING_NEON
#endif
#undef SCALE_FUNC
#define SCALE_FUNC _evas_common_scale_rgba_in_to_out_clip_smooth_c
#undef SCALE_USING_MMX
@ -196,6 +205,11 @@ evas_common_scale_rgba_in_to_out_clip_smooth(RGBA_Image *src, RGBA_Image *dst,
if (mmx)
cb = evas_common_scale_rgba_in_to_out_clip_smooth_mmx;
else
#endif
#ifdef BUILD_NEON
if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
cb = evas_common_scale_rgba_in_to_out_clip_smooth_neon;
else
#endif
cb = evas_common_scale_rgba_in_to_out_clip_smooth_c;
@ -222,6 +236,16 @@ evas_common_scale_rgba_smooth_draw(RGBA_Image *src, RGBA_Image *dst, int dst_cli
src_region_x, src_region_y, src_region_w, src_region_h,
dst_region_x, dst_region_y, dst_region_w, dst_region_h);
else
#endif
#ifdef BUILD_NEON
if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
_evas_common_scale_rgba_in_to_out_clip_smooth_neon
(src, dst,
dst_clip_x, dst_clip_y, dst_clip_w, dst_clip_h,
mul_col, render_op,
src_region_x, src_region_y, src_region_w, src_region_h,
dst_region_x, dst_region_y, dst_region_w, dst_region_h);
else
#endif
_evas_common_scale_rgba_in_to_out_clip_smooth_c
(src, dst,
@ -263,6 +287,15 @@ evas_common_scale_rgba_in_to_out_clip_smooth_do(const Cutout_Rects *reuse,
dst_region_w, dst_region_h);
else
# endif
#ifdef BUILD_NEON
if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
evas_common_scale_rgba_in_to_out_clip_smooth_neon(src, dst, dc,
src_region_x, src_region_y,
src_region_w, src_region_h,
dst_region_x, dst_region_y,
dst_region_w, dst_region_h);
else
#endif
evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
src_region_x, src_region_y,
src_region_w, src_region_h,
@ -287,7 +320,16 @@ evas_common_scale_rgba_in_to_out_clip_smooth_do(const Cutout_Rects *reuse,
dst_region_w, dst_region_h);
else
# endif
evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
#ifdef BUILD_NEON
if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
evas_common_scale_rgba_in_to_out_clip_smooth_neon(src, dst, dc,
src_region_x, src_region_y,
src_region_w, src_region_h,
dst_region_x, dst_region_y,
dst_region_w, dst_region_h);
else
#endif
evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
src_region_x, src_region_y,
src_region_w, src_region_h,
dst_region_x, dst_region_y,

View File

@ -172,6 +172,10 @@
MOV_A2R(ay, mm4)
pxor_r2r(mm0, mm0);
MOV_A2R(ALPHA_255, mm5)
#elif defined SCALE_USING_NEON
FPU_NEON;
VDUP_NEON(d12, ay);
VMOV_I2R_NEON(q2, #255);
#endif
pbuf = buf; pbuf_end = buf + dst_clip_w;
sxx = sxx0;
@ -210,6 +214,28 @@
INTERP_256_R2R(mm4, mm2, mm1, mm5)
MOV_R2P(mm1, *pbuf, mm0)
pbuf++;
#elif defined SCALE_USING_NEON
if (p0 | p1 | p2 | p3)
{
FPU_NEON;
VMOV_M2R_NEON(d8, p0);
VEOR_NEON(q0);
VMOV_M2R_NEON(d9, p2);
VMOV_M2R_NEON(d10, p1);
VEOR_NEON(q1);
VMOV_M2R_NEON(d11, p3);
VDUP_NEON(q3, ax);
VZIP_NEON(q4, q0);
VZIP_NEON(q5, q1);
VMOV_R2R_NEON(d9, d0);
VMOV_R2R_NEON(d11, d2);
INTERP_256_NEON(q3, q5, q4, q2);
INTERP_256_NEON(d12, d9, d8, d5);
VMOV_R2M_NEON(q4, d8, pbuf);
pbuf++;
}
else
*pbuf++ = p0;
#else
if (p0 | p1)
p0 = INTERP_256(ax, p1, p0);

View File

@ -186,6 +186,64 @@ extern const DATA32 ALPHA_256;
#endif
/* some useful NEON macros */
#ifdef BUILD_NEON
/* NOTE(review): each macro below expands to an independent __asm__ statement
 * that passes data through FIXED NEON registers (d8..d15 / q0..q9) instead of
 * asm operands. Correctness of a macro sequence relies on the compiler not
 * using those registers between the statements -- presumably true for the
 * code generated here, but TODO confirm at all optimization levels. */
/* emit an ".fpu neon" directive so the assembler accepts the NEON
 * mnemonics used by the following inline-asm statements */
#define FPU_NEON \
__asm__ __volatile__(".fpu neon \n\t");
/* copy reg2 into reg1 ("vmov dest, src"); reg1 is overwritten.
 * (The original comment had the direction reversed -- see e.g. the
 * VMOV_R2R_NEON(q6, q9) call site, which saves q9's contents into q6.) */
#define VMOV_R2R_NEON(reg1, reg2) \
__asm__ __volatile__("vmov " #reg1 ", " #reg2 " \n\t" ::: #reg1);
/* insert 32bit value into lane 0 of register reg ("vmov.32 reg[0], ...");
 * the other lanes of reg are left untouched */
#define VMOV_M2R_NEON(reg, value) \
__asm__ __volatile__("vmov.32 " #reg "[0], %[val] \n\t" :: [val] "r" (value) : #reg);
/* save 32bit value from lower 64 bits of register regq to memory location */
/* pointed to by pointer, using 64bit register regd as temporary location */
/* (vqmovn.u16 saturating-narrows the 16bit lanes of regq down to 8bit
 * channels in regd, then the low 32 bits are stored through the pointer) */
#define VMOV_R2M_NEON(regq, regd, pointer) \
__asm__ __volatile__("vqmovn.u16 " #regd ", " #regq " \n\t" \
"vst1.32 {" #regd "[0]}, [%[p]] \n\t" :: [p] "r" (pointer) : #regd, "memory");
/* replicate the immediate imm into every 16bit lane of register reg */
#define VMOV_I2R_NEON(reg, imm) \
__asm__ __volatile__("vmov.i16 " #reg ", " #imm " \n\t" ::: #reg);
/* replicate (the low 16 bits of) value into every 16bit lane of reg */
#define VDUP_NEON(reg, value) \
__asm__ __volatile__("vdup.16 " #reg ", %[val] \n\t" :: [val] "r" (value) : #reg);
/* interleave the 8bit elements of reg1 and reg2 (vzip.8);
 * both registers are modified */
#define VZIP_NEON(reg1, reg2) \
__asm__ __volatile__("vzip.8 " #reg1 ", " #reg2 " \n\t" ::: #reg1 , #reg2);
/* swap contents of two registers */
#define VSWP_NEON(reg1, reg2) \
__asm__ __volatile__("vswp " #reg1 ", " #reg2 " \n\t" ::: #reg1 , #reg2);
/* set register to zero (XOR with itself) */
#define VEOR_NEON(reg) \
__asm__ __volatile__("veor " #reg ", " #reg ", " #reg " \n\t" ::: #reg);
/* per-channel interpolation, NEON counterpart of INTERP_256():
 * channels are held one per 16bit lane; for each lane the result is
 *   regy = (regy + (((regx - regy) * rega) >> 8)) & 0xff
 * where reg255 holds 0x00ff in every lane (the final vand masks off the
 * high-byte residue left by vsri). rega and reg255 are read-only;
 * regx is clobbered as scratch and the result lands in regy. */
#define INTERP_256_NEON(rega, regx, regy, reg255) \
__asm__ __volatile__("vsub.i16 " #regx ", " #regx ", " #regy " \n\t" \
"vmul.u16 " #regx ", " #regx ", " #rega " \n\t" \
"vsri.16 " #regx ", " #regx ", #8 \n\t" \
"vadd.i16 " #regx ", " #regx ", " #regy " \n\t" \
"vand " #regy ", " #regx ", " #reg255 " \n\t" \
::: #regx, #regy );
/* per-channel symmetric multiply, NEON counterpart of MUL4_SYM():
 * for each 16bit lane: regx = ((regx * regy + 255) >> 8) & 0xff
 * (reg255 holds 0x00ff per lane and doubles as the +255 rounding term
 * and the final mask). Result is left in regx; regy is read-only. */
#define MUL4_SYM_NEON(regx, regy, reg255) \
__asm__ __volatile__("vmul.u16 " #regx ", " #regx ", " #regy " \n\t" \
"vadd.i16 " #regx ", " #regx ", " #reg255 " \n\t" \
"vsri.16 " #regx ", " #regx ", #8 \n\t" \
"vand " #regx ", " #regx ", " #reg255 " \n\t" \
::: #regx );
#endif
/* some useful SSE3 inline functions */