forked from enlightenment/efl
Add neon for upscaling and map routines in evas.
This commit is contained in:
parent
a3165bff15
commit
bd6de4ba8c
1
AUTHORS
1
AUTHORS
|
@ -125,6 +125,7 @@ Patryk Kaczmarek <patryk.k@samsung.com>
|
|||
Zbigniew Kosinski <z.kosinski@samsung.com>
|
||||
Paulo Cavalcanti <paulo.cavalcanti@linux.intel.com>
|
||||
Jean-Philippe Andre <jp.andre@samsung.com>
|
||||
Yury Usischev <y.usishchev@samsung.com>
|
||||
|
||||
|
||||
Ecore
|
||||
|
|
|
@ -1,3 +1,7 @@
|
|||
2013-08-02 Yury Usischev
|
||||
|
||||
* Add neon optimizations for several scaling/map routines in evas
|
||||
|
||||
2013-08-02 Cedric Bail
|
||||
|
||||
* Evas: change mapping policy for image loader (RANDOM during header,
|
||||
|
|
1
NEWS
1
NEWS
|
@ -201,6 +201,7 @@ Improvements:
|
|||
- Use eo array of callbacks to reduce callbacks memory footprint of Evas_Object_Box and Evas_Object_Table.
|
||||
- Optimized path for when a map uses the same color for all corners.
|
||||
- Asynchronous preload of GL texture.
|
||||
- Add neon assembly for upscaling and map routines
|
||||
* Ecore_Con:
|
||||
- Rebase dns.c against upstream
|
||||
* Edje:
|
||||
|
|
|
@ -19,6 +19,9 @@
|
|||
#ifdef SCALE_USING_MMX
|
||||
pxor_r2r(mm0, mm0);
|
||||
MOV_A2R(ALPHA_255, mm5)
|
||||
#elif defined SCALE_USING_NEON
|
||||
FPU_NEON;
|
||||
VMOV_I2R_NEON(q2, #255);
|
||||
#endif
|
||||
|
||||
line = &(spans[y - ystart]);
|
||||
|
|
|
@ -1,13 +1,27 @@
|
|||
#ifdef SMOOTH
|
||||
{
|
||||
# ifdef SCALE_USING_MMX
|
||||
# ifdef COLMUL
|
||||
# ifdef COLSAME
|
||||
# ifdef COLMUL
|
||||
# ifdef COLSAME
|
||||
MOV_P2R(c1, mm7, mm0); // col
|
||||
# endif
|
||||
# endif
|
||||
# endif
|
||||
while (ww > 0)
|
||||
# endif
|
||||
# ifdef SCALE_USING_NEON
|
||||
# ifdef COLMUL
|
||||
# ifndef COLBLACK
|
||||
// this setup can be done once here, as c1 and c2 are constant inside the loop
|
||||
FPU_NEON;
|
||||
VMOV_M2R_NEON(d18, c1);
|
||||
VEOR_NEON(q8);
|
||||
VMOV_M2R_NEON(d19, c2);
|
||||
VZIP_NEON(q9, q8);
|
||||
VMOV_R2R_NEON(d19, d16);
|
||||
// here we have c1 and c2 spread through q9 register
|
||||
# endif
|
||||
# endif
|
||||
# endif
|
||||
while (ww > 0)
|
||||
{
|
||||
# ifdef COLBLACK
|
||||
*d = 0xff000000; // col
|
||||
|
@ -77,6 +91,41 @@
|
|||
# endif
|
||||
# endif
|
||||
MOV_R2P(mm1, *d, mm0);
|
||||
# elif defined SCALE_USING_NEON
|
||||
// not sure if we need this condition, but it doesn't affect the result
|
||||
if (val1 | val2 | val3 | val4)
|
||||
{
|
||||
FPU_NEON;
|
||||
# ifdef COLMUL
|
||||
// initialize alpha for interpolation of c1 and c2
|
||||
VDUP_NEON(d15, cv >> 16);
|
||||
// copy c1 and c2 as algorithm will overwrite it
|
||||
VMOV_R2R_NEON(q6, q9);
|
||||
cv += cd; // col
|
||||
# endif
|
||||
VMOV_M2R_NEON(d8, val1);
|
||||
VEOR_NEON(q0);
|
||||
VMOV_M2R_NEON(d9, val3);
|
||||
VMOV_M2R_NEON(d10, val2);
|
||||
VEOR_NEON(q1);
|
||||
VMOV_M2R_NEON(d11, val4);
|
||||
VDUP_NEON(q3, ru);
|
||||
VDUP_NEON(d14, rv);
|
||||
VZIP_NEON(q4, q0);
|
||||
VZIP_NEON(q5, q1);
|
||||
VMOV_R2R_NEON(d9, d0);
|
||||
VMOV_R2R_NEON(d11, d2);
|
||||
// by this point we have all required data in right registers
|
||||
INTERP_256_NEON(q3, q5, q4, q2); // interpolate val1,val2 and val3,val4
|
||||
VSWP_NEON(d9, d12); // move result of val3,val4 interpolation (and c1 if COLMUL is defined) for next step
|
||||
INTERP_256_NEON(q7, q6, q4, q2); // second stage of interpolation, also here c1 and c2 are interpolated
|
||||
# ifdef COLMUL
|
||||
MUL4_SYM_NEON(d8, d9, d4); // do required multiplication
|
||||
# endif
|
||||
VMOV_R2M_NEON(q4, d8, d); // save result to d
|
||||
}
|
||||
else
|
||||
*d = val1;
|
||||
# else
|
||||
val1 = INTERP_256(ru, val2, val1);
|
||||
val3 = INTERP_256(ru, val4, val3);
|
||||
|
@ -102,10 +151,23 @@
|
|||
}
|
||||
#else
|
||||
{
|
||||
# ifdef SCALE_USING_NEON
|
||||
# ifdef COLMUL
|
||||
# ifndef COLBLACK
|
||||
// c1 and c2 are constant inside the loop, so load them once up front
|
||||
FPU_NEON;
|
||||
VMOV_M2R_NEON(d10, c1);
|
||||
VEOR_NEON(q0);
|
||||
VMOV_M2R_NEON(d11, c2);
|
||||
VZIP_NEON(q5, q0);
|
||||
VMOV_R2R_NEON(d11, d0);
|
||||
# endif
|
||||
# endif
|
||||
# endif
|
||||
while (ww > 0)
|
||||
{
|
||||
# ifdef COLMUL
|
||||
# ifndef COLBLACK
|
||||
# ifndef COLBLACK
|
||||
DATA32 val1;
|
||||
# ifdef COLSAME
|
||||
# else
|
||||
|
@ -121,11 +183,27 @@
|
|||
# ifdef COLMUL
|
||||
val1 = *s; // col
|
||||
# ifdef COLSAME
|
||||
# ifdef SCALE_USING_NEON
|
||||
*d = MUL4_SYM(c1, val1);
|
||||
# else
|
||||
# else
|
||||
*d = MUL4_SYM(c1, val1); // XXX: do this in neon
|
||||
# endif
|
||||
# else
|
||||
# ifdef SCALE_USING_NEON
|
||||
FPU_NEON;
|
||||
VMOV_M2R_NEON(d12, val1);
|
||||
VMOV_R2R_NEON(q4, q5);
|
||||
VEOR_NEON(q1);
|
||||
VDUP_NEON(d15, cv >> 16);
|
||||
VZIP_NEON(q6, q1);
|
||||
INTERP_256_NEON(d15, d9, d8, d4); // interpolate c1 and c2
|
||||
MUL4_SYM_NEON(d8, d12, d4); // multiply
|
||||
VMOV_R2M_NEON(q4, d8, d); // save result
|
||||
# else
|
||||
cval = INTERP_256((cv >> 16), c2, c1); // col
|
||||
*d = MUL4_SYM(cval, val1);
|
||||
cv += cd; // col
|
||||
# endif
|
||||
# endif
|
||||
# else
|
||||
*d = *s;
|
||||
|
|
|
@ -97,6 +97,15 @@ scale_calc_a_points(int *p, int s, int d, int c, int cc)
|
|||
# include "evas_scale_smooth_scaler.c"
|
||||
#endif
|
||||
|
||||
#ifdef BUILD_NEON
|
||||
# undef SCALE_FUNC
|
||||
# undef SCALE_USING_NEON
|
||||
# define SCALE_USING_NEON
|
||||
# define SCALE_FUNC evas_common_scale_rgba_in_to_out_clip_smooth_neon
|
||||
# include "evas_scale_smooth_scaler.c"
|
||||
# undef SCALE_USING_NEON
|
||||
#endif
|
||||
|
||||
#undef SCALE_FUNC
|
||||
#define SCALE_FUNC _evas_common_scale_rgba_in_to_out_clip_smooth_c
|
||||
#undef SCALE_USING_MMX
|
||||
|
@ -196,6 +205,11 @@ evas_common_scale_rgba_in_to_out_clip_smooth(RGBA_Image *src, RGBA_Image *dst,
|
|||
if (mmx)
|
||||
cb = evas_common_scale_rgba_in_to_out_clip_smooth_mmx;
|
||||
else
|
||||
#endif
|
||||
#ifdef BUILD_NEON
|
||||
if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
|
||||
cb = evas_common_scale_rgba_in_to_out_clip_smooth_neon;
|
||||
else
|
||||
#endif
|
||||
cb = evas_common_scale_rgba_in_to_out_clip_smooth_c;
|
||||
|
||||
|
@ -222,6 +236,16 @@ evas_common_scale_rgba_smooth_draw(RGBA_Image *src, RGBA_Image *dst, int dst_cli
|
|||
src_region_x, src_region_y, src_region_w, src_region_h,
|
||||
dst_region_x, dst_region_y, dst_region_w, dst_region_h);
|
||||
else
|
||||
#endif
|
||||
#ifdef BUILD_NEON
|
||||
if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
|
||||
_evas_common_scale_rgba_in_to_out_clip_smooth_neon
|
||||
(src, dst,
|
||||
dst_clip_x, dst_clip_y, dst_clip_w, dst_clip_h,
|
||||
mul_col, render_op,
|
||||
src_region_x, src_region_y, src_region_w, src_region_h,
|
||||
dst_region_x, dst_region_y, dst_region_w, dst_region_h);
|
||||
else
|
||||
#endif
|
||||
_evas_common_scale_rgba_in_to_out_clip_smooth_c
|
||||
(src, dst,
|
||||
|
@ -263,6 +287,15 @@ evas_common_scale_rgba_in_to_out_clip_smooth_do(const Cutout_Rects *reuse,
|
|||
dst_region_w, dst_region_h);
|
||||
else
|
||||
# endif
|
||||
#ifdef BUILD_NEON
|
||||
if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
|
||||
evas_common_scale_rgba_in_to_out_clip_smooth_neon(src, dst, dc,
|
||||
src_region_x, src_region_y,
|
||||
src_region_w, src_region_h,
|
||||
dst_region_x, dst_region_y,
|
||||
dst_region_w, dst_region_h);
|
||||
else
|
||||
#endif
|
||||
evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
|
||||
src_region_x, src_region_y,
|
||||
src_region_w, src_region_h,
|
||||
|
@ -287,7 +320,16 @@ evas_common_scale_rgba_in_to_out_clip_smooth_do(const Cutout_Rects *reuse,
|
|||
dst_region_w, dst_region_h);
|
||||
else
|
||||
# endif
|
||||
evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
|
||||
#ifdef BUILD_NEON
|
||||
if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
|
||||
evas_common_scale_rgba_in_to_out_clip_smooth_neon(src, dst, dc,
|
||||
src_region_x, src_region_y,
|
||||
src_region_w, src_region_h,
|
||||
dst_region_x, dst_region_y,
|
||||
dst_region_w, dst_region_h);
|
||||
else
|
||||
#endif
|
||||
evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
|
||||
src_region_x, src_region_y,
|
||||
src_region_w, src_region_h,
|
||||
dst_region_x, dst_region_y,
|
||||
|
|
|
@ -172,6 +172,10 @@
|
|||
MOV_A2R(ay, mm4)
|
||||
pxor_r2r(mm0, mm0);
|
||||
MOV_A2R(ALPHA_255, mm5)
|
||||
#elif defined SCALE_USING_NEON
|
||||
FPU_NEON;
|
||||
VDUP_NEON(d12, ay);
|
||||
VMOV_I2R_NEON(q2, #255);
|
||||
#endif
|
||||
pbuf = buf; pbuf_end = buf + dst_clip_w;
|
||||
sxx = sxx0;
|
||||
|
@ -210,6 +214,28 @@
|
|||
INTERP_256_R2R(mm4, mm2, mm1, mm5)
|
||||
MOV_R2P(mm1, *pbuf, mm0)
|
||||
pbuf++;
|
||||
#elif defined SCALE_USING_NEON
|
||||
if (p0 | p1 | p2 | p3)
|
||||
{
|
||||
FPU_NEON;
|
||||
VMOV_M2R_NEON(d8, p0);
|
||||
VEOR_NEON(q0);
|
||||
VMOV_M2R_NEON(d9, p2);
|
||||
VMOV_M2R_NEON(d10, p1);
|
||||
VEOR_NEON(q1);
|
||||
VMOV_M2R_NEON(d11, p3);
|
||||
VDUP_NEON(q3, ax);
|
||||
VZIP_NEON(q4, q0);
|
||||
VZIP_NEON(q5, q1);
|
||||
VMOV_R2R_NEON(d9, d0);
|
||||
VMOV_R2R_NEON(d11, d2);
|
||||
INTERP_256_NEON(q3, q5, q4, q2);
|
||||
INTERP_256_NEON(d12, d9, d8, d5);
|
||||
VMOV_R2M_NEON(q4, d8, pbuf);
|
||||
pbuf++;
|
||||
}
|
||||
else
|
||||
*pbuf++ = p0;
|
||||
#else
|
||||
if (p0 | p1)
|
||||
p0 = INTERP_256(ax, p1, p0);
|
||||
|
|
|
@ -186,6 +186,64 @@ extern const DATA32 ALPHA_256;
|
|||
|
||||
#endif
|
||||
|
||||
/* some useful NEON macros */
|
||||
|
||||
#ifdef BUILD_NEON
|
||||
#define FPU_NEON \
|
||||
__asm__ __volatile__(".fpu neon \n\t");
|
||||
|
||||
/* copy reg2 to reg1 (the destination register is the first argument) */
|
||||
#define VMOV_R2R_NEON(reg1, reg2) \
|
||||
__asm__ __volatile__("vmov " #reg1 ", " #reg2 " \n\t" ::: #reg1);
|
||||
|
||||
/* copy 32bit value to lower bits of register reg */
|
||||
#define VMOV_M2R_NEON(reg, value) \
|
||||
__asm__ __volatile__("vmov.32 " #reg "[0], %[val] \n\t" :: [val] "r" (value) : #reg);
|
||||
|
||||
/* save 32bit value from lower 64 bits of register regq to memory location */
|
||||
/* pointed to by pointer, using 64bit register regd as temporary location */
|
||||
#define VMOV_R2M_NEON(regq, regd, pointer) \
|
||||
__asm__ __volatile__("vqmovn.u16 " #regd ", " #regq " \n\t" \
|
||||
"vst1.32 {" #regd "[0]}, [%[p]] \n\t" :: [p] "r" (pointer) : #regd, "memory");
|
||||
|
||||
/* spread constant imm in register reg */
|
||||
#define VMOV_I2R_NEON(reg, imm) \
|
||||
__asm__ __volatile__("vmov.i16 " #reg ", " #imm " \n\t" ::: #reg);
|
||||
|
||||
/* spread value in register reg */
|
||||
#define VDUP_NEON(reg, value) \
|
||||
__asm__ __volatile__("vdup.16 " #reg ", %[val] \n\t" :: [val] "r" (value) : #reg);
|
||||
|
||||
/* interleave contents of reg1 and reg2 */
|
||||
#define VZIP_NEON(reg1, reg2) \
|
||||
__asm__ __volatile__("vzip.8 " #reg1 ", " #reg2 " \n\t" ::: #reg1 , #reg2);
|
||||
|
||||
/* swap contents of two registers */
|
||||
#define VSWP_NEON(reg1, reg2) \
|
||||
__asm__ __volatile__("vswp " #reg1 ", " #reg2 " \n\t" ::: #reg1 , #reg2);
|
||||
|
||||
/* set register to zero */
|
||||
#define VEOR_NEON(reg) \
|
||||
__asm__ __volatile__("veor " #reg ", " #reg ", " #reg " \n\t" ::: #reg);
|
||||
|
||||
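/* do interpolation of every channel RGBA: per 16-bit lane computes (((regx - regy) * rega) >> 8) + regy, masked with reg255; result is contained in regy, regx is clobbered */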
|
||||
#define INTERP_256_NEON(rega, regx, regy, reg255) \
|
||||
__asm__ __volatile__("vsub.i16 " #regx ", " #regx ", " #regy " \n\t" \
|
||||
"vmul.u16 " #regx ", " #regx ", " #rega " \n\t" \
|
||||
"vsri.16 " #regx ", " #regx ", #8 \n\t" \
|
||||
"vadd.i16 " #regx ", " #regx ", " #regy " \n\t" \
|
||||
"vand " #regy ", " #regx ", " #reg255 " \n\t" \
|
||||
::: #regx, #regy );
|
||||
|
||||
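/* multiply every channel of regx and regy: per 16-bit lane computes (regx * regy + reg255) >> 8, masked with reg255; result is contained in regx */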
|
||||
#define MUL4_SYM_NEON(regx, regy, reg255) \
|
||||
__asm__ __volatile__("vmul.u16 " #regx ", " #regx ", " #regy " \n\t" \
|
||||
"vadd.i16 " #regx ", " #regx ", " #reg255 " \n\t" \
|
||||
"vsri.16 " #regx ", " #regx ", #8 \n\t" \
|
||||
"vand " #regx ", " #regx ", " #reg255 " \n\t" \
|
||||
::: #regx );
|
||||
|
||||
#endif
|
||||
|
||||
/* some useful SSE3 inline functions */
|
||||
|
||||
|
|
Loading…
Reference in New Issue