diff --git a/AUTHORS b/AUTHORS
index a42f1e411e..e0e0cecbe4 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -125,6 +125,7 @@ Patryk Kaczmarek
 Zbigniew Kosinski
 Paulo Cavalcanti
 Jean-Philippe Andre
+Yury Usischev
 
 Ecore
 
diff --git a/ChangeLog b/ChangeLog
index 4cd2a4f364..d45dab1af3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2013-08-02 Yury Usischev
+
+        * Add neon optimizations for several scaling/map routines in evas
+
 2013-08-02 Cedric Bail
 
         * Evas: change mapping policy for image loader (RANDOM during header,
diff --git a/NEWS b/NEWS
index bbbdc06763..243bf6d12a 100644
--- a/NEWS
+++ b/NEWS
@@ -201,6 +201,7 @@ Improvements:
    - Use eo array of callbacks to reduce callbacks memory footprint of Evas_Object_Box and Evas_Object_Table.
    - Optimized path for when map use the same color for all corner.
    - Asynchronous preload of GL texture.
+   - Add neon assembly for upscaling and map routines
 * Ecore_Con:
    - Rebase dns.c against upstream
 * Edje:
diff --git a/src/lib/evas/common/evas_map_image_core.c b/src/lib/evas/common/evas_map_image_core.c
index 7e44c4b161..6e2be0e30a 100644
--- a/src/lib/evas/common/evas_map_image_core.c
+++ b/src/lib/evas/common/evas_map_image_core.c
@@ -19,6 +19,9 @@
 #ifdef SCALE_USING_MMX
    pxor_r2r(mm0, mm0);
    MOV_A2R(ALPHA_255, mm5)
+#elif defined SCALE_USING_NEON
+   FPU_NEON;
+   VMOV_I2R_NEON(q2, #255);
 #endif
    line = &(spans[y - ystart]);
 
diff --git a/src/lib/evas/common/evas_map_image_loop.c b/src/lib/evas/common/evas_map_image_loop.c
index fc322860aa..a8a49eb7f4 100644
--- a/src/lib/evas/common/evas_map_image_loop.c
+++ b/src/lib/evas/common/evas_map_image_loop.c
@@ -1,13 +1,27 @@
 #ifdef SMOOTH
 {
 # ifdef SCALE_USING_MMX
-# ifdef COLMUL
-# ifdef COLSAME
+#  ifdef COLMUL
+#   ifdef COLSAME
    MOV_P2R(c1, mm7, mm0); // col
-# endif
+#   endif
 #  endif
 # endif
-   while (ww > 0)
+# ifdef SCALE_USING_NEON
+#  ifdef COLMUL
+#   ifndef COLBLACK
+   // this part can be done here as c1 and c2 are constants in the cycle
+   FPU_NEON;
+   VMOV_M2R_NEON(d18, c1);
+   VEOR_NEON(q8);
+   VMOV_M2R_NEON(d19, c2);
+   VZIP_NEON(q9, q8);
+   VMOV_R2R_NEON(d19, d16);
+   // here we have c1 and c2 spread through q9 register
+#   endif
+#  endif
+# endif
+   while (ww > 0)
     {
 # ifdef COLBLACK
       *d = 0xff000000; // col
@@ -77,6 +91,41 @@
 #   endif
 #  endif
       MOV_R2P(mm1, *d, mm0);
+# elif defined SCALE_USING_NEON
+      // not sure if we need this condition, but it doesn't affect the result
+      if (val1 | val2 | val3 | val4)
+        {
+           FPU_NEON;
+#  ifdef COLMUL
+           // initialize alpha for interpolation of c1 and c2
+           VDUP_NEON(d15, cv >> 16);
+           // copy c1 and c2 as algorithm will overwrite it
+           VMOV_R2R_NEON(q6, q9);
+           cv += cd; // col
+#  endif
+           VMOV_M2R_NEON(d8, val1);
+           VEOR_NEON(q0);
+           VMOV_M2R_NEON(d9, val3);
+           VMOV_M2R_NEON(d10, val2);
+           VEOR_NEON(q1);
+           VMOV_M2R_NEON(d11, val4);
+           VDUP_NEON(q3, ru);
+           VDUP_NEON(d14, rv);
+           VZIP_NEON(q4, q0);
+           VZIP_NEON(q5, q1);
+           VMOV_R2R_NEON(d9, d0);
+           VMOV_R2R_NEON(d11, d2);
+           // by this point we have all required data in right registers
+           INTERP_256_NEON(q3, q5, q4, q2); // interpolate val1,val2 and val3,val4
+           VSWP_NEON(d9, d12); // move result of val3,val4 interpolation (and c1 if COLMUL is defined) for next step
+           INTERP_256_NEON(q7, q6, q4, q2); // second stage of interpolation, also here c1 and c2 are interpolated
+#  ifdef COLMUL
+           MUL4_SYM_NEON(d8, d9, d4); // do required multiplication
+#  endif
+           VMOV_R2M_NEON(q4, d8, d); // save result to d
+        }
+      else
+        *d = val1;
 # else
       val1 = INTERP_256(ru, val2, val1);
       val3 = INTERP_256(ru, val4, val3);
@@ -102,10 +151,23 @@
 }
 #else
 {
+# ifdef SCALE_USING_NEON
+#  ifdef COLMUL
+#   ifndef COLBLACK
+   // c1 and c2 are constants inside the cycle
+   FPU_NEON;
+   VMOV_M2R_NEON(d10, c1);
+   VEOR_NEON(q0);
+   VMOV_M2R_NEON(d11, c2);
+   VZIP_NEON(q5, q0);
+   VMOV_R2R_NEON(d11, d0);
+#   endif
+#  endif
+# endif
    while (ww > 0)
      {
 # ifdef COLMUL
-# ifndef COLBLACK
+#  ifndef COLBLACK
       DATA32 val1;
 # ifdef COLSAME
 # else
@@ -121,11 +183,27 @@
 # ifdef COLMUL
       val1 = *s; // col
 #  ifdef COLSAME
+#   ifdef SCALE_USING_NEON
+      *d = MUL4_SYM(c1, val1); // XXX: do this in neon
+#   else
       *d = MUL4_SYM(c1, val1);
-#  else
+#   endif
+#  else
+#   ifdef SCALE_USING_NEON
+      FPU_NEON;
+      VMOV_M2R_NEON(d12, val1);
+      VMOV_R2R_NEON(q4, q5);
+      VEOR_NEON(q1);
+      VDUP_NEON(d15, cv >> 16);
+      VZIP_NEON(q6, q1);
+      INTERP_256_NEON(d15, d9, d8, d4); // interpolate c1 and c2
+      MUL4_SYM_NEON(d8, d12, d4); // multiply
+      VMOV_R2M_NEON(q4, d8, d); // save result
+#   else
       cval = INTERP_256((cv >> 16), c2, c1); // col
       *d = MUL4_SYM(cval, val1);
       cv += cd; // col
+#   endif
 #  endif
 # else
       *d = *s;
diff --git a/src/lib/evas/common/evas_scale_smooth.c b/src/lib/evas/common/evas_scale_smooth.c
index 02dbe7d44d..61bda22b0a 100644
--- a/src/lib/evas/common/evas_scale_smooth.c
+++ b/src/lib/evas/common/evas_scale_smooth.c
@@ -97,6 +97,15 @@ scale_calc_a_points(int *p, int s, int d, int c, int cc)
 # include "evas_scale_smooth_scaler.c"
 #endif
 
+#ifdef BUILD_NEON
+# undef SCALE_FUNC
+# undef SCALE_USING_NEON
+# define SCALE_USING_NEON
+# define SCALE_FUNC evas_common_scale_rgba_in_to_out_clip_smooth_neon
+# include "evas_scale_smooth_scaler.c"
+# undef SCALE_USING_NEON
+#endif
+
 #undef SCALE_FUNC
 #define SCALE_FUNC _evas_common_scale_rgba_in_to_out_clip_smooth_c
 #undef SCALE_USING_MMX
@@ -196,6 +205,11 @@ evas_common_scale_rgba_in_to_out_clip_smooth(RGBA_Image *src, RGBA_Image *dst,
    if (mmx)
      cb = evas_common_scale_rgba_in_to_out_clip_smooth_mmx;
    else
+#endif
+#ifdef BUILD_NEON
+   if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+     cb = evas_common_scale_rgba_in_to_out_clip_smooth_neon;
+   else
 #endif
      cb = evas_common_scale_rgba_in_to_out_clip_smooth_c;
 
@@ -222,6 +236,16 @@ evas_common_scale_rgba_smooth_draw(RGBA_Image *src, RGBA_Image *dst, int dst_cli
         src_region_x, src_region_y, src_region_w, src_region_h,
         dst_region_x, dst_region_y, dst_region_w, dst_region_h);
    else
+#endif
+#ifdef BUILD_NEON
+   if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+     _evas_common_scale_rgba_in_to_out_clip_smooth_neon
+       (src, dst,
+        dst_clip_x, dst_clip_y, dst_clip_w, dst_clip_h,
+        mul_col, render_op,
+        src_region_x, src_region_y, src_region_w, src_region_h,
+        dst_region_x, dst_region_y, dst_region_w, dst_region_h);
+   else
 #endif
      _evas_common_scale_rgba_in_to_out_clip_smooth_c
        (src, dst,
@@ -263,6 +287,15 @@ evas_common_scale_rgba_in_to_out_clip_smooth_do(const Cutout_Rects *reuse,
                                                        dst_region_w, dst_region_h);
         else
 # endif
+#ifdef BUILD_NEON
+        if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+          evas_common_scale_rgba_in_to_out_clip_smooth_neon(src, dst, dc,
+                                                            src_region_x, src_region_y,
+                                                            src_region_w, src_region_h,
+                                                            dst_region_x, dst_region_y,
+                                                            dst_region_w, dst_region_h);
+        else
+#endif
           evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
                                                          src_region_x, src_region_y,
                                                          src_region_w, src_region_h,
@@ -287,7 +320,16 @@ evas_common_scale_rgba_in_to_out_clip_smooth_do(const Cutout_Rects *reuse,
                                                          dst_region_w, dst_region_h);
           else
 # endif
-            evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
+#ifdef BUILD_NEON
+          if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+            evas_common_scale_rgba_in_to_out_clip_smooth_neon(src, dst, dc,
+                                                              src_region_x, src_region_y,
+                                                              src_region_w, src_region_h,
+                                                              dst_region_x, dst_region_y,
+                                                              dst_region_w, dst_region_h);
+          else
+#endif
+            evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
                                                            src_region_x, src_region_y,
                                                            src_region_w, src_region_h,
                                                            dst_region_x, dst_region_y,
diff --git a/src/lib/evas/common/evas_scale_smooth_scaler_up.c b/src/lib/evas/common/evas_scale_smooth_scaler_up.c
index e43e0c7a6c..4b21d598dd 100644
--- a/src/lib/evas/common/evas_scale_smooth_scaler_up.c
+++ b/src/lib/evas/common/evas_scale_smooth_scaler_up.c
@@ -172,6 +172,10 @@
    MOV_A2R(ay, mm4)
    pxor_r2r(mm0, mm0);
    MOV_A2R(ALPHA_255, mm5)
+#elif defined SCALE_USING_NEON
+   FPU_NEON;
+   VDUP_NEON(d12, ay);
+   VMOV_I2R_NEON(q2, #255);
 #endif
    pbuf = buf; pbuf_end = buf + dst_clip_w;
    sxx = sxx0;
@@ -210,6 +214,28 @@
                   INTERP_256_R2R(mm4, mm2, mm1, mm5)
                   MOV_R2P(mm1, *pbuf, mm0)
                   pbuf++;
+#elif defined SCALE_USING_NEON
+                  if (p0 | p1 | p2 | p3)
+                    {
+                       FPU_NEON;
+                       VMOV_M2R_NEON(d8, p0);
+                       VEOR_NEON(q0);
+                       VMOV_M2R_NEON(d9, p2);
+                       VMOV_M2R_NEON(d10, p1);
+                       VEOR_NEON(q1);
+                       VMOV_M2R_NEON(d11, p3);
+                       VDUP_NEON(q3, ax);
+                       VZIP_NEON(q4, q0);
+                       VZIP_NEON(q5, q1);
+                       VMOV_R2R_NEON(d9, d0);
+                       VMOV_R2R_NEON(d11, d2);
+                       INTERP_256_NEON(q3, q5, q4, q2);
+                       INTERP_256_NEON(d12, d9, d8, d5);
+                       VMOV_R2M_NEON(q4, d8, pbuf);
+                       pbuf++;
+                    }
+                  else
+                    *pbuf++ = p0;
 #else
                   if (p0 | p1)
                     p0 = INTERP_256(ax, p1, p0);
diff --git a/src/lib/evas/include/evas_blend_ops.h b/src/lib/evas/include/evas_blend_ops.h
index 0a78843579..3ae94379ec 100644
--- a/src/lib/evas/include/evas_blend_ops.h
+++ b/src/lib/evas/include/evas_blend_ops.h
@@ -186,6 +186,64 @@ extern const DATA32 ALPHA_256;
 
 #endif
 
+/* some useful NEON macros */
+
+#ifdef BUILD_NEON
+#define FPU_NEON \
+ __asm__ __volatile__(".fpu neon \n\t");
+
+/* copy reg1 to reg2 */
+#define VMOV_R2R_NEON(reg1, reg2) \
+ __asm__ __volatile__("vmov " #reg1 ", " #reg2 " \n\t" ::: #reg1);
+
+/* copy 32bit value to lower bits of register reg */
+#define VMOV_M2R_NEON(reg, value) \
+ __asm__ __volatile__("vmov.32 " #reg "[0], %[val] \n\t" :: [val] "r" (value) : #reg);
+
+/* save 32bit value from lower 64 bits of register regq to memory location */
+/* pointed to by pointer, using 64bit register regd as temporary location */
+#define VMOV_R2M_NEON(regq, regd, pointer) \
+ __asm__ __volatile__("vqmovn.u16 " #regd ", " #regq " \n\t" \
+                      "vst1.32 {" #regd "[0]}, [%[p]] \n\t" :: [p] "r" (pointer) : #regd, "memory");
+
+/* spread constant imm in register reg */
+#define VMOV_I2R_NEON(reg, imm) \
+ __asm__ __volatile__("vmov.i16 " #reg ", " #imm " \n\t" ::: #reg);
+
+/* spread value in register reg */
+#define VDUP_NEON(reg, value) \
+ __asm__ __volatile__("vdup.16 " #reg ", %[val] \n\t" :: [val] "r" (value) : #reg);
+
+/* interleave contents of reg1 and reg2 */
+#define VZIP_NEON(reg1, reg2) \
+ __asm__ __volatile__("vzip.8 " #reg1 ", " #reg2 " \n\t" ::: #reg1, #reg2);
+
+/* swap contents of two registers */
+#define VSWP_NEON(reg1, reg2) \
+ __asm__ __volatile__("vswp " #reg1 ", " #reg2 " \n\t" ::: #reg1, #reg2);
+
+/* set register to zero */
+#define VEOR_NEON(reg) \
+ __asm__ __volatile__("veor " #reg ", " #reg ", " #reg " \n\t" ::: #reg);
+
+/* do interpolation of every channel RGBA, result is contained in regy */
+#define INTERP_256_NEON(rega, regx, regy, reg255) \
+ __asm__ __volatile__("vsub.i16 " #regx ", " #regx ", " #regy " \n\t" \
+                      "vmul.u16 " #regx ", " #regx ", " #rega " \n\t" \
+                      "vsri.16 " #regx ", " #regx ", #8 \n\t" \
+                      "vadd.i16 " #regx ", " #regx ", " #regy " \n\t" \
+                      "vand " #regy ", " #regx ", " #reg255 " \n\t" \
+                      ::: #regx, #regy );
+
+/* multiply every channel of regx and regy */
+#define MUL4_SYM_NEON(regx, regy, reg255) \
+ __asm__ __volatile__("vmul.u16 " #regx ", " #regx ", " #regy " \n\t" \
+                      "vadd.i16 " #regx ", " #regx ", " #reg255 " \n\t" \
+                      "vsri.16 " #regx ", " #regx ", #8 \n\t" \
+                      "vand " #regx ", " #regx ", " #reg255 " \n\t" \
+                      ::: #regx );
+
+#endif
+
 /* some useful SSE3 inline functions */