Add neon for upscaling and map routines in evas.

This commit is contained in:
Yury Usischev 2013-08-02 18:06:55 +09:00 committed by Carsten Haitzler (Rasterman)
parent a3165bff15
commit bd6de4ba8c
8 changed files with 220 additions and 7 deletions

View File

@ -125,6 +125,7 @@ Patryk Kaczmarek <patryk.k@samsung.com>
Zbigniew Kosinski <z.kosinski@samsung.com>
Paulo Cavalcanti <paulo.cavalcanti@linux.intel.com>
Jean-Philippe Andre <jp.andre@samsung.com>
Yury Usischev <y.usishchev@samsung.com>
Ecore

View File

@ -1,3 +1,7 @@
2013-08-02 Yury Usischev
* Add neon optimizations for several scaling/map routines in evas
2013-08-02 Cedric Bail
* Evas: change mapping policy for image loader (RANDOM during header,

1
NEWS
View File

@ -201,6 +201,7 @@ Improvements:
- Use eo array of callbacks to reduce callbacks memory footprint of Evas_Object_Box and Evas_Object_Table.
- Optimized path for when map use the same color for all corner.
- Asynchronous preload of GL texture.
- Add neon assembly for upscaling and map routines
* Ecore_Con:
- Rebase dns.c against upstream
* Edje:

View File

@ -19,6 +19,9 @@
#ifdef SCALE_USING_MMX
pxor_r2r(mm0, mm0);
MOV_A2R(ALPHA_255, mm5)
#elif defined SCALE_USING_NEON
FPU_NEON;
VMOV_I2R_NEON(q2, #255);
#endif
line = &(spans[y - ystart]);

View File

@ -1,13 +1,27 @@
#ifdef SMOOTH
{
# ifdef SCALE_USING_MMX
# ifdef COLMUL
# ifdef COLSAME
# ifdef COLMUL
# ifdef COLSAME
MOV_P2R(c1, mm7, mm0); // col
# endif
# endif
# endif
while (ww > 0)
# endif
# ifdef SCALE_USING_NEON
# ifdef COLMUL
# ifndef COLBLACK
// this part can be done here as c1 and c2 are constants in the cycle
FPU_NEON;
VMOV_M2R_NEON(d18, c1);
VEOR_NEON(q8);
VMOV_M2R_NEON(d19, c2);
VZIP_NEON(q9, q8);
VMOV_R2R_NEON(d19, d16);
// here we have c1 and c2 spread through q9 register
# endif
# endif
# endif
while (ww > 0)
{
# ifdef COLBLACK
*d = 0xff000000; // col
@ -77,6 +91,41 @@
# endif
# endif
MOV_R2P(mm1, *d, mm0);
# elif defined SCALE_USING_NEON
// not sure if we need this condition, but it doesn't affect the result
if (val1 | val2 | val3 | val4)
{
FPU_NEON;
# ifdef COLMUL
// initialize alpha for interpolation of c1 and c2
VDUP_NEON(d15, cv >> 16);
// copy c1 and c2 as algorithm will overwrite it
VMOV_R2R_NEON(q6, q9);
cv += cd; // col
# endif
VMOV_M2R_NEON(d8, val1);
VEOR_NEON(q0);
VMOV_M2R_NEON(d9, val3);
VMOV_M2R_NEON(d10, val2);
VEOR_NEON(q1);
VMOV_M2R_NEON(d11, val4);
VDUP_NEON(q3, ru);
VDUP_NEON(d14, rv);
VZIP_NEON(q4, q0);
VZIP_NEON(q5, q1);
VMOV_R2R_NEON(d9, d0);
VMOV_R2R_NEON(d11, d2);
// by this point we have all required data in right registers
INTERP_256_NEON(q3, q5, q4, q2); // interpolate val1,val2 and val3,val4
VSWP_NEON(d9, d12); // move result of val3,val4 interpolation (and c1 if COLMUL is defined) for next step
INTERP_256_NEON(q7, q6, q4, q2); // second stage of interpolation, also here c1 and c2 are interpolated
# ifdef COLMUL
MUL4_SYM_NEON(d8, d9, d4); // do required multiplication
# endif
VMOV_R2M_NEON(q4, d8, d); // save result to d
}
else
*d = val1;
# else
val1 = INTERP_256(ru, val2, val1);
val3 = INTERP_256(ru, val4, val3);
@ -102,10 +151,23 @@
}
#else
{
# ifdef SCALE_USING_NEON
# ifdef COLMUL
# ifndef COLBLACK
// c1 and c2 are constants inside the cycle
FPU_NEON;
VMOV_M2R_NEON(d10, c1);
VEOR_NEON(q0);
VMOV_M2R_NEON(d11, c2);
VZIP_NEON(q5, q0);
VMOV_R2R_NEON(d11, d0);
# endif
# endif
# endif
while (ww > 0)
{
# ifdef COLMUL
# ifndef COLBLACK
# ifndef COLBLACK
DATA32 val1;
# ifdef COLSAME
# else
@ -121,11 +183,27 @@
# ifdef COLMUL
val1 = *s; // col
# ifdef COLSAME
# ifdef SCALE_USING_NEON
*d = MUL4_SYM(c1, val1);
# else
# else
*d = MUL4_SYM(c1, val1); // XXX: do this in neon
# endif
# else
# ifdef SCALE_USING_NEON
FPU_NEON;
VMOV_M2R_NEON(d12, val1);
VMOV_R2R_NEON(q4, q5);
VEOR_NEON(q1);
VDUP_NEON(d15, cv >> 16);
VZIP_NEON(q6, q1);
INTERP_256_NEON(d15, d9, d8, d4); // interpolate c1 and c2
MUL4_SYM_NEON(d8, d12, d4); // multiply
VMOV_R2M_NEON(q4, d8, d); // save result
# else
cval = INTERP_256((cv >> 16), c2, c1); // col
*d = MUL4_SYM(cval, val1);
cv += cd; // col
# endif
# endif
# else
*d = *s;

View File

@ -97,6 +97,15 @@ scale_calc_a_points(int *p, int s, int d, int c, int cc)
# include "evas_scale_smooth_scaler.c"
#endif
#ifdef BUILD_NEON
# undef SCALE_FUNC
# undef SCALE_USING_NEON
# define SCALE_USING_NEON
# define SCALE_FUNC evas_common_scale_rgba_in_to_out_clip_smooth_neon
# include "evas_scale_smooth_scaler.c"
# undef SCALE_USING_NEON
#endif
#undef SCALE_FUNC
#define SCALE_FUNC _evas_common_scale_rgba_in_to_out_clip_smooth_c
#undef SCALE_USING_MMX
@ -196,6 +205,11 @@ evas_common_scale_rgba_in_to_out_clip_smooth(RGBA_Image *src, RGBA_Image *dst,
if (mmx)
cb = evas_common_scale_rgba_in_to_out_clip_smooth_mmx;
else
#endif
#ifdef BUILD_NEON
if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
cb = evas_common_scale_rgba_in_to_out_clip_smooth_neon;
else
#endif
cb = evas_common_scale_rgba_in_to_out_clip_smooth_c;
@ -222,6 +236,16 @@ evas_common_scale_rgba_smooth_draw(RGBA_Image *src, RGBA_Image *dst, int dst_cli
src_region_x, src_region_y, src_region_w, src_region_h,
dst_region_x, dst_region_y, dst_region_w, dst_region_h);
else
#endif
#ifdef BUILD_NEON
if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
_evas_common_scale_rgba_in_to_out_clip_smooth_neon
(src, dst,
dst_clip_x, dst_clip_y, dst_clip_w, dst_clip_h,
mul_col, render_op,
src_region_x, src_region_y, src_region_w, src_region_h,
dst_region_x, dst_region_y, dst_region_w, dst_region_h);
else
#endif
_evas_common_scale_rgba_in_to_out_clip_smooth_c
(src, dst,
@ -263,6 +287,15 @@ evas_common_scale_rgba_in_to_out_clip_smooth_do(const Cutout_Rects *reuse,
dst_region_w, dst_region_h);
else
# endif
#ifdef BUILD_NEON
if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
evas_common_scale_rgba_in_to_out_clip_smooth_neon(src, dst, dc,
src_region_x, src_region_y,
src_region_w, src_region_h,
dst_region_x, dst_region_y,
dst_region_w, dst_region_h);
else
#endif
evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
src_region_x, src_region_y,
src_region_w, src_region_h,
@ -287,7 +320,16 @@ evas_common_scale_rgba_in_to_out_clip_smooth_do(const Cutout_Rects *reuse,
dst_region_w, dst_region_h);
else
# endif
evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
#ifdef BUILD_NEON
if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
evas_common_scale_rgba_in_to_out_clip_smooth_neon(src, dst, dc,
src_region_x, src_region_y,
src_region_w, src_region_h,
dst_region_x, dst_region_y,
dst_region_w, dst_region_h);
else
#endif
evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
src_region_x, src_region_y,
src_region_w, src_region_h,
dst_region_x, dst_region_y,

View File

@ -172,6 +172,10 @@
MOV_A2R(ay, mm4)
pxor_r2r(mm0, mm0);
MOV_A2R(ALPHA_255, mm5)
#elif defined SCALE_USING_NEON
FPU_NEON;
VDUP_NEON(d12, ay);
VMOV_I2R_NEON(q2, #255);
#endif
pbuf = buf; pbuf_end = buf + dst_clip_w;
sxx = sxx0;
@ -210,6 +214,28 @@
INTERP_256_R2R(mm4, mm2, mm1, mm5)
MOV_R2P(mm1, *pbuf, mm0)
pbuf++;
#elif defined SCALE_USING_NEON
if (p0 | p1 | p2 | p3)
{
FPU_NEON;
VMOV_M2R_NEON(d8, p0);
VEOR_NEON(q0);
VMOV_M2R_NEON(d9, p2);
VMOV_M2R_NEON(d10, p1);
VEOR_NEON(q1);
VMOV_M2R_NEON(d11, p3);
VDUP_NEON(q3, ax);
VZIP_NEON(q4, q0);
VZIP_NEON(q5, q1);
VMOV_R2R_NEON(d9, d0);
VMOV_R2R_NEON(d11, d2);
INTERP_256_NEON(q3, q5, q4, q2);
INTERP_256_NEON(d12, d9, d8, d5);
VMOV_R2M_NEON(q4, d8, pbuf);
pbuf++;
}
else
*pbuf++ = p0;
#else
if (p0 | p1)
p0 = INTERP_256(ax, p1, p0);

View File

@ -186,6 +186,64 @@ extern const DATA32 ALPHA_256;
#endif
/* some useful NEON macros */
#ifdef BUILD_NEON
/* NOTE(review): each macro below expands to an independent __asm__ statement
 * that passes data through FIXED NEON registers (d8..d15 / q0..q9) instead of
 * asm operands. Correctness of a macro sequence relies on the compiler not
 * using those registers between the statements -- presumably true for the
 * code generated here, but TODO confirm at all optimization levels. */
/* emit an ".fpu neon" directive so the assembler accepts the NEON
 * mnemonics used by the following inline-asm statements */
#define FPU_NEON \
__asm__ __volatile__(".fpu neon \n\t");
/* copy reg2 into reg1 ("vmov dest, src"); reg1 is overwritten.
 * (The original comment had the direction reversed -- see e.g. the
 * VMOV_R2R_NEON(q6, q9) call site, which saves q9's contents into q6.) */
#define VMOV_R2R_NEON(reg1, reg2) \
__asm__ __volatile__("vmov " #reg1 ", " #reg2 " \n\t" ::: #reg1);
/* insert 32bit value into lane 0 of register reg ("vmov.32 reg[0], ...");
 * the other lanes of reg are left untouched */
#define VMOV_M2R_NEON(reg, value) \
__asm__ __volatile__("vmov.32 " #reg "[0], %[val] \n\t" :: [val] "r" (value) : #reg);
/* save 32bit value from lower 64 bits of register regq to memory location */
/* pointed to by pointer, using 64bit register regd as temporary location */
/* (vqmovn.u16 saturating-narrows the 16bit lanes of regq down to 8bit
 * channels in regd, then the low 32 bits are stored through the pointer) */
#define VMOV_R2M_NEON(regq, regd, pointer) \
__asm__ __volatile__("vqmovn.u16 " #regd ", " #regq " \n\t" \
"vst1.32 {" #regd "[0]}, [%[p]] \n\t" :: [p] "r" (pointer) : #regd, "memory");
/* replicate the immediate imm into every 16bit lane of register reg */
#define VMOV_I2R_NEON(reg, imm) \
__asm__ __volatile__("vmov.i16 " #reg ", " #imm " \n\t" ::: #reg);
/* replicate (the low 16 bits of) value into every 16bit lane of reg */
#define VDUP_NEON(reg, value) \
__asm__ __volatile__("vdup.16 " #reg ", %[val] \n\t" :: [val] "r" (value) : #reg);
/* interleave the 8bit elements of reg1 and reg2 (vzip.8);
 * both registers are modified */
#define VZIP_NEON(reg1, reg2) \
__asm__ __volatile__("vzip.8 " #reg1 ", " #reg2 " \n\t" ::: #reg1 , #reg2);
/* swap contents of two registers */
#define VSWP_NEON(reg1, reg2) \
__asm__ __volatile__("vswp " #reg1 ", " #reg2 " \n\t" ::: #reg1 , #reg2);
/* set register to zero (XOR with itself) */
#define VEOR_NEON(reg) \
__asm__ __volatile__("veor " #reg ", " #reg ", " #reg " \n\t" ::: #reg);
/* per-channel interpolation, NEON counterpart of INTERP_256():
 * channels are held one per 16bit lane; for each lane the result is
 *   regy = (regy + (((regx - regy) * rega) >> 8)) & 0xff
 * where reg255 holds 0x00ff in every lane (the final vand masks off the
 * high-byte residue left by vsri). rega and reg255 are read-only;
 * regx is clobbered as scratch and the result lands in regy. */
#define INTERP_256_NEON(rega, regx, regy, reg255) \
__asm__ __volatile__("vsub.i16 " #regx ", " #regx ", " #regy " \n\t" \
"vmul.u16 " #regx ", " #regx ", " #rega " \n\t" \
"vsri.16 " #regx ", " #regx ", #8 \n\t" \
"vadd.i16 " #regx ", " #regx ", " #regy " \n\t" \
"vand " #regy ", " #regx ", " #reg255 " \n\t" \
::: #regx, #regy );
/* per-channel symmetric multiply, NEON counterpart of MUL4_SYM():
 * for each 16bit lane: regx = ((regx * regy + 255) >> 8) & 0xff
 * (reg255 holds 0x00ff per lane and doubles as the +255 rounding term
 * and the final mask). Result is left in regx; regy is read-only. */
#define MUL4_SYM_NEON(regx, regy, reg255) \
__asm__ __volatile__("vmul.u16 " #regx ", " #regx ", " #regy " \n\t" \
"vadd.i16 " #regx ", " #regx ", " #reg255 " \n\t" \
"vsri.16 " #regx ", " #regx ", #8 \n\t" \
"vand " #regx ", " #regx ", " #reg255 " \n\t" \
::: #regx );
#endif
/* some useful SSE3 inline functions */