ector: cleanup default backend drawer.

2015-09-12 07:31:49 +02:00 · 2015-09-12 07:31:49 +02:00 · b3dc08bf8b
parent 576331955e
commit b3dc08bf8b
4 changed files with 158 additions and 130 deletions
--- a/src/lib/ector/software/ector_drawhelper.c
+++ b/src/lib/ector/software/ector_drawhelper.c
@ -15,28 +15,30 @@
 */

 /*
-  result = s + d * sia  
+  result = s + d * sia
  dest = (s + d * sia) * ca + d * cia
       = s * ca + d * (sia * ca + cia)
       = s * ca + d * (1 - sa*ca)
 */
-void
-comp_func_solid_source_over(uint *dest, int length, uint color, uint const_alpha)
+
+static void
+_comp_func_solid_source_over(uint *dest, int length, uint color, uint const_alpha)
 {
   int ialpha, i;
+
   if (const_alpha != 255)
     color = BYTE_MUL(color, const_alpha);
-   ialpha = _alpha(~color);
+   ialpha = alpha_inverse(color);
   for (i = 0; i < length; ++i)
     dest[i] = color + BYTE_MUL(dest[i], ialpha);
 }

-
 static void
-comp_func_source_over(uint *dest, const uint *src, int length, uint color, uint const_alpha)
+_comp_func_source_over(uint *dest, const uint *src, int length, uint color, uint const_alpha)
 {
   int i;
   uint s, sc, sia;
+
   if (const_alpha != 255)
     color = BYTE_MUL(color, const_alpha);

@ -49,7 +51,7 @@ comp_func_source_over(uint *dest, const uint *src, int length, uint color, uint
               dest[i] = s;
             else if (s != 0)
               {
-                  sia = _alpha(~s);
+                  sia = alpha_inverse(s);
                  dest[i] = s + BYTE_MUL(dest[i], sia);
               }
          }
@ -60,7 +62,7 @@ comp_func_source_over(uint *dest, const uint *src, int length, uint color, uint
          {
             s = src[i];
             sc = ECTOR_MUL4_SYM(color, s);
-             sia = _alpha(~sc);
+             sia = alpha_inverse(sc);
             dest[i] = sc + BYTE_MUL(dest[i], sia);
          }
     }
@ -71,10 +73,14 @@ comp_func_source_over(uint *dest, const uint *src, int length, uint color, uint
  dest = s * ca + d * cia
 */
 static void
-comp_func_solid_source(uint *dest, int length, uint color, uint const_alpha)
+_comp_func_solid_source(uint *dest, int length, uint color, uint const_alpha)
 {
   int ialpha, i;
-   if (const_alpha == 255) _ector_memfill(dest, length, color);
+
+   if (const_alpha == 255)
+     {
+        _ector_memfill(dest, length, color);
+     }
   else
     {
        ialpha = 255 - const_alpha;
@ -85,20 +91,23 @@ comp_func_solid_source(uint *dest, int length, uint color, uint const_alpha)
 }

 static void
-comp_func_source(uint *dest, const uint *src, int length, uint color, uint const_alpha)
+_comp_func_source(uint *dest, const uint *src, int length, uint color, uint const_alpha)
 {
   int i, ialpha;
   uint src_color;
+
   if (color == 0xffffffff) // No color multiplier
     {
        if (const_alpha == 255)
-          memcpy(dest, src, length * sizeof(uint));
+          {
+             memcpy(dest, src, length * sizeof(uint));
+          }
        else
-         {
-            ialpha = 255 - const_alpha;
-            for (i = 0; i < length; ++i)
-              dest[i] = INTERPOLATE_PIXEL_256(src[i], const_alpha, dest[i], ialpha);
-         }
+          {
+             ialpha = 255 - const_alpha;
+             for (i = 0; i < length; ++i)
+               dest[i] = INTERPOLATE_PIXEL_256(src[i], const_alpha, dest[i], ialpha);
+          }
     }
   else
     {
@ -109,24 +118,24 @@ comp_func_source(uint *dest, const uint *src, int length, uint color, uint const
          }
        else
          {
-            ialpha = 255 - const_alpha;
-            for (i = 0; i < length; ++i)
-              {
-                 src_color = ECTOR_MUL4_SYM(src[i], color);
-                 dest[i] = INTERPOLATE_PIXEL_256(src_color, const_alpha, dest[i], ialpha);
-            }
+             ialpha = 255 - const_alpha;
+             for (i = 0; i < length; ++i)
+               {
+                  src_color = ECTOR_MUL4_SYM(src[i], color);
+                  dest[i] = INTERPOLATE_PIXEL_256(src_color, const_alpha, dest[i], ialpha);
+               }
          }
     }
 }

 RGBA_Comp_Func_Solid func_for_mode_solid[ECTOR_ROP_LAST] = {
-        comp_func_solid_source_over,
-        comp_func_solid_source
+  _comp_func_solid_source_over,
+  _comp_func_solid_source
 };

 RGBA_Comp_Func func_for_mode[ECTOR_ROP_LAST] = {
-        comp_func_source_over,
-        comp_func_source
+  _comp_func_source_over,
+  _comp_func_source
 };

 RGBA_Comp_Func_Solid
@ -146,13 +155,10 @@ RGBA_Comp_Func ector_comp_func_span_get(Ector_Rop op, uint color, Eina_Bool src_
     {
        if (op == ECTOR_ROP_BLEND) op = ECTOR_ROP_COPY;
     }
+
   return func_for_mode[op];
 }

-extern void init_drawhelper_gradient();
-extern void init_draw_helper_sse2();
-extern void init_draw_helper_neon();
-
 void init_draw_helper()
 {
   init_drawhelper_gradient();
--- a/src/lib/ector/software/ector_drawhelper_neon.c
+++ b/src/lib/ector/software/ector_drawhelper_neon.c
@ -7,6 +7,7 @@

 #ifdef BUILD_NEON
 #include <arm_neon.h>
+
 static void
 comp_func_solid_source_over_neon(uint * __restrict dest, int length, uint color, uint const_alpha)
 {
@ -39,8 +40,8 @@ comp_func_solid_source_over_neon(uint * __restrict dest, int length, uint color,
   // alpha can only be 0 if color is 0x0. In that case we can just return.
   // Otherwise we can assume alpha != 0. This allows more optimization in
   // NEON code.
-   if(!color)
-      return;
+   if (!color)
+     return;

   DATA32 *start = dest;
   int size = length;
@ -53,46 +54,47 @@ comp_func_solid_source_over_neon(uint * __restrict dest, int length, uint color,
   c_32x4 = vdupq_n_u32(color);

   while (start < end)
-   {
-      d0_32x4 = vld1q_u32(start);
-      d1_32x4 = vld1q_u32(start+4);
-      d0_8x16 = vreinterpretq_u8_u32(d0_32x4);
-      d1_8x16 = vreinterpretq_u8_u32(d1_32x4);
+     {
+        d0_32x4 = vld1q_u32(start);
+        d1_32x4 = vld1q_u32(start+4);
+        d0_8x16 = vreinterpretq_u8_u32(d0_32x4);
+        d1_8x16 = vreinterpretq_u8_u32(d1_32x4);

-      d00_8x8 = vget_low_u8(d0_8x16);
-      d01_8x8 = vget_high_u8(d0_8x16);
-      d10_8x8 = vget_low_u8(d1_8x16);
-      d11_8x8 = vget_high_u8(d1_8x16);
+        d00_8x8 = vget_low_u8(d0_8x16);
+        d01_8x8 = vget_high_u8(d0_8x16);
+        d10_8x8 = vget_low_u8(d1_8x16);
+        d11_8x8 = vget_high_u8(d1_8x16);

-      temp00_16x8 = vmull_u8(alpha_8x8, d00_8x8);
-      temp01_16x8 = vmull_u8(alpha_8x8, d01_8x8);
-      temp10_16x8 = vmull_u8(alpha_8x8, d10_8x8);
-      temp11_16x8 = vmull_u8(alpha_8x8, d11_8x8);
+        temp00_16x8 = vmull_u8(alpha_8x8, d00_8x8);
+        temp01_16x8 = vmull_u8(alpha_8x8, d01_8x8);
+        temp10_16x8 = vmull_u8(alpha_8x8, d10_8x8);
+        temp11_16x8 = vmull_u8(alpha_8x8, d11_8x8);

-      temp00_8x8 = vshrn_n_u16(temp00_16x8,8);
-      temp01_8x8 = vshrn_n_u16(temp01_16x8,8);
-      temp10_8x8 = vshrn_n_u16(temp10_16x8,8);
-      temp11_8x8 = vshrn_n_u16(temp11_16x8,8);
+        temp00_8x8 = vshrn_n_u16(temp00_16x8,8);
+        temp01_8x8 = vshrn_n_u16(temp01_16x8,8);
+        temp10_8x8 = vshrn_n_u16(temp10_16x8,8);
+        temp11_8x8 = vshrn_n_u16(temp11_16x8,8);

-      temp0_8x16 = vcombine_u8(temp00_8x8, temp01_8x8);
-      temp1_8x16 = vcombine_u8(temp10_8x8, temp11_8x8);
+        temp0_8x16 = vcombine_u8(temp00_8x8, temp01_8x8);
+        temp1_8x16 = vcombine_u8(temp10_8x8, temp11_8x8);

-      temp0_32x4 = vreinterpretq_u32_u8(temp0_8x16);
-      temp1_32x4 = vreinterpretq_u32_u8(temp1_8x16);
+        temp0_32x4 = vreinterpretq_u32_u8(temp0_8x16);
+        temp1_32x4 = vreinterpretq_u32_u8(temp1_8x16);

-      d0_32x4 = vaddq_u32(c_32x4, temp0_32x4);
-      d1_32x4 = vaddq_u32(c_32x4, temp1_32x4);
+        d0_32x4 = vaddq_u32(c_32x4, temp0_32x4);
+        d1_32x4 = vaddq_u32(c_32x4, temp1_32x4);
+
+        vst1q_u32(start, d0_32x4);
+        vst1q_u32(start+4, d1_32x4);
+        start+=8;
+     }

-      vst1q_u32(start, d0_32x4);
-      vst1q_u32(start+4, d1_32x4);
-      start+=8;
-   }
   end += (size & 7);
   while (start <  end)
-   {
-      *start = color + MUL_256(alpha, *start);
-      start++;
-   }
+     {
+        *start = color + MUL_256(alpha, *start);
+        start++;
+     }
 }

 /* Note: Optimisation is based on keeping _dest_ aligned: else it's a pair of
@ -132,6 +134,9 @@ comp_func_source_over_sse2(uint * __restrict dest, const uint * __restrict src,
   uint8x8_t s1_8x8;
   uint8x8_t sc0_8x8;
   uint8x8_t sc1_8x8;
+   int size;
+   DATA32 *start;
+   DATA32 *end;

   if (const_alpha != 255)
     color = BYTE_MUL(color, const_alpha);
@ -143,69 +148,69 @@ comp_func_source_over_sse2(uint * __restrict dest, const uint * __restrict src,
   x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
   x1_8x16 = vdupq_n_u8(0x1);
   x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
-   DATA32 *start = dest;
-   int size = l;
-   DATA32 *end = start + (size & ~3);
+   start = dest;
+   size = l;
+   end = start + (size & ~3);
+
   while (start < end)
-   {
+     {
+        s_32x4 = vld1q_u32(src);
+        s_8x16 = vreinterpretq_u8_u32(s_32x4);

-      s_32x4 = vld1q_u32(src);
-      s_8x16 = vreinterpretq_u8_u32(s_32x4);
+        d_32x4 = vld1q_u32(start);
+        d_8x16 = vreinterpretq_u8_u32(d_32x4);
+        d0_8x8 = vget_low_u8(d_8x16);
+        d1_8x8 = vget_high_u8(d_8x16);

-      d_32x4 = vld1q_u32(start);
-      d_8x16 = vreinterpretq_u8_u32(d_32x4);
-      d0_8x8 = vget_low_u8(d_8x16);
-      d1_8x8 = vget_high_u8(d_8x16);
+        s0_8x8 = vget_low_u8(s_8x16);
+        s1_8x8 = vget_high_u8(s_8x16);

-      s0_8x8 = vget_low_u8(s_8x16);
-      s1_8x8 = vget_high_u8(s_8x16);
+        sc0_16x8 = vmull_u8(s0_8x8, c_8x8);
+        sc1_16x8 = vmull_u8(s1_8x8, c_8x8);
+        sc0_16x8 = vaddq_u16(sc0_16x8, x255_16x8);
+        sc1_16x8 = vaddq_u16(sc1_16x8, x255_16x8);
+        sc0_8x8 = vshrn_n_u16(sc0_16x8, 8);
+        sc1_8x8 = vshrn_n_u16(sc1_16x8, 8);
+        sc_8x16 = vcombine_u8(sc0_8x8, sc1_8x8);

-      sc0_16x8 = vmull_u8(s0_8x8, c_8x8);
-      sc1_16x8 = vmull_u8(s1_8x8, c_8x8);
-      sc0_16x8 = vaddq_u16(sc0_16x8, x255_16x8);
-      sc1_16x8 = vaddq_u16(sc1_16x8, x255_16x8);
-      sc0_8x8 = vshrn_n_u16(sc0_16x8, 8);
-      sc1_8x8 = vshrn_n_u16(sc1_16x8, 8);
-      sc_8x16 = vcombine_u8(sc0_8x8, sc1_8x8);
+        alpha_32x4 = vreinterpretq_u32_u8(sc_8x16);
+        alpha_32x4 = vshrq_n_u32(alpha_32x4, 24);
+        alpha_32x4 = vmulq_u32(x1_32x4, alpha_32x4);
+        alpha_8x16 = vreinterpretq_u8_u32(alpha_32x4);
+        alpha_8x16 = vsubq_u8(x0_8x16, alpha_8x16);
+        alpha0_8x8 = vget_low_u8(alpha_8x16);
+        alpha1_8x8 = vget_high_u8(alpha_8x16);

-      alpha_32x4 = vreinterpretq_u32_u8(sc_8x16);
-      alpha_32x4 = vshrq_n_u32(alpha_32x4, 24);
-      alpha_32x4 = vmulq_u32(x1_32x4, alpha_32x4);
-      alpha_8x16 = vreinterpretq_u8_u32(alpha_32x4);
-      alpha_8x16 = vsubq_u8(x0_8x16, alpha_8x16);
-      alpha0_8x8 = vget_low_u8(alpha_8x16);
-      alpha1_8x8 = vget_high_u8(alpha_8x16);
+        ad0_16x8 = vmull_u8(alpha0_8x8, d0_8x8);
+        ad1_16x8 = vmull_u8(alpha1_8x8, d1_8x8);
+        ad0_8x8 = vshrn_n_u16(ad0_16x8,8);
+        ad1_8x8 = vshrn_n_u16(ad1_16x8,8);
+        ad_8x16 = vcombine_u8(ad0_8x8, ad1_8x8);
+        ad_32x4 = vreinterpretq_u32_u8(ad_8x16);

-      ad0_16x8 = vmull_u8(alpha0_8x8, d0_8x8);
-      ad1_16x8 = vmull_u8(alpha1_8x8, d1_8x8);
-      ad0_8x8 = vshrn_n_u16(ad0_16x8,8);
-      ad1_8x8 = vshrn_n_u16(ad1_16x8,8);
-      ad_8x16 = vcombine_u8(ad0_8x8, ad1_8x8);
-      ad_32x4 = vreinterpretq_u32_u8(ad_8x16);
+        alpha_32x4 = vreinterpretq_u32_u8(alpha_8x16);
+        cond_32x4 = vceqq_u32(alpha_32x4, x0_32x4);
+        ad_32x4 = vbslq_u32(cond_32x4, d_32x4 , ad_32x4);

-      alpha_32x4 = vreinterpretq_u32_u8(alpha_8x16);
-      cond_32x4 = vceqq_u32(alpha_32x4, x0_32x4);
-      ad_32x4 = vbslq_u32(cond_32x4, d_32x4 , ad_32x4);
+        sc_32x4 = vreinterpretq_u32_u8(sc_8x16);
+        d_32x4 = vaddq_u32(sc_32x4, ad_32x4);

-      sc_32x4 = vreinterpretq_u32_u8(sc_8x16);
-      d_32x4 = vaddq_u32(sc_32x4, ad_32x4);
+        vst1q_u32(start, d_32x4);

-      vst1q_u32(start, d_32x4);
+        src+=4;
+        start+=4;
+     }

-      src+=4;
-      start+=4;
-   }
   end += (size & 3);
   while (start <  end)
-   {
-      DATA32 sc = MUL4_SYM(color, *s);
-      DATA32 alpha = 256 - (sc >> 24);
-      *start = sc + MUL_256(alpha, *start);
-      start++;
-      src++;
-   }
+     {
+        DATA32 sc = MUL4_SYM(color, *s);
+        DATA32 alpha = 256 - (sc >> 24);
+        *start = sc + MUL_256(alpha, *start);
+        start++;
+        src++;
+     }
 }
-
 #endif

 void
--- a/src/lib/ector/software/ector_drawhelper_private.h
+++ b/src/lib/ector/software/ector_drawhelper_private.h
@ -1,9 +1,7 @@
 #ifndef ECTOR_DRAWHELPER_PRIVATE_H
 #define ECTOR_DRAWHELPER_PRIVATE_H

-#ifdef HAVE_CONFIG_H
-# include <config.h>
-#endif
+#include "ector_private.h"

 #ifndef MIN
 #define MIN( a, b )  ( (a) < (b) ? (a) : (b) )
@ -17,11 +15,6 @@
 typedef unsigned int uint;
 #endif

-static inline int _alpha(uint c)
-{
-   return c>>24;
-}
-
 #define ECTOR_ARGB_JOIN(a,r,g,b) \
        (((a) << 24) + ((r) << 16) + ((g) << 8) + (b))

@ -53,6 +46,13 @@ static inline int _alpha(uint c)
      } \
   }

+static inline int
+alpha_inverse(int color)
+{
+   color = ~color;
+   return A_VAL(&color);
+}
+
 static inline void
 _ector_memfill(uint *dest, int length, uint value)
 {
@ -89,9 +89,14 @@ INTERPOLATE_PIXEL_256(uint x, uint a, uint y, uint b)

 typedef void (*RGBA_Comp_Func)(uint *dest, const uint *src, int length, uint mul_col, uint const_alpha);
 typedef void (*RGBA_Comp_Func_Solid)(uint *dest, int length, uint color, uint const_alpha);
+
 extern RGBA_Comp_Func_Solid func_for_mode_solid[ECTOR_ROP_LAST];
 extern RGBA_Comp_Func func_for_mode[ECTOR_ROP_LAST];

+void init_drawhelper_gradient();
+void init_draw_helper_sse2();
+void init_draw_helper_neon();
+
 void init_draw_helper();

 RGBA_Comp_Func_Solid ector_comp_func_solid_span_get(Ector_Rop op, uint color);
--- a/src/lib/ector/software/ector_drawhelper_sse2.c
+++ b/src/lib/ector/software/ector_drawhelper_sse2.c
@ -89,6 +89,7 @@ v4_mul_color_sse2(__m128i x, __m128i y)
 {
   const __m128i zero = _mm_setzero_si128();
   const __m128i sym4_mask = _mm_set_epi32(0x00FF00FF, 0x000000FF, 0x00FF00FF, 0x000000FF);
+
   __m128i x_l = _mm_unpacklo_epi8(x, zero);
   __m128i x_h = _mm_unpackhi_epi8(x, zero);

@ -111,6 +112,7 @@ static inline __m128i
 v4_ialpha_sse2(__m128i c)
 {
   __m128i a = _mm_srli_epi32(c, 24);
+
   return _mm_sub_epi32(_mm_set1_epi32(0xff), a);
 }

@ -141,10 +143,14 @@ comp_func_helper_sse2 (uint *dest, int length, uint color, uint alpha)
 void
 comp_func_solid_source_sse2(uint *dest, int length, uint color, uint const_alpha)
 {
-   int ialpha;
-   if (const_alpha == 255) _ector_memfill(dest, length, color);
+   if (const_alpha == 255)
+     {
+        _ector_memfill(dest, length, color);
+     }
   else
     {
+        int ialpha;
+
        ialpha = 255 - const_alpha;
        color = BYTE_MUL(color, const_alpha);
        comp_func_helper_sse2(dest, length, color, ialpha);
@ -155,9 +161,10 @@ void
 comp_func_solid_source_over_sse2(uint *dest, int length, uint color, uint const_alpha)
 {
   int ialpha;
+
   if (const_alpha != 255)
     color = BYTE_MUL(color, const_alpha);
-   ialpha = _alpha(~color);
+   ialpha = alpha_inverse(color);
   comp_func_helper_sse2(dest, length, color, ialpha);
 }

@ -194,21 +201,23 @@ comp_func_solid_source_over_sse2(uint *dest, int length, uint color, uint const_
 #define V4_COMP_OP_SRC \
  v_src = v4_interpolate_color_sse2(v_alpha, v_src, v_dest);

-
-
 static void
 comp_func_source_sse2(uint *dest, const uint *src, int length, uint color, uint const_alpha)
 {
   int ialpha;
   uint src_color;
+
   if (color == 0xffffffff) // No color multiplier
     {
        if (const_alpha == 255)
-          memcpy(dest, src, length * sizeof(uint));
+          {
+             memcpy(dest, src, length * sizeof(uint));
+          }
        else
          {
             ialpha = 255 - const_alpha;
             __m128i v_alpha = _mm_set1_epi32(const_alpha);
+
             LOOP_ALIGNED_U1_A4(dest, length,
               { /* UOP */
                  *dest = INTERPOLATE_PIXEL_256(*src, const_alpha, *dest, ialpha);
@ -225,6 +234,7 @@ comp_func_source_sse2(uint *dest, const uint *src, int length, uint color, uint
   else
     {
        __m128i v_color = _mm_set1_epi32(color);
+
        if (const_alpha == 255)
          {
             LOOP_ALIGNED_U1_A4(dest, length,
@ -243,6 +253,7 @@ comp_func_source_sse2(uint *dest, const uint *src, int length, uint color, uint
          {
             ialpha = 255 - const_alpha;
             __m128i v_alpha = _mm_set1_epi32(const_alpha);
+
             LOOP_ALIGNED_U1_A4(dest, length,
               { /* UOP */
                  src_color = ECTOR_MUL4_SYM(*src, color);
@ -264,6 +275,7 @@ static void
 comp_func_source_over_sse2(uint *dest, const uint *src, int length, uint color, uint const_alpha)
 {
   uint s, sia;
+
   if (const_alpha != 255)
     color = BYTE_MUL(color, const_alpha);

@ -272,7 +284,7 @@ comp_func_source_over_sse2(uint *dest, const uint *src, int length, uint color,
        LOOP_ALIGNED_U1_A4(dest, length,
         { /* UOP */
            s = *src;
-            sia = _alpha(~s);
+            sia = alpha_inverse(s);
            *dest = s + BYTE_MUL(*dest, sia);
            dest++; src++; length--;
         },
@ -286,10 +298,11 @@ comp_func_source_over_sse2(uint *dest, const uint *src, int length, uint color,
   else
     {
        __m128i v_color = _mm_set1_epi32(color);
+
        LOOP_ALIGNED_U1_A4(dest, length,
         { /* UOP */
            s = ECTOR_MUL4_SYM(*src, color);
-            sia = _alpha(~s);
+            sia = alpha_inverse(s);
            *dest = s + BYTE_MUL(*dest, sia);
            dest++; src++; length--;
         },
@ -321,4 +334,3 @@ init_draw_helper_sse2()
      }
 #endif
 }
-