ector: add sse2 support for composition function in software backend.

Signed-off-by: Cedric BAIL <cedric@osg.samsung.com>
2015-08-17 15:36:57 +09:00 · 2015-08-17 15:36:57 +09:00 · 74dcf5ed15
parent 2766ce57ce
commit 74dcf5ed15
3 changed files with 331 additions and 3 deletions
--- a/src/Makefile_Ector.am
+++ b/src/Makefile_Ector.am
@ -96,7 +96,8 @@ lib/ector/software/ector_software_surface.c \
 lib/ector/software/sw_ft_math.c \
 lib/ector/software/sw_ft_raster.c \
 lib/ector/software/sw_ft_stroker.c \
-lib/ector/software/ector_drawhelper.c
+lib/ector/software/ector_drawhelper.c \
+lib/ector/software/ector_drawhelper_sse2.c

 installed_ectorsoftwareheadersdir = $(includedir)/ector-@VMAJ@/software
 nodist_installed_ectorsoftwareheaders_DATA = $(ector_eolian_software_h)
@ -109,7 +110,8 @@ lib_ector_libector_la_CPPFLAGS = -I$(top_builddir)/src/lib/efl \
 -DPACKAGE_BIN_DIR=\"$(bindir)\" \
 -DPACKAGE_LIB_DIR=\"$(libdir)\" \
 -DPACKAGE_DATA_DIR=\"$(datadir)/ector\" \
-@VALGRIND_CFLAGS@
+@VALGRIND_CFLAGS@ \
+@SSE3_CFLAGS@

 lib_ector_libector_la_LIBADD = @ECTOR_LIBS@ @DL_LIBS@
 lib_ector_libector_la_DEPENDENCIES = @ECTOR_INTERNAL_LIBS@ @DL_INTERNAL_LIBS@
--- a/src/lib/ector/software/ector_drawhelper.c
+++ b/src/lib/ector/software/ector_drawhelper.c
@ -149,7 +149,9 @@ RGBA_Comp_Func ector_comp_func_span_get(Ector_Rop op, uint color, Eina_Bool src_
   return func_for_mode[op];
 }

+extern void init_draw_helper_sse2();
+
 void init_draw_helper()
 {
-
+   init_draw_helper_sse2();
 }
--- a/src/lib/ector/software/ector_drawhelper_sse2.c
+++ b/src/lib/ector/software/ector_drawhelper_sse2.c
@ -0,0 +1,324 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <Ector.h>
+#include "ector_drawhelper_private.h"
+
+#ifdef BUILD_SSE3
+#include <immintrin.h>
+
+// Each 32bits components of alphaChannel must be in the form 0x00AA00AA
+inline static __m128i
+v4_byte_mul_sse2(__m128i c, __m128i a)
+{
+   const __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
+   const __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
+
+   /* for AG */
+   __m128i v_ag = _mm_and_si128(ag_mask, c);
+   v_ag = _mm_srli_epi32(v_ag, 8);
+   v_ag = _mm_mullo_epi16(a, v_ag);
+   v_ag = _mm_and_si128(ag_mask, v_ag);
+
+   /* for RB */
+   __m128i v_rb = _mm_and_si128(rb_mask, c);
+   v_rb = _mm_mullo_epi16(a, v_rb);
+   v_rb = _mm_srli_epi32(v_rb, 8);
+   v_rb = _mm_and_si128(rb_mask, v_rb);
+
+   /* combine */
+   return _mm_add_epi32(v_ag, v_rb);
+}
+
+static inline __m128i
+v4_interpolate_color_sse2(__m128i a, __m128i c0, __m128i c1)
+{
+   const __m128i rb_mask = _mm_set1_epi32(0xFF00FF00);
+   const __m128i zero = _mm_setzero_si128();
+
+   __m128i a_l = a;
+   __m128i a_h = a;
+   a_l = _mm_unpacklo_epi16(a_l, a_l);
+   a_h = _mm_unpackhi_epi16(a_h, a_h);
+
+   __m128i a_t = _mm_slli_epi64(a_l, 32);
+   __m128i a_t0 = _mm_slli_epi64(a_h, 32);
+
+   a_l = _mm_add_epi32(a_l, a_t);
+   a_h = _mm_add_epi32(a_h, a_t0);
+
+   __m128i c0_l = c0;
+   __m128i c0_h = c0;
+
+   c0_l = _mm_unpacklo_epi8(c0_l, zero);
+   c0_h = _mm_unpackhi_epi8(c0_h, zero);
+
+   __m128i c1_l = c1;
+   __m128i c1_h = c1;
+
+   c1_l = _mm_unpacklo_epi8(c1_l, zero);
+   c1_h = _mm_unpackhi_epi8(c1_h, zero);
+
+   __m128i cl_sub = _mm_sub_epi16(c0_l, c1_l);
+   __m128i ch_sub = _mm_sub_epi16(c0_h, c1_h);
+
+   cl_sub = _mm_mullo_epi16(cl_sub, a_l);
+   ch_sub = _mm_mullo_epi16(ch_sub, a_h);
+
+   __m128i c1ls = _mm_slli_epi16(c1_l, 8);
+   __m128i c1hs = _mm_slli_epi16(c1_h, 8);
+
+   cl_sub = _mm_add_epi16(cl_sub, c1ls);
+   ch_sub = _mm_add_epi16(ch_sub, c1hs);
+
+   cl_sub = _mm_and_si128(cl_sub, rb_mask);
+   ch_sub = _mm_and_si128(ch_sub, rb_mask);
+
+   cl_sub = _mm_srli_epi64(cl_sub, 8);
+   ch_sub = _mm_srli_epi64(ch_sub, 8);
+
+   cl_sub = _mm_packus_epi16(cl_sub, cl_sub);
+   ch_sub = _mm_packus_epi16(ch_sub, ch_sub);
+
+   return  (__m128i) _mm_shuffle_ps( (__m128)cl_sub, (__m128)ch_sub, 0x44);
+}
+
+static inline __m128i
+v4_mul_color_sse2(__m128i x, __m128i y)
+{
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i sym4_mask = _mm_set_epi32(0x00FF00FF, 0x000000FF, 0x00FF00FF, 0x000000FF);
+   __m128i x_l = _mm_unpacklo_epi8(x, zero);
+   __m128i x_h = _mm_unpackhi_epi8(x, zero);
+
+   __m128i y_l = _mm_unpacklo_epi8(y, zero);
+   __m128i y_h = _mm_unpackhi_epi8(y, zero);
+
+   __m128i r_l = _mm_mullo_epi16(x_l, y_l);
+   __m128i r_h = _mm_mullo_epi16(x_h, y_h);
+
+   r_l = _mm_add_epi16(r_l, sym4_mask);
+   r_h = _mm_add_epi16(r_h, sym4_mask);
+
+   r_l = _mm_srli_epi16(r_l, 8);
+   r_h = _mm_srli_epi16(r_h, 8);
+
+   return  _mm_packus_epi16(r_l, r_h);
+}
+
+static inline __m128i
+v4_ialpha_sse2(__m128i c)
+{
+   __m128i a = _mm_srli_epi32(c, 24);
+   return _mm_sub_epi32(_mm_set1_epi32(0xff), a);
+}
+
+// dest = color + (dest * alpha)
+inline static void
+comp_func_helper_sse2 (uint *dest, int length, uint color, uint alpha)
+{
+   const __m128i v_color = _mm_set1_epi32(color);
+   const __m128i v_a = _mm_set1_epi16(alpha);
+
+   LOOP_ALIGNED_U1_A4(dest, length,
+      { /* UOP */
+         *dest = color + BYTE_MUL(*dest, alpha);
+         dest++; length--;
+      },
+      { /* A4OP */
+         __m128i v_dest = _mm_load_si128((__m128i *)dest);
+
+         v_dest = v4_byte_mul_sse2(v_dest, v_a);
+         v_dest = _mm_add_epi32(v_dest, v_color);
+
+         _mm_store_si128((__m128i *)dest, v_dest);
+
+         dest += 4; length -= 4;
+      })
+}
+
+void
+comp_func_solid_source_sse2(uint *dest, int length, uint color, uint const_alpha)
+{
+   int ialpha;
+   if (const_alpha == 255) _ector_memfill(dest, length, color);
+   else
+     {
+        ialpha = 255 - const_alpha;
+        color = BYTE_MUL(color, const_alpha);
+        comp_func_helper_sse2(dest, length, color, ialpha);
+     }
+}
+
+void
+comp_func_solid_source_over_sse2(uint *dest, int length, uint color, uint const_alpha)
+{
+   int ialpha;
+   if (const_alpha != 255)
+     color = BYTE_MUL(color, const_alpha);
+   ialpha = Alpha(~color);
+   comp_func_helper_sse2(dest, length, color, ialpha);
+}
+
+// Load src and dest vector
+#define V4_FETCH_SRC_DEST \
+  __m128i v_src = _mm_loadu_si128((__m128i *)src); \
+  __m128i v_dest = _mm_load_si128((__m128i *)dest);
+
+#define V4_FETCH_SRC \
+  __m128i v_src = _mm_loadu_si128((__m128i *)src);
+
+#define V4_STORE_DEST \
+  _mm_store_si128((__m128i *)dest, v_src);
+
+#define V4_SRC_DEST_LEN_INC \
+  dest += 4; src +=4; length -= 4;
+
+// Multiply src color with color multiplier
+#define V4_COLOR_MULTIPLY \
+  v_src = v4_mul_color_sse2(v_src, v_color);
+
+// Multiply src color with const_alpha
+#define V4_ALPHA_MULTIPLY \
+  v_src = v4_byte_mul_sse2(v_src, v_alpha);
+
+// dest = src + dest * sia
+#define V4_COMP_OP_SRC_OVER \
+  __m128i v_sia = v4_ialpha_sse2(v_src); \
+  v_sia = _mm_add_epi32(v_sia, _mm_slli_epi32(v_sia, 16)); \
+  v_dest = v4_byte_mul_sse2(v_dest, v_sia); \
+  v_src = _mm_add_epi32(v_src, v_dest);
+
+// dest = src + dest * sia
+#define V4_COMP_OP_SRC \
+  v_src = v4_interpolate_color_sse2(v_alpha, v_src, v_dest);
+
+
+
+static void
+comp_func_source_sse2(uint *dest, const uint *src, int length, uint color, uint const_alpha)
+{
+   int ialpha;
+   uint src_color;
+   if (color == 0xffffffff) // No color multiplier
+     {
+        if (const_alpha == 255)
+          memcpy(dest, src, length * sizeof(uint));
+        else
+          {
+             ialpha = 255 - const_alpha;
+             __m128i v_alpha = _mm_set1_epi32(const_alpha);
+             LOOP_ALIGNED_U1_A4(dest, length,
+               { /* UOP */
+                  *dest = INTERPOLATE_PIXEL_256(*src, const_alpha, *dest, ialpha);
+                  dest++; src++; length--;
+               },
+               { /* A4OP */
+                  V4_FETCH_SRC_DEST
+                  V4_COMP_OP_SRC
+                  V4_STORE_DEST
+                  V4_SRC_DEST_LEN_INC
+               })
+          }
+     }
+   else
+     {
+        __m128i v_color = _mm_set1_epi32(color);
+        if (const_alpha == 255)
+          {
+             LOOP_ALIGNED_U1_A4(dest, length,
+               { /* UOP */
+                  *dest = ECTOR_MUL4_SYM(*src, color);
+                  dest++; src++; length--;
+               },
+               { /* A4OP */
+                  V4_FETCH_SRC
+                  V4_COLOR_MULTIPLY
+                  V4_STORE_DEST
+                  V4_SRC_DEST_LEN_INC
+               })
+          }
+        else
+          {
+             ialpha = 255 - const_alpha;
+             __m128i v_alpha = _mm_set1_epi32(const_alpha);
+             LOOP_ALIGNED_U1_A4(dest, length,
+               { /* UOP */
+                  src_color = ECTOR_MUL4_SYM(*src, color);
+                  *dest = INTERPOLATE_PIXEL_256(src_color, const_alpha, *dest, ialpha);
+                  dest++; src++; length--;
+               },
+               { /* A4OP */
+                  V4_FETCH_SRC_DEST
+                  V4_COLOR_MULTIPLY
+                  V4_COMP_OP_SRC
+                  V4_STORE_DEST
+                  V4_SRC_DEST_LEN_INC
+               })
+          }
+     }
+}
+
+static void
+comp_func_source_over_sse2(uint *dest, const uint *src, int length, uint color, uint const_alpha)
+{
+   uint s, sia;
+   if (const_alpha != 255)
+     color = BYTE_MUL(color, const_alpha);
+
+   if (color == 0xffffffff) // No color multiplier
+     {
+        LOOP_ALIGNED_U1_A4(dest, length,
+         { /* UOP */
+            s = *src;
+            sia = Alpha(~s);
+            *dest = s + BYTE_MUL(*dest, sia);
+            dest++; src++; length--;
+         },
+         { /* A4OP */
+            V4_FETCH_SRC_DEST
+            V4_COMP_OP_SRC_OVER
+            V4_STORE_DEST
+            V4_SRC_DEST_LEN_INC
+         })
+     }
+   else
+     {
+        __m128i v_color = _mm_set1_epi32(color);
+        LOOP_ALIGNED_U1_A4(dest, length,
+         { /* UOP */
+            s = ECTOR_MUL4_SYM(*src, color);
+            sia = Alpha(~s);
+            *dest = s + BYTE_MUL(*dest, sia);
+            dest++; src++; length--;
+         },
+         { /* A4OP */
+            V4_FETCH_SRC_DEST
+            V4_COLOR_MULTIPLY
+            V4_COMP_OP_SRC_OVER
+            V4_STORE_DEST
+            V4_SRC_DEST_LEN_INC
+         })
+     }
+}
+
+#endif
+
+void
+init_draw_helper_sse2()
+{
+#ifdef BUILD_SSE3
+   if (eina_cpu_features_get() & EINA_CPU_SSE2)
+     {
+        // update the comp_function table for solid color
+        func_for_mode_solid[ECTOR_ROP_COPY] = comp_func_solid_source_sse2;
+        func_for_mode_solid[ECTOR_ROP_BLEND] = comp_func_solid_source_over_sse2;
+
+        // update the comp_function table for source data
+        func_for_mode[ECTOR_ROP_COPY] = comp_func_source_sse2;
+        func_for_mode[ECTOR_ROP_BLEND] = comp_func_source_over_sse2;
+      }
+#endif
+}
+