summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSubhransu Mohanty <sub.mohanty@samsung.com>2015-08-17 15:36:57 +0900
committerCedric BAIL <cedric@osg.samsung.com>2015-08-19 15:09:16 +0200
commit74dcf5ed15061349614c9d9a33437808734d5afb (patch)
tree71c6a145314a2b1ba450b14cf3e37ec2f86aea75
parent2766ce57ce698850e50ebd8b92c4897fa739baf9 (diff)
ector: add sse2 support for composition function in software backend.
Signed-off-by: Cedric BAIL <cedric@osg.samsung.com>
-rw-r--r--src/Makefile_Ector.am6
-rw-r--r--src/lib/ector/software/ector_drawhelper.c4
-rw-r--r--src/lib/ector/software/ector_drawhelper_sse2.c324
3 files changed, 331 insertions, 3 deletions
diff --git a/src/Makefile_Ector.am b/src/Makefile_Ector.am
index c05642f696..26e934d4b0 100644
--- a/src/Makefile_Ector.am
+++ b/src/Makefile_Ector.am
@@ -96,7 +96,8 @@ lib/ector/software/ector_software_surface.c \
96lib/ector/software/sw_ft_math.c \ 96lib/ector/software/sw_ft_math.c \
97lib/ector/software/sw_ft_raster.c \ 97lib/ector/software/sw_ft_raster.c \
98lib/ector/software/sw_ft_stroker.c \ 98lib/ector/software/sw_ft_stroker.c \
99lib/ector/software/ector_drawhelper.c 99lib/ector/software/ector_drawhelper.c \
100lib/ector/software/ector_drawhelper_sse2.c
100 101
101installed_ectorsoftwareheadersdir = $(includedir)/ector-@VMAJ@/software 102installed_ectorsoftwareheadersdir = $(includedir)/ector-@VMAJ@/software
102nodist_installed_ectorsoftwareheaders_DATA = $(ector_eolian_software_h) 103nodist_installed_ectorsoftwareheaders_DATA = $(ector_eolian_software_h)
@@ -109,7 +110,8 @@ lib_ector_libector_la_CPPFLAGS = -I$(top_builddir)/src/lib/efl \
109-DPACKAGE_BIN_DIR=\"$(bindir)\" \ 110-DPACKAGE_BIN_DIR=\"$(bindir)\" \
110-DPACKAGE_LIB_DIR=\"$(libdir)\" \ 111-DPACKAGE_LIB_DIR=\"$(libdir)\" \
111-DPACKAGE_DATA_DIR=\"$(datadir)/ector\" \ 112-DPACKAGE_DATA_DIR=\"$(datadir)/ector\" \
112@VALGRIND_CFLAGS@ 113@VALGRIND_CFLAGS@ \
114@SSE3_CFLAGS@
113 115
114lib_ector_libector_la_LIBADD = @ECTOR_LIBS@ @DL_LIBS@ 116lib_ector_libector_la_LIBADD = @ECTOR_LIBS@ @DL_LIBS@
115lib_ector_libector_la_DEPENDENCIES = @ECTOR_INTERNAL_LIBS@ @DL_INTERNAL_LIBS@ 117lib_ector_libector_la_DEPENDENCIES = @ECTOR_INTERNAL_LIBS@ @DL_INTERNAL_LIBS@
diff --git a/src/lib/ector/software/ector_drawhelper.c b/src/lib/ector/software/ector_drawhelper.c
index 40e7faaaae..26d8f988b0 100644
--- a/src/lib/ector/software/ector_drawhelper.c
+++ b/src/lib/ector/software/ector_drawhelper.c
@@ -149,7 +149,9 @@ RGBA_Comp_Func ector_comp_func_span_get(Ector_Rop op, uint color, Eina_Bool src_
149 return func_for_mode[op]; 149 return func_for_mode[op];
150} 150}
151 151
152extern void init_draw_helper_sse2();
153
152void init_draw_helper() 154void init_draw_helper()
153{ 155{
154 156 init_draw_helper_sse2();
155} 157}
diff --git a/src/lib/ector/software/ector_drawhelper_sse2.c b/src/lib/ector/software/ector_drawhelper_sse2.c
new file mode 100644
index 0000000000..bf6b25cb27
--- /dev/null
+++ b/src/lib/ector/software/ector_drawhelper_sse2.c
@@ -0,0 +1,324 @@
1#ifdef HAVE_CONFIG_H
2#include "config.h"
3#endif
4
5#include <Ector.h>
6#include "ector_drawhelper_private.h"
7
8#ifdef BUILD_SSE3
9#include <immintrin.h>
10
11// Each 32bits components of alphaChannel must be in the form 0x00AA00AA
12inline static __m128i
13v4_byte_mul_sse2(__m128i c, __m128i a)
14{
15 const __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
16 const __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
17
18 /* for AG */
19 __m128i v_ag = _mm_and_si128(ag_mask, c);
20 v_ag = _mm_srli_epi32(v_ag, 8);
21 v_ag = _mm_mullo_epi16(a, v_ag);
22 v_ag = _mm_and_si128(ag_mask, v_ag);
23
24 /* for RB */
25 __m128i v_rb = _mm_and_si128(rb_mask, c);
26 v_rb = _mm_mullo_epi16(a, v_rb);
27 v_rb = _mm_srli_epi32(v_rb, 8);
28 v_rb = _mm_and_si128(rb_mask, v_rb);
29
30 /* combine */
31 return _mm_add_epi32(v_ag, v_rb);
32}
33
34static inline __m128i
35v4_interpolate_color_sse2(__m128i a, __m128i c0, __m128i c1)
36{
37 const __m128i rb_mask = _mm_set1_epi32(0xFF00FF00);
38 const __m128i zero = _mm_setzero_si128();
39
40 __m128i a_l = a;
41 __m128i a_h = a;
42 a_l = _mm_unpacklo_epi16(a_l, a_l);
43 a_h = _mm_unpackhi_epi16(a_h, a_h);
44
45 __m128i a_t = _mm_slli_epi64(a_l, 32);
46 __m128i a_t0 = _mm_slli_epi64(a_h, 32);
47
48 a_l = _mm_add_epi32(a_l, a_t);
49 a_h = _mm_add_epi32(a_h, a_t0);
50
51 __m128i c0_l = c0;
52 __m128i c0_h = c0;
53
54 c0_l = _mm_unpacklo_epi8(c0_l, zero);
55 c0_h = _mm_unpackhi_epi8(c0_h, zero);
56
57 __m128i c1_l = c1;
58 __m128i c1_h = c1;
59
60 c1_l = _mm_unpacklo_epi8(c1_l, zero);
61 c1_h = _mm_unpackhi_epi8(c1_h, zero);
62
63 __m128i cl_sub = _mm_sub_epi16(c0_l, c1_l);
64 __m128i ch_sub = _mm_sub_epi16(c0_h, c1_h);
65
66 cl_sub = _mm_mullo_epi16(cl_sub, a_l);
67 ch_sub = _mm_mullo_epi16(ch_sub, a_h);
68
69 __m128i c1ls = _mm_slli_epi16(c1_l, 8);
70 __m128i c1hs = _mm_slli_epi16(c1_h, 8);
71
72 cl_sub = _mm_add_epi16(cl_sub, c1ls);
73 ch_sub = _mm_add_epi16(ch_sub, c1hs);
74
75 cl_sub = _mm_and_si128(cl_sub, rb_mask);
76 ch_sub = _mm_and_si128(ch_sub, rb_mask);
77
78 cl_sub = _mm_srli_epi64(cl_sub, 8);
79 ch_sub = _mm_srli_epi64(ch_sub, 8);
80
81 cl_sub = _mm_packus_epi16(cl_sub, cl_sub);
82 ch_sub = _mm_packus_epi16(ch_sub, ch_sub);
83
84 return (__m128i) _mm_shuffle_ps( (__m128)cl_sub, (__m128)ch_sub, 0x44);
85}
86
87static inline __m128i
88v4_mul_color_sse2(__m128i x, __m128i y)
89{
90 const __m128i zero = _mm_setzero_si128();
91 const __m128i sym4_mask = _mm_set_epi32(0x00FF00FF, 0x000000FF, 0x00FF00FF, 0x000000FF);
92 __m128i x_l = _mm_unpacklo_epi8(x, zero);
93 __m128i x_h = _mm_unpackhi_epi8(x, zero);
94
95 __m128i y_l = _mm_unpacklo_epi8(y, zero);
96 __m128i y_h = _mm_unpackhi_epi8(y, zero);
97
98 __m128i r_l = _mm_mullo_epi16(x_l, y_l);
99 __m128i r_h = _mm_mullo_epi16(x_h, y_h);
100
101 r_l = _mm_add_epi16(r_l, sym4_mask);
102 r_h = _mm_add_epi16(r_h, sym4_mask);
103
104 r_l = _mm_srli_epi16(r_l, 8);
105 r_h = _mm_srli_epi16(r_h, 8);
106
107 return _mm_packus_epi16(r_l, r_h);
108}
109
110static inline __m128i
111v4_ialpha_sse2(__m128i c)
112{
113 __m128i a = _mm_srli_epi32(c, 24);
114 return _mm_sub_epi32(_mm_set1_epi32(0xff), a);
115}
116
117// dest = color + (dest * alpha)
118inline static void
119comp_func_helper_sse2 (uint *dest, int length, uint color, uint alpha)
120{
121 const __m128i v_color = _mm_set1_epi32(color);
122 const __m128i v_a = _mm_set1_epi16(alpha);
123
124 LOOP_ALIGNED_U1_A4(dest, length,
125 { /* UOP */
126 *dest = color + BYTE_MUL(*dest, alpha);
127 dest++; length--;
128 },
129 { /* A4OP */
130 __m128i v_dest = _mm_load_si128((__m128i *)dest);
131
132 v_dest = v4_byte_mul_sse2(v_dest, v_a);
133 v_dest = _mm_add_epi32(v_dest, v_color);
134
135 _mm_store_si128((__m128i *)dest, v_dest);
136
137 dest += 4; length -= 4;
138 })
139}
140
141void
142comp_func_solid_source_sse2(uint *dest, int length, uint color, uint const_alpha)
143{
144 int ialpha;
145 if (const_alpha == 255) _ector_memfill(dest, length, color);
146 else
147 {
148 ialpha = 255 - const_alpha;
149 color = BYTE_MUL(color, const_alpha);
150 comp_func_helper_sse2(dest, length, color, ialpha);
151 }
152}
153
154void
155comp_func_solid_source_over_sse2(uint *dest, int length, uint color, uint const_alpha)
156{
157 int ialpha;
158 if (const_alpha != 255)
159 color = BYTE_MUL(color, const_alpha);
160 ialpha = Alpha(~color);
161 comp_func_helper_sse2(dest, length, color, ialpha);
162}
163
164// Load src and dest vector
165#define V4_FETCH_SRC_DEST \
166 __m128i v_src = _mm_loadu_si128((__m128i *)src); \
167 __m128i v_dest = _mm_load_si128((__m128i *)dest);
168
169#define V4_FETCH_SRC \
170 __m128i v_src = _mm_loadu_si128((__m128i *)src);
171
172#define V4_STORE_DEST \
173 _mm_store_si128((__m128i *)dest, v_src);
174
175#define V4_SRC_DEST_LEN_INC \
176 dest += 4; src +=4; length -= 4;
177
178// Multiply src color with color multiplier
179#define V4_COLOR_MULTIPLY \
180 v_src = v4_mul_color_sse2(v_src, v_color);
181
182// Multiply src color with const_alpha
183#define V4_ALPHA_MULTIPLY \
184 v_src = v4_byte_mul_sse2(v_src, v_alpha);
185
186// dest = src + dest * sia
187#define V4_COMP_OP_SRC_OVER \
188 __m128i v_sia = v4_ialpha_sse2(v_src); \
189 v_sia = _mm_add_epi32(v_sia, _mm_slli_epi32(v_sia, 16)); \
190 v_dest = v4_byte_mul_sse2(v_dest, v_sia); \
191 v_src = _mm_add_epi32(v_src, v_dest);
192
193// dest = src + dest * sia
194#define V4_COMP_OP_SRC \
195 v_src = v4_interpolate_color_sse2(v_alpha, v_src, v_dest);
196
197
198
199static void
200comp_func_source_sse2(uint *dest, const uint *src, int length, uint color, uint const_alpha)
201{
202 int ialpha;
203 uint src_color;
204 if (color == 0xffffffff) // No color multiplier
205 {
206 if (const_alpha == 255)
207 memcpy(dest, src, length * sizeof(uint));
208 else
209 {
210 ialpha = 255 - const_alpha;
211 __m128i v_alpha = _mm_set1_epi32(const_alpha);
212 LOOP_ALIGNED_U1_A4(dest, length,
213 { /* UOP */
214 *dest = INTERPOLATE_PIXEL_256(*src, const_alpha, *dest, ialpha);
215 dest++; src++; length--;
216 },
217 { /* A4OP */
218 V4_FETCH_SRC_DEST
219 V4_COMP_OP_SRC
220 V4_STORE_DEST
221 V4_SRC_DEST_LEN_INC
222 })
223 }
224 }
225 else
226 {
227 __m128i v_color = _mm_set1_epi32(color);
228 if (const_alpha == 255)
229 {
230 LOOP_ALIGNED_U1_A4(dest, length,
231 { /* UOP */
232 *dest = ECTOR_MUL4_SYM(*src, color);
233 dest++; src++; length--;
234 },
235 { /* A4OP */
236 V4_FETCH_SRC
237 V4_COLOR_MULTIPLY
238 V4_STORE_DEST
239 V4_SRC_DEST_LEN_INC
240 })
241 }
242 else
243 {
244 ialpha = 255 - const_alpha;
245 __m128i v_alpha = _mm_set1_epi32(const_alpha);
246 LOOP_ALIGNED_U1_A4(dest, length,
247 { /* UOP */
248 src_color = ECTOR_MUL4_SYM(*src, color);
249 *dest = INTERPOLATE_PIXEL_256(src_color, const_alpha, *dest, ialpha);
250 dest++; src++; length--;
251 },
252 { /* A4OP */
253 V4_FETCH_SRC_DEST
254 V4_COLOR_MULTIPLY
255 V4_COMP_OP_SRC
256 V4_STORE_DEST
257 V4_SRC_DEST_LEN_INC
258 })
259 }
260 }
261}
262
263static void
264comp_func_source_over_sse2(uint *dest, const uint *src, int length, uint color, uint const_alpha)
265{
266 uint s, sia;
267 if (const_alpha != 255)
268 color = BYTE_MUL(color, const_alpha);
269
270 if (color == 0xffffffff) // No color multiplier
271 {
272 LOOP_ALIGNED_U1_A4(dest, length,
273 { /* UOP */
274 s = *src;
275 sia = Alpha(~s);
276 *dest = s + BYTE_MUL(*dest, sia);
277 dest++; src++; length--;
278 },
279 { /* A4OP */
280 V4_FETCH_SRC_DEST
281 V4_COMP_OP_SRC_OVER
282 V4_STORE_DEST
283 V4_SRC_DEST_LEN_INC
284 })
285 }
286 else
287 {
288 __m128i v_color = _mm_set1_epi32(color);
289 LOOP_ALIGNED_U1_A4(dest, length,
290 { /* UOP */
291 s = ECTOR_MUL4_SYM(*src, color);
292 sia = Alpha(~s);
293 *dest = s + BYTE_MUL(*dest, sia);
294 dest++; src++; length--;
295 },
296 { /* A4OP */
297 V4_FETCH_SRC_DEST
298 V4_COLOR_MULTIPLY
299 V4_COMP_OP_SRC_OVER
300 V4_STORE_DEST
301 V4_SRC_DEST_LEN_INC
302 })
303 }
304}
305
306#endif
307
308void
309init_draw_helper_sse2()
310{
311#ifdef BUILD_SSE3
312 if (eina_cpu_features_get() & EINA_CPU_SSE2)
313 {
314 // update the comp_function table for solid color
315 func_for_mode_solid[ECTOR_ROP_COPY] = comp_func_solid_source_sse2;
316 func_for_mode_solid[ECTOR_ROP_BLEND] = comp_func_solid_source_over_sse2;
317
318 // update the comp_function table for source data
319 func_for_mode[ECTOR_ROP_COPY] = comp_func_source_sse2;
320 func_for_mode[ECTOR_ROP_BLEND] = comp_func_source_over_sse2;
321 }
322#endif
323}
324