author     Cedric BAIL <cedric@osg.samsung.com>    2015-04-22 15:24:13 +0200
committer  Cedric BAIL <cedric@osg.samsung.com>    2015-05-07 09:53:09 +0200
commit     657d495aa9607774409206099d809a1a737b736a (patch)
tree       5f49f16429c9afb28aabf2ede256094633cd4351 /src/lib/evas/common
parent     8fa4d415e4e82316bfaecd7f9dbe64131fff345b (diff)
evas: implement _op_blend_p_mas_dp_neon and _op_blend_pas_mas_dp_neon in NEON intrinsics.

Reviewers: raster, cedric
Reviewed By: cedric
Subscribers: cedric
Projects: #efl
Differential Revision: https://phab.enlightenment.org/D2391
Signed-off-by: Cedric BAIL <cedric@osg.samsung.com>
Diffstat (limited to 'src/lib/evas/common')
-rw-r--r--   src/lib/evas/common/evas_op_blend/op_blend_pixel_mask_neon.c   384
1 file changed, 337 insertions, 47 deletions
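Note: the per-pixel operation that both NEON loops below vectorize is the one kept in the patch's scalar tail loops: scale the source pixel by the 8-bit mask with MUL_SYM, then composite the result over the destination with MUL_256 of (256 - alpha). A minimal scalar sketch follows, assuming the usual evas 0xAARRGGBB packing; the helpers mul_sym_px and mul_256_px are illustrative stand-ins written here from how the macros are used in the tail loops, not copied from EFL headers.

#include <stdint.h>

typedef uint32_t DATA32; /* ARGB pixel, alpha in the top byte */
typedef uint8_t  DATA8;  /* 8-bit mask value */

/* Stand-in for MUL_SYM: per channel, (c * a + 0xff) >> 8, a in 0..255. */
static DATA32 mul_sym_px(int a, DATA32 c)
{
   return ((((c >> 8) & 0x00ff00ff) * a + 0x00ff00ff) & 0xff00ff00) +
          ((((c & 0x00ff00ff) * a + 0x00ff00ff) >> 8) & 0x00ff00ff);
}

/* Stand-in for MUL_256: per channel, (c * a) >> 8, a in 1..256. */
static DATA32 mul_256_px(int a, DATA32 c)
{
   return ((((c >> 8) & 0x00ff00ff) * a) & 0xff00ff00) +
          ((((c & 0x00ff00ff) * a) >> 8) & 0x00ff00ff);
}

/* One pixel of "blend pixel x mask --> dst": mask-scale the source, then
   composite it over the destination using the scaled source's alpha.
   This mirrors the scalar tail of _op_blend_p_mas_dp_neon in the diff. */
static void blend_p_mas_dp_one(DATA32 s, DATA8 m, DATA32 *d)
{
   DATA32 sm = mul_sym_px(m, s);     /* source scaled by the mask       */
   int inv_a = 256 - (sm >> 24);     /* 256 - alpha of the scaled src   */
   *d = sm + mul_256_px(inv_a, *d);  /* src + dst * (256 - alpha)       */
}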
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_mask_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_mask_neon.c
index 0c1029b..317e7d5 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_mask_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_mask_neon.c
@@ -1,62 +1,352 @@
+#ifdef BUILD_NEON
+#include <arm_neon.h>
+#endif
 /* blend pixel x mask --> dst */
 
-// FIXME: These functions most likely don't perform the correct operation.
-// Test them with masks and images.
-
 #ifdef BUILD_NEON
-#if 0
 static void
 _op_blend_pas_mas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c EINA_UNUSED, DATA32 *d, int l) {
-   DATA32 *e;
+   uint16x8_t m_16x8;
+   uint16x8_t ms0_16x8;
+   uint16x8_t ms1_16x8;
+   uint16x8_t temp0_16x8;
+   uint16x8_t temp1_16x8;
+   uint16x8_t x255_16x8;
+   uint32_t m_32;
+   uint32x2_t m_32x2;
+   uint32x4_t a_32x4;
+   uint32x4_t ad_32x4;
+   uint32x4_t cond_32x4;
+   uint32x4_t d_32x4;
+   uint32x4_t m_32x4;
+   uint32x4_t ms_32x4;
+   uint32x4_t s_32x4;
+   uint32x4_t temp_32x4;
+   uint32x4_t x0_32x4;
+   uint32x4_t x1_32x4;
+   uint8x16_t a_8x16;
+   uint8x16_t d_8x16;
+   uint8x16_t m_8x16;
+   uint8x16_t ms_8x16;
+   uint8x16_t s_8x16;
+   uint8x16_t temp_8x16;
+   uint8x16_t x0_8x16;
+   uint8x16_t x1_8x16;
+   uint8x8_t a0_8x8;
+   uint8x8_t a1_8x8;
+   uint8x8_t d0_8x8;
+   uint8x8_t d1_8x8;
+   uint8x8_t m0_8x8;
+   uint8x8_t m1_8x8;
+   uint8x8_t m_8x8;
+   uint8x8_t ms0_8x8;
+   uint8x8_t ms1_8x8;
+   uint8x8_t s0_8x8;
+   uint8x8_t s1_8x8;
+   uint8x8_t temp0_8x8;
+   uint8x8_t temp1_8x8;
+
+   x1_8x16 = vdupq_n_u8(0x1);
+   x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
+   x255_16x8 = vdupq_n_u16(0xff);
+   x0_8x16 = vdupq_n_u8(0x0);
+   x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
+
+   DATA32 *end = d + (l & ~3);
+   while (d < end)
+     {
+        unsigned int k = *((unsigned int *)m);
+        // shortcut if *m==0
+        if (k == 0)
+          {
+             m+=4;
+             d+=4;
+             s+=4;
+             continue;
+          }
+        // shortcut if *m==0xff
+        if (~k == 0)
+          {
+             // load 4 elements from s
+             s_32x4 = vld1q_u32(s);
+             s_8x16 = vreinterpretq_u8_u32(s_32x4);
+
+             // load 4 elements from d
+             d_32x4 = vld1q_u32(d);
+             d_8x16 = vreinterpretq_u8_u32(d_32x4);
+             d0_8x8 = vget_low_u8(d_8x16);
+             d1_8x8 = vget_high_u8(d_8x16);
+
+             // substract 256 - *s
+             a_8x16 = vsubq_u8(x0_8x16, s_8x16);
+             a_32x4 = vreinterpretq_u32_u8(a_8x16);
+
+             // shift alpha>>24 and place it into every 8bit element
+             a_32x4 = vshrq_n_u32(a_32x4, 24);
+             a_32x4 = vmulq_u32(a_32x4, x1_32x4);
+             a_8x16 = vreinterpretq_u8_u32(a_32x4);
+             a0_8x8 = vget_low_u8(a_8x16);
+             a1_8x8 = vget_high_u8(a_8x16);
+
+             // multiply MUL_256(a, *d)
+             temp0_16x8 = vmull_u8(a0_8x8, d0_8x8);
+             temp1_16x8 = vmull_u8(a1_8x8, d1_8x8);
+             temp0_8x8 = vshrn_n_u16(temp0_16x8,8);
+             temp1_8x8 = vshrn_n_u16(temp1_16x8,8);
+             temp_8x16 = vcombine_u8(temp0_8x8, temp1_8x8);
+             temp_32x4 = vreinterpretq_u32_u8(temp_8x16);
+
+             // if alpha is 0, replace a*d with d
+             cond_32x4 = vceqq_u32(a_32x4, x0_32x4);
+             ad_32x4 = vbslq_u32(cond_32x4, d_32x4, temp_32x4);
+
+             // add *s
+             d_32x4 = vaddq_u32(s_32x4, ad_32x4);
+
+             // save result
+             vst1q_u32(d, d_32x4);
+             m+=4;
+             d+=4;
+             s+=4;
+             continue;
+          }
+        // load 4 elements from m
+        m_32 = k;
+        m_32x2 = vset_lane_u32(m_32, m_32x2, 0);
+
+        // load 4 elements from s
+        s_32x4 = vld1q_u32(s);
+        s_8x16 = vreinterpretq_u8_u32(s_32x4);
+        s0_8x8 = vget_low_u8(s_8x16);
+        s1_8x8 = vget_high_u8(s_8x16);
+
+        // load 4 elements from d
+        d_32x4 = vld1q_u32(d);
+        d_8x16 = vreinterpretq_u8_u32(d_32x4);
+        d0_8x8 = vget_low_u8(d_8x16);
+        d1_8x8 = vget_high_u8(d_8x16);
+
+        // make m 32 bit wide
+        m_8x8 = vreinterpret_u8_u32(m_32x2);
+        m_16x8 = vmovl_u8(m_8x8);
+        m_8x16 = vreinterpretq_u8_u16(m_16x8);
+        m_8x8 = vget_low_u8(m_8x16);
+        m_16x8 = vmovl_u8(m_8x8);
+        m_32x4 = vreinterpretq_u32_u16(m_16x8);
+
+        // place m into every 8 bit element of vector
+        m_32x4 = vmulq_u32(m_32x4, x1_32x4);
+        m_8x16 = vreinterpretq_u8_u32(m_32x4);
+        m0_8x8 = vget_low_u8(m_8x16);
+        m1_8x8 = vget_high_u8(m_8x16);
+
+        // multiply MUL_SYM(m, *s);
+        ms0_16x8 = vmull_u8(m0_8x8, s0_8x8);
+        ms1_16x8 = vmull_u8(m1_8x8, s1_8x8);
+        ms0_16x8 = vaddq_u16(ms0_16x8, x255_16x8);
+        ms1_16x8 = vaddq_u16(ms1_16x8, x255_16x8);
+        ms0_8x8 = vshrn_n_u16(ms0_16x8, 8);
+        ms1_8x8 = vshrn_n_u16(ms1_16x8, 8);
+        ms_8x16 = vcombine_u8(ms0_8x8, ms1_8x8);
+
+        // substract 256 - m*s
+        a_8x16 = vsubq_u8(x0_8x16, ms_8x16);
+        a_32x4 = vreinterpretq_u32_u8(a_8x16);
+
+        // shift alpha>>24 and place it into every 8bit element
+        a_32x4 = vshrq_n_u32(a_32x4, 24);
+        a_32x4 = vmulq_u32(a_32x4, x1_32x4);
+        a_8x16 = vreinterpretq_u8_u32(a_32x4);
+        a0_8x8 = vget_low_u8(a_8x16);
+        a1_8x8 = vget_high_u8(a_8x16);
+
+        // multiply MUL_256(a, *d)
+        temp0_16x8 = vmull_u8(a0_8x8, d0_8x8);
+        temp1_16x8 = vmull_u8(a1_8x8, d1_8x8);
+        temp0_8x8 = vshrn_n_u16(temp0_16x8,8);
+        temp1_8x8 = vshrn_n_u16(temp1_16x8,8);
+        temp_8x16 = vcombine_u8(temp0_8x8, temp1_8x8);
+        temp_32x4 = vreinterpretq_u32_u8(temp_8x16);
+
+        // if alpha is 0, replace a*d with d
+        cond_32x4 = vceqq_u32(a_32x4, x0_32x4);
+        ad_32x4 = vbslq_u32(cond_32x4, d_32x4, temp_32x4);
+
+        // add m*s
+        ms_32x4 = vreinterpretq_u32_u8(ms_8x16);
+        d_32x4 = vaddq_u32(ms_32x4, ad_32x4);
+
+        // save result
+        vst1q_u32(d, d_32x4);
+
+        d+=4;
+        s+=4;
+        m+=4;
+     }
+
    int alpha;
-   UNROLL8_PLD_WHILE(d, l, e,
-                     {
-                        alpha = *m;
-                        switch(alpha)
-                          {
-                          case 0:
-                             break;
-                          case 255:
-                             *d = *s;
-                             break;
-                          default:
-                             alpha++;
-                             *d = INTERP_256(alpha, *s, *d);
-                             break;
-                          }
-                        m++; s++; d++;
-                     });
+   DATA32 temp;
+
+   end += (l & 3);
+   while (d < end)
+     {
+        alpha = *m;
+        switch(alpha)
+          {
+          case 0:
+             break;
+          case 255:
+             alpha = 256 - (*s >> 24);
+             *d = *s + MUL_256(alpha, *d);
+             break;
+          default:
+             temp = MUL_SYM(alpha, *s);
+             alpha = 256 - (temp >> 24);
+             *d = temp + MUL_256(alpha, *d);
+             break;
+          }
+        m++; s++; d++;
+     }
 }
 
 static void
-_op_blend_pan_mas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c EINA_UNUSED, DATA32 *d, int l) {
-   DATA32 *e;
+_op_blend_p_mas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c EINA_UNUSED, DATA32 *d, int l) {
+   uint16x8_t m_16x8;
+   uint16x8_t ms0_16x8;
+   uint16x8_t ms1_16x8;
+   uint16x8_t temp0_16x8;
+   uint16x8_t temp1_16x8;
+   uint16x8_t x255_16x8;
+   uint32x2_t m_32x2;
+   uint32x4_t a_32x4;
+   uint32x4_t ad_32x4;
+   uint32x4_t cond_32x4;
+   uint32x4_t d_32x4;
+   uint32x4_t m_32x4;
+   uint32x4_t ms_32x4;
+   uint32x4_t s_32x4;
+   uint32x4_t temp_32x4;
+   uint32x4_t x0_32x4;
+   uint32x4_t x1_32x4;
+   uint8x16_t a_8x16;
+   uint8x16_t d_8x16;
+   uint8x16_t m_8x16;
+   uint8x16_t ms_8x16;
+   uint8x16_t s_8x16;
+   uint8x16_t temp_8x16;
+   uint8x16_t x0_8x16;
+   uint8x16_t x1_8x16;
+   uint8x8_t a0_8x8;
+   uint8x8_t a1_8x8;
+   uint8x8_t d0_8x8;
+   uint8x8_t d1_8x8;
+   uint8x8_t m0_8x8;
+   uint8x8_t m1_8x8;
+   uint8x8_t m_8x8;
+   uint8x8_t ms0_8x8;
+   uint8x8_t ms1_8x8;
+   uint8x8_t s0_8x8;
+   uint8x8_t s1_8x8;
+   uint8x8_t temp0_8x8;
+   uint8x8_t temp1_8x8;
+
+   x1_8x16 = vdupq_n_u8(0x1);
+   x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
+   x255_16x8 = vdupq_n_u16(0xff);
+   x0_8x16 = vdupq_n_u8(0x0);
+   x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
+
+   DATA32 *end = d + (l & ~3);
+   while (d < end)
+     {
+        // load 4 elements from m
+        m_32x2 = vld1_lane_u32((DATA32*)m, m_32x2, 0);
+
+        // load 4 elements from s
+        s_32x4 = vld1q_u32(s);
+        s_8x16 = vreinterpretq_u8_u32(s_32x4);
+        s0_8x8 = vget_low_u8(s_8x16);
+        s1_8x8 = vget_high_u8(s_8x16);
+
+        // load 4 elements from d
+        d_32x4 = vld1q_u32(d);
+        d_8x16 = vreinterpretq_u8_u32(d_32x4);
+        d0_8x8 = vget_low_u8(d_8x16);
+        d1_8x8 = vget_high_u8(d_8x16);
+
+        // make m 32 bit wide
+        m_8x8 = vreinterpret_u8_u32(m_32x2);
+        m_16x8 = vmovl_u8(m_8x8);
+        m_8x16 = vreinterpretq_u8_u16(m_16x8);
+        m_8x8 = vget_low_u8(m_8x16);
+        m_16x8 = vmovl_u8(m_8x8);
+        m_32x4 = vreinterpretq_u32_u16(m_16x8);
+
+        // place m into every 8 bit element of vector
+        m_32x4 = vmulq_u32(m_32x4, x1_32x4);
+        m_8x16 = vreinterpretq_u8_u32(m_32x4);
+        m0_8x8 = vget_low_u8(m_8x16);
+        m1_8x8 = vget_high_u8(m_8x16);
+
+        // multiply MUL_SYM(m, *s);
+        ms0_16x8 = vmull_u8(m0_8x8, s0_8x8);
+        ms1_16x8 = vmull_u8(m1_8x8, s1_8x8);
+        ms0_16x8 = vaddq_u16(ms0_16x8, x255_16x8);
+        ms1_16x8 = vaddq_u16(ms1_16x8, x255_16x8);
+        ms0_8x8 = vshrn_n_u16(ms0_16x8, 8);
+        ms1_8x8 = vshrn_n_u16(ms1_16x8, 8);
+        ms_8x16 = vcombine_u8(ms0_8x8, ms1_8x8);
+
+        // substract 256 - m*s
+        a_8x16 = vsubq_u8(x0_8x16, ms_8x16);
+        a_32x4 = vreinterpretq_u32_u8(a_8x16);
+
+        // shift alpha>>24 and place it into every 8bit element
+        a_32x4 = vshrq_n_u32(a_32x4, 24);
+        a_32x4 = vmulq_u32(a_32x4, x1_32x4);
+        a_8x16 = vreinterpretq_u8_u32(a_32x4);
+        a0_8x8 = vget_low_u8(a_8x16);
+        a1_8x8 = vget_high_u8(a_8x16);
+
+        // multiply MUL_256(a, *d)
+        temp0_16x8 = vmull_u8(a0_8x8, d0_8x8);
+        temp1_16x8 = vmull_u8(a1_8x8, d1_8x8);
+        temp0_8x8 = vshrn_n_u16(temp0_16x8,8);
+        temp1_8x8 = vshrn_n_u16(temp1_16x8,8);
+        temp_8x16 = vcombine_u8(temp0_8x8, temp1_8x8);
+        temp_32x4 = vreinterpretq_u32_u8(temp_8x16);
+
+        // if alpha is 0, replace a*d with d
+        cond_32x4 = vceqq_u32(a_32x4, x0_32x4);
+        ad_32x4 = vbslq_u32(cond_32x4, d_32x4, temp_32x4);
+
+        // add m*s
+        ms_32x4 = vreinterpretq_u32_u8(ms_8x16);
+        d_32x4 = vaddq_u32(ms_32x4, ad_32x4);
+
+        // save result
+        vst1q_u32(d, d_32x4);
+
+        d+=4;
+        s+=4;
+        m+=4;
+     }
+
    int alpha;
-   UNROLL8_PLD_WHILE(d, l, e,
-                     {
-                        alpha = *m;
-                        switch(alpha)
-                          {
-                          case 0:
-                             break;
-                          case 255:
-                             *d = *s;
-                             break;
-                          default:
-                             alpha++;
-                             *d = INTERP_256(alpha, *s, *d);
-                             break;
-                          }
-                        m++; s++; d++;
-                     });
+   DATA32 temp;
+
+   end += (l & 3);
+   while (d < end)
+     {
+        alpha = *m;
+        temp = MUL_SYM(alpha, *s);
+        alpha = 256 - (temp >> 24);
+        *d = temp + MUL_256(alpha, *d);
+        m++; s++; d++;
+     }
 }
-#else
-// FIXME
-#define _op_blend_pas_mas_dp_neon NULL
-#define _op_blend_pan_mas_dp_neon NULL
-#endif
 
-#define _op_blend_p_mas_dp_neon _op_blend_pas_mas_dp_neon
+#define _op_blend_pan_mas_dp_neon _op_blend_pas_mas_dp_neon
 
 #define _op_blend_p_mas_dpan_neon _op_blend_p_mas_dp_neon
 #define _op_blend_pan_mas_dpan_neon _op_blend_pan_mas_dp_neon
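Two bit tricks in the NEON loops above are easy to miss. Splat: x1_32x4 holds 0x01010101 in every lane, so vmulq_u32(m_32x4, x1_32x4) copies the low mask byte of each lane into all four byte positions of that lane. Inverse alpha: vsubq_u8(x0_8x16, ...) computes 0 - x per byte, i.e. (256 - x) mod 256, which is why the alpha == 0 wrap-around is patched afterwards with vceqq_u32/vbslq_u32. A scalar sketch of the same identities, plain C and illustrative only (not part of the patch):

#include <assert.h>
#include <stdint.h>

int main(void)
{
   /* Splat: multiplying a byte value by 0x01010101 replicates it into all
      four bytes of a 32-bit word -- the scalar twin of
      vmulq_u32(m_32x4, x1_32x4) with x1 lanes equal to 0x01010101. */
   uint32_t m = 0x7f;
   assert(m * 0x01010101u == 0x7f7f7f7fu);

   /* Inverse alpha: per-byte 0 - x equals (256 - x) mod 256, the scalar twin
      of vsubq_u8(x0_8x16, s_8x16).  It is exact except for x == 0, where it
      wraps to 0 instead of 256 -- the case the NEON code fixes up with
      vceqq_u32/vbslq_u32 by keeping *d unscaled. */
   uint8_t a = 0x40;
   assert((uint8_t)(0u - a) == 256 - a);
   assert((uint8_t)(0u - 0u) == 0);  /* wraps: 256 does not fit in a byte */
   return 0;
}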