summaryrefslogtreecommitdiff
path: root/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
diff options
context:
space:
mode:
authorYury Usishchev <y.usishchev@samsung.com>2015-04-16 19:26:49 +0200
committerCedric BAIL <cedric@osg.samsung.com>2015-05-07 09:53:09 +0200
commit970afe9bea5bd61bef208d65af0a3a6c7b912a42 (patch)
tree1540378b3c96185b72457d1cd871b42268fc200b /src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
parentbe7c7c2c77c7b61f569532be7abb07858490bae6 (diff)
evas: implement _op_blend_mas_can_dp_neon in NEON intrinsics.
Reviewers: raster, cedric
Reviewed By: cedric
Projects: #efl
Differential Revision: https://phab.enlightenment.org/D2369
Signed-off-by: Cedric BAIL <cedric@osg.samsung.com>
Diffstat (limited to '')
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c136
1 file changed, 117 insertions(+), 19 deletions(-)
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
index a09277ed31..e492bb057b 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
@@ -279,25 +279,123 @@ _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, in
279static void 279static void
280_op_blend_mas_can_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) { 280_op_blend_mas_can_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
281#ifdef BUILD_NEON_INTRINSICS 281#ifdef BUILD_NEON_INTRINSICS
282 DATA32 *e; 282 int16x8_t c_i16x8;
283 int alpha; 283 int16x8_t d0_i16x8;
284 UNROLL8_PLD_WHILE(d, l, e, 284 int16x8_t d1_i16x8;
285 { 285 int16x8_t dc0_i16x8;
286 alpha = *m; 286 int16x8_t dc1_i16x8;
287 switch(alpha) 287 int16x8_t m0_i16x8;
288 { 288 int16x8_t m1_i16x8;
289 case 0: 289 int8x16_t dc_i8x16;
290 break; 290 int8x8_t dc0_i8x8;
291 case 255: 291 int8x8_t dc1_i8x8;
292 *d = c; 292 uint16x8_t c_16x8;
293 break; 293 uint16x8_t d0_16x8;
294 default: 294 uint16x8_t d1_16x8;
295 alpha++; 295 uint16x8_t m0_16x8;
296 *d = INTERP_256(alpha, c, *d); 296 uint16x8_t m1_16x8;
297 break; 297 uint16x8_t m_16x8;
298 } 298 uint32x2_t c_32x2;
299 m++; d++; 299 uint32x2_t m_32x2;
300 }); 300 uint32x4_t d_32x4;
301 uint32x4_t dc_32x4;
302 uint32x4_t m_32x4;
303 uint32x4_t x1_32x4;
304 uint8x16_t d_8x16;
305 uint8x16_t m_8x16;
306 uint8x16_t x1_8x16;
307 uint8x8_t c_8x8;
308 uint8x8_t d0_8x8;
309 uint8x8_t d1_8x8;
310 uint8x8_t m0_8x8;
311 uint8x8_t m1_8x8;
312 uint8x8_t m_8x8;
313 uint8x8_t x1_8x8;
314 uint32x4_t x0_32x4;
315 uint32x4_t cond_32x4;
316
317 c_32x2 = vdup_n_u32(c);
318 c_8x8 = vreinterpret_u8_u32(c_32x2);
319 c_16x8 = vmovl_u8(c_8x8);
320 c_i16x8 = vreinterpretq_s16_u16(c_16x8);
321 x1_8x16 = vdupq_n_u8(0x1);
322 x1_8x8 = vget_low_u8(x1_8x16);
323 x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
324 x0_32x4 = vdupq_n_u32(0x0);
325
326 DATA32 *start = d;
327 int size = l;
328 DATA32 *end = start + (size & ~3);
329 while (start < end) {
330 int k = *((int *)m);
331 if (k == 0)
332 {
333 m+=4;
334 start+=4;
335 continue;
336 }
337
338 m_32x2 = vld1_lane_u32((DATA32*)m, m_32x2, 0);
339 d_32x4 = vld1q_u32(start);
340 d_8x16 = vreinterpretq_u8_u32(d_32x4);
341 d0_8x8 = vget_low_u8(d_8x16);
342 d1_8x8 = vget_high_u8(d_8x16);
343
344 m_8x8 = vreinterpret_u8_u32(m_32x2);
345 m_16x8 = vmovl_u8(m_8x8);
346 m_8x16 = vreinterpretq_u8_u16(m_16x8);
347 m_8x8 = vget_low_u8(m_8x16);
348 m_16x8 = vmovl_u8(m_8x8);
349 m_32x4 = vreinterpretq_u32_u16(m_16x8);
350
351 m_32x4 = vmulq_u32(m_32x4, x1_32x4);
352 m_8x16 = vreinterpretq_u8_u32(m_32x4);
353 m0_8x8 = vget_low_u8(m_8x16);
354 m1_8x8 = vget_high_u8(m_8x16);
355 m0_16x8 = vaddl_u8(m0_8x8, x1_8x8);
356 m1_16x8 = vaddl_u8(m1_8x8, x1_8x8);
357
358 m0_i16x8 = vreinterpretq_s16_u16(m0_16x8);
359 m1_i16x8 = vreinterpretq_s16_u16(m1_16x8);
360
361 d0_16x8 = vmovl_u8(d0_8x8);
362 d1_16x8 = vmovl_u8(d1_8x8);
363
364 d0_i16x8 = vreinterpretq_s16_u16(d0_16x8);
365 d1_i16x8 = vreinterpretq_s16_u16(d1_16x8);
366
367 dc0_i16x8 = vsubq_s16(c_i16x8, d0_i16x8);
368 dc1_i16x8 = vsubq_s16(c_i16x8, d1_i16x8);
369
370 dc0_i16x8 = vmulq_s16(dc0_i16x8, m0_i16x8);
371 dc1_i16x8 = vmulq_s16(dc1_i16x8, m1_i16x8);
372
373 dc0_i16x8 = vshrq_n_s16(dc0_i16x8, 8);
374 dc1_i16x8 = vshrq_n_s16(dc1_i16x8, 8);
375
376 dc0_i16x8 = vaddq_s16(dc0_i16x8, d0_i16x8);
377 dc1_i16x8 = vaddq_s16(dc1_i16x8, d1_i16x8);
378
379 dc0_i8x8 = vmovn_s16(dc0_i16x8);
380 dc1_i8x8 = vmovn_s16(dc1_i16x8);
381
382 dc_i8x16 = vcombine_s8(dc0_i8x8, dc1_i8x8);
383 dc_32x4 = vreinterpretq_u32_s8(dc_i8x16);
384
385 cond_32x4 = vceqq_u32(m_32x4, x0_32x4);
386 dc_32x4 = vbslq_u32(cond_32x4, d_32x4, dc_32x4);
387
388 vst1q_u32(start, dc_32x4);
389 m+=4;
390 start+=4;
391 }
392 end += (size & 3);
393 while (start < end) {
394 DATA32 alpha = *m;
395 alpha++;
396 *start = INTERP_256(alpha, c, *start);
397 m++; start++;
398 }
301#else 399#else
302 DATA32 *e,*tmp; 400 DATA32 *e,*tmp;
303 int alpha; 401 int alpha;