summaryrefslogtreecommitdiff
path: root/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c
diff options
context:
space:
mode:
authorCedric BAIL <cedric@osg.samsung.com>2015-04-28 23:38:34 +0200
committerCedric BAIL <cedric@osg.samsung.com>2015-05-07 09:53:11 +0200
commit10ece61dbf6d77d0a42df05c88742114c0ad6ef2 (patch)
tree0682033fa577b3339445207c3ec020c718adfc2d /src/lib/evas/common/evas_op_blend/op_blend_color_neon.c
parentd364cbdadd6a4f0d59bcdeead90205e847c84c56 (diff)
evas: implement _op_blend_rel_c_dp_neon using NEON intrinsics
Summary: NEON intrinsics can be built both for armv7 and armv8. Reviewers: raster, cedric Reviewed By: cedric Subscribers: cedric Projects: #efl Differential Revision: https://phab.enlightenment.org/D2440 Signed-off-by: Cedric BAIL <cedric@osg.samsung.com>
Diffstat (limited to '')
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_color_neon.c98
1 files changed, 89 insertions, 9 deletions
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c
index 7ba2ffdbda..076bad9202 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c
@@ -1,8 +1,6 @@
1#ifdef BUILD_NEON 1#ifdef BUILD_NEON
2#ifdef BUILD_NEON_INTRINSICS
3#include <arm_neon.h> 2#include <arm_neon.h>
4#endif 3#endif
5#endif
6/* blend color --> dst */ 4/* blend color --> dst */
7 5
8#ifdef BUILD_NEON 6#ifdef BUILD_NEON
@@ -278,13 +276,95 @@ init_blend_color_pt_funcs_neon(void)
278#ifdef BUILD_NEON 276#ifdef BUILD_NEON
279static void 277static void
280_op_blend_rel_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) { 278_op_blend_rel_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
281 DATA32 *e; 279 uint16x8_t ad0_16x8;
282 int alpha = 256 - (c >> 24); 280 uint16x8_t ad1_16x8;
283 UNROLL8_PLD_WHILE(d, l, e, 281 uint16x8_t dc0_16x8;
284 { 282 uint16x8_t dc1_16x8;
285 *d = MUL_SYM(*d >> 24, c) + MUL_256(alpha, *d); 283 uint16x8_t x255_16x8;
286 d++; 284 uint32x2_t c_32x2;
287 }); 285 uint32x4_t ad_32x4;
286 uint32x4_t d_32x4;
287 uint32x4_t dc_32x4;
288 uint32x4_t x1_32x4;
289 uint8x16_t ad_8x16;
290 uint8x16_t d_8x16;
291 uint8x16_t dc_8x16;
292 uint8x16_t x1_8x16;
293 uint8x8_t ad0_8x8;
294 uint8x8_t ad1_8x8;
295 uint8x8_t alpha_8x8;
296 uint8x8_t c_8x8;
297 uint8x8_t d0_8x8;
298 uint8x8_t d1_8x8;
299 uint8x8_t dc0_8x8;
300 uint8x8_t dc1_8x8;
301
302 // alpha can only be 0 if color is 0x0. In that case we can just return.
303 // Otherwise we can assume alpha != 0. This allows more optimization in
304 // NEON code.
305
306 if(!c)
307 return;
308
309 unsigned char alpha;
310 alpha = ~(c >> 24) + 1; // 256 - (c >> 24)
311
312 alpha_8x8 = vdup_n_u8(alpha);
313 c_32x2 = vdup_n_u32(c);
314 c_8x8 = vreinterpret_u8_u32(c_32x2);
315 x1_8x16 = vdupq_n_u8(0x1);
316 x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
317 x255_16x8 = vdupq_n_u16(0xff);
318
319 DATA32 *end = d + (l & ~3);
320 while (d < end)
321 {
322 // load 4 elements from d
323 d_32x4 = vld1q_u32(d);
324 d_8x16 = vreinterpretq_u8_u32(d_32x4);
325 d0_8x8 = vget_low_u8(d_8x16);
326 d1_8x8 = vget_high_u8(d_8x16);
327
328 // multiply MUL_256(alpha, *d);
329 ad0_16x8 = vmull_u8(alpha_8x8, d0_8x8);
330 ad1_16x8 = vmull_u8(alpha_8x8, d1_8x8);
331 ad0_8x8 = vshrn_n_u16(ad0_16x8,8);
332 ad1_8x8 = vshrn_n_u16(ad1_16x8,8);
333 ad_8x16 = vcombine_u8(ad0_8x8, ad1_8x8);
334 ad_32x4 = vreinterpretq_u32_u8(ad_8x16);
335
336 // shift (*d >> 24)
337 dc_32x4 = vshrq_n_u32(d_32x4, 24);
338 dc_32x4 = vmulq_u32(x1_32x4, dc_32x4);
339 dc_8x16 = vreinterpretq_u8_u32(dc_32x4);
340 dc0_8x8 = vget_low_u8(dc_8x16);
341 dc1_8x8 = vget_high_u8(dc_8x16);
342
343 // multiply MUL_256(*d >> 24, sc);
344 dc0_16x8 = vmull_u8(dc0_8x8, c_8x8);
345 dc1_16x8 = vmull_u8(dc1_8x8, c_8x8);
346 dc0_16x8 = vaddq_u16(dc0_16x8, x255_16x8);
347 dc1_16x8 = vaddq_u16(dc1_16x8, x255_16x8);
348 dc0_8x8 = vshrn_n_u16(dc0_16x8, 8);
349 dc1_8x8 = vshrn_n_u16(dc1_16x8, 8);
350 dc_8x16 = vcombine_u8(dc0_8x8, dc1_8x8);
351
352 // add up everything
353 dc_32x4 = vreinterpretq_u32_u8(dc_8x16);
354 d_32x4 = vaddq_u32(dc_32x4, ad_32x4);
355
356 // save result
357 vst1q_u32(d, d_32x4);
358
359 d+=4;
360 }
361
362 end += (l & 3);
363 while (d < end)
364 {
365 *d = MUL_SYM(*d >> 24, c) + MUL_256(alpha, *d);
366 d++;
367 }
288} 368}
289 369
290#define _op_blend_rel_caa_dp_neon _op_blend_rel_c_dp_neon 370#define _op_blend_rel_caa_dp_neon _op_blend_rel_c_dp_neon