author    Cedric BAIL <cedric@osg.samsung.com>  2015-04-22 15:20:22 +0200
committer Cedric BAIL <cedric@osg.samsung.com>  2015-05-07 09:53:09 +0200
commit    8fa4d415e4e82316bfaecd7f9dbe64131fff345b (patch)
tree      62868029864039abfe7116572e34d1663498e370 /src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c
parent    d88ccf06a5f6ebcfc68dcc21e55a241f64ff9aa9 (diff)
evas: improve _op_blend_p_dp_neon intrinsics implementation
Summary: Use vceqq and vbsl instead of twice as many vmovl and vadd
instructions. Replace vaddq_u8 with vaddq_u32. This allows the NEON code
to behave exactly like the C version.

Reviewers: raster, cedric

Reviewed By: cedric

Projects: #efl

Differential Revision: https://phab.enlightenment.org/D2361

Signed-off-by: Cedric BAIL <cedric@osg.samsung.com>
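For orientation, the scalar blend this NEON routine mirrors is, per pixel,
d = s + (((256 - sa) * d) >> 8) applied channel-wise, where sa is the source
alpha. The sketch below is a minimal illustration of that logic, assuming the
usual Evas premultiplied-alpha blend; the function name blend_pixel and the
split-halves multiply are illustrative stand-ins, not the actual Evas macros.
The sa == 0 branch is exactly the case the new vceqq/vbsl pair covers: the
factor 256 does not fit in an 8-bit lane, so those pixels must keep d
unchanged, and the final 32-bit add is what vaddq_u32 reproduces.

#include <stdint.h>

/* Hypothetical scalar sketch -- not the actual Evas implementation. */
static inline uint32_t
blend_pixel(uint32_t s, uint32_t d)
{
   uint32_t sa = s >> 24;   /* source alpha */
   uint32_t ad;

   if (sa == 0)
     ad = d;                /* factor would be 256; d passes through */
   else
     {
        uint32_t a = 256 - sa;   /* 1..255, fits in a byte */

        /* multiply the odd and even byte channels separately,
         * i.e. ((a * d) >> 8) on each of the four channels */
        ad = ((((d >> 8) & 0x00ff00ff) * a) & 0xff00ff00) |
             ((((d & 0x00ff00ff) * a) >> 8) & 0x00ff00ff);
     }
   return s + ad;           /* full 32-bit add, like vaddq_u32 */
}

The old sequence reached the same product as (255 - sa) * d + d using vmull
plus a widening vmovl and a vaddq_u16 per half; the rewrite needs half the
widening work, fixes the sa == 0 lanes with a select, and the vaddq_u32 at the
end matches the C version's 32-bit carry behaviour instead of a per-byte add.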
Diffstat (limited to 'src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c')
-rw-r--r--  src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c  196
1 file changed, 108 insertions(+), 88 deletions(-)
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c
index e81466cf39..8d70b9db40 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c
@@ -9,29 +9,34 @@
 static void
 _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
 #ifdef BUILD_NEON_INTRINSICS
-   uint16x8_t alpha00_16x8;
-   uint16x8_t alpha01_16x8;
-   uint16x8_t alpha10_16x8;
-   uint16x8_t alpha11_16x8;
-   uint16x8_t d00_16x8;
-   uint16x8_t d01_16x8;
-   uint16x8_t d10_16x8;
-   uint16x8_t d11_16x8;
+   uint16x8_t ad00_16x8;
+   uint16x8_t ad01_16x8;
+   uint16x8_t ad10_16x8;
+   uint16x8_t ad11_16x8;
+   uint32x4_t ad0_32x4;
+   uint32x4_t ad1_32x4;
    uint32x4_t alpha0_32x4;
    uint32x4_t alpha1_32x4;
+   uint32x4_t cond0_32x4;
+   uint32x4_t cond1_32x4;
    uint32x4_t d0_32x4;
    uint32x4_t d1_32x4;
    uint32x4_t s0_32x4;
    uint32x4_t s1_32x4;
+   uint32x4_t x0_32x4;
    uint32x4_t x1_32x4;
+   uint8x16_t ad0_8x16;
+   uint8x16_t ad1_8x16;
    uint8x16_t alpha0_8x16;
    uint8x16_t alpha1_8x16;
    uint8x16_t d0_8x16;
    uint8x16_t d1_8x16;
-   uint8x16_t s0_8x16;
-   uint8x16_t s1_8x16;
+   uint8x16_t x0_8x16;
    uint8x16_t x1_8x16;
-   uint8x16_t x255_8x16;
+   uint8x8_t ad00_8x8;
+   uint8x8_t ad01_8x8;
+   uint8x8_t ad10_8x8;
+   uint8x8_t ad11_8x8;
    uint8x8_t alpha00_8x8;
    uint8x8_t alpha01_8x8;
    uint8x8_t alpha10_8x8;
@@ -43,7 +48,8 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
 
    x1_8x16 = vdupq_n_u8(0x1);
    x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
-   x255_8x16 = vdupq_n_u8(0xff);
+   x0_8x16 = vdupq_n_u8(0x0);
+   x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
 
    DATA32 *start = d;
    int size = l;
@@ -56,6 +62,13 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
         d0_32x4 = vld1q_u32(start);
         d1_32x4 = vld1q_u32(start+4);
 
+        d0_8x16 = vreinterpretq_u8_u32(d0_32x4);
+        d1_8x16 = vreinterpretq_u8_u32(d1_32x4);
+        d00_8x8 = vget_low_u8(d0_8x16);
+        d01_8x8 = vget_high_u8(d0_8x16);
+        d10_8x8 = vget_low_u8(d1_8x16);
+        d11_8x8 = vget_high_u8(d1_8x16);
+
         alpha0_32x4 = vshrq_n_u32(s0_32x4, 24);
         alpha1_32x4 = vshrq_n_u32(s1_32x4, 24);
 
@@ -65,46 +78,43 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
         alpha0_8x16 = vreinterpretq_u8_u32(alpha0_32x4);
         alpha1_8x16 = vreinterpretq_u8_u32(alpha1_32x4);
 
-        alpha0_8x16 = vsubq_u8(x255_8x16, alpha0_8x16);
-        alpha1_8x16 = vsubq_u8(x255_8x16, alpha1_8x16);
+        alpha0_8x16 = vsubq_u8(x0_8x16, alpha0_8x16);
+        alpha1_8x16 = vsubq_u8(x0_8x16, alpha1_8x16);
+
+        alpha0_32x4 = vreinterpretq_u32_u8(alpha0_8x16);
+        alpha1_32x4 = vreinterpretq_u32_u8(alpha1_8x16);
 
         alpha10_8x8 = vget_low_u8(alpha1_8x16);
         alpha11_8x8 = vget_high_u8(alpha1_8x16);
         alpha00_8x8 = vget_low_u8(alpha0_8x16);
         alpha01_8x8 = vget_high_u8(alpha0_8x16);
-        d0_8x16 = vreinterpretq_u8_u32(d0_32x4);
-        d1_8x16 = vreinterpretq_u8_u32(d1_32x4);
-        d00_8x8 = vget_low_u8(d0_8x16);
-        d01_8x8 = vget_high_u8(d0_8x16);
-        d10_8x8 = vget_low_u8(d1_8x16);
-        d11_8x8 = vget_high_u8(d1_8x16);
-        alpha00_16x8 = vmull_u8(alpha00_8x8, d00_8x8);
-        alpha01_16x8 = vmull_u8(alpha01_8x8, d01_8x8);
-        alpha10_16x8 = vmull_u8(alpha10_8x8, d10_8x8);
-        alpha11_16x8 = vmull_u8(alpha11_8x8, d11_8x8);
-        d00_16x8 = vmovl_u8(d00_8x8);
-        d01_16x8 = vmovl_u8(d01_8x8);
-        d10_16x8 = vmovl_u8(d10_8x8);
-        d11_16x8 = vmovl_u8(d11_8x8);
-        alpha00_16x8 = vaddq_u16(alpha00_16x8, d00_16x8);
-        alpha01_16x8 = vaddq_u16(alpha01_16x8, d01_16x8);
-        alpha10_16x8 = vaddq_u16(alpha10_16x8, d10_16x8);
-        alpha11_16x8 = vaddq_u16(alpha11_16x8, d11_16x8);
-        alpha00_8x8 = vshrn_n_u16(alpha00_16x8,8);
-        alpha01_8x8 = vshrn_n_u16(alpha01_16x8,8);
-        alpha10_8x8 = vshrn_n_u16(alpha10_16x8,8);
-        alpha11_8x8 = vshrn_n_u16(alpha11_16x8,8);
-        alpha0_8x16 = vcombine_u8(alpha00_8x8, alpha01_8x8);
-        alpha1_8x16 = vcombine_u8(alpha10_8x8, alpha11_8x8);
-        s0_8x16 = vreinterpretq_u8_u32(s0_32x4);
-        s1_8x16 = vreinterpretq_u8_u32(s1_32x4);
-        d0_8x16 = vaddq_u8(s0_8x16, alpha0_8x16);
-        d1_8x16 = vaddq_u8(s1_8x16, alpha1_8x16);
-        d0_32x4 = vreinterpretq_u32_u8(d0_8x16);
-        d1_32x4 = vreinterpretq_u32_u8(d1_8x16);
+
+        ad00_16x8 = vmull_u8(alpha00_8x8, d00_8x8);
+        ad01_16x8 = vmull_u8(alpha01_8x8, d01_8x8);
+        ad10_16x8 = vmull_u8(alpha10_8x8, d10_8x8);
+        ad11_16x8 = vmull_u8(alpha11_8x8, d11_8x8);
+        ad00_8x8 = vshrn_n_u16(ad00_16x8,8);
+        ad01_8x8 = vshrn_n_u16(ad01_16x8,8);
+        ad10_8x8 = vshrn_n_u16(ad10_16x8,8);
+        ad11_8x8 = vshrn_n_u16(ad11_16x8,8);
+
+        ad0_8x16 = vcombine_u8(ad00_8x8, ad01_8x8);
+        ad1_8x16 = vcombine_u8(ad10_8x8, ad11_8x8);
+        ad0_32x4 = vreinterpretq_u32_u8(ad0_8x16);
+        ad1_32x4 = vreinterpretq_u32_u8(ad1_8x16);
+
+        cond0_32x4 = vceqq_u32(alpha0_32x4, x0_32x4);
+        cond1_32x4 = vceqq_u32(alpha1_32x4, x0_32x4);
+
+        ad0_32x4 = vbslq_u32(cond0_32x4, d0_32x4, ad0_32x4);
+        ad1_32x4 = vbslq_u32(cond1_32x4, d1_32x4, ad1_32x4);
+
+        d0_32x4 = vaddq_u32(s0_32x4, ad0_32x4);
+        d1_32x4 = vaddq_u32(s1_32x4, ad1_32x4);
 
         vst1q_u32(start, d0_32x4);
         vst1q_u32(start+4, d1_32x4);
+
         s+=8;
         start+=8;
      }
@@ -358,29 +368,34 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
 static void
 _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
 #ifdef BUILD_NEON_INTRINSICS
-   uint16x8_t alpha00_16x8;
-   uint16x8_t alpha01_16x8;
-   uint16x8_t alpha10_16x8;
-   uint16x8_t alpha11_16x8;
-   uint16x8_t d00_16x8;
-   uint16x8_t d01_16x8;
-   uint16x8_t d10_16x8;
-   uint16x8_t d11_16x8;
+   uint16x8_t ad00_16x8;
+   uint16x8_t ad01_16x8;
+   uint16x8_t ad10_16x8;
+   uint16x8_t ad11_16x8;
+   uint32x4_t ad0_32x4;
+   uint32x4_t ad1_32x4;
    uint32x4_t alpha0_32x4;
    uint32x4_t alpha1_32x4;
+   uint32x4_t cond0_32x4;
+   uint32x4_t cond1_32x4;
    uint32x4_t d0_32x4;
    uint32x4_t d1_32x4;
    uint32x4_t s0_32x4;
    uint32x4_t s1_32x4;
+   uint32x4_t x0_32x4;
    uint32x4_t x1_32x4;
+   uint8x16_t ad0_8x16;
+   uint8x16_t ad1_8x16;
    uint8x16_t alpha0_8x16;
    uint8x16_t alpha1_8x16;
    uint8x16_t d0_8x16;
    uint8x16_t d1_8x16;
-   uint8x16_t s0_8x16;
-   uint8x16_t s1_8x16;
+   uint8x16_t x0_8x16;
    uint8x16_t x1_8x16;
-   uint8x16_t x255_8x16;
+   uint8x8_t ad00_8x8;
+   uint8x8_t ad01_8x8;
+   uint8x8_t ad10_8x8;
+   uint8x8_t ad11_8x8;
    uint8x8_t alpha00_8x8;
    uint8x8_t alpha01_8x8;
    uint8x8_t alpha10_8x8;
@@ -392,7 +407,8 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
 
    x1_8x16 = vdupq_n_u8(0x1);
    x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
-   x255_8x16 = vdupq_n_u8(0xff);
+   x0_8x16 = vdupq_n_u8(0x0);
+   x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
 
    DATA32 *start = d;
    int size = l;
@@ -405,6 +421,13 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
         d0_32x4 = vld1q_u32(start);
         d1_32x4 = vld1q_u32(start+4);
 
+        d0_8x16 = vreinterpretq_u8_u32(d0_32x4);
+        d1_8x16 = vreinterpretq_u8_u32(d1_32x4);
+        d00_8x8 = vget_low_u8(d0_8x16);
+        d01_8x8 = vget_high_u8(d0_8x16);
+        d10_8x8 = vget_low_u8(d1_8x16);
+        d11_8x8 = vget_high_u8(d1_8x16);
+
        alpha0_32x4 = vshrq_n_u32(s0_32x4, 24);
        alpha1_32x4 = vshrq_n_u32(s1_32x4, 24);
 
@@ -414,46 +437,43 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
         alpha0_8x16 = vreinterpretq_u8_u32(alpha0_32x4);
         alpha1_8x16 = vreinterpretq_u8_u32(alpha1_32x4);
 
-        alpha0_8x16 = vsubq_u8(x255_8x16, alpha0_8x16);
-        alpha1_8x16 = vsubq_u8(x255_8x16, alpha1_8x16);
+        alpha0_8x16 = vsubq_u8(x0_8x16, alpha0_8x16);
+        alpha1_8x16 = vsubq_u8(x0_8x16, alpha1_8x16);
+
+        alpha0_32x4 = vreinterpretq_u32_u8(alpha0_8x16);
+        alpha1_32x4 = vreinterpretq_u32_u8(alpha1_8x16);
 
         alpha10_8x8 = vget_low_u8(alpha1_8x16);
         alpha11_8x8 = vget_high_u8(alpha1_8x16);
         alpha00_8x8 = vget_low_u8(alpha0_8x16);
         alpha01_8x8 = vget_high_u8(alpha0_8x16);
-        d0_8x16 = vreinterpretq_u8_u32(d0_32x4);
-        d1_8x16 = vreinterpretq_u8_u32(d1_32x4);
-        d00_8x8 = vget_low_u8(d0_8x16);
-        d01_8x8 = vget_high_u8(d0_8x16);
-        d10_8x8 = vget_low_u8(d1_8x16);
-        d11_8x8 = vget_high_u8(d1_8x16);
-        alpha00_16x8 = vmull_u8(alpha00_8x8, d00_8x8);
-        alpha01_16x8 = vmull_u8(alpha01_8x8, d01_8x8);
-        alpha10_16x8 = vmull_u8(alpha10_8x8, d10_8x8);
-        alpha11_16x8 = vmull_u8(alpha11_8x8, d11_8x8);
-        d00_16x8 = vmovl_u8(d00_8x8);
-        d01_16x8 = vmovl_u8(d01_8x8);
-        d10_16x8 = vmovl_u8(d10_8x8);
-        d11_16x8 = vmovl_u8(d11_8x8);
-        alpha00_16x8 = vaddq_u16(alpha00_16x8, d00_16x8);
-        alpha01_16x8 = vaddq_u16(alpha01_16x8, d01_16x8);
-        alpha10_16x8 = vaddq_u16(alpha10_16x8, d10_16x8);
-        alpha11_16x8 = vaddq_u16(alpha11_16x8, d11_16x8);
-        alpha00_8x8 = vshrn_n_u16(alpha00_16x8,8);
-        alpha01_8x8 = vshrn_n_u16(alpha01_16x8,8);
-        alpha10_8x8 = vshrn_n_u16(alpha10_16x8,8);
-        alpha11_8x8 = vshrn_n_u16(alpha11_16x8,8);
-        alpha0_8x16 = vcombine_u8(alpha00_8x8, alpha01_8x8);
-        alpha1_8x16 = vcombine_u8(alpha10_8x8, alpha11_8x8);
-        s0_8x16 = vreinterpretq_u8_u32(s0_32x4);
-        s1_8x16 = vreinterpretq_u8_u32(s1_32x4);
-        d0_8x16 = vaddq_u8(s0_8x16, alpha0_8x16);
-        d1_8x16 = vaddq_u8(s1_8x16, alpha1_8x16);
-        d0_32x4 = vreinterpretq_u32_u8(d0_8x16);
-        d1_32x4 = vreinterpretq_u32_u8(d1_8x16);
+
+        ad00_16x8 = vmull_u8(alpha00_8x8, d00_8x8);
+        ad01_16x8 = vmull_u8(alpha01_8x8, d01_8x8);
+        ad10_16x8 = vmull_u8(alpha10_8x8, d10_8x8);
+        ad11_16x8 = vmull_u8(alpha11_8x8, d11_8x8);
+        ad00_8x8 = vshrn_n_u16(ad00_16x8,8);
+        ad01_8x8 = vshrn_n_u16(ad01_16x8,8);
+        ad10_8x8 = vshrn_n_u16(ad10_16x8,8);
+        ad11_8x8 = vshrn_n_u16(ad11_16x8,8);
+
+        ad0_8x16 = vcombine_u8(ad00_8x8, ad01_8x8);
+        ad1_8x16 = vcombine_u8(ad10_8x8, ad11_8x8);
+        ad0_32x4 = vreinterpretq_u32_u8(ad0_8x16);
+        ad1_32x4 = vreinterpretq_u32_u8(ad1_8x16);
+
+        cond0_32x4 = vceqq_u32(alpha0_32x4, x0_32x4);
+        cond1_32x4 = vceqq_u32(alpha1_32x4, x0_32x4);
+
+        ad0_32x4 = vbslq_u32(cond0_32x4, d0_32x4, ad0_32x4);
+        ad1_32x4 = vbslq_u32(cond1_32x4, d1_32x4, ad1_32x4);
+
+        d0_32x4 = vaddq_u32(s0_32x4, ad0_32x4);
+        d1_32x4 = vaddq_u32(s1_32x4, ad1_32x4);
 
         vst1q_u32(start, d0_32x4);
         vst1q_u32(start+4, d1_32x4);
+
         s+=8;
         start+=8;
      }
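For readers skimming the intrinsics, here is the new idiom in isolation: a
hedged, self-contained sketch assuming an ARM target with arm_neon.h. The
helper mul_sel_256 and its parameter names are invented for illustration and
do not exist in Evas. It computes (inv_alpha * d) >> 8 per byte via a widening
multiply and a narrowing shift, then uses vceqq_u32/vbslq_u32 to pass d
straight through in lanes whose inverse alpha wrapped to zero, the sa == 0
case where the true factor 256 is unrepresentable in eight bits.

#include <arm_neon.h>

/* Hypothetical helper, for illustration only.  inv_alpha holds
 * (256 - source_alpha) mod 256, replicated into every byte of each
 * 32-bit lane; d holds four ARGB destination pixels. */
static inline uint32x4_t
mul_sel_256(uint8x16_t inv_alpha, uint32x4_t d)
{
   uint8x16_t d_8x16 = vreinterpretq_u8_u32(d);

   /* (inv_alpha * d) >> 8 per byte: widen each product to 16 bits,
    * then narrow back down keeping the high byte */
   uint16x8_t lo_16x8 = vmull_u8(vget_low_u8(inv_alpha), vget_low_u8(d_8x16));
   uint16x8_t hi_16x8 = vmull_u8(vget_high_u8(inv_alpha), vget_high_u8(d_8x16));
   uint8x16_t ad_8x16 = vcombine_u8(vshrn_n_u16(lo_16x8, 8),
                                    vshrn_n_u16(hi_16x8, 8));
   uint32x4_t ad_32x4 = vreinterpretq_u32_u8(ad_8x16);

   /* lanes where inv_alpha wrapped to 0 wanted a factor of 256:
    * compare against zero and bit-select d itself in those lanes */
   uint32x4_t cond_32x4 = vceqq_u32(vreinterpretq_u32_u8(inv_alpha),
                                    vdupq_n_u32(0));
   return vbslq_u32(cond_32x4, d, ad_32x4);
}

Because the zero-lane fix-up is a compare plus a bitwise select, it costs two
instructions per register regardless of how many lanes need it, where the old
vmovl/vaddq_u16 correction paid a widen and an add for every half-register.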