author    Yury Usishchev <y.usishchev@samsung.com>  2015-04-15 17:24:03 +0200
committer Cedric BAIL <cedric@osg.samsung.com>      2015-05-07 09:53:08 +0200
commit    9caa6a3597ce5d6cfcb8c99cc4ada4a88f8ad37c (patch)
tree      b307c5f59ab6438ff1f2947d5354d1e5a4601d43 /src
parent    a30481d27ba5e2dd5ad84cef9f6c55a9c89880a1 (diff)
evas: implement _op_blend_p_dp_neon and _op_blend_pas_dp_neon in NEON intrinsics.
Reviewers: raster, cedric
Reviewed By: cedric
Subscribers: cedric
Projects: #efl
Differential Revision: https://phab.enlightenment.org/D2311
Diffstat (limited to 'src')
-rw-r--r--  src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c  245
1 file changed, 219 insertions(+), 26 deletions(-)
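
Both NEON paths implement the per-pixel operation that the removed scalar
loops spell out: dst = src + MUL_256(256 - src_alpha, dst), i.e. source-over
blending of premultiplied ARGB pixels. As a reading aid, a minimal
self-contained C sketch of that scalar operation (plain uint32_t in place of
EFL's DATA32; the masked multiplies stand in for evas' MUL_256 macro, and the
helper name is hypothetical):

#include <stdint.h>

/* dst = src + dst * (256 - src_alpha) >> 8, per 32-bit ARGB pixel.
 * Mirrors the removed UNROLL8_PLD_WHILE body: the two masked multiplies
 * scale the paired 8-bit channels and keep each product's high byte. */
static void
blend_pixels_scalar(const uint32_t *src, uint32_t *dst, int len)
{
   while (len-- > 0)
     {
        uint32_t s = *src++;
        uint32_t a = 256 - (s >> 24);   /* 256 - source alpha */
        uint32_t rb = (((*dst & 0x00ff00ff) * a) >> 8) & 0x00ff00ff;
        uint32_t ag = (((*dst >> 8) & 0x00ff00ff) * a) & 0xff00ff00;
        *dst++ = s + (rb | ag);
     }
}

The intrinsics below do the same work eight pixels per iteration and fall
back to exactly this scalar form for the trailing (l & 7) pixels.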
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c
index 3c32790c81..e81466cf39 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c
@@ -1,17 +1,121 @@
+#ifdef BUILD_NEON
+#ifdef BUILD_NEON_INTRINSICS
+#include <arm_neon.h>
+#endif
+#endif
 /* blend pixel --> dst */
 
 #ifdef BUILD_NEON
 static void
 _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
 #ifdef BUILD_NEON_INTRINSICS
-   DATA32 *e;
-   int alpha;
-   UNROLL8_PLD_WHILE(d, l, e,
-                     {
-                        alpha = 256 - (*s >> 24);
-                        *d = *s++ + MUL_256(alpha, *d);
-                        d++;
-                     });
+   uint16x8_t alpha00_16x8;
+   uint16x8_t alpha01_16x8;
+   uint16x8_t alpha10_16x8;
+   uint16x8_t alpha11_16x8;
+   uint16x8_t d00_16x8;
+   uint16x8_t d01_16x8;
+   uint16x8_t d10_16x8;
+   uint16x8_t d11_16x8;
+   uint32x4_t alpha0_32x4;
+   uint32x4_t alpha1_32x4;
+   uint32x4_t d0_32x4;
+   uint32x4_t d1_32x4;
+   uint32x4_t s0_32x4;
+   uint32x4_t s1_32x4;
+   uint32x4_t x1_32x4;
+   uint8x16_t alpha0_8x16;
+   uint8x16_t alpha1_8x16;
+   uint8x16_t d0_8x16;
+   uint8x16_t d1_8x16;
+   uint8x16_t s0_8x16;
+   uint8x16_t s1_8x16;
+   uint8x16_t x1_8x16;
+   uint8x16_t x255_8x16;
+   uint8x8_t alpha00_8x8;
+   uint8x8_t alpha01_8x8;
+   uint8x8_t alpha10_8x8;
+   uint8x8_t alpha11_8x8;
+   uint8x8_t d00_8x8;
+   uint8x8_t d01_8x8;
+   uint8x8_t d10_8x8;
+   uint8x8_t d11_8x8;
+
+   x1_8x16 = vdupq_n_u8(0x1);
+   x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
+   x255_8x16 = vdupq_n_u8(0xff);
+
+   DATA32 *start = d;
+   int size = l;
+   DATA32 *end = start + (size & ~7);
+   while (start < end)
+     {
+        s0_32x4 = vld1q_u32(s);
+        s1_32x4 = vld1q_u32(s+4);
+
+        d0_32x4 = vld1q_u32(start);
+        d1_32x4 = vld1q_u32(start+4);
+
+        alpha0_32x4 = vshrq_n_u32(s0_32x4, 24);
+        alpha1_32x4 = vshrq_n_u32(s1_32x4, 24);
+
+        alpha0_32x4 = vmulq_u32(x1_32x4, alpha0_32x4);
+        alpha1_32x4 = vmulq_u32(x1_32x4, alpha1_32x4);
+
+        alpha0_8x16 = vreinterpretq_u8_u32(alpha0_32x4);
+        alpha1_8x16 = vreinterpretq_u8_u32(alpha1_32x4);
+
+        alpha0_8x16 = vsubq_u8(x255_8x16, alpha0_8x16);
+        alpha1_8x16 = vsubq_u8(x255_8x16, alpha1_8x16);
+
+        alpha10_8x8 = vget_low_u8(alpha1_8x16);
+        alpha11_8x8 = vget_high_u8(alpha1_8x16);
+        alpha00_8x8 = vget_low_u8(alpha0_8x16);
+        alpha01_8x8 = vget_high_u8(alpha0_8x16);
+        d0_8x16 = vreinterpretq_u8_u32(d0_32x4);
+        d1_8x16 = vreinterpretq_u8_u32(d1_32x4);
+        d00_8x8 = vget_low_u8(d0_8x16);
+        d01_8x8 = vget_high_u8(d0_8x16);
+        d10_8x8 = vget_low_u8(d1_8x16);
+        d11_8x8 = vget_high_u8(d1_8x16);
+        alpha00_16x8 = vmull_u8(alpha00_8x8, d00_8x8);
+        alpha01_16x8 = vmull_u8(alpha01_8x8, d01_8x8);
+        alpha10_16x8 = vmull_u8(alpha10_8x8, d10_8x8);
+        alpha11_16x8 = vmull_u8(alpha11_8x8, d11_8x8);
+        d00_16x8 = vmovl_u8(d00_8x8);
+        d01_16x8 = vmovl_u8(d01_8x8);
+        d10_16x8 = vmovl_u8(d10_8x8);
+        d11_16x8 = vmovl_u8(d11_8x8);
+        alpha00_16x8 = vaddq_u16(alpha00_16x8, d00_16x8);
+        alpha01_16x8 = vaddq_u16(alpha01_16x8, d01_16x8);
+        alpha10_16x8 = vaddq_u16(alpha10_16x8, d10_16x8);
+        alpha11_16x8 = vaddq_u16(alpha11_16x8, d11_16x8);
+        alpha00_8x8 = vshrn_n_u16(alpha00_16x8,8);
+        alpha01_8x8 = vshrn_n_u16(alpha01_16x8,8);
+        alpha10_8x8 = vshrn_n_u16(alpha10_16x8,8);
+        alpha11_8x8 = vshrn_n_u16(alpha11_16x8,8);
+        alpha0_8x16 = vcombine_u8(alpha00_8x8, alpha01_8x8);
+        alpha1_8x16 = vcombine_u8(alpha10_8x8, alpha11_8x8);
+        s0_8x16 = vreinterpretq_u8_u32(s0_32x4);
+        s1_8x16 = vreinterpretq_u8_u32(s1_32x4);
+        d0_8x16 = vaddq_u8(s0_8x16, alpha0_8x16);
+        d1_8x16 = vaddq_u8(s1_8x16, alpha1_8x16);
+        d0_32x4 = vreinterpretq_u32_u8(d0_8x16);
+        d1_32x4 = vreinterpretq_u32_u8(d1_8x16);
+
+        vst1q_u32(start, d0_32x4);
+        vst1q_u32(start+4, d1_32x4);
+        s+=8;
+        start+=8;
+     }
+   end += (size & 7);
+   while (start < end)
+     {
+        int alpha;
+        alpha = 256 - (*s >> 24);
+        *start = *s++ + MUL_256(alpha, *start);
+        start++;
+     }
 #else
 #define AP "blend_p_dp_"
    asm volatile (
@@ -254,24 +358,113 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
 static void
 _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
 #ifdef BUILD_NEON_INTRINSICS
-   DATA32 *e;
-   int alpha;
-   UNROLL8_PLD_WHILE(d, l, e,
-                     {
-                        switch (*s & 0xff000000)
-                          {
-                          case 0:
-                             break;
-                          case 0xff000000:
-                             *d = *s;
-                             break;
-                          default:
-                             alpha = 256 - (*s >> 24);
-                             *d = *s + MUL_256(alpha, *d);
-                             break;
-                          }
-                        s++; d++;
-                     });
+   uint16x8_t alpha00_16x8;
+   uint16x8_t alpha01_16x8;
+   uint16x8_t alpha10_16x8;
+   uint16x8_t alpha11_16x8;
+   uint16x8_t d00_16x8;
+   uint16x8_t d01_16x8;
+   uint16x8_t d10_16x8;
+   uint16x8_t d11_16x8;
+   uint32x4_t alpha0_32x4;
+   uint32x4_t alpha1_32x4;
+   uint32x4_t d0_32x4;
+   uint32x4_t d1_32x4;
+   uint32x4_t s0_32x4;
+   uint32x4_t s1_32x4;
+   uint32x4_t x1_32x4;
+   uint8x16_t alpha0_8x16;
+   uint8x16_t alpha1_8x16;
+   uint8x16_t d0_8x16;
+   uint8x16_t d1_8x16;
+   uint8x16_t s0_8x16;
+   uint8x16_t s1_8x16;
+   uint8x16_t x1_8x16;
+   uint8x16_t x255_8x16;
+   uint8x8_t alpha00_8x8;
+   uint8x8_t alpha01_8x8;
+   uint8x8_t alpha10_8x8;
+   uint8x8_t alpha11_8x8;
+   uint8x8_t d00_8x8;
+   uint8x8_t d01_8x8;
+   uint8x8_t d10_8x8;
+   uint8x8_t d11_8x8;
+
+   x1_8x16 = vdupq_n_u8(0x1);
+   x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
+   x255_8x16 = vdupq_n_u8(0xff);
+
+   DATA32 *start = d;
+   int size = l;
+   DATA32 *end = start + (size & ~7);
+   while (start < end)
+     {
+        s0_32x4 = vld1q_u32(s);
+        s1_32x4 = vld1q_u32(s+4);
+
+        d0_32x4 = vld1q_u32(start);
+        d1_32x4 = vld1q_u32(start+4);
+
+        alpha0_32x4 = vshrq_n_u32(s0_32x4, 24);
+        alpha1_32x4 = vshrq_n_u32(s1_32x4, 24);
+
+        alpha0_32x4 = vmulq_u32(x1_32x4, alpha0_32x4);
+        alpha1_32x4 = vmulq_u32(x1_32x4, alpha1_32x4);
+
+        alpha0_8x16 = vreinterpretq_u8_u32(alpha0_32x4);
+        alpha1_8x16 = vreinterpretq_u8_u32(alpha1_32x4);
+
+        alpha0_8x16 = vsubq_u8(x255_8x16, alpha0_8x16);
+        alpha1_8x16 = vsubq_u8(x255_8x16, alpha1_8x16);
+
+        alpha10_8x8 = vget_low_u8(alpha1_8x16);
+        alpha11_8x8 = vget_high_u8(alpha1_8x16);
+        alpha00_8x8 = vget_low_u8(alpha0_8x16);
+        alpha01_8x8 = vget_high_u8(alpha0_8x16);
+        d0_8x16 = vreinterpretq_u8_u32(d0_32x4);
+        d1_8x16 = vreinterpretq_u8_u32(d1_32x4);
+        d00_8x8 = vget_low_u8(d0_8x16);
+        d01_8x8 = vget_high_u8(d0_8x16);
+        d10_8x8 = vget_low_u8(d1_8x16);
+        d11_8x8 = vget_high_u8(d1_8x16);
+        alpha00_16x8 = vmull_u8(alpha00_8x8, d00_8x8);
+        alpha01_16x8 = vmull_u8(alpha01_8x8, d01_8x8);
+        alpha10_16x8 = vmull_u8(alpha10_8x8, d10_8x8);
+        alpha11_16x8 = vmull_u8(alpha11_8x8, d11_8x8);
+        d00_16x8 = vmovl_u8(d00_8x8);
+        d01_16x8 = vmovl_u8(d01_8x8);
+        d10_16x8 = vmovl_u8(d10_8x8);
+        d11_16x8 = vmovl_u8(d11_8x8);
+        alpha00_16x8 = vaddq_u16(alpha00_16x8, d00_16x8);
+        alpha01_16x8 = vaddq_u16(alpha01_16x8, d01_16x8);
+        alpha10_16x8 = vaddq_u16(alpha10_16x8, d10_16x8);
+        alpha11_16x8 = vaddq_u16(alpha11_16x8, d11_16x8);
+        alpha00_8x8 = vshrn_n_u16(alpha00_16x8,8);
+        alpha01_8x8 = vshrn_n_u16(alpha01_16x8,8);
+        alpha10_8x8 = vshrn_n_u16(alpha10_16x8,8);
+        alpha11_8x8 = vshrn_n_u16(alpha11_16x8,8);
+        alpha0_8x16 = vcombine_u8(alpha00_8x8, alpha01_8x8);
+        alpha1_8x16 = vcombine_u8(alpha10_8x8, alpha11_8x8);
+        s0_8x16 = vreinterpretq_u8_u32(s0_32x4);
+        s1_8x16 = vreinterpretq_u8_u32(s1_32x4);
+        d0_8x16 = vaddq_u8(s0_8x16, alpha0_8x16);
+        d1_8x16 = vaddq_u8(s1_8x16, alpha1_8x16);
+        d0_32x4 = vreinterpretq_u32_u8(d0_8x16);
+        d1_32x4 = vreinterpretq_u32_u8(d1_8x16);
+
+        vst1q_u32(start, d0_32x4);
+        vst1q_u32(start+4, d1_32x4);
+        s+=8;
+        start+=8;
+     }
+   end += (size & 7);
+   while (start < end)
+     {
+        int alpha;
+        alpha = 256 - (*s >> 24);
+        *start = *s++ + MUL_256(alpha, *start);
+        start++;
+     }
 #else
 #define AP "blend_pas_dp_"
    DATA32 *e = d + l,*tmp = e + 32,*pl=(void*)912;
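
A note on the arithmetic in the added loops, since the vmull/vshrn pairing is
easy to misread: multiplying the shifted-down alpha lanes by x1_32x4
(0x01010101 per pixel) replicates the alpha byte across all four channel
bytes, and vsubq_u8 against 0xff then yields (255 - alpha) per byte. From
there the code computes d * (256 - alpha) >> 8 without ever needing a 9-bit
multiplier, using d * (255 - alpha) + d == d * (256 - alpha). A minimal
sketch of that step for one 8-byte half-register (hypothetical helper name;
assumes a NEON-capable toolchain):

#include <arm_neon.h>

/* result = d * (256 - a) >> 8, computed as (d * (255 - a) + d) >> 8 so
 * the multiplier fits in 8 bits for vmull_u8. */
static inline uint8x8_t
mul_256_minus_a(uint8x8_t a255, uint8x8_t d)  /* a255 = 255 - src alpha */
{
   uint16x8_t prod = vmull_u8(a255, d);   /* widen: d * (255 - a)       */
   prod = vaddq_u16(prod, vmovl_u8(d));   /* + d  ==> d * (256 - a)     */
   return vshrn_n_u16(prod, 8);           /* narrow, keep the high byte */
}

This matches the vmull_u8/vmovl_u8/vaddq_u16/vshrn_n_u16 sequence applied to
each of the four 8-byte halves per iteration. Note also that the old pas
("pixel alpha sparse") path special-cased fully transparent and fully opaque
source pixels; the intrinsics version runs every pixel through the same
branchless arithmetic instead.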