diff options
author | Yury Usishchev <y.usishchev@samsung.com> | 2015-04-15 17:22:54 +0200 |
---|---|---|
committer | Cedric BAIL <cedric@osg.samsung.com> | 2015-05-07 09:53:08 +0200 |
commit | a30481d27ba5e2dd5ad84cef9f6c55a9c89880a1 (patch) | |
tree | a40c8e65a993857356987afeb11a38a76c72caa6 /src/lib | |
parent | 71eec44ccc9ab43e728ba986fadce6c6cfd2ff7c (diff) |
evas: implement _op_blend_c_dp_neon in NEON intrinsics.
Reviewers: raster, cedric
@feature
Reviewed By: cedric
Subscribers: jpeg, cedric
Projects: #efl
Differential Revision: https://phab.enlightenment.org/D2310
Signed-off-by: Cedric BAIL <cedric@osg.samsung.com>
Diffstat (limited to '')
-rw-r--r-- | src/lib/evas/common/evas_op_blend/op_blend_color_neon.c | 92 |
1 files changed, 86 insertions, 6 deletions
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c index 2bf14c1f7c..7ba2ffdbda 100644 --- a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c +++ b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c | |||
@@ -1,15 +1,95 @@ | |||
1 | #ifdef BUILD_NEON | ||
2 | #ifdef BUILD_NEON_INTRINSICS | ||
3 | #include <arm_neon.h> | ||
4 | #endif | ||
5 | #endif | ||
1 | /* blend color --> dst */ | 6 | /* blend color --> dst */ |
2 | 7 | ||
3 | #ifdef BUILD_NEON | 8 | #ifdef BUILD_NEON |
4 | static void | 9 | static void |
5 | _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) { | 10 | _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) { |
6 | #ifdef BUILD_NEON_INTRINSICS | 11 | #ifdef BUILD_NEON_INTRINSICS |
7 | DATA32 *e, a = 256 - (c >> 24); | 12 | uint16x8_t temp00_16x8; |
8 | UNROLL8_PLD_WHILE(d, l, e, | 13 | uint16x8_t temp01_16x8; |
9 | { | 14 | uint16x8_t temp10_16x8; |
10 | *d = c + MUL_256(a, *d); | 15 | uint16x8_t temp11_16x8; |
11 | d++; | 16 | uint32x4_t temp0_32x4; |
12 | }); | 17 | uint32x4_t temp1_32x4; |
18 | uint32x4_t c_32x4; | ||
19 | uint32x4_t d0_32x4; | ||
20 | uint32x4_t d1_32x4; | ||
21 | uint8x16_t d0_8x16; | ||
22 | uint8x16_t d1_8x16; | ||
23 | uint8x16_t temp0_8x16; | ||
24 | uint8x16_t temp1_8x16; | ||
25 | uint8x8_t alpha_8x8; | ||
26 | uint8x8_t d00_8x8; | ||
27 | uint8x8_t d01_8x8; | ||
28 | uint8x8_t d10_8x8; | ||
29 | uint8x8_t d11_8x8; | ||
30 | uint8x8_t temp00_8x8; | ||
31 | uint8x8_t temp01_8x8; | ||
32 | uint8x8_t temp10_8x8; | ||
33 | uint8x8_t temp11_8x8; | ||
34 | |||
35 | // alpha can only be 0 if color is 0x0. In that case we can just return. | ||
36 | // Otherwise we can assume alpha != 0. This allows more optimization in | ||
37 | // NEON code. | ||
38 | |||
39 | if(!c) | ||
40 | return; | ||
41 | |||
42 | DATA32 *start = d; | ||
43 | int size = l; | ||
44 | DATA32 *end = start + (size & ~7); | ||
45 | |||
46 | unsigned char alpha; | ||
47 | alpha = ~(c >> 24) + 1; // 256 - (c >> 24) | ||
48 | alpha_8x8 = vdup_n_u8(alpha); | ||
49 | |||
50 | c_32x4 = vdupq_n_u32(c); | ||
51 | |||
52 | while (start < end) | ||
53 | { | ||
54 | d0_32x4 = vld1q_u32(start); | ||
55 | d1_32x4 = vld1q_u32(start+4); | ||
56 | d0_8x16 = vreinterpretq_u8_u32(d0_32x4); | ||
57 | d1_8x16 = vreinterpretq_u8_u32(d1_32x4); | ||
58 | |||
59 | d00_8x8 = vget_low_u8(d0_8x16); | ||
60 | d01_8x8 = vget_high_u8(d0_8x16); | ||
61 | d10_8x8 = vget_low_u8(d1_8x16); | ||
62 | d11_8x8 = vget_high_u8(d1_8x16); | ||
63 | |||
64 | temp00_16x8 = vmull_u8(alpha_8x8, d00_8x8); | ||
65 | temp01_16x8 = vmull_u8(alpha_8x8, d01_8x8); | ||
66 | temp10_16x8 = vmull_u8(alpha_8x8, d10_8x8); | ||
67 | temp11_16x8 = vmull_u8(alpha_8x8, d11_8x8); | ||
68 | |||
69 | temp00_8x8 = vshrn_n_u16(temp00_16x8,8); | ||
70 | temp01_8x8 = vshrn_n_u16(temp01_16x8,8); | ||
71 | temp10_8x8 = vshrn_n_u16(temp10_16x8,8); | ||
72 | temp11_8x8 = vshrn_n_u16(temp11_16x8,8); | ||
73 | |||
74 | temp0_8x16 = vcombine_u8(temp00_8x8, temp01_8x8); | ||
75 | temp1_8x16 = vcombine_u8(temp10_8x8, temp11_8x8); | ||
76 | |||
77 | temp0_32x4 = vreinterpretq_u32_u8(temp0_8x16); | ||
78 | temp1_32x4 = vreinterpretq_u32_u8(temp1_8x16); | ||
79 | |||
80 | d0_32x4 = vaddq_u32(c_32x4, temp0_32x4); | ||
81 | d1_32x4 = vaddq_u32(c_32x4, temp1_32x4); | ||
82 | |||
83 | vst1q_u32(start, d0_32x4); | ||
84 | vst1q_u32(start+4, d1_32x4); | ||
85 | start+=8; | ||
86 | } | ||
87 | end += (size & 7); | ||
88 | while (start < end) | ||
89 | { | ||
90 | *start = c + MUL_256(alpha, *start); | ||
91 | start++; | ||
92 | } | ||
13 | #else | 93 | #else |
14 | DATA32 *e, *tmp = 0; | 94 | DATA32 *e, *tmp = 0; |
15 | #define AP "B_C_DP" | 95 | #define AP "B_C_DP" |