path: root/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
author    Yury Usishchev <y.usishchev@samsung.com>    2015-04-15 17:27:58 +0200
committer Cedric BAIL <cedric@osg.samsung.com>    2015-05-07 09:53:08 +0200
commit    d2c5730b812f32b1e0a193e0011afead5110fc08 (patch)
tree      59497dcadd63ecf257a61c803421a044f3373376 /src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
parent    9caa6a3597ce5d6cfcb8c99cc4ada4a88f8ad37c (diff)
evas: implement _op_blend_mas_c_dp_neon in NEON intrinsics.
Reviewers: raster

Subscribers: cedric

Projects: #efl

Differential Revision: https://phab.enlightenment.org/D2312
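For context: the function this patch vectorizes blends a constant color c into a row of DATA32 (ARGB32) destination pixels, weighted per pixel by an 8-bit mask. Below is a minimal scalar sketch of the per-pixel math, mirroring the tail loop the patch itself keeps for leftover pixels; mul_sym() and mul_256() are paraphrased stand-ins for Evas' MUL_SYM/MUL_256 macros, so treat them as an approximation of evas_blend_ops.h, not a copy of it.

#include <stdint.h>

/* Stand-ins for Evas' MUL_SYM / MUL_256: multiply the four 8-bit channels
 * of an ARGB32 value by a factor, processing two channels per step. */
static inline uint32_t mul_sym(uint32_t a, uint32_t c)   /* a in 0..255 */
{
   return ((((c >> 8) & 0x00ff00ff) * a + 0x00ff00ff) & 0xff00ff00) +
          ((((c & 0x00ff00ff) * a + 0x00ff00ff) >> 8) & 0x00ff00ff);
}

static inline uint32_t mul_256(uint32_t a, uint32_t c)   /* a in 1..256 */
{
   return ((((c >> 8) & 0x00ff00ff) * a) & 0xff00ff00) +
          ((((c & 0x00ff00ff) * a) >> 8) & 0x00ff00ff);
}

/* Scalar equivalent of _op_blend_mas_c_dp_neon: scale the color by the
 * mask byte, then composite the result over the destination using the
 * scaled color's alpha (src-over). */
static void
blend_mask_color_ref(const uint8_t *m, uint32_t c, uint32_t *d, int l)
{
   while (l--)
     {
        uint32_t mc = mul_sym(*m, c);    /* mask-scaled source color */
        uint32_t a  = 256 - (mc >> 24);  /* inverse alpha, 1..256    */
        *d = mc + mul_256(a, *d);
        m++;  d++;
     }
}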
Diffstat (limited to '')
-rw-r--r--  src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c  150
1 file changed, 128 insertions(+), 22 deletions(-)
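Two details make the new NEON body below easier to read. The fast path consumes four mask bytes and four destination pixels per iteration, skipping whole groups whose mask bytes are all zero. Before the multiply, each mask byte must cover all four channel bytes of the color, so the code widens the four bytes to one 32-bit lane each and multiplies by a vector of 0x01 bytes (0x01010101 per lane), which replicates the byte across the lane. A standalone sketch of that splat step, written with vmovl_u16 instead of the patch's reinterpret/vget_low sequence (the helper name splat_mask4 is hypothetical, not from the patch):

#include <arm_neon.h>
#include <stdint.h>
#include <string.h>

/* Widen four mask bytes m0..m3 to one u32 lane each, then multiply by
 * 0x01010101 so each lane holds its byte replicated four times, ready to
 * be multiplied against the four channels of an ARGB32 color. */
static inline uint8x16_t
splat_mask4(const uint8_t *m)
{
   uint32_t raw;
   memcpy(&raw, m, 4);                                   /* unaligned-safe load */
   uint8x8_t  m_8x8  = vreinterpret_u8_u32(vdup_n_u32(raw));
   uint16x8_t wide16 = vmovl_u8(m_8x8);                  /* u8  -> u16          */
   uint32x4_t m_32x4 = vmovl_u16(vget_low_u16(wide16));  /* u16 -> u32: m0..m3  */
   uint32x4_t ones   = vreinterpretq_u32_u8(vdupq_n_u8(0x1));
   return vreinterpretq_u8_u32(vmulq_u32(m_32x4, ones)); /* m * 0x01010101      */
}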
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
index dbeb0638b3..0bc8c5ccd1 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
@@ -1,3 +1,8 @@
+#ifdef BUILD_NEON
+#ifdef BUILD_NEON_INTRINSICS
+#include <arm_neon.h>
+#endif
+#endif
 #define NEONDEBUG 0
 
 
@@ -20,28 +25,129 @@
 static void
 _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
 #ifdef BUILD_NEON_INTRINSICS
-   DATA32 *e;
-   int alpha = 256 - (c >> 24);
-   UNROLL8_PLD_WHILE(d, l, e,
-                     {
-                        DATA32 a = *m;
-                        switch(a)
-                          {
-                          case 0:
-                             break;
-                          case 255:
-                             *d = c + MUL_256(alpha, *d);
-                             break;
-                          default:
-                               {
-                                  DATA32 mc = MUL_SYM(a, c);
-                                  a = 256 - (mc >> 24);
-                                  *d = mc + MUL_256(a, *d);
-                               }
-                             break;
-                          }
-                        m++;  d++;
-                     });
+   uint16x8_t d0_16x8;
+   uint16x8_t d1_16x8;
+   uint16x8_t m_16x8;
+   uint16x8_t mc0_16x8;
+   uint16x8_t mc1_16x8;
+   uint16x8_t temp0_16x8;
+   uint16x8_t temp1_16x8;
+   uint16x8_t x255_16x8;
+   uint32x2_t c_32x2;
+   uint32x2_t m_32x2;
+   uint32x4_t a_32x4;
+   uint32x4_t d_32x4;
+   uint32x4_t m_32x4;
+   uint32x4_t x1_32x4;
+   uint8x16_t a_8x16;
+   uint8x16_t d_8x16;
+   uint8x16_t m_8x16;
+   uint8x16_t mc_8x16;
+   uint8x16_t temp_8x16;
+   uint8x16_t x1_8x16;
+   uint8x8_t a0_8x8;
+   uint8x8_t a1_8x8;
+   uint8x8_t c_8x8;
+   uint8x8_t d0_8x8;
+   uint8x8_t d1_8x8;
+   uint8x8_t m0_8x8;
+   uint8x8_t m1_8x8;
+   uint8x8_t m_8x8;
+   uint8x8_t mc0_8x8;
+   uint8x8_t mc1_8x8;
+   uint8x8_t temp0_8x8;
+   uint8x8_t temp1_8x8;
+
+   x1_8x16 = vdupq_n_u8(0x1);
+   x255_16x8 = vdupq_n_u16(0xff);
+   x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
+   c_32x2 = vdup_n_u32(c);
+   c_8x8 = vreinterpret_u8_u32(c_32x2);
+
+   DATA32 *start = d;
+   int size = l;
+   DATA32 *end = start + (size & ~7);
+   while (start < end) {
+      int k = *((int *)m);
+      if (k == 0)
+        {
+           m+=4;
+           start+=4;
+           continue;
+        }
+
+      m_32x2 = vld1_lane_u32((DATA32*)m, m_32x2, 0);
+
+      d_32x4 = vld1q_u32(start);
+
+      m_8x8 = vreinterpret_u8_u32(m_32x2);
+      m_16x8 = vmovl_u8(m_8x8);
+      m_8x16 = vreinterpretq_u8_u16(m_16x8);
+      m_8x8 = vget_low_u8(m_8x16);
+      m_16x8 = vmovl_u8(m_8x8);
+      m_32x4 = vreinterpretq_u32_u16(m_16x8);
+
+      m_32x4 = vmulq_u32(m_32x4, x1_32x4);
+      m_8x16 = vreinterpretq_u8_u32(m_32x4);
+      m0_8x8 = vget_low_u8(m_8x16);
+      m1_8x8 = vget_high_u8(m_8x16);
+
+      mc0_16x8 = vmull_u8(m0_8x8, c_8x8);
+      mc1_16x8 = vmull_u8(m1_8x8, c_8x8);
+
+      mc0_16x8 = vaddq_u16(mc0_16x8, x255_16x8);
+      mc1_16x8 = vaddq_u16(mc1_16x8, x255_16x8);
+
+      mc0_8x8 = vshrn_n_u16(mc0_16x8, 8);
+      mc1_8x8 = vshrn_n_u16(mc1_16x8, 8);
+
+      mc_8x16 = vcombine_u8(mc0_8x8, mc1_8x8);
+      a_8x16 = vmvnq_u8(mc_8x16);
+      a_32x4 = vreinterpretq_u32_u8(a_8x16);
+      a_32x4 = vshrq_n_u32(a_32x4, 24);
+      a_32x4 = vmulq_u32(a_32x4, x1_32x4);
+
+      a_8x16 = vreinterpretq_u8_u32(a_32x4);
+      a0_8x8 = vget_low_u8(a_8x16);
+      a1_8x8 = vget_high_u8(a_8x16);
+
+      d_8x16 = vreinterpretq_u8_u32(d_32x4);
+
+      d0_8x8 = vget_low_u8(d_8x16);
+      d1_8x8 = vget_high_u8(d_8x16);
+
+      d0_16x8 = vmovl_u8(d0_8x8);
+      d1_16x8 = vmovl_u8(d1_8x8);
+
+      temp0_16x8 = vmull_u8(a0_8x8, d0_8x8);
+      temp1_16x8 = vmull_u8(a1_8x8, d1_8x8);
+
+      temp0_16x8 = vaddq_u16(temp0_16x8, d0_16x8);
+      temp1_16x8 = vaddq_u16(temp1_16x8, d1_16x8);
+
+      temp0_8x8 = vshrn_n_u16(temp0_16x8, 8);
+      temp1_8x8 = vshrn_n_u16(temp1_16x8, 8);
+
+      temp_8x16 = vcombine_u8(temp0_8x8, temp1_8x8);
+
+      d_8x16 = vaddq_u8(mc_8x16, temp_8x16);
+
+      d_32x4 = vreinterpretq_u32_u8(d_8x16);
+
+      vst1q_u32(start, d_32x4);
+
+      start+=4;
+      m+=4;
+
+   }
+   end += (size & 7);
+   while (start < end) {
+      DATA32 a = *m;
+      DATA32 mc = MUL_SYM(a, c);
+      a = 256 - (mc >> 24);
+      *start = mc + MUL_256(a, *start);
+      m++;  start++;
+   }
 #else
    DATA32 *e = d + l;
 
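A note on the arithmetic in the vector body above: vmull_u8 produces 16-bit per-channel products, and the following vaddq_u16 / vshrn_n_u16(..., 8) pair computes (m*c + 255) >> 8, the same divide-by-255 approximation the scalar MUL_SYM macro uses. The destination term works the same way: multiplying the destination by a (the bitwise NOT of the scaled color, so 255 minus its alpha), adding the widened destination, and narrowing by 8 yields (a*d + d) >> 8 = ((a + 1)*d) >> 8, which is MUL_256 with its effective 1..256 alpha. A scalar sketch of both identities (function names hypothetical):

#include <assert.h>
#include <stdint.h>

/* One channel scaled by a mask byte, as vmull_u8 + vaddq_u16 + vshrn_n_u16
 * compute it: (m*c + 255) >> 8 approximates m*c/255, mapping (255,255)
 * to 255 and (0,x) to 0. */
static inline uint8_t scale_channel(uint8_t m, uint8_t c)
{
   return (uint8_t)(((unsigned)m * c + 255) >> 8);
}

/* The destination term: (a*d + d) >> 8 == ((a + 1)*d) >> 8, i.e. MUL_256
 * with alpha a+1 in 1..256, where a = 255 - alpha(mc) comes from vmvnq_u8. */
static inline uint8_t fade_channel(uint8_t a, uint8_t d)
{
   return (uint8_t)(((unsigned)a * d + d) >> 8);
}

int main(void)
{
   assert(scale_channel(255, 255) == 255);  /* full mask keeps the color      */
   assert(scale_channel(0, 200)   == 0);    /* zero mask drops it             */
   assert(fade_channel(255, 200)  == 200);  /* transparent mc keeps the dest  */
   assert(fade_channel(0, 200)    == 0);    /* opaque mc replaces the dest    */
   return 0;
}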