summaryrefslogtreecommitdiff
path: root/src/lib/evas/common
diff options
context:
space:
mode:
authorCedric BAIL <cedric@osg.samsung.com>2015-04-10 11:22:33 +0200
committerCedric BAIL <cedric@osg.samsung.com>2015-04-10 12:05:05 +0200
commit3b46609140c4228490e4d65e754f77e73c86ef2a (patch)
treed07a2dbd6a4ad7a2c705ed8d29f1a079acfac9be /src/lib/evas/common
parent6ceac2509d1b190bf99789379647d09e13b1958b (diff)
evas: _op_blend_p_dp_neon and _op_blend_pas_dp_neon miscalculation fix
Summary: When processing random data, the result of this function differs from the C variant in more than 50% of cases. The difference is due to the alpha calculation. In C code: alpha = 256 - (*s >> 24). In NEON: "vmvn.u8 q4,q0 \n\t" // i.e. ~(*s>>24) === 255 - (*s>>24). We can't just add "1", as overflow will occur in case (*s>>24) == 0 (we use only 8 bits per channel in vector registers). So here is the solution: copy *d right before the multiplication and add it to the result of it later. Same approach as in D455. Reviewers: raster, cedric, stefan_schmidt Reviewed By: cedric Subscribers: cedric Projects: #efl Differential Revision: https://phab.enlightenment.org/D2308 Signed-off-by: Cedric BAIL <cedric@osg.samsung.com>
Diffstat (limited to 'src/lib/evas/common')
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c76
1 files changed, 58 insertions, 18 deletions
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c
index 1cb50b6..4b9993b 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c
@@ -30,8 +30,10 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
30 30
31 "vmul.u32 d8, d16, d8 \n\t" 31 "vmul.u32 d8, d16, d8 \n\t"
32 32
33 "vmovl.u8 q9, d4 \n\t"
33 "vmull.u8 q6, d4,d8 \n\t" 34 "vmull.u8 q6, d4,d8 \n\t"
34 "vqrshrn.u16 d8, q6, #8 \n\t" 35 "vadd.u16 q6, q6, q9 \n\t"
36 "vshrn.u16 d8, q6, #8 \n\t"
35 // Add to 's' 37 // Add to 's'
36 "vqadd.u8 q2, q4,q0 \n\t" 38 "vqadd.u8 q2, q4,q0 \n\t"
37 39
@@ -61,8 +63,10 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
61 63
62 "vmul.u32 d8, d16, d8 \n\t" 64 "vmul.u32 d8, d16, d8 \n\t"
63 65
66 "vmovl.u8 q9, d4 \n\t"
64 "vmull.u8 q6, d4,d8 \n\t" 67 "vmull.u8 q6, d4,d8 \n\t"
65 "vqrshrn.u16 d8, q6, #8 \n\t" 68 "vadd.u16 q6, q6, q9 \n\t"
69 "vshrn.u16 d8, q6, #8 \n\t"
66 // Add to 's' 70 // Add to 's'
67 "vqadd.u8 d4, d8,d0 \n\t" 71 "vqadd.u8 d4, d8,d0 \n\t"
68 "vstr d4, [%[d]] \n\t" 72 "vstr d4, [%[d]] \n\t"
@@ -87,13 +91,18 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
87 // Multiply into all fields 91 // Multiply into all fields
88 "vmul.u32 q4, q8,q4 \n\t" 92 "vmul.u32 q4, q8,q4 \n\t"
89 93
94 "vmovl.u8 q9, d4 \n\t"
95 "vmovl.u8 q10, d5 \n\t"
90 // a * d (clobbering 'd'/q7) 96 // a * d (clobbering 'd'/q7)
91 "vmull.u8 q6, d4,d8 \n\t" 97 "vmull.u8 q6, d4,d8 \n\t"
92 "vmull.u8 q2, d5,d9 \n\t" 98 "vmull.u8 q2, d5,d9 \n\t"
93 99
100 "vadd.u16 q6, q6, q9 \n\t"
101 "vadd.u16 q2, q2, q10 \n\t"
102
94 // Shift & narrow it 103 // Shift & narrow it
95 "vqrshrn.u16 d8, q6, #8 \n\t" 104 "vshrn.u16 d8, q6, #8 \n\t"
96 "vqrshrn.u16 d9, q2, #8 \n\t" 105 "vshrn.u16 d9, q2, #8 \n\t"
97 106
98 // Add to s 107 // Add to s
99 "vqadd.u8 q2, q4,q0 \n\t" 108 "vqadd.u8 q2, q4,q0 \n\t"
@@ -126,6 +135,10 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
126 "vmul.u32 q4, q8,q4 \n\t" 135 "vmul.u32 q4, q8,q4 \n\t"
127 "vmul.u32 q5, q8,q5 \n\t" 136 "vmul.u32 q5, q8,q5 \n\t"
128 137
138 "vmovl.u8 q9, d4 \n\t"
139 "vmovl.u8 q10, d5 \n\t"
140 "vmovl.u8 q11, d6 \n\t"
141 "vmovl.u8 q12, d7 \n\t"
129 142
130 // a * d (clobbering 'd'/q7) 143 // a * d (clobbering 'd'/q7)
131 "vmull.u8 q6, d4,d8 \n\t" 144 "vmull.u8 q6, d4,d8 \n\t"
@@ -133,13 +146,18 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
133 "vmull.u8 q7, d6,d10 \n\t" 146 "vmull.u8 q7, d6,d10 \n\t"
134 "vmull.u8 q3, d7,d11 \n\t" 147 "vmull.u8 q3, d7,d11 \n\t"
135 148
149 "vadd.u16 q6, q6, q9 \n\t"
150 "vadd.u16 q2, q2, q10 \n\t"
151 "vadd.u16 q7, q7, q11 \n\t"
152 "vadd.u16 q3, q3, q12 \n\t"
153
136 "cmp %[tmp], %[d]\n\t" 154 "cmp %[tmp], %[d]\n\t"
137 155
138 // Shift & narrow it 156 // Shift & narrow it
139 "vqrshrn.u16 d8, q6, #8 \n\t" 157 "vshrn.u16 d8, q6, #8 \n\t"
140 "vqrshrn.u16 d9, q2, #8 \n\t" 158 "vshrn.u16 d9, q2, #8 \n\t"
141 "vqrshrn.u16 d10, q7, #8 \n\t" 159 "vshrn.u16 d10, q7, #8 \n\t"
142 "vqrshrn.u16 d11, q3, #8 \n\t" 160 "vshrn.u16 d11, q3, #8 \n\t"
143 161
144 162
145 // Add to s 163 // Add to s
@@ -171,8 +189,10 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
171 189
172 "vmul.u32 d8, d16, d8 \n\t" 190 "vmul.u32 d8, d16, d8 \n\t"
173 191
192 "vmovl.u8 q9, d4 \n\t"
174 "vmull.u8 q6, d4,d8 \n\t" 193 "vmull.u8 q6, d4,d8 \n\t"
175 "vqrshrn.u16 d8, q6, #8 \n\t" 194 "vadd.u16 q6, q6, q9 \n\t"
195 "vshrn.u16 d8, q6, #8 \n\t"
176 // Add to 's' 196 // Add to 's'
177 "vqadd.u8 d4, d8,d0 \n\t" 197 "vqadd.u8 d4, d8,d0 \n\t"
178 198
@@ -195,8 +215,10 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
195 215
196 "vmul.u32 d8, d8, d16 \n\t" 216 "vmul.u32 d8, d8, d16 \n\t"
197 217
218 "vmovl.u8 q9, d4 \n\t"
198 "vmull.u8 q6, d8,d4 \n\t" 219 "vmull.u8 q6, d8,d4 \n\t"
199 "vqrshrn.u16 d8, q6, #8 \n\t" 220 "vadd.u16 q6, q6, q9 \n\t"
221 "vshrn.u16 d8, q6, #8 \n\t"
200 // Add to 's' 222 // Add to 's'
201 "vqadd.u8 d0, d0,d8 \n\t" 223 "vqadd.u8 d0, d0,d8 \n\t"
202 "vst1.32 d0[0], [%[d]] \n\t" 224 "vst1.32 d0[0], [%[d]] \n\t"
@@ -247,10 +269,12 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
247 // Mulitply into all fields 269 // Mulitply into all fields
248 "vmul.u32 d8, d8, d16 \n\t" 270 "vmul.u32 d8, d8, d16 \n\t"
249 271
272 "vmovl.u8 q9, d4 \n\t"
250 // Multiply out 273 // Multiply out
251 "vmull.u8 q6, d8, d4 \n\t" 274 "vmull.u8 q6, d8, d4 \n\t"
275 "vadd.u16 q6, q6, q9 \n\t"
252 276
253 "vqrshrn.u16 d8, q6, #8 \n\t" 277 "vshrn.u16 d8, q6, #8 \n\t"
254 278
255 // Add to s 279 // Add to s
256 "vqadd.u8 d0, d0,d8 \n\t" 280 "vqadd.u8 d0, d0,d8 \n\t"
@@ -278,10 +302,12 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
278 // Mulitply into all fields 302 // Mulitply into all fields
279 "vmul.u32 d8, d8, d16 \n\t" 303 "vmul.u32 d8, d8, d16 \n\t"
280 304
305 "vmovl.u8 q9, d4 \n\t"
281 // Multiply out 306 // Multiply out
282 "vmull.u8 q6, d8, d4 \n\t" 307 "vmull.u8 q6, d8, d4 \n\t"
308 "vadd.u16 q6, q6, q9 \n\t"
283 309
284 "vqrshrn.u16 d8, q6, #8 \n\t" 310 "vshrn.u16 d8, q6, #8 \n\t"
285 311
286 // Add to s 312 // Add to s
287 "vqadd.u8 d0, d0,d8 \n\t" 313 "vqadd.u8 d0, d0,d8 \n\t"
@@ -316,18 +342,28 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
316 "vmul.u32 q5, q5, q8 \n\t" 342 "vmul.u32 q5, q5, q8 \n\t"
317 "pld [%[pl]] \n\t" 343 "pld [%[pl]] \n\t"
318 344
345 "vmovl.u8 q9, d4 \n\t"
346 "vmovl.u8 q10, d5 \n\t"
347 "vmovl.u8 q11, d6 \n\t"
348 "vmovl.u8 q12, d7 \n\t"
349
319 // Multiply out 350 // Multiply out
320 "vmull.u8 q6, d8, d4 \n\t" 351 "vmull.u8 q6, d8, d4 \n\t"
321 "vmull.u8 q7, d10, d6 \n\t" 352 "vmull.u8 q7, d10, d6 \n\t"
322 "vmull.u8 q2, d9, d5 \n\t" 353 "vmull.u8 q2, d9, d5 \n\t"
323 "vmull.u8 q3, d11, d7 \n\t" 354 "vmull.u8 q3, d11, d7 \n\t"
324 355
356 "vadd.u16 q6, q6, q9 \n\t"
357 "vadd.u16 q2, q2, q10 \n\t"
358 "vadd.u16 q7, q7, q11 \n\t"
359 "vadd.u16 q3, q3, q12 \n\t"
360
325 "add %[pl], %[d], #32 \n\t" 361 "add %[pl], %[d], #32 \n\t"
326 362
327 "vqrshrn.u16 d8, q6, #8 \n\t" 363 "vshrn.u16 d8, q6, #8 \n\t"
328 "vqrshrn.u16 d10, q7, #8 \n\t" 364 "vshrn.u16 d10, q7, #8 \n\t"
329 "vqrshrn.u16 d9, q2, #8 \n\t" 365 "vshrn.u16 d9, q2, #8 \n\t"
330 "vqrshrn.u16 d11, q3, #8 \n\t" 366 "vshrn.u16 d11, q3, #8 \n\t"
331 "pld [%[pl]] \n\t" 367 "pld [%[pl]] \n\t"
332 368
333 "cmp %[tmp], %[pl] \n\t" 369 "cmp %[tmp], %[pl] \n\t"
@@ -360,10 +396,12 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
360 // Mulitply into all fields 396 // Mulitply into all fields
361 "vmul.u32 d8, d8, d16 \n\t" 397 "vmul.u32 d8, d8, d16 \n\t"
362 398
399 "vmovl.u8 q9, d4 \n\t"
363 // Multiply out 400 // Multiply out
364 "vmull.u8 q6, d8, d4 \n\t" 401 "vmull.u8 q6, d8, d4 \n\t"
402 "vadd.u16 q6, q6, q9 \n\t"
365 403
366 "vqrshrn.u16 d8, q6, #8 \n\t" 404 "vshrn.u16 d8, q6, #8 \n\t"
367 405
368 // Add to s 406 // Add to s
369 "vqadd.u8 d0, d0,d8 \n\t" 407 "vqadd.u8 d0, d0,d8 \n\t"
@@ -389,9 +427,11 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
389 "vmul.u32 d8, d8, d16 \n\t" 427 "vmul.u32 d8, d8, d16 \n\t"
390 428
391 // Multiply out 429 // Multiply out
430 "vmovl.u8 q9, d4 \n\t"
392 "vmull.u8 q6, d8, d4 \n\t" 431 "vmull.u8 q6, d8, d4 \n\t"
432 "vadd.u16 q6, q6, q9 \n\t"
393 433
394 "vqrshrn.u16 d8, q6, #8 \n\t" 434 "vshrn.u16 d8, q6, #8 \n\t"
395 435
396 // Add to s 436 // Add to s
397 "vqadd.u8 d0, d0,d8 \n\t" 437 "vqadd.u8 d0, d0,d8 \n\t"