path: root/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c
author     Cedric BAIL <cedric@osg.samsung.com>   2015-04-28 23:36:04 +0200
committer  Cedric BAIL <cedric@osg.samsung.com>   2015-05-07 09:53:11 +0200
commit     76a5efe13ae76ce44d02e1f5921db9465e8a739b (patch)
tree       0a8c9d224a2ffcc6593a600b81b7e4212557642c /src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c
parent     edfd621d0619353a2d17c61718cef72b425e7cf6 (diff)
evas: implement pixel_color blending functions using NEON intrinsics.
Summary: NEON intrinsics can be built both for armv7 and armv8.

Implemented functions:
  _op_blend_pan_c_dp_neon
  _op_blend_p_can_dp_neon
  _op_blend_pan_can_dp_neon
  _op_blend_p_caa_dp_neon
  _op_blend_pan_caa_dp_neon

Reviewers: raster, cedric
Subscribers: cedric
Projects: #efl
Maniphest Tasks: T2341
Differential Revision: https://phab.enlightenment.org/D2409

Signed-off-by: Cedric BAIL <cedric@osg.samsung.com>
Diffstat (limited to 'src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c')
-rw-r--r--  src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c | 654
1 file changed, 465 insertions(+), 189 deletions(-)
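For reference, the per-channel arithmetic that the new intrinsics code vectorizes is the usual Evas symmetric multiply, result = (s * c + 255) >> 8 on each 8-bit channel, built from a widening multiply (vmull_u8), a 16-bit add, and a narrowing shift (vshrn_n_u16); the MUL_256-style blocks in the patch use the same pattern without the +255 add. Below is a minimal sketch of that pattern, not part of the patch, with a hypothetical helper name (mul_sym_4px):

#include <arm_neon.h>

/* Hypothetical helper, for illustration only: channel-wise symmetric multiply
 * of four ARGB32 pixels, (s * c + 255) >> 8 per byte, i.e. the
 * vmull_u8 / vaddq_u16 / vshrn_n_u16 pattern used throughout the patch. */
static inline uint8x16_t mul_sym_4px(uint8x16_t s, uint8x16_t c)
{
   uint16x8_t lo = vmull_u8(vget_low_u8(s),  vget_low_u8(c));   /* widen low 8 bytes  */
   uint16x8_t hi = vmull_u8(vget_high_u8(s), vget_high_u8(c));  /* widen high 8 bytes */
   lo = vaddq_u16(lo, vdupq_n_u16(0xff));                       /* + 255              */
   hi = vaddq_u16(hi, vdupq_n_u16(0xff));
   return vcombine_u8(vshrn_n_u16(lo, 8), vshrn_n_u16(hi, 8));  /* >> 8, narrow back  */
}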
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c
index b1bfc25b8a..aec1c8605c 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c
@@ -1,8 +1,3 @@
-#ifdef BUILD_NEON
-#ifdef BUILD_NEON_INTRINSICS
-#include <arm_neon.h>
-#endif
-#endif
 /* blend pixel x color --> dst */
 #ifdef BUILD_NEON
 
@@ -202,240 +197,521 @@ _op_blend_p_c_dp_neon(DATA32 * __restrict s, DATA8 *m EINA_UNUSED, DATA32 c, DAT
202#endif 197#endif
203} 198}
204 199
205static void
206_op_blend_pan_can_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
207 DATA32 *e;
208 UNROLL8_PLD_WHILE(d, l, e,
209 {
210 *d++ = 0xff000000 + MUL3_SYM(c, *s);
211 s++;
212 });
213}
214 200
215static void 201static void
216_op_blend_pan_caa_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) { 202_op_blend_pan_c_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
217#if 1 203 uint16x8_t ad0_16x8;
218 DATA32 *e; 204 uint16x8_t ad1_16x8;
219 DATA32 sc; 205 uint16x8_t sc0_16x8;
220 int alpha; 206 uint16x8_t sc1_16x8;
221 c = 1 + (c & 0xff); 207 uint16x8_t x255_16x8;
222 UNROLL8_PLD_WHILE(d, l, e, 208 uint32x4_t ad_32x4;
223 { 209 uint32x4_t c_32x4;
224 sc = MUL_256(c, *s); 210 uint32x4_t d_32x4;
225 alpha = 256 - (sc >> 24); 211 uint32x4_t mask_32x4;
226 *d = sc + MUL_256(alpha, *d); 212 uint32x4_t s_32x4;
227 d++; 213 uint32x4_t sc_32x4;
228 s++; 214 uint8x16_t ad_8x16;
229 }); 215 uint8x16_t c_8x16;
230#else // the below neon is buggy!! misses rendering of spans, i think with alignment. quick - just disable this. 216 uint8x16_t d_8x16;
231#define AP "_op_blend_pan_caa_dp_" 217 uint8x16_t mask_8x16;
232 DATA32 *e = d + l, *tmp = (void*)73; 218 uint8x16_t s_8x16;
233 asm volatile ( 219 uint8x16_t sc_8x16;
234 ".fpu neon \n\t" 220 uint8x8_t a_8x8;
235 /* Set up 'c' */ 221 uint8x8_t ad0_8x8;
236 "vdup.u8 d14, %[c] \n\t" 222 uint8x8_t ad1_8x8;
237 "vmov.i8 d15, #1 \n\t" 223 uint8x8_t c_8x8;
238 "vaddl.u8 q15, d14, d15 \n\t" 224 uint8x8_t d0_8x8;
239 "vshr.u8 q15,#1 \n\t" 225 uint8x8_t d1_8x8;
240 226 uint8x8_t s0_8x8;
241 // Pick a loop 227 uint8x8_t s1_8x8;
242 "andS %[tmp], %[d], $0xf \n\t" 228 uint8x8_t sc0_8x8;
243 "beq "AP"quadstart \n\t" 229 uint8x8_t sc1_8x8;
244 230
245 "andS %[tmp], %[d], $0x4 \n\t" 231 // alpha can only be 0 if color is 0x0. In that case we can just return.
246 "beq "AP"dualstart \n\t" 232 // Otherwise we can assume alpha != 0. This allows more optimization in
247 233 // NEON code.
248 AP"singleloop: \n\t" 234
249 "vld1.32 d4[0], [%[d]] \n\t" 235 if(!c)
250 "vld1.32 d0[0], [%[s]]! \n\t" 236 return;
251 237
252 // Long version of 'd' 238 unsigned char a;
253 "vmovl.u8 q8, d4 \n\t" 239 a = ~(c >> 24) + 1; // 256 - (c >> 24)
240
241 a_8x8 = vdup_n_u8(a);
242 c_32x4 = vdupq_n_u32(c);
243 c_8x16 = vreinterpretq_u8_u32(c_32x4);
244 c_8x8 = vget_low_u8(c_8x16);
245 x255_16x8 = vdupq_n_u16(0xff);
246 mask_32x4 = vdupq_n_u32(0xff000000);
247 mask_8x16 = vreinterpretq_u8_u32(mask_32x4);
248
249 DATA32 *end = d + (l & ~3);
250 while (d < end)
251 {
252 // load 4 elements from d
253 d_32x4 = vld1q_u32(d);
254 d_8x16 = vreinterpretq_u8_u32(d_32x4);
255 d0_8x8 = vget_low_u8(d_8x16);
256 d1_8x8 = vget_high_u8(d_8x16);
257
258 // multiply MUL_256(a, *d)
259 ad0_16x8 = vmull_u8(a_8x8, d0_8x8);
260 ad1_16x8 = vmull_u8(a_8x8, d1_8x8);
261 ad0_8x8 = vshrn_n_u16(ad0_16x8,8);
262 ad1_8x8 = vshrn_n_u16(ad1_16x8,8);
263 ad_8x16 = vcombine_u8(ad0_8x8, ad1_8x8);
264 ad_32x4 = vreinterpretq_u32_u8(ad_8x16);
265
266 // load 4 elements from s
267 s_32x4 = vld1q_u32(s);
268 s_8x16 = vreinterpretq_u8_u32(s_32x4);
269 s0_8x8 = vget_low_u8(s_8x16);
270 s1_8x8 = vget_high_u8(s_8x16);
271
272 // multiply MUL_SYM(c, *s);
273 sc0_16x8 = vmull_u8(s0_8x8, c_8x8);
274 sc1_16x8 = vmull_u8(s1_8x8, c_8x8);
275 sc0_16x8 = vaddq_u16(sc0_16x8, x255_16x8);
276 sc1_16x8 = vaddq_u16(sc1_16x8, x255_16x8);
277 sc0_8x8 = vshrn_n_u16(sc0_16x8, 8);
278 sc1_8x8 = vshrn_n_u16(sc1_16x8, 8);
279 sc_8x16 = vcombine_u8(sc0_8x8, sc1_8x8);
254 280
255 // Long version of 's' 281 // select alpha channel from c
256 "vmovl.u8 q6, d0 \n\t" 282 sc_8x16 = vbslq_u8(mask_8x16, c_8x16, sc_8x16);
257 283 sc_32x4 = vreinterpretq_u32_u8(sc_8x16);
258 // d8 = s -d
259 "vsub.s16 d8, d12, d16 \n\t"
260
261 // Multiply
262 "vmul.s16 d8, d8, d30 \n\t"
263
264 // Shift down
265 "vshr.s16 d8, #7 \n\t"
266
267 // Add 'd'
268 "vqadd.s16 d8, d8, d16 \n\t"
269
270 // Shrink to save
271 "vqmovun.s16 d0, q4 \n\t"
272 "vst1.32 d0[0], [%[d]]! \n\t"
273
274 // Now where?
275 "andS %[tmp], %[d], $0xf \n\t"
276 "beq "AP"quadstart \n\t"
277
278 AP"dualstart: \n\t"
279 // Check we have enough
280 "sub %[tmp], %[e], %[d] \n\t"
281 "cmp %[tmp], #16 \n\t"
282 "blt "AP"loopout \n\t"
283 284
284 AP"dualloop:" 285 // add up everything
285 "vldm %[d], {d4} \n\t" 286 d_32x4 = vaddq_u32(sc_32x4, ad_32x4);
286 "vldm %[s]!, {d0} \n\t"
287 287
288 // Long version of d 288 // save result
289 "vmovl.u8 q8, d4 \n\t" 289 vst1q_u32(d, d_32x4);
290 290
291 // Long version of s 291 d+=4;
292 "vmovl.u8 q6, d0 \n\t" 292 s+=4;
293 }
293 294
294 // q4/q5 = s-d 295 end += (l & 3);
295 "vsub.s16 q4, q6, q8 \n\t" 296 while (d < end)
297 {
298 *d = ((c & 0xff000000) + MUL3_SYM(c, *s)) + MUL_256(a, *d);
299 d++;
300 s++;
301 }
296 302
297 // Multiply 303}
298 "vmul.s16 q4, q4,q15 \n\t"
299 304
300 // Shift down 305static void
301 "vshr.s16 q4, #7 \n\t" 306_op_blend_p_can_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
307 uint16x8_t ad0_16x8;
308 uint16x8_t ad1_16x8;
309 uint16x8_t sc0_16x8;
310 uint16x8_t sc1_16x8;
311 uint16x8_t x255_16x8;
312 uint32x2_t c_32x2;
313 uint32x4_t ad_32x4;
314 uint32x4_t alpha_32x4;
315 uint32x4_t cond_32x4;
316 uint32x4_t d_32x4;
317 uint32x4_t mask_32x4;
318 uint32x4_t s_32x4;
319 uint32x4_t sc_32x4;
320 uint32x4_t x0_32x4;
321 uint32x4_t x1_32x4;
322 uint8x16_t ad_8x16;
323 uint8x16_t alpha_8x16;
324 uint8x16_t d_8x16;
325 uint8x16_t mask_8x16;
326 uint8x16_t s_8x16;
327 uint8x16_t sc_8x16;
328 uint8x16_t x0_8x16;
329 uint8x16_t x1_8x16;
330 uint8x8_t ad0_8x8;
331 uint8x8_t ad1_8x8;
332 uint8x8_t alpha0_8x8;
333 uint8x8_t alpha1_8x8;
334 uint8x8_t c_8x8;
335 uint8x8_t d0_8x8;
336 uint8x8_t d1_8x8;
337 uint8x8_t s0_8x8;
338 uint8x8_t s1_8x8;
339 uint8x8_t sc0_8x8;
340 uint8x8_t sc1_8x8;
341
342 x1_8x16 = vdupq_n_u8(0x1);
343 x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
344 x0_8x16 = vdupq_n_u8(0x0);
345 x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
346 mask_32x4 = vdupq_n_u32(0xff000000);
347 mask_8x16 = vreinterpretq_u8_u32(mask_32x4);
348 c_32x2 = vdup_n_u32(c);
349 c_8x8 = vreinterpret_u8_u32(c_32x2);
350 x255_16x8 = vdupq_n_u16(0xff);
302 351
303 // Add d 352 DATA32 *end = d + (l & ~3);
304 "vqadd.s16 q4, q4, q8 \n\t" 353 while (d < end)
354 {
355 // load 4 elements from s
356 s_32x4 = vld1q_u32(s);
357 s_8x16 = vreinterpretq_u8_u32(s_32x4);
358 s0_8x8 = vget_low_u8(s_8x16);
359 s1_8x8 = vget_high_u8(s_8x16);
305 360
306 // Shrink to save 361 // load 4 elements from d
307 "vqmovun.s16 d0, q4 \n\t" 362 d_32x4 = vld1q_u32(d);
363 d_8x16 = vreinterpretq_u8_u32(d_32x4);
364 d0_8x8 = vget_low_u8(d_8x16);
365 d1_8x8 = vget_high_u8(d_8x16);
308 366
309 "vstm %[d]!, {d0} \n\t" 367 // calculate alpha = 256 - (*s >> 24)
310 AP"quadstart: \n\t" 368 alpha_32x4 = vshrq_n_u32(s_32x4, 24);
311 "sub %[tmp], %[e], %[d] \n\t" 369 alpha_32x4 = vmulq_u32(x1_32x4, alpha_32x4);
312 "cmp %[tmp], #16 \n\t" 370 alpha_8x16 = vreinterpretq_u8_u32(alpha_32x4);
313 "blt "AP"loopout \n\t" 371 alpha_8x16 = vsubq_u8(x0_8x16, alpha_8x16);
372 alpha0_8x8 = vget_low_u8(alpha_8x16);
373 alpha1_8x8 = vget_high_u8(alpha_8x16);
314 374
315 "sub %[tmp], %[e], #15 \n\t" 375 // multiply MUL_SYM(c, *s);
376 sc0_16x8 = vmull_u8(s0_8x8, c_8x8);
377 sc1_16x8 = vmull_u8(s1_8x8, c_8x8);
378 sc0_16x8 = vaddq_u16(sc0_16x8, x255_16x8);
379 sc1_16x8 = vaddq_u16(sc1_16x8, x255_16x8);
380 sc0_8x8 = vshrn_n_u16(sc0_16x8, 8);
381 sc1_8x8 = vshrn_n_u16(sc1_16x8, 8);
382 sc_8x16 = vcombine_u8(sc0_8x8, sc1_8x8);
316 383
317 AP"quadloop: \n\t" 384 // select alpha channel from *s
318 // load 's' -> q0, 'd' -> q2 385 sc_8x16 = vbslq_u8(mask_8x16, s_8x16, sc_8x16);
319 "vldm %[d], {d4,d5} \n\t" 386 sc_32x4 = vreinterpretq_u32_u8(sc_8x16);
320 "vldm %[s]!, {d0,d1} \n\t"
321 387
322 // Long version of d 388 // multiply MUL_256(a, *d)
323 "vmovl.u8 q8, d4 \n\t" 389 ad0_16x8 = vmull_u8(alpha0_8x8, d0_8x8);
324 "vmovl.u8 q9, d5 \n\t" 390 ad1_16x8 = vmull_u8(alpha1_8x8, d1_8x8);
391 ad0_8x8 = vshrn_n_u16(ad0_16x8,8);
392 ad1_8x8 = vshrn_n_u16(ad1_16x8,8);
393 ad_8x16 = vcombine_u8(ad0_8x8, ad1_8x8);
394 ad_32x4 = vreinterpretq_u32_u8(ad_8x16);
325 395
326 // Long version of s 396 // select d if alpha is 0
327 "vmovl.u8 q6, d0 \n\t" 397 cond_32x4 = vceqq_u32(alpha_32x4, x0_32x4);
328 "vmovl.u8 q7, d1 \n\t" 398 ad_32x4 = vbslq_u32(cond_32x4, d_32x4, ad_32x4);
329 399
330 // q4/q5 = s-d 400 // add up everything
331 "vsub.s16 q4, q6, q8 \n\t" 401 d_32x4 = vaddq_u32(sc_32x4, ad_32x4);
332 "vsub.s16 q5, q7, q9 \n\t"
333 402
334 // Multiply 403 // save result
335 "vmul.s16 q4, q4,q15 \n\t" 404 vst1q_u32(d, d_32x4);
336 "vmul.s16 q5, q5,q15 \n\t"
337 405
338 // Shift down 406 d+=4;
339 "vshr.s16 q4, #7 \n\t" 407 s+=4;
340 "vshr.s16 q5, #7 \n\t" 408 }
341 409
342 // Add d 410 end += (l & 3);
343 "vqadd.s16 q4, q4, q8 \n\t" 411 int alpha;
344 "vqadd.s16 q5, q5, q9 \n\t" 412 while (d < end)
413 {
414 alpha = 256 - (*s >> 24);
415 *d = ((*s & 0xff000000) + MUL3_SYM(c, *s)) + MUL_256(alpha, *d);
416 d++;
417 s++;
418 }
419}
345 420
346 // Shrink to save 421static void
347 "vqmovun.s16 d0, q4 \n\t" 422_op_blend_pan_can_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
348 "vqmovun.s16 d1, q5 \n\t" 423 uint16x8_t sc00_16x8;
349 "vstm %[d]!, {d0,d1} \n\t" 424 uint16x8_t sc01_16x8;
350 "cmp %[tmp], %[d] \n\t" 425 uint16x8_t sc10_16x8;
426 uint16x8_t sc11_16x8;
427 uint16x8_t x255_16x8;
428 uint32x2_t c_32x2;
429 uint32x4_t d0_32x4;
430 uint32x4_t d1_32x4;
431 uint32x4_t mask_32x4;
432 uint32x4_t s0_32x4;
433 uint32x4_t s1_32x4;
434 uint32x4_t sc0_32x4;
435 uint32x4_t sc1_32x4;
436 uint8x16_t s0_8x16;
437 uint8x16_t s1_8x16;
438 uint8x16_t sc0_8x16;
439 uint8x16_t sc1_8x16;
440 uint8x8_t c_8x8;
441 uint8x8_t s00_8x8;
442 uint8x8_t s01_8x8;
443 uint8x8_t s10_8x8;
444 uint8x8_t s11_8x8;
445 uint8x8_t sc00_8x8;
446 uint8x8_t sc01_8x8;
447 uint8x8_t sc10_8x8;
448 uint8x8_t sc11_8x8;
449
450 mask_32x4 = vdupq_n_u32(0xff000000);
451 x255_16x8 = vdupq_n_u16(0xff);
452 c_32x2 = vdup_n_u32(c);
453 c_8x8 = vreinterpret_u8_u32(c_32x2);
351 454
352 "bhi "AP"quadloop\n\t" 455 DATA32 *end = d + (l & ~7);
456 while (d < end)
457 {
458 // load 8 elements from s
459 s0_32x4 = vld1q_u32(s);
460 s0_8x16 = vreinterpretq_u8_u32(s0_32x4);
461 s00_8x8 = vget_low_u8(s0_8x16);
462 s01_8x8 = vget_high_u8(s0_8x16);
463 s1_32x4 = vld1q_u32(s+4);
464 s1_8x16 = vreinterpretq_u8_u32(s1_32x4);
465 s10_8x8 = vget_low_u8(s1_8x16);
466 s11_8x8 = vget_high_u8(s1_8x16);
467
468 // multiply MUL_SYM(c, *s);
469 sc00_16x8 = vmull_u8(s00_8x8, c_8x8);
470 sc01_16x8 = vmull_u8(s01_8x8, c_8x8);
471 sc10_16x8 = vmull_u8(s10_8x8, c_8x8);
472 sc11_16x8 = vmull_u8(s11_8x8, c_8x8);
473 sc00_16x8 = vaddq_u16(sc00_16x8, x255_16x8);
474 sc01_16x8 = vaddq_u16(sc01_16x8, x255_16x8);
475 sc10_16x8 = vaddq_u16(sc10_16x8, x255_16x8);
476 sc11_16x8 = vaddq_u16(sc11_16x8, x255_16x8);
477 sc00_8x8 = vshrn_n_u16(sc00_16x8, 8);
478 sc01_8x8 = vshrn_n_u16(sc01_16x8, 8);
479 sc10_8x8 = vshrn_n_u16(sc10_16x8, 8);
480 sc11_8x8 = vshrn_n_u16(sc11_16x8, 8);
481 sc0_8x16 = vcombine_u8(sc00_8x8, sc01_8x8);
482 sc1_8x16 = vcombine_u8(sc10_8x8, sc11_8x8);
483
484 // add alpha channel
485 sc0_32x4 = vreinterpretq_u32_u8(sc0_8x16);
486 sc1_32x4 = vreinterpretq_u32_u8(sc1_8x16);
487 d0_32x4 = vorrq_u32(sc0_32x4, mask_32x4);
488 d1_32x4 = vorrq_u32(sc1_32x4, mask_32x4);
489
490 // save result
491 vst1q_u32(d, d0_32x4);
492 vst1q_u32(d+4, d1_32x4);
493
494 d+=8;
495 s+=8;
496 }
353 497
498 end += (l & 7);
499 while (d < end)
500 {
501 *d++ = 0xff000000 + MUL3_SYM(c, *s);
502 s++;
503 }
504}
354 505
355 "b "AP"done\n\t" 506static void
356 AP"loopout: \n\t" 507_op_blend_p_caa_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
357 "cmp %[d], %[e] \n\t" 508 uint16x8_t ad0_16x8;
358 "beq "AP"done\n\t" 509 uint16x8_t ad1_16x8;
359 "sub %[tmp],%[e], %[d] \n\t" 510 uint16x8_t cs0_16x8;
360 "cmp %[tmp],$0x04 \n\t" 511 uint16x8_t cs1_16x8;
361 "beq "AP"singleloop2 \n\t" 512 uint32x4_t ad_32x4;
513 uint32x4_t alpha_32x4;
514 uint32x4_t c_32x4;
515 uint32x4_t cond_32x4;
516 uint32x4_t cs_32x4;
517 uint32x4_t d_32x4;
518 uint32x4_t s_32x4;
519 uint32x4_t x0_32x4;
520 uint32x4_t x1_32x4;
521 uint8x16_t ad_8x16;
522 uint8x16_t alpha_8x16;
523 uint8x16_t c_8x16;
524 uint8x16_t cs_8x16;
525 uint8x16_t d_8x16;
526 uint8x16_t s_8x16;
527 uint8x16_t x0_8x16;
528 uint8x16_t x1_8x16;
529 uint8x8_t ad0_8x8;
530 uint8x8_t ad1_8x8;
531 uint8x8_t alpha0_8x8;
532 uint8x8_t alpha1_8x8;
533 uint8x8_t c_8x8;
534 uint8x8_t cs0_8x8;
535 uint8x8_t cs1_8x8;
536 uint8x8_t d0_8x8;
537 uint8x8_t d1_8x8;
538 uint8x8_t s0_8x8;
539 uint8x8_t s1_8x8;
362 540
363 AP"dualloop2: \n\t" 541 int temp = (1 + c) & 0xff;
364 "vldm %[d], {d4} \n\t"
365 "vldm %[s]!, {d0} \n\t"
366 542
367 // Long version of d 543 x1_8x16 = vdupq_n_u8(0x1);
368 "vmovl.u8 q8, d4 \n\t" 544 x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
545 c_32x4 = vdupq_n_u32(temp);
546 c_32x4 = vmulq_u32(x1_32x4, c_32x4);
547 c_8x16 = vreinterpretq_u8_u32(c_32x4);
548 c_8x8 = vget_low_u8(c_8x16);
549 x0_8x16 = vdupq_n_u8(0x0);
550 x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
369 551
370 // Long version of s 552 DATA32 *end = d + (l & ~3);
371 "vmovl.u8 q6, d0 \n\t" 553 while (d < end)
554 {
555 // load 4 elements from s
556 s_32x4 = vld1q_u32(s);
557 s_8x16 = vreinterpretq_u8_u32(s_32x4);
558 s0_8x8 = vget_low_u8(s_8x16);
559 s1_8x8 = vget_high_u8(s_8x16);
372 560
373 // q4/q5 = s-d 561 // multiply MUL_256(c, *s)
374 "vsub.s16 q4, q6, q8 \n\t" 562 cs0_16x8 = vmull_u8(c_8x8, s0_8x8);
563 cs1_16x8 = vmull_u8(c_8x8, s1_8x8);
564 cs0_8x8 = vshrn_n_u16(cs0_16x8,8);
565 cs1_8x8 = vshrn_n_u16(cs1_16x8,8);
566 cs_8x16 = vcombine_u8(cs0_8x8, cs1_8x8);
567 cs_32x4 = vreinterpretq_u32_u8(cs_8x16);
375 568
376 // Multiply 569 // select s if c is 0
377 "vmul.s16 q4, q4,q15 \n\t" 570 cond_32x4 = vceqq_u32(c_32x4, x0_32x4);
571 cs_32x4 = vbslq_u32(cond_32x4, s_32x4 , cs_32x4);
378 572
379 // Shift down 573 // calculate alpha = 256 - (*s >> 24)
380 "vshr.s16 q4, #7 \n\t" 574 alpha_32x4 = vshrq_n_u32(cs_32x4, 24);
575 alpha_32x4 = vmulq_u32(x1_32x4, alpha_32x4);
576 alpha_8x16 = vreinterpretq_u8_u32(alpha_32x4);
577 alpha_8x16 = vsubq_u8(x0_8x16, alpha_8x16);
578 alpha0_8x8 = vget_low_u8(alpha_8x16);
579 alpha1_8x8 = vget_high_u8(alpha_8x16);
381 580
382 // Add d 581 // load 4 elements from d
383 "vqadd.s16 q4, q4, q8 \n\t" 582 d_32x4 = vld1q_u32(d);
583 d_8x16 = vreinterpretq_u8_u32(d_32x4);
584 d0_8x8 = vget_low_u8(d_8x16);
585 d1_8x8 = vget_high_u8(d_8x16);
384 586
385 // Shrink to save 587 // multiply MUL_256(a, *d)
386 "vqmovun.s16 d0, q4 \n\t" 588 ad0_16x8 = vmull_u8(alpha0_8x8, d0_8x8);
589 ad1_16x8 = vmull_u8(alpha1_8x8, d1_8x8);
590 ad0_8x8 = vshrn_n_u16(ad0_16x8,8);
591 ad1_8x8 = vshrn_n_u16(ad1_16x8,8);
592 ad_8x16 = vcombine_u8(ad0_8x8, ad1_8x8);
593 ad_32x4 = vreinterpretq_u32_u8(ad_8x16);
387 594
388 "vstm %[d]!, {d0} \n\t" 595 // select d if alpha is 0
596 alpha_32x4 = vreinterpretq_u32_u8(alpha_8x16);
597 cond_32x4 = vceqq_u32(alpha_32x4, x0_32x4);
598 ad_32x4 = vbslq_u32(cond_32x4, d_32x4 , ad_32x4);
389 599
390 "cmp %[d], %[e] \n\t" 600 // add up everything
391 "beq "AP"done \n\t" 601 d_32x4 = vaddq_u32(cs_32x4, ad_32x4);
392 602
393 AP"singleloop2: \n\t" 603 // save result
394 "vld1.32 d4[0], [%[d]] \n\t" 604 vst1q_u32(d, d_32x4);
395 "vld1.32 d0[0], [%[s]]! \n\t"
396 605
397 // Long version of 'd' 606 d+=4;
398 "vmovl.u8 q8, d4 \n\t" 607 s+=4;
608 }
399 609
400 // Long version of 's' 610 end += (l & 3);
401 "vmovl.u8 q6, d0 \n\t" 611 int alpha;
612 c = 1 + (c & 0xff);
613 while (d < end)
614 {
615 DATA32 sc = MUL_256(c, *s);
616 alpha = 256 - (sc >> 24);
617 *d = sc + MUL_256(alpha, *d);
618 d++;
619 s++;
620 }
402 621
403 // d8 = s -d 622}
404 "vsub.s16 d8, d12, d16 \n\t"
405 623
406 // Multiply 624static void
407 "vmul.s16 d8, d8, d30 \n\t" 625_op_blend_pan_caa_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
626 int16x8_t c_i16x8;
627 int16x8_t d0_i16x8;
628 int16x8_t d1_i16x8;
629 int16x8_t ds0_i16x8;
630 int16x8_t ds1_i16x8;
631 int16x8_t s0_i16x8;
632 int16x8_t s1_i16x8;
633 int8x16_t ds_i8x16;
634 int8x8_t ds0_i8x8;
635 int8x8_t ds1_i8x8;
636 uint16x8_t c_16x8;
637 uint16x8_t d0_16x8;
638 uint16x8_t d1_16x8;
639 uint16x8_t s0_16x8;
640 uint16x8_t s1_16x8;
641 uint32x4_t d_32x4;
642 uint32x4_t ds_32x4;
643 uint32x4_t s_32x4;
644 uint8x16_t d_8x16;
645 uint8x16_t s_8x16;
646 uint8x8_t d0_8x8;
647 uint8x8_t d1_8x8;
648 uint8x8_t s0_8x8;
649 uint8x8_t s1_8x8;
408 650
409 // Shift down 651 c = 1 + (c & 0xff);
410 "vshr.s16 d8, #7 \n\t"
411 652
412 // Add 'd' 653 c_16x8 = vdupq_n_u16(c);
413 "vqadd.s16 d8, d8, d16 \n\t" 654 c_i16x8 = vreinterpretq_s16_u16(c_16x8);
414 655
415 // Shrink to save 656 DATA32 *end = d + (l & ~3);
416 "vqmovun.s16 d0, q4 \n\t" 657 while (d < end)
658 {
659 // load 4 elements from d
660 d_32x4 = vld1q_u32(d);
661 d_8x16 = vreinterpretq_u8_u32(d_32x4);
662 d0_8x8 = vget_low_u8(d_8x16);
663 d1_8x8 = vget_high_u8(d_8x16);
417 664
418 "vst1.32 d0[0], [%[d]] \n\t" 665 // spread d so that each channel occupies 16 bit
666 d0_16x8 = vmovl_u8(d0_8x8);
667 d1_16x8 = vmovl_u8(d1_8x8);
668 d0_i16x8 = vreinterpretq_s16_u16(d0_16x8);
669 d1_i16x8 = vreinterpretq_s16_u16(d1_16x8);
419 670
671 // load 4 elements from s
672 s_32x4 = vld1q_u32(s);
673 s_8x16 = vreinterpretq_u8_u32(s_32x4);
674 s0_8x8 = vget_low_u8(s_8x16);
675 s1_8x8 = vget_high_u8(s_8x16);
420 676
421 AP"done: \n\t" 677 // spread s so that each channel occupies 16 bit
678 s0_16x8 = vmovl_u8(s0_8x8);
679 s1_16x8 = vmovl_u8(s1_8x8);
680 s0_i16x8 = vreinterpretq_s16_u16(s0_16x8);
681 s1_i16x8 = vreinterpretq_s16_u16(s1_16x8);
682
683 // interpolate
684 ds0_i16x8 = vsubq_s16(s0_i16x8, d0_i16x8);
685 ds1_i16x8 = vsubq_s16(s1_i16x8, d1_i16x8);
686 ds0_i16x8 = vmulq_s16(ds0_i16x8, c_i16x8);
687 ds1_i16x8 = vmulq_s16(ds1_i16x8, c_i16x8);
688 ds0_i16x8 = vshrq_n_s16(ds0_i16x8, 8);
689 ds1_i16x8 = vshrq_n_s16(ds1_i16x8, 8);
690 ds0_i16x8 = vaddq_s16(ds0_i16x8, d0_i16x8);
691 ds1_i16x8 = vaddq_s16(ds1_i16x8, d1_i16x8);
692 ds0_i8x8 = vmovn_s16(ds0_i16x8);
693 ds1_i8x8 = vmovn_s16(ds1_i16x8);
694
695 // save result
696 ds_i8x16 = vcombine_s8(ds0_i8x8, ds1_i8x8);
697 ds_32x4 = vreinterpretq_u32_s8(ds_i8x16);
698 vst1q_u32(d, ds_32x4);
699
700 d+=4;
701 s+=4;
702 }
422 703
423 // No output 704 end += (l & 3);
424 : 705 while (d < end)
425 // Input 706 {
426 : [s] "r" (s), [d] "r" (d), [e] "r" (e), [c] "r" (c), [tmp] "r" (tmp) 707 *d = INTERP_256(c, *s, *d);
427 // Clobbered 708 d++;
428 : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "memory" 709 s++;
429 ); 710 }
430#undef AP
431#endif
432} 711}
433 712
434#define _op_blend_pas_c_dp_neon _op_blend_p_c_dp_neon 713#define _op_blend_pas_c_dp_neon _op_blend_p_c_dp_neon
435#define _op_blend_pan_c_dp_neon _op_blend_p_c_dp_neon
436#define _op_blend_p_can_dp_neon _op_blend_p_c_dp_neon
437#define _op_blend_pas_can_dp_neon _op_blend_p_c_dp_neon 714#define _op_blend_pas_can_dp_neon _op_blend_p_c_dp_neon
438#define _op_blend_p_caa_dp_neon _op_blend_p_c_dp_neon
439#define _op_blend_pas_caa_dp_neon _op_blend_p_c_dp_neon 715#define _op_blend_pas_caa_dp_neon _op_blend_p_c_dp_neon
440 716
441#define _op_blend_p_c_dpan_neon _op_blend_p_c_dp_neon 717#define _op_blend_p_c_dpan_neon _op_blend_p_c_dp_neon
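The interpolation added in _op_blend_pan_caa_dp_neon above (vsubq_s16, vmulq_s16, vshrq_n_s16, vaddq_s16) matches the scalar tail loop's INTERP_256(c, *s, *d). A minimal scalar sketch of one pixel, assuming the conventional per-channel expansion of INTERP_256 and a hypothetical helper name (interp_256_px); c is already mapped to 1..256 as in the function:

#include <stdint.h>

/* Hypothetical scalar reference, for illustration only: one ARGB32 pixel,
 * each channel computed as d + ((c * (s - d)) >> 8), with c in 1..256. */
static inline uint32_t interp_256_px(int c, uint32_t s, uint32_t d)
{
   uint32_t out = 0;
   for (int shift = 0; shift < 32; shift += 8)
     {
        int sc = (s >> shift) & 0xff;
        int dc = (d >> shift) & 0xff;
        int v  = dc + ((c * (sc - dc)) >> 8);   /* arithmetic shift, like vshrq_n_s16 */
        out |= ((uint32_t)(v & 0xff)) << shift;
     }
   return out;
}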