From deec62c9b66c45705918bf659ce1d2107dbc6831 Mon Sep 17 00:00:00 2001
From: "Carsten Haitzler (Rasterman)"
Date: Fri, 15 Nov 2013 19:16:03 +0900
Subject: [PATCH] evas - fix neon blend code used for text rendering to not leave dirty end

---
 .../evas_op_blend/op_blend_mask_color_neon.c  | 249 +++++++-----------
 1 file changed, 90 insertions(+), 159 deletions(-)

diff --git a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
index da7cd3e24d..252f276ba8 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
@@ -19,180 +19,111 @@
 #ifdef BUILD_NEON
 static void
 _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
-   DATA32 *e;
+   // main loop process data in pairs, so we need count to be even
+   DATA32 *e = d + l - (l % 2);
-   DEBUG_FNCOUNT("");
+   // everything we can do only once per cycle
+   // loading of 'c', initialization of some registers
+   __asm__ __volatile__
+   (
+     ".fpu neon \n\t"
+     " vmov.32 d30[0], %[c] \n\t"
+     " vmov.i16 q10, #255 \n\t"
+     " vmov.i16 q11, #256 \n\t"
+     " veor d29, d29, d29 \n\t"
+     " vzip.8 d30, d29 \n\t"
+     " vmov d31, d30 \n\t"
+     :
+     : [c] "r" (c)
+     : "q10", "q11", "q15", "d29"
+   );
+   while (d < e)
+     {
+        // main cycle
+        __asm__ __volatile__
+        (
+          // load pair '*d' and '*(d+1)' into vector register
+          " vldm %[d], {d4} \n\t"
-#define AP "blend_mas_c_dp_"
-   asm volatile (
-     ".fpu neon \n\t"
-     " vdup.i32 q15, %[c] \n\t"
-     " vmov.i8 q14, #1 \n\t"
+          // load '*m' and '*(m+1)'
+          " veor q0, q0, q0 \n\t"
+          " vld1.8 d0[0], [%[m]]! \n\t"
+          " vld1.8 d1[0], [%[m]]! \n\t"
-     // If aligned already - straight to quads
-     " andS %[tmp], %[d],$0xf \n\t"
-     " beq "AP"quadloops \n\t"
+          // spread values from d in vector registers so for each
+          // 8 bit channel data we have 8 bit of zeros
+          // so each 32bit value occupies now one 64 bit register
+          " veor d5, d5, d5 \n\t"
+          " vzip.8 d4, d5 \n\t"
-     " andS %[tmp], %[d],$0x4 \n\t"
-     " beq "AP"dualloop \n\t"
+          // copy *m values in corresponding registers
+          " vdup.u16 d0, d0[0] \n\t"
+          " vdup.u16 d1, d1[0] \n\t"
-   AP"singleloop: \n\t"
-     " vld1.8 d0[0], [%[m]]! \n\t"
-     " vld1.32 d4[0], [%[d]] \n\t"
-     " vdup.u8 d0, d0[0] \n\t"
-     " vmull.u8 q4, d0, d30 \n\t"
-     " vqrshrn.u16 d12, q4, #8 \n\t"
-     " vmvn.u16 d14, d12 \n\t"
-     " vshr.u32 d16, d14, #24 \n\t"
-     " vmul.u32 d16, d16, d28 \n\t"
-     " vmull.u8 q7, d16, d4 \n\t"
-     " vqrshrn.u16 d0, q7, #8 \n\t"
-     " vqadd.u8 d0, d0, d12 \n\t"
-     " vst1.32 d0[0], [%[d]]! \n\t"
+          // multiply a * c
+          " vmul.u16 q13, q0, q15 \n\t"
+          " vadd.i16 q13, q13, q10 \n\t"
+          " vsri.16 q13, q13, #8 \n\t"
+          " vand q13, q13, q10 \n\t"
-     // Can we go the fast path?
-     " andS %[tmp], %[d],$0xf \n\t"
-     " beq "AP"quadloops \n\t"
+          // extract negated alpha
+          " vdup.u16 d24, d26[3] \n\t"
+          " vdup.u16 d25, d27[3] \n\t"
+          " vsub.i16 q12, q11, q12 \n\t"
-   AP"dualloop: \n\t"
-     " sub %[tmp], %[e], %[d] \n\t"
-     " cmp %[tmp], #16 \n\t"
-     " blt "AP"loopout \n\t"
+          // multiply alpha * (*d) and add a*c
+          " vmul.u16 q2, q2, q12 \n\t"
+          " vsri.16 q2, q2, #8 \n\t"
+          " vand q2, q2, q10 \n\t"
+          " vadd.i16 q2, q2, q13 \n\t"
+          " vand q2, q2, q10 \n\t"
-     " vld1.16 d0[0], [%[m]]! \n\t"
-     " vldm %[d], {d4} \n\t"
-     " vmovl.u8 q0, d0 \n\t"
-     " vmovl.u8 q0, d0 \n\t"
-     " vmul.u32 q0, q14 \n\t"
-     " vmull.u8 q4, d0, d30 \n\t"
-     " vqrshrn.u16 d12, q4, #8 \n\t"
-     " vmvn.u16 d14, d12 \n\t"
-     " vshr.u32 d16, d14, #24 \n\t"
-     " vmul.u32 d16, d16, d28 \n\t"
-     " vmull.u8 q7, d16, d4 \n\t"
-     " vqrshrn.u16 d0, q7, #8 \n\t"
-     " vqadd.u8 q0, q0, q6 \n\t"
-     " vstm %[d]!, {d0} \n\t"
+          // save results
+          " vqmovn.u16 d4, q2 \n\t"
+          " vstm %[d]!, {d4} \n\t"
+          : [d] "+r" (d), [m] "+r" (m)
+          : [c] "r" (c)
+          : "q0", "q2", "q15", "q13", "q12", "q11", "q10",
+            "memory"
+        );
+     }
+   if (l % 2)
+     {
+        // do analogue of main loop for last element, if needed
+        __asm__ __volatile__
+        (
+          " vld1.32 d4[0], [%[d]] \n\t"
-   AP"quadloops: \n\t"
-     " sub %[tmp], %[e], %[d] \n\t"
-     " cmp %[tmp], #16 \n\t"
-     " blt "AP"loopout \n\t"
+          " veor d0, d0, d0 \n\t"
+          " vld1.8 d0[0], [%[m]]! \n\t"
+          " veor d5, d5, d5 \n\t"
+          " vzip.8 d4, d5 \n\t"
-     " sub %[tmp], %[e], #15 \n\t"
+          " vdup.u16 d0, d0[0] \n\t"
-     " sub %[d], #16 \n\t"
-   AP"fastloop:"
-     " add %[d], #16 \n\t"
-     " cmp %[tmp], %[d] \n\t"
-     " ble "AP"loopout \n\t"
-   AP"quadloopint: \n\t"
-     " ldr %[x], [%[m]] \n\t"
-     " add %[m], #4 \n\t"
-     " cmp %[x], #0 \n\t"
-     " beq "AP"fastloop \n\t"
-     " vmov.32 d0[0], %[x] \n\t"
-     " vldm %[d], {d4,d5} \n\t"
+          " vmul.u16 d26, d0, d30 \n\t"
+          " vadd.i16 d26, d26, d20 \n\t"
+          " vsri.16 d26, d26, #8 \n\t"
+          " vand d26, d26, d20 \n\t"
-     // Expand M: Fixme: Can we do this quicker?
-     " vmovl.u8 q0, d0 \n\t"
-     " vmovl.u8 q0, d0 \n\t"
-     " vmul.u32 q0, q14 \n\t"
+          " vdup.u16 d24, d26[3] \n\t"
-     // Multiply a * c
-     " vmull.u8 q4, d0, d30 \n\t"
-     " vmull.u8 q5, d1, d31 \n\t"
+          " vsub.i16 d24, d22, d24 \n\t"
+          " vmul.u16 d4, d4, d24 \n\t"
+          " vsri.16 d4, d4, #8 \n\t"
+          " vand d4, d4, d20 \n\t"
+          " vadd.i16 d4, d4, d26 \n\t"
+          " vand d4, d4, d20 \n\t"
-     // Shorten
-     " vqrshrn.u16 d12, q4, #8 \n\t"
-     " vqrshrn.u16 d13, q5, #8 \n\t"
-
-     // extract negated alpha
-     " vmvn.u16 q7, q6 \n\t"
-     " vshr.u32 q8, q7, #24 \n\t"
-     " vmul.u32 q8, q8, q14 \n\t"
-
-     // Multiply
-     " vmull.u8 q7, d16, d4 \n\t"
-     " vmull.u8 q8, d17, d5 \n\t"
-
-     " vqrshrn.u16 d0, q7, #8 \n\t"
-     " vqrshrn.u16 d1, q8, #8 \n\t"
-
-     // Add
-     " vqadd.u8 q0, q0, q6 \n\t"
-
-     " vstm %[d]!, {d0,d1} \n\t"
-
-     " cmp %[tmp], %[d] \n\t"
-     " bhi "AP"quadloopint \n\t"
-
-   AP"loopout: \n\t"
-#if NEONDEBUG
-     "cmp %[d], %[e] \n\t"
-     "ble "AP"foo \n\t"
-     "cmp %[tmp], %[m] \n\t"
-     "sub %[x], %[x] \n\t"
-     "vst1.32 d0[0], [%[x]] \n\t"
-   AP"foo: \n\t"
-#endif
-
-     " cmp %[d], %[e] \n\t"
-     " beq "AP"done \n\t"
-     " sub %[tmp],%[e], %[d] \n\t"
-     " cmp %[tmp],#4 \n\t"
-     " beq "AP"singleout \n\t"
-
-   AP "dualloop2: \n\t"
-     "sub %[tmp],%[e],$0x8 \n\t"
-     " vld1.16 d0[0], [%[m]]! \n\t"
-     " vldm %[d], {d4} \n\t"
-     " vmovl.u8 q0, d0 \n\t"
-     " vmovl.u8 q0, d0 \n\t"
-     " vmul.u32 q0, q14 \n\t"
-     " vmull.u8 q4, d0, d30 \n\t"
-     " vqrshrn.u16 d12, q4, #8 \n\t"
-     " vmvn.u16 d14, d12 \n\t"
-     " vshr.u32 d16, d14, #24 \n\t"
-     " vmul.u32 d16, d16, d28 \n\t"
-     " vmull.u8 q7, d16, d4 \n\t"
-     " vqrshrn.u16 d0, q7, #8 \n\t"
-     " vqadd.u8 q0, q0, q6 \n\t"
-     " vstm %[d]!, {d0} \n\t"
-
-     " cmp %[e], %[d] \n\t"
-     " beq "AP"done \n\t"
-
-   AP"singleout: \n\t"
-     " vld1.8 d0[0], [%[m]]! \n\t"
-     " vld1.32 d4[0], [%[d]] \n\t"
-     " vdup.u8 d0, d0[0] \n\t"
-     " vmull.u8 q4, d0, d30 \n\t"
-     " vqrshrn.u16 d12, q4, #8 \n\t"
-     " vmvn.u16 d14, d12 \n\t"
-     " vshr.u32 d16, d14, #24 \n\t"
-     " vmul.u32 d16, d16, d28 \n\t"
-     " vmull.u8 q7, d16, d4 \n\t"
-     " vqrshrn.u16 d0, q7, #8 \n\t"
-     " vqadd.u8 q0, q0, q6 \n\t"
-     " vst1.32 d0[0], [%[d]]! \n\t"
-   AP"done: \n\t"
-#if NEONDEBUG
-     "cmp %[d], %[e] \n\t"
-     "beq "AP"reallydone \n\t"
-     "sub %[tmp], %[tmp] \n\t"
-     "vst1.32 d0[0], [%[tmp]] \n\t"
-   AP"reallydone:"
-#endif
-     : // Out
-     : [e] "r" (d + l), [d] "r" (d), [c] "r" (c),
-       [tmp] "r" (7), [m] "r" (m), [x] "r" (0)
-     : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q8","q14","q15",
-       "memory" // clobbered
-   );
-#undef AP
+          " vqmovn.u16 d4, q2 \n\t"
+          " vst1.32 {d4[0]}, [%[d]]! \n\t"
+          : [d] "+r" (d), [m] "+r" (m)
+          : [c] "r" (c)
+          : "q0", "q2", "q15", "q13", "q12", "q11", "q10",
+            "memory"
+        );
+     }
 }
 #endif