_op_blend_c_dp_neon miscalculation fix

Summary:
When processing random data, the result of this function differs from the C variant in more than 50% of cases.
This difference is due to the alpha calculation. In the C code:

 a = 256 - (c >> 24)

in NEON:

"vmvn.u8  q7,q6       \n\t"
// ie (8 bit)~(c>>24)    ===   255 - (c>>24)

We can't just add 1, because that would overflow when (c >> 24) == 0 (we use only 8 bits per channel in the vector registers).
So here is the solution:
copy *d right before the multiplication and add it to the product afterwards, since d * (255 - (c >> 24)) + d == d * (256 - (c >> 24)).

This makes the function slower by 20-30%, but it is still at least 2 times faster than the C code.

Reviewers: raster

Differential Revision: https://phab.enlightenment.org/D455
This commit is contained in:
Carsten Haitzler 2014-01-21 00:11:33 +09:00
parent 0d1d51f64e
commit 1a9ebc02c0
1 changed files with 25 additions and 9 deletions

View File

@ -28,8 +28,10 @@ _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA3
// Use 'tmp' not 'd'
"vld1.32 d0[0], [%[d]] \n\t"
// Only touch d1
"vmovl.u8 q10, d0 \n\t"
"vmull.u8 q0, d0, d14 \n\t"
"vqrshrn.u16 d0, q0, #8 \n\t"
"vadd.u16 q0, q0, q10 \n\t"
"vshrn.u16 d0, q0, #8 \n\t"
"vadd.u8 d0, d12, d0 \n\t"
"vst1.32 d0[0], [%[d]] \n\t"
@ -47,8 +49,10 @@ _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA3
AP "dualloopint: \n\t"
"vldr.32 d0, [%[d]] \n\t"
"vmovl.u8 q10, d0 \n\t"
"vmull.u8 q1, d0, d14 \n\t"
"vqrshrn.u16 d0, q1, #8 \n\t"
"vadd.u16 q1, q1, q10 \n\t"
"vshrn.u16 d0, q1, #8 \n\t"
"vqadd.u8 d0, d0, d12 \n\t"
"vstm %[d]!, {d0} \n\t"
@ -66,15 +70,23 @@ _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA3
AP "quadloopint:\n\t"
"vldm %[d], {d0,d1,d2,d3} \n\t"
"vmovl.u8 q10, d0 \n\t"
"vmovl.u8 q11, d1 \n\t"
"vmovl.u8 q12, d2 \n\t"
"vmovl.u8 q13, d3 \n\t"
"vmull.u8 q2, d0, d14 \n\t"
"vmull.u8 q3, d1, d15 \n\t"
"vmull.u8 q4, d2, d14 \n\t"
"vmull.u8 q5, d3, d15 \n\t"
"vadd.u16 q2, q2, q10 \n\t"
"vadd.u16 q3, q3, q11 \n\t"
"vadd.u16 q4, q4, q12 \n\t"
"vadd.u16 q5, q5, q13 \n\t"
"vqrshrn.u16 d0, q2, #8 \n\t"
"vqrshrn.u16 d1, q3, #8 \n\t"
"vqrshrn.u16 d2, q4, #8 \n\t"
"vqrshrn.u16 d3, q5, #8 \n\t"
"vshrn.u16 d0, q2, #8 \n\t"
"vshrn.u16 d1, q3, #8 \n\t"
"vshrn.u16 d2, q4, #8 \n\t"
"vshrn.u16 d3, q5, #8 \n\t"
"vqadd.u8 q0, q6, q0 \n\t"
"vqadd.u8 q1, q6, q1 \n\t"
@ -95,8 +107,10 @@ _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA3
"sub %[tmp],%[e],$0x7 \n\t"
AP "dualloop2int: \n\t"
"vldr.64 d0, [%[d]] \n\t"
"vmovl.u8 q10, d0 \n\t"
"vmull.u8 q1, d0, d14 \n\t"
"vqrshrn.u16 d0, q1, #8 \n\t"
"vadd.u16 q1, q1, q10 \n\t"
"vshrn.u16 d0, q1, #8 \n\t"
"vqadd.u8 d0, d0, d12 \n\t"
"vstr.64 d0, [%[d]] \n\t"
@ -111,8 +125,10 @@ _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA3
AP "singleloop2: \n\t"
"vld1.32 d0[0], [%[d]] \n\t"
"vmovl.u8 q10, d0 \n\t"
"vmull.u8 q1, d0, d14 \n\t"
"vqrshrn.u16 d0, q1, #8 \n\t"
"vadd.u16 q1, q1, q10 \n\t"
"vshrn.u16 d0, q1, #8 \n\t"
"vqadd.u8 d0, d0, d12 \n\t"
"vst1.32 d0[0], [%[d]] \n\t"
@ -122,7 +138,7 @@ _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA3
: // output regs
// Input
: [e] "r" (e = d + l), [d] "r" (d), [c] "r" (c), [tmp] "r" (tmp)
: "q0", "q1", "q2","q3", "q4","q5","q6", "q7","memory" // clobbered
: "q0", "q1", "q2","q3", "q4","q5","q6", "q7", "q10", "q11", "q12", "q13", "memory" // clobbered
);
#undef AP