evas: _op_blend_p_dp_neon and _op_blend_pas_dp_neon miscalculation fix

Summary:
When processing random data, the result of this function differs from the C variant in more than 50% of cases.
This difference is due to the alpha calculation. In the C code:

alpha = 256 - (*s >> 24)

in NEON:

"vmvn.u8 q4,q0 \n\t"
// ie ~(*s>>24) === 255 - (*s>>24)

We can't just add "1", as overflow will occur when (*s >> 24) == 0 (we use only 8 bits per channel in the vector registers).
So here is the solution:
copy *d right before the multiplication and add it to the multiplication's result later.

Same approach as in D455.

Reviewers: raster, cedric, stefan_schmidt

Reviewed By: cedric

Subscribers: cedric

Projects: #efl

Differential Revision: https://phab.enlightenment.org/D2308

Signed-off-by: Cedric BAIL <cedric@osg.samsung.com>
This commit is contained in:
Cedric BAIL 2015-04-10 11:22:33 +02:00
parent 6ceac2509d
commit 3b46609140
1 changed file with 58 additions and 18 deletions

View File

@ -30,8 +30,10 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
"vmul.u32 d8, d16, d8 \n\t"
"vmovl.u8 q9, d4 \n\t"
"vmull.u8 q6, d4,d8 \n\t"
"vqrshrn.u16 d8, q6, #8 \n\t"
"vadd.u16 q6, q6, q9 \n\t"
"vshrn.u16 d8, q6, #8 \n\t"
// Add to 's'
"vqadd.u8 q2, q4,q0 \n\t"
@ -61,8 +63,10 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
"vmul.u32 d8, d16, d8 \n\t"
"vmovl.u8 q9, d4 \n\t"
"vmull.u8 q6, d4,d8 \n\t"
"vqrshrn.u16 d8, q6, #8 \n\t"
"vadd.u16 q6, q6, q9 \n\t"
"vshrn.u16 d8, q6, #8 \n\t"
// Add to 's'
"vqadd.u8 d4, d8,d0 \n\t"
"vstr d4, [%[d]] \n\t"
@ -87,13 +91,18 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
// Multiply into all fields
"vmul.u32 q4, q8,q4 \n\t"
"vmovl.u8 q9, d4 \n\t"
"vmovl.u8 q10, d5 \n\t"
// a * d (clobbering 'd'/q7)
"vmull.u8 q6, d4,d8 \n\t"
"vmull.u8 q2, d5,d9 \n\t"
"vadd.u16 q6, q6, q9 \n\t"
"vadd.u16 q2, q2, q10 \n\t"
// Shift & narrow it
"vqrshrn.u16 d8, q6, #8 \n\t"
"vqrshrn.u16 d9, q2, #8 \n\t"
"vshrn.u16 d8, q6, #8 \n\t"
"vshrn.u16 d9, q2, #8 \n\t"
// Add to s
"vqadd.u8 q2, q4,q0 \n\t"
@ -126,6 +135,10 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
"vmul.u32 q4, q8,q4 \n\t"
"vmul.u32 q5, q8,q5 \n\t"
"vmovl.u8 q9, d4 \n\t"
"vmovl.u8 q10, d5 \n\t"
"vmovl.u8 q11, d6 \n\t"
"vmovl.u8 q12, d7 \n\t"
// a * d (clobbering 'd'/q7)
"vmull.u8 q6, d4,d8 \n\t"
@ -133,13 +146,18 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
"vmull.u8 q7, d6,d10 \n\t"
"vmull.u8 q3, d7,d11 \n\t"
"vadd.u16 q6, q6, q9 \n\t"
"vadd.u16 q2, q2, q10 \n\t"
"vadd.u16 q7, q7, q11 \n\t"
"vadd.u16 q3, q3, q12 \n\t"
"cmp %[tmp], %[d]\n\t"
// Shift & narrow it
"vqrshrn.u16 d8, q6, #8 \n\t"
"vqrshrn.u16 d9, q2, #8 \n\t"
"vqrshrn.u16 d10, q7, #8 \n\t"
"vqrshrn.u16 d11, q3, #8 \n\t"
"vshrn.u16 d8, q6, #8 \n\t"
"vshrn.u16 d9, q2, #8 \n\t"
"vshrn.u16 d10, q7, #8 \n\t"
"vshrn.u16 d11, q3, #8 \n\t"
// Add to s
@ -171,8 +189,10 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
"vmul.u32 d8, d16, d8 \n\t"
"vmovl.u8 q9, d4 \n\t"
"vmull.u8 q6, d4,d8 \n\t"
"vqrshrn.u16 d8, q6, #8 \n\t"
"vadd.u16 q6, q6, q9 \n\t"
"vshrn.u16 d8, q6, #8 \n\t"
// Add to 's'
"vqadd.u8 d4, d8,d0 \n\t"
@ -195,8 +215,10 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
"vmul.u32 d8, d8, d16 \n\t"
"vmovl.u8 q9, d4 \n\t"
"vmull.u8 q6, d8,d4 \n\t"
"vqrshrn.u16 d8, q6, #8 \n\t"
"vadd.u16 q6, q6, q9 \n\t"
"vshrn.u16 d8, q6, #8 \n\t"
// Add to 's'
"vqadd.u8 d0, d0,d8 \n\t"
"vst1.32 d0[0], [%[d]] \n\t"
@ -247,10 +269,12 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
// Mulitply into all fields
"vmul.u32 d8, d8, d16 \n\t"
"vmovl.u8 q9, d4 \n\t"
// Multiply out
"vmull.u8 q6, d8, d4 \n\t"
"vadd.u16 q6, q6, q9 \n\t"
"vqrshrn.u16 d8, q6, #8 \n\t"
"vshrn.u16 d8, q6, #8 \n\t"
// Add to s
"vqadd.u8 d0, d0,d8 \n\t"
@ -278,10 +302,12 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
// Mulitply into all fields
"vmul.u32 d8, d8, d16 \n\t"
"vmovl.u8 q9, d4 \n\t"
// Multiply out
"vmull.u8 q6, d8, d4 \n\t"
"vadd.u16 q6, q6, q9 \n\t"
"vqrshrn.u16 d8, q6, #8 \n\t"
"vshrn.u16 d8, q6, #8 \n\t"
// Add to s
"vqadd.u8 d0, d0,d8 \n\t"
@ -316,18 +342,28 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
"vmul.u32 q5, q5, q8 \n\t"
"pld [%[pl]] \n\t"
"vmovl.u8 q9, d4 \n\t"
"vmovl.u8 q10, d5 \n\t"
"vmovl.u8 q11, d6 \n\t"
"vmovl.u8 q12, d7 \n\t"
// Multiply out
"vmull.u8 q6, d8, d4 \n\t"
"vmull.u8 q7, d10, d6 \n\t"
"vmull.u8 q2, d9, d5 \n\t"
"vmull.u8 q3, d11, d7 \n\t"
"vadd.u16 q6, q6, q9 \n\t"
"vadd.u16 q2, q2, q10 \n\t"
"vadd.u16 q7, q7, q11 \n\t"
"vadd.u16 q3, q3, q12 \n\t"
"add %[pl], %[d], #32 \n\t"
"vqrshrn.u16 d8, q6, #8 \n\t"
"vqrshrn.u16 d10, q7, #8 \n\t"
"vqrshrn.u16 d9, q2, #8 \n\t"
"vqrshrn.u16 d11, q3, #8 \n\t"
"vshrn.u16 d8, q6, #8 \n\t"
"vshrn.u16 d10, q7, #8 \n\t"
"vshrn.u16 d9, q2, #8 \n\t"
"vshrn.u16 d11, q3, #8 \n\t"
"pld [%[pl]] \n\t"
"cmp %[tmp], %[pl] \n\t"
@ -360,10 +396,12 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
// Mulitply into all fields
"vmul.u32 d8, d8, d16 \n\t"
"vmovl.u8 q9, d4 \n\t"
// Multiply out
"vmull.u8 q6, d8, d4 \n\t"
"vadd.u16 q6, q6, q9 \n\t"
"vqrshrn.u16 d8, q6, #8 \n\t"
"vshrn.u16 d8, q6, #8 \n\t"
// Add to s
"vqadd.u8 d0, d0,d8 \n\t"
@ -389,9 +427,11 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
"vmul.u32 d8, d8, d16 \n\t"
// Multiply out
"vmovl.u8 q9, d4 \n\t"
"vmull.u8 q6, d8, d4 \n\t"
"vadd.u16 q6, q6, q9 \n\t"
"vqrshrn.u16 d8, q6, #8 \n\t"
"vshrn.u16 d8, q6, #8 \n\t"
// Add to s
"vqadd.u8 d0, d0,d8 \n\t"