summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCarsten Haitzler (Rasterman) <raster@rasterman.com>2014-01-21 00:11:33 +0900
committerCarsten Haitzler (Rasterman) <raster@rasterman.com>2014-01-21 08:50:34 +0900
commit1a9ebc02c08577edca96d804611d254f59254eb5 (patch)
treee3c3e8c954dd23c85ff98a41ec0cb006b60165f5 /src
parent0d1d51f64ed5813eefe410fb16ec60ffa32d0588 (diff)
_op_blend_c_dp_neon miscalculation fix
Summary: When processing random data, the result of this function differs from the C variant in more than 50% of cases. This difference is due to the alpha calculation. In C code: a = 256 - (c >> 24). In NEON: "vmvn.u8 q7,q6 \n\t" // i.e. (8 bit) ~(c>>24) === 255 - (c>>24). We can't just add "1", as overflow will occur in the case (c>>24) == 0 (we use only 8 bits per channel in the vector registers). So here is the solution: copy *d right before the multiplication and add it to the result afterwards, since d*(255 - a) + d == d*(256 - a). This makes the function slower by 20-30%, but it is still at least 2 times faster than the C code. Reviewers: raster Differential Revision: https://phab.enlightenment.org/D455
Diffstat (limited to 'src')
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_color_neon.c34
1 files changed, 25 insertions, 9 deletions
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c
index 8512bb4444..9e94298cc6 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c
@@ -28,8 +28,10 @@ _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA3
28 // Use 'tmp' not 'd' 28 // Use 'tmp' not 'd'
29 "vld1.32 d0[0], [%[d]] \n\t" 29 "vld1.32 d0[0], [%[d]] \n\t"
30 // Only touch d1 30 // Only touch d1
31 "vmovl.u8 q10, d0 \n\t"
31 "vmull.u8 q0, d0, d14 \n\t" 32 "vmull.u8 q0, d0, d14 \n\t"
32 "vqrshrn.u16 d0, q0, #8 \n\t" 33 "vadd.u16 q0, q0, q10 \n\t"
34 "vshrn.u16 d0, q0, #8 \n\t"
33 "vadd.u8 d0, d12, d0 \n\t" 35 "vadd.u8 d0, d12, d0 \n\t"
34 "vst1.32 d0[0], [%[d]] \n\t" 36 "vst1.32 d0[0], [%[d]] \n\t"
35 37
@@ -47,8 +49,10 @@ _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA3
47 49
48 AP "dualloopint: \n\t" 50 AP "dualloopint: \n\t"
49 "vldr.32 d0, [%[d]] \n\t" 51 "vldr.32 d0, [%[d]] \n\t"
52 "vmovl.u8 q10, d0 \n\t"
50 "vmull.u8 q1, d0, d14 \n\t" 53 "vmull.u8 q1, d0, d14 \n\t"
51 "vqrshrn.u16 d0, q1, #8 \n\t" 54 "vadd.u16 q1, q1, q10 \n\t"
55 "vshrn.u16 d0, q1, #8 \n\t"
52 "vqadd.u8 d0, d0, d12 \n\t" 56 "vqadd.u8 d0, d0, d12 \n\t"
53 57
54 "vstm %[d]!, {d0} \n\t" 58 "vstm %[d]!, {d0} \n\t"
@@ -66,15 +70,23 @@ _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA3
66 AP "quadloopint:\n\t" 70 AP "quadloopint:\n\t"
67 "vldm %[d], {d0,d1,d2,d3} \n\t" 71 "vldm %[d], {d0,d1,d2,d3} \n\t"
68 72
73 "vmovl.u8 q10, d0 \n\t"
74 "vmovl.u8 q11, d1 \n\t"
75 "vmovl.u8 q12, d2 \n\t"
76 "vmovl.u8 q13, d3 \n\t"
69 "vmull.u8 q2, d0, d14 \n\t" 77 "vmull.u8 q2, d0, d14 \n\t"
70 "vmull.u8 q3, d1, d15 \n\t" 78 "vmull.u8 q3, d1, d15 \n\t"
71 "vmull.u8 q4, d2, d14 \n\t" 79 "vmull.u8 q4, d2, d14 \n\t"
72 "vmull.u8 q5, d3, d15 \n\t" 80 "vmull.u8 q5, d3, d15 \n\t"
81 "vadd.u16 q2, q2, q10 \n\t"
82 "vadd.u16 q3, q3, q11 \n\t"
83 "vadd.u16 q4, q4, q12 \n\t"
84 "vadd.u16 q5, q5, q13 \n\t"
73 85
74 "vqrshrn.u16 d0, q2, #8 \n\t" 86 "vshrn.u16 d0, q2, #8 \n\t"
75 "vqrshrn.u16 d1, q3, #8 \n\t" 87 "vshrn.u16 d1, q3, #8 \n\t"
76 "vqrshrn.u16 d2, q4, #8 \n\t" 88 "vshrn.u16 d2, q4, #8 \n\t"
77 "vqrshrn.u16 d3, q5, #8 \n\t" 89 "vshrn.u16 d3, q5, #8 \n\t"
78 90
79 "vqadd.u8 q0, q6, q0 \n\t" 91 "vqadd.u8 q0, q6, q0 \n\t"
80 "vqadd.u8 q1, q6, q1 \n\t" 92 "vqadd.u8 q1, q6, q1 \n\t"
@@ -95,8 +107,10 @@ _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA3
95 "sub %[tmp],%[e],$0x7 \n\t" 107 "sub %[tmp],%[e],$0x7 \n\t"
96 AP "dualloop2int: \n\t" 108 AP "dualloop2int: \n\t"
97 "vldr.64 d0, [%[d]] \n\t" 109 "vldr.64 d0, [%[d]] \n\t"
110 "vmovl.u8 q10, d0 \n\t"
98 "vmull.u8 q1, d0, d14 \n\t" 111 "vmull.u8 q1, d0, d14 \n\t"
99 "vqrshrn.u16 d0, q1, #8 \n\t" 112 "vadd.u16 q1, q1, q10 \n\t"
113 "vshrn.u16 d0, q1, #8 \n\t"
100 "vqadd.u8 d0, d0, d12 \n\t" 114 "vqadd.u8 d0, d0, d12 \n\t"
101 115
102 "vstr.64 d0, [%[d]] \n\t" 116 "vstr.64 d0, [%[d]] \n\t"
@@ -111,8 +125,10 @@ _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA3
111 125
112 AP "singleloop2: \n\t" 126 AP "singleloop2: \n\t"
113 "vld1.32 d0[0], [%[d]] \n\t" 127 "vld1.32 d0[0], [%[d]] \n\t"
128 "vmovl.u8 q10, d0 \n\t"
114 "vmull.u8 q1, d0, d14 \n\t" 129 "vmull.u8 q1, d0, d14 \n\t"
115 "vqrshrn.u16 d0, q1, #8 \n\t" 130 "vadd.u16 q1, q1, q10 \n\t"
131 "vshrn.u16 d0, q1, #8 \n\t"
116 "vqadd.u8 d0, d0, d12 \n\t" 132 "vqadd.u8 d0, d0, d12 \n\t"
117 133
118 "vst1.32 d0[0], [%[d]] \n\t" 134 "vst1.32 d0[0], [%[d]] \n\t"
@@ -122,7 +138,7 @@ _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA3
122 : // output regs 138 : // output regs
123 // Input 139 // Input
124 : [e] "r" (e = d + l), [d] "r" (d), [c] "r" (c), [tmp] "r" (tmp) 140 : [e] "r" (e = d + l), [d] "r" (d), [c] "r" (c), [tmp] "r" (tmp)
125 : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","memory" // clobbered 141 : "q0", "q1", "q2","q3", "q4","q5","q6", "q7", "q10", "q11", "q12", "q13", "memory" // clobbered
126 142
127 ); 143 );
128#undef AP 144#undef AP