summaryrefslogtreecommitdiff
path: root/src/lib/evas/common
diff options
context:
space:
mode:
authorChunEon Park <chuneon.park@samsung.com>2015-04-03 19:38:33 +0900
committerChunEon Park <chuneon.park@samsung.com>2015-04-03 19:48:30 +0900
commit2b0fb1ea1d09ca27e73c770f30d9ff8c8e964f0c (patch)
tree7fcb551748a1efd49e8da1cd50df84760929be28 /src/lib/evas/common
parent2748fae3f45170010d2095beee12b656760a1790 (diff)
evas/common Fixed incorrect blend pixel color logic in neon.
Previously, the blending computation had residual-value issues, so the resulting blended color was incorrect. Signed-Off-By: Vladimir Kuramshin <v.kuramshin@samsung.com>
Diffstat (limited to 'src/lib/evas/common')
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c274
1 file changed, 81 insertions, 193 deletions
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c
index c8fa546..d6b3a73 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c
@@ -8,202 +8,90 @@
8static void 8static void
9_op_blend_p_c_dp_neon(DATA32 * __restrict s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 * __restrict d, int l) { 9_op_blend_p_c_dp_neon(DATA32 * __restrict s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 * __restrict d, int l) {
10 10
11/* Current this neon code is a little buggy, color blending won't be done
12 correctly. So leave the code depend on the compilier optimization. */
13#if 1
14 int i;
15 int alpha;
16
17 for (i = 0; i < l; i++)
18 {
19 DATA32 sc = MUL4_SYM(c, s[i]);
20 alpha = 256 - (sc >> 24);
21 d[i] = sc + MUL_256(alpha, d[i]);
22 }
23#else
24#define AP "blend_p_c_dp_" 11#define AP "blend_p_c_dp_"
25 asm volatile ( 12 asm volatile (
26 ".fpu neon \n\t" 13 ".fpu neon\n\t"
27 // Load 'c' 14 "vdup.u32 d0, %[c]\n\t" // Load 'c'
28 "vdup.u32 q7, %[c] \n\t" 15 "vmov.u16 q1, $0x00ff\n\t" // round_mask
29 "vmov.i8 q6, #1 \n\t" 16 "vmov.u8 q2, #0\n\t" // zero register
30 17 "sub %[tmp], %[e], #16\n\t"
31 // Choose a loop 18 "cmp %[d], %[tmp]\n\t"
32 "andS %[tmp], %[d], $0xf \n\t" 19 "bhi "AP"skipquad\n\t"
33 "beq "AP"quadstart \n\t" 20 AP"quadloop:"
34 21 "vld1.32 {d6, d7}, [%[s]]!\n\t" // Load 's'
35 "andS %[tmp],%[d], $0x4 \n\t" 22 "vld1.32 {d8, d9}, [%[d]]\n\t" // Load 'd'
36 "beq "AP"dualloop \n\t" 23 "vmull.u8 q5, d6, d0\n\t" // s * c
37 24 "vmull.u8 q6, d7, d0\n\t"
38 AP"singleloop:" 25 "vadd.u16 q5, q5, q1\n\t" // rounding
39 "vld1.32 d0[0], [%[s]]! \n\t" 26 "vadd.u16 q6, q6, q1\n\t"
40 "vld1.32 d2[0], [%[d]] \n\t" 27 "vshrn.u16 d10, q5, #8\n\t" // narrowing
41 // Mulitply s * c (= sc) 28 "vshrn.u16 d11, q6, #8\n\t" // sc in q5
42 "vmull.u8 q4, d0,d14 \n\t" 29 "vsub.u8 q6, q2, q5\n\t"
43 // sc in d8 30 "vmov q7, q6\n\t"
44 "vqrshrn.u16 d4, q4, #8 \n\t" 31 "vtrn.u8 q7, q6\n\t"
45 32 "vmov q7, q6\n\t"
46 // sca in d9 33 "vtrn.u16 q7, q6\n\t" // q6 - alpha
47 "vmvn.u32 d6, d4 \n\t" 34 "vmull.u8 q7, d8, d12\n\t"
48 "vshr.u32 d6, d6, #24 \n\t" 35 "vmull.u8 q8, d9, d13\n\t"
49 36 "vshrn.u16 d14, q7, #8\n\t"
50 "vmul.u32 d6, d12, d6 \n\t" 37 "vshrn.u16 d15, q8, #8\n\t" // q7 - d * alpha
51 38 "vceq.i32 q6, q6, #0\n\t" // if alpha = 0x100
52 /* d * alpha */ 39 "vbsl q6, q4, q7\n\t" // just copy d[i]
53 "vmull.u8 q4, d6, d2 \n\t" 40 "vadd.u32 q4, q5, q6\n\t"
54 "vqrshrn.u16 d0, q4, #8 \n\t" 41 "vst1.u32 {d8, d9}, [%[d]]!\n\t"
55 42 "cmp %[d], %[tmp]\n\t"
56 "vqadd.u8 d2, d0, d4 \n\t" 43 "bls "AP"quadloop\n\t"
57 44 AP"skipquad:"
58 // Save dsc + sc 45 "sub %[tmp], %[e], #8\n\t"
59 "vst1.32 d2[0], [%[d]]! \n\t" 46 "cmp %[d], %[tmp]\n\t"
60 47 "bhi "AP"skipdouble\n\t"
61 // Now where? 48 AP"doubleloop:"
62 // Can we go the fast path? 49 "vld1.32 d6, [%[s]]!\n\t"
63 "andS %[tmp], %[d],$0xf \n\t" 50 "vld1.32 d7, [%[d]]\n\t"
64 "beq "AP"quadstart \n\t" 51 "vmull.u8 q4, d6, d0\n\t"
65 52 "vadd.u16 q4, q4, q1\n\t"
66 AP"dualloop: \n\t" 53 "vshrn.u16 d8, q4, #8\n\t"
67 // Check we have enough to bother with! 54 "vsub.u8 d9, d4, d8\n\t"
68 "sub %[tmp], %[e], %[d] \n\t" 55 "vmov d10, d9\n\t"
69 "cmp %[tmp], #16 \n\t" 56 "vtrn.u8 d10, d9\n\t"
70 "blt "AP"loopout \n\t" 57 "vmov d10, d9\n\t"
71 58 "vtrn.u16 d10, d9\n\t" // d9 - alpha
72 // load 's' -> q0, 'd' -> q1 59 "vmull.u8 q5, d7, d9\n\t"
73 "vldm %[s]!, {d0} \n\t" 60 "vshrn.u16 d1, q5, #8\n\t"
74 "vldm %[d], {d2} \n\t" 61 "vceq.i32 d9, d9, #0\n\t"
75 // Mulitply s * c (= sc) 62 "vbsl d9, d7, d1\n\t" // d7 - d[i], d1 - d[i] * alpha
76 "vmull.u8 q4, d0,d14 \n\t" 63 "vadd.u32 d7, d8, d9\n\t"
77 // sc in d8 64 "vst1.u32 d7, [%[d]]!\n\t"
78 "vqrshrn.u16 d4, q4, #8 \n\t" 65 "cmp %[d], %[tmp]\n\t"
79 66 "bls "AP"doubleloop\n\t"
80 // sca in d9 67 AP"skipdouble:"
81 "vmvn.u32 d6, d4 \n\t" 68 "cmp %[d], %[e]\n\t"
82 "vshr.u32 d6, d6, #24 \n\t" 69 "beq "AP"done\n\t"
83 70 AP"singleloop:"
84 "vmul.u32 d6, d12, d6 \n\t" 71 "vld1.32 d6[0], [%[s]]!\n\t"
85 72 "vld1.32 d7[0], [%[d]]\n\t"
86 /* d * alpha */ 73 "vmull.u8 q4, d6, d0\n\t"
87 "vmull.u8 q4, d6, d2 \n\t" 74 "vadd.u16 q4, q4, q1\n\t"
88 "vqrshrn.u16 d0, q4, #8 \n\t" 75 "vshrn.u16 d8, q4, #8\n\t"
89 76 "vsub.u8 d9, d4, d8\n\t"
90 "vqadd.u8 d2, d0, d4 \n\t" 77 "vmov d10, d9\n\t"
91 78 "vtrn.u8 d10, d9\n\t"
92 // Save dsc + sc 79 "vmov d10, d9\n\t"
93 "vst1.32 d2, [%[d]]! \n\t" 80 "vtrn.u16 d10, d9\n\t" // d9 - alpha
94 81 "vmull.u8 q5, d7, d9\n\t"
95 AP"quadstart: \n\t" 82 "vshrn.u16 d1, q5, #8\n\t"
96 "sub %[tmp], %[e], %[d] \n\t" 83 "vceq.i32 d9, d9, #0\n\t"
97 "cmp %[tmp], #16 \n\t" 84 "vbsl d9, d7, d1\n\t" // d7 - d[i], d1 - d[i] * alpha
98 "blt "AP"loopout \n\t" 85 "vadd.u32 d7, d8, d9\n\t"
99 86 "vst1.u32 d7[0], [%[d]]!\n\t"
100 "sub %[tmp], %[e], #15 \n\t" 87 "cmp %[d], %[e]\n\t"
101 88 "blt "AP"singleloop\n\t"
102 AP"quadloop:\n\t" 89 AP"done:"
103 // load 's' -> q0, 'd' -> q1 90 : // No output
104 "vldm %[s]!, {d0,d1} \n\t" 91 : [s] "r" (s), [d] "r" (d), [c] "r" (c), [e] "r" (d + l), [tmp] "r" (12)
105 "vldm %[d], {d2,d3} \n\t" 92 : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "memory"
106 // Mulitply s * c (= sc) 93 );
107 "vmull.u8 q4, d0,d14 \n\t"
108 "vmull.u8 q5, d1,d14 \n\t"
109
110 // Get sc & sc alpha
111 "vqrshrn.u16 d4, q4, #8 \n\t"
112 "vqrshrn.u16 d5, q5, #8 \n\t"
113 // sc is now in q2, 8bpp
114 // Shift out, then spread alpha for q2
115 "vmvn.u32 q3, q2 \n\t"
116 "vshr.u32 q3, q3, $0x18 \n\t"
117 "vmul.u32 q3, q6,q3 \n\t"
118
119 // Multiply 'd' by sc.alpha (dsca)
120 "vmull.u8 q4, d6,d2 \n\t"
121 "vmull.u8 q5, d7,d3 \n\t"
122
123 "vqrshrn.u16 d0, q4, #8 \n\t"
124 "vqrshrn.u16 d1, q5, #8 \n\t"
125
126 "vqadd.u8 q1, q0, q2 \n\t"
127
128 // Save dsc + sc
129 "vstm %[d]!, {d2,d3} \n\t"
130
131 "cmp %[tmp], %[d] \n\t"
132
133 "bhi "AP"quadloop \n\t"
134
135 /* Trailing stuff */
136 AP"loopout: \n\t"
137
138 "cmp %[d], %[e] \n\t"
139 "beq "AP"done\n\t"
140 "sub %[tmp],%[e], %[d] \n\t"
141 "cmp %[tmp],$0x04 \n\t"
142 "beq "AP"singleloop2 \n\t"
143
144 "sub %[tmp], %[e], #7 \n\t"
145 /* Dual loop */
146 AP"dualloop2: \n\t"
147 "vldm %[s]!, {d0} \n\t"
148 "vldm %[d], {d2} \n\t"
149 // Mulitply s * c (= sc)
150 "vmull.u8 q4, d0,d14 \n\t"
151 // sc in d8
152 "vqrshrn.u16 d4, q4, #8 \n\t"
153
154 // sca in d9
155 // XXX: I can probably squash one of these 3
156 "vmvn.u32 d6, d4 \n\t"
157 "vshr.u32 d6, d6, #24 \n\t"
158 "vmul.u32 d6, d6, d12 \n\t"
159
160 /* d * alpha */
161 "vmull.u8 q4, d6, d2 \n\t"
162 "vqrshrn.u16 d0, q4, #8 \n\t"
163
164 "vqadd.u8 d2, d0, d4 \n\t"
165
166 // Save dsc + sc
167 "vstm %[d]!, {d2} \n\t"
168
169 "cmp %[tmp], %[d] \n\t"
170 "bhi "AP"dualloop2 \n\t"
171
172 "cmp %[d], %[e] \n\t"
173 "beq "AP"done \n\t"
174
175 AP"singleloop2: \n\t"
176 "vld1.32 d0[0], [%[s]]! \n\t"
177 "vld1.32 d2[0], [%[d]] \n\t"
178 // Mulitply s * c (= sc)
179 "vmull.u8 q4, d0,d14 \n\t"
180 // sc in d8
181 "vqrshrn.u16 d4, q4, #8 \n\t"
182
183 // sca in d6
184 "vmvn.u32 d6, d4 \n\t"
185 "vshr.u32 d6, d6, #24 \n\t"
186 "vmul.u32 d6, d12,d6 \n\t"
187
188 /* d * alpha */
189 "vmull.u8 q4, d6, d2 \n\t"
190 "vqrshrn.u16 d0, q4, #8 \n\t"
191
192 "vqadd.u8 d2, d0, d4 \n\t"
193
194 // Save dsc + sc
195 "vst1.32 d2[0], [%[d]]! \n\t"
196
197
198 AP"done:"
199 : // No output
200 //
201 : [s] "r" (s), [e] "r" (d + l), [d] "r" (d), [c] "r" (c),
202 [tmp] "r" (12)
203 : "q0","q1","q2","q3","q4","q5","q6","q7","memory"
204 );
205#undef AP 94#undef AP
206#endif
207} 95}
208 96
209static void 97static void