author     Carsten Haitzler (Rasterman) <raster@rasterman.com>   2013-11-15 19:16:03 +0900
committer  Carsten Haitzler (Rasterman) <raster@rasterman.com>   2013-11-15 19:17:01 +0900
commit     deec62c9b66c45705918bf659ce1d2107dbc6831
tree       5f0e60fd97b768951b29b7b575704d71514fc359 /src/lib/evas/common
parent     0e4b25747f8b86e2a923d078c6124fbeb9590c11
evas - fix neon blend code used for text rendering to not leave dirty end
Diffstat (limited to 'src/lib/evas/common')
 -rw-r--r--  src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c | 279
 1 file changed, 105 insertions(+), 174 deletions(-)
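
For orientation, the routine rewritten below blends a color `c`, modulated by an 8-bit mask value `*m` (glyph coverage from text rendering), onto each destination pixel `*d`. The following is a minimal scalar sketch of that operation, assuming Evas's premultiplied-ARGB pixel layout; `mul_sym`, `mul_256`, and `blend_mas_c_dp_ref` are illustrative names, not the actual Evas macros or functions, and the unused `s` parameter of the real function is dropped:

```c
#include <stdint.h>

typedef uint32_t DATA32;   /* premultiplied ARGB pixel (assumed layout) */
typedef uint8_t  DATA8;    /* 8-bit mask / glyph coverage value */

/* Multiply every channel of 'c' by 'a' in [0..255], with the usual
 * (x * a + 255) >> 8 rounding; two channels per 32-bit lane. */
static DATA32
mul_sym(DATA32 a, DATA32 c)
{
   return (((((c >> 8) & 0x00ff00ff) * a + 0x00ff00ff) & 0xff00ff00) +
           ((((c & 0x00ff00ff) * a + 0x00ff00ff) >> 8) & 0x00ff00ff));
}

/* Multiply every channel of 'c' by 'a' in [1..256] (exact at 256). */
static DATA32
mul_256(DATA32 a, DATA32 c)
{
   return ((((c >> 8) & 0x00ff00ff) * a) & 0xff00ff00) +
          ((((c & 0x00ff00ff) * a) >> 8) & 0x00ff00ff);
}

/* Scalar equivalent of the NEON routine: scale the color by the mask,
 * then composite the result over *d using its negated alpha. */
static void
blend_mas_c_dp_ref(const DATA8 *m, DATA32 c, DATA32 *d, int l)
{
   while (l--)
     {
        DATA32 mc = mul_sym(*m, c);      /* a * c                  */
        DATA32 a  = 256 - (mc >> 24);    /* negated source alpha   */
        *d = mc + mul_256(a, *d);        /* src + (1 - a) * dst    */
        m++; d++;
     }
}
```

The patched NEON code performs this same per-channel arithmetic in 16-bit vector lanes, two pixels per iteration.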
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
index da7cd3e24d..252f276ba8 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
@@ -19,180 +19,111 @@
 #ifdef BUILD_NEON
 static void
 _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
-   DATA32 *e;
-
-   DEBUG_FNCOUNT("");
-
-#define AP "blend_mas_c_dp_"
-   asm volatile (
-      ".fpu neon \n\t"
-      " vdup.i32 q15, %[c] \n\t"
-      " vmov.i8 q14, #1 \n\t"
-
-      // If aligned already - straight to quads
-      " andS %[tmp], %[d],$0xf \n\t"
-      " beq "AP"quadloops \n\t"
-
-      " andS %[tmp], %[d],$0x4 \n\t"
-      " beq "AP"dualloop \n\t"
-
-   AP"singleloop: \n\t"
-      " vld1.8 d0[0], [%[m]]! \n\t"
-      " vld1.32 d4[0], [%[d]] \n\t"
-      " vdup.u8 d0, d0[0] \n\t"
-      " vmull.u8 q4, d0, d30 \n\t"
-      " vqrshrn.u16 d12, q4, #8 \n\t"
-      " vmvn.u16 d14, d12 \n\t"
-      " vshr.u32 d16, d14, #24 \n\t"
-      " vmul.u32 d16, d16, d28 \n\t"
-      " vmull.u8 q7, d16, d4 \n\t"
-      " vqrshrn.u16 d0, q7, #8 \n\t"
-      " vqadd.u8 d0, d0, d12 \n\t"
-      " vst1.32 d0[0], [%[d]]! \n\t"
-
-      // Can we go the fast path?
-      " andS %[tmp], %[d],$0xf \n\t"
-      " beq "AP"quadloops \n\t"
-
-   AP"dualloop: \n\t"
-      " sub %[tmp], %[e], %[d] \n\t"
-      " cmp %[tmp], #16 \n\t"
-      " blt "AP"loopout \n\t"
-
-      " vld1.16 d0[0], [%[m]]! \n\t"
-      " vldm %[d], {d4} \n\t"
-      " vmovl.u8 q0, d0 \n\t"
-      " vmovl.u8 q0, d0 \n\t"
-      " vmul.u32 q0, q14 \n\t"
-      " vmull.u8 q4, d0, d30 \n\t"
-      " vqrshrn.u16 d12, q4, #8 \n\t"
-      " vmvn.u16 d14, d12 \n\t"
-      " vshr.u32 d16, d14, #24 \n\t"
-      " vmul.u32 d16, d16, d28 \n\t"
-      " vmull.u8 q7, d16, d4 \n\t"
-      " vqrshrn.u16 d0, q7, #8 \n\t"
-      " vqadd.u8 q0, q0, q6 \n\t"
-      " vstm %[d]!, {d0} \n\t"
-
-   AP"quadloops: \n\t"
-      " sub %[tmp], %[e], %[d] \n\t"
-      " cmp %[tmp], #16 \n\t"
-      " blt "AP"loopout \n\t"
-
-
-      " sub %[tmp], %[e], #15 \n\t"
-
-      " sub %[d], #16 \n\t"
-   AP"fastloop:"
-      " add %[d], #16 \n\t"
-      " cmp %[tmp], %[d] \n\t"
-      " ble "AP"loopout \n\t"
-   AP"quadloopint: \n\t"
-      " ldr %[x], [%[m]] \n\t"
-      " add %[m], #4 \n\t"
-      " cmp %[x], #0 \n\t"
-      " beq "AP"fastloop \n\t"
-      " vmov.32 d0[0], %[x] \n\t"
-      " vldm %[d], {d4,d5} \n\t"
-
-      // Expand M: Fixme: Can we do this quicker?
-      " vmovl.u8 q0, d0 \n\t"
-      " vmovl.u8 q0, d0 \n\t"
-      " vmul.u32 q0, q14 \n\t"
-
-      // Multiply a * c
-      " vmull.u8 q4, d0, d30 \n\t"
-      " vmull.u8 q5, d1, d31 \n\t"
-
-      // Shorten
-      " vqrshrn.u16 d12, q4, #8 \n\t"
-      " vqrshrn.u16 d13, q5, #8 \n\t"
-
-      // extract negated alpha
-      " vmvn.u16 q7, q6 \n\t"
-      " vshr.u32 q8, q7, #24 \n\t"
-      " vmul.u32 q8, q8, q14 \n\t"
-
-      // Multiply
-      " vmull.u8 q7, d16, d4 \n\t"
-      " vmull.u8 q8, d17, d5 \n\t"
-
-      " vqrshrn.u16 d0, q7, #8 \n\t"
-      " vqrshrn.u16 d1, q8, #8 \n\t"
-
-      // Add
-      " vqadd.u8 q0, q0, q6 \n\t"
-
-      " vstm %[d]!, {d0,d1} \n\t"
-
-      " cmp %[tmp], %[d] \n\t"
-      " bhi "AP"quadloopint \n\t"
-
-   AP"loopout: \n\t"
-#if NEONDEBUG
-      "cmp %[d], %[e] \n\t"
-      "ble "AP"foo \n\t"
-      "cmp %[tmp], %[m] \n\t"
-      "sub %[x], %[x] \n\t"
-      "vst1.32 d0[0], [%[x]] \n\t"
-   AP"foo: \n\t"
-#endif
-
-      " cmp %[d], %[e] \n\t"
-      " beq "AP"done \n\t"
-      " sub %[tmp],%[e], %[d] \n\t"
-      " cmp %[tmp],#4 \n\t"
-      " beq "AP"singleout \n\t"
-
-   AP "dualloop2: \n\t"
-      "sub %[tmp],%[e],$0x8 \n\t"
-      " vld1.16 d0[0], [%[m]]! \n\t"
-      " vldm %[d], {d4} \n\t"
-      " vmovl.u8 q0, d0 \n\t"
-      " vmovl.u8 q0, d0 \n\t"
-      " vmul.u32 q0, q14 \n\t"
-      " vmull.u8 q4, d0, d30 \n\t"
-      " vqrshrn.u16 d12, q4, #8 \n\t"
-      " vmvn.u16 d14, d12 \n\t"
-      " vshr.u32 d16, d14, #24 \n\t"
-      " vmul.u32 d16, d16, d28 \n\t"
-      " vmull.u8 q7, d16, d4 \n\t"
-      " vqrshrn.u16 d0, q7, #8 \n\t"
-      " vqadd.u8 q0, q0, q6 \n\t"
-      " vstm %[d]!, {d0} \n\t"
-
-      " cmp %[e], %[d] \n\t"
-      " beq "AP"done \n\t"
-
-   AP"singleout: \n\t"
-      " vld1.8 d0[0], [%[m]]! \n\t"
-      " vld1.32 d4[0], [%[d]] \n\t"
-      " vdup.u8 d0, d0[0] \n\t"
-      " vmull.u8 q4, d0, d30 \n\t"
-      " vqrshrn.u16 d12, q4, #8 \n\t"
-      " vmvn.u16 d14, d12 \n\t"
-      " vshr.u32 d16, d14, #24 \n\t"
-      " vmul.u32 d16, d16, d28 \n\t"
-      " vmull.u8 q7, d16, d4 \n\t"
-      " vqrshrn.u16 d0, q7, #8 \n\t"
-      " vqadd.u8 q0, q0, q6 \n\t"
-      " vst1.32 d0[0], [%[d]]! \n\t"
-
-   AP"done: \n\t"
-#if NEONDEBUG
-      "cmp %[d], %[e] \n\t"
-      "beq "AP"reallydone \n\t"
-      "sub %[tmp], %[tmp] \n\t"
-      "vst1.32 d0[0], [%[tmp]] \n\t"
-   AP"reallydone:"
-#endif
-      : // Out
-      : [e] "r" (d + l), [d] "r" (d), [c] "r" (c),
-        [tmp] "r" (7), [m] "r" (m), [x] "r" (0)
-      : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q8","q14","q15",
-        "memory" // clobbered
-   );
-#undef AP
+   // main loop process data in pairs, so we need count to be even
+   DATA32 *e = d + l - (l % 2);
+
+   // everything we can do only once per cycle
+   // loading of 'c', initialization of some registers
+   __asm__ __volatile__
+   (
+      ".fpu neon \n\t"
+      " vmov.32 d30[0], %[c] \n\t"
+      " vmov.i16 q10, #255 \n\t"
+      " vmov.i16 q11, #256 \n\t"
+      " veor d29, d29, d29 \n\t"
+      " vzip.8 d30, d29 \n\t"
+      " vmov d31, d30 \n\t"
+      :
+      : [c] "r" (c)
+      : "q10", "q11", "q15", "d29"
+   );
+   while (d < e)
+     {
+        // main cycle
+        __asm__ __volatile__
+        (
+           // load pair '*d' and '*(d+1)' into vector register
+           " vldm %[d], {d4} \n\t"
+
+           // load '*m' and '*(m+1)'
+           " veor q0, q0, q0 \n\t"
+           " vld1.8 d0[0], [%[m]]! \n\t"
+           " vld1.8 d1[0], [%[m]]! \n\t"
+
+           // spread values from d in vector registers so for each
+           // 8 bit channel data we have 8 bit of zeros
+           // so each 32bit value occupies now one 64 bit register
+           " veor d5, d5, d5 \n\t"
+           " vzip.8 d4, d5 \n\t"
+
+           // copy *m values in corresponding registers
+           " vdup.u16 d0, d0[0] \n\t"
+           " vdup.u16 d1, d1[0] \n\t"
+
+           // multiply a * c
+           " vmul.u16 q13, q0, q15 \n\t"
+           " vadd.i16 q13, q13, q10 \n\t"
+           " vsri.16 q13, q13, #8 \n\t"
+           " vand q13, q13, q10 \n\t"
+
+           // extract negated alpha
+           " vdup.u16 d24, d26[3] \n\t"
+           " vdup.u16 d25, d27[3] \n\t"
+           " vsub.i16 q12, q11, q12 \n\t"
+
+           // multiply alpha * (*d) and add a*c
+           " vmul.u16 q2, q2, q12 \n\t"
+           " vsri.16 q2, q2, #8 \n\t"
+           " vand q2, q2, q10 \n\t"
+           " vadd.i16 q2, q2, q13 \n\t"
+           " vand q2, q2, q10 \n\t"
+
+           // save results
+           " vqmovn.u16 d4, q2 \n\t"
+           " vstm %[d]!, {d4} \n\t"
+           : [d] "+r" (d), [m] "+r" (m)
+           : [c] "r" (c)
+           : "q0", "q2", "q15", "q13", "q12", "q11", "q10",
+             "memory"
+        );
+     }
+   if (l % 2)
+     {
+        // do analogue of main loop for last element, if needed
+        __asm__ __volatile__
+        (
+           " vld1.32 d4[0], [%[d]] \n\t"
+
+           " veor d0, d0, d0 \n\t"
+           " vld1.8 d0[0], [%[m]]! \n\t"
+
+           " veor d5, d5, d5 \n\t"
+           " vzip.8 d4, d5 \n\t"
+
+           " vdup.u16 d0, d0[0] \n\t"
+
+           " vmul.u16 d26, d0, d30 \n\t"
+           " vadd.i16 d26, d26, d20 \n\t"
+           " vsri.16 d26, d26, #8 \n\t"
+           " vand d26, d26, d20 \n\t"
+
+           " vdup.u16 d24, d26[3] \n\t"
+
+           " vsub.i16 d24, d22, d24 \n\t"
+           " vmul.u16 d4, d4, d24 \n\t"
+           " vsri.16 d4, d4, #8 \n\t"
+           " vand d4, d4, d20 \n\t"
+           " vadd.i16 d4, d4, d26 \n\t"
+           " vand d4, d4, d20 \n\t"
+
+           " vqmovn.u16 d4, q2 \n\t"
+           " vst1.32 {d4[0]}, [%[d]]! \n\t"
+           : [d] "+r" (d), [m] "+r" (m)
+           : [c] "r" (c)
+           : "q0", "q2", "q15", "q13", "q12", "q11", "q10",
+             "memory"
+        );
+     }
 }
 #endif
 
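The structural point of the fix is visible in the C scaffolding: the old alignment-driven single/dual/quad paths, whose tail handling could leave the end of the span dirty, are replaced by a pair-at-a-time main loop plus an explicit one-pixel tail, so exactly `l` pixels are always written. A control-flow sketch, reusing the scalar reference helper from above in place of the two inline-asm kernels:

```c
/* Sketch of the rewritten control flow; blend_mas_c_dp_ref (defined in
 * the scalar sketch above) stands in for the NEON pair kernel and the
 * single-pixel tail kernel. */
static void
blend_span_sketch(const DATA8 *m, DATA32 c, DATA32 *d, int l)
{
   /* the main loop processes pixels in pairs, so round the end down */
   DATA32 *e = d + l - (l % 2);

   while (d < e)
     {
        blend_mas_c_dp_ref(m, c, d, 2);   /* one NEON iteration = 2 pixels */
        m += 2;
        d += 2;
     }

   if (l % 2)
     blend_mas_c_dp_ref(m, c, d, 1);      /* explicit one-pixel tail */
}
```

Hoisting the constant setup (loading `c`, the #255/#256 lane constants) into a separate asm block before the loop mirrors the patch's "everything we can do only once" comment: only the per-pixel work stays inside the loop body.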