author    Carsten Haitzler (Rasterman) <raster@rasterman.com>  2014-12-17 15:28:50 +0900
committer Carsten Haitzler (Rasterman) <raster@rasterman.com>  2014-12-17 15:28:50 +0900
commit    afb73157227b9625ac768a4bfd606383baf35218 (patch)
tree      354ac3c292a6a2bb8956a253d34347e3430c6662 /src/lib/evas/common/evas_map_image_loop.c
parent    c280e2f71172273ab573a6965f294b53e5826ee3 (diff)
Use NEON intrinsics for mapping instead of inline asm
Summary: Rewrite the inline assembly in the mapping functions using NEON intrinsics.

Reviewers: raster

Differential Revision: https://phab.enlightenment.org/D1740
Diffstat (limited to '')
-rw-r--r--  src/lib/evas/common/evas_map_image_loop.c | 328
1 file changed, 216 insertions(+), 112 deletions(-)
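For orientation before the diff: both the old assembly and the new intrinsics
implement the same per-channel fixed-point arithmetic on packed 0xAARRGGBB
pixels, an 8-bit blend plus a "symmetric" multiply. A minimal scalar model of
the two operations (illustrative helpers, not the actual EFL INTERP_256 and
MUL4_SYM macros, which process two channels per 32-bit operation):

#include <stdint.h>

/* Blend one channel toward c2 by a/256, for a in 0..255. The final & 0xff
 * also cancels the unsigned wraparound when c2 < c1, which is why the NEON
 * code below ends each blend with a vand against 0xff. */
static inline uint8_t
blend_channel(uint8_t c1, uint8_t c2, uint8_t a)
{
   uint16_t d = (uint16_t)(c2 - c1);   /* wraps mod 2^16 when c2 < c1 */
   return (uint8_t)((((d * a) >> 8) + c1) & 0xff);
}

/* Symmetric multiply: (x*y + 255) >> 8, so that 255*255 -> 255. */
static inline uint8_t
mul_sym_channel(uint8_t x, uint8_t y)
{
   return (uint8_t)(((uint16_t)x * y + 0xff) >> 8);
}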
diff --git a/src/lib/evas/common/evas_map_image_loop.c b/src/lib/evas/common/evas_map_image_loop.c
index fbc8459a73..2581b6d7c3 100644
--- a/src/lib/evas/common/evas_map_image_loop.c
+++ b/src/lib/evas/common/evas_map_image_loop.c
@@ -9,24 +9,63 @@
 # endif //SCALE_USING_MMX
 
 # ifdef SCALE_USING_NEON
-   FPU_NEON;
-   VMOV_I2R_NEON(q2, #255);
-#  ifdef COLMUL
-#   ifndef COLBLACK
-   // this part can be done here as c1 and c2 are constants in the cycle
-   FPU_NEON;
-   VMOV_M2R_NEON(d18, c1);
-   VEOR_NEON(q8);
-#    ifndef COLSAME
-   VMOV_M2R_NEON(d19, c2);
-#    endif //COLSAME
-   VZIP_NEON(q9, q8);
-#    ifndef COLSAME
-   VMOV_R2R_NEON(d19, d16);
+#  ifndef COLBLACK
+   uint16x4_t temp_16x4;
+   uint16x4_t rv_16x4;
+   uint16x4_t val1_16x4;
+   uint16x4_t val3_16x4;
+   uint16x8_t ru_16x8;
+   uint16x8_t val1_val3_16x8;
+   uint16x8_t val2_val4_16x8;
+   uint16x8_t x255_16x8;
+   uint32x2_t res_32x2;
+   uint32x2_t val1_val3_32x2;
+   uint32x2_t val2_val4_32x2;
+   uint8x8_t val1_val3_8x8;
+   uint8x8_t val2_val4_8x8;
+
+   x255_16x8 = vdupq_n_u16(0xff);
+#   ifdef COLMUL
+   uint16x4_t x255_16x4;
+   x255_16x4 = vget_low_u16(x255_16x8);
+   uint16x4_t c1_16x4;
+#    ifdef COLSAME
+   uint16x4_t c1_val3_16x4;
+   uint16x8_t c1_16x8;
+   uint16x8_t c1_val3_16x8;
+   uint32x2_t c1_32x2;
+   uint8x8_t c1_8x8;
+   uint8x8_t c1_val3_8x8;
+
+   c1_32x2 = vset_lane_u32(c1, c1_32x2, 0);
+   c1_8x8 = vreinterpret_u8_u32(c1_32x2);
+   c1_16x8 = vmovl_u8(c1_8x8);
+   c1_16x4 = vget_low_u16(c1_16x8);
+#    else //COLSAME
+   uint16x4_t c2_16x4;
+   uint16x4_t c2_local_16x4;
+   uint16x4_t cv_16x4;
+   uint16x8_t c1_c2_16x8;
+   uint16x8_t c1_val1_16x8;
+   uint16x8_t c2_val3_16x8;
+   uint16x8_t cv_rv_16x8;
+   uint32x2_t c1_c2_32x2;
+   uint8x8_t c1_c2_8x8;
+   uint8x8_t val3_8x8;
+   uint16x8_t val3_16x8;
+
+   c1_c2_32x2 = vset_lane_u32(c1, c1_c2_32x2, 0);
+   c1_c2_32x2 = vset_lane_u32(c2, c1_c2_32x2, 1);
+   c1_c2_8x8 = vreinterpret_u8_u32(c1_c2_32x2);
+   c1_c2_16x8 = vmovl_u8(c1_c2_8x8);
+   c1_16x4 = vget_low_u16(c1_c2_16x8);
+   c2_16x4 = vget_high_u16(c1_c2_16x8);
 #    endif //COLSAME
-   // here we have c1 and c2 spread through q9 register
-#   endif //COLBLACK
-#  endif //COLMUL
+#   else //COLMUL
+   uint8x8_t val3_8x8;
+   uint16x8_t val3_16x8;
+#   endif //COLMUL
+#  endif //COLBLACK
 # endif //SCALE_USING_NEON
 
    while (ww > 0)
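The hunk above replaces the register preloads (VMOV/VEOR/VZIP into q8/q9)
with loop-invariant setup: the packed multiply colors c1 (and c2) are widened
to one 16-bit lane per channel before the loop starts. A self-contained
sketch of that widening pattern, assuming <arm_neon.h> (hypothetical helper,
not the EFL code; unlike the patch, which calls vset_lane_u32 on an
uninitialized vector and relies on writing both lanes, the sketch
zero-initializes first):

#include <arm_neon.h>
#include <stdint.h>

static inline void
widen_two_pixels(uint32_t c1, uint32_t c2,
                 uint16x4_t *c1_16x4, uint16x4_t *c2_16x4)
{
   uint32x2_t pair = vdup_n_u32(0);
   pair = vset_lane_u32(c1, pair, 0);            /* lane 0 = c1 */
   pair = vset_lane_u32(c2, pair, 1);            /* lane 1 = c2 */

   uint8x8_t  bytes = vreinterpret_u8_u32(pair); /* eight 8-bit channels */
   uint16x8_t wide  = vmovl_u8(bytes);           /* zero-extend to 16 bits */

   *c1_16x4 = vget_low_u16(wide);                /* four channels of c1 */
   *c2_16x4 = vget_high_u16(wide);               /* four channels of c2 */
}

Widening to 16 bits gives each channel the headroom the later >> 8
fixed-point steps need, which is what the VZIP interleaving achieved in the
assembly version.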
@@ -99,54 +138,83 @@
 #  endif //COLMUL
         MOV_R2P(mm1, *d, mm0);
 # elif defined SCALE_USING_NEON
-        // not sure if we need this condition, but it doesn't affect the result
         if (val1 | val2 | val3 | val4)
           {
-             FPU_NEON;
-#  ifdef COLMUL
-             // initialize alpha for interpolation of c1 and c2
-             VDUP_NEON(d15, cv >> 16);
-             // copy c1 and c2 as algorithm will overwrite it
-             VMOV_R2R_NEON(q6, q9);
-             cv += cd; // col
-#  endif //COLMUL
-             VMOV_M2R_NEON(d8, val1);
-             VEOR_NEON(q0);
-             VMOV_M2R_NEON(d9, val3);
-             VMOV_M2R_NEON(d10, val2);
-             VEOR_NEON(q1);
-             VMOV_M2R_NEON(d11, val4);
-             VDUP_NEON(q3, ru);
-             VDUP_NEON(d14, rv);
-             VZIP_NEON(q4, q0);
-             VZIP_NEON(q5, q1);
-             VMOV_R2R_NEON(d9, d0);
-             VMOV_R2R_NEON(d11, d2);
-             // by this point we have all required data in right registers
-             // interpolate val1,val2 and val3,val4
-             INTERP_256_NEON(q3, q5, q4, q2);
+             rv_16x4 = vdup_n_u16(rv);
+             ru_16x8 = vdupq_n_u16(ru);
+
+             val1_val3_32x2 = vset_lane_u32(val1, val1_val3_32x2, 0);
+             val1_val3_32x2 = vset_lane_u32(val3, val1_val3_32x2, 1);
+             val2_val4_32x2 = vset_lane_u32(val2, val2_val4_32x2, 0);
+             val2_val4_32x2 = vset_lane_u32(val4, val2_val4_32x2, 1);
+
+             val1_val3_8x8 = vreinterpret_u8_u32(val1_val3_32x2);
+             val2_val4_8x8 = vreinterpret_u8_u32(val2_val4_32x2);
+
+             val2_val4_16x8 = vmovl_u8(val2_val4_8x8);
+             val1_val3_16x8 = vmovl_u8(val1_val3_8x8);
+
+             val2_val4_16x8 = vsubq_u16(val2_val4_16x8, val1_val3_16x8);
+             val2_val4_16x8 = vmulq_u16(val2_val4_16x8, ru_16x8);
+             val2_val4_16x8 = vshrq_n_u16(val2_val4_16x8, 8);
+             val2_val4_16x8 = vaddq_u16(val2_val4_16x8, val1_val3_16x8);
+             val2_val4_16x8 = vandq_u16(val2_val4_16x8, x255_16x8);
+
+             val1_16x4 = vget_low_u16(val2_val4_16x8);
+             val3_16x4 = vget_high_u16(val2_val4_16x8);
 #  ifdef COLMUL
 #   ifdef COLSAME
-             INTERP_256_NEON(d14, d9, d8, d4);
+
+             val3_16x4 = vsub_u16(val3_16x4, val1_16x4);
+             val3_16x4 = vmul_u16(val3_16x4, rv_16x4);
+             val3_16x4 = vshr_n_u16(val3_16x4, 8);
+             val3_16x4 = vadd_u16(val3_16x4, val1_16x4);
+             val3_16x4 = vand_u16(val3_16x4, x255_16x4);
+
+             c1_val3_16x4 = vmul_u16(c1_16x4, val3_16x4);
+             c1_val3_16x4 = vadd_u16(c1_val3_16x4, x255_16x4);
+
+             c1_val3_16x8 = vcombine_u16(c1_val3_16x4, temp_16x4);
+
+             c1_val3_8x8 = vshrn_n_u16(c1_val3_16x8, 8);
+             res_32x2 = vreinterpret_u32_u8(c1_val3_8x8);
 #   else //COLSAME
-             /* move result of val3,val4 interpolation (and c1 if COLMUL is
-                defined) for next step */
-             VSWP_NEON(d9, d12);
-             /* second stage of interpolation, also here c1 and c2 are
-                interpolated */
-             INTERP_256_NEON(q7, q6, q4, q2);
+             c1_val1_16x8 = vcombine_u16(c1_16x4, val1_16x4);
+             c2_val3_16x8 = vcombine_u16(c2_16x4, val3_16x4);
+
+             cv_16x4 = vdup_n_u16(cv>>16);
+             cv += cd;
+             cv_rv_16x8 = vcombine_u16(cv_16x4, rv_16x4);
+
+             c2_val3_16x8 = vsubq_u16(c2_val3_16x8, c1_val1_16x8);
+             c2_val3_16x8 = vmulq_u16(c2_val3_16x8, cv_rv_16x8);
+             c2_val3_16x8 = vshrq_n_u16(c2_val3_16x8, 8);
+             c2_val3_16x8 = vaddq_u16(c2_val3_16x8, c1_val1_16x8);
+             c2_val3_16x8 = vandq_u16(c2_val3_16x8, x255_16x8);
+
+             c2_local_16x4 = vget_low_u16(c2_val3_16x8);
+             val3_16x4 = vget_high_u16(c2_val3_16x8);
+
+             val3_16x4 = vmul_u16(c2_local_16x4, val3_16x4);
+             val3_16x4 = vadd_u16(val3_16x4, x255_16x4);
+
+             val3_16x8 = vcombine_u16(val3_16x4, temp_16x4);
+
+             val3_8x8 = vshrn_n_u16(val3_16x8, 8);
+             res_32x2 = vreinterpret_u32_u8(val3_8x8);
 #   endif //COLSAME
 #  else //COLMUL
-             INTERP_256_NEON(d14, d9, d8, d4);
-#  endif //COLMUL
-#  ifdef COLMUL
-#   ifdef COLSAME
-             MUL4_SYM_NEON(d8, d12, d4);
-#   else //COLSAME
-             MUL4_SYM_NEON(d8, d9, d4); // do required multiplication
-#   endif //COLSAME
+             val3_16x4 = vsub_u16(val3_16x4, val1_16x4);
+             val3_16x4 = vmul_u16(val3_16x4, rv_16x4);
+             val3_16x4 = vshr_n_u16(val3_16x4, 8);
+             val3_16x4 = vadd_u16(val3_16x4, val1_16x4);
+
+             val3_16x8 = vcombine_u16(val3_16x4, temp_16x4);
+
+             val3_8x8 = vmovn_u16(val3_16x8);
+             res_32x2 = vreinterpret_u32_u8(val3_8x8);
 #  endif //COLMUL
-             VMOV_R2M_NEON(q4, d8, d); // save result to d
+             vst1_lane_u32(d, res_32x2, 0);
           }
         else
          *d = val1;
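This hunk is the smooth-scaling inner loop: a bilinear blend of the four
texels val1..val4 with the horizontal fraction ru and vertical fraction rv,
computed on two texel pairs per 128-bit vector, followed by the optional
COLMUL color multiply. A scalar reference for the bilinear part only
(illustrative helper, assuming 0xAARRGGBB pixels and an arithmetic right
shift; not the EFL code path itself):

#include <stdint.h>

static inline uint32_t
bilinear_argb(uint32_t val1, uint32_t val2, uint32_t val3, uint32_t val4,
              int ru, int rv)   /* ru, rv in 0..255 */
{
   uint32_t out = 0;
   for (int shift = 0; shift < 32; shift += 8)
     {
        int a = (val1 >> shift) & 0xff, b = (val2 >> shift) & 0xff;
        int c = (val3 >> shift) & 0xff, e = (val4 >> shift) & 0xff;
        int top = a + (((b - a) * ru) >> 8);        /* val1..val2 along u */
        int bot = c + (((e - c) * ru) >> 8);        /* val3..val4 along u */
        int mix = top + (((bot - top) * rv) >> 8);  /* blend along v */
        out |= (uint32_t)(mix & 0xff) << shift;
     }
   return out;
}

The vector version gets the same signed result from unsigned 16-bit lanes
because the subtraction wraps modulo 2^16 and the final vand against 0xff
discards the wraparound bits after the shift and add.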
@@ -177,79 +245,115 @@
 #else //SMOOTH
 {
 # ifdef SCALE_USING_NEON
-#  ifdef COLMUL
-#   ifndef COLBLACK
+#  ifndef COLBLACK
+#   ifdef COLMUL
+   uint16x4_t x255_16x4;
+   uint16x4_t temp_16x4;
+   uint16x8_t cval_16x8;
+   uint32x2_t res_32x2;
+   uint8x8_t cval_8x8;
+   uint16x4_t c1_16x4;
+   uint16x4_t cval_16x4;
+   uint16x4_t val1_16x4;
+   uint32x2_t val1_32x2;
+   uint8x8_t val1_8x8;
+
+   x255_16x4 = vdup_n_u16(0xff);
 #    ifdef COLSAME
-   FPU_NEON;
-   VMOV_I2R_NEON(q2, #255);
-   VMOV_M2R_NEON(d10, c1);
-   VEOR_NEON(d0);
-   VZIP_NEON(d10, d0);
-#    else
-   // c1 and c2 are constants inside the cycle
-   FPU_NEON;
-   VMOV_I2R_NEON(q2, #255);
-   VMOV_M2R_NEON(d10, c1);
-   VEOR_NEON(q0);
-   VMOV_M2R_NEON(d11, c2);
-   VZIP_NEON(q5, q0);
-   VMOV_R2R_NEON(d11, d0);
+   uint16x8_t c1_16x8;
+   uint16x8_t val1_16x8;
+   uint32x2_t c1_32x2;
+   uint8x8_t c1_8x8;
+
+   c1_32x2 = vset_lane_u32(c1, c1_32x2, 0);
+
+   c1_8x8 = vreinterpret_u8_u32(c1_32x2);
+   c1_16x8 = vmovl_u8(c1_8x8);
+
+   c1_16x4 = vget_low_u16(c1_16x8);
+#    else //COLSAME
+   uint16x4_t c2_16x4;
+   uint16x4_t c2_c1_16x4;
+   uint16x4_t c2_c1_local_16x4;
+   uint16x4_t cv_16x4;
+   uint16x8_t c1_c2_16x8;
+   uint16x8_t val1_16x8;
+   uint32x2_t c1_c2_32x2;
+   uint8x8_t c1_c2_8x8;
+
+   c1_c2_32x2 = vset_lane_u32(c1, c1_c2_32x2, 0);
+   c1_c2_32x2 = vset_lane_u32(c2, c1_c2_32x2, 1);
+
+   c1_c2_8x8 = vreinterpret_u8_u32(c1_c2_32x2);
+   c1_c2_16x8 = vmovl_u8(c1_c2_8x8);
+
+   c1_16x4 = vget_low_u16(c1_c2_16x8);
+   c2_16x4 = vget_high_u16(c1_c2_16x8);
+
+   c2_c1_16x4 = vsub_u16(c2_16x4, c1_16x4);
 #    endif //COLSAME
-#   endif //COLBLACK
-#  endif //COLMUL
+#   endif //COLMUL
+#  endif //COLBLACK
 # endif //SCALE_USING_NEON
 
    while (ww > 0)
     {
-# ifdef COLMUL
-#  ifndef COLBLACK
-        DATA32 val1;
-#   ifdef COLSAME
-#   else
-        DATA32 cval; // col
-#   endif //COLSAME
-#  endif //COLBLACK
-# endif //COLMUL
+# ifndef SCALE_USING_NEON
+#  ifdef COLMUL
+#   ifndef COLBLACK
+        DATA32 val1;
+#    ifndef COLSAME
+        DATA32 cval; // col
+#    endif //COLSAME
+#   endif //COLBLACK
+#  endif //COLMUL
+# endif //SCALE_USING_NEON
 
 # ifdef COLBLACK
        *d = 0xff000000; // col
 # else //COLBLACK
        s = sp + ((v >> (FP + FPI)) * sw) + (u >> (FP + FPI));
 #  ifdef COLMUL
+#   ifdef SCALE_USING_NEON
+#    ifdef COLSAME
+        val1_32x2 = vset_lane_u32(*s, val1_32x2, 0);
+        val1_8x8 = vreinterpret_u8_u32(val1_32x2);
+        val1_16x8 = vmovl_u8(val1_8x8);
+        val1_16x4 = vget_low_u16(val1_16x8);
+        cval_16x4 = c1_16x4;
+#    else //COLSAME
+        cv_16x4 = vdup_n_u16(cv>>16);
+        cv += cd; // col
+
+        c2_c1_local_16x4 = vmul_u16(c2_c1_16x4, cv_16x4);
+        c2_c1_local_16x4 = vshr_n_u16(c2_c1_local_16x4, 8);
+        c2_c1_local_16x4 = vadd_u16(c2_c1_local_16x4, c1_16x4);
+        cval_16x4 = vand_u16(c2_c1_local_16x4, x255_16x4);
+        val1_32x2 = vset_lane_u32(*s, val1_32x2, 0);
+        val1_8x8 = vreinterpret_u8_u32(val1_32x2);
+        val1_16x8 = vmovl_u8(val1_8x8);
+        val1_16x4 = vget_low_u16(val1_16x8);
+#    endif //COLSAME
+        cval_16x4 = vmul_u16(cval_16x4, val1_16x4);
+        cval_16x4 = vadd_u16(cval_16x4, x255_16x4);
+
+        cval_16x8 = vcombine_u16(cval_16x4, temp_16x4);
+
+        cval_8x8 = vshrn_n_u16(cval_16x8, 8);
+        res_32x2 = vreinterpret_u32_u8(cval_8x8);
+
+        vst1_lane_u32(d, res_32x2, 0);
+#   else //SCALE_USING_NEON
        val1 = *s; // col
 #    ifdef COLSAME
-#     ifdef SCALE_USING_NEON
-        VMOV_M2R_NEON(d1, val1);
-        VEOR_NEON(d0);
-        VZIP_NEON(d1, d0);
-        VMOV_R2R_NEON(d0, d10);
-        MUL4_SYM_NEON(d0, d1, d4)
-        VMOV_R2M_NEON(q0, d0, d);
-#     else
        *d = MUL4_SYM(c1, val1);
-#     endif //SCALE_USING_NEON
-#    else //COLSAME
-/* XXX: this neon is broken! :( FIXME
-#     ifdef SCALE_USING_NEON
-        FPU_NEON;
-        VMOV_M2R_NEON(d12, val1);
-        VMOV_R2R_NEON(q4, q5);
-        VEOR_NEON(q1);
-        VDUP_NEON(d15, cv >> 16);
-        VZIP_NEON(q6, q1);
-        INTERP_256_NEON(d15, d9, d8, d4); // interpolate c1 and c2
-        MUL4_SYM_NEON(d8, d12, d4); // multiply
-        VMOV_R2M_NEON(q4, d8, d); // save result
 #    else
-*/
        cval = INTERP_256((cv >> 16), c2, c1); // col
-        val1 = MUL4_SYM(cval, val1);
+        *d = MUL4_SYM(cval, val1);
        cv += cd; // col
-/*
 #    endif
-*/
-#    endif //COLSAME
-#  else //COLMUL
+#   endif
+#  else
        *d = *s;
 #  endif //COLMUL
        u += ud;
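The non-smooth path above reduces to one operation per pixel: the sampled
texel times the (possibly interpolated) color, i.e. MUL4_SYM. The recurring
vmul/vadd(0xff)/vshrn(8) tail, isolated as a standalone sketch (hypothetical
helper; the patch keeps values in registers and uses the never-written
temp_16x4 as a don't-care upper half where this sketch uses zeros):

#include <arm_neon.h>
#include <stdint.h>

/* Symmetric per-channel multiply of two widened pixels: (x*y + 255) >> 8,
 * which maps 255*255 to 255 so full alpha/white stays full. */
static inline uint32_t
mul_sym_pixel(uint16x4_t x_16x4, uint16x4_t y_16x4)
{
   uint16x4_t prod = vmul_u16(x_16x4, y_16x4);          /* fits: <= 65025  */
   prod = vadd_u16(prod, vdup_n_u16(0xff));             /* round 255*255 up */
   uint16x8_t wide = vcombine_u16(prod, vdup_n_u16(0)); /* vshrn needs 16x8 */
   uint8x8_t packed = vshrn_n_u16(wide, 8);             /* >> 8 and narrow  */
   return vget_lane_u32(vreinterpret_u32_u8(packed), 0);
}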