author    Cedric BAIL <cedric@osg.samsung.com>  2015-07-29 21:22:17 +0200
committer Cedric BAIL <cedric@osg.samsung.com>  2015-08-04 17:34:29 +0200
commit    10fb77cc558672ebe6ae2d60fa085837c6b97320 (patch)
tree      f172b71285ed5f6cb79870eff7b85aa1a36e72dd /src/lib/evas/common
parent    d1afa0e19b3f3f6476551857a4fc0b860ebf84ac (diff)
evas: make the NEON intrinsics scale-up implementation twice as fast
Summary: Already verified on Tizen 2.3/2.4: the scaling function works correctly and is much faster.

Reviewers: raster, jolfzverb, cedric

Reviewed By: cedric

Subscribers: cedric

Projects: #efl

Differential Revision: https://phab.enlightenment.org/D2881

Signed-off-by: Cedric BAIL <cedric@osg.samsung.com>
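The scalar (#else) path touched by this patch computes each destination pixel as a bilinear blend of a 2x2 texel block, addressed with 16.16 fixed-point source coordinates. As orientation for the diff below, here is a minimal self-contained sketch of that computation; interp_256() is a hypothetical per-channel stand-in for EFL's INTERP_256 macro, and sample_bilinear() is an illustrative name, not a function in this file.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t DATA32;

/* Hypothetical stand-in for EFL's INTERP_256: per-channel blend of two
   ARGB32 pixels, c1 + (((c0 - c1) * a) >> 8), with a in [1, 256].
   Like the real macro, it relies on arithmetic right shift of a
   possibly negative difference. */
static DATA32
interp_256(int a, DATA32 c0, DATA32 c1)
{
   DATA32 out = 0;
   int sh;

   for (sh = 0; sh <= 24; sh += 8)
     {
        int ch0 = (c0 >> sh) & 0xff;
        int ch1 = (c1 >> sh) & 0xff;
        out |= (DATA32)((ch1 + (((ch0 - ch1) * a) >> 8)) & 0xff) << sh;
     }
   return out;
}

/* One scale-up output pixel: sxx/syy are 16.16 fixed-point source
   coordinates, src is a w x h ARGB32 image; edge texels clamp exactly
   as in the patched code. */
static DATA32
sample_bilinear(const DATA32 *src, int w, int h, int sxx, int syy)
{
   int sx = sxx >> 16, sy = syy >> 16;
   int ax = 1 + ((sxx - (sx << 16)) >> 8);  /* x weight, 1..256 */
   int ay = 1 + ((syy - (sy << 16)) >> 8);  /* y weight, 1..256 */
   const DATA32 *p = src + (sy * w) + sx;
   DATA32 p0, p1, p2, p3;

   p0 = p1 = p2 = p3 = p[0];
   if ((sx + 1) < w) p1 = p[1];
   if ((sy + 1) < h)
     {
        p2 = p3 = p[w];
        if ((sx + 1) < w) p3 = p[w + 1];
     }
   p0 = interp_256(ax, p1, p0);   /* lerp top row along x */
   p2 = interp_256(ax, p3, p2);   /* lerp bottom row along x */
   return interp_256(ay, p2, p0); /* lerp between rows along y */
}

int main(void)
{
   DATA32 img[4] = { 0xff000000, 0xffff0000, 0xff0000ff, 0xffffffff };
   /* sample the center of a 2x2 image: 16.16 coordinates 0x8000 = 0.5 */
   printf("%08x\n", sample_bilinear(img, 2, 2, 0x8000, 0x8000));
   return 0;
}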
Diffstat (limited to 'src/lib/evas/common')
-rw-r--r--   src/lib/evas/common/evas_scale_smooth_scaler_up.c | 138
1 file changed, 86 insertions(+), 52 deletions(-)
diff --git a/src/lib/evas/common/evas_scale_smooth_scaler_up.c b/src/lib/evas/common/evas_scale_smooth_scaler_up.c
index 3fbbbb1..5ba7805 100644
--- a/src/lib/evas/common/evas_scale_smooth_scaler_up.c
+++ b/src/lib/evas/common/evas_scale_smooth_scaler_up.c
@@ -216,35 +216,57 @@
    pxor_r2r(mm0, mm0);
    MOV_A2R(ALPHA_255, mm5)
 #elif defined SCALE_USING_NEON
-   uint16x4_t ay_16x4;
-   uint16x4_t p0_16x4;
-   uint16x4_t p2_16x4;
-   uint16x8_t ax_16x8;
-   uint16x8_t p0_p2_16x8;
-   uint16x8_t p1_p3_16x8;
-   uint16x8_t x255_16x8;
-   uint32x2_t p0_p2_32x2;
-   uint32x2_t p1_p3_32x2;
-   uint32x2_t res_32x2;
-   uint8x8_t p0_p2_8x8;
-   uint8x8_t p1_p3_8x8;
-   uint8x8_t p2_8x8;
-   uint16x4_t temp_16x4;
-
-   ay_16x4 = vdup_n_u16(ay);
-   x255_16x8 = vdupq_n_u16(0xff);
+   uint16x8_t vay = vdupq_n_u16(ay);
 #endif
    pbuf = buf;  pbuf_end = buf + dst_clip_w;
    sxx = sxx0;
+#ifdef SCALE_USING_NEON
+   while (pbuf + 1 < pbuf_end) // NEON path consumes 2 pixels per iteration
+#else
    while (pbuf < pbuf_end)
+#endif
      {
        int ax;
        DATA32 *p, *q;
+#ifdef SCALE_USING_NEON
+       int ax1;
+       DATA32 *p1, *q1;
+       uint32x2x2_t vp0, vp1;
+       uint16x8_t vax;
+       uint16x8_t vax1;
+#else
        DATA32 p0, p1, p2, p3;
+#endif
 
        sx = sxx >> 16;
        ax = 1 + ((sxx - (sx << 16)) >> 8);
        p = psrc + sx;  q = p + src_w;
+#ifdef SCALE_USING_NEON
+       vax = vdupq_n_u16(ax);
+       vp0.val[0] = vld1_u32(p); // (p0, p1)
+       vp0.val[1] = vld1_u32(q); // (p2, p3)
+       if ((sx + 1) >= srw)
+         {
+            vp0.val[0] = vdup_lane_u32(vp0.val[0], 0); // clamp (p0, p1) at the right edge
+            vp0.val[1] = vdup_lane_u32(vp0.val[1], 0); // clamp (p2, p3) at the right edge
+         }
+       if ((sy + 1) >= srh)
+         vp0.val[1] = vdup_lane_u32(vp0.val[0], 0); // clamp at the bottom edge
+       sxx += dsxx;
+       sx = sxx >> 16;
+       ax1 = 1 + ((sxx - (sx << 16)) >> 8);
+       vax1 = vdupq_n_u16(ax1);
+       p1 = psrc + sx;  q1 = p1 + src_w;
+       vp1.val[0] = vld1_u32(p1); // (p4, p5)
+       vp1.val[1] = vld1_u32(q1); // (p6, p7)
+       if ((sx + 1) >= srw)
+         {
+            vp1.val[0] = vdup_lane_u32(vp1.val[0], 0); // clamp (p4, p5) at the right edge
+            vp1.val[1] = vdup_lane_u32(vp1.val[1], 0); // clamp (p6, p7) at the right edge
+         }
+       if ((sy + 1) >= srh)
+         vp1.val[1] = vdup_lane_u32(vp1.val[0], 0); // clamp at the bottom edge
+#else
        p0 = p1 = p2 = p3 = *p;
        if ((sx + 1) < srw)
          p1 = *(p + 1);
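A quick check of the fixed-point weight computed in the hunk above: ax takes the top 8 fractional bits of the 16.16 coordinate sxx and biases them into [1, 256], so ax/256 is roughly the sub-texel position; vld1_u32 then loads two adjacent texels at once, and vdup_lane_u32 replicates lane 0 so reads clamp at the right and bottom image edges. A standalone illustration of the weight arithmetic, with made-up values:

#include <stdio.h>

int main(void)
{
   int sxx = 0x18000;                        /* 16.16 fixed point: 1.5 */
   int sx  = sxx >> 16;                      /* integer texel: 1 */
   int ax  = 1 + ((sxx - (sx << 16)) >> 8);  /* blend weight: 129 */

   /* ax / 256 ~ 0.504, i.e. halfway between texels 1 and 2 */
   printf("sx = %d, ax = %d\n", sx, ax);
   return 0;
}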
@@ -254,6 +276,7 @@
         if ((sx + 1) < srw)
           p3 = *(q + 1);
      }
+#endif
 #ifdef SCALE_USING_MMX
         MOV_A2R(ax, mm6)
         MOV_P2R(p0, mm1, mm0)
@@ -272,41 +295,23 @@
         MOV_R2P(mm1, *pbuf, mm0)
         pbuf++;
 #elif defined SCALE_USING_NEON
-        if (p0 | p1 | p2 | p3)
-          {
-             ax_16x8 = vdupq_n_u16(ax);
-
-             p0_p2_32x2 = vset_lane_u32(p0, p0_p2_32x2, 0);
-             p0_p2_32x2 = vset_lane_u32(p2, p0_p2_32x2, 1);
-             p1_p3_32x2 = vset_lane_u32(p1, p1_p3_32x2, 0);
-             p1_p3_32x2 = vset_lane_u32(p3, p1_p3_32x2, 1);
-
-             p0_p2_8x8 = vreinterpret_u8_u32(p0_p2_32x2);
-             p1_p3_8x8 = vreinterpret_u8_u32(p1_p3_32x2);
-             p1_p3_16x8 = vmovl_u8(p1_p3_8x8);
-             p0_p2_16x8 = vmovl_u8(p0_p2_8x8);
-
-             p1_p3_16x8 = vsubq_u16(p1_p3_16x8, p0_p2_16x8);
-             p1_p3_16x8 = vmulq_u16(p1_p3_16x8, ax_16x8);
-             p1_p3_16x8 = vshrq_n_u16(p1_p3_16x8, 8);
-             p1_p3_16x8 = vaddq_u16(p1_p3_16x8, p0_p2_16x8);
-             p1_p3_16x8 = vandq_u16(p1_p3_16x8, x255_16x8);
-
-             p0_16x4 = vget_low_u16(p1_p3_16x8);
-             p2_16x4 = vget_high_u16(p1_p3_16x8);
-
-             p2_16x4 = vsub_u16(p2_16x4, p0_16x4);
-             p2_16x4 = vmul_u16(p2_16x4, ay_16x4);
-             p2_16x4 = vshr_n_u16(p2_16x4, 8);
-             p2_16x4 = vadd_u16(p2_16x4, p0_16x4);
-
-             p1_p3_16x8 = vcombine_u16(temp_16x4, p2_16x4);
-             p2_8x8 = vmovn_u16(p1_p3_16x8);
-             res_32x2 = vreinterpret_u32_u8(p2_8x8);
-             vst1_lane_u32(pbuf++, res_32x2, 1);
-          }
-        else
-          *pbuf++ = p0;
+        // (p0, p1), (p2, p3) ==> (p0, p2), (p1, p3)
+        vp0 = vzip_u32(vp0.val[0], vp0.val[1]);
+        // (p1 - p0, p3 - p2)
+        uint16x8_t vtmpq = vsubl_u8(vreinterpret_u8_u32(vp0.val[1]), vreinterpret_u8_u32(vp0.val[0]));
+        // p0 + (p1 - p0) * ax, p2 + (p3 - p2) * ax
+        vp0.val[0] = vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vp0.val[0]), vshrn_n_u16(vmulq_u16(vtmpq, vax), 8)));
+        vp1 = vzip_u32(vp1.val[0], vp1.val[1]);
+        vtmpq = vsubl_u8(vreinterpret_u8_u32(vp1.val[1]), vreinterpret_u8_u32(vp1.val[0]));
+        vp1.val[0] = vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vp1.val[0]), vshrn_n_u16(vmulq_u16(vtmpq, vax1), 8)));
+        // (p0, p2), (p4, p6) ==> (p0, p4), (p2, p6)
+        vp0 = vzip_u32(vp0.val[0], vp1.val[0]);
+        // (p2 - p0), (p6 - p4)
+        vtmpq = vsubl_u8(vreinterpret_u8_u32(vp0.val[1]), vreinterpret_u8_u32(vp0.val[0]));
+        // p0 + (p2 - p0) * ay, p4 + (p6 - p4) * ay
+        vp0.val[0] = vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vp0.val[0]), vshrn_n_u16(vmulq_u16(vtmpq, vay), 8)));
+        vst1_u32(pbuf, vp0.val[0]);
+        pbuf += 2;
 #else
         if (p0 | p1)
           p0 = INTERP_256(ax, p1, p0);
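The replacement above leans on one recurring pattern: zip two pixel pairs together, take a widening byte subtract so the signed difference survives exactly in 16 bits, multiply by the weight, shift-narrow back to bytes, and add. Below is a minimal sketch of that blend step, assuming ARM NEON (arm_neon.h); blend2() is an illustrative name, not EFL's.

#include <arm_neon.h>
#include <stdint.h>

/* Per-channel lerp of two independent ARGB32 pixels at once:
   out[i] = lo[i] + (((hi[i] - lo[i]) * w) >> 8) for each byte channel,
   with w holding a weight in [1, 256] in every 16-bit lane. The modular
   16-bit multiply, shift and narrow reproduce the scalar INTERP_256
   arithmetic even when hi < lo per channel. */
static uint32x2_t
blend2(uint32x2_t lo, uint32x2_t hi, uint16x8_t w)
{
   uint16x8_t d = vsubl_u8(vreinterpret_u8_u32(hi), vreinterpret_u8_u32(lo));
   uint8x8_t  f = vshrn_n_u16(vmulq_u16(d, w), 8);

   return vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(lo), f));
}

With that helper, the loop body is conceptually blend2 along x for each texel row (with vax and vax1), then blend2 along y between the results (with vay), followed by a single vst1_u32 storing both output pixels; two destination pixels cost roughly the arithmetic the old lane-by-lane version spent on one.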
@@ -318,6 +323,35 @@
 #endif
         sxx += dsxx;
      }
+#if defined SCALE_USING_NEON
+   if (pbuf < pbuf_end) // odd dst_clip_w: one pixel left over from the 2-pixel loop
+     {
+        int ax;
+        DATA32 *p, *q;
+        DATA32 p0, p1, p2, p3;
+
+        sx = sxx >> 16;
+        ax = 1 + ((sxx - (sx << 16)) >> 8);
+        p = psrc + sx;  q = p + src_w;
+        p0 = p1 = p2 = p3 = *p;
+        if ((sx + 1) < srw)
+          p1 = *(p + 1);
+        if ((sy + 1) < srh)
+          {
+             p2 = *q;  p3 = p2;
+             if ((sx + 1) < srw)
+               p3 = *(q + 1);
+          }
+        if (p0 | p1)
+          p0 = INTERP_256(ax, p1, p0);
+        if (p2 | p3)
+          p2 = INTERP_256(ax, p3, p2);
+        if (p0 | p2)
+          p0 = INTERP_256(ay, p2, p0);
+        *pbuf++ = p0;
+        sxx += dsxx;
+     }
+#endif
    /* * blend here [clip_w *]   buf -> dptr * */
    if (!direct_scale)
      {
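Because the NEON loop now consumes two destination pixels per iteration, an odd dst_clip_w leaves exactly one pixel for the scalar epilogue added in the last hunk. A toy model of that loop shape, with hypothetical values:

#include <stdio.h>

int main(void)
{
   int n = 7, i = 0, pairs = 0, tail = 0;

   while (i + 1 < n) { pairs++; i += 2; }  /* vectorized: 2 pixels/iter */
   if (i < n)        { tail++;  i += 1; }  /* scalar epilogue: at most 1 pixel */
   printf("pairs = %d, tail = %d\n", pairs, tail);  /* pairs = 3, tail = 1 */
   return 0;
}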