summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCarsten Haitzler (Rasterman) <raster@rasterman.com>2014-11-29 15:50:03 +0900
committerCarsten Haitzler (Rasterman) <raster@rasterman.com>2014-11-29 15:50:03 +0900
commitac7d7c9cbed92f21aa4a7555c4fee701227559a8 (patch)
treec1a216aed724626a0233c63295b87a883b7267f1
parent8fcfae57d18510c2c283108a42d638e7513a9af8 (diff)
Use intrinsics for scaling up instead of inline asm
Summary: Rewrite inline assembly in scaling func using NEON intrinsics. Reviewers: raster Differential Revision: https://phab.enlightenment.org/D1666
-rw-r--r--src/lib/evas/common/evas_scale_smooth.c3
-rw-r--r--src/lib/evas/common/evas_scale_smooth_scaler_up.c66
2 files changed, 50 insertions, 19 deletions
diff --git a/src/lib/evas/common/evas_scale_smooth.c b/src/lib/evas/common/evas_scale_smooth.c
index b4b4db5..a1957f1 100644
--- a/src/lib/evas/common/evas_scale_smooth.c
+++ b/src/lib/evas/common/evas_scale_smooth.c
@@ -1,6 +1,9 @@
1#include "evas_common_private.h" 1#include "evas_common_private.h"
2#include "evas_scale_smooth.h" 2#include "evas_scale_smooth.h"
3#include "evas_blend_private.h" 3#include "evas_blend_private.h"
4#ifdef BUILD_NEON
5#include <arm_neon.h>
6#endif
4 7
5#define SCALE_CALC_X_POINTS(P, SW, DW, CX, CW) \ 8#define SCALE_CALC_X_POINTS(P, SW, DW, CX, CW) \
6 P = alloca((CW + 1) * sizeof (int)); \ 9 P = alloca((CW + 1) * sizeof (int)); \
diff --git a/src/lib/evas/common/evas_scale_smooth_scaler_up.c b/src/lib/evas/common/evas_scale_smooth_scaler_up.c
index 3921d01..44bfbfa 100644
--- a/src/lib/evas/common/evas_scale_smooth_scaler_up.c
+++ b/src/lib/evas/common/evas_scale_smooth_scaler_up.c
@@ -173,9 +173,23 @@
173 pxor_r2r(mm0, mm0); 173 pxor_r2r(mm0, mm0);
174 MOV_A2R(ALPHA_255, mm5) 174 MOV_A2R(ALPHA_255, mm5)
175#elif defined SCALE_USING_NEON 175#elif defined SCALE_USING_NEON
176 FPU_NEON; 176 uint16x4_t ay_16x4;
177 VDUP_NEON(d12, ay); 177 uint16x4_t p0_16x4;
178 VMOV_I2R_NEON(q2, #255); 178 uint16x4_t p2_16x4;
179 uint16x8_t ax_16x8;
180 uint16x8_t p0_p2_16x8;
181 uint16x8_t p1_p3_16x8;
182 uint16x8_t x255_16x8;
183 uint32x2_t p0_p2_32x2;
184 uint32x2_t p1_p3_32x2;
185 uint32x2_t res_32x2;
186 uint8x8_t p0_p2_8x8;
187 uint8x8_t p1_p3_8x8;
188 uint8x8_t p2_8x8;
189 uint16x4_t temp_16x4;
190
191 ay_16x4 = vdup_n_u16(ay);
192 x255_16x8 = vdupq_n_u16(0xff);
179#endif 193#endif
180 pbuf = buf; pbuf_end = buf + dst_clip_w; 194 pbuf = buf; pbuf_end = buf + dst_clip_w;
181 sxx = sxx0; 195 sxx = sxx0;
@@ -217,22 +231,36 @@
217#elif defined SCALE_USING_NEON 231#elif defined SCALE_USING_NEON
218 if (p0 | p1 | p2 | p3) 232 if (p0 | p1 | p2 | p3)
219 { 233 {
220 FPU_NEON; 234 ax_16x8 = vdupq_n_u16(ax);
221 VMOV_M2R_NEON(d8, p0); 235
222 VEOR_NEON(q0); 236 p0_p2_32x2 = vset_lane_u32(p0, p0_p2_32x2, 0);
223 VMOV_M2R_NEON(d9, p2); 237 p0_p2_32x2 = vset_lane_u32(p2, p0_p2_32x2, 1);
224 VMOV_M2R_NEON(d10, p1); 238 p1_p3_32x2 = vset_lane_u32(p1, p1_p3_32x2, 0);
225 VEOR_NEON(q1); 239 p1_p3_32x2 = vset_lane_u32(p3, p1_p3_32x2, 1);
226 VMOV_M2R_NEON(d11, p3); 240
227 VDUP_NEON(q3, ax); 241 p0_p2_8x8 = vreinterpret_u8_u32(p0_p2_32x2);
228 VZIP_NEON(q4, q0); 242 p1_p3_8x8 = vreinterpret_u8_u32(p1_p3_32x2);
229 VZIP_NEON(q5, q1); 243 p1_p3_16x8 = vmovl_u8(p1_p3_8x8);
230 VMOV_R2R_NEON(d9, d0); 244 p0_p2_16x8 = vmovl_u8(p0_p2_8x8);
231 VMOV_R2R_NEON(d11, d2); 245
232 INTERP_256_NEON(q3, q5, q4, q2); 246 p1_p3_16x8 = vsubq_u16(p1_p3_16x8, p0_p2_16x8);
233 INTERP_256_NEON(d12, d9, d8, d5); 247 p1_p3_16x8 = vmulq_u16(p1_p3_16x8, ax_16x8);
234 VMOV_R2M_NEON(q4, d8, pbuf); 248 p1_p3_16x8 = vshrq_n_u16(p1_p3_16x8, 8);
235 pbuf++; 249 p1_p3_16x8 = vaddq_u16(p1_p3_16x8, p0_p2_16x8);
250 p1_p3_16x8 = vandq_u16(p1_p3_16x8, x255_16x8);
251
252 p0_16x4 = vget_low_u16(p1_p3_16x8);
253 p2_16x4 = vget_high_u16(p1_p3_16x8);
254
255 p2_16x4 = vsub_u16(p2_16x4, p0_16x4);
256 p2_16x4 = vmul_u16(p2_16x4, ay_16x4);
257 p2_16x4 = vshr_n_u16(p2_16x4, 8);
258 p2_16x4 = vadd_u16(p2_16x4, p0_16x4);
259
260 p1_p3_16x8 = vcombine_u16(temp_16x4, p2_16x4);
261 p2_8x8 = vmovn_u16(p1_p3_16x8);
262 res_32x2 = vreinterpret_u32_u8(p2_8x8);
263 vst1_lane_u32(pbuf++, res_32x2, 1);
236 } 264 }
237 else 265 else
238 *pbuf++ = p0; 266 *pbuf++ = p0;