From afb73157227b9625ac768a4bfd606383baf35218 Mon Sep 17 00:00:00 2001
From: "Carsten Haitzler (Rasterman)"
Date: Wed, 17 Dec 2014 15:28:50 +0900
Subject: [PATCH] Use NEON intrinsics for mapping instead of inline asm

Summary: Rewrite inline assembly in the mapping functions using NEON intrinsics.

Reviewers: raster

Differential Revision: https://phab.enlightenment.org/D1740
---
 src/lib/evas/common/evas_map_image.c      |   3 +
 src/lib/evas/common/evas_map_image_loop.c | 328 ++++++++++++++--------
 2 files changed, 219 insertions(+), 112 deletions(-)

diff --git a/src/lib/evas/common/evas_map_image.c b/src/lib/evas/common/evas_map_image.c
index 69e7f5f00a..05ad32acfb 100644
--- a/src/lib/evas/common/evas_map_image.c
+++ b/src/lib/evas/common/evas_map_image.c
@@ -4,6 +4,9 @@
 #ifdef EVAS_CSERVE2
 #include "evas_cs2_private.h"
 #endif
+#ifdef BUILD_NEON
+#include <arm_neon.h>
+#endif
 
 #ifdef BUILD_MMX
 # undef SCALE_USING_MMX
diff --git a/src/lib/evas/common/evas_map_image_loop.c b/src/lib/evas/common/evas_map_image_loop.c
index fbc8459a73..2581b6d7c3 100644
--- a/src/lib/evas/common/evas_map_image_loop.c
+++ b/src/lib/evas/common/evas_map_image_loop.c
@@ -9,24 +9,63 @@
 # endif //SCALE_USING_MMX
 
 # ifdef SCALE_USING_NEON
-   FPU_NEON;
-   VMOV_I2R_NEON(q2, #255);
-# ifdef COLMUL
-# ifndef COLBLACK
-   // this part can be done here as c1 and c2 are constants in the cycle
-   FPU_NEON;
-   VMOV_M2R_NEON(d18, c1);
-   VEOR_NEON(q8);
-# ifndef COLSAME
-   VMOV_M2R_NEON(d19, c2);
+# ifndef COLBLACK
+   uint16x4_t temp_16x4;
+   uint16x4_t rv_16x4;
+   uint16x4_t val1_16x4;
+   uint16x4_t val3_16x4;
+   uint16x8_t ru_16x8;
+   uint16x8_t val1_val3_16x8;
+   uint16x8_t val2_val4_16x8;
+   uint16x8_t x255_16x8;
+   uint32x2_t res_32x2;
+   uint32x2_t val1_val3_32x2;
+   uint32x2_t val2_val4_32x2;
+   uint8x8_t val1_val3_8x8;
+   uint8x8_t val2_val4_8x8;
+
+   x255_16x8 = vdupq_n_u16(0xff);
+# ifdef COLMUL
+   uint16x4_t x255_16x4;
+   x255_16x4 = vget_low_u16(x255_16x8);
+   uint16x4_t c1_16x4;
+# ifdef COLSAME
+   uint16x4_t c1_val3_16x4;
+   uint16x8_t c1_16x8;
+   uint16x8_t c1_val3_16x8;
+   uint32x2_t c1_32x2;
+   uint8x8_t c1_8x8;
+   uint8x8_t c1_val3_8x8;
+
+   c1_32x2 = vset_lane_u32(c1, c1_32x2, 0);
+   c1_8x8 = vreinterpret_u8_u32(c1_32x2);
+   c1_16x8 = vmovl_u8(c1_8x8);
+   c1_16x4 = vget_low_u16(c1_16x8);
+# else //COLSAME
+   uint16x4_t c2_16x4;
+   uint16x4_t c2_local_16x4;
+   uint16x4_t cv_16x4;
+   uint16x8_t c1_c2_16x8;
+   uint16x8_t c1_val1_16x8;
+   uint16x8_t c2_val3_16x8;
+   uint16x8_t cv_rv_16x8;
+   uint32x2_t c1_c2_32x2;
+   uint8x8_t c1_c2_8x8;
+   uint8x8_t val3_8x8;
+   uint16x8_t val3_16x8;
+
+   c1_c2_32x2 = vset_lane_u32(c1, c1_c2_32x2, 0);
+   c1_c2_32x2 = vset_lane_u32(c2, c1_c2_32x2, 1);
+   c1_c2_8x8 = vreinterpret_u8_u32(c1_c2_32x2);
+   c1_c2_16x8 = vmovl_u8(c1_c2_8x8);
+   c1_16x4 = vget_low_u16(c1_c2_16x8);
+   c2_16x4 = vget_high_u16(c1_c2_16x8);
 # endif //COLSAME
-   VZIP_NEON(q9, q8);
-# ifndef COLSAME
-   VMOV_R2R_NEON(d19, d16);
-# endif //COLSAME
-   // here we have c1 and c2 spread through q9 register
-# endif //COLBLACK
-# endif //COLMUL
+# else //COLMUL
+   uint8x8_t val3_8x8;
+   uint16x8_t val3_16x8;
+# endif //COLMUL
+# endif //COLBLACK
 # endif //SCALE_USING_NEON
 
 while (ww > 0)
@@ -99,54 +138,83 @@
 # endif //COLMUL
             MOV_R2P(mm1, *d, mm0);
 # elif defined SCALE_USING_NEON
-           // not sure if we need this condition, but it doesn't affect the result
           if (val1 | val2 | val3 | val4)
             {
-              FPU_NEON;
-# ifdef COLMUL
-              // initialize alpha for interpolation of c1 and c2
-              VDUP_NEON(d15, cv >> 16);
-              // copy c1 and c2 as algorithm will overwrite it
-              VMOV_R2R_NEON(q6, q9);
-              cv += cd; // col
-# endif //COLMUL
-              VMOV_M2R_NEON(d8, val1);
-              VEOR_NEON(q0);
-              VMOV_M2R_NEON(d9, val3);
-              VMOV_M2R_NEON(d10, val2);
-              VEOR_NEON(q1);
-              VMOV_M2R_NEON(d11, val4);
-              VDUP_NEON(q3, ru);
-              VDUP_NEON(d14, rv);
-              VZIP_NEON(q4, q0);
-              VZIP_NEON(q5, q1);
-              VMOV_R2R_NEON(d9, d0);
-              VMOV_R2R_NEON(d11, d2);
-              // by this point we have all required data in right registers
-              // interpolate val1,val2 and val3,val4
-              INTERP_256_NEON(q3, q5, q4, q2);
+              rv_16x4 = vdup_n_u16(rv);
+              ru_16x8 = vdupq_n_u16(ru);
+
+              val1_val3_32x2 = vset_lane_u32(val1, val1_val3_32x2, 0);
+              val1_val3_32x2 = vset_lane_u32(val3, val1_val3_32x2, 1);
+              val2_val4_32x2 = vset_lane_u32(val2, val2_val4_32x2, 0);
+              val2_val4_32x2 = vset_lane_u32(val4, val2_val4_32x2, 1);
+
+              val1_val3_8x8 = vreinterpret_u8_u32(val1_val3_32x2);
+              val2_val4_8x8 = vreinterpret_u8_u32(val2_val4_32x2);
+
+              val2_val4_16x8 = vmovl_u8(val2_val4_8x8);
+              val1_val3_16x8 = vmovl_u8(val1_val3_8x8);
+
+              val2_val4_16x8 = vsubq_u16(val2_val4_16x8, val1_val3_16x8);
+              val2_val4_16x8 = vmulq_u16(val2_val4_16x8, ru_16x8);
+              val2_val4_16x8 = vshrq_n_u16(val2_val4_16x8, 8);
+              val2_val4_16x8 = vaddq_u16(val2_val4_16x8, val1_val3_16x8);
+              val2_val4_16x8 = vandq_u16(val2_val4_16x8, x255_16x8);
+
+              val1_16x4 = vget_low_u16(val2_val4_16x8);
+              val3_16x4 = vget_high_u16(val2_val4_16x8);
 # ifdef COLMUL
 # ifdef COLSAME
-              INTERP_256_NEON(d14, d9, d8, d4);
+
+              val3_16x4 = vsub_u16(val3_16x4, val1_16x4);
+              val3_16x4 = vmul_u16(val3_16x4, rv_16x4);
+              val3_16x4 = vshr_n_u16(val3_16x4, 8);
+              val3_16x4 = vadd_u16(val3_16x4, val1_16x4);
+              val3_16x4 = vand_u16(val3_16x4, x255_16x4);
+
+              c1_val3_16x4 = vmul_u16(c1_16x4, val3_16x4);
+              c1_val3_16x4 = vadd_u16(c1_val3_16x4, x255_16x4);
+
+              c1_val3_16x8 = vcombine_u16(c1_val3_16x4, temp_16x4);
+
+              c1_val3_8x8 = vshrn_n_u16(c1_val3_16x8, 8);
+              res_32x2 = vreinterpret_u32_u8(c1_val3_8x8);
 # else //COLSAME
-              /* move result of val3,val4 interpolation (and c1 if COLMUL is
-                 defined) for next step */
-              VSWP_NEON(d9, d12);
-              /* second stage of interpolation, also here c1 and c2 are
-                 interpolated */
-              INTERP_256_NEON(q7, q6, q4, q2);
+              c1_val1_16x8 = vcombine_u16(c1_16x4, val1_16x4);
+              c2_val3_16x8 = vcombine_u16(c2_16x4, val3_16x4);
+
+              cv_16x4 = vdup_n_u16(cv>>16);
+              cv += cd;
+              cv_rv_16x8 = vcombine_u16(cv_16x4, rv_16x4);
+
+              c2_val3_16x8 = vsubq_u16(c2_val3_16x8, c1_val1_16x8);
+              c2_val3_16x8 = vmulq_u16(c2_val3_16x8, cv_rv_16x8);
+              c2_val3_16x8 = vshrq_n_u16(c2_val3_16x8, 8);
+              c2_val3_16x8 = vaddq_u16(c2_val3_16x8, c1_val1_16x8);
+              c2_val3_16x8 = vandq_u16(c2_val3_16x8, x255_16x8);
+
+              c2_local_16x4 = vget_low_u16(c2_val3_16x8);
+              val3_16x4 = vget_high_u16(c2_val3_16x8);
+
+              val3_16x4 = vmul_u16(c2_local_16x4, val3_16x4);
+              val3_16x4 = vadd_u16(val3_16x4, x255_16x4);
+
+              val3_16x8 = vcombine_u16(val3_16x4, temp_16x4);
+
+              val3_8x8 = vshrn_n_u16(val3_16x8, 8);
+              res_32x2 = vreinterpret_u32_u8(val3_8x8);
 # endif //COLSAME
 # else //COLMUL
-              INTERP_256_NEON(d14, d9, d8, d4);
+              val3_16x4 = vsub_u16(val3_16x4, val1_16x4);
+              val3_16x4 = vmul_u16(val3_16x4, rv_16x4);
+              val3_16x4 = vshr_n_u16(val3_16x4, 8);
+              val3_16x4 = vadd_u16(val3_16x4, val1_16x4);
+
+              val3_16x8 = vcombine_u16(val3_16x4, temp_16x4);
+
+              val3_8x8 = vmovn_u16(val3_16x8);
+              res_32x2 = vreinterpret_u32_u8(val3_8x8);
 # endif //COLMUL
-# ifdef COLMUL
-# ifdef COLSAME
-              MUL4_SYM_NEON(d8, d12, d4);
-# else //COLSAME
-              MUL4_SYM_NEON(d8, d9, d4); // do required multiplication
-# endif //COLSAME
-# endif //COLMUL
-              VMOV_R2M_NEON(q4, d8, d); // save result to d
+              vst1_lane_u32(d, res_32x2, 0);
            }
          else *d = val1;
@@ -177,79 +245,115 @@
 #else //SMOOTH
   {
 # ifdef SCALE_USING_NEON
-# ifdef COLMUL
-# ifndef COLBLACK
+# ifndef COLBLACK
+# ifdef COLMUL
+   uint16x4_t x255_16x4;
+   uint16x4_t temp_16x4;
+   uint16x8_t cval_16x8;
+   uint32x2_t res_32x2;
+   uint8x8_t cval_8x8;
+   uint16x4_t c1_16x4;
+   uint16x4_t cval_16x4;
+   uint16x4_t val1_16x4;
+   uint32x2_t val1_32x2;
+   uint8x8_t val1_8x8;
+
+   x255_16x4 = vdup_n_u16(0xff);
 # ifdef COLSAME
-   FPU_NEON;
-   VMOV_I2R_NEON(q2, #255);
-   VMOV_M2R_NEON(d10, c1);
-   VEOR_NEON(d0);
-   VZIP_NEON(d10, d0);
-# else
-   // c1 and c2 are constants inside the cycle
-   FPU_NEON;
-   VMOV_I2R_NEON(q2, #255);
-   VMOV_M2R_NEON(d10, c1);
-   VEOR_NEON(q0);
-   VMOV_M2R_NEON(d11, c2);
-   VZIP_NEON(q5, q0);
-   VMOV_R2R_NEON(d11, d0);
+   uint16x8_t c1_16x8;
+   uint16x8_t val1_16x8;
+   uint32x2_t c1_32x2;
+   uint8x8_t c1_8x8;
+
+   c1_32x2 = vset_lane_u32(c1, c1_32x2, 0);
+
+   c1_8x8 = vreinterpret_u8_u32(c1_32x2);
+   c1_16x8 = vmovl_u8(c1_8x8);
+
+   c1_16x4 = vget_low_u16(c1_16x8);
+# else //COLSAME
+   uint16x4_t c2_16x4;
+   uint16x4_t c2_c1_16x4;
+   uint16x4_t c2_c1_local_16x4;
+   uint16x4_t cv_16x4;
+   uint16x8_t c1_c2_16x8;
+   uint16x8_t val1_16x8;
+   uint32x2_t c1_c2_32x2;
+   uint8x8_t c1_c2_8x8;
+
+   c1_c2_32x2 = vset_lane_u32(c1, c1_c2_32x2, 0);
+   c1_c2_32x2 = vset_lane_u32(c2, c1_c2_32x2, 1);
+
+   c1_c2_8x8 = vreinterpret_u8_u32(c1_c2_32x2);
+   c1_c2_16x8 = vmovl_u8(c1_c2_8x8);
+
+   c1_16x4 = vget_low_u16(c1_c2_16x8);
+   c2_16x4 = vget_high_u16(c1_c2_16x8);
+
+   c2_c1_16x4 = vsub_u16(c2_16x4, c1_16x4);
 # endif //COLSAME
-# endif //COLBLACK
-# endif //COLMUL
+# endif //COLMUL
+# endif //COLBLACK
 # endif //SCALE_USING_NEON
 
 while (ww > 0)
   {
-# ifdef COLMUL
-# ifndef COLBLACK
-       DATA32 val1;
-# ifdef COLSAME
-# else
+# ifndef SCALE_USING_NEON
+# ifdef COLMUL
+# ifndef COLBLACK
+       DATA32 val1;
+# ifndef COLSAME
        DATA32 cval; // col
-# endif //COLSAME
-# endif //COLBLACK
-# endif //COLMUL
+# endif //COLSAME
+# endif //COLBLACK
+# endif //COLMUL
+# endif //SCALE_USING_NEON
 # ifdef COLBLACK
      *d = 0xff000000; // col
 # else //COLBLACK
      s = sp + ((v >> (FP + FPI)) * sw) + (u >> (FP + FPI));
 # ifdef COLMUL
+# ifdef SCALE_USING_NEON
+# ifdef COLSAME
+     val1_32x2 = vset_lane_u32(*s, val1_32x2, 0);
+     val1_8x8 = vreinterpret_u8_u32(val1_32x2);
+     val1_16x8 = vmovl_u8(val1_8x8);
+     val1_16x4 = vget_low_u16(val1_16x8);
+     cval_16x4 = c1_16x4;
+# else //COLSAME
+     cv_16x4 = vdup_n_u16(cv>>16);
+     cv += cd; // col
+
+     c2_c1_local_16x4 = vmul_u16(c2_c1_16x4, cv_16x4);
+     c2_c1_local_16x4 = vshr_n_u16(c2_c1_local_16x4, 8);
+     c2_c1_local_16x4 = vadd_u16(c2_c1_local_16x4, c1_16x4);
+     cval_16x4 = vand_u16(c2_c1_local_16x4, x255_16x4);
+     val1_32x2 = vset_lane_u32(*s, val1_32x2, 0);
+     val1_8x8 = vreinterpret_u8_u32(val1_32x2);
+     val1_16x8 = vmovl_u8(val1_8x8);
+     val1_16x4 = vget_low_u16(val1_16x8);
+# endif //COLSAME
+     cval_16x4 = vmul_u16(cval_16x4, val1_16x4);
+     cval_16x4 = vadd_u16(cval_16x4, x255_16x4);
+
+     cval_16x8 = vcombine_u16(cval_16x4, temp_16x4);
+
+     cval_8x8 = vshrn_n_u16(cval_16x8, 8);
+     res_32x2 = vreinterpret_u32_u8(cval_8x8);
+
+     vst1_lane_u32(d, res_32x2, 0);
+# else //SCALE_USING_NEON
     val1 = *s; // col
-# ifdef COLSAME
-# ifdef SCALE_USING_NEON
-    VMOV_M2R_NEON(d1, val1);
-    VEOR_NEON(d0);
-    VZIP_NEON(d1, d0);
-    VMOV_R2R_NEON(d0, d10);
-    MUL4_SYM_NEON(d0, d1, d4)
-    VMOV_R2M_NEON(q0, d0, d);
-# else
+# ifdef COLSAME
    *d = MUL4_SYM(c1, val1);
-# endif //SCALE_USING_NEON
-# else //COLSAME
-/* XXX: this neon is broken! :( FIXME
-# ifdef SCALE_USING_NEON
-    FPU_NEON;
-    VMOV_M2R_NEON(d12, val1);
-    VMOV_R2R_NEON(q4, q5);
-    VEOR_NEON(q1);
-    VDUP_NEON(d15, cv >> 16);
-    VZIP_NEON(q6, q1);
-    INTERP_256_NEON(d15, d9, d8, d4); // interpolate c1 and c2
-    MUL4_SYM_NEON(d8, d12, d4); // multiply
-    VMOV_R2M_NEON(q4, d8, d); // save result
 # else
- */
    cval = INTERP_256((cv >> 16), c2, c1); // col
-    val1 = MUL4_SYM(cval, val1);
-    cv += cd; // col
-/*
+    *d = MUL4_SYM(cval, val1);
+    cv += cd; // col
 # endif
- */
-# endif //COLSAME
-# else //COLMUL
+# endif
+# else
    *d = *s;
 # endif //COLMUL
    u += ud;
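
Note on the arithmetic: the intrinsic sequences in this patch are combinations of the two per-channel identities that the evas macros implement, visible in the C fallback path above: INTERP_256(f, c2, c1) = c1 + (((c2 - c1) * f) >> 8) with f in 0..256, and MUL4_SYM(c, v) = ((c * v + 255) >> 8). Below is a minimal single-pixel sketch of both in intrinsics form. It is not code from the patch, and the helper names interp_256_neon and mul4_sym_neon are illustrative only: each ARGB pixel is widened u8 -> u16, the math runs on four 16-bit channels at once, and the result is narrowed back, just as the loop bodies above do for whole pixel pairs.

#include <arm_neon.h>
#include <stdint.h>

/* INTERP_256: per channel c1 + (((c2 - c1) * f) >> 8), f in 0..256. */
static inline uint32_t
interp_256_neon(uint16_t f, uint32_t c2, uint32_t c1)
{
   uint32x2_t p_32x2 = vset_lane_u32(c2, vdup_n_u32(c1), 1);     /* {c1, c2}          */
   uint16x8_t p_16x8 = vmovl_u8(vreinterpret_u8_u32(p_32x2));    /* widen u8 -> u16   */
   uint16x4_t c1_16x4 = vget_low_u16(p_16x8);
   uint16x4_t d_16x4 = vsub_u16(vget_high_u16(p_16x8), c1_16x4); /* c2 - c1, mod 2^16 */

   d_16x4 = vmul_u16(d_16x4, vdup_n_u16(f));                     /* * f               */
   d_16x4 = vshr_n_u16(d_16x4, 8);                               /* >> 8              */
   d_16x4 = vadd_u16(d_16x4, c1_16x4);                           /* + c1              */
   d_16x4 = vand_u16(d_16x4, vdup_n_u16(0xff));                  /* mask the unsigned
                                                                    wrap when c2 < c1 */

   uint8x8_t r_8x8 = vmovn_u16(vcombine_u16(d_16x4, d_16x4));    /* narrow u16 -> u8  */
   return vget_lane_u32(vreinterpret_u32_u8(r_8x8), 0);
}

/* MUL4_SYM: per channel (c * v + 255) >> 8. */
static inline uint32_t
mul4_sym_neon(uint32_t c, uint32_t v)
{
   uint32x2_t p_32x2 = vset_lane_u32(c, vdup_n_u32(v), 1);       /* {v, c}            */
   uint16x8_t p_16x8 = vmovl_u8(vreinterpret_u8_u32(p_32x2));    /* widen u8 -> u16   */
   uint16x4_t r_16x4 = vmul_u16(vget_high_u16(p_16x8),
                                vget_low_u16(p_16x8));           /* c * v per channel */

   r_16x4 = vadd_u16(r_16x4, vdup_n_u16(0xff));                  /* + 255             */
   uint8x8_t r_8x8 = vshrn_n_u16(vcombine_u16(r_16x4, r_16x4), 8); /* >> 8 and narrow */
   return vget_lane_u32(vreinterpret_u32_u8(r_8x8), 0);
}

The patch gains its throughput by applying the same pattern to two pixels at a time: val1/val3 (and c1/c2) are packed into one uint16x8_t so that a single vmulq_u16 does the work of two INTERP_256 calls. The uninitialized temp_16x4 in the loop bodies is intentional; it only pads the unused high half of the vector before the narrowing shift.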