evas: implement pixel_color blending functions using NEON intrinsics.

Summary: NEON intrinsics can be built both for armv7 and armv8. Implemented functions: _op_blend_pan_c_dp_neon _op_blend_p_can_dp_neon _op_blend_pan_can_dp_neon _op_blend_p_caa_dp_neon _op_blend_pan_caa_dp_neon Reviewers: raster, cedric Subscribers: cedric Projects: #efl Maniphest Tasks: T2341 Differential Revision: https://phab.enlightenment.org/D2409 Signed-off-by: Cedric BAIL <cedric@osg.samsung.com>
2015-04-28 23:36:04 +02:00 · 2015-04-28 23:36:04 +02:00 · 76a5efe13a
parent edfd621d06
commit 76a5efe13a
1 changed files with 495 additions and 219 deletions
--- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c
@ -1,8 +1,3 @@
-#ifdef BUILD_NEON
-#ifdef BUILD_NEON_INTRINSICS
-#include <arm_neon.h>
-#endif
-#endif
 /* blend pixel x color --> dst */
 #ifdef BUILD_NEON

@ -202,240 +197,521 @@ _op_blend_p_c_dp_neon(DATA32 * __restrict s, DATA8 *m EINA_UNUSED, DATA32 c, DAT
 #endif
 }

+
 static void
-_op_blend_pan_can_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
-   DATA32 *e;
-   UNROLL8_PLD_WHILE(d, l, e,
+_op_blend_pan_c_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
+   uint16x8_t ad0_16x8;
+   uint16x8_t ad1_16x8;
+   uint16x8_t sc0_16x8;
+   uint16x8_t sc1_16x8;
+   uint16x8_t x255_16x8;
+   uint32x4_t ad_32x4;
+   uint32x4_t c_32x4;
+   uint32x4_t d_32x4;
+   uint32x4_t mask_32x4;
+   uint32x4_t s_32x4;
+   uint32x4_t sc_32x4;
+   uint8x16_t ad_8x16;
+   uint8x16_t c_8x16;
+   uint8x16_t d_8x16;
+   uint8x16_t mask_8x16;
+   uint8x16_t s_8x16;
+   uint8x16_t sc_8x16;
+   uint8x8_t a_8x8;
+   uint8x8_t ad0_8x8;
+   uint8x8_t ad1_8x8;
+   uint8x8_t c_8x8;
+   uint8x8_t d0_8x8;
+   uint8x8_t d1_8x8;
+   uint8x8_t s0_8x8;
+   uint8x8_t s1_8x8;
+   uint8x8_t sc0_8x8;
+   uint8x8_t sc1_8x8;
+
+   // alpha can only be 0 if color is 0x0. In that case we can just return.
+   // Otherwise we can assume alpha != 0. This allows more optimization in
+   // NEON code.
+
+   if(!c)
+      return;
+
+   unsigned char a;
+   a = ~(c >> 24) + 1; // 256 - (c >> 24)
+
+   a_8x8 = vdup_n_u8(a);
+   c_32x4 = vdupq_n_u32(c);
+   c_8x16 = vreinterpretq_u8_u32(c_32x4);
+   c_8x8 = vget_low_u8(c_8x16);
+   x255_16x8 = vdupq_n_u16(0xff);
+   mask_32x4 = vdupq_n_u32(0xff000000);
+   mask_8x16 = vreinterpretq_u8_u32(mask_32x4);
+
+   DATA32 *end = d + (l & ~3);
+   while (d < end)
   {
-                        *d++ = 0xff000000 + MUL3_SYM(c, *s);
+      // load 4 elements from d
+      d_32x4 = vld1q_u32(d);
+      d_8x16 = vreinterpretq_u8_u32(d_32x4);
+      d0_8x8 = vget_low_u8(d_8x16);
+      d1_8x8 = vget_high_u8(d_8x16);
+
+      // multiply MUL_256(a, *d)
+      ad0_16x8 = vmull_u8(a_8x8, d0_8x8);
+      ad1_16x8 = vmull_u8(a_8x8, d1_8x8);
+      ad0_8x8 = vshrn_n_u16(ad0_16x8,8);
+      ad1_8x8 = vshrn_n_u16(ad1_16x8,8);
+      ad_8x16 = vcombine_u8(ad0_8x8, ad1_8x8);
+      ad_32x4 = vreinterpretq_u32_u8(ad_8x16);
+
+      // load 4 elements from s
+      s_32x4 = vld1q_u32(s);
+      s_8x16 = vreinterpretq_u8_u32(s_32x4);
+      s0_8x8 = vget_low_u8(s_8x16);
+      s1_8x8 = vget_high_u8(s_8x16);
+
+      // multiply MUL_SYM(c, *s);
+      sc0_16x8 = vmull_u8(s0_8x8, c_8x8);
+      sc1_16x8 = vmull_u8(s1_8x8, c_8x8);
+      sc0_16x8 = vaddq_u16(sc0_16x8, x255_16x8);
+      sc1_16x8 = vaddq_u16(sc1_16x8, x255_16x8);
+      sc0_8x8 = vshrn_n_u16(sc0_16x8, 8);
+      sc1_8x8 = vshrn_n_u16(sc1_16x8, 8);
+      sc_8x16 = vcombine_u8(sc0_8x8, sc1_8x8);
+
+      // select alpha channel from c
+      sc_8x16 = vbslq_u8(mask_8x16, c_8x16, sc_8x16);
+      sc_32x4 = vreinterpretq_u32_u8(sc_8x16);
+
+      // add up everything
+      d_32x4 = vaddq_u32(sc_32x4, ad_32x4);
+
+      // save result
+      vst1q_u32(d, d_32x4);
+
+      d+=4;
+      s+=4;
+   }
+
+   end += (l & 3);
+   while (d < end)
+   {
+      *d = ((c & 0xff000000) + MUL3_SYM(c, *s)) + MUL_256(a, *d);
+      d++;
      s++;
-                     });
+   }
+
 }

 static void
-_op_blend_pan_caa_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
-#if 1
-   DATA32 *e;
-   DATA32 sc;
+_op_blend_p_can_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
+   uint16x8_t ad0_16x8;
+   uint16x8_t ad1_16x8;
+   uint16x8_t sc0_16x8;
+   uint16x8_t sc1_16x8;
+   uint16x8_t x255_16x8;
+   uint32x2_t c_32x2;
+   uint32x4_t ad_32x4;
+   uint32x4_t alpha_32x4;
+   uint32x4_t cond_32x4;
+   uint32x4_t d_32x4;
+   uint32x4_t mask_32x4;
+   uint32x4_t s_32x4;
+   uint32x4_t sc_32x4;
+   uint32x4_t x0_32x4;
+   uint32x4_t x1_32x4;
+   uint8x16_t ad_8x16;
+   uint8x16_t alpha_8x16;
+   uint8x16_t d_8x16;
+   uint8x16_t mask_8x16;
+   uint8x16_t s_8x16;
+   uint8x16_t sc_8x16;
+   uint8x16_t x0_8x16;
+   uint8x16_t x1_8x16;
+   uint8x8_t ad0_8x8;
+   uint8x8_t ad1_8x8;
+   uint8x8_t alpha0_8x8;
+   uint8x8_t alpha1_8x8;
+   uint8x8_t c_8x8;
+   uint8x8_t d0_8x8;
+   uint8x8_t d1_8x8;
+   uint8x8_t s0_8x8;
+   uint8x8_t s1_8x8;
+   uint8x8_t sc0_8x8;
+   uint8x8_t sc1_8x8;
+
+   x1_8x16 = vdupq_n_u8(0x1);
+   x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
+   x0_8x16 = vdupq_n_u8(0x0);
+   x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
+   mask_32x4 = vdupq_n_u32(0xff000000);
+   mask_8x16 = vreinterpretq_u8_u32(mask_32x4);
+   c_32x2 = vdup_n_u32(c);
+   c_8x8 = vreinterpret_u8_u32(c_32x2);
+   x255_16x8 = vdupq_n_u16(0xff);
+
+   DATA32 *end = d + (l & ~3);
+   while (d < end)
+   {
+      // load 4 elements from s
+      s_32x4 = vld1q_u32(s);
+      s_8x16 = vreinterpretq_u8_u32(s_32x4);
+      s0_8x8 = vget_low_u8(s_8x16);
+      s1_8x8 = vget_high_u8(s_8x16);
+
+      // load 4 elements from d
+      d_32x4 = vld1q_u32(d);
+      d_8x16 = vreinterpretq_u8_u32(d_32x4);
+      d0_8x8 = vget_low_u8(d_8x16);
+      d1_8x8 = vget_high_u8(d_8x16);
+
+      // calculate alpha = 256 - (*s >> 24)
+      alpha_32x4 = vshrq_n_u32(s_32x4, 24);
+      alpha_32x4 = vmulq_u32(x1_32x4, alpha_32x4);
+      alpha_8x16 = vreinterpretq_u8_u32(alpha_32x4);
+      alpha_8x16 = vsubq_u8(x0_8x16, alpha_8x16);
+      alpha0_8x8 = vget_low_u8(alpha_8x16);
+      alpha1_8x8 = vget_high_u8(alpha_8x16);
+
+      // multiply MUL_SYM(c, *s);
+      sc0_16x8 = vmull_u8(s0_8x8, c_8x8);
+      sc1_16x8 = vmull_u8(s1_8x8, c_8x8);
+      sc0_16x8 = vaddq_u16(sc0_16x8, x255_16x8);
+      sc1_16x8 = vaddq_u16(sc1_16x8, x255_16x8);
+      sc0_8x8 = vshrn_n_u16(sc0_16x8, 8);
+      sc1_8x8 = vshrn_n_u16(sc1_16x8, 8);
+      sc_8x16 = vcombine_u8(sc0_8x8, sc1_8x8);
+
+      // select alpha channel from *s
+      sc_8x16 = vbslq_u8(mask_8x16, s_8x16, sc_8x16);
+      sc_32x4 = vreinterpretq_u32_u8(sc_8x16);
+
+      // multiply MUL_256(a, *d)
+      ad0_16x8 = vmull_u8(alpha0_8x8, d0_8x8);
+      ad1_16x8 = vmull_u8(alpha1_8x8, d1_8x8);
+      ad0_8x8 = vshrn_n_u16(ad0_16x8,8);
+      ad1_8x8 = vshrn_n_u16(ad1_16x8,8);
+      ad_8x16 = vcombine_u8(ad0_8x8, ad1_8x8);
+      ad_32x4 = vreinterpretq_u32_u8(ad_8x16);
+
+      // select d if alpha is 0
+      cond_32x4 = vceqq_u32(alpha_32x4, x0_32x4);
+      ad_32x4 = vbslq_u32(cond_32x4, d_32x4, ad_32x4);
+
+      // add up everything
+      d_32x4 = vaddq_u32(sc_32x4, ad_32x4);
+
+      // save result
+      vst1q_u32(d, d_32x4);
+
+      d+=4;
+      s+=4;
+   }
+
+   end += (l & 3);
+   int alpha;
+   while (d < end)
+   {
+      alpha = 256 - (*s >> 24);
+      *d = ((*s & 0xff000000) + MUL3_SYM(c, *s)) + MUL_256(alpha, *d);
+      d++;
+      s++;
+   }
+}
+
+static void
+_op_blend_pan_can_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
+   uint16x8_t sc00_16x8;
+   uint16x8_t sc01_16x8;
+   uint16x8_t sc10_16x8;
+   uint16x8_t sc11_16x8;
+   uint16x8_t x255_16x8;
+   uint32x2_t c_32x2;
+   uint32x4_t d0_32x4;
+   uint32x4_t d1_32x4;
+   uint32x4_t mask_32x4;
+   uint32x4_t s0_32x4;
+   uint32x4_t s1_32x4;
+   uint32x4_t sc0_32x4;
+   uint32x4_t sc1_32x4;
+   uint8x16_t s0_8x16;
+   uint8x16_t s1_8x16;
+   uint8x16_t sc0_8x16;
+   uint8x16_t sc1_8x16;
+   uint8x8_t c_8x8;
+   uint8x8_t s00_8x8;
+   uint8x8_t s01_8x8;
+   uint8x8_t s10_8x8;
+   uint8x8_t s11_8x8;
+   uint8x8_t sc00_8x8;
+   uint8x8_t sc01_8x8;
+   uint8x8_t sc10_8x8;
+   uint8x8_t sc11_8x8;
+
+   mask_32x4 = vdupq_n_u32(0xff000000);
+   x255_16x8 = vdupq_n_u16(0xff);
+   c_32x2 = vdup_n_u32(c);
+   c_8x8 = vreinterpret_u8_u32(c_32x2);
+
+   DATA32 *end = d + (l & ~7);
+   while (d < end)
+   {
+      // load 8 elements from s
+      s0_32x4 = vld1q_u32(s);
+      s0_8x16 = vreinterpretq_u8_u32(s0_32x4);
+      s00_8x8 = vget_low_u8(s0_8x16);
+      s01_8x8 = vget_high_u8(s0_8x16);
+      s1_32x4 = vld1q_u32(s+4);
+      s1_8x16 = vreinterpretq_u8_u32(s1_32x4);
+      s10_8x8 = vget_low_u8(s1_8x16);
+      s11_8x8 = vget_high_u8(s1_8x16);
+
+      // multiply MUL_SYM(c, *s);
+      sc00_16x8 = vmull_u8(s00_8x8, c_8x8);
+      sc01_16x8 = vmull_u8(s01_8x8, c_8x8);
+      sc10_16x8 = vmull_u8(s10_8x8, c_8x8);
+      sc11_16x8 = vmull_u8(s11_8x8, c_8x8);
+      sc00_16x8 = vaddq_u16(sc00_16x8, x255_16x8);
+      sc01_16x8 = vaddq_u16(sc01_16x8, x255_16x8);
+      sc10_16x8 = vaddq_u16(sc10_16x8, x255_16x8);
+      sc11_16x8 = vaddq_u16(sc11_16x8, x255_16x8);
+      sc00_8x8 = vshrn_n_u16(sc00_16x8, 8);
+      sc01_8x8 = vshrn_n_u16(sc01_16x8, 8);
+      sc10_8x8 = vshrn_n_u16(sc10_16x8, 8);
+      sc11_8x8 = vshrn_n_u16(sc11_16x8, 8);
+      sc0_8x16 = vcombine_u8(sc00_8x8, sc01_8x8);
+      sc1_8x16 = vcombine_u8(sc10_8x8, sc11_8x8);
+
+      // add alpha channel
+      sc0_32x4 = vreinterpretq_u32_u8(sc0_8x16);
+      sc1_32x4 = vreinterpretq_u32_u8(sc1_8x16);
+      d0_32x4 = vorrq_u32(sc0_32x4, mask_32x4);
+      d1_32x4 = vorrq_u32(sc1_32x4, mask_32x4);
+
+      // save result
+      vst1q_u32(d, d0_32x4);
+      vst1q_u32(d+4, d1_32x4);
+
+      d+=8;
+      s+=8;
+   }
+
+   end += (l & 7);
+   while (d < end)
+   {
+      *d++ = 0xff000000 + MUL3_SYM(c, *s);
+      s++;
+   }
+}
+
+static void
+_op_blend_p_caa_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
+   uint16x8_t ad0_16x8;
+   uint16x8_t ad1_16x8;
+   uint16x8_t cs0_16x8;
+   uint16x8_t cs1_16x8;
+   uint32x4_t ad_32x4;
+   uint32x4_t alpha_32x4;
+   uint32x4_t c_32x4;
+   uint32x4_t cond_32x4;
+   uint32x4_t cs_32x4;
+   uint32x4_t d_32x4;
+   uint32x4_t s_32x4;
+   uint32x4_t x0_32x4;
+   uint32x4_t x1_32x4;
+   uint8x16_t ad_8x16;
+   uint8x16_t alpha_8x16;
+   uint8x16_t c_8x16;
+   uint8x16_t cs_8x16;
+   uint8x16_t d_8x16;
+   uint8x16_t s_8x16;
+   uint8x16_t x0_8x16;
+   uint8x16_t x1_8x16;
+   uint8x8_t ad0_8x8;
+   uint8x8_t ad1_8x8;
+   uint8x8_t alpha0_8x8;
+   uint8x8_t alpha1_8x8;
+   uint8x8_t c_8x8;
+   uint8x8_t cs0_8x8;
+   uint8x8_t cs1_8x8;
+   uint8x8_t d0_8x8;
+   uint8x8_t d1_8x8;
+   uint8x8_t s0_8x8;
+   uint8x8_t s1_8x8;
+
+   int temp = (1 + c) & 0xff;
+
+   x1_8x16 = vdupq_n_u8(0x1);
+   x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
+   c_32x4 = vdupq_n_u32(temp);
+   c_32x4 = vmulq_u32(x1_32x4, c_32x4);
+   c_8x16 = vreinterpretq_u8_u32(c_32x4);
+   c_8x8 = vget_low_u8(c_8x16);
+   x0_8x16 = vdupq_n_u8(0x0);
+   x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
+
+   DATA32 *end = d + (l & ~3);
+   while (d < end)
+   {
+      // load 4 elements from s
+      s_32x4 = vld1q_u32(s);
+      s_8x16 = vreinterpretq_u8_u32(s_32x4);
+      s0_8x8 = vget_low_u8(s_8x16);
+      s1_8x8 = vget_high_u8(s_8x16);
+
+      // multiply MUL_256(c, *s)
+      cs0_16x8 = vmull_u8(c_8x8, s0_8x8);
+      cs1_16x8 = vmull_u8(c_8x8, s1_8x8);
+      cs0_8x8 = vshrn_n_u16(cs0_16x8,8);
+      cs1_8x8 = vshrn_n_u16(cs1_16x8,8);
+      cs_8x16 = vcombine_u8(cs0_8x8, cs1_8x8);
+      cs_32x4 = vreinterpretq_u32_u8(cs_8x16);
+
+      // select s if c is 0
+      cond_32x4 = vceqq_u32(c_32x4, x0_32x4);
+      cs_32x4 = vbslq_u32(cond_32x4, s_32x4 , cs_32x4);
+
+      // calculate alpha = 256 - (*s >> 24)
+      alpha_32x4 = vshrq_n_u32(cs_32x4, 24);
+      alpha_32x4 = vmulq_u32(x1_32x4, alpha_32x4);
+      alpha_8x16 = vreinterpretq_u8_u32(alpha_32x4);
+      alpha_8x16 = vsubq_u8(x0_8x16, alpha_8x16);
+      alpha0_8x8 = vget_low_u8(alpha_8x16);
+      alpha1_8x8 = vget_high_u8(alpha_8x16);
+
+      // load 4 elements from d
+      d_32x4 = vld1q_u32(d);
+      d_8x16 = vreinterpretq_u8_u32(d_32x4);
+      d0_8x8 = vget_low_u8(d_8x16);
+      d1_8x8 = vget_high_u8(d_8x16);
+
+      // multiply MUL_256(a, *d)
+      ad0_16x8 = vmull_u8(alpha0_8x8, d0_8x8);
+      ad1_16x8 = vmull_u8(alpha1_8x8, d1_8x8);
+      ad0_8x8 = vshrn_n_u16(ad0_16x8,8);
+      ad1_8x8 = vshrn_n_u16(ad1_16x8,8);
+      ad_8x16 = vcombine_u8(ad0_8x8, ad1_8x8);
+      ad_32x4 = vreinterpretq_u32_u8(ad_8x16);
+
+      // select d if alpha is 0
+      alpha_32x4 = vreinterpretq_u32_u8(alpha_8x16);
+      cond_32x4 = vceqq_u32(alpha_32x4, x0_32x4);
+      ad_32x4 = vbslq_u32(cond_32x4, d_32x4 , ad_32x4);
+
+      // add up everything
+      d_32x4 = vaddq_u32(cs_32x4, ad_32x4);
+
+      // save result
+      vst1q_u32(d, d_32x4);
+
+      d+=4;
+      s+=4;
+   }
+
+   end += (l & 3);
   int alpha;
   c = 1 + (c & 0xff);
-   UNROLL8_PLD_WHILE(d, l, e,
+   while (d < end)
   {
-                       sc = MUL_256(c, *s);
+      DATA32 sc = MUL_256(c, *s);
      alpha = 256 - (sc >> 24);
      *d = sc + MUL_256(alpha, *d);
      d++;
      s++;
-                    });
-#else // the below neon is buggy!! misses rendering of spans, i think with alignment. quick - just disable this.
-#define AP	"_op_blend_pan_caa_dp_"
-   DATA32 *e = d + l, *tmp = (void*)73;
-      asm volatile (
-	".fpu neon					\n\t"
-		/* Set up 'c' */
-		"vdup.u8     d14, %[c]		\n\t"
-		"vmov.i8     d15, #1		\n\t"
-		"vaddl.u8   q15, d14, d15	\n\t"
-		"vshr.u8	q15,#1		\n\t"
+   }

-		// Pick a loop
-		"andS		%[tmp], %[d], $0xf	\n\t"
-		"beq		"AP"quadstart		\n\t"
+}

-		"andS		%[tmp], %[d], $0x4	\n\t"
-		"beq		"AP"dualstart		\n\t"
+static void
+_op_blend_pan_caa_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
+   int16x8_t c_i16x8;
+   int16x8_t d0_i16x8;
+   int16x8_t d1_i16x8;
+   int16x8_t ds0_i16x8;
+   int16x8_t ds1_i16x8;
+   int16x8_t s0_i16x8;
+   int16x8_t s1_i16x8;
+   int8x16_t ds_i8x16;
+   int8x8_t ds0_i8x8;
+   int8x8_t ds1_i8x8;
+   uint16x8_t c_16x8;
+   uint16x8_t d0_16x8;
+   uint16x8_t d1_16x8;
+   uint16x8_t s0_16x8;
+   uint16x8_t s1_16x8;
+   uint32x4_t d_32x4;
+   uint32x4_t ds_32x4;
+   uint32x4_t s_32x4;
+   uint8x16_t d_8x16;
+   uint8x16_t s_8x16;
+   uint8x8_t d0_8x8;
+   uint8x8_t d1_8x8;
+   uint8x8_t s0_8x8;
+   uint8x8_t s1_8x8;

-	AP"singleloop:					\n\t"
-		"vld1.32	d4[0],  [%[d]]		\n\t"
-		"vld1.32	d0[0],  [%[s]]!		\n\t"
+   c = 1 + (c & 0xff);

-		// Long version of 'd'
-		"vmovl.u8	q8, d4			\n\t"
+   c_16x8 = vdupq_n_u16(c);
+   c_i16x8 = vreinterpretq_s16_u16(c_16x8);

-		// Long version of 's'
-		"vmovl.u8	q6, d0			\n\t"
+   DATA32 *end = d + (l & ~3);
+   while (d < end)
+   {
+      // load 4 elements from d
+      d_32x4 = vld1q_u32(d);
+      d_8x16 = vreinterpretq_u8_u32(d_32x4);
+      d0_8x8 = vget_low_u8(d_8x16);
+      d1_8x8 = vget_high_u8(d_8x16);

-		// d8 = s -d
-		"vsub.s16	d8, d12, d16		\n\t"
+      // spread d so that each channel occupies 16 bit
+      d0_16x8 = vmovl_u8(d0_8x8);
+      d1_16x8 = vmovl_u8(d1_8x8);
+      d0_i16x8 = vreinterpretq_s16_u16(d0_16x8);
+      d1_i16x8 = vreinterpretq_s16_u16(d1_16x8);

-		// Multiply
-		"vmul.s16	d8, d8, d30		\n\t"
+      // load 4 elements from s
+      s_32x4 = vld1q_u32(s);
+      s_8x16 = vreinterpretq_u8_u32(s_32x4);
+      s0_8x8 = vget_low_u8(s_8x16);
+      s1_8x8 = vget_high_u8(s_8x16);

-		// Shift down
-		"vshr.s16	d8, #7			\n\t"
+      // spread s so that each channel occupies 16 bit
+      s0_16x8 = vmovl_u8(s0_8x8);
+      s1_16x8 = vmovl_u8(s1_8x8);
+      s0_i16x8 = vreinterpretq_s16_u16(s0_16x8);
+      s1_i16x8 = vreinterpretq_s16_u16(s1_16x8);

-		// Add 'd'
-		"vqadd.s16	d8, d8, d16		\n\t"
+      // interpolate
+      ds0_i16x8 = vsubq_s16(s0_i16x8, d0_i16x8);
+      ds1_i16x8 = vsubq_s16(s1_i16x8, d1_i16x8);
+      ds0_i16x8 = vmulq_s16(ds0_i16x8, c_i16x8);
+      ds1_i16x8 = vmulq_s16(ds1_i16x8, c_i16x8);
+      ds0_i16x8 = vshrq_n_s16(ds0_i16x8, 8);
+      ds1_i16x8 = vshrq_n_s16(ds1_i16x8, 8);
+      ds0_i16x8 = vaddq_s16(ds0_i16x8, d0_i16x8);
+      ds1_i16x8 = vaddq_s16(ds1_i16x8, d1_i16x8);
+      ds0_i8x8 = vmovn_s16(ds0_i16x8);
+      ds1_i8x8 = vmovn_s16(ds1_i16x8);

-		// Shrink to save
-		"vqmovun.s16	d0,  q4			\n\t"
-		"vst1.32	d0[0], [%[d]]!		\n\t"
+      // save result
+      ds_i8x16 = vcombine_s8(ds0_i8x8, ds1_i8x8);
+      ds_32x4 = vreinterpretq_u32_s8(ds_i8x16);
+      vst1q_u32(d, ds_32x4);

-		// Now where?
-		"andS		%[tmp], %[d], $0xf	\n\t"
-		"beq		"AP"quadstart		\n\t"
+      d+=4;
+      s+=4;
+   }

-	AP"dualstart:					\n\t"
-		// Check we have enough
-		"sub		%[tmp], %[e], %[d]	\n\t"
-		"cmp		%[tmp], #16		\n\t"
-		"blt		"AP"loopout		\n\t"
-
-	AP"dualloop:"
-		"vldm		%[d], 	{d4}		\n\t"
-		"vldm		%[s]!,	{d0}		\n\t"
-
-		// Long version of d
-		"vmovl.u8	q8, d4		\n\t"
-
-		// Long version of s
-		"vmovl.u8	q6, d0		\n\t"
-
-		// q4/q5 = s-d
-		"vsub.s16 	q4, q6, q8 	\n\t"
-
-		// Multiply
-		"vmul.s16	q4,  q4,q15	\n\t"
-
-		// Shift down
-		"vshr.s16	q4, #7		\n\t"
-
-		// Add d
-		"vqadd.s16	q4, q4, q8	\n\t"
-
-		// Shrink to save
-		"vqmovun.s16	d0,  q4		\n\t"
-
-		"vstm		%[d]!,	{d0}	\n\t"
-	AP"quadstart:					\n\t"
-		"sub		%[tmp], %[e], %[d]	\n\t"
-		"cmp		%[tmp], #16		\n\t"
-		"blt		"AP"loopout		\n\t"
-
-		"sub		%[tmp], %[e], #15	\n\t"
-
-	AP"quadloop:				\n\t"
-		//  load 's' -> q0, 'd' -> q2
-		"vldm	%[d],  {d4,d5}		\n\t"
-		"vldm	%[s]!, {d0,d1}		\n\t"
-
-		// Long version of d
-		"vmovl.u8	q8, d4		\n\t"
-		"vmovl.u8	q9, d5		\n\t"
-
-		// Long version of s
-		"vmovl.u8	q6, d0		\n\t"
-		"vmovl.u8	q7, d1		\n\t"
-
-		// q4/q5 = s-d
-		"vsub.s16 	q4, q6, q8 	\n\t"
-		"vsub.s16 	q5, q7, q9 	\n\t"
-
-		// Multiply
-		"vmul.s16	q4,  q4,q15	\n\t"
-		"vmul.s16	q5,  q5,q15	\n\t"
-
-		// Shift down
-		"vshr.s16	q4, #7		\n\t"
-		"vshr.s16	q5, #7		\n\t"
-
-		// Add d
-		"vqadd.s16	q4, q4, q8	\n\t"
-		"vqadd.s16	q5, q5, q9	\n\t"
-
-		// Shrink to save
-		"vqmovun.s16	d0,  q4		\n\t"
-		"vqmovun.s16	d1,  q5		\n\t"
-		"vstm		%[d]!,	{d0,d1}	\n\t"
-		"cmp		%[tmp], %[d] 		\n\t"
-
-		"bhi "AP"quadloop\n\t"
-
-
-		"b "AP"done\n\t"
-	AP"loopout:					\n\t"
-		"cmp 		%[d], %[e]		\n\t"
-                "beq 		"AP"done\n\t"
-		"sub		%[tmp],%[e], %[d]	\n\t"
-		"cmp		%[tmp],$0x04		\n\t"
-		"beq		"AP"singleloop2		\n\t"
-
-	AP"dualloop2:					\n\t"
-		"vldm		%[d], 	{d4}		\n\t"
-		"vldm		%[s]!,	{d0}		\n\t"
-
-		// Long version of d
-		"vmovl.u8	q8, d4		\n\t"
-
-		// Long version of s
-		"vmovl.u8	q6, d0		\n\t"
-
-		// q4/q5 = s-d
-		"vsub.s16 	q4, q6, q8 	\n\t"
-
-		// Multiply
-		"vmul.s16	q4,  q4,q15	\n\t"
-
-		// Shift down
-		"vshr.s16	q4, #7		\n\t"
-
-		// Add d
-		"vqadd.s16	q4, q4, q8	\n\t"
-
-		// Shrink to save
-		"vqmovun.s16	d0,  q4		\n\t"
-
-		"vstm		%[d]!,	{d0}	\n\t"
-
-		"cmp 		%[d], %[e]		\n\t"
-                "beq 		"AP"done		\n\t"
-
-	AP"singleloop2:					\n\t"
-		"vld1.32	d4[0],  [%[d]]		\n\t"
-		"vld1.32	d0[0],  [%[s]]!		\n\t"
-
-		// Long version of 'd'
-		"vmovl.u8	q8, d4			\n\t"
-
-		// Long version of 's'
-		"vmovl.u8	q6, d0			\n\t"
-
-		// d8 = s -d
-		"vsub.s16	d8, d12, d16		\n\t"
-
-		// Multiply
-		"vmul.s16	d8, d8, d30		\n\t"
-
-		// Shift down
-		"vshr.s16	d8, #7			\n\t"
-
-		// Add 'd'
-		"vqadd.s16	d8, d8, d16		\n\t"
-
-		// Shrink to save
-		"vqmovun.s16	d0,  q4			\n\t"
-
-		"vst1.32	d0[0], [%[d]]		\n\t"
-
-
-	AP"done:					\n\t"
-
-	// No output
-	:
-	// Input
-	: [s] "r" (s), [d] "r" (d), [e] "r" (e), [c] "r" (c), [tmp] "r" (tmp)
-	// Clobbered
-	: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "memory"
-      );
-#undef AP
-#endif   
+   end += (l & 3);
+   while (d < end)
+   {
+      *d = INTERP_256(c, *s, *d);
+      d++;
+      s++;
+   }
 }

 #define _op_blend_pas_c_dp_neon _op_blend_p_c_dp_neon
-#define _op_blend_pan_c_dp_neon _op_blend_p_c_dp_neon
-#define _op_blend_p_can_dp_neon _op_blend_p_c_dp_neon
 #define _op_blend_pas_can_dp_neon _op_blend_p_c_dp_neon
-#define _op_blend_p_caa_dp_neon _op_blend_p_c_dp_neon
 #define _op_blend_pas_caa_dp_neon _op_blend_p_c_dp_neon

 #define _op_blend_p_c_dpan_neon _op_blend_p_c_dp_neon