#ifndef EVAS_BLEND_OPS_H
#define EVAS_BLEND_OPS_H

#include "config.h"

#if defined BUILD_MMX || defined BUILD_SSE
#include "evas_mmx.h"
#endif

#ifdef NEED_SSE3
# if defined BUILD_SSE3
#  include <immintrin.h>
# endif
#endif

/* src pixel flags: */

/* pixels none */
#define SP_N 0
/* pixels (argb default) */
#define SP 1
/* pixels are rgb (ie. alphas == 255) */
#define SP_AN 2
/* pixels alpha are sparse */
#define SP_AS 3
/* src pixels flags count */
#define SP_LAST 4

/* src mask flags: */

/* mask none */
#define SM_N 0
/* mask (alpha) */
#define SM 1
/* mask alphas are 'trivial' - ie. only 0 or 255 */
#define SM_AT 2
/* mask alphas are sparse */
#define SM_AS 3
/* src mask flags count */
#define SM_LAST 4

/* src color flags: */

/* color is 0xffffffff */
#define SC_N 0
/* color (argb default) */
#define SC 1
/* color is rgb (ie. 0xffrrggbb) */
#define SC_AN 2
/* color is 'alpha' (ie. 0xaaaaaaaa) */
#define SC_AA 3
/* src color flags count */
#define SC_LAST 4

/* dst pixels flags: */

/* pixels (argb default) */
#define DP 0
/* pixels are rgb (ie. alphas == 255) */
#define DP_AN 1
/* dst pixels flags count */
#define DP_LAST 2

/* cpu types flags */

/* none, bad news */
#define CPU_N 0
/* cpu C */
#define CPU_C 1
/* cpu MMX */
#define CPU_MMX 2
/* cpu SSE */
#define CPU_SSE 3
/* cpu SSE2 */
#define CPU_SSE2 4
/* cpu NEON */
#define CPU_NEON 5
/* cpu SSE3 */
#define CPU_SSE3 6
/* cpu flags count */
#define CPU_LAST 7

/* some useful constants */

extern const DATA32 ALPHA_255;
extern const DATA32 ALPHA_256;

/* some useful C macros */

#define MUL4_256(a, r, g, b, c) \
 ( (((((c) >> 8) & 0xff0000) * (a)) & 0xff000000) + \
   (((((c) & 0xff0000) * (r)) >> 8) & 0xff0000) + \
   (((((c) & 0xff00) * (g)) >> 8) & 0xff00) + \
   ((((c) & 0xff) * (b)) >> 8) )

#define MUL3_256(r, g, b, c) \
 ( (((((c) & 0xff0000) * (r)) >> 8) & 0xff0000) + \
   (((((c) & 0xff00) * (g)) >> 8) & 0xff00) + \
   ((((c) & 0xff) * (b)) >> 8) )

#define MUL_256(a, c) \
 ( (((((c) >> 8) & 0x00ff00ff) * (a)) & 0xff00ff00) + \
   (((((c) & 0x00ff00ff) * (a)) >> 8) & 0x00ff00ff) )

#define MUL4_SYM(x, y) \
 ( ((((((x) >> 16) & 0xff00) * (((y) >> 16) & 0xff00)) + 0xff0000) & 0xff000000) + \
   ((((((x) >> 8) & 0xff00) * (((y) >> 16) & 0xff)) + 0xff00) & 0xff0000) + \
   ((((((x) & 0xff00) * ((y) & 0xff00)) + 0xff0000) >> 16) & 0xff00) + \
   (((((x) & 0xff) * ((y) & 0xff)) + 0xff) >> 8) )

#define MUL3_SYM(x, y) \
 ( ((((((x) >> 8) & 0xff00) * (((y) >> 16) & 0xff)) + 0xff00) & 0xff0000) + \
   ((((((x) & 0xff00) * ((y) & 0xff00)) + 0xff0000) >> 16) & 0xff00) + \
   (((((x) & 0xff) * ((y) & 0xff)) + 0xff) >> 8) )

#define MUL_SYM(a, x) \
 ( (((((x) >> 8) & 0x00ff00ff) * (a) + 0xff00ff) & 0xff00ff00) + \
   (((((x) & 0x00ff00ff) * (a) + 0xff00ff) >> 8) & 0x00ff00ff) )

#define MUL_A_256(a, c) \
 ( ((((c) >> 8) & 0x00ff0000) * (a)) & 0xff000000 )

#define MUL_A_SYM(a, c) \
 ( (((((c) >> 8) & 0x00ff0000) * (a)) + 0x00ff0000) & 0xff000000 )

#define INTERP_256(a, c0, c1) \
 ( (((((((c0) >> 8) & 0xff00ff) - (((c1) >> 8) & 0xff00ff)) * (a)) \
   + ((c1) & 0xff00ff00)) & 0xff00ff00) + \
   (((((((c0) & 0xff00ff) - ((c1) & 0xff00ff)) * (a)) >> 8) \
   + ((c1) & 0xff00ff)) & 0xff00ff) )

#define INTERP_RGB_256(a, c0, c1) \
 ( (((((((c0) >> 8) & 0xff) - (((c1) >> 8) & 0xff)) * (a)) \
   + ((c1) & 0xff00)) & 0xff00) + \
   (((((((c0) & 0xff00ff) - ((c1) & 0xff00ff)) * (a)) >> 8) \
   + ((c1) & 0xff00ff)) & 0xff00ff) )

#define INTERP_A_256(a, c0, c1) \
 ( (((((((c0) >> 8) & 0xff0000) - (((c1) >> 8) & 0xff0000)) * (a)) \
   + ((c1) & 0xff000000)) & 0xff000000) )
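/* Note on the conventions used by the macros above: the *_256 variants take a
 * blend factor in the 1..256 range (typically "alpha + 1"), so a factor of 256
 * returns the colour unchanged; the *_SYM variants take a plain 0..255 value
 * and add a rounding bias so that 255 * 255 still maps back to 255.  A minimal
 * usage sketch, assuming DATA32 pixels packed as 0xAARRGGBB (fade_pixel is a
 * hypothetical helper, not part of this header):
 *
 *   static inline DATA32
 *   fade_pixel(DATA32 p, int a)              // a in 0..255
 *   {
 *      return MUL_256(a + 1, p);             // scales all four channels by ~a/255
 *   }
 *
 * INTERP_256(a, c0, c1) follows the same convention and yields, per channel,
 * roughly c1 + ((c0 - c1) * a) / 256.
 */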
/* some useful MMX macros */
#ifdef BUILD_MMX
#define MOV_A2R(a, mma) \
	movd_m2r(a, mma); \
	punpcklwd_r2r(mma, mma); \
	punpckldq_r2r(mma, mma);

#define MOV_P2R(c, mmc, mmz) \
	movd_m2r(c, mmc); \
	punpcklbw_r2r(mmz, mmc);

#define MOV_R2P(mmc, c, mmz) \
	packuswb_r2r(mmz, mmc); \
	movd_r2m(mmc, c);

#define MUL4_256_R2R(mmx, mmy) \
	pmullw_r2r(mmx, mmy); \
	psrlw_i2r(8, mmy);

#define MUL4_SYM_R2R(mmx, mmy, mm255) \
	pmullw_r2r(mmx, mmy); \
	paddw_r2r(mm255, mmy); \
	psrlw_i2r(8, mmy);

#define MOV_RA2R(mmx, mma) \
	movq_r2r(mmx, mma); \
	punpckhwd_r2r(mma, mma); \
	punpckhdq_r2r(mma, mma);

#define MOV_PA2R(c, mma) \
	movd_m2r(c, mma); \
	punpcklbw_r2r(mma, mma); \
	punpckhwd_r2r(mma, mma); \
	punpckhdq_r2r(mma, mma);

#define INTERP_256_R2R(mma, mmx, mmy, mm255) \
	psubw_r2r(mmy, mmx); \
	pmullw_r2r(mma, mmx); \
	psrlw_i2r(8, mmx); \
	paddw_r2r(mmx, mmy); \
	pand_r2r(mm255, mmy);

#endif

/* some useful NEON macros */
#ifdef BUILD_NEON
#define FPU_NEON \
	__asm__ __volatile__(".fpu neon \n\t");

/* copy reg1 to reg2 */
#define VMOV_R2R_NEON(reg1, reg2) \
	__asm__ __volatile__("vmov " #reg1 ", " #reg2 " \n\t" ::: #reg1);

/* copy 32bit value to lower bits of register reg */
#define VMOV_M2R_NEON(reg, value) \
	__asm__ __volatile__("vmov.32 " #reg "[0], %[val] \n\t" :: [val] "r" (value) : #reg);

/* save 32bit value from lower 64 bits of register regq to memory location */
/* pointed to by pointer, using 64bit register regd as temporary location */
#define VMOV_R2M_NEON(regq, regd, pointer) \
	__asm__ __volatile__("vqmovn.u16 " #regd ", " #regq " \n\t" \
		"vst1.32 {" #regd "[0]}, [%[p]] \n\t" :: [p] "r" (pointer) : #regd, "memory");

/* spread constant imm in register reg */
#define VMOV_I2R_NEON(reg, imm) \
	__asm__ __volatile__("vmov.i16 " #reg ", " #imm " \n\t" ::: #reg);

/* spread value in register reg */
#define VDUP_NEON(reg, value) \
	__asm__ __volatile__("vdup.16 " #reg ", %[val] \n\t" :: [val] "r" (value) : #reg);

/* interleave contents of reg1 and reg2 */
#define VZIP_NEON(reg1, reg2) \
	__asm__ __volatile__("vzip.8 " #reg1 ", " #reg2 " \n\t" ::: #reg1, #reg2);

/* swap contents of two registers */
#define VSWP_NEON(reg1, reg2) \
	__asm__ __volatile__("vswp " #reg1 ", " #reg2 " \n\t" ::: #reg1, #reg2);

/* set register to zero */
#define VEOR_NEON(reg) \
	__asm__ __volatile__("veor " #reg ", " #reg ", " #reg " \n\t" ::: #reg);

/* do interpolation of every channel RGBA, result is contained in regy */
#define INTERP_256_NEON(rega, regx, regy, reg255) \
	__asm__ __volatile__("vsub.i16 " #regx ", " #regx ", " #regy " \n\t" \
		"vmul.u16 " #regx ", " #regx ", " #rega " \n\t" \
		"vsri.16 " #regx ", " #regx ", #8 \n\t" \
		"vadd.i16 " #regx ", " #regx ", " #regy " \n\t" \
		"vand " #regy ", " #regx ", " #reg255 " \n\t" \
		::: #regx, #regy );

/* multiply every channel of regx and regy */
#define MUL4_SYM_NEON(regx, regy, reg255) \
	__asm__ __volatile__("vmul.u16 " #regx ", " #regx ", " #regy " \n\t" \
		"vadd.i16 " #regx ", " #regx ", " #reg255 " \n\t" \
		"vsri.16 " #regx ", " #regx ", #8 \n\t" \
		"vand " #regx ", " #regx ", " #reg255 " \n\t" \
		::: #regx );

#endif
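/* The MMX and NEON macros above mirror the C macros: for example
 * INTERP_256_R2R and INTERP_256_NEON compute the same
 * "c1 + ((c0 - c1) * a) / 256" blend as INTERP_256, only on pixels that have
 * already been expanded to one 16-bit word per channel.  A rough, hypothetical
 * MMX sequence for blending one src pixel into *dst could look like:
 *
 *   pxor_r2r(mm0, mm0);                  // mm0 = 0, needed to unpack bytes
 *   MOV_A2R(a, mm3)                      // replicate 1..256 factor into 4 words
 *   MOV_A2R(ALPHA_255, mm5)              // 0x00ff in each word, for the final mask
 *   MOV_P2R(src, mm1, mm0)               // unpack src to words
 *   MOV_P2R(*dst, mm2, mm0)              // unpack dst to words
 *   INTERP_256_R2R(mm3, mm1, mm2, mm5)   // mm2 = interpolated pixel
 *   MOV_R2P(mm2, *dst, mm0)              // repack to 4 bytes and store
 */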
/* some useful SSE3 inline functions */
#ifdef NEED_SSE3
#ifdef BUILD_SSE3

static __m128i GA_MASK_SSE3;
static __m128i RB_MASK_SSE3;
static __m128i SYM4_MASK_SSE3;
static __m128i RGB_MASK_SSE3;
//static __m128i A_MASK_SSE3;
static __m128i ALPHA_SSE3;

#ifndef EFL_ALWAYS_INLINE
# define EFL_ALWAYS_INLINE inline
#endif

static EFL_ALWAYS_INLINE __m128i
mul_256_sse3(__m128i a, __m128i c)
{
   /* prepare alpha for word multiplication */
   __m128i a_l = a;
   __m128i a_h = a;
   a_l = _mm_unpacklo_epi16(a_l, a_l);
   a_h = _mm_unpackhi_epi16(a_h, a_h);
   __m128i a0 = (__m128i) _mm_shuffle_ps( (__m128)a_l, (__m128)a_h, 0x88);

   /* first half of calc */
   __m128i c0 = c;
   c0 = _mm_srli_epi32(c0, 8);
   c0 = _mm_and_si128(GA_MASK_SSE3, c0);
   c0 = _mm_mullo_epi16(a0, c0);
   c0 = _mm_and_si128(RB_MASK_SSE3, c0);

   /* second half of calc */
   __m128i c1 = c;
   c1 = _mm_and_si128(GA_MASK_SSE3, c1);
   c1 = _mm_mullo_epi16(a0, c1);
   c1 = _mm_srli_epi32(c1, 8);
   c1 = _mm_and_si128(GA_MASK_SSE3, c1);

   /* combine */
   return _mm_add_epi32(c0, c1);
}

static EFL_ALWAYS_INLINE __m128i
sub4_alpha_sse3(__m128i c)
{
   __m128i c0 = c;

   c0 = _mm_srli_epi32(c0, 24);

   return _mm_sub_epi32(ALPHA_SSE3, c0);
}

static EFL_ALWAYS_INLINE __m128i
interp4_256_sse3(__m128i a, __m128i c0, __m128i c1)
{
   const __m128i zero = _mm_setzero_si128();

   __m128i a_l = a;
   __m128i a_h = a;
   a_l = _mm_unpacklo_epi16(a_l, a_l);
   a_h = _mm_unpackhi_epi16(a_h, a_h);

   __m128i a_t = _mm_slli_epi64(a_l, 32);
   __m128i a_t0 = _mm_slli_epi64(a_h, 32);

   a_l = _mm_add_epi32(a_l, a_t);
   a_h = _mm_add_epi32(a_h, a_t0);

   __m128i c0_l = c0;
   __m128i c0_h = c0;
   c0_l = _mm_unpacklo_epi8(c0_l, zero);
   c0_h = _mm_unpackhi_epi8(c0_h, zero);

   __m128i c1_l = c1;
   __m128i c1_h = c1;
   c1_l = _mm_unpacklo_epi8(c1_l, zero);
   c1_h = _mm_unpackhi_epi8(c1_h, zero);

   __m128i cl_sub = _mm_sub_epi16(c0_l, c1_l);
   __m128i ch_sub = _mm_sub_epi16(c0_h, c1_h);

   cl_sub = _mm_mullo_epi16(cl_sub, a_l);
   ch_sub = _mm_mullo_epi16(ch_sub, a_h);

   __m128i c1ls = _mm_slli_epi16(c1_l, 8);
   __m128i c1hs = _mm_slli_epi16(c1_h, 8);

   cl_sub = _mm_add_epi16(cl_sub, c1ls);
   ch_sub = _mm_add_epi16(ch_sub, c1hs);

   cl_sub = _mm_and_si128(cl_sub, RB_MASK_SSE3);
   ch_sub = _mm_and_si128(ch_sub, RB_MASK_SSE3);

   cl_sub = _mm_srli_epi64(cl_sub, 8);
   ch_sub = _mm_srli_epi64(ch_sub, 8);

   cl_sub = _mm_packus_epi16(cl_sub, cl_sub);
   ch_sub = _mm_packus_epi16(ch_sub, ch_sub);

   return (__m128i) _mm_shuffle_ps( (__m128)cl_sub, (__m128)ch_sub, 0x44);
}

static EFL_ALWAYS_INLINE __m128i
mul_sym_sse3(__m128i a, __m128i c)
{
   /* Prepare alpha for word mult */
   __m128i a_l = a;
   __m128i a_h = a;
   a_l = _mm_unpacklo_epi16(a_l, a_l);
   a_h = _mm_unpackhi_epi16(a_h, a_h);
   __m128i a0 = (__m128i) _mm_shuffle_ps( (__m128)a_l, (__m128)a_h, 0x88);

   /* first part */
   __m128i c0 = c;
   c0 = _mm_srli_epi32(c0, 8);
   c0 = _mm_and_si128(GA_MASK_SSE3, c0);
   c0 = _mm_mullo_epi16(a0, c0);
   c0 = _mm_add_epi32(c0, GA_MASK_SSE3);
   c0 = _mm_and_si128(RB_MASK_SSE3, c0);

   /* second part */
   __m128i c1 = c;
   c1 = _mm_and_si128(GA_MASK_SSE3, c1);
   c1 = _mm_mullo_epi16(a0, c1);
   c1 = _mm_add_epi32(c1, GA_MASK_SSE3);
   c1 = _mm_srli_epi32(c1, 8);
   c1 = _mm_and_si128(GA_MASK_SSE3, c1);

   return _mm_add_epi32(c0, c1);
}

static EFL_ALWAYS_INLINE __m128i
mul4_sym_sse3(__m128i x, __m128i y)
{
   const __m128i zero = _mm_setzero_si128();

   __m128i x_l = _mm_unpacklo_epi8(x, zero);
   __m128i x_h = _mm_unpackhi_epi8(x, zero);

   __m128i y_l = _mm_unpacklo_epi8(y, zero);
   __m128i y_h = _mm_unpackhi_epi8(y, zero);

   __m128i r_l = _mm_mullo_epi16(x_l, y_l);
   __m128i r_h = _mm_mullo_epi16(x_h, y_h);

   r_l = _mm_add_epi16(r_l, SYM4_MASK_SSE3);
   r_h = _mm_add_epi16(r_h, SYM4_MASK_SSE3);

   r_l = _mm_srli_epi16(r_l, 8);
   r_h = _mm_srli_epi16(r_h, 8);

   return _mm_packus_epi16(r_l, r_h);
}

static EFL_ALWAYS_INLINE __m128i
mul3_sym_sse3(__m128i x, __m128i y)
{
   __m128i res = mul4_sym_sse3(x, y);

   return _mm_and_si128(res, RGB_MASK_SSE3);
}

#endif
#endif
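/* The macro below is a dispatch loop for the SIMD blenders: it first advances
 * DEST with the scalar op UOP until DEST is 16-byte aligned, then picks A8OP
 * for runs of 8 or more remaining pixels, A4OP for 4..7, and falls back to
 * UOP for a 1..3 pixel tail.  Each op argument is expected to advance DEST
 * and decrement LENGTH itself.  A minimal, hypothetical use might be:
 *
 *   LOOP_ALIGNED_U1_A48(d, l,
 *     { *d = MUL_SYM(a, *d); d++; l--; },   // scalar op, one pixel
 *     { blend4(d, a); d += 4; l -= 4; },    // hypothetical 4-pixel SSE op
 *     { blend8(d, a); d += 8; l -= 8; })    // hypothetical 8-pixel SSE op
 */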
#define LOOP_ALIGNED_U1_A48(DEST, LENGTH, UOP, A4OP, A8OP) \
   { \
      while((uintptr_t)DEST & 0xF && LENGTH) UOP \
      \
      while(LENGTH) { \
         switch(LENGTH) { \
            case 3: UOP; EINA_FALLTHROUGH; \
            case 2: UOP; EINA_FALLTHROUGH; \
            case 1: UOP; \
               break; \
            case 7: \
               EINA_FALLTHROUGH; \
            case 6: \
               EINA_FALLTHROUGH; \
            case 5: \
               EINA_FALLTHROUGH; \
            case 4: \
               A4OP \
               break; \
            default: \
               A8OP \
               break; \
         } \
      } \
   }

#endif