#ifndef EVAS_BLEND_OPS_H
#define EVAS_BLEND_OPS_H

#include "config.h"

#if defined BUILD_MMX || defined BUILD_SSE
#include "evas_mmx.h"
#endif

#ifdef NEED_SSE3
# if defined BUILD_SSE3
#  include <immintrin.h>
# endif
#endif

/* src pixel flags: */

/* pixels none */
#define SP_N 0
/* pixels (argb default) */
#define SP 1
/* pixels are rgb (ie. alphas == 255) */
#define SP_AN 2
/* pixels alpha are sparse */
#define SP_AS 3
/* src pixels flags count */
#define SP_LAST 4

/* src mask flags: */

/* mask none */
#define SM_N 0
/* mask (alpha) */
#define SM 1
/* mask alphas are 'trivial' - ie. only 0 or 255 */
#define SM_AT 2
/* mask alphas are sparse */
#define SM_AS 3
/* src mask flags count */
#define SM_LAST 4

/* src color flags: */

/* color is 0xffffffff */
#define SC_N 0
/* color (argb default) */
#define SC 1
/* color is rgb (ie. 0xffrrggbb) */
#define SC_AN 2
/* color is 'alpha' (ie. 0xaaaaaaaa) */
#define SC_AA 3
/* src color flags count */
#define SC_LAST 4

/* dst pixels flags: */

/* pixels (argb default) */
#define DP 0
/* pixels are rgb (ie. alphas == 255) */
#define DP_AN 1
/* dst pixels flags count */
#define DP_LAST 2

/* cpu types flags */

/* none, bad news */
#define CPU_N 0
/* cpu C */
#define CPU_C 1
/* cpu MMX */
#define CPU_MMX 2
/* cpu SSE */
#define CPU_SSE 3
/* cpu SSE2 */
#define CPU_SSE2 4
/* cpu NEON */
#define CPU_NEON 5
/* cpu SSE3 */
#define CPU_SSE3 6
/* cpu flags count */
#define CPU_LAST 7

/* some useful constants */

extern const DATA32 ALPHA_255;
extern const DATA32 ALPHA_256;

/* some useful C macros */

#define MUL4_256(a, r, g, b, c) \
 ( (((((c) >> 8) & 0xff0000) * (a)) & 0xff000000) + \
   (((((c) & 0xff0000) * (r)) >> 8) & 0xff0000) + \
   (((((c) & 0xff00) * (g)) >> 8) & 0xff00) + \
   ((((c) & 0xff) * (b)) >> 8) )

#define MUL3_256(r, g, b, c) \
 ( (((((c) & 0xff0000) * (r)) >> 8) & 0xff0000) + \
   (((((c) & 0xff00) * (g)) >> 8) & 0xff00) + \
   ((((c) & 0xff) * (b)) >> 8) )

#define MUL_256(a, c) \
 ( (((((c) >> 8) & 0x00ff00ff) * (a)) & 0xff00ff00) + \
   (((((c) & 0x00ff00ff) * (a)) >> 8) & 0x00ff00ff) )

#define MUL4_SYM(x, y) \
 ( ((((((x) >> 16) & 0xff00) * (((y) >> 16) & 0xff00)) + 0xff0000) & 0xff000000) + \
   ((((((x) >> 8) & 0xff00) * (((y) >> 16) & 0xff)) + 0xff00) & 0xff0000) + \
   ((((((x) & 0xff00) * ((y) & 0xff00)) + 0xff0000) >> 16) & 0xff00) + \
   (((((x) & 0xff) * ((y) & 0xff)) + 0xff) >> 8) )

#define MUL3_SYM(x, y) \
 ( ((((((x) >> 8) & 0xff00) * (((y) >> 16) & 0xff)) + 0xff00) & 0xff0000) + \
   ((((((x) & 0xff00) * ((y) & 0xff00)) + 0xff0000) >> 16) & 0xff00) + \
   (((((x) & 0xff) * ((y) & 0xff)) + 0xff) >> 8) )

#define MUL_SYM(a, x) \
 ( (((((x) >> 8) & 0x00ff00ff) * (a) + 0xff00ff) & 0xff00ff00) + \
   (((((x) & 0x00ff00ff) * (a) + 0xff00ff) >> 8) & 0x00ff00ff) )

#define MUL_A_256(a, c) \
 ( ((((c) >> 8) & 0x00ff0000) * (a)) & 0xff000000 )

#define MUL_A_SYM(a, c) \
 ( (((((c) >> 8) & 0x00ff0000) * (a)) + 0x00ff0000) & 0xff000000 )

#define INTERP_256(a, c0, c1) \
 ( (((((((c0) >> 8) & 0xff00ff) - (((c1) >> 8) & 0xff00ff)) * (a)) \
   + ((c1) & 0xff00ff00)) & 0xff00ff00) + \
   (((((((c0) & 0xff00ff) - ((c1) & 0xff00ff)) * (a)) >> 8) \
   + ((c1) & 0xff00ff)) & 0xff00ff) )

#define INTERP_RGB_256(a, c0, c1) \
 ( (((((((c0) >> 8) & 0xff) - (((c1) >> 8) & 0xff)) * (a)) \
   + ((c1) & 0xff00)) & 0xff00) + \
   (((((((c0) & 0xff00ff) - ((c1) & 0xff00ff)) * (a)) >> 8) \
   + ((c1) & 0xff00ff)) & 0xff00ff) )

#define INTERP_A_256(a, c0, c1) \
 ( (((((((c0) >> 8) & 0xff0000) - (((c1) >> 8) & 0xff0000)) * (a)) \
   + ((c1) & 0xff000000)) & 0xff000000) )
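/* Note on the conventions used by the macros above: the *_256 variants take a
 * blend factor in the 1..256 range (typically "alpha + 1"), so a factor of 256
 * returns the colour unchanged; the *_SYM variants take a plain 0..255 value
 * and add a rounding bias so that 255 * 255 still maps back to 255.  A minimal
 * usage sketch, assuming DATA32 pixels packed as 0xAARRGGBB (fade_pixel is a
 * hypothetical helper, not part of this header):
 *
 *   static inline DATA32
 *   fade_pixel(DATA32 p, int a)              // a in 0..255
 *   {
 *      return MUL_256(a + 1, p);             // scales all four channels by ~a/255
 *   }
 *
 * INTERP_256(a, c0, c1) follows the same convention and yields, per channel,
 * roughly c1 + ((c0 - c1) * a) / 256.
 */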
/* some useful MMX macros */
#ifdef BUILD_MMX
#define MOV_A2R(a, mma) \
	movd_m2r(a, mma); \
	punpcklwd_r2r(mma, mma); \
	punpckldq_r2r(mma, mma);

#define MOV_P2R(c, mmc, mmz) \
	movd_m2r(c, mmc); \
	punpcklbw_r2r(mmz, mmc);

#define MOV_R2P(mmc, c, mmz) \
	packuswb_r2r(mmz, mmc); \
	movd_r2m(mmc, c);

#define MUL4_256_R2R(mmx, mmy) \
	pmullw_r2r(mmx, mmy); \
	psrlw_i2r(8, mmy);

#define MUL4_SYM_R2R(mmx, mmy, mm255) \
	pmullw_r2r(mmx, mmy); \
	paddw_r2r(mm255, mmy); \
	psrlw_i2r(8, mmy);

#define MOV_RA2R(mmx, mma) \
	movq_r2r(mmx, mma); \
	punpckhwd_r2r(mma, mma); \
	punpckhdq_r2r(mma, mma);

#define MOV_PA2R(c, mma) \
	movd_m2r(c, mma); \
	punpcklbw_r2r(mma, mma); \
	punpckhwd_r2r(mma, mma); \
	punpckhdq_r2r(mma, mma);

#define INTERP_256_R2R(mma, mmx, mmy, mm255) \
	psubw_r2r(mmy, mmx); \
	pmullw_r2r(mma, mmx); \
	psrlw_i2r(8, mmx); \
	paddw_r2r(mmx, mmy); \
	pand_r2r(mm255, mmy);

#endif

/* some useful NEON macros */
#ifdef BUILD_NEON
#define FPU_NEON \
	__asm__ __volatile__(".fpu neon \n\t");

/* copy reg1 to reg2 */
#define VMOV_R2R_NEON(reg1, reg2) \
	__asm__ __volatile__("vmov " #reg1 ", " #reg2 " \n\t" ::: #reg1);

/* copy 32bit value to lower bits of register reg */
#define VMOV_M2R_NEON(reg, value) \
	__asm__ __volatile__("vmov.32 " #reg "[0], %[val] \n\t" :: [val] "r" (value) : #reg);

/* save 32bit value from lower 64 bits of register regq to memory location */
/* pointed to by pointer, using 64bit register regd as temporary location */
#define VMOV_R2M_NEON(regq, regd, pointer) \
	__asm__ __volatile__("vqmovn.u16 " #regd ", " #regq " \n\t" \
		"vst1.32 {" #regd "[0]}, [%[p]] \n\t" :: [p] "r" (pointer) : #regd, "memory");

/* spread constant imm in register reg */
#define VMOV_I2R_NEON(reg, imm) \
	__asm__ __volatile__("vmov.i16 " #reg ", " #imm " \n\t" ::: #reg);

/* spread value in register reg */
#define VDUP_NEON(reg, value) \
	__asm__ __volatile__("vdup.16 " #reg ", %[val] \n\t" :: [val] "r" (value) : #reg);

/* interleave contents of reg1 and reg2 */
#define VZIP_NEON(reg1, reg2) \
	__asm__ __volatile__("vzip.8 " #reg1 ", " #reg2 " \n\t" ::: #reg1, #reg2);

/* swap contents of two registers */
#define VSWP_NEON(reg1, reg2) \
	__asm__ __volatile__("vswp " #reg1 ", " #reg2 " \n\t" ::: #reg1, #reg2);

/* set register to zero */
#define VEOR_NEON(reg) \
	__asm__ __volatile__("veor " #reg ", " #reg ", " #reg " \n\t" ::: #reg);

/* do interpolation of every channel RGBA, result is contained in regy */
#define INTERP_256_NEON(rega, regx, regy, reg255) \
	__asm__ __volatile__("vsub.i16 " #regx ", " #regx ", " #regy " \n\t" \
		"vmul.u16 " #regx ", " #regx ", " #rega " \n\t" \
		"vsri.16 " #regx ", " #regx ", #8 \n\t" \
		"vadd.i16 " #regx ", " #regx ", " #regy " \n\t" \
		"vand " #regy ", " #regx ", " #reg255 " \n\t" \
		::: #regx, #regy );

/* multiply every channel of regx and regy */
#define MUL4_SYM_NEON(regx, regy, reg255) \
	__asm__ __volatile__("vmul.u16 " #regx ", " #regx ", " #regy " \n\t" \
		"vadd.i16 " #regx ", " #regx ", " #reg255 " \n\t" \
		"vsri.16 " #regx ", " #regx ", #8 \n\t" \
		"vand " #regx ", " #regx ", " #reg255 " \n\t" \
		::: #regx );

#endif
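/* The MMX and NEON macros above mirror the C macros: for example
 * INTERP_256_R2R and INTERP_256_NEON compute the same
 * "c1 + ((c0 - c1) * a) / 256" blend as INTERP_256, only on pixels that have
 * already been expanded to one 16-bit word per channel.  A rough, hypothetical
 * MMX sequence for blending one src pixel into *dst could look like:
 *
 *   pxor_r2r(mm0, mm0);                  // mm0 = 0, needed to unpack bytes
 *   MOV_A2R(a, mm3)                      // replicate 1..256 factor into 4 words
 *   MOV_A2R(ALPHA_255, mm5)              // 0x00ff in each word, for the final mask
 *   MOV_P2R(src, mm1, mm0)               // unpack src to words
 *   MOV_P2R(*dst, mm2, mm0)              // unpack dst to words
 *   INTERP_256_R2R(mm3, mm1, mm2, mm5)   // mm2 = interpolated pixel
 *   MOV_R2P(mm2, *dst, mm0)              // repack to 4 bytes and store
 */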
/* some useful SSE3 inline functions */
#ifdef NEED_SSE3
#ifdef BUILD_SSE3

static __m128i GA_MASK_SSE3;
static __m128i RB_MASK_SSE3;
static __m128i SYM4_MASK_SSE3;
static __m128i RGB_MASK_SSE3;
//static __m128i A_MASK_SSE3;
static __m128i ALPHA_SSE3;

#ifndef EFL_ALWAYS_INLINE
# define EFL_ALWAYS_INLINE inline
#endif

static EFL_ALWAYS_INLINE __m128i
mul_256_sse3(__m128i a, __m128i c)
{
   /* prepare alpha for word multiplication */
   __m128i a_l = a;
   __m128i a_h = a;
   a_l = _mm_unpacklo_epi16(a_l, a_l);
   a_h = _mm_unpackhi_epi16(a_h, a_h);
   __m128i a0 = (__m128i) _mm_shuffle_ps( (__m128)a_l, (__m128)a_h, 0x88);

   /* first half of calc */
   __m128i c0 = c;
   c0 = _mm_srli_epi32(c0, 8);
   c0 = _mm_and_si128(GA_MASK_SSE3, c0);
   c0 = _mm_mullo_epi16(a0, c0);
   c0 = _mm_and_si128(RB_MASK_SSE3, c0);

   /* second half of calc */
   __m128i c1 = c;
   c1 = _mm_and_si128(GA_MASK_SSE3, c1);
   c1 = _mm_mullo_epi16(a0, c1);
   c1 = _mm_srli_epi32(c1, 8);
   c1 = _mm_and_si128(GA_MASK_SSE3, c1);

   /* combine */
   return _mm_add_epi32(c0, c1);
}

static EFL_ALWAYS_INLINE __m128i
sub4_alpha_sse3(__m128i c)
{
   __m128i c0 = c;

   c0 = _mm_srli_epi32(c0, 24);

   return _mm_sub_epi32(ALPHA_SSE3, c0);
}

static EFL_ALWAYS_INLINE __m128i
interp4_256_sse3(__m128i a, __m128i c0, __m128i c1)
{
   const __m128i zero = _mm_setzero_si128();

   __m128i a_l = a;
   __m128i a_h = a;
   a_l = _mm_unpacklo_epi16(a_l, a_l);
   a_h = _mm_unpackhi_epi16(a_h, a_h);

   __m128i a_t = _mm_slli_epi64(a_l, 32);
   __m128i a_t0 = _mm_slli_epi64(a_h, 32);

   a_l = _mm_add_epi32(a_l, a_t);
   a_h = _mm_add_epi32(a_h, a_t0);

   __m128i c0_l = c0;
   __m128i c0_h = c0;
   c0_l = _mm_unpacklo_epi8(c0_l, zero);
   c0_h = _mm_unpackhi_epi8(c0_h, zero);

   __m128i c1_l = c1;
   __m128i c1_h = c1;
   c1_l = _mm_unpacklo_epi8(c1_l, zero);
   c1_h = _mm_unpackhi_epi8(c1_h, zero);

   __m128i cl_sub = _mm_sub_epi16(c0_l, c1_l);
   __m128i ch_sub = _mm_sub_epi16(c0_h, c1_h);

   cl_sub = _mm_mullo_epi16(cl_sub, a_l);
   ch_sub = _mm_mullo_epi16(ch_sub, a_h);

   __m128i c1ls = _mm_slli_epi16(c1_l, 8);
   __m128i c1hs = _mm_slli_epi16(c1_h, 8);

   cl_sub = _mm_add_epi16(cl_sub, c1ls);
   ch_sub = _mm_add_epi16(ch_sub, c1hs);

   cl_sub = _mm_and_si128(cl_sub, RB_MASK_SSE3);
   ch_sub = _mm_and_si128(ch_sub, RB_MASK_SSE3);

   cl_sub = _mm_srli_epi64(cl_sub, 8);
   ch_sub = _mm_srli_epi64(ch_sub, 8);

   cl_sub = _mm_packus_epi16(cl_sub, cl_sub);
   ch_sub = _mm_packus_epi16(ch_sub, ch_sub);

   return (__m128i) _mm_shuffle_ps( (__m128)cl_sub, (__m128)ch_sub, 0x44);
}

static EFL_ALWAYS_INLINE __m128i
mul_sym_sse3(__m128i a, __m128i c)
{
   /* Prepare alpha for word mult */
   __m128i a_l = a;
   __m128i a_h = a;
   a_l = _mm_unpacklo_epi16(a_l, a_l);
   a_h = _mm_unpackhi_epi16(a_h, a_h);
   __m128i a0 = (__m128i) _mm_shuffle_ps( (__m128)a_l, (__m128)a_h, 0x88);

   /* first part */
   __m128i c0 = c;
   c0 = _mm_srli_epi32(c0, 8);
   c0 = _mm_and_si128(GA_MASK_SSE3, c0);
   c0 = _mm_mullo_epi16(a0, c0);
   c0 = _mm_add_epi32(c0, GA_MASK_SSE3);
   c0 = _mm_and_si128(RB_MASK_SSE3, c0);

   /* second part */
   __m128i c1 = c;
   c1 = _mm_and_si128(GA_MASK_SSE3, c1);
   c1 = _mm_mullo_epi16(a0, c1);
   c1 = _mm_add_epi32(c1, GA_MASK_SSE3);
   c1 = _mm_srli_epi32(c1, 8);
   c1 = _mm_and_si128(GA_MASK_SSE3, c1);

   return _mm_add_epi32(c0, c1);
}

static EFL_ALWAYS_INLINE __m128i
mul4_sym_sse3(__m128i x, __m128i y)
{
   const __m128i zero = _mm_setzero_si128();

   __m128i x_l = _mm_unpacklo_epi8(x, zero);
   __m128i x_h = _mm_unpackhi_epi8(x, zero);

   __m128i y_l = _mm_unpacklo_epi8(y, zero);
   __m128i y_h = _mm_unpackhi_epi8(y, zero);

   __m128i r_l = _mm_mullo_epi16(x_l, y_l);
   __m128i r_h = _mm_mullo_epi16(x_h, y_h);

   r_l = _mm_add_epi16(r_l, SYM4_MASK_SSE3);
   r_h = _mm_add_epi16(r_h, SYM4_MASK_SSE3);

   r_l = _mm_srli_epi16(r_l, 8);
   r_h = _mm_srli_epi16(r_h, 8);

   return _mm_packus_epi16(r_l, r_h);
}

static EFL_ALWAYS_INLINE __m128i
mul3_sym_sse3(__m128i x, __m128i y)
{
   __m128i res = mul4_sym_sse3(x, y);

   return _mm_and_si128(res, RGB_MASK_SSE3);
}

#endif
#endif
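/* The macro below is a dispatch loop for the SIMD blenders: it first advances
 * DEST with the scalar op UOP until DEST is 16-byte aligned, then picks A8OP
 * for runs of 8 or more remaining pixels, A4OP for 4..7, and falls back to
 * UOP for a 1..3 pixel tail.  Each op argument is expected to advance DEST
 * and decrement LENGTH itself.  A minimal, hypothetical use might be:
 *
 *   LOOP_ALIGNED_U1_A48(d, l,
 *     { *d = MUL_SYM(a, *d); d++; l--; },   // scalar op, one pixel
 *     { blend4(d, a); d += 4; l -= 4; },    // hypothetical 4-pixel SSE op
 *     { blend8(d, a); d += 8; l -= 8; })    // hypothetical 8-pixel SSE op
 */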
#define LOOP_ALIGNED_U1_A48(DEST, LENGTH, UOP, A4OP, A8OP) \
   { \
      while((uintptr_t)DEST & 0xF && LENGTH) UOP \
      \
      while(LENGTH) { \
         switch(LENGTH) { \
            case 3: UOP; EINA_FALLTHROUGH; \
            case 2: UOP; EINA_FALLTHROUGH; \
            case 1: UOP; \
               break; \
            case 7: \
               EINA_FALLTHROUGH; \
            case 6: \
               EINA_FALLTHROUGH; \
            case 5: \
               EINA_FALLTHROUGH; \
            case 4: \
               A4OP \
               break; \
            default: \
               A8OP \
               break; \
         } \
      } \
   }

#endif