#ifndef EVAS_BLEND_OPS_H
#define EVAS_BLEND_OPS_H

#include "config.h"

#if defined BUILD_MMX || defined BUILD_SSE
#include "evas_mmx.h"
#endif

#ifdef NEED_SSE3
# if defined BUILD_SSE3
#  include <immintrin.h>
# endif
#endif

/* src pixel flags: */

/* pixels none */
#define SP_N 0
/* pixels (argb default) */
#define SP 1
/* pixels are rgb (ie. alphas == 255) */
#define SP_AN 2
/* pixel alphas are sparse */
#define SP_AS 3
/* src pixels flags count */
#define SP_LAST 4

/* src mask flags: */

/* mask none */
#define SM_N 0
/* mask (alpha) */
#define SM 1
/* mask alphas are 'trivial' - ie. only 0 or 255 */
#define SM_AT 2
/* mask alphas are sparse */
#define SM_AS 3
/* src mask flags count */
#define SM_LAST 4

/* src color flags: */

/* color is 0xffffffff */
#define SC_N 0
/* color (argb default) */
#define SC 1
/* color is rgb (ie. 0xffrrggbb) */
#define SC_AN 2
/* color is 'alpha' (ie. 0xaaaaaaaa) */
#define SC_AA 3
/* src color flags count */
#define SC_LAST 4

/* dst pixels flags: */

/* pixels (argb default) */
#define DP  0
/* pixels are rgb (ie. alphas == 255) */
#define DP_AN  1
/* dst pixels flags count */
#define DP_LAST 2

/* cpu types flags */

/* none, bad news */
#define CPU_N  0
/* cpu C */
#define CPU_C  1
/* cpu MMX */
#define CPU_MMX 2
/* cpu SSE */
#define CPU_SSE 3
/* cpu SSE2 */
#define CPU_SSE2 4
/* cpu NEON */
#define CPU_NEON 5
/* cpu SSE3 */
#define CPU_SSE3 6
/* cpu flags count */
#define CPU_LAST 7
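
/* The flag groups above combine into an index for the blend function
 * dispatch tables in the engine sources.  A plausible flattening,
 * consistent with the _LAST counts (illustrative, not a quote of the
 * dispatch code):
 *
 *   idx = (((sp * SM_LAST + sm) * SC_LAST + sc) * DP_LAST + dp)
 *          * CPU_LAST + cpu;
 */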


/* some useful constants */

extern const DATA32 ALPHA_255;
extern const DATA32 ALPHA_256;

/* some useful C macros */

#define MUL4_256(a, r, g, b, c) \
 ( (((((c) >> 8) & 0xff0000) * (a)) & 0xff000000) + \
   (((((c) & 0xff0000) * (r)) >> 8) & 0xff0000) + \
   (((((c) & 0xff00) * (g)) >> 8) & 0xff00) + \
   ((((c) & 0xff) * (b)) >> 8) )

#define MUL3_256(r, g, b, c) \
 ( (((((c) & 0xff0000) * (r)) >> 8) & 0xff0000) + \
   (((((c) & 0xff00) * (g)) >> 8) & 0xff00) + \
   ((((c) & 0xff) * (b)) >> 8) )

#define MUL_256(a, c) \
 ( (((((c) >> 8) & 0x00ff00ff) * (a)) & 0xff00ff00) + \
   (((((c) & 0x00ff00ff) * (a)) >> 8) & 0x00ff00ff) )
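
/* MUL_256 scales every channel of ARGB pixel c by a/256, with a in
 * 1..256 (256 is the identity).  A and G are handled in one
 * 0x00ff00ff-masked word, R and B in the other; the zero byte between
 * the two channels absorbs the multiply overflow.  Worked example:
 *
 *   MUL_256(128, 0xff804020) == 0x7f402010   (every channel halved)
 */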

#define MUL4_SYM(x, y) \
 ( ((((((x) >> 16) & 0xff00) * (((y) >> 16) & 0xff00)) + 0xff0000) & 0xff000000) + \
   ((((((x) >> 8) & 0xff00) * (((y) >> 16) & 0xff)) + 0xff00) & 0xff0000) + \
   ((((((x) & 0xff00) * ((y) & 0xff00)) + 0xff0000) >> 16) & 0xff00) + \
   (((((x) & 0xff) * ((y) & 0xff)) + 0xff) >> 8) )

#define MUL3_SYM(x, y) \
 ( ((((((x) >> 8) & 0xff00) * (((y) >> 16) & 0xff)) + 0xff00) & 0xff0000) + \
   ((((((x) & 0xff00) * ((y) & 0xff00)) + 0xff0000) >> 16) & 0xff00) + \
   (((((x) & 0xff) * ((y) & 0xff)) + 0xff) >> 8) )

#define MUL_SYM(a, x) \
 ( (((((x) >> 8) & 0x00ff00ff) * (a) + 0xff00ff) & 0xff00ff00) + \
   (((((x) & 0x00ff00ff) * (a) + 0xff00ff) >> 8) & 0x00ff00ff) )

#define MUL_A_256(a, c) \
 ( ((((c) >> 8) & 0x00ff0000) * (a)) & 0xff000000 )

#define MUL_A_SYM(a, c) \
 ( (((((c) >> 8) & 0x00ff0000) * (a)) + 0x00ff0000) & 0xff000000 )
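
/* The *_SYM variants take a 0..255 factor and add a 0xff bias before
 * the >> 8, which makes 255 an exact identity: for any channel value
 * v, (v * 255 + 255) >> 8 == v, hence MUL_SYM(0xff, c) == c.  The
 * *_256 variants instead take a 0..256 factor, with 256 the identity.
 */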

#define INTERP_256(a, c0, c1) \
 ( (((((((c0) >> 8) & 0xff00ff) - (((c1) >> 8) & 0xff00ff)) * (a)) \
   + ((c1) & 0xff00ff00)) & 0xff00ff00) + \
   (((((((c0) & 0xff00ff) - ((c1) & 0xff00ff)) * (a)) >> 8) \
   + ((c1) & 0xff00ff)) & 0xff00ff) )

#define INTERP_RGB_256(a, c0, c1) \
 ( (((((((c0) >> 8) & 0xff) - (((c1) >> 8) & 0xff)) * (a)) \
   + ((c1) & 0xff00)) & 0xff00) + \
   (((((((c0) & 0xff00ff) - ((c1) & 0xff00ff)) * (a)) >> 8) \
   + ((c1) & 0xff00ff)) & 0xff00ff) )

#define INTERP_A_256(a, c0, c1) \
 ( (((((((c0) >> 8) & 0xff0000) - (((c1) >> 8) & 0xff0000)) * (a)) \
   + ((c1) & 0xff000000)) & 0xff000000) )
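
/* The INTERP_* macros compute, per channel,
 *
 *   dst = c1 + ((a * (c0 - c1)) >> 8)
 *
 * with a in 0..256, so a == 256 yields c0 and a == 0 yields c1.
 * Example: INTERP_256(128, 0xffffffff, 0x00000000) == 0x7f7f7f7f.
 */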


/* some useful MMX macros */

#ifdef BUILD_MMX
#define MOV_A2R(a, mma) \
	movd_m2r(a, mma); \
	punpcklwd_r2r(mma, mma); \
	punpckldq_r2r(mma, mma);

#define MOV_P2R(c, mmc, mmz) \
	movd_m2r(c, mmc); \
	punpcklbw_r2r(mmz, mmc);

#define MOV_R2P(mmc, c, mmz) \
	packuswb_r2r(mmz, mmc); \
	movd_r2m(mmc, c);

#define MUL4_256_R2R(mmx, mmy) \
	pmullw_r2r(mmx, mmy); \
	psrlw_i2r(8, mmy);

#define MUL4_SYM_R2R(mmx, mmy, mm255) \
	pmullw_r2r(mmx, mmy); \
	paddw_r2r(mm255, mmy); \
	psrlw_i2r(8, mmy);

#define MOV_RA2R(mmx, mma) \
	movq_r2r(mmx, mma); \
	punpckhwd_r2r(mma, mma); \
	punpckhdq_r2r(mma, mma);

#define MOV_PA2R(c, mma) \
	movd_m2r(c, mma); \
	punpcklbw_r2r(mma, mma); \
	punpckhwd_r2r(mma, mma); \
	punpckhdq_r2r(mma, mma);

#define INTERP_256_R2R(mma, mmx, mmy, mm255) \
	psubw_r2r(mmy, mmx); \
	pmullw_r2r(mma, mmx); \
	psrlw_i2r(8, mmx); \
	paddw_r2r(mmx, mmy); \
	pand_r2r(mm255, mmy);

#endif
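
/* How the MMX helpers above are typically strung together (a hedged
 * sketch in the style of the evas_op_* sources, not a function from
 * this file; s and c are DATA32 pixels, d a DATA32 pointer):
 *
 *   pxor_r2r(mm0, mm0);          // mm0 = 0, unpack helper
 *   MOV_A2R(ALPHA_255, mm5)      // mm5 = 255 in each 16-bit lane
 *   MOV_P2R(c, mm2, mm0)         // unpack color into 16-bit lanes
 *   MOV_P2R(s, mm1, mm0)         // unpack src pixel
 *   MUL4_SYM_R2R(mm2, mm1, mm5)  // mm1 = per-channel (s * c) / 255
 *   MOV_R2P(mm1, *d, mm0)        // pack back to a 32-bit pixel
 */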

/* some useful NEON macros */

#ifdef BUILD_NEON
#define FPU_NEON \
	__asm__ __volatile__(".fpu neon \n\t");

/* copy reg1 to reg2 */
#define VMOV_R2R_NEON(reg1, reg2) \
	__asm__ __volatile__("vmov " #reg1 ", " #reg2 " \n\t" ::: #reg1);

/* copy 32bit value to lower bits of register reg */
#define VMOV_M2R_NEON(reg, value) \
	__asm__ __volatile__("vmov.32 " #reg "[0], %[val] \n\t" :: [val] "r" (value) : #reg); 

/* save 32bit value from lower 64 bits of register regq to memory location */
/* pointed to by pointer, using 64bit register regd as temporary location */
#define VMOV_R2M_NEON(regq, regd, pointer) \
	__asm__ __volatile__("vqmovn.u16 " #regd ", " #regq " \n\t" \
			     "vst1.32 {" #regd "[0]}, [%[p]] \n\t" :: [p] "r" (pointer) : #regd, "memory");

/* spread constant imm in register reg */
#define VMOV_I2R_NEON(reg, imm) \
	__asm__ __volatile__("vmov.i16 " #reg ", " #imm " \n\t" ::: #reg);

/* spread value in register reg */
#define VDUP_NEON(reg, value) \
	__asm__ __volatile__("vdup.16 " #reg ", %[val] \n\t" :: [val] "r" (value) : #reg); 

/* interleave contents of reg1 and reg2 */
#define VZIP_NEON(reg1, reg2) \
	__asm__ __volatile__("vzip.8 " #reg1 ", " #reg2 " \n\t" ::: #reg1 , #reg2);

/* swap contents of two registers */
#define VSWP_NEON(reg1, reg2) \
	__asm__ __volatile__("vswp " #reg1 ", " #reg2 " \n\t" ::: #reg1 , #reg2);

/* set register to zero */
#define VEOR_NEON(reg) \
	__asm__ __volatile__("veor " #reg ", " #reg ", " #reg " \n\t" ::: #reg);

/* do interpolation of every channel RGBA, result is contained in regy */
#define INTERP_256_NEON(rega, regx, regy, reg255) \
	__asm__ __volatile__("vsub.i16 " #regx ", " #regx ", " #regy " \n\t" \
			     "vmul.u16 " #regx ", " #regx ", " #rega " \n\t" \
			     "vsri.16 " #regx ", " #regx ", #8 \n\t" \
			     "vadd.i16 " #regx ", " #regx ", " #regy " \n\t" \
			     "vand " #regy ", " #regx ", " #reg255 " \n\t" \
			     ::: #regx, #regy );

/* multiply every channel of regx and regy */
#define MUL4_SYM_NEON(regx, regy, reg255) \
	__asm__ __volatile__("vmul.u16 " #regx ", " #regx ", " #regy " \n\t" \
			     "vadd.i16 " #regx ", " #regx ", " #reg255 " \n\t" \
			     "vsri.16 " #regx ", " #regx ", #8 \n\t" \
			     "vand " #regx ", " #regx ", " #reg255 " \n\t" \
			     ::: #regx );

#endif
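
/* The NEON helpers mirror the C macros on unpacked 16-bit lanes: a d
 * register holds one unpacked pixel, a q register two.  Illustrative
 * sequence (hedged; the register choices are not from this tree):
 *
 *   FPU_NEON;
 *   VDUP_NEON(q0, a);                 // q0 = factor a per 16-bit lane
 *   VMOV_I2R_NEON(q3, #255);          // q3 = 0xff per 16-bit lane
 *   INTERP_256_NEON(q0, q1, q2, q3);  // q2 = INTERP_256(a, q1, q2)
 */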

/* some useful SSE3 inline functions */

#ifdef NEED_SSE3
#ifdef BUILD_SSE3

static __m128i GA_MASK_SSE3;
static __m128i RB_MASK_SSE3;
static __m128i SYM4_MASK_SSE3;
static __m128i RGB_MASK_SSE3;
//static __m128i A_MASK_SSE3;

static __m128i ALPHA_SSE3;
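
/* These constants are only declared here and must be filled in at
 * engine init before the helpers below run.  The values implied by
 * how they are used (a hedged reading of the code below, not a quote
 * of the init routine):
 *
 *   GA_MASK_SSE3   = _mm_set1_epi32(0x00FF00FF);
 *   RB_MASK_SSE3   = _mm_set1_epi32(0xFF00FF00);
 *   SYM4_MASK_SSE3 = _mm_set1_epi16(0x00FF);
 *   RGB_MASK_SSE3  = _mm_set1_epi32(0x00FFFFFF);
 *   ALPHA_SSE3     = _mm_set1_epi32(256);
 */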

static EFL_ALWAYS_INLINE __m128i
mul_256_sse3(__m128i a, __m128i c) {

   /* prepare alpha for word multiplication */
   __m128i a_l = a;
   __m128i a_h = a;
   a_l = _mm_unpacklo_epi16(a_l, a_l);
   a_h = _mm_unpackhi_epi16(a_h, a_h);
   __m128i a0 = (__m128i) _mm_shuffle_ps( (__m128)a_l, (__m128)a_h, 0x88);

   /* first half of calc */
   __m128i c0 = c;
   c0 = _mm_srli_epi32(c0, 8);
   c0 = _mm_and_si128(GA_MASK_SSE3, c0);
   c0 = _mm_mullo_epi16(a0, c0);
   c0 = _mm_and_si128(RB_MASK_SSE3, c0);

   /* second half of calc */
   __m128i c1 = c;
   c1 = _mm_and_si128(GA_MASK_SSE3, c1);
   c1 = _mm_mullo_epi16(a0, c1);
   c1 = _mm_srli_epi32(c1, 8);
   c1 = _mm_and_si128(GA_MASK_SSE3, c1);

   /* combine */
   return _mm_add_epi32(c0, c1);
}
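
/* mul_256_sse3() above is the four-pixel form of MUL_256: each 32-bit
 * lane of 'a' carries one 0..256 factor, duplicated into both 16-bit
 * words before the masked multiplies.  sub4_alpha_sse3() below gives
 * the complementary factor, 256 - alpha, for each of four pixels. */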

static EFL_ALWAYS_INLINE __m128i
sub4_alpha_sse3(__m128i c) {

   __m128i c0 = c;

   c0 = _mm_srli_epi32(c0, 24);
   return _mm_sub_epi32(ALPHA_SSE3, c0);
}

static EFL_ALWAYS_INLINE __m128i
interp4_256_sse3(__m128i a, __m128i c0, __m128i c1)
{
   const __m128i zero = _mm_setzero_si128();

   __m128i a_l = a;
   __m128i a_h = a;
   a_l = _mm_unpacklo_epi16(a_l, a_l);
   a_h = _mm_unpackhi_epi16(a_h, a_h);

   __m128i a_t = _mm_slli_epi64(a_l, 32);
   __m128i a_t0 = _mm_slli_epi64(a_h, 32);

   a_l = _mm_add_epi32(a_l, a_t);
   a_h = _mm_add_epi32(a_h, a_t0);

   __m128i c0_l = c0;
   __m128i c0_h = c0;

   c0_l = _mm_unpacklo_epi8(c0_l, zero);
   c0_h = _mm_unpackhi_epi8(c0_h, zero);

   __m128i c1_l = c1;
   __m128i c1_h = c1;

   c1_l = _mm_unpacklo_epi8(c1_l, zero);
   c1_h = _mm_unpackhi_epi8(c1_h, zero);

   __m128i cl_sub = _mm_sub_epi16(c0_l, c1_l);
   __m128i ch_sub = _mm_sub_epi16(c0_h, c1_h);

   cl_sub = _mm_mullo_epi16(cl_sub, a_l);
   ch_sub = _mm_mullo_epi16(ch_sub, a_h);

   __m128i c1ls = _mm_slli_epi16(c1_l, 8);
   __m128i c1hs = _mm_slli_epi16(c1_h, 8);

   cl_sub = _mm_add_epi16(cl_sub, c1ls);
   ch_sub = _mm_add_epi16(ch_sub, c1hs);

   cl_sub = _mm_and_si128(cl_sub, RB_MASK_SSE3);
   ch_sub = _mm_and_si128(ch_sub, RB_MASK_SSE3);

   cl_sub = _mm_srli_epi64(cl_sub, 8);
   ch_sub = _mm_srli_epi64(ch_sub, 8);

   cl_sub = _mm_packus_epi16(cl_sub, cl_sub);
   ch_sub = _mm_packus_epi16(ch_sub, ch_sub);

   return  (__m128i) _mm_shuffle_ps( (__m128)cl_sub, (__m128)ch_sub, 0x44);
}
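
/* interp4_256_sse3() above is the four-pixel form of INTERP_256:
 * channels are unpacked to 16 bits, blended as c1 + a*(c0 - c1)/256,
 * and repacked.  mul_sym_sse3() below is the four-pixel MUL_SYM, with
 * GA_MASK_SSE3 doubling as the +0xff rounding bias. */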

static EFL_ALWAYS_INLINE __m128i
mul_sym_sse3(__m128i a, __m128i c) {

      /* Prepare alpha for word mult */
      __m128i a_l = a;
      __m128i a_h = a;
      a_l = _mm_unpacklo_epi16(a_l, a_l);
      a_h = _mm_unpackhi_epi16(a_h, a_h);
      __m128i a0 = (__m128i) _mm_shuffle_ps( (__m128)a_l, (__m128)a_h, 0x88);

      /* first part */
      __m128i c0 = c;
      c0 = _mm_srli_epi32(c0, 8);
      c0 = _mm_and_si128(GA_MASK_SSE3, c0);
      c0 = _mm_mullo_epi16(a0, c0);
      c0 = _mm_add_epi32(c0, GA_MASK_SSE3);
      c0 = _mm_and_si128(RB_MASK_SSE3, c0);

      /* second part */
      __m128i c1 = c;
      c1 = _mm_and_si128(GA_MASK_SSE3, c1);
      c1 = _mm_mullo_epi16(a0, c1);
      c1 = _mm_add_epi32(c1, GA_MASK_SSE3);
      c1 = _mm_srli_epi32(c1, 8);
      c1 = _mm_and_si128(GA_MASK_SSE3, c1);

      return _mm_add_epi32(c0, c1);
}

static EFL_ALWAYS_INLINE __m128i
mul4_sym_sse3(__m128i x, __m128i y) {

   const __m128i zero = _mm_setzero_si128();

   __m128i x_l = _mm_unpacklo_epi8(x, zero);
   __m128i x_h = _mm_unpackhi_epi8(x, zero);

   __m128i y_l = _mm_unpacklo_epi8(y, zero);
   __m128i y_h = _mm_unpackhi_epi8(y, zero);

   __m128i r_l = _mm_mullo_epi16(x_l, y_l);
   __m128i r_h = _mm_mullo_epi16(x_h, y_h);

   r_l = _mm_add_epi16(r_l, SYM4_MASK_SSE3);
   r_h = _mm_add_epi16(r_h, SYM4_MASK_SSE3);

   r_l = _mm_srli_epi16(r_l, 8);
   r_h = _mm_srli_epi16(r_h, 8);

   return  _mm_packus_epi16(r_l, r_h);
}

static EFL_ALWAYS_INLINE __m128i
mul3_sym_sse3(__m128i x, __m128i y) {

   __m128i res = mul4_sym_sse3(x, y);
   return  _mm_and_si128(res, RGB_MASK_SSE3);
}
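
/* mul4_sym_sse3()/mul3_sym_sse3() are the four-pixel forms of
 * MUL4_SYM/MUL3_SYM.  A hedged sketch of how the helpers combine for
 * a src-over step on four premultiplied pixels (illustrative, not a
 * function from this file):
 *
 *   __m128i s = mul4_sym_sse3(color, src);  // modulate src by color
 *   __m128i a = sub4_alpha_sse3(s);         // 256 - alpha, per pixel
 *   __m128i d = _mm_add_epi32(s, mul_256_sse3(a, dst)); // s + dst*(256-a)/256
 */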

#endif
#endif
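
/* Process LENGTH pixels at DEST: run the unit op UOP until DEST is
 * 16-byte aligned, then dispatch 8-pixel (A8OP) and 4-pixel (A4OP)
 * aligned ops, finishing the 1..3 pixel tail with UOP.  The ops are
 * expected to advance DEST and decrement LENGTH themselves. */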

#define LOOP_ALIGNED_U1_A48(DEST, LENGTH, UOP, A4OP, A8OP) \
  {                                                        \
      while((uintptr_t)DEST & 0xF && LENGTH) UOP \
   \
      while(LENGTH) { \
        switch(LENGTH) {                        \
          case 3: UOP; EINA_FALLTHROUGH;        \
          case 2: UOP; EINA_FALLTHROUGH;        \
          case 1: UOP;                          \
           break;                               \
          case 7: EINA_FALLTHROUGH;             \
          case 6: EINA_FALLTHROUGH;             \
          case 5: EINA_FALLTHROUGH;             \
          case 4:                               \
           A4OP                                 \
           break;                               \
          default:                              \
           A8OP                                 \
           break;                               \
        }                                       \
      } \
   }
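
/* Usage sketch (hedged; blend_one()/blend_four()/blend_eight() are
 * hypothetical helpers, not from this tree):
 *
 *   LOOP_ALIGNED_U1_A48(d, l,
 *     { blend_one(d, s, c); d++; s++; l--; },
 *     { blend_four(d, s, c); d += 4; s += 4; l -= 4; },
 *     { blend_eight(d, s, c); d += 8; s += 8; l -= 8; })
 */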

#endif