summaryrefslogtreecommitdiff
path: root/src/lib/evas/common/evas_op_blend
diff options
context:
space:
mode:
authorVincent Torri <vincent.torri@gmail.com>2012-11-04 11:51:42 +0000
committerVincent Torri <vincent.torri@gmail.com>2012-11-04 11:51:42 +0000
commitc15e9c6575c3b5f39ded167dda5259de3de96151 (patch)
tree5115d7ae3620af24c2bc094cd062575af7adeda9 /src/lib/evas/common/evas_op_blend
parenta5ac6a987caec5a7f7596a25d0a065b9cc94c50c (diff)
merge: and now Evas
I've tested make -j 3 install and it works nicely I've tested expedite with software and opengl xlib, and it works. Not tested other engines, so please report any problems (engines or other) on the ML. TODO: examples and tests, I'll add them later ISSUE: Eina_Unicode size check. It indirectly depends on eina_config.h, which is created at the end of the configure script. So its size is always 0. I don't know how that size is used, so I can't do a lot, for now. SVN revision: 78895
Diffstat (limited to 'src/lib/evas/common/evas_op_blend')
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_color_.c101
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_color_i386.c138
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_color_neon.c223
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_color_sse3.c167
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_mask_color_.c177
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_mask_color_i386.c251
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c562
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_mask_color_sse3.c321
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_master_sse3.c77
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_pixel_.c154
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_pixel_color_.c276
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_pixel_color_i386.c221
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c570
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_pixel_color_sse3.c543
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_pixel_i386.c217
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_pixel_mask_.c181
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_pixel_mask_i386.c157
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_pixel_mask_neon.c129
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_pixel_mask_sse3.c300
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c530
-rw-r--r--src/lib/evas/common/evas_op_blend/op_blend_pixel_sse3.c315
21 files changed, 5610 insertions, 0 deletions
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_color_.c b/src/lib/evas/common/evas_op_blend/op_blend_color_.c
new file mode 100644
index 0000000000..d92f0cb1d1
--- /dev/null
+++ b/src/lib/evas/common/evas_op_blend/op_blend_color_.c
@@ -0,0 +1,101 @@
1/* blend color -> dst */
2
/*
 * Blend the solid colour c over a span of l destination pixels.
 * s and m are not used (both are marked EINA_UNUSED).
 * NOTE(review): UNROLL8_PLD_WHILE and MUL_256 are macros defined outside this
 * view; presumably the former walks d over l pixels with e as the end pointer
 * and the latter scales each channel of its second argument by a/256 -- confirm.
 */
3static void
4_op_blend_c_dp(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
5 DATA32 *e, a = 256 - (c >> 24); /* a: 256 minus the top byte of c (treated as alpha) */
6 UNROLL8_PLD_WHILE(d, l, e,
7 {
8 *d = c + MUL_256(a, *d); /* colour plus the destination scaled by a */
9 d++;
10 });
11}
12
13#define _op_blend_caa_dp _op_blend_c_dp
14
15#define _op_blend_c_dpan _op_blend_c_dp
16#define _op_blend_caa_dpan _op_blend_c_dpan
17
/*
 * Fill the CPU_C slots of op_blend_span_funcs for the [SP_N][SM_N] colour-only
 * case. Through the alias macros defined just above, all four entries resolve
 * to _op_blend_c_dp.
 */
18static void
19init_blend_color_span_funcs_c(void)
20{
21 op_blend_span_funcs[SP_N][SM_N][SC][DP][CPU_C] = _op_blend_c_dp;
22 op_blend_span_funcs[SP_N][SM_N][SC_AA][DP][CPU_C] = _op_blend_caa_dp;
23
24 op_blend_span_funcs[SP_N][SM_N][SC][DP_AN][CPU_C] = _op_blend_c_dpan;
25 op_blend_span_funcs[SP_N][SM_N][SC_AA][DP_AN][CPU_C] = _op_blend_caa_dpan;
26}
27
/*
 * Single-pixel form of the colour blend: *d = c + MUL_256(256 - (c >> 24), *d).
 * The incoming value of s is ignored; the parameter is reused as scratch for
 * the inverse alpha. m is unused.
 */
28static void
29_op_blend_pt_c_dp(DATA32 s, DATA8 m EINA_UNUSED, DATA32 c, DATA32 *d) {
30 s = 256 - (c >> 24); /* overwrite s with 256 minus the top byte of c */
31 *d = c + MUL_256(s, *d);
32}
33
34#define _op_blend_pt_caa_dp _op_blend_pt_c_dp
35
36#define _op_blend_pt_c_dpan _op_blend_pt_c_dp
37#define _op_blend_pt_caa_dpan _op_blend_pt_c_dpan
38
39#define _op_blend_pt_c_dpas _op_blend_pt_c_dp
40#define _op_blend_pt_caa_dpas _op_blend_pt_c_dp
41
/*
 * Fill the CPU_C slots of op_blend_pt_funcs for the [SP_N][SM_N] colour-only
 * case. Through the alias macros above, all four entries resolve to
 * _op_blend_pt_c_dp.
 */
42static void
43init_blend_color_pt_funcs_c(void)
44{
45 op_blend_pt_funcs[SP_N][SM_N][SC][DP][CPU_C] = _op_blend_pt_c_dp;
46 op_blend_pt_funcs[SP_N][SM_N][SC_AA][DP][CPU_C] = _op_blend_pt_caa_dp;
47
48 op_blend_pt_funcs[SP_N][SM_N][SC][DP_AN][CPU_C] = _op_blend_pt_c_dpan;
49 op_blend_pt_funcs[SP_N][SM_N][SC_AA][DP_AN][CPU_C] = _op_blend_pt_caa_dpan;
50}
51
52/*-----*/
53
54/* blend_rel color -> dst */
55
/*
 * "blend_rel" of the solid colour c over a span of l destination pixels.
 * Unlike _op_blend_c_dp, the colour term is first multiplied (MUL_SYM) by the
 * top byte of the destination pixel. s and m are unused.
 * NOTE(review): MUL_SYM, MUL_256 and UNROLL8_PLD_WHILE are defined elsewhere;
 * their exact rounding and loop shape are not visible here.
 */
56static void
57_op_blend_rel_c_dp(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
58 DATA32 *e;
59 int alpha = 256 - (c >> 24); /* 256 minus the top byte of c */
60 UNROLL8_PLD_WHILE(d, l, e,
61 {
62 *d = MUL_SYM(*d >> 24, c) + MUL_256(alpha, *d); /* both terms read the old *d */
63 d++;
64 });
65}
66
67#define _op_blend_rel_caa_dp _op_blend_rel_c_dp
68
69#define _op_blend_rel_c_dpan _op_blend_c_dpan
70#define _op_blend_rel_caa_dpan _op_blend_caa_dpan
71
/*
 * Fill the CPU_C slots of op_blend_rel_span_funcs for [SP_N][SM_N].
 * The DP entries resolve to _op_blend_rel_c_dp; the DP_AN entries are aliased
 * (see the macros above) to the plain, non-rel _op_blend_c_dp.
 */
72static void
73init_blend_rel_color_span_funcs_c(void)
74{
75 op_blend_rel_span_funcs[SP_N][SM_N][SC][DP][CPU_C] = _op_blend_rel_c_dp;
76 op_blend_rel_span_funcs[SP_N][SM_N][SC_AA][DP][CPU_C] = _op_blend_rel_caa_dp;
77
78 op_blend_rel_span_funcs[SP_N][SM_N][SC][DP_AN][CPU_C] = _op_blend_rel_c_dpan;
79 op_blend_rel_span_funcs[SP_N][SM_N][SC_AA][DP_AN][CPU_C] = _op_blend_rel_caa_dpan;
80}
81
/*
 * Single-pixel form of the rel colour blend:
 *   *d = MUL_SYM(*d >> 24, c) + MUL_256(256 - (c >> 24), *d).
 * The incoming value of s is ignored; it is reused to hold the top byte of *d.
 * m is unused.
 */
82static void
83_op_blend_rel_pt_c_dp(DATA32 s, DATA8 m EINA_UNUSED, DATA32 c, DATA32 *d) {
84 s = *d >> 24; /* top byte of the destination, captured before *d is overwritten */
85 *d = MUL_SYM(s, c) + MUL_256(256 - (c >> 24), *d);
86}
87
88#define _op_blend_rel_pt_caa_dp _op_blend_rel_pt_c_dp
89
90#define _op_blend_rel_pt_c_dpan _op_blend_pt_c_dpan
91#define _op_blend_rel_pt_caa_dpan _op_blend_pt_caa_dpan
92
/*
 * Fill the CPU_C slots of op_blend_rel_pt_funcs for [SP_N][SM_N].
 * The DP entries resolve to _op_blend_rel_pt_c_dp; the DP_AN entries are
 * aliased (see the macros above) to the non-rel _op_blend_pt_c_dp.
 */
93static void
94init_blend_rel_color_pt_funcs_c(void)
95{
96 op_blend_rel_pt_funcs[SP_N][SM_N][SC][DP][CPU_C] = _op_blend_rel_pt_c_dp;
97 op_blend_rel_pt_funcs[SP_N][SM_N][SC_AA][DP][CPU_C] = _op_blend_rel_pt_caa_dp;
98
99 op_blend_rel_pt_funcs[SP_N][SM_N][SC][DP_AN][CPU_C] = _op_blend_rel_pt_c_dpan;
100 op_blend_rel_pt_funcs[SP_N][SM_N][SC_AA][DP_AN][CPU_C] = _op_blend_rel_pt_caa_dpan;
101}
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_color_i386.c b/src/lib/evas/common/evas_op_blend/op_blend_color_i386.c
new file mode 100644
index 0000000000..ae9888ffad
--- /dev/null
+++ b/src/lib/evas/common/evas_op_blend/op_blend_color_i386.c
@@ -0,0 +1,138 @@
1/* blend color --> dst */
2
3#ifdef BUILD_MMX
/*
 * MMX version of the colour span blend; mirrors the plain-C _op_blend_c_dp.
 * s and m are unused. The parameter c is overwritten with 256 - (c >> 24)
 * after its original value has been loaded into mm2.
 * NOTE(review): pxor_r2r, MOV_P2R, MOV_A2R, MUL4_256_R2R, paddw_r2r and
 * MOV_R2P are wrappers defined elsewhere. Presumably mm0 is a zero register
 * used for unpacking, mm2 holds the unpacked colour and mm3 the broadcast
 * inverse alpha -- confirm against the macro definitions.
 */
4static void
5_op_blend_c_dp_mmx(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
6 DATA32 *e = d + l; /* one past the last pixel of the span */
7 pxor_r2r(mm0, mm0);
8 MOV_P2R(c, mm2, mm0)
9 c = 256 - (c >> 24);
10 MOV_A2R(c, mm3)
11 while (d < e) {
12 MOV_P2R(*d, mm1, mm0)
13 MUL4_256_R2R(mm3, mm1)
14 paddw_r2r(mm2, mm1);
15 MOV_R2P(mm1, *d, mm0)
16 d++;
17 }
18}
19
20#define _op_blend_caa_dp_mmx _op_blend_c_dp_mmx
21
22#define _op_blend_c_dpan_mmx _op_blend_c_dp_mmx
23#define _op_blend_caa_dpan_mmx _op_blend_c_dpan_mmx
24
/*
 * Fill the CPU_MMX slots of op_blend_span_funcs for [SP_N][SM_N]. Through the
 * alias macros above, all four entries resolve to _op_blend_c_dp_mmx.
 */
25static void
26init_blend_color_span_funcs_mmx(void)
27{
28 op_blend_span_funcs[SP_N][SM_N][SC][DP][CPU_MMX] = _op_blend_c_dp_mmx;
29 op_blend_span_funcs[SP_N][SM_N][SC_AA][DP][CPU_MMX] = _op_blend_caa_dp_mmx;
30
31 op_blend_span_funcs[SP_N][SM_N][SC][DP_AN][CPU_MMX] = _op_blend_c_dpan_mmx;
32 op_blend_span_funcs[SP_N][SM_N][SC_AA][DP_AN][CPU_MMX] = _op_blend_caa_dpan_mmx;
33}
34#endif
35
36#ifdef BUILD_MMX
/*
 * MMX single-pixel colour blend: the same macro sequence as one iteration of
 * _op_blend_c_dp_mmx. s and m are unused; c is overwritten with
 * 256 - (c >> 24) once its original value has been loaded into mm2.
 * NOTE(review): register roles are inferred from the macro names -- confirm.
 */
37static void
38_op_blend_pt_c_dp_mmx(DATA32 s EINA_UNUSED, DATA8 m EINA_UNUSED, DATA32 c, DATA32 *d) {
39 pxor_r2r(mm0, mm0);
40 MOV_P2R(c, mm2, mm0)
41 c = 256 - (c >> 24);
42 MOV_A2R(c, mm3)
43 MOV_P2R(*d, mm1, mm0)
44 MUL4_256_R2R(mm3, mm1)
45 paddw_r2r(mm2, mm1);
46 MOV_R2P(mm1, *d, mm0)
47}
48
49#define _op_blend_pt_caa_dp_mmx _op_blend_pt_c_dp_mmx
50
51#define _op_blend_pt_c_dpan_mmx _op_blend_pt_c_dp_mmx
52#define _op_blend_pt_caa_dpan_mmx _op_blend_pt_c_dpan_mmx
53
/*
 * Fill the CPU_MMX slots of op_blend_pt_funcs for [SP_N][SM_N]. Through the
 * alias macros above, all four entries resolve to _op_blend_pt_c_dp_mmx.
 */
54static void
55init_blend_color_pt_funcs_mmx(void)
56{
57 op_blend_pt_funcs[SP_N][SM_N][SC][DP][CPU_MMX] = _op_blend_pt_c_dp_mmx;
58 op_blend_pt_funcs[SP_N][SM_N][SC_AA][DP][CPU_MMX] = _op_blend_pt_caa_dp_mmx;
59
60 op_blend_pt_funcs[SP_N][SM_N][SC][DP_AN][CPU_MMX] = _op_blend_pt_c_dpan_mmx;
61 op_blend_pt_funcs[SP_N][SM_N][SC_AA][DP_AN][CPU_MMX] = _op_blend_pt_caa_dpan_mmx;
62}
63#endif
64/*-----*/
65
66/* blend_rel color -> dst */
67
68#ifdef BUILD_MMX
/*
 * MMX version of the rel colour span blend; mirrors the plain-C
 * _op_blend_rel_c_dp. s and m are unused; c is overwritten with
 * 256 - (c >> 24) after its original value has been loaded into mm2.
 * NOTE(review): the macros are defined elsewhere. Presumably MOV_RA2R copies
 * the alpha of the destination into mm4, MUL4_256_R2R scales mm1 by mm3/256
 * and MUL4_SYM_R2R leaves colour * destination-alpha in mm4 (mm5 = ALPHA_255
 * as its rounding constant) -- confirm.
 */
69static void
70_op_blend_rel_c_dp_mmx(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
71 DATA32 *e = d + l; /* one past the last pixel of the span */
72 pxor_r2r(mm0, mm0);
73 MOV_P2R(c, mm2, mm0)
74 c = 256 - (c >> 24);
75 MOV_A2R(c, mm3)
76 MOV_A2R(ALPHA_255, mm5)
77 while (d < e) {
78 MOV_P2R(*d, mm1, mm0)
79 MOV_RA2R(mm1, mm4)
80 MUL4_256_R2R(mm3, mm1)
81 MUL4_SYM_R2R(mm2, mm4, mm5)
82 paddw_r2r(mm4, mm1);
83 MOV_R2P(mm1, *d, mm0)
84 d++;
85 }
86}
87
88#define _op_blend_rel_caa_dp_mmx _op_blend_rel_c_dp_mmx
89
90#define _op_blend_rel_c_dpan_mmx _op_blend_c_dpan_mmx
91#define _op_blend_rel_caa_dpan_mmx _op_blend_caa_dpan_mmx
92
/*
 * Fill the CPU_MMX slots of op_blend_rel_span_funcs for [SP_N][SM_N].
 * The DP entries resolve to _op_blend_rel_c_dp_mmx; the DP_AN entries are
 * aliased (see the macros above) to the non-rel _op_blend_c_dp_mmx.
 */
93static void
94init_blend_rel_color_span_funcs_mmx(void)
95{
96 op_blend_rel_span_funcs[SP_N][SM_N][SC][DP][CPU_MMX] = _op_blend_rel_c_dp_mmx;
97 op_blend_rel_span_funcs[SP_N][SM_N][SC_AA][DP][CPU_MMX] = _op_blend_rel_caa_dp_mmx;
98
99 op_blend_rel_span_funcs[SP_N][SM_N][SC][DP_AN][CPU_MMX] = _op_blend_rel_c_dpan_mmx;
100 op_blend_rel_span_funcs[SP_N][SM_N][SC_AA][DP_AN][CPU_MMX] = _op_blend_rel_caa_dpan_mmx;
101}
102#endif
103
104#ifdef BUILD_MMX
/*
 * MMX single-pixel rel colour blend. s and m are unused. Unlike the span
 * version, the inverse alpha is built in registers (mm6 = ALPHA_256 minus the
 * alpha taken from the colour) rather than in C.
 * NOTE(review): register roles are inferred from the macro names; presumably
 * mm2 ends up holding colour * destination-alpha and mm1 the destination
 * scaled by the inverse colour alpha -- confirm against the macro definitions.
 */
105static void
106_op_blend_rel_pt_c_dp_mmx(DATA32 s EINA_UNUSED, DATA8 m EINA_UNUSED, DATA32 c, DATA32 *d) {
107 pxor_r2r(mm0, mm0);
108 MOV_A2R(ALPHA_256, mm6)
109 MOV_A2R(ALPHA_255, mm5)
110
111 MOV_P2R(c, mm2, mm0)
112 MOV_RA2R(mm2, mm1)
113 psubw_r2r(mm1, mm6);
114
115 MOV_P2R(*d, mm1, mm0)
116 MOV_RA2R(mm1, mm4)
117 MUL4_256_R2R(mm6, mm1)
118
119 MUL4_SYM_R2R(mm4, mm2, mm5)
120 paddw_r2r(mm2, mm1);
121 MOV_R2P(mm1, *d, mm0)
122}
123
124#define _op_blend_rel_pt_caa_dp_mmx _op_blend_rel_pt_c_dp_mmx
125
126#define _op_blend_rel_pt_c_dpan_mmx _op_blend_pt_c_dpan_mmx
127#define _op_blend_rel_pt_caa_dpan_mmx _op_blend_pt_caa_dpan_mmx
128
/*
 * Fill the CPU_MMX slots of op_blend_rel_pt_funcs for [SP_N][SM_N].
 * The DP entries resolve to _op_blend_rel_pt_c_dp_mmx; the DP_AN entries are
 * aliased (see the macros above) to the non-rel _op_blend_pt_c_dp_mmx.
 */
129static void
130init_blend_rel_color_pt_funcs_mmx(void)
131{
132 op_blend_rel_pt_funcs[SP_N][SM_N][SC][DP][CPU_MMX] = _op_blend_rel_pt_c_dp_mmx;
133 op_blend_rel_pt_funcs[SP_N][SM_N][SC_AA][DP][CPU_MMX] = _op_blend_rel_pt_caa_dp_mmx;
134
135 op_blend_rel_pt_funcs[SP_N][SM_N][SC][DP_AN][CPU_MMX] = _op_blend_rel_pt_c_dpan_mmx;
136 op_blend_rel_pt_funcs[SP_N][SM_N][SC_AA][DP_AN][CPU_MMX] = _op_blend_rel_pt_caa_dpan_mmx;
137}
138#endif
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c
new file mode 100644
index 0000000000..8512bb4444
--- /dev/null
+++ b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c
@@ -0,0 +1,223 @@
1/* blend color --> dst */
2
3#ifdef BUILD_NEON
/*
 * NEON version of the colour span blend: for each of the l pixels at d,
 * every byte becomes sat(c_byte + round(d_byte * (255 - alpha(c)) / 256)).
 * s and m are unused.
 *
 * Layout of the asm:
 *   setup       q6 = c in every lane; q7 = (255 - alpha) in every byte
 *   singleloop  one pixel, taken when d is not 8-byte aligned
 *   dualloop    two pixels at a time until d is 32-byte aligned
 *   quadloop    eight pixels (32 bytes) per iteration
 *   loopout     tail: pairs of pixels, then at most one single pixel
 *
 * The asm advances d and rewrites e and tmp, so all three are declared as
 * read-write ("+r") operands; GCC does not allow an input-only operand to be
 * modified. "cc" is clobbered because cmp/ands change the condition flags.
 */
4static void
5_op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
6 DATA32 *e = d + l, *tmp = 0;
7#define AP "B_C_DP"
8 asm volatile (
9 ".fpu neon \n\t"
10 "vdup.u32 q6, %[c] \n\t"
11 "vmov.i8 q5, #1 \n\t"
12 "vmvn.u8 q7,q6 \n\t"
13 "vshr.u32 q7, q7, $0x18 \n\t"
14 "vmul.u32 q7,q5, q7 \n\t"
15 "bic %[e], #3 \n\t"
16 "bic %[d], #3 \n\t"
17
18 AP "loopchoose: \n\t"
19 // If aligned already - straight to quads
20 "andS %[tmp], %[d],$0x1f \n\t"
21 "beq "AP"quadloops \n\t"
22
23 "andS %[tmp], %[d],$0x4 \n\t"
24 "beq "AP"dualloop \n\t"
25
26 // Only ever executes once, fall through to dual
27 AP "singleloop: \n\t"
28 // Load one pixel into lane 0 of d0
29 "vld1.32 d0[0], [%[d]] \n\t"
30 // Blend it; saturating add like every other path
31 "vmull.u8 q0, d0, d14 \n\t"
32 "vqrshrn.u16 d0, q0, #8 \n\t"
33 "vqadd.u8 d0, d12, d0 \n\t"
34 "vst1.32 d0[0], [%[d]] \n\t"
35
36 "add %[d], #4 \n\t"
37
38 // Can we go the fast path?
39 "andS %[tmp], %[d],$0x1f \n\t"
40 "beq "AP"quadloops \n\t"
41
42 AP "dualloop: \n\t"
43 "sub %[tmp], %[e], %[d] \n\t"
44 "cmp %[tmp], #32 \n\t"
45 "blt "AP"loopout \n\t"
46
47
48 AP "dualloopint: \n\t"
49 "vldr.32 d0, [%[d]] \n\t"
50 "vmull.u8 q1, d0, d14 \n\t"
51 "vqrshrn.u16 d0, q1, #8 \n\t"
52 "vqadd.u8 d0, d0, d12 \n\t"
53
54 "vstm %[d]!, {d0} \n\t"
55
56 "ands %[tmp], %[d], $0x1f \n\t"
57 "bne "AP"dualloopint \n\t"
58
59 AP "quadloops: \n\t"
60 "sub %[tmp], %[e], %[d] \n\t"
61 "cmp %[tmp], #32 \n\t"
62 "blt "AP"loopout \n\t"
63
64 "sub %[tmp],%[e],#31 \n\t"
65
66 AP "quadloopint:\n\t"
67 "vldm %[d], {d0,d1,d2,d3} \n\t"
68
69 "vmull.u8 q2, d0, d14 \n\t"
70 "vmull.u8 q3, d1, d15 \n\t"
71 "vmull.u8 q4, d2, d14 \n\t"
72 "vmull.u8 q5, d3, d15 \n\t"
73
74 "vqrshrn.u16 d0, q2, #8 \n\t"
75 "vqrshrn.u16 d1, q3, #8 \n\t"
76 "vqrshrn.u16 d2, q4, #8 \n\t"
77 "vqrshrn.u16 d3, q5, #8 \n\t"
78
79 "vqadd.u8 q0, q6, q0 \n\t"
80 "vqadd.u8 q1, q6, q1 \n\t"
81
82 "vstm %[d]!, {d0,d1,d2,d3} \n\t"
83
84 "cmp %[tmp], %[d]\n\t"
85 "bhi "AP"quadloopint\n\t"
86
87 AP "loopout: \n\t"
88 "cmp %[d], %[e]\n\t"
89 "beq "AP"done\n\t"
90 "sub %[tmp],%[e], %[d] \n\t"
91 "cmp %[tmp],#8 \n\t"
92 "blt "AP"singleloop2 \n\t"
93
94 AP "dualloop2: \n\t"
95 "sub %[tmp],%[e],$0x7 \n\t"
96 AP "dualloop2int: \n\t"
97 "vldr.64 d0, [%[d]] \n\t"
98 "vmull.u8 q1, d0, d14 \n\t"
99 "vqrshrn.u16 d0, q1, #8 \n\t"
100 "vqadd.u8 d0, d0, d12 \n\t"
101
102 "vstr.64 d0, [%[d]] \n\t"
103
104 "add %[d], #8 \n\t"
105 "cmp %[tmp], %[d] \n\t"
106 "bhi "AP"dualloop2int \n\t"
107
108 // Single ??
109 "cmp %[e], %[d] \n\t"
110 "beq "AP"done \n\t"
111
112 AP "singleloop2: \n\t"
113 "vld1.32 d0[0], [%[d]] \n\t"
114 "vmull.u8 q1, d0, d14 \n\t"
115 "vqrshrn.u16 d0, q1, #8 \n\t"
116 "vqadd.u8 d0, d0, d12 \n\t"
117
118 "vst1.32 d0[0], [%[d]] \n\t"
119
120 AP "done:\n\t"
121
122 : [e] "+r" (e), [d] "+r" (d), [tmp] "+r" (tmp) // read-write: modified by the asm
123 // Input
124 : [c] "r" (c)
125 : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","cc","memory" // clobbered
126
127 );
128#undef AP
129
130}
131
132#define _op_blend_caa_dp_neon _op_blend_c_dp_neon
133
134#define _op_blend_c_dpan_neon _op_blend_c_dp_neon
135#define _op_blend_caa_dpan_neon _op_blend_c_dpan_neon
136
/*
 * Fill the CPU_NEON slots of op_blend_span_funcs for [SP_N][SM_N]. Through the
 * alias macros above, all four entries resolve to _op_blend_c_dp_neon.
 */
137static void
138init_blend_color_span_funcs_neon(void)
139{
140 op_blend_span_funcs[SP_N][SM_N][SC][DP][CPU_NEON] = _op_blend_c_dp_neon;
141 op_blend_span_funcs[SP_N][SM_N][SC_AA][DP][CPU_NEON] = _op_blend_caa_dp_neon;
142
143 op_blend_span_funcs[SP_N][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_c_dpan_neon;
144 op_blend_span_funcs[SP_N][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_caa_dpan_neon;
145}
146#endif
147
148#ifdef BUILD_NEON
/*
 * Single-pixel colour blend registered for CPU_NEON. The body is plain C,
 * identical to _op_blend_pt_c_dp: *d = c + MUL_256(256 - (c >> 24), *d).
 * The incoming value of s is ignored, but the parameter is reused as scratch,
 * so it is not marked EINA_UNUSED. m is unused.
 */
149static void
150_op_blend_pt_c_dp_neon(DATA32 s, DATA8 m EINA_UNUSED, DATA32 c, DATA32 *d) {
151 s = 256 - (c >> 24); /* overwrite s with 256 minus the top byte of c */
152 *d = c + MUL_256(s, *d);
153}
154
155#define _op_blend_pt_caa_dp_neon _op_blend_pt_c_dp_neon
156
157#define _op_blend_pt_c_dpan_neon _op_blend_pt_c_dp_neon
158#define _op_blend_pt_caa_dpan_neon _op_blend_pt_c_dpan_neon
159
/*
 * Fill the CPU_NEON slots of op_blend_pt_funcs for [SP_N][SM_N]. Through the
 * alias macros above, all four entries resolve to _op_blend_pt_c_dp_neon.
 */
160static void
161init_blend_color_pt_funcs_neon(void)
162{
163 op_blend_pt_funcs[SP_N][SM_N][SC][DP][CPU_NEON] = _op_blend_pt_c_dp_neon;
164 op_blend_pt_funcs[SP_N][SM_N][SC_AA][DP][CPU_NEON] = _op_blend_pt_caa_dp_neon;
165
166 op_blend_pt_funcs[SP_N][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_pt_c_dpan_neon;
167 op_blend_pt_funcs[SP_N][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_pt_caa_dpan_neon;
168}
169#endif
170/*-----*/
171
172/* blend_rel color -> dst */
173
174#ifdef BUILD_NEON
/*
 * Rel colour span blend registered for CPU_NEON. The body contains no NEON
 * code; it is the same C loop as _op_blend_rel_c_dp. s and m are unused.
 * NOTE(review): MUL_SYM, MUL_256 and UNROLL8_PLD_WHILE are defined elsewhere.
 */
175static void
176_op_blend_rel_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
177 DATA32 *e;
178 int alpha = 256 - (c >> 24); /* 256 minus the top byte of c */
179 UNROLL8_PLD_WHILE(d, l, e,
180 {
181 *d = MUL_SYM(*d >> 24, c) + MUL_256(alpha, *d); /* both terms read the old *d */
182 d++;
183 });
184}
185
186#define _op_blend_rel_caa_dp_neon _op_blend_rel_c_dp_neon
187
188#define _op_blend_rel_c_dpan_neon _op_blend_c_dpan_neon
189#define _op_blend_rel_caa_dpan_neon _op_blend_caa_dpan_neon
190
/*
 * Fill the CPU_NEON slots of op_blend_rel_span_funcs for [SP_N][SM_N].
 * The DP entries resolve to _op_blend_rel_c_dp_neon; the DP_AN entries are
 * aliased (see the macros above) to the non-rel _op_blend_c_dp_neon.
 */
191static void
192init_blend_rel_color_span_funcs_neon(void)
193{
194 op_blend_rel_span_funcs[SP_N][SM_N][SC][DP][CPU_NEON] = _op_blend_rel_c_dp_neon;
195 op_blend_rel_span_funcs[SP_N][SM_N][SC_AA][DP][CPU_NEON] = _op_blend_rel_caa_dp_neon;
196
197 op_blend_rel_span_funcs[SP_N][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_rel_c_dpan_neon;
198 op_blend_rel_span_funcs[SP_N][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_rel_caa_dpan_neon;
199}
200#endif
201
202#ifdef BUILD_NEON
/*
 * Single-pixel rel colour blend registered for CPU_NEON. The body is plain C,
 * identical to _op_blend_rel_pt_c_dp:
 *   *d = MUL_SYM(*d >> 24, c) + MUL_256(256 - (c >> 24), *d).
 * The incoming value of s is ignored, but the parameter is reused to hold the
 * top byte of *d, so it is not marked EINA_UNUSED. m is unused.
 */
203static void
204_op_blend_rel_pt_c_dp_neon(DATA32 s, DATA8 m EINA_UNUSED, DATA32 c, DATA32 *d) {
205 s = *d >> 24; /* top byte of the destination, captured before *d is overwritten */
206 *d = MUL_SYM(s, c) + MUL_256(256 - (c >> 24), *d);
207}
208
209#define _op_blend_rel_pt_caa_dp_neon _op_blend_rel_pt_c_dp_neon
210
211#define _op_blend_rel_pt_c_dpan_neon _op_blend_pt_c_dpan_neon
212#define _op_blend_rel_pt_caa_dpan_neon _op_blend_pt_caa_dpan_neon
213
/*
 * Fill the CPU_NEON slots of op_blend_rel_pt_funcs for [SP_N][SM_N].
 * The DP entries resolve to _op_blend_rel_pt_c_dp_neon; the DP_AN entries are
 * aliased (see the macros above) to the non-rel _op_blend_pt_c_dp_neon.
 */
214static void
215init_blend_rel_color_pt_funcs_neon(void)
216{
217 op_blend_rel_pt_funcs[SP_N][SM_N][SC][DP][CPU_NEON] = _op_blend_rel_pt_c_dp_neon;
218 op_blend_rel_pt_funcs[SP_N][SM_N][SC_AA][DP][CPU_NEON] = _op_blend_rel_pt_caa_dp_neon;
219
220 op_blend_rel_pt_funcs[SP_N][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_rel_pt_c_dpan_neon;
221 op_blend_rel_pt_funcs[SP_N][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_rel_pt_caa_dpan_neon;
222}
223#endif
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_color_sse3.c b/src/lib/evas/common/evas_op_blend/op_blend_color_sse3.c
new file mode 100644
index 0000000000..02321b5bd8
--- /dev/null
+++ b/src/lib/evas/common/evas_op_blend/op_blend_color_sse3.c
@@ -0,0 +1,167 @@
1/* blend color -> dst */
2
3#ifdef BUILD_SSE3
4
/*
 * SSE3 version of the colour span blend. s and m are unused.
 * The three brace blocks are the bodies handed to LOOP_ALIGNED_U1_A48_SSE3:
 * a one-pixel scalar step (UOP), a four-pixel step (A4OP) and an eight-pixel
 * step (A8OP). Each block advances d and decrements l itself.
 * NOTE(review): LOOP_ALIGNED_U1_A48_SSE3 and mul_256_sse3 are defined
 * elsewhere. The vector blocks use _mm_load_si128/_mm_store_si128, which need
 * 16-byte aligned addresses, so the macro presumably runs UOP until d is
 * aligned -- confirm. The init function in this file leaves this function
 * unregistered for the SC slots behind a "BUGGY" FIXME.
 */
5static void
6_op_blend_c_dp_sse3(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
7
8 DATA32 a = 256 - (c >> 24); /* 256 minus the top byte of c */
9
10 const __m128i c_packed = _mm_set_epi32(c, c, c, c); /* c in all four lanes */
11 const __m128i a_packed = _mm_set_epi32(a, a, a, a); /* a in all four lanes */
12
13 LOOP_ALIGNED_U1_A48_SSE3(d, l,
14 { /* UOP */
15
16 *d = c + MUL_256(a, *d);
17 d++; l--;
18 },
19 { /* A4OP */
20
21 __m128i d0 = _mm_load_si128((__m128i *)d);
22
23 d0 = mul_256_sse3(a_packed, d0);
24 d0 = _mm_add_epi32(d0, c_packed);
25
26 _mm_store_si128((__m128i *)d, d0);
27
28 d += 4; l -= 4;
29 },
30 { /* A8OP */
31
32 __m128i d0 = _mm_load_si128((__m128i *)d);
33 __m128i d1 = _mm_load_si128((__m128i *)(d+4));
34
35 d0 = mul_256_sse3(a_packed, d0);
36 d1 = mul_256_sse3(a_packed, d1);
37
38 d0 = _mm_add_epi32(d0, c_packed);
39 d1 = _mm_add_epi32(d1, c_packed);
40
41 _mm_store_si128((__m128i *)d, d0);
42 _mm_store_si128((__m128i *)(d+4), d1);
43
44 d += 8; l -= 8;
45 })
46}
47
48#define _op_blend_caa_dp_sse3 _op_blend_c_dp_sse3
49
50#define _op_blend_c_dpan_sse3 _op_blend_c_dp_sse3
51#define _op_blend_caa_dpan_sse3 _op_blend_c_dpan_sse3
52
/*
 * Fill the CPU_SSE3 slots of op_blend_span_funcs for [SP_N][SM_N].
 * Only the SC_AA entries are assigned (both resolve to _op_blend_c_dp_sse3);
 * the SC entries are deliberately left commented out, see the FIXME notes.
 */
53static void
54init_blend_color_span_funcs_sse3(void)
55{
56// FIXME: BUGGY BUGGY Core i5 750 (32bit), 4.5.2 (Ubuntu/Linaro 4.5.2-8ubuntu4), ello (text and rectangle)
57// op_blend_span_funcs[SP_N][SM_N][SC][DP][CPU_SSE3] = _op_blend_c_dp_sse3;
58 op_blend_span_funcs[SP_N][SM_N][SC_AA][DP][CPU_SSE3] = _op_blend_caa_dp_sse3;
59
60// FIXME: BUGGY BUGGY Core i5 750 (32bit), 4.5.2 (Ubuntu/Linaro 4.5.2-8ubuntu4), ello (text and rectangle)
61// op_blend_span_funcs[SP_N][SM_N][SC][DP_AN][CPU_SSE3] = _op_blend_c_dpan_sse3;
62 op_blend_span_funcs[SP_N][SM_N][SC_AA][DP_AN][CPU_SSE3] = _op_blend_caa_dpan_sse3;
63}
64
65#define _op_blend_pt_c_dp_sse3 NULL
66#define _op_blend_pt_caa_dp_sse3 _op_blend_pt_c_dp_sse3
67
68#define _op_blend_pt_c_dpan_sse3 _op_blend_pt_c_dp_sse3
69#define _op_blend_pt_caa_dpan_sse3 _op_blend_pt_c_dpan_sse3
70
71#define _op_blend_pt_c_dpas_sse3 _op_blend_pt_c_dp_sse3
72#define _op_blend_pt_caa_dpas_sse3 _op_blend_pt_c_dp_sse3
73
/*
 * Fill the CPU_SSE3 slots of op_blend_pt_funcs for [SP_N][SM_N].
 * There is no SSE3 single-pixel implementation: every alias used here expands
 * to NULL (see the macros above), so all four entries are set to NULL.
 */
74static void
75init_blend_color_pt_funcs_sse3(void)
76{
77 op_blend_pt_funcs[SP_N][SM_N][SC][DP][CPU_SSE3] = _op_blend_pt_c_dp_sse3;
78 op_blend_pt_funcs[SP_N][SM_N][SC_AA][DP][CPU_SSE3] = _op_blend_pt_caa_dp_sse3;
79
80 op_blend_pt_funcs[SP_N][SM_N][SC][DP_AN][CPU_SSE3] = _op_blend_pt_c_dpan_sse3;
81 op_blend_pt_funcs[SP_N][SM_N][SC_AA][DP_AN][CPU_SSE3] = _op_blend_pt_caa_dpan_sse3;
82}
83
84
85/*-----*/
86
87/* blend_rel color -> dst */
88
/*
 * SSE3 version of the rel colour span blend. s and m are unused.
 * As in _op_blend_c_dp_sse3, the three brace blocks are the one-, four- and
 * eight-pixel bodies handed to LOOP_ALIGNED_U1_A48_SSE3; each advances d and
 * decrements l itself. The vector blocks take each pixel's top byte with
 * _mm_srli_epi32(d, 24) and feed it to mul_sym_sse3 together with the colour.
 * NOTE(review): LOOP_ALIGNED_U1_A48_SSE3, mul_256_sse3 and mul_sym_sse3 are
 * defined elsewhere; the aligned loads/stores presumably rely on the macro
 * aligning d first -- confirm.
 */
89static void
90_op_blend_rel_c_dp_sse3(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
91
92 int alpha = 256 - (c >> 24); /* 256 minus the top byte of c */
93
94 const __m128i c_packed = _mm_set_epi32(c, c, c, c); /* c in all four lanes */
95 const __m128i alpha_packed = _mm_set_epi32(alpha, alpha, alpha, alpha);
96
97 LOOP_ALIGNED_U1_A48_SSE3(d, l,
98 { /* UOP */
99
100 *d = MUL_SYM(*d >> 24, c) + MUL_256(alpha, *d);
101 d++; l--;
102 },
103 { /* A4OP */
104
105 __m128i d0 = _mm_load_si128((__m128i *)d);
106
107 __m128i mul0 = mul_256_sse3(alpha_packed, d0);
108 __m128i sym0 = mul_sym_sse3(_mm_srli_epi32(d0, 24), c_packed);
109
110 d0 = _mm_add_epi32(mul0, sym0);
111
112 _mm_store_si128((__m128i *)d, d0);
113
114 d += 4; l -= 4;
115 },
116 { /* A8OP */
117
118 __m128i d0 = _mm_load_si128((__m128i *)d);
119 __m128i d1 = _mm_load_si128((__m128i *)(d+4));
120
121 __m128i mul0 = mul_256_sse3(alpha_packed, d0);
122 __m128i mul1 = mul_256_sse3(alpha_packed, d1);
123
124 __m128i sym0 = mul_sym_sse3(_mm_srli_epi32(d0, 24), c_packed);
125 __m128i sym1 = mul_sym_sse3(_mm_srli_epi32(d1, 24), c_packed);
126
127 d0 = _mm_add_epi32(mul0, sym0);
128 d1 = _mm_add_epi32(mul1, sym1);
129
130 _mm_store_si128((__m128i *)d, d0);
131 _mm_store_si128((__m128i *)(d+4), d1);
132
133 d += 8; l -= 8;
134 })
135}
136
137#define _op_blend_rel_caa_dp_sse3 _op_blend_rel_c_dp_sse3
138#define _op_blend_rel_c_dpan_sse3 _op_blend_c_dpan_sse3
139#define _op_blend_rel_caa_dpan_sse3 _op_blend_caa_dpan_sse3
140
/*
 * Fill the CPU_SSE3 slots of op_blend_rel_span_funcs for [SP_N][SM_N].
 * The DP entries resolve to _op_blend_rel_c_dp_sse3; the DP_AN entries are
 * aliased (see the macros above) to the non-rel _op_blend_c_dp_sse3.
 * NOTE(review): init_blend_color_span_funcs_sse3 leaves _op_blend_c_dp_sse3
 * unregistered for its SC slots behind a "BUGGY" FIXME, yet the SC/DP_AN
 * entry below still installs that same function -- confirm this is intended.
 */
141static void
142init_blend_rel_color_span_funcs_sse3(void)
143{
144 op_blend_rel_span_funcs[SP_N][SM_N][SC][DP][CPU_SSE3] = _op_blend_rel_c_dp_sse3;
145 op_blend_rel_span_funcs[SP_N][SM_N][SC_AA][DP][CPU_SSE3] = _op_blend_rel_caa_dp_sse3;
146
147 op_blend_rel_span_funcs[SP_N][SM_N][SC][DP_AN][CPU_SSE3] = _op_blend_rel_c_dpan_sse3;
148 op_blend_rel_span_funcs[SP_N][SM_N][SC_AA][DP_AN][CPU_SSE3] = _op_blend_rel_caa_dpan_sse3;
149}
150
151#define _op_blend_rel_pt_c_dp_sse3 NULL
152#define _op_blend_rel_pt_caa_dp_sse3 _op_blend_rel_pt_c_dp_sse3
153
154#define _op_blend_rel_pt_c_dpan_sse3 _op_blend_pt_c_dpan_sse3
155#define _op_blend_rel_pt_caa_dpan_sse3 _op_blend_pt_caa_dpan_sse3
156
/*
 * Fill the CPU_SSE3 slots of op_blend_rel_pt_funcs for [SP_N][SM_N].
 * There is no SSE3 single-pixel implementation: every alias used here expands
 * to NULL (see the macros above), so all four entries are set to NULL.
 */
157static void
158init_blend_rel_color_pt_funcs_sse3(void)
159{
160 op_blend_rel_pt_funcs[SP_N][SM_N][SC][DP][CPU_SSE3] = _op_blend_rel_pt_c_dp_sse3;
161 op_blend_rel_pt_funcs[SP_N][SM_N][SC_AA][DP][CPU_SSE3] = _op_blend_rel_pt_caa_dp_sse3;
162
163 op_blend_rel_pt_funcs[SP_N][SM_N][SC][DP_AN][CPU_SSE3] = _op_blend_rel_pt_c_dpan_sse3;
164 op_blend_rel_pt_funcs[SP_N][SM_N][SC_AA][DP_AN][CPU_SSE3] = _op_blend_rel_pt_caa_dpan_sse3;
165}
166
167#endif
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_.c b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_.c
new file mode 100644
index 0000000000..f0b6ef7582
--- /dev/null
+++ b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_.c
@@ -0,0 +1,177 @@
1/* blend mask x color -> dst */
2
/*
 * Blend the colour c, modulated per pixel by the 8-bit mask m, over a span of
 * l destination pixels. s is unused. m and d advance together.
 * Per pixel: mask 0 leaves *d alone; mask 255 blends c directly using the
 * precomputed inverse alpha; any other value first scales c by the mask
 * (MUL_SYM) and derives the inverse alpha from the scaled colour.
 * NOTE(review): MUL_SYM, MUL_256 and UNROLL8_PLD_WHILE are defined elsewhere.
 */
3static void
4_op_blend_mas_c_dp(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
5 DATA32 *e;
6 int alpha = 256 - (c >> 24); /* inverse alpha of the unmasked colour */
7 UNROLL8_PLD_WHILE(d, l, e,
8 {
9 DATA32 a = *m;
10 switch(a)
11 {
12 case 0:
13 break;
14 case 255:
15 *d = c + MUL_256(alpha, *d);
16 break;
17 default:
18 {
19 DATA32 mc = MUL_SYM(a, c); /* colour scaled by the mask value */
20 a = 256 - (mc >> 24); /* a now holds the inverse alpha of mc */
21 *d = mc + MUL_256(a, *d);
22 }
23 break;
24 }
25 m++; d++;
26 });
27}
28
/*
 * Masked colour span blend that never looks at the alpha of c: the mask value
 * alone decides the mix. s is unused. m and d advance together.
 * Per pixel: mask 0 leaves *d alone; mask 255 stores c; any other value
 * stores INTERP_256(mask + 1, c, *d).
 * NOTE(review): INTERP_256 and UNROLL8_PLD_WHILE are defined elsewhere;
 * presumably INTERP_256 mixes c and *d by (mask + 1)/256 -- confirm.
 */
29static void
30_op_blend_mas_can_dp(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
31 DATA32 *e;
32 int alpha;
33 UNROLL8_PLD_WHILE(d, l, e,
34 {
35 alpha = *m;
36 switch(alpha)
37 {
38 case 0:
39 break;
40 case 255:
41 *d = c;
42 break;
43 default:
44 alpha++; /* 1..254 becomes 2..255 */
45 *d = INTERP_256(alpha, c, *d);
46 break;
47 }
48 m++; d++;
49 });
50}
51
52#define _op_blend_mas_cn_dp _op_blend_mas_can_dp
53#define _op_blend_mas_caa_dp _op_blend_mas_c_dp
54
55#define _op_blend_mas_c_dpan _op_blend_mas_c_dp
56#define _op_blend_mas_cn_dpan _op_blend_mas_cn_dp
57#define _op_blend_mas_can_dpan _op_blend_mas_can_dp
58#define _op_blend_mas_caa_dpan _op_blend_mas_caa_dp
59
/*
 * Fill the CPU_C slots of op_blend_span_funcs for the [SP_N][SM_AS] masked
 * colour case. Through the alias macros above, the SC and SC_AA entries
 * resolve to _op_blend_mas_c_dp and the SC_N and SC_AN entries to
 * _op_blend_mas_can_dp, for both DP and DP_AN.
 */
60static void
61init_blend_mask_color_span_funcs_c(void)
62{
63 op_blend_span_funcs[SP_N][SM_AS][SC][DP][CPU_C] = _op_blend_mas_c_dp;
64 op_blend_span_funcs[SP_N][SM_AS][SC_N][DP][CPU_C] = _op_blend_mas_cn_dp;
65 op_blend_span_funcs[SP_N][SM_AS][SC_AN][DP][CPU_C] = _op_blend_mas_can_dp;
66 op_blend_span_funcs[SP_N][SM_AS][SC_AA][DP][CPU_C] = _op_blend_mas_caa_dp;
67
68 op_blend_span_funcs[SP_N][SM_AS][SC][DP_AN][CPU_C] = _op_blend_mas_c_dpan;
69 op_blend_span_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_C] = _op_blend_mas_cn_dpan;
70 op_blend_span_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_C] = _op_blend_mas_can_dpan;
71 op_blend_span_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_C] = _op_blend_mas_caa_dpan;
72}
73
/*
 * Single-pixel form of the masked colour blend: scale c by the mask value m
 * (MUL_SYM), then blend the result over *d.
 * The incoming value of s is ignored; it is reused to hold the masked colour.
 * The inverse alpha is 256 - alpha, as in the span version, so a fully
 * transparent masked colour leaves *d untouched. It is kept in the 32-bit c
 * (no longer needed at that point) because 256 does not fit in the 8-bit m.
 */
74static void
75_op_blend_pt_mas_c_dp(DATA32 s, DATA8 m, DATA32 c, DATA32 *d) {
76 s = MUL_SYM(m, c); /* colour scaled by the mask value */
77 c = 256 - (s >> 24); /* inverse alpha of the masked colour, range 1..256 */
78 *d = s + MUL_256(c, *d);
79}
80
/*
 * Single-pixel form of _op_blend_mas_can_dp: stores INTERP_256(m + 1, c, *d).
 * Unlike the span version there is no special case for m == 0 or m == 255.
 * s is unused.
 */
81static void
82_op_blend_pt_mas_can_dp(DATA32 s EINA_UNUSED, DATA8 m, DATA32 c, DATA32 *d) {
83 *d = INTERP_256(m + 1, c, *d);
84}
85
86#define _op_blend_pt_mas_cn_dp _op_blend_pt_mas_can_dp
87#define _op_blend_pt_mas_caa_dp _op_blend_pt_mas_c_dp
88
89#define _op_blend_pt_mas_c_dpan _op_blend_pt_mas_c_dp
90#define _op_blend_pt_mas_cn_dpan _op_blend_pt_mas_cn_dp
91#define _op_blend_pt_mas_can_dpan _op_blend_pt_mas_can_dp
92#define _op_blend_pt_mas_caa_dpan _op_blend_pt_mas_caa_dp
93
/*
 * Fill the CPU_C slots of op_blend_pt_funcs for the [SP_N][SM_AS] masked
 * colour case. Through the alias macros above, the SC and SC_AA entries
 * resolve to _op_blend_pt_mas_c_dp and the SC_N and SC_AN entries to
 * _op_blend_pt_mas_can_dp, for both DP and DP_AN.
 */
94static void
95init_blend_mask_color_pt_funcs_c(void)
96{
97 op_blend_pt_funcs[SP_N][SM_AS][SC][DP][CPU_C] = _op_blend_pt_mas_c_dp;
98 op_blend_pt_funcs[SP_N][SM_AS][SC_N][DP][CPU_C] = _op_blend_pt_mas_cn_dp;
99 op_blend_pt_funcs[SP_N][SM_AS][SC_AN][DP][CPU_C] = _op_blend_pt_mas_can_dp;
100 op_blend_pt_funcs[SP_N][SM_AS][SC_AA][DP][CPU_C] = _op_blend_pt_mas_caa_dp;
101
102 op_blend_pt_funcs[SP_N][SM_AS][SC][DP_AN][CPU_C] = _op_blend_pt_mas_c_dpan;
103 op_blend_pt_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_C] = _op_blend_pt_mas_cn_dpan;
104 op_blend_pt_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_C] = _op_blend_pt_mas_can_dpan;
105 op_blend_pt_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_C] = _op_blend_pt_mas_caa_dpan;
106}
107
108/*-----*/
109
110/* blend_rel mask x color --> dst */
111
/*
 * Rel blend of the colour c, modulated per pixel by the 8-bit mask m, over a
 * span of l destination pixels. s is unused. m and d advance together.
 * There are no fast paths for mask 0 or 255: every pixel scales c by the mask,
 * multiplies that by the top byte of *d, and adds *d scaled by the inverse
 * alpha of the masked colour.
 * NOTE(review): MUL_SYM, MUL_256 and UNROLL8_PLD_WHILE are defined elsewhere.
 */
112static void
113_op_blend_rel_mas_c_dp(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
114 DATA32 *e;
115 int alpha;
116 UNROLL8_PLD_WHILE(d, l, e,
117 {
118 DATA32 mc = MUL_SYM(*m, c); /* colour scaled by the mask value */
119 alpha = 256 - (mc >> 24);
120 *d = MUL_SYM(*d >> 24, mc) + MUL_256(alpha, *d); /* both terms read the old *d */
121 d++;
122 m++;
123 });
124}
125
126#define _op_blend_rel_mas_cn_dp _op_blend_rel_mas_c_dp
127#define _op_blend_rel_mas_can_dp _op_blend_rel_mas_c_dp
128#define _op_blend_rel_mas_caa_dp _op_blend_rel_mas_c_dp
129
130#define _op_blend_rel_mas_c_dpan _op_blend_mas_c_dpan
131#define _op_blend_rel_mas_cn_dpan _op_blend_mas_cn_dpan
132#define _op_blend_rel_mas_can_dpan _op_blend_mas_can_dpan
133#define _op_blend_rel_mas_caa_dpan _op_blend_mas_caa_dpan
134
/*
 * Fill the CPU_C slots of op_blend_rel_span_funcs for the [SP_N][SM_AS]
 * masked colour case. Through the alias macros above, every DP entry resolves
 * to _op_blend_rel_mas_c_dp, while the DP_AN entries fall back to the non-rel
 * blenders (_op_blend_mas_c_dp for SC and SC_AA, _op_blend_mas_can_dp for
 * SC_N and SC_AN).
 * Each slot is filled with the alias named after that slot, so the SC_N entry
 * uses the _cn_ alias like the sibling tables do.
 */
135static void
136init_blend_rel_mask_color_span_funcs_c(void)
137{
138 op_blend_rel_span_funcs[SP_N][SM_AS][SC][DP][CPU_C] = _op_blend_rel_mas_c_dp;
139 op_blend_rel_span_funcs[SP_N][SM_AS][SC_N][DP][CPU_C] = _op_blend_rel_mas_cn_dp;
140 op_blend_rel_span_funcs[SP_N][SM_AS][SC_AN][DP][CPU_C] = _op_blend_rel_mas_can_dp;
141 op_blend_rel_span_funcs[SP_N][SM_AS][SC_AA][DP][CPU_C] = _op_blend_rel_mas_caa_dp;
142
143 op_blend_rel_span_funcs[SP_N][SM_AS][SC][DP_AN][CPU_C] = _op_blend_rel_mas_c_dpan;
144 op_blend_rel_span_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_C] = _op_blend_rel_mas_cn_dpan;
145 op_blend_rel_span_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_C] = _op_blend_rel_mas_can_dpan;
146 op_blend_rel_span_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_C] = _op_blend_rel_mas_caa_dpan;
147}
148
/*
 * Single-pixel form of _op_blend_rel_mas_c_dp.
 * The incoming value of s is ignored; it is reused to hold the masked colour,
 * and c is then reused to hold the inverse alpha of that masked colour.
 */
149static void
150_op_blend_rel_pt_mas_c_dp(DATA32 s, DATA8 m, DATA32 c, DATA32 *d) {
151 s = MUL_SYM(m, c); /* colour scaled by the mask value */
152 c = 256 - (s >> 24); /* c is no longer the colour from here on */
153 *d = MUL_SYM(*d >> 24, s) + MUL_256(c, *d);
154}
155
156#define _op_blend_rel_pt_mas_cn_dp _op_blend_rel_pt_mas_c_dp
157#define _op_blend_rel_pt_mas_can_dp _op_blend_rel_pt_mas_c_dp
158#define _op_blend_rel_pt_mas_caa_dp _op_blend_rel_pt_mas_c_dp
159
160#define _op_blend_rel_pt_mas_c_dpan _op_blend_pt_mas_c_dpan
161#define _op_blend_rel_pt_mas_cn_dpan _op_blend_pt_mas_cn_dpan
162#define _op_blend_rel_pt_mas_can_dpan _op_blend_pt_mas_can_dpan
163#define _op_blend_rel_pt_mas_caa_dpan _op_blend_pt_mas_caa_dpan
164
/*
 * Fill the CPU_C slots of op_blend_rel_pt_funcs for the [SP_N][SM_AS] masked
 * colour case. Through the alias macros above, every DP entry resolves to
 * _op_blend_rel_pt_mas_c_dp, while the DP_AN entries fall back to the non-rel
 * point blenders (_op_blend_pt_mas_c_dp for SC and SC_AA,
 * _op_blend_pt_mas_can_dp for SC_N and SC_AN).
 */
165static void
166init_blend_rel_mask_color_pt_funcs_c(void)
167{
168 op_blend_rel_pt_funcs[SP_N][SM_AS][SC][DP][CPU_C] = _op_blend_rel_pt_mas_c_dp;
169 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_N][DP][CPU_C] = _op_blend_rel_pt_mas_cn_dp;
170 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AN][DP][CPU_C] = _op_blend_rel_pt_mas_can_dp;
171 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AA][DP][CPU_C] = _op_blend_rel_pt_mas_caa_dp;
172
173 op_blend_rel_pt_funcs[SP_N][SM_AS][SC][DP_AN][CPU_C] = _op_blend_rel_pt_mas_c_dpan;
174 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_C] = _op_blend_rel_pt_mas_cn_dpan;
175 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_C] = _op_blend_rel_pt_mas_can_dpan;
176 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_C] = _op_blend_rel_pt_mas_caa_dpan;
177}
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_i386.c b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_i386.c
new file mode 100644
index 0000000000..0031a91194
--- /dev/null
+++ b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_i386.c
@@ -0,0 +1,251 @@
1/* blend mask x color -> dst */
2
3#ifdef BUILD_MMX
4static void
5_op_blend_mas_c_dp_mmx(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
6 DATA32 *e = d + l;
7 pxor_r2r(mm0, mm0);
8 MOV_A2R(ALPHA_256, mm6)
9 MOV_P2R(c, mm2, mm0)
10 c = 256 - (c >> 24);
11 MOV_A2R(c, mm4)
12 while (d < e) {
13 l = *m;
14 switch(l)
15 {
16 case 0:
17 break;
18 case 255:
19 MOV_P2R(*d, mm1, mm0)
20 MUL4_256_R2R(mm4, mm1)
21 paddw_r2r(mm2, mm1);
22 MOV_R2P(mm1, *d, mm0)
23 break;
24 default:
25 l++;
26 MOV_A2R(l, mm3)
27 MUL4_256_R2R(mm2, mm3)
28
29 MOV_RA2R(mm3, mm1)
30 movq_r2r(mm6, mm7);
31 psubw_r2r(mm1, mm7);
32
33 MOV_P2R(*d, mm1, mm0)
34 MUL4_256_R2R(mm7, mm1)
35
36 paddw_r2r(mm3, mm1);
37 MOV_R2P(mm1, *d, mm0)
38 break;
39 }
40 m++; d++;
41 }
42}
43
44static void
45_op_blend_mas_can_dp_mmx(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
46 DATA32 *e = d + l;
47 pxor_r2r(mm0, mm0);
48 MOV_P2R(c, mm2, mm0)
49 MOV_A2R(ALPHA_255, mm5)
50 while (d < e) {
51 l = *m;
52 switch(l)
53 {
54 case 0:
55 break;
56 case 255:
57 *d = c;
58 break;
59 default:
60 l++;
61 MOV_A2R(l, mm3)
62 MOV_P2R(*d, mm1, mm0)
63 movq_r2r(mm2, mm4);
64 INTERP_256_R2R(mm3, mm4, mm1, mm5)
65 MOV_R2P(mm1, *d, mm0)
66 break;
67 }
68 m++; d++;
69 }
70}
71
/* Only two MMX span implementations exist: cn shares the can function and
 * caa shares the generic c function. */
72#define _op_blend_mas_cn_dp_mmx _op_blend_mas_can_dp_mmx
73#define _op_blend_mas_caa_dp_mmx _op_blend_mas_c_dp_mmx
74
/* The DP_AN destination variants reuse the DP implementations unchanged. */
75#define _op_blend_mas_c_dpan_mmx _op_blend_mas_c_dp_mmx
76#define _op_blend_mas_cn_dpan_mmx _op_blend_mas_cn_dp_mmx
77#define _op_blend_mas_can_dpan_mmx _op_blend_mas_can_dp_mmx
78#define _op_blend_mas_caa_dpan_mmx _op_blend_mas_caa_dp_mmx
79
/*
 * Fill the CPU_MMX slots of op_blend_span_funcs for the mask x color case
 * (first two indices fixed to SP_N / SM_AS), one entry per color variant
 * and destination variant.
 */
80static void
81init_blend_mask_color_span_funcs_mmx(void)
82{
83 op_blend_span_funcs[SP_N][SM_AS][SC][DP][CPU_MMX] = _op_blend_mas_c_dp_mmx;
84 op_blend_span_funcs[SP_N][SM_AS][SC_N][DP][CPU_MMX] = _op_blend_mas_cn_dp_mmx;
85 op_blend_span_funcs[SP_N][SM_AS][SC_AN][DP][CPU_MMX] = _op_blend_mas_can_dp_mmx;
86 op_blend_span_funcs[SP_N][SM_AS][SC_AA][DP][CPU_MMX] = _op_blend_mas_caa_dp_mmx;
87
88 op_blend_span_funcs[SP_N][SM_AS][SC][DP_AN][CPU_MMX] = _op_blend_mas_c_dpan_mmx;
89 op_blend_span_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_MMX] = _op_blend_mas_cn_dpan_mmx;
90 op_blend_span_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_MMX] = _op_blend_mas_can_dpan_mmx;
91 op_blend_span_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_MMX] = _op_blend_mas_caa_dpan_mmx;
92}
93#endif
94
95#ifdef BUILD_MMX
/*
 * blend mask x color -> dst, MMX single-pixel (point) version.
 *
 * s  - ignored on entry; reused as scratch for (m + 1)
 * m  - mask value for the pixel
 * c  - color to blend
 * d  - destination pixel, updated in place
 *
 * Mirrors the "default" branch of the MMX span function: scale the color by
 * (m + 1), subtract MOV_RA2R of that result from the ALPHA_256 register,
 * scale the destination by the difference and add the scaled color.
 * NOTE(review): macro semantics are defined elsewhere; descriptions below are
 * inferred from their names - confirm against the MMX macro header.
 */
96static void
97_op_blend_pt_mas_c_dp_mmx(DATA32 s, DATA8 m, DATA32 c, DATA32 *d) {
/* mm3 <- mask factor (m + 1) */
98 s = m + 1;
99 MOV_A2R(s, mm3)
100 MOV_A2R(ALPHA_256, mm6)
101 pxor_r2r(mm0, mm0);
/* mm2 <- color; mm3 <- color scaled by the mask factor */
102 MOV_P2R(c, mm2, mm0)
103 MUL4_256_R2R(mm2, mm3)
104
/* mm6 <- ALPHA_256 register minus MOV_RA2R(mm3); mm6 is modified in place */
105 MOV_RA2R(mm3, mm1)
106 psubw_r2r(mm1, mm6);
107
/* mm1 <- destination scaled by mm6 */
108 MOV_P2R(*d, mm1, mm0)
109 MUL4_256_R2R(mm6, mm1)
110
/* result = scaled destination + scaled color, written back to *d */
111 paddw_r2r(mm3, mm1);
112 MOV_R2P(mm1, *d, mm0)
113}
114
115
/* Every color variant of the MMX point op shares the single _c_ function. */
116#define _op_blend_pt_mas_cn_dp_mmx _op_blend_pt_mas_c_dp_mmx
117#define _op_blend_pt_mas_can_dp_mmx _op_blend_pt_mas_c_dp_mmx
118#define _op_blend_pt_mas_caa_dp_mmx _op_blend_pt_mas_c_dp_mmx
119
/* The DP_AN destination variants reuse the DP implementations unchanged. */
120#define _op_blend_pt_mas_c_dpan_mmx _op_blend_pt_mas_c_dp_mmx
121#define _op_blend_pt_mas_cn_dpan_mmx _op_blend_pt_mas_cn_dp_mmx
122#define _op_blend_pt_mas_can_dpan_mmx _op_blend_pt_mas_can_dp_mmx
123#define _op_blend_pt_mas_caa_dpan_mmx _op_blend_pt_mas_caa_dp_mmx
124
/*
 * Fill the CPU_MMX slots of op_blend_pt_funcs for the mask x color case
 * (first two indices fixed to SP_N / SM_AS), one entry per color variant
 * and destination variant.
 */
125static void
126init_blend_mask_color_pt_funcs_mmx(void)
127{
128 op_blend_pt_funcs[SP_N][SM_AS][SC][DP][CPU_MMX] = _op_blend_pt_mas_c_dp_mmx;
129 op_blend_pt_funcs[SP_N][SM_AS][SC_N][DP][CPU_MMX] = _op_blend_pt_mas_cn_dp_mmx;
130 op_blend_pt_funcs[SP_N][SM_AS][SC_AN][DP][CPU_MMX] = _op_blend_pt_mas_can_dp_mmx;
131 op_blend_pt_funcs[SP_N][SM_AS][SC_AA][DP][CPU_MMX] = _op_blend_pt_mas_caa_dp_mmx;
132
133 op_blend_pt_funcs[SP_N][SM_AS][SC][DP_AN][CPU_MMX] = _op_blend_pt_mas_c_dpan_mmx;
134 op_blend_pt_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_MMX] = _op_blend_pt_mas_cn_dpan_mmx;
135 op_blend_pt_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_MMX] = _op_blend_pt_mas_can_dpan_mmx;
136 op_blend_pt_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_MMX] = _op_blend_pt_mas_caa_dpan_mmx;
137}
138#endif
139
140/*-----*/
141
142/* blend_rel mask x color -> dst */
143
144#ifdef BUILD_MMX
145static void
146_op_blend_rel_mas_c_dp_mmx(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
147 DATA32 *e = d + l;
148 pxor_r2r(mm0, mm0);
149 MOV_A2R(ALPHA_256, mm6)
150 MOV_A2R(ALPHA_255, mm5)
151 MOV_P2R(c, mm2, mm0)
152 while (d < e) {
153 l = *m;
154 switch(l)
155 {
156 case 0:
157 break;
158 default:
159 l++;
160 MOV_A2R(l, mm3)
161 MUL4_256_R2R(mm2, mm3)
162
163 MOV_RA2R(mm3, mm1)
164 movq_r2r(mm6, mm7);
165 psubw_r2r(mm1, mm7);
166
167 MOV_P2R(*d, mm1, mm0)
168 MOV_RA2R(mm1, mm4)
169 MUL4_256_R2R(mm7, mm1)
170
171 MUL4_SYM_R2R(mm4, mm3, mm5)
172
173 paddw_r2r(mm3, mm1);
174 MOV_R2P(mm1, *d, mm0)
175 break;
176 }
177 m++; d++;
178 }
179}
180
/* Every color variant of the MMX blend_rel span op shares the _c_ function. */
181#define _op_blend_rel_mas_cn_dp_mmx _op_blend_rel_mas_c_dp_mmx
182#define _op_blend_rel_mas_can_dp_mmx _op_blend_rel_mas_c_dp_mmx
183#define _op_blend_rel_mas_caa_dp_mmx _op_blend_rel_mas_c_dp_mmx
184
/* For the DP_AN destination variants, blend_rel resolves to the plain MMX
 * blend span ops. */
185#define _op_blend_rel_mas_c_dpan_mmx _op_blend_mas_c_dpan_mmx
186#define _op_blend_rel_mas_cn_dpan_mmx _op_blend_mas_cn_dpan_mmx
187#define _op_blend_rel_mas_can_dpan_mmx _op_blend_mas_can_dpan_mmx
188#define _op_blend_rel_mas_caa_dpan_mmx _op_blend_mas_caa_dpan_mmx
189
/*
 * Fill the CPU_MMX slots of op_blend_rel_span_funcs for the mask x color
 * case (first two indices fixed to SP_N / SM_AS), one entry per color
 * variant and destination variant.
 */
190static void
191init_blend_rel_mask_color_span_funcs_mmx(void)
192{
193 op_blend_rel_span_funcs[SP_N][SM_AS][SC][DP][CPU_MMX] = _op_blend_rel_mas_c_dp_mmx;
194 op_blend_rel_span_funcs[SP_N][SM_AS][SC_N][DP][CPU_MMX] = _op_blend_rel_mas_cn_dp_mmx;
195 op_blend_rel_span_funcs[SP_N][SM_AS][SC_AN][DP][CPU_MMX] = _op_blend_rel_mas_can_dp_mmx;
196 op_blend_rel_span_funcs[SP_N][SM_AS][SC_AA][DP][CPU_MMX] = _op_blend_rel_mas_caa_dp_mmx;
197
198 op_blend_rel_span_funcs[SP_N][SM_AS][SC][DP_AN][CPU_MMX] = _op_blend_rel_mas_c_dpan_mmx;
199 op_blend_rel_span_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_MMX] = _op_blend_rel_mas_cn_dpan_mmx;
200 op_blend_rel_span_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_MMX] = _op_blend_rel_mas_can_dpan_mmx;
201 op_blend_rel_span_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_MMX] = _op_blend_rel_mas_caa_dpan_mmx;
202}
203#endif
204
205#ifdef BUILD_MMX
/*
 * blend_rel mask x color -> dst, MMX single-pixel (point) version.
 *
 * s  - ignored on entry; reused as scratch for (m + 1)
 * m  - mask value for the pixel
 * c  - color to blend
 * d  - destination pixel, updated in place
 *
 * Same instruction sequence as the per-pixel body of the MMX blend_rel span
 * function, except that mm6 is modified in place instead of being copied to
 * mm7 first.
 * NOTE(review): macro semantics are defined elsewhere; descriptions below are
 * inferred from their names - confirm against the MMX macro header.
 */
206static void
207_op_blend_rel_pt_mas_c_dp_mmx(DATA32 s, DATA8 m, DATA32 c, DATA32 *d) {
208 pxor_r2r(mm0, mm0);
209 MOV_A2R(ALPHA_256, mm6)
210 MOV_A2R(ALPHA_255, mm5)
/* mm3 <- color scaled by the mask factor (m + 1) */
211 s = m + 1;
212 MOV_A2R(s, mm3)
213 MOV_P2R(c, mm2, mm0)
214 MUL4_256_R2R(mm2, mm3)
215
/* mm6 <- ALPHA_256 register minus MOV_RA2R(mm3) */
216 MOV_RA2R(mm3, mm1)
217 psubw_r2r(mm1, mm6);
218
/* mm4 <- MOV_RA2R(dst), taken before dst is scaled by mm6 */
219 MOV_P2R(*d, mm1, mm0)
220 MOV_RA2R(mm1, mm4)
221 MUL4_256_R2R(mm6, mm1)
222
/* weight the scaled color by mm4 (the blend_rel-specific step) */
223 MUL4_SYM_R2R(mm4, mm3, mm5)
224
/* result = scaled destination + weighted color, written back to *d */
225 paddw_r2r(mm3, mm1);
226 MOV_R2P(mm1, *d, mm0)
227}
228
/* Every color variant of the MMX blend_rel point op shares the _c_ function. */
229#define _op_blend_rel_pt_mas_cn_dp_mmx _op_blend_rel_pt_mas_c_dp_mmx
230#define _op_blend_rel_pt_mas_can_dp_mmx _op_blend_rel_pt_mas_c_dp_mmx
231#define _op_blend_rel_pt_mas_caa_dp_mmx _op_blend_rel_pt_mas_c_dp_mmx
232
/* For the DP_AN destination variants, blend_rel resolves to the plain MMX
 * blend point ops. */
233#define _op_blend_rel_pt_mas_c_dpan_mmx _op_blend_pt_mas_c_dpan_mmx
234#define _op_blend_rel_pt_mas_cn_dpan_mmx _op_blend_pt_mas_cn_dpan_mmx
235#define _op_blend_rel_pt_mas_can_dpan_mmx _op_blend_pt_mas_can_dpan_mmx
236#define _op_blend_rel_pt_mas_caa_dpan_mmx _op_blend_pt_mas_caa_dpan_mmx
237
/*
 * Fill the CPU_MMX slots of op_blend_rel_pt_funcs for the mask x color case
 * (first two indices fixed to SP_N / SM_AS), one entry per color variant
 * and destination variant.
 */
238static void
239init_blend_rel_mask_color_pt_funcs_mmx(void)
240{
241 op_blend_rel_pt_funcs[SP_N][SM_AS][SC][DP][CPU_MMX] = _op_blend_rel_pt_mas_c_dp_mmx;
242 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_N][DP][CPU_MMX] = _op_blend_rel_pt_mas_cn_dp_mmx;
243 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AN][DP][CPU_MMX] = _op_blend_rel_pt_mas_can_dp_mmx;
244 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AA][DP][CPU_MMX] = _op_blend_rel_pt_mas_caa_dp_mmx;
245
246 op_blend_rel_pt_funcs[SP_N][SM_AS][SC][DP_AN][CPU_MMX] = _op_blend_rel_pt_mas_c_dpan_mmx;
247 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_MMX] = _op_blend_rel_pt_mas_cn_dpan_mmx;
248 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_MMX] = _op_blend_rel_pt_mas_can_dpan_mmx;
249 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_MMX] = _op_blend_rel_pt_mas_caa_dpan_mmx;
250}
251#endif
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
new file mode 100644
index 0000000000..da7cd3e24d
--- /dev/null
+++ b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
@@ -0,0 +1,562 @@
/* Set to 1 to enable call counting and the sanity checks embedded in the
 * NEON inline assembly of this file. */
#define NEONDEBUG 0


/*
 * DEBUG_FNCOUNT(x): x must be a string literal (it is concatenated with
 * " optimised" in the debug build).
 *
 * With NEONDEBUG enabled, every 10000th call of the enclosing function
 * prints file, line, function name and the call counter.  Otherwise the
 * macro only evaluates and discards its argument; the argument is
 * parenthesized so the cast applies to the whole expression.
 */
#if NEONDEBUG
#define DEBUG_FNCOUNT(x) \
   do { \
      static int _foo = 0; \
      if (_foo++%10000 ==0) \
         printf("%s %+d %s: %d (%s)\n",__FILE__,__LINE__,__FUNCTION__,\
                _foo, x " optimised");\
   } while (0)
#else
#define DEBUG_FNCOUNT(x) ((void)(x))
#endif
15
16
17/* blend mask x color -> dst */
18
19#ifdef BUILD_NEON
20static void
21_op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
22 DATA32 *e;
23
24 DEBUG_FNCOUNT("");
25
26#define AP "blend_mas_c_dp_"
27 asm volatile (
28 ".fpu neon \n\t"
29 " vdup.i32 q15, %[c] \n\t"
30 " vmov.i8 q14, #1 \n\t"
31
32 // If aligned already - straight to quads
33 " andS %[tmp], %[d],$0xf \n\t"
34 " beq "AP"quadloops \n\t"
35
36 " andS %[tmp], %[d],$0x4 \n\t"
37 " beq "AP"dualloop \n\t"
38
39 AP"singleloop: \n\t"
40 " vld1.8 d0[0], [%[m]]! \n\t"
41 " vld1.32 d4[0], [%[d]] \n\t"
42 " vdup.u8 d0, d0[0] \n\t"
43 " vmull.u8 q4, d0, d30 \n\t"
44 " vqrshrn.u16 d12, q4, #8 \n\t"
45 " vmvn.u16 d14, d12 \n\t"
46 " vshr.u32 d16, d14, #24 \n\t"
47 " vmul.u32 d16, d16, d28 \n\t"
48 " vmull.u8 q7, d16, d4 \n\t"
49 " vqrshrn.u16 d0, q7, #8 \n\t"
50 " vqadd.u8 d0, d0, d12 \n\t"
51 " vst1.32 d0[0], [%[d]]! \n\t"
52
53 // Can we go the fast path?
54 " andS %[tmp], %[d],$0xf \n\t"
55 " beq "AP"quadloops \n\t"
56
57 AP"dualloop: \n\t"
58 " sub %[tmp], %[e], %[d] \n\t"
59 " cmp %[tmp], #16 \n\t"
60 " blt "AP"loopout \n\t"
61
62 " vld1.16 d0[0], [%[m]]! \n\t"
63 " vldm %[d], {d4} \n\t"
64 " vmovl.u8 q0, d0 \n\t"
65 " vmovl.u8 q0, d0 \n\t"
66 " vmul.u32 q0, q14 \n\t"
67 " vmull.u8 q4, d0, d30 \n\t"
68 " vqrshrn.u16 d12, q4, #8 \n\t"
69 " vmvn.u16 d14, d12 \n\t"
70 " vshr.u32 d16, d14, #24 \n\t"
71 " vmul.u32 d16, d16, d28 \n\t"
72 " vmull.u8 q7, d16, d4 \n\t"
73 " vqrshrn.u16 d0, q7, #8 \n\t"
74 " vqadd.u8 q0, q0, q6 \n\t"
75 " vstm %[d]!, {d0} \n\t"
76
77 AP"quadloops: \n\t"
78 " sub %[tmp], %[e], %[d] \n\t"
79 " cmp %[tmp], #16 \n\t"
80 " blt "AP"loopout \n\t"
81
82
83 " sub %[tmp], %[e], #15 \n\t"
84
85 " sub %[d], #16 \n\t"
86 AP"fastloop:"
87 " add %[d], #16 \n\t"
88 " cmp %[tmp], %[d] \n\t"
89 " ble "AP"loopout \n\t"
90 AP"quadloopint: \n\t"
91 " ldr %[x], [%[m]] \n\t"
92 " add %[m], #4 \n\t"
93 " cmp %[x], #0 \n\t"
94 " beq "AP"fastloop \n\t"
95 " vmov.32 d0[0], %[x] \n\t"
96 " vldm %[d], {d4,d5} \n\t"
97
98 // Expand M: Fixme: Can we do this quicker?
99 " vmovl.u8 q0, d0 \n\t"
100 " vmovl.u8 q0, d0 \n\t"
101 " vmul.u32 q0, q14 \n\t"
102
103 // Multiply a * c
104 " vmull.u8 q4, d0, d30 \n\t"
105 " vmull.u8 q5, d1, d31 \n\t"
106
107 // Shorten
108 " vqrshrn.u16 d12, q4, #8 \n\t"
109 " vqrshrn.u16 d13, q5, #8 \n\t"
110
111 // extract negated alpha
112 " vmvn.u16 q7, q6 \n\t"
113 " vshr.u32 q8, q7, #24 \n\t"
114 " vmul.u32 q8, q8, q14 \n\t"
115
116 // Multiply
117 " vmull.u8 q7, d16, d4 \n\t"
118 " vmull.u8 q8, d17, d5 \n\t"
119
120 " vqrshrn.u16 d0, q7, #8 \n\t"
121 " vqrshrn.u16 d1, q8, #8 \n\t"
122
123 // Add
124 " vqadd.u8 q0, q0, q6 \n\t"
125
126 " vstm %[d]!, {d0,d1} \n\t"
127
128 " cmp %[tmp], %[d] \n\t"
129 " bhi "AP"quadloopint \n\t"
130
131 AP"loopout: \n\t"
132#if NEONDEBUG
133 "cmp %[d], %[e] \n\t"
134 "ble "AP"foo \n\t"
135 "cmp %[tmp], %[m] \n\t"
136 "sub %[x], %[x] \n\t"
137 "vst1.32 d0[0], [%[x]] \n\t"
138 AP"foo: \n\t"
139#endif
140
141 " cmp %[d], %[e] \n\t"
142 " beq "AP"done \n\t"
143 " sub %[tmp],%[e], %[d] \n\t"
144 " cmp %[tmp],#4 \n\t"
145 " beq "AP"singleout \n\t"
146
147 AP "dualloop2: \n\t"
148 "sub %[tmp],%[e],$0x8 \n\t"
149 " vld1.16 d0[0], [%[m]]! \n\t"
150 " vldm %[d], {d4} \n\t"
151 " vmovl.u8 q0, d0 \n\t"
152 " vmovl.u8 q0, d0 \n\t"
153 " vmul.u32 q0, q14 \n\t"
154 " vmull.u8 q4, d0, d30 \n\t"
155 " vqrshrn.u16 d12, q4, #8 \n\t"
156 " vmvn.u16 d14, d12 \n\t"
157 " vshr.u32 d16, d14, #24 \n\t"
158 " vmul.u32 d16, d16, d28 \n\t"
159 " vmull.u8 q7, d16, d4 \n\t"
160 " vqrshrn.u16 d0, q7, #8 \n\t"
161 " vqadd.u8 q0, q0, q6 \n\t"
162 " vstm %[d]!, {d0} \n\t"
163
164 " cmp %[e], %[d] \n\t"
165 " beq "AP"done \n\t"
166
167 AP"singleout: \n\t"
168 " vld1.8 d0[0], [%[m]]! \n\t"
169 " vld1.32 d4[0], [%[d]] \n\t"
170 " vdup.u8 d0, d0[0] \n\t"
171 " vmull.u8 q4, d0, d30 \n\t"
172 " vqrshrn.u16 d12, q4, #8 \n\t"
173 " vmvn.u16 d14, d12 \n\t"
174 " vshr.u32 d16, d14, #24 \n\t"
175 " vmul.u32 d16, d16, d28 \n\t"
176 " vmull.u8 q7, d16, d4 \n\t"
177 " vqrshrn.u16 d0, q7, #8 \n\t"
178 " vqadd.u8 q0, q0, q6 \n\t"
179 " vst1.32 d0[0], [%[d]]! \n\t"
180
181 AP"done: \n\t"
182#if NEONDEBUG
183 "cmp %[d], %[e] \n\t"
184 "beq "AP"reallydone \n\t"
185 "sub %[tmp], %[tmp] \n\t"
186 "vst1.32 d0[0], [%[tmp]] \n\t"
187 AP"reallydone:"
188#endif
189 : // Out
190 : [e] "r" (d + l), [d] "r" (d), [c] "r" (c),
191 [tmp] "r" (7), [m] "r" (m), [x] "r" (0)
192 : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q8","q14","q15",
193 "memory" // clobbered
194 );
195#undef AP
196}
197#endif
198
199#ifdef BUILD_NEON
200static void
201_op_blend_mas_can_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
202 DATA32 *e,*tmp;
203 int alpha;
204
205 DEBUG_FNCOUNT("");
206
207#define AP "_blend_mas_can_dp_neon_"
208 asm volatile (
209 ".fpu neon \n\t"
210 "vdup.u32 q9, %[c] \n\t"
211 "vmov.i8 q15, #1 \n\t"
212 "vmov.i8 q14, #0 \n\t"
213
214 // Make C 16 bit (C in q3/q2)
215 "vmovl.u8 q3, d19 \n\t"
216 "vmovl.u8 q2, d18 \n\t"
217
218 // Which loop to start
219 " andS %[tmp], %[d],$0xf \n\t"
220 " beq "AP"quadloop \n\t"
221
222 " andS %[tmp], %[d], #4 \n\t"
223 " beq "AP"dualstart \n\t"
224
225
226 AP"singleloop: \n\t"
227 " vld1.8 d0[0], [%[m]]! \n\t"
228 " vld1.32 d8[0], [%[d]] \n\t"
229 " vdup.u8 d0, d0[0] \n\t"
230 " vshr.u8 d0, d0, #1 \n\t"
231 " vmovl.u8 q0, d0 \n\t"
232 " vmovl.u8 q4, d8 \n\t"
233 " vsub.s16 q6, q2, q4 \n\t"
234 " vmul.s16 q6, q0 \n\t"
235 " vshr.s16 q6, #7 \n\t"
236 " vadd.s16 q6, q4 \n\t"
237 " vqmovun.s16 d2, q6 \n\t"
238 " vst1.32 d2[0], [%[d]]! \n\t"
239
240 " andS %[tmp], %[d], $0xf \n\t"
241 " beq "AP"quadloop \n\t"
242
243 AP"dualstart: \n\t"
244 " sub %[tmp], %[e], %[d] \n\t"
245 " cmp %[tmp], #16 \n\t"
246 " blt "AP"loopout \n\t"
247
248 AP"dualloop: \n\t"
249 " vld1.16 d0[0], [%[m]]! \n\t"
250 " vldm %[d], {d8} \n\t"
251 " vmovl.u8 q0, d0 \n\t"
252 " vmovl.u8 q0, d0 \n\t"
253 " vmul.u32 d0, d0, d30 \n\t"
254 " vshr.u8 d0, d0, #1 \n\t"
255 " vmovl.u8 q0, d0 \n\t"
256 " vmovl.u8 q4, d8 \n\t"
257 " vsub.s16 q6, q2, q4 \n\t"
258 " vmul.s16 q6, q0 \n\t"
259 " vshr.s16 q6, #7 \n\t"
260 " vadd.s16 q6, q4 \n\t"
261 " vqmovun.s16 d2, q6 \n\t"
262 " vstm %[d]!, {d2} \n\t"
263
264 AP"quadloop: \n\t"
265 " sub %[tmp], %[e], %[d] \n\t"
266 " cmp %[tmp], #16 \n\t"
267 " blt "AP"loopout \n\t"
268 " sub %[tmp], %[e], #15 \n\t"
269
270 " sub %[d], #16 \n\t"
271 AP"fastloop: \n\t"
272 " add %[d], #16 \n\t"
273 " cmp %[tmp], %[d] \n\t"
274 " blt "AP"loopout \n\t"
275
276 AP"quadloopint: \n\t"
277 // Load the mask: 4 bytes: It has d0/d1
278 " ldr %[x], [%[m]] \n\t"
279 " add %[m], #4 \n\t"
280
281 // Check for shortcuts
282 " cmp %[x], #0 \n\t"
283 " beq "AP"fastloop \n\t"
284
285 " cmp %[x], $0xffffffff \n\t"
286 " beq "AP"quadstore \n\t"
287
288 " vmov.32 d0[0], %[x] \n\t"
289 // Load d into d8/d9 q4
290 " vldm %[d], {d8,d9} \n\t"
291
292 // Get the alpha channel ready (m)
293 " vmovl.u8 q0, d0 \n\t"
294 " vmovl.u8 q0, d0 \n\t"
295 " vmul.u32 q0, q0,q15 \n\t"
296 // Lop a bit off to prevent overflow
297 " vshr.u8 q0, q0, #1 \n\t"
298
299 // Now make it 16 bit
300 " vmovl.u8 q1, d1 \n\t"
301 " vmovl.u8 q0, d0 \n\t"
302
303 // 16 bit 'd'
304 " vmovl.u8 q5, d9 \n\t"
305 " vmovl.u8 q4, d8 \n\t"
306
307 // Diff 'd' & 'c'
308 " vsub.s16 q7, q3, q5 \n\t"
309 " vsub.s16 q6, q2, q4 \n\t"
310
311 " vmul.s16 q7, q1 \n\t"
312 " vmul.s16 q6, q0 \n\t"
313
314 // Shift results a bit
315 " vshr.s16 q7, #7 \n\t"
316 " vshr.s16 q6, #7 \n\t"
317
318 // Add 'd'
319 " vadd.s16 q7, q5 \n\t"
320 " vadd.s16 q6, q4 \n\t"
321
322 // Make sure none are negative
323 " vqmovun.s16 d9, q7 \n\t"
324 " vqmovun.s16 d8, q6 \n\t"
325
326 " vstm %[d]!, {d8,d9} \n\t"
327
328 " cmp %[tmp], %[d] \n\t"
329 " bhi "AP"quadloopint \n\t"
330 " b "AP"loopout \n\t"
331
332 AP"quadstore: \n\t"
333 " vstm %[d]!, {d18,d19} \n\t"
334 " cmp %[tmp], %[d] \n\t"
335 " bhi "AP"quadloopint \n\t"
336
337 AP"loopout: \n\t"
338#if NEONDEBUG
339 "cmp %[d], %[e] \n\t"
340 "ble "AP"foo \n\t"
341 "sub %[tmp], %[tmp] \n\t"
342 "vst1.32 d0[0], [%[tmp]] \n\t"
343 AP"foo: \n\t"
344#endif
345
346 " cmp %[e], %[d] \n\t"
347 " beq "AP"done \n\t"
348
349 " sub %[tmp],%[e], %[d] \n\t"
350 " cmp %[tmp],#8 \n\t"
351
352 " blt "AP"onebyte \n\t"
353
354 // Load the mask: 2 bytes: It has d0
355 " vld1.16 d0[0], [%[m]]! \n\t"
356
357 // Load d into d8/d9 q4
358 " vldm %[d], {d8} \n\t"
359
360 // Get the alpha channel ready (m)
361 " vmovl.u8 q0, d0 \n\t"
362 " vmovl.u8 q0, d0 \n\t"
363 " vmul.u32 d0, d0, d30 \n\t"
364 // Lop a bit off to prevent overflow
365 " vshr.u8 d0, d0, #1 \n\t"
366
367 // Now make it 16 bit
368 " vmovl.u8 q0, d0 \n\t"
369
370 // 16 bit 'd'
371 " vmovl.u8 q4, d8 \n\t"
372
373 // Diff 'd' & 'c'
374 " vsub.s16 q6, q2, q4 \n\t"
375
376 " vmul.s16 q6, q0 \n\t"
377
378 // Shift results a bit
379 " vshr.s16 q6, #7 \n\t"
380
381 // Add 'd'
382 "vadd.s16 q6, q4 \n\t"
383
384 // Make sure none are negative
385 "vqmovun.s16 d2, q6 \n\t"
386
387 "vstm %[d]!, {d2} \n\t"
388
389 "cmp %[e], %[d] \n\t"
390 "beq "AP"done \n\t"
391
392 AP"onebyte: \n\t"
393 "vld1.8 d0[0], [%[m]]! \n\t"
394 "vld1.32 d8[0], [%[d]] \n\t"
395 "vdup.u8 d0, d0[0] \n\t"
396 "vshr.u8 d0, d0, #1 \n\t"
397 "vmovl.u8 q0, d0 \n\t"
398 "vmovl.u8 q4, d8 \n\t"
399 "vsub.s16 q6, q2, q4 \n\t"
400 "vmul.s16 q6, q0 \n\t"
401 "vshr.s16 q6, #7 \n\t"
402 "vadd.s16 q6, q4 \n\t"
403 "vqmovun.s16 d2, q6 \n\t"
404 "vst1.32 d2[0], [%[d]]! \n\t"
405
406
407 AP"done: \n\t"
408#if NEONDEBUG
409 "cmp %[d], %[e] \n\t"
410 "beq "AP"reallydone \n\t"
411 "sub %[m], %[m] \n\t"
412 "vst1.32 d0[0], [%[m]] \n\t"
413 AP"reallydone:"
414#endif
415
416
417 : // output regs
418 // Input
419 : [e] "r" (e = d + l), [d] "r" (d), [c] "r" (c),
420 [m] "r" (m), [tmp] "r" (7), [x] "r" (33)
421 : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q9","q14","q15",
422 "memory" // clobbered
423
424 );
425#undef AP
426}
427#endif
428
429#ifdef BUILD_NEON
/* Only two NEON span implementations exist: cn shares the can function and
 * caa shares the generic c function. */
430#define _op_blend_mas_cn_dp_neon _op_blend_mas_can_dp_neon
431#define _op_blend_mas_caa_dp_neon _op_blend_mas_c_dp_neon
432
/* The DP_AN destination variants reuse the DP implementations unchanged. */
433#define _op_blend_mas_c_dpan_neon _op_blend_mas_c_dp_neon
434#define _op_blend_mas_cn_dpan_neon _op_blend_mas_cn_dp_neon
435#define _op_blend_mas_can_dpan_neon _op_blend_mas_can_dp_neon
436#define _op_blend_mas_caa_dpan_neon _op_blend_mas_caa_dp_neon
437
/*
 * Fill the CPU_NEON slots of op_blend_span_funcs for the mask x color case
 * (first two indices fixed to SP_N / SM_AS), one entry per color variant
 * and destination variant.
 */
438static void
439init_blend_mask_color_span_funcs_neon(void)
440{
441 op_blend_span_funcs[SP_N][SM_AS][SC][DP][CPU_NEON] = _op_blend_mas_c_dp_neon;
442 op_blend_span_funcs[SP_N][SM_AS][SC_N][DP][CPU_NEON] = _op_blend_mas_cn_dp_neon;
443 op_blend_span_funcs[SP_N][SM_AS][SC_AN][DP][CPU_NEON] = _op_blend_mas_can_dp_neon;
444 op_blend_span_funcs[SP_N][SM_AS][SC_AA][DP][CPU_NEON] = _op_blend_mas_caa_dp_neon;
445
446 op_blend_span_funcs[SP_N][SM_AS][SC][DP_AN][CPU_NEON] = _op_blend_mas_c_dpan_neon;
447 op_blend_span_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_NEON] = _op_blend_mas_cn_dpan_neon;
448 op_blend_span_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_NEON] = _op_blend_mas_can_dpan_neon;
449 op_blend_span_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_NEON] = _op_blend_mas_caa_dpan_neon;
450}
451#endif
452
453#ifdef BUILD_NEON
454static void
455_op_blend_pt_mas_c_dp_neon(DATA32 s, DATA8 m, DATA32 c, DATA32 *d) {
456 s = MUL_SYM(m, c);
457 c = 256 - (s >> 24);
458 *d = MUL_SYM(*d >> 24, s) + MUL_256(c, *d);
459}
460
461
/* Every color variant of the NEON point op shares the single _c_ function. */
462#define _op_blend_pt_mas_cn_dp_neon _op_blend_pt_mas_c_dp_neon
463#define _op_blend_pt_mas_can_dp_neon _op_blend_pt_mas_c_dp_neon
464#define _op_blend_pt_mas_caa_dp_neon _op_blend_pt_mas_c_dp_neon
465
/* The DP_AN destination variants reuse the DP implementations unchanged. */
466#define _op_blend_pt_mas_c_dpan_neon _op_blend_pt_mas_c_dp_neon
467#define _op_blend_pt_mas_cn_dpan_neon _op_blend_pt_mas_cn_dp_neon
468#define _op_blend_pt_mas_can_dpan_neon _op_blend_pt_mas_can_dp_neon
469#define _op_blend_pt_mas_caa_dpan_neon _op_blend_pt_mas_caa_dp_neon
470
/*
 * Fill the CPU_NEON slots of op_blend_pt_funcs for the mask x color case
 * (first two indices fixed to SP_N / SM_AS), one entry per color variant
 * and destination variant.
 */
471static void
472init_blend_mask_color_pt_funcs_neon(void)
473{
474 op_blend_pt_funcs[SP_N][SM_AS][SC][DP][CPU_NEON] = _op_blend_pt_mas_c_dp_neon;
475 op_blend_pt_funcs[SP_N][SM_AS][SC_N][DP][CPU_NEON] = _op_blend_pt_mas_cn_dp_neon;
476 op_blend_pt_funcs[SP_N][SM_AS][SC_AN][DP][CPU_NEON] = _op_blend_pt_mas_can_dp_neon;
477 op_blend_pt_funcs[SP_N][SM_AS][SC_AA][DP][CPU_NEON] = _op_blend_pt_mas_caa_dp_neon;
478
479 op_blend_pt_funcs[SP_N][SM_AS][SC][DP_AN][CPU_NEON] = _op_blend_pt_mas_c_dpan_neon;
480 op_blend_pt_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_NEON] = _op_blend_pt_mas_cn_dpan_neon;
481 op_blend_pt_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_NEON] = _op_blend_pt_mas_can_dpan_neon;
482 op_blend_pt_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_NEON] = _op_blend_pt_mas_caa_dpan_neon;
483}
484#endif
485
486/*-----*/
487
488/* blend_rel mask x color -> dst */
489
490#ifdef BUILD_NEON
491static void
492_op_blend_rel_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
493 DATA32 *e;
494 int alpha;
495
496 DEBUG_FNCOUNT("not");
497
498 UNROLL8_PLD_WHILE(d, l, e,
499 {
500 DATA32 mc = MUL_SYM(*m, c);
501 alpha = 256 - (mc >> 24);
502 *d = MUL_SYM(*d >> 24, mc) + MUL_256(alpha, *d);
503 d++;
504 m++;
505 });
506}
507
/* Every color variant of the NEON blend_rel span op shares the _c_ function. */
508#define _op_blend_rel_mas_cn_dp_neon _op_blend_rel_mas_c_dp_neon
509#define _op_blend_rel_mas_can_dp_neon _op_blend_rel_mas_c_dp_neon
510#define _op_blend_rel_mas_caa_dp_neon _op_blend_rel_mas_c_dp_neon
511
/* For the DP_AN destination variants, blend_rel resolves to the plain NEON
 * blend span ops. */
512#define _op_blend_rel_mas_c_dpan_neon _op_blend_mas_c_dpan_neon
513#define _op_blend_rel_mas_cn_dpan_neon _op_blend_mas_cn_dpan_neon
514#define _op_blend_rel_mas_can_dpan_neon _op_blend_mas_can_dpan_neon
515#define _op_blend_rel_mas_caa_dpan_neon _op_blend_mas_caa_dpan_neon
516
/*
 * Fill the CPU_NEON slots of op_blend_rel_span_funcs for the mask x color
 * case (first two indices fixed to SP_N / SM_AS), one entry per color
 * variant and destination variant.
 */
517static void
518init_blend_rel_mask_color_span_funcs_neon(void)
519{
520 op_blend_rel_span_funcs[SP_N][SM_AS][SC][DP][CPU_NEON] = _op_blend_rel_mas_c_dp_neon;
521 op_blend_rel_span_funcs[SP_N][SM_AS][SC_N][DP][CPU_NEON] = _op_blend_rel_mas_cn_dp_neon;
522 op_blend_rel_span_funcs[SP_N][SM_AS][SC_AN][DP][CPU_NEON] = _op_blend_rel_mas_can_dp_neon;
523 op_blend_rel_span_funcs[SP_N][SM_AS][SC_AA][DP][CPU_NEON] = _op_blend_rel_mas_caa_dp_neon;
524
525 op_blend_rel_span_funcs[SP_N][SM_AS][SC][DP_AN][CPU_NEON] = _op_blend_rel_mas_c_dpan_neon;
526 op_blend_rel_span_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_mas_cn_dpan_neon;
527 op_blend_rel_span_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_NEON] = _op_blend_rel_mas_can_dpan_neon;
528 op_blend_rel_span_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_NEON] = _op_blend_rel_mas_caa_dpan_neon;
529}
530#endif
531
532#ifdef BUILD_NEON
533static void
534_op_blend_rel_pt_mas_c_dp_neon(DATA32 s, DATA8 m, DATA32 c, DATA32 *d) {
535 s = MUL_SYM(m, c);
536 c = 256 - (s >> 24);
537 *d = MUL_SYM(*d >> 24, s) + MUL_256(c, *d);
538}
539
/* Every color variant of the NEON blend_rel point op shares the _c_ function. */
540#define _op_blend_rel_pt_mas_cn_dp_neon _op_blend_rel_pt_mas_c_dp_neon
541#define _op_blend_rel_pt_mas_can_dp_neon _op_blend_rel_pt_mas_c_dp_neon
542#define _op_blend_rel_pt_mas_caa_dp_neon _op_blend_rel_pt_mas_c_dp_neon
543
/* For the DP_AN destination variants, blend_rel resolves to the plain NEON
 * blend point ops. */
544#define _op_blend_rel_pt_mas_c_dpan_neon _op_blend_pt_mas_c_dpan_neon
545#define _op_blend_rel_pt_mas_cn_dpan_neon _op_blend_pt_mas_cn_dpan_neon
546#define _op_blend_rel_pt_mas_can_dpan_neon _op_blend_pt_mas_can_dpan_neon
547#define _op_blend_rel_pt_mas_caa_dpan_neon _op_blend_pt_mas_caa_dpan_neon
548
/*
 * Fill the CPU_NEON slots of op_blend_rel_pt_funcs for the mask x color case
 * (first two indices fixed to SP_N / SM_AS), one entry per color variant
 * and destination variant.
 */
549static void
550init_blend_rel_mask_color_pt_funcs_neon(void)
551{
552 op_blend_rel_pt_funcs[SP_N][SM_AS][SC][DP][CPU_NEON] = _op_blend_rel_pt_mas_c_dp_neon;
553 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_N][DP][CPU_NEON] = _op_blend_rel_pt_mas_cn_dp_neon;
554 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AN][DP][CPU_NEON] = _op_blend_rel_pt_mas_can_dp_neon;
555 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AA][DP][CPU_NEON] = _op_blend_rel_pt_mas_caa_dp_neon;
556
557 op_blend_rel_pt_funcs[SP_N][SM_AS][SC][DP_AN][CPU_NEON] = _op_blend_rel_pt_mas_c_dpan_neon;
558 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_pt_mas_cn_dpan_neon;
559 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_NEON] = _op_blend_rel_pt_mas_can_dpan_neon;
560 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_NEON] = _op_blend_rel_pt_mas_caa_dpan_neon;
561}
562#endif
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_sse3.c b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_sse3.c
new file mode 100644
index 0000000000..429e8d5ce0
--- /dev/null
+++ b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_sse3.c
@@ -0,0 +1,321 @@
1/* blend mask x color -> dst */
2
3#ifdef BUILD_SSE3
4
5static void
6_op_blend_mas_c_dp_sse3(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
7
8 const __m128i c_packed = _mm_set_epi32(c, c, c, c);
9
10 LOOP_ALIGNED_U1_A48_SSE3(d, l,
11 { /* UOP */
12
13 DATA32 a = *m;
14 DATA32 mc = MUL_SYM(a, c);
15 a = 256 - (mc >> 24);
16 *d = mc + MUL_256(a, *d);
17 m++; d++; l--;
18 },
19 { /* A4OP */
20
21 if ((m[3] | m[2] | m[1] | m[0]) == 0) {
22 m += 4; d += 4; l -= 4;
23 continue;
24 }
25
26 __m128i m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]);
27 __m128i d0 = _mm_load_si128((__m128i *)d);
28
29 __m128i mc0 = mul_sym_sse3(m0, c_packed);
30 __m128i a0 = sub4_alpha_sse3(mc0);
31 __m128i mul0 = mul_256_sse3(a0, d0);
32
33 mul0 = _mm_add_epi32(mul0, mc0);
34
35 _mm_store_si128((__m128i *)d, mul0);
36
37 m += 4; d += 4; l -= 4;
38 },
39 { /* A8OP */
40
41 if((m[7] | m[6] | m[5] | m[4] | m[3] | m[2] | m[1] | m[0]) == 0) {
42 m += 8; d += 8; l -= 8;
43 continue;
44 }
45
46 __m128i m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]);
47 __m128i d0 = _mm_load_si128((__m128i *)d);
48
49 __m128i m1 = _mm_set_epi32(m[7], m[6], m[5], m[4]);
50 __m128i d1 = _mm_load_si128((__m128i *)(d+4));
51
52 __m128i mc0 = mul_sym_sse3(m0, c_packed);
53 __m128i a0 = sub4_alpha_sse3(mc0);
54 __m128i mul0 = mul_256_sse3(a0, d0);
55
56 mul0 = _mm_add_epi32(mc0, mul0);
57
58 __m128i mc1 = mul_sym_sse3(m1, c_packed);
59 __m128i a1 = sub4_alpha_sse3(mc1);
60 __m128i mul1 = mul_256_sse3(a1, d1);
61
62 mul1 = _mm_add_epi32(mc1, mul1);
63
64 _mm_store_si128((__m128i *)d, mul0);
65 _mm_store_si128((__m128i *)(d+4), mul1);
66
67 m += 8; d += 8; l -= 8;
68 })
69}
70
71static void
72_op_blend_mas_can_dp_sse3(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
73
74 DATA32 alpha;
75
76 const __m128i one = _mm_set_epi32(1, 1, 1, 1);
77 const __m128i c_packed = _mm_set_epi32(c, c, c, c);
78
79 LOOP_ALIGNED_U1_A48_SSE3(d, l,
80 { /* UOP */
81
82 alpha = *m;
83 switch(alpha)
84 {
85 case 0:
86 break;
87 case 255:
88 *d = c;
89 break;
90 default:
91 alpha++;
92 *d = INTERP_256(alpha, c, *d);
93 break;
94 }
95 m++; d++; l--;
96 },
97 { /* A4OP */
98
99 if ((m[3] | m[2] | m[1] | m[0]) == 0) {
100 m += 4; d += 4; l -= 4;
101 continue;
102 }
103
104 __m128i m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]);
105 __m128i d0 = _mm_load_si128((__m128i *)d);
106
107 __m128i zm0 = _mm_cmpeq_epi32(m0, _mm_setzero_si128());
108
109 m0 = _mm_add_epi32(one, m0);
110
111 __m128i r0 = interp4_256_sse3(m0, c_packed, d0);
112
113 r0 = _mm_and_si128(~zm0, r0);
114 d0 = _mm_and_si128(zm0, d0);
115
116 d0 = _mm_add_epi32(r0, d0);
117
118 _mm_store_si128((__m128i *)d, d0);
119
120 m += 4; d += 4; l -= 4;
121 },
122 { /* A8OP */
123
124 if ((m[7] | m[6] | m[5] | m[4] | m[3] | m[2] | m[1] | m[0]) == 0) {
125 m += 8; d += 8; l -= 8;
126 continue;
127 }
128
129 __m128i m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]);
130 __m128i d0 = _mm_load_si128((__m128i *)d);
131
132 __m128i m1 = _mm_set_epi32(m[7], m[6], m[5], m[4]);
133 __m128i d1 = _mm_load_si128((__m128i *)(d+4));
134
135 __m128i zm0 = _mm_cmpeq_epi32(m0, _mm_setzero_si128());
136 __m128i zm1 = _mm_cmpeq_epi32(m1, _mm_setzero_si128());
137
138 m0 = _mm_add_epi32(one, m0);
139 m1 = _mm_add_epi32(one, m1);
140
141 __m128i r0 = interp4_256_sse3(m0, c_packed, d0);
142 __m128i r1 = interp4_256_sse3(m1, c_packed, d1);
143
144 r0 = _mm_and_si128(~zm0, r0);
145 d0 = _mm_and_si128(zm0, d0);
146
147 r1 = _mm_and_si128(~zm1, r1);
148 d1 = _mm_and_si128(zm1, d1);
149
150 d0 = _mm_add_epi32(d0, r0);
151 d1 = _mm_add_epi32(d1, r1);
152
153 _mm_store_si128((__m128i *)d, d0);
154 _mm_store_si128((__m128i *)(d+4), d1);
155
156 m += 8; d += 8; l -= 8;
157 })
158}
159
/* Only two SSE3 span implementations exist: cn shares the can function and
 * caa shares the generic c function. */
160#define _op_blend_mas_cn_dp_sse3 _op_blend_mas_can_dp_sse3
161#define _op_blend_mas_caa_dp_sse3 _op_blend_mas_c_dp_sse3
162
/* The DP_AN destination variants reuse the DP implementations unchanged. */
163#define _op_blend_mas_c_dpan_sse3 _op_blend_mas_c_dp_sse3
164#define _op_blend_mas_cn_dpan_sse3 _op_blend_mas_cn_dp_sse3
165#define _op_blend_mas_can_dpan_sse3 _op_blend_mas_can_dp_sse3
166#define _op_blend_mas_caa_dpan_sse3 _op_blend_mas_caa_dp_sse3
167
/*
 * Fill the CPU_SSE3 slots of op_blend_span_funcs for the mask x color case
 * (first two indices fixed to SP_N / SM_AS).
 *
 * The SC entries for both DP and DP_AN are deliberately left unregistered
 * (see the FIXME lines below).
 * NOTE(review): the SC_AA entries still resolve, through the caa alias, to
 * the same _op_blend_mas_c_dp_sse3 that the FIXMEs flag as buggy - confirm
 * whether that is intended.
 */
168static void
169init_blend_mask_color_span_funcs_sse3(void)
170{
171// FIXME: BUGGY BUGGY Core i5 750 (32bit), 4.5.2 (Ubuntu/Linaro 4.5.2-8ubuntu4), ello (text and rectangle)
172// op_blend_span_funcs[SP_N][SM_AS][SC][DP][CPU_SSE3] = _op_blend_mas_c_dp_sse3;
173 op_blend_span_funcs[SP_N][SM_AS][SC_N][DP][CPU_SSE3] = _op_blend_mas_cn_dp_sse3;
174 op_blend_span_funcs[SP_N][SM_AS][SC_AN][DP][CPU_SSE3] = _op_blend_mas_can_dp_sse3;
175 op_blend_span_funcs[SP_N][SM_AS][SC_AA][DP][CPU_SSE3] = _op_blend_mas_caa_dp_sse3;
176
177// FIXME: BUGGY BUGGY Core i5 2500 (64bit), gcc version 4.5.2 (Ubuntu/Linaro 4.5.2-8ubuntu4), ello (text)
178// op_blend_span_funcs[SP_N][SM_AS][SC][DP_AN][CPU_SSE3] = _op_blend_mas_c_dpan_sse3;
179 op_blend_span_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_SSE3] = _op_blend_mas_cn_dpan_sse3;
180 op_blend_span_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_SSE3] = _op_blend_mas_can_dpan_sse3;
181 op_blend_span_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_SSE3] = _op_blend_mas_caa_dpan_sse3;
182}
183
/* There is no SSE3 point implementation: both base names expand to NULL,
 * and every other variant below aliases one of them. */
184#define _op_blend_pt_mas_c_dp_sse3 NULL
185#define _op_blend_pt_mas_can_dp_sse3 NULL
186
187#define _op_blend_pt_mas_cn_dp_sse3 _op_blend_pt_mas_can_dp_sse3
188#define _op_blend_pt_mas_caa_dp_sse3 _op_blend_pt_mas_c_dp_sse3
189
190#define _op_blend_pt_mas_c_dpan_sse3 _op_blend_pt_mas_c_dp_sse3
191#define _op_blend_pt_mas_cn_dpan_sse3 _op_blend_pt_mas_cn_dp_sse3
192#define _op_blend_pt_mas_can_dpan_sse3 _op_blend_pt_mas_can_dp_sse3
193#define _op_blend_pt_mas_caa_dpan_sse3 _op_blend_pt_mas_caa_dp_sse3
194
/*
 * Fill the CPU_SSE3 slots of op_blend_pt_funcs for the mask x color case
 * (first two indices fixed to SP_N / SM_AS).  Given the aliases above,
 * every slot written here ends up NULL.
 * NOTE(review): presumably the lookup code treats NULL as "no SSE3 version,
 * use another backend" - confirm against the table's consumer.
 */
195static void
196init_blend_mask_color_pt_funcs_sse3(void)
197{
198 op_blend_pt_funcs[SP_N][SM_AS][SC][DP][CPU_SSE3] = _op_blend_pt_mas_c_dp_sse3;
199 op_blend_pt_funcs[SP_N][SM_AS][SC_N][DP][CPU_SSE3] = _op_blend_pt_mas_cn_dp_sse3;
200 op_blend_pt_funcs[SP_N][SM_AS][SC_AN][DP][CPU_SSE3] = _op_blend_pt_mas_can_dp_sse3;
201 op_blend_pt_funcs[SP_N][SM_AS][SC_AA][DP][CPU_SSE3] = _op_blend_pt_mas_caa_dp_sse3;
202
203 op_blend_pt_funcs[SP_N][SM_AS][SC][DP_AN][CPU_SSE3] = _op_blend_pt_mas_c_dpan_sse3;
204 op_blend_pt_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_SSE3] = _op_blend_pt_mas_cn_dpan_sse3;
205 op_blend_pt_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_SSE3] = _op_blend_pt_mas_can_dpan_sse3;
206 op_blend_pt_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_SSE3] = _op_blend_pt_mas_caa_dpan_sse3;
207}
208
209/*-----*/
210
211/* blend_rel mask x color --> dst */
212
213static void
214_op_blend_rel_mas_c_dp_sse3(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
215
216 const __m128i c_packed = _mm_set_epi32(c, c, c, c);
217
218 LOOP_ALIGNED_U1_A48_SSE3(d, l,
219 { /* UOP */
220
221 DATA32 mc = MUL_SYM(*m, c);
222 int alpha = 256 - (mc >> 24);
223 *d = MUL_SYM(*d >> 24, mc) + MUL_256(alpha, *d);
224 d++; m++; l--;
225 },
226 { /* A4OP */
227
228 __m128i m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]);
229 __m128i d0 = _mm_load_si128((__m128i *) d);
230
231 __m128i mc0 = mul_sym_sse3(m0, c_packed);
232 __m128i a0 = sub4_alpha_sse3(mc0);
233
234 __m128i d0_sym = mul_sym_sse3(_mm_srli_epi32(d0, 24), mc0);
235 d0 = mul_256_sse3(a0, d0);
236
237 d0 = _mm_add_epi32(d0, d0_sym);
238
239 _mm_store_si128((__m128i *)d, d0);
240
241 d += 4; m += 4; l -= 4;
242 },
243 { /* A8OP */
244
245 __m128i m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]);
246 __m128i d0 = _mm_load_si128((__m128i *)d);
247
248 __m128i m1 = _mm_set_epi32(m[7], m[6], m[5], m[4]);
249 __m128i d1 = _mm_load_si128((__m128i *)(d+4));
250
251 __m128i mc0 = mul_sym_sse3(m0, c_packed);
252 __m128i mc1 = mul_sym_sse3(m1, c_packed);
253
254 __m128i a0 = sub4_alpha_sse3(mc0);
255 __m128i a1 = sub4_alpha_sse3(mc1);
256
257 __m128i d0_sym = mul_sym_sse3(_mm_srli_epi32(d0, 24), mc0);
258 __m128i d1_sym = mul_sym_sse3(_mm_srli_epi32(d1, 24), mc1);
259
260 d0 = mul_256_sse3(a0, d0);
261 d1 = mul_256_sse3(a1, d1);
262
263 d0 = _mm_add_epi32(d0, d0_sym);
264 d1 = _mm_add_epi32(d1, d1_sym);
265
266 _mm_store_si128((__m128i *)d, d0);
267 _mm_store_si128((__m128i *)(d+4), d1);
268
269 d += 8; m += 8; l -= 8;
270 })
271}
272
/* blend_rel span names, mask x color, SSE3.
 * All DP color variants share the single _op_blend_rel_mas_c_dp_sse3 routine. */
#define _op_blend_rel_mas_caa_dp_sse3 _op_blend_rel_mas_c_dp_sse3
#define _op_blend_rel_mas_can_dp_sse3 _op_blend_rel_mas_c_dp_sse3
#define _op_blend_rel_mas_cn_dp_sse3 _op_blend_rel_mas_c_dp_sse3

/* The DP_AN variants forward to the non-rel _op_blend_mas_*_dpan_sse3 names. */
#define _op_blend_rel_mas_caa_dpan_sse3 _op_blend_mas_caa_dpan_sse3
#define _op_blend_rel_mas_can_dpan_sse3 _op_blend_mas_can_dpan_sse3
#define _op_blend_rel_mas_cn_dpan_sse3 _op_blend_mas_cn_dpan_sse3
#define _op_blend_rel_mas_c_dpan_sse3 _op_blend_mas_c_dpan_sse3
281
282static void
283init_blend_rel_mask_color_span_funcs_sse3(void)
284{
285 op_blend_rel_span_funcs[SP_N][SM_AS][SC][DP][CPU_SSE3] = _op_blend_rel_mas_c_dp_sse3;
286 op_blend_rel_span_funcs[SP_N][SM_AS][SC_N][DP][CPU_SSE3] = _op_blend_rel_mas_can_dp_sse3;
287 op_blend_rel_span_funcs[SP_N][SM_AS][SC_AN][DP][CPU_SSE3] = _op_blend_rel_mas_can_dp_sse3;
288 op_blend_rel_span_funcs[SP_N][SM_AS][SC_AA][DP][CPU_SSE3] = _op_blend_rel_mas_caa_dp_sse3;
289
290 op_blend_rel_span_funcs[SP_N][SM_AS][SC][DP_AN][CPU_SSE3] = _op_blend_rel_mas_c_dpan_sse3;
291 op_blend_rel_span_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_SSE3] = _op_blend_rel_mas_cn_dpan_sse3;
292 op_blend_rel_span_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_SSE3] = _op_blend_rel_mas_can_dpan_sse3;
293 op_blend_rel_span_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_SSE3] = _op_blend_rel_mas_caa_dpan_sse3;
294}
295
/* blend_rel point names, mask x color, SSE3.
 * The DP base name is NULL and every DP color variant forwards to it. */
#define _op_blend_rel_pt_mas_c_dp_sse3 NULL
#define _op_blend_rel_pt_mas_caa_dp_sse3 _op_blend_rel_pt_mas_c_dp_sse3
#define _op_blend_rel_pt_mas_can_dp_sse3 _op_blend_rel_pt_mas_c_dp_sse3
#define _op_blend_rel_pt_mas_cn_dp_sse3 _op_blend_rel_pt_mas_c_dp_sse3

/* The DP_AN variants forward to the non-rel _op_blend_pt_mas_*_dpan_sse3 names. */
#define _op_blend_rel_pt_mas_caa_dpan_sse3 _op_blend_pt_mas_caa_dpan_sse3
#define _op_blend_rel_pt_mas_can_dpan_sse3 _op_blend_pt_mas_can_dpan_sse3
#define _op_blend_rel_pt_mas_cn_dpan_sse3 _op_blend_pt_mas_cn_dpan_sse3
#define _op_blend_rel_pt_mas_c_dpan_sse3 _op_blend_pt_mas_c_dpan_sse3
306
307static void
308init_blend_rel_mask_color_pt_funcs_sse3(void)
309{
310 op_blend_rel_pt_funcs[SP_N][SM_AS][SC][DP][CPU_SSE3] = _op_blend_rel_pt_mas_c_dp_sse3;
311 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_N][DP][CPU_SSE3] = _op_blend_rel_pt_mas_cn_dp_sse3;
312 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AN][DP][CPU_SSE3] = _op_blend_rel_pt_mas_can_dp_sse3;
313 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AA][DP][CPU_SSE3] = _op_blend_rel_pt_mas_caa_dp_sse3;
314
315 op_blend_rel_pt_funcs[SP_N][SM_AS][SC][DP_AN][CPU_SSE3] = _op_blend_rel_pt_mas_c_dpan_sse3;
316 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_SSE3] = _op_blend_rel_pt_mas_cn_dpan_sse3;
317 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_SSE3] = _op_blend_rel_pt_mas_can_dpan_sse3;
318 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_SSE3] = _op_blend_rel_pt_mas_caa_dpan_sse3;
319}
320
321#endif
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_master_sse3.c b/src/lib/evas/common/evas_op_blend/op_blend_master_sse3.c
new file mode 100644
index 0000000000..eac6755b97
--- /dev/null
+++ b/src/lib/evas/common/evas_op_blend/op_blend_master_sse3.c
@@ -0,0 +1,77 @@
1#define NEED_SSE3 1
2
3#include "evas_common.h"
4
5#ifdef BUILD_SSE3
6static __m128i A_MASK_SSE3;
7#endif
8
9extern RGBA_Gfx_Func op_blend_span_funcs[SP_LAST][SM_LAST][SC_LAST][DP_LAST][CPU_LAST];
10extern RGBA_Gfx_Pt_Func op_blend_pt_funcs[SP_LAST][SM_LAST][SC_LAST][DP_LAST][CPU_LAST];
11
12extern RGBA_Gfx_Func op_blend_rel_span_funcs[SP_LAST][SM_LAST][SC_LAST][DP_LAST][CPU_LAST];
13extern RGBA_Gfx_Pt_Func op_blend_rel_pt_funcs[SP_LAST][SM_LAST][SC_LAST][DP_LAST][CPU_LAST];
14
15# include "op_blend_pixel_sse3.c"
16# include "op_blend_color_sse3.c"
17# include "op_blend_pixel_color_sse3.c"
18# include "op_blend_pixel_mask_sse3.c"
19# include "op_blend_mask_color_sse3.c"
20
/* Set up the SSE3 blend constants, then register the SSE3 span and point
 * routines.  Compiles to an empty function without BUILD_SSE3. */
void
evas_common_op_blend_init_sse3(void)
{
#ifdef BUILD_SSE3
   /* The same 32-bit pattern in every lane. */
   ALPHA_SSE3 = _mm_set_epi32(256, 256, 256, 256);
   A_MASK_SSE3 = _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000);
   RGB_MASK_SSE3 = _mm_set_epi32(0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF);
   GA_MASK_SSE3 = _mm_set_epi32(0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF);
   RB_MASK_SSE3 = _mm_set_epi32(0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00);

   /* This one alternates between two patterns from lane to lane. */
   SYM4_MASK_SSE3 = _mm_set_epi32(0x00FF00FF, 0x000000FF, 0x00FF00FF, 0x000000FF);

   /* span routines */
   init_blend_pixel_span_funcs_sse3();
   init_blend_pixel_color_span_funcs_sse3();
   init_blend_pixel_mask_span_funcs_sse3();
   init_blend_color_span_funcs_sse3();
   init_blend_mask_color_span_funcs_sse3();

   /* point routines */
   init_blend_pixel_pt_funcs_sse3();
   init_blend_pixel_color_pt_funcs_sse3();
   init_blend_pixel_mask_pt_funcs_sse3();
   init_blend_color_pt_funcs_sse3();
   init_blend_mask_color_pt_funcs_sse3();
#endif
}
45
/* Register the SSE3 blend_rel span and point routines.
 * Compiles to an empty function without BUILD_SSE3. */
void
evas_common_op_blend_rel_init_sse3(void)
{
#ifdef BUILD_SSE3
   /* span routines */
   init_blend_rel_pixel_span_funcs_sse3();
   init_blend_rel_pixel_color_span_funcs_sse3();
   init_blend_rel_pixel_mask_span_funcs_sse3();
   init_blend_rel_color_span_funcs_sse3();
   init_blend_rel_mask_color_span_funcs_sse3();

   /* point routines */
   init_blend_rel_pixel_pt_funcs_sse3();
   init_blend_rel_pixel_color_pt_funcs_sse3();
   init_blend_rel_pixel_mask_pt_funcs_sse3();
   init_blend_rel_color_pt_funcs_sse3();
   init_blend_rel_mask_color_pt_funcs_sse3();
#endif
}
63
64//#pragma GCC push_options
65//#pragma GCC optimize ("O0")
/* Run _op_blend_pas_dp_sse3 once over two 64-pixel stack buffers.
 * Compiles to an empty function without BUILD_SSE3. */
void
evas_common_op_sse3_test(void)
{
#ifdef BUILD_SSE3
   /* Only element 0 gets the listed value; the remaining 63 are zero. */
   DATA32 s[64] = {0x11883399};
   DATA32 d[64] = {0xff88cc33};

   /* Run-time values, presumably so the call cannot be folded away. */
   s[0] = rand();
   d[1] = rand();

   _op_blend_pas_dp_sse3(s, NULL, 0, d, 64);
   evas_common_cpu_end_opt();
#endif
}
77//#pragma GCC pop_options
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_.c
new file mode 100644
index 0000000000..b1d1196134
--- /dev/null
+++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_.c
@@ -0,0 +1,154 @@
1/* blend pixel --> dst */
2
3static void
4_op_blend_p_dp(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c EINA_UNUSED, DATA32 *d, int l) {
5 DATA32 *e;
6 int alpha;
7 UNROLL8_PLD_WHILE(d, l, e,
8 {
9 alpha = 256 - (*s >> 24);
10 *d = *s++ + MUL_256(alpha, *d);
11 d++;
12 });
13}
14
15static void
16_op_blend_pas_dp(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c EINA_UNUSED, DATA32 *d, int l) {
17 DATA32 *e;
18 int alpha;
19 UNROLL8_PLD_WHILE(d, l, e,
20 {
21 switch (*s & 0xff000000)
22 {
23 case 0:
24 break;
25 case 0xff000000:
26 *d = *s;
27 break;
28 default:
29 alpha = 256 - (*s >> 24);
30 *d = *s + MUL_256(alpha, *d);
31 break;
32 }
33 s++; d++;
34 });
35}
36
/* The "pan" span name is NULL. */
#define _op_blend_pan_dp NULL

/* DP_AN names forward to the DP name with the same source suffix. */
#define _op_blend_pan_dpan _op_blend_pan_dp
#define _op_blend_pas_dpan _op_blend_pas_dp
#define _op_blend_p_dpan _op_blend_p_dp
42
43static void
44init_blend_pixel_span_funcs_c(void)
45{
46 op_blend_span_funcs[SP][SM_N][SC_N][DP][CPU_C] = _op_blend_p_dp;
47 op_blend_span_funcs[SP_AS][SM_N][SC_N][DP][CPU_C] = _op_blend_pas_dp;
48 op_blend_span_funcs[SP_AN][SM_N][SC_N][DP][CPU_C] = _op_blend_pan_dp;
49
50 op_blend_span_funcs[SP][SM_N][SC_N][DP_AN][CPU_C] = _op_blend_p_dpan;
51 op_blend_span_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_C] = _op_blend_pas_dpan;
52 op_blend_span_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_C] = _op_blend_pan_dpan;
53}
54
55static void
56_op_blend_pt_p_dp(DATA32 s, DATA8 m EINA_UNUSED, DATA32 c, DATA32 *d) {
57 c = 256 - (s >> 24);
58 *d = s + MUL_256(c, *d);
59}
60
/* Point names: "pas" shares the "p" routine, "pan" is NULL. */
#define _op_blend_pt_pan_dp NULL
#define _op_blend_pt_pas_dp _op_blend_pt_p_dp

/* DP_AN names forward to the DP name with the same source suffix. */
#define _op_blend_pt_p_dpan _op_blend_pt_p_dp
#define _op_blend_pt_pas_dpan _op_blend_pt_pas_dp
#define _op_blend_pt_pan_dpan _op_blend_pt_pan_dp
67
68static void
69init_blend_pixel_pt_funcs_c(void)
70{
71 op_blend_pt_funcs[SP][SM_N][SC_N][DP][CPU_C] = _op_blend_pt_p_dp;
72 op_blend_pt_funcs[SP_AS][SM_N][SC_N][DP][CPU_C] = _op_blend_pt_pas_dp;
73 op_blend_pt_funcs[SP_AN][SM_N][SC_N][DP][CPU_C] = _op_blend_pt_pan_dp;
74
75 op_blend_pt_funcs[SP][SM_N][SC_N][DP_AN][CPU_C] = _op_blend_pt_p_dpan;
76 op_blend_pt_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_C] = _op_blend_pt_pas_dpan;
77 op_blend_pt_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_C] = _op_blend_pt_pan_dpan;
78}
79
80/*-----*/
81
82/* blend_rel pixel -> dst */
83
84static void
85_op_blend_rel_p_dp(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
86 DATA32 *e;
87 int alpha;
88 UNROLL8_PLD_WHILE(d, l, e,
89 {
90 alpha = 256 - (*s >> 24);
91 c = 1 + (*d >> 24);
92 *d = MUL_256(c, *s) + MUL_256(alpha, *d);
93 d++;
94 s++;
95 });
96}
97
98static void
99_op_blend_rel_pan_dp(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
100 DATA32 *e;
101 UNROLL8_PLD_WHILE(d, l, e,
102 {
103 c = 1 + (*d >> 24);
104 *d++ = MUL_256(c, *s);
105 s++;
106 });
107}
108
/* blend_rel span names: "pas" shares the "p" routine. */
#define _op_blend_rel_pas_dp _op_blend_rel_p_dp

/* The DP_AN variants forward to the non-rel _op_blend_*_dpan names. */
#define _op_blend_rel_pas_dpan _op_blend_pas_dpan
#define _op_blend_rel_pan_dpan _op_blend_pan_dpan
#define _op_blend_rel_p_dpan _op_blend_p_dpan
114
115static void
116init_blend_rel_pixel_span_funcs_c(void)
117{
118 op_blend_rel_span_funcs[SP][SM_N][SC_N][DP][CPU_C] = _op_blend_rel_p_dp;
119 op_blend_rel_span_funcs[SP_AS][SM_N][SC_N][DP][CPU_C] = _op_blend_rel_pas_dp;
120 op_blend_rel_span_funcs[SP_AN][SM_N][SC_N][DP][CPU_C] = _op_blend_rel_pan_dp;
121
122 op_blend_rel_span_funcs[SP][SM_N][SC_N][DP_AN][CPU_C] = _op_blend_rel_p_dpan;
123 op_blend_rel_span_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_C] = _op_blend_rel_pas_dpan;
124 op_blend_rel_span_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_C] = _op_blend_rel_pan_dpan;
125}
126
127static void
128_op_blend_rel_pt_p_dp(DATA32 s, DATA8 m EINA_UNUSED, DATA32 c, DATA32 *d) {
129 c = 256 - (s >> 24);
130 *d = MUL_SYM(*d >> 24, s) + MUL_256(c, *d);
131}
132
133static void
134_op_blend_rel_pt_pan_dp(DATA32 s, DATA8 m EINA_UNUSED, DATA32 c EINA_UNUSED, DATA32 *d) {
135 *d = MUL_SYM(*d >> 24, s);
136}
137
/* blend_rel point names: "pas" shares the "p" routine. */
#define _op_blend_rel_pt_pas_dp _op_blend_rel_pt_p_dp

/* The DP_AN variants forward to the non-rel _op_blend_pt_*_dpan names. */
#define _op_blend_rel_pt_pas_dpan _op_blend_pt_pas_dpan
#define _op_blend_rel_pt_pan_dpan _op_blend_pt_pan_dpan
#define _op_blend_rel_pt_p_dpan _op_blend_pt_p_dpan
143
144static void
145init_blend_rel_pixel_pt_funcs_c(void)
146{
147 op_blend_rel_pt_funcs[SP][SM_N][SC_N][DP][CPU_C] = _op_blend_rel_pt_p_dp;
148 op_blend_rel_pt_funcs[SP_AS][SM_N][SC_N][DP][CPU_C] = _op_blend_rel_pt_pas_dp;
149 op_blend_rel_pt_funcs[SP_AN][SM_N][SC_N][DP][CPU_C] = _op_blend_rel_pt_pan_dp;
150
151 op_blend_rel_pt_funcs[SP][SM_N][SC_N][DP_AN][CPU_C] = _op_blend_rel_pt_p_dpan;
152 op_blend_rel_pt_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_C] = _op_blend_rel_pt_pas_dpan;
153 op_blend_rel_pt_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_C] = _op_blend_rel_pt_pan_dpan;
154}
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_.c
new file mode 100644
index 0000000000..8d6e944def
--- /dev/null
+++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_.c
@@ -0,0 +1,276 @@
1/* blend pixel x color --> dst */
2
3static void
4_op_blend_p_c_dp(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
5 DATA32 *e;
6 int alpha;
7 UNROLL8_PLD_WHILE(d, l, e,
8 {
9 DATA32 sc = MUL4_SYM(c, *s);
10 alpha = 256 - (sc >> 24);
11 *d = sc + MUL_256(alpha, *d);
12 d++;
13 s++;
14 });
15}
16
17static void
18_op_blend_pan_c_dp(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
19 DATA32 *e;
20 int alpha = 256 - (c >> 24);
21 UNROLL8_PLD_WHILE(d, l, e,
22 {
23 *d = ((c & 0xff000000) + MUL3_SYM(c, *s)) + MUL_256(alpha, *d);
24 d++;
25 s++;
26 });
27}
28
29static void
30_op_blend_p_can_dp(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
31 DATA32 *e;
32 int alpha;
33 UNROLL8_PLD_WHILE(d, l, e,
34 {
35 alpha = 256 - (*s >> 24);
36 *d = ((*s & 0xff000000) + MUL3_SYM(c, *s)) + MUL_256(alpha, *d);
37 d++;
38 s++;
39 });
40}
41
42static void
43_op_blend_pan_can_dp(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
44 DATA32 *e;
45 UNROLL8_PLD_WHILE(d, l, e,
46 {
47 *d++ = 0xff000000 + MUL3_SYM(c, *s);
48 s++;
49 });
50}
51
52static void
53_op_blend_p_caa_dp(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
54 DATA32 *e;
55 int alpha;
56 c = 1 + (c & 0xff);
57 UNROLL8_PLD_WHILE(d, l, e,
58 {
59 DATA32 sc = MUL_256(c, *s);
60 alpha = 256 - (sc >> 24);
61 *d = sc + MUL_256(alpha, *d);
62 d++;
63 s++;
64 });
65}
66
67static void
68_op_blend_pan_caa_dp(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
69 DATA32 *e;
70 c = 1 + (c & 0xff);
71 UNROLL8_PLD_WHILE(d, l, e,
72 {
73 *d = INTERP_256(c, *s, *d);
74 d++;
75 s++;
76 });
77}
78
/* "pas" span names share the "p" routine for each color variant. */
#define _op_blend_pas_caa_dp _op_blend_p_caa_dp
#define _op_blend_pas_can_dp _op_blend_p_can_dp
#define _op_blend_pas_c_dp _op_blend_p_c_dp

/* DP_AN names forward to the DP name with the same source/color suffix. */
#define _op_blend_p_c_dpan _op_blend_p_c_dp
#define _op_blend_p_can_dpan _op_blend_p_can_dp
#define _op_blend_p_caa_dpan _op_blend_p_caa_dp

#define _op_blend_pas_c_dpan _op_blend_pas_c_dp
#define _op_blend_pas_can_dpan _op_blend_pas_can_dp
#define _op_blend_pas_caa_dpan _op_blend_pas_caa_dp

#define _op_blend_pan_c_dpan _op_blend_pan_c_dp
#define _op_blend_pan_can_dpan _op_blend_pan_can_dp
#define _op_blend_pan_caa_dpan _op_blend_pan_caa_dp
92
93static void
94init_blend_pixel_color_span_funcs_c(void)
95{
96 op_blend_span_funcs[SP][SM_N][SC][DP][CPU_C] = _op_blend_p_c_dp;
97 op_blend_span_funcs[SP_AS][SM_N][SC][DP][CPU_C] = _op_blend_pas_c_dp;
98 op_blend_span_funcs[SP_AN][SM_N][SC][DP][CPU_C] = _op_blend_pan_c_dp;
99 op_blend_span_funcs[SP][SM_N][SC_AN][DP][CPU_C] = _op_blend_p_can_dp;
100 op_blend_span_funcs[SP_AS][SM_N][SC_AN][DP][CPU_C] = _op_blend_pas_can_dp;
101 op_blend_span_funcs[SP_AN][SM_N][SC_AN][DP][CPU_C] = _op_blend_pan_can_dp;
102 op_blend_span_funcs[SP][SM_N][SC_AA][DP][CPU_C] = _op_blend_p_caa_dp;
103 op_blend_span_funcs[SP_AS][SM_N][SC_AA][DP][CPU_C] = _op_blend_pas_caa_dp;
104 op_blend_span_funcs[SP_AN][SM_N][SC_AA][DP][CPU_C] = _op_blend_pan_caa_dp;
105
106 op_blend_span_funcs[SP][SM_N][SC][DP_AN][CPU_C] = _op_blend_p_c_dpan;
107 op_blend_span_funcs[SP_AS][SM_N][SC][DP_AN][CPU_C] = _op_blend_pas_c_dpan;
108 op_blend_span_funcs[SP_AN][SM_N][SC][DP_AN][CPU_C] = _op_blend_pan_c_dpan;
109 op_blend_span_funcs[SP][SM_N][SC_AN][DP_AN][CPU_C] = _op_blend_p_can_dpan;
110 op_blend_span_funcs[SP_AS][SM_N][SC_AN][DP_AN][CPU_C] = _op_blend_pas_can_dpan;
111 op_blend_span_funcs[SP_AN][SM_N][SC_AN][DP_AN][CPU_C] = _op_blend_pan_can_dpan;
112 op_blend_span_funcs[SP][SM_N][SC_AA][DP_AN][CPU_C] = _op_blend_p_caa_dpan;
113 op_blend_span_funcs[SP_AS][SM_N][SC_AA][DP_AN][CPU_C] = _op_blend_pas_caa_dpan;
114 op_blend_span_funcs[SP_AN][SM_N][SC_AA][DP_AN][CPU_C] = _op_blend_pan_caa_dpan;
115}
116
117static void
118_op_blend_pt_p_c_dp(DATA32 s, DATA8 m EINA_UNUSED, DATA32 c, DATA32 *d) {
119 s = MUL4_SYM(c, s);
120 c = 256 - (s >> 24);
121 *d = s + MUL_256(c, *d);
122}
123
/* Every DP point variant of pixel x color uses the one _op_blend_pt_p_c_dp
 * routine. */
#define _op_blend_pt_pas_c_dp _op_blend_pt_p_c_dp
#define _op_blend_pt_pan_c_dp _op_blend_pt_p_c_dp

#define _op_blend_pt_p_can_dp _op_blend_pt_p_c_dp
#define _op_blend_pt_pas_can_dp _op_blend_pt_p_c_dp
#define _op_blend_pt_pan_can_dp _op_blend_pt_p_c_dp

#define _op_blend_pt_p_caa_dp _op_blend_pt_p_c_dp
#define _op_blend_pt_pas_caa_dp _op_blend_pt_p_c_dp
#define _op_blend_pt_pan_caa_dp _op_blend_pt_p_c_dp

/* DP_AN names forward to the DP name with the same source/color suffix. */
#define _op_blend_pt_p_c_dpan _op_blend_pt_p_c_dp
#define _op_blend_pt_p_can_dpan _op_blend_pt_p_can_dp
#define _op_blend_pt_p_caa_dpan _op_blend_pt_p_caa_dp

#define _op_blend_pt_pas_c_dpan _op_blend_pt_pas_c_dp
#define _op_blend_pt_pas_can_dpan _op_blend_pt_pas_can_dp
#define _op_blend_pt_pas_caa_dpan _op_blend_pt_pas_caa_dp

#define _op_blend_pt_pan_c_dpan _op_blend_pt_pan_c_dp
#define _op_blend_pt_pan_can_dpan _op_blend_pt_pan_can_dp
#define _op_blend_pt_pan_caa_dpan _op_blend_pt_pan_caa_dp
142
143static void
144init_blend_pixel_color_pt_funcs_c(void)
145{
146 op_blend_pt_funcs[SP][SM_N][SC][DP][CPU_C] = _op_blend_pt_p_c_dp;
147 op_blend_pt_funcs[SP_AS][SM_N][SC][DP][CPU_C] = _op_blend_pt_pas_c_dp;
148 op_blend_pt_funcs[SP_AN][SM_N][SC][DP][CPU_C] = _op_blend_pt_pan_c_dp;
149 op_blend_pt_funcs[SP][SM_N][SC_AN][DP][CPU_C] = _op_blend_pt_p_can_dp;
150 op_blend_pt_funcs[SP_AS][SM_N][SC_AN][DP][CPU_C] = _op_blend_pt_pas_can_dp;
151 op_blend_pt_funcs[SP_AN][SM_N][SC_AN][DP][CPU_C] = _op_blend_pt_pan_can_dp;
152 op_blend_pt_funcs[SP][SM_N][SC_AA][DP][CPU_C] = _op_blend_pt_p_caa_dp;
153 op_blend_pt_funcs[SP_AS][SM_N][SC_AA][DP][CPU_C] = _op_blend_pt_pas_caa_dp;
154 op_blend_pt_funcs[SP_AN][SM_N][SC_AA][DP][CPU_C] = _op_blend_pt_pan_caa_dp;
155
156 op_blend_pt_funcs[SP][SM_N][SC][DP_AN][CPU_C] = _op_blend_pt_p_c_dpan;
157 op_blend_pt_funcs[SP_AS][SM_N][SC][DP_AN][CPU_C] = _op_blend_pt_pas_c_dpan;
158 op_blend_pt_funcs[SP_AN][SM_N][SC][DP_AN][CPU_C] = _op_blend_pt_pan_c_dpan;
159 op_blend_pt_funcs[SP][SM_N][SC_AN][DP_AN][CPU_C] = _op_blend_pt_p_can_dpan;
160 op_blend_pt_funcs[SP_AS][SM_N][SC_AN][DP_AN][CPU_C] = _op_blend_pt_pas_can_dpan;
161 op_blend_pt_funcs[SP_AN][SM_N][SC_AN][DP_AN][CPU_C] = _op_blend_pt_pan_can_dpan;
162 op_blend_pt_funcs[SP][SM_N][SC_AA][DP_AN][CPU_C] = _op_blend_pt_p_caa_dpan;
163 op_blend_pt_funcs[SP_AS][SM_N][SC_AA][DP_AN][CPU_C] = _op_blend_pt_pas_caa_dpan;
164 op_blend_pt_funcs[SP_AN][SM_N][SC_AA][DP_AN][CPU_C] = _op_blend_pt_pan_caa_dpan;
165}
166
167/*-----*/
168
169/* blend_rel pixel x color -> dst */
170
171static void
172_op_blend_rel_p_c_dp(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
173 DATA32 *e;
174 int alpha;
175 UNROLL8_PLD_WHILE(d, l, e,
176 {
177 DATA32 sc = MUL4_SYM(c, *s);
178 alpha = 256 - (sc >> 24);
179 *d = MUL_SYM(*d >> 24, sc) + MUL_256(alpha, *d);
180 d++;
181 s++;
182 });
183}
184
/* Every DP span variant of blend_rel pixel x color uses the one
 * _op_blend_rel_p_c_dp routine. */
#define _op_blend_rel_pas_c_dp _op_blend_rel_p_c_dp
#define _op_blend_rel_pan_c_dp _op_blend_rel_p_c_dp

#define _op_blend_rel_p_can_dp _op_blend_rel_p_c_dp
#define _op_blend_rel_pas_can_dp _op_blend_rel_p_c_dp
#define _op_blend_rel_pan_can_dp _op_blend_rel_p_c_dp

#define _op_blend_rel_p_caa_dp _op_blend_rel_p_c_dp
#define _op_blend_rel_pas_caa_dp _op_blend_rel_p_c_dp
#define _op_blend_rel_pan_caa_dp _op_blend_rel_p_c_dp

/* The DP_AN variants forward to the non-rel _op_blend_*_dpan names. */
#define _op_blend_rel_p_c_dpan _op_blend_p_c_dpan
#define _op_blend_rel_p_can_dpan _op_blend_p_can_dpan
#define _op_blend_rel_p_caa_dpan _op_blend_p_caa_dpan

#define _op_blend_rel_pas_c_dpan _op_blend_pas_c_dpan
#define _op_blend_rel_pas_can_dpan _op_blend_pas_can_dpan
#define _op_blend_rel_pas_caa_dpan _op_blend_pas_caa_dpan

#define _op_blend_rel_pan_c_dpan _op_blend_pan_c_dpan
#define _op_blend_rel_pan_can_dpan _op_blend_pan_can_dpan
#define _op_blend_rel_pan_caa_dpan _op_blend_pan_caa_dpan
203
204static void
205init_blend_rel_pixel_color_span_funcs_c(void)
206{
207 op_blend_rel_span_funcs[SP][SM_N][SC][DP][CPU_C] = _op_blend_rel_p_c_dp;
208 op_blend_rel_span_funcs[SP_AS][SM_N][SC][DP][CPU_C] = _op_blend_rel_pas_c_dp;
209 op_blend_rel_span_funcs[SP_AN][SM_N][SC][DP][CPU_C] = _op_blend_rel_pan_c_dp;
210 op_blend_rel_span_funcs[SP][SM_N][SC_AN][DP][CPU_C] = _op_blend_rel_p_can_dp;
211 op_blend_rel_span_funcs[SP_AS][SM_N][SC_AN][DP][CPU_C] = _op_blend_rel_pas_can_dp;
212 op_blend_rel_span_funcs[SP_AN][SM_N][SC_AN][DP][CPU_C] = _op_blend_rel_pan_can_dp;
213 op_blend_rel_span_funcs[SP][SM_N][SC_AA][DP][CPU_C] = _op_blend_rel_p_caa_dp;
214 op_blend_rel_span_funcs[SP_AS][SM_N][SC_AA][DP][CPU_C] = _op_blend_rel_pas_caa_dp;
215 op_blend_rel_span_funcs[SP_AN][SM_N][SC_AA][DP][CPU_C] = _op_blend_rel_pan_caa_dp;
216
217 op_blend_rel_span_funcs[SP][SM_N][SC][DP_AN][CPU_C] = _op_blend_rel_p_c_dpan;
218 op_blend_rel_span_funcs[SP_AS][SM_N][SC][DP_AN][CPU_C] = _op_blend_rel_pas_c_dpan;
219 op_blend_rel_span_funcs[SP_AN][SM_N][SC][DP_AN][CPU_C] = _op_blend_rel_pan_c_dpan;
220 op_blend_rel_span_funcs[SP][SM_N][SC_AN][DP_AN][CPU_C] = _op_blend_rel_p_can_dpan;
221 op_blend_rel_span_funcs[SP_AS][SM_N][SC_AN][DP_AN][CPU_C] = _op_blend_rel_pas_can_dpan;
222 op_blend_rel_span_funcs[SP_AN][SM_N][SC_AN][DP_AN][CPU_C] = _op_blend_rel_pan_can_dpan;
223 op_blend_rel_span_funcs[SP][SM_N][SC_AA][DP_AN][CPU_C] = _op_blend_rel_p_caa_dpan;
224 op_blend_rel_span_funcs[SP_AS][SM_N][SC_AA][DP_AN][CPU_C] = _op_blend_rel_pas_caa_dpan;
225 op_blend_rel_span_funcs[SP_AN][SM_N][SC_AA][DP_AN][CPU_C] = _op_blend_rel_pan_caa_dpan;
226}
227
228static void
229_op_blend_rel_pt_p_c_dp(DATA32 s, DATA8 m EINA_UNUSED, DATA32 c, DATA32 *d) {
230 s = MUL4_SYM(c, s);
231 c = 256 - (s >> 24);
232 *d = MUL_SYM(*d >> 24, s) + MUL_256(c, *d);
233}
234
/* Every DP point variant of blend_rel pixel x color uses the one
 * _op_blend_rel_pt_p_c_dp routine. */
#define _op_blend_rel_pt_pas_c_dp _op_blend_rel_pt_p_c_dp
#define _op_blend_rel_pt_pan_c_dp _op_blend_rel_pt_p_c_dp

#define _op_blend_rel_pt_p_can_dp _op_blend_rel_pt_p_c_dp
#define _op_blend_rel_pt_pas_can_dp _op_blend_rel_pt_p_c_dp
#define _op_blend_rel_pt_pan_can_dp _op_blend_rel_pt_p_c_dp

#define _op_blend_rel_pt_p_caa_dp _op_blend_rel_pt_p_c_dp
#define _op_blend_rel_pt_pas_caa_dp _op_blend_rel_pt_p_c_dp
#define _op_blend_rel_pt_pan_caa_dp _op_blend_rel_pt_p_c_dp

/* The DP_AN variants forward to the non-rel _op_blend_pt_*_dpan names. */
#define _op_blend_rel_pt_p_c_dpan _op_blend_pt_p_c_dpan
#define _op_blend_rel_pt_p_can_dpan _op_blend_pt_p_can_dpan
#define _op_blend_rel_pt_p_caa_dpan _op_blend_pt_p_caa_dpan

#define _op_blend_rel_pt_pas_c_dpan _op_blend_pt_pas_c_dpan
#define _op_blend_rel_pt_pas_can_dpan _op_blend_pt_pas_can_dpan
#define _op_blend_rel_pt_pas_caa_dpan _op_blend_pt_pas_caa_dpan

#define _op_blend_rel_pt_pan_c_dpan _op_blend_pt_pan_c_dpan
#define _op_blend_rel_pt_pan_can_dpan _op_blend_pt_pan_can_dpan
#define _op_blend_rel_pt_pan_caa_dpan _op_blend_pt_pan_caa_dpan
253
254static void
255init_blend_rel_pixel_color_pt_funcs_c(void)
256{
257 op_blend_rel_pt_funcs[SP][SM_N][SC][DP][CPU_C] = _op_blend_rel_pt_p_c_dp;
258 op_blend_rel_pt_funcs[SP_AS][SM_N][SC][DP][CPU_C] = _op_blend_rel_pt_pas_c_dp;
259 op_blend_rel_pt_funcs[SP_AN][SM_N][SC][DP][CPU_C] = _op_blend_rel_pt_pan_c_dp;
260 op_blend_rel_pt_funcs[SP][SM_N][SC_AN][DP][CPU_C] = _op_blend_rel_pt_p_can_dp;
261 op_blend_rel_pt_funcs[SP_AS][SM_N][SC_AN][DP][CPU_C] = _op_blend_rel_pt_pas_can_dp;
262 op_blend_rel_pt_funcs[SP_AN][SM_N][SC_AN][DP][CPU_C] = _op_blend_rel_pt_pan_can_dp;
263 op_blend_rel_pt_funcs[SP][SM_N][SC_AA][DP][CPU_C] = _op_blend_rel_pt_p_caa_dp;
264 op_blend_rel_pt_funcs[SP_AS][SM_N][SC_AA][DP][CPU_C] = _op_blend_rel_pt_pas_caa_dp;
265 op_blend_rel_pt_funcs[SP_AN][SM_N][SC_AA][DP][CPU_C] = _op_blend_rel_pt_pan_caa_dp;
266
267 op_blend_rel_pt_funcs[SP][SM_N][SC][DP_AN][CPU_C] = _op_blend_rel_pt_p_c_dpan;
268 op_blend_rel_pt_funcs[SP_AS][SM_N][SC][DP_AN][CPU_C] = _op_blend_rel_pt_pas_c_dpan;
269 op_blend_rel_pt_funcs[SP_AN][SM_N][SC][DP_AN][CPU_C] = _op_blend_rel_pt_pan_c_dpan;
270 op_blend_rel_pt_funcs[SP][SM_N][SC_AN][DP_AN][CPU_C] = _op_blend_rel_pt_p_can_dpan;
271 op_blend_rel_pt_funcs[SP_AS][SM_N][SC_AN][DP_AN][CPU_C] = _op_blend_rel_pt_pas_can_dpan;
272 op_blend_rel_pt_funcs[SP_AN][SM_N][SC_AN][DP_AN][CPU_C] = _op_blend_rel_pt_pan_can_dpan;
273 op_blend_rel_pt_funcs[SP][SM_N][SC_AA][DP_AN][CPU_C] = _op_blend_rel_pt_p_caa_dpan;
274 op_blend_rel_pt_funcs[SP_AS][SM_N][SC_AA][DP_AN][CPU_C] = _op_blend_rel_pt_pas_caa_dpan;
275 op_blend_rel_pt_funcs[SP_AN][SM_N][SC_AA][DP_AN][CPU_C] = _op_blend_rel_pt_pan_caa_dpan;
276}
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_i386.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_i386.c
new file mode 100644
index 0000000000..7cb97ae318
--- /dev/null
+++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_i386.c
@@ -0,0 +1,221 @@
1/* blend pixel x color --> dst */
2
3#ifdef BUILD_MMX
4static void
5_op_blend_p_c_dp_mmx(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
6 DATA32 *e = d + l;
7 MOV_A2R(ALPHA_256, mm6)
8 MOV_A2R(ALPHA_255, mm5)
9 pxor_r2r(mm0, mm0);
10 MOV_P2R(c, mm2, mm0)
11 while (d < e) {
12 MOV_P2R(*s, mm3, mm0)
13 MUL4_SYM_R2R(mm2, mm3, mm5)
14
15 MOV_RA2R(mm3, mm1)
16 movq_r2r(mm6, mm4);
17 psubw_r2r(mm1, mm4);
18
19 MOV_P2R(*d, mm1, mm0)
20 MUL4_256_R2R(mm4, mm1)
21
22 paddw_r2r(mm3, mm1);
23 MOV_R2P(mm1, *d, mm0)
24 s++; d++;
25 }
26}
27
28static void
29_op_blend_pan_can_dp_mmx(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
30 DATA32 *e = d + l;
31 pxor_r2r(mm0, mm0);
32 MOV_P2R(c, mm2, mm0)
33 MOV_A2R(ALPHA_255, mm5)
34 while (d < e) {
35 MOV_P2R(*s, mm1, mm0)
36 MUL4_SYM_R2R(mm2, mm1, mm5)
37 MOV_R2P(mm1, *d, mm0)
38 s++; d++;
39 }
40}
41
42static void
43_op_blend_pan_caa_dp_mmx(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
44 DATA32 *e = d + l;
45 c = 1 + (c & 0xff);
46 MOV_A2R(c, mm2)
47 MOV_A2R(ALPHA_255, mm5)
48 pxor_r2r(mm0, mm0);
49 while (d < e) {
50 MOV_P2R(*s, mm3, mm0)
51 MOV_P2R(*d, mm1, mm0)
52 INTERP_256_R2R(mm2, mm3, mm1, mm5)
53 MOV_R2P(mm1, *d, mm0)
54 s++; d++;
55 }
56}
57
/* MMX span names for pixel x color.  These six DP variants share the
 * _op_blend_p_c_dp_mmx routine; pan_can and pan_caa have their own. */
#define _op_blend_pas_c_dp_mmx _op_blend_p_c_dp_mmx
#define _op_blend_pan_c_dp_mmx _op_blend_p_c_dp_mmx

#define _op_blend_p_can_dp_mmx _op_blend_p_c_dp_mmx
#define _op_blend_pas_can_dp_mmx _op_blend_p_c_dp_mmx

#define _op_blend_p_caa_dp_mmx _op_blend_p_c_dp_mmx
#define _op_blend_pas_caa_dp_mmx _op_blend_p_c_dp_mmx

/* DP_AN names forward to the DP name with the same source/color suffix. */
#define _op_blend_p_c_dpan_mmx _op_blend_p_c_dp_mmx
#define _op_blend_p_can_dpan_mmx _op_blend_p_can_dp_mmx
#define _op_blend_p_caa_dpan_mmx _op_blend_p_caa_dp_mmx

#define _op_blend_pas_c_dpan_mmx _op_blend_pas_c_dp_mmx
#define _op_blend_pas_can_dpan_mmx _op_blend_pas_can_dp_mmx
#define _op_blend_pas_caa_dpan_mmx _op_blend_pas_caa_dp_mmx

#define _op_blend_pan_c_dpan_mmx _op_blend_pan_c_dp_mmx
#define _op_blend_pan_can_dpan_mmx _op_blend_pan_can_dp_mmx
#define _op_blend_pan_caa_dpan_mmx _op_blend_pan_caa_dp_mmx
74
75
76static void
77init_blend_pixel_color_span_funcs_mmx(void)
78{
79 op_blend_span_funcs[SP][SM_N][SC][DP][CPU_MMX] = _op_blend_p_c_dp_mmx;
80 op_blend_span_funcs[SP_AS][SM_N][SC][DP][CPU_MMX] = _op_blend_pas_c_dp_mmx;
81 op_blend_span_funcs[SP_AN][SM_N][SC][DP][CPU_MMX] = _op_blend_pan_c_dp_mmx;
82 op_blend_span_funcs[SP][SM_N][SC_AN][DP][CPU_MMX] = _op_blend_p_can_dp_mmx;
83 op_blend_span_funcs[SP_AS][SM_N][SC_AN][DP][CPU_MMX] = _op_blend_pas_can_dp_mmx;
84 op_blend_span_funcs[SP_AN][SM_N][SC_AN][DP][CPU_MMX] = _op_blend_pan_can_dp_mmx;
85 op_blend_span_funcs[SP][SM_N][SC_AA][DP][CPU_MMX] = _op_blend_p_caa_dp_mmx;
86 op_blend_span_funcs[SP_AS][SM_N][SC_AA][DP][CPU_MMX] = _op_blend_pas_caa_dp_mmx;
87 op_blend_span_funcs[SP_AN][SM_N][SC_AA][DP][CPU_MMX] = _op_blend_pan_caa_dp_mmx;
88
89 op_blend_span_funcs[SP][SM_N][SC][DP_AN][CPU_MMX] = _op_blend_p_c_dpan_mmx;
90 op_blend_span_funcs[SP_AS][SM_N][SC][DP_AN][CPU_MMX] = _op_blend_pas_c_dpan_mmx;
91 op_blend_span_funcs[SP_AN][SM_N][SC][DP_AN][CPU_MMX] = _op_blend_pan_c_dpan_mmx;
92 op_blend_span_funcs[SP][SM_N][SC_AN][DP_AN][CPU_MMX] = _op_blend_p_can_dpan_mmx;
93 op_blend_span_funcs[SP_AS][SM_N][SC_AN][DP_AN][CPU_MMX] = _op_blend_pas_can_dpan_mmx;
94 op_blend_span_funcs[SP_AN][SM_N][SC_AN][DP_AN][CPU_MMX] = _op_blend_pan_can_dpan_mmx;
95 op_blend_span_funcs[SP][SM_N][SC_AA][DP_AN][CPU_MMX] = _op_blend_p_caa_dpan_mmx;
96 op_blend_span_funcs[SP_AS][SM_N][SC_AA][DP_AN][CPU_MMX] = _op_blend_pas_caa_dpan_mmx;
97 op_blend_span_funcs[SP_AN][SM_N][SC_AA][DP_AN][CPU_MMX] = _op_blend_pan_caa_dpan_mmx;
98}
99#endif
100
101#ifdef BUILD_MMX
102static void
103_op_blend_pt_p_c_dp_mmx(DATA32 s, DATA8 m EINA_UNUSED, DATA32 c, DATA32 *d) {
104 MOV_A2R(ALPHA_256, mm4)
105 MOV_A2R(ALPHA_255, mm5)
106 pxor_r2r(mm0, mm0);
107 MOV_P2R(c, mm2, mm0)
108 MOV_P2R(s, mm3, mm0)
109 MUL4_SYM_R2R(mm2, mm3, mm5)
110
111 MOV_RA2R(mm3, mm1)
112 psubw_r2r(mm1, mm4);
113
114 MOV_P2R(*d, mm1, mm0)
115 MUL4_256_R2R(mm4, mm1)
116
117 paddw_r2r(mm3, mm1);
118 MOV_R2P(mm1, *d, mm0)
119}
120
/* MMX point names for pixel x color: every DP and DP_AN variant names the
 * one _op_blend_pt_p_c_dp_mmx routine directly. */
#define _op_blend_pt_pas_c_dp_mmx _op_blend_pt_p_c_dp_mmx
#define _op_blend_pt_pan_c_dp_mmx _op_blend_pt_p_c_dp_mmx

#define _op_blend_pt_p_can_dp_mmx _op_blend_pt_p_c_dp_mmx
#define _op_blend_pt_pas_can_dp_mmx _op_blend_pt_p_c_dp_mmx
#define _op_blend_pt_pan_can_dp_mmx _op_blend_pt_p_c_dp_mmx

#define _op_blend_pt_p_caa_dp_mmx _op_blend_pt_p_c_dp_mmx
#define _op_blend_pt_pas_caa_dp_mmx _op_blend_pt_p_c_dp_mmx
#define _op_blend_pt_pan_caa_dp_mmx _op_blend_pt_p_c_dp_mmx

#define _op_blend_pt_p_c_dpan_mmx _op_blend_pt_p_c_dp_mmx
#define _op_blend_pt_pas_c_dpan_mmx _op_blend_pt_p_c_dp_mmx
#define _op_blend_pt_pan_c_dpan_mmx _op_blend_pt_p_c_dp_mmx

#define _op_blend_pt_p_can_dpan_mmx _op_blend_pt_p_c_dp_mmx
#define _op_blend_pt_pas_can_dpan_mmx _op_blend_pt_p_c_dp_mmx
#define _op_blend_pt_pan_can_dpan_mmx _op_blend_pt_p_c_dp_mmx

#define _op_blend_pt_p_caa_dpan_mmx _op_blend_pt_p_c_dp_mmx
#define _op_blend_pt_pas_caa_dpan_mmx _op_blend_pt_p_c_dp_mmx
#define _op_blend_pt_pan_caa_dpan_mmx _op_blend_pt_p_c_dp_mmx
139
140static void
141init_blend_pixel_color_pt_funcs_mmx(void)
142{
143 op_blend_pt_funcs[SP][SM_N][SC][DP][CPU_MMX] = _op_blend_pt_p_c_dp_mmx;
144 op_blend_pt_funcs[SP_AS][SM_N][SC][DP][CPU_MMX] = _op_blend_pt_pas_c_dp_mmx;
145 op_blend_pt_funcs[SP_AN][SM_N][SC][DP][CPU_MMX] = _op_blend_pt_pan_c_dp_mmx;
146 op_blend_pt_funcs[SP][SM_N][SC_AN][DP][CPU_MMX] = _op_blend_pt_p_can_dp_mmx;
147 op_blend_pt_funcs[SP_AS][SM_N][SC_AN][DP][CPU_MMX] = _op_blend_pt_pas_can_dp_mmx;
148 op_blend_pt_funcs[SP_AN][SM_N][SC_AN][DP][CPU_MMX] = _op_blend_pt_pan_can_dp_mmx;
149 op_blend_pt_funcs[SP][SM_N][SC_AA][DP][CPU_MMX] = _op_blend_pt_p_caa_dp_mmx;
150 op_blend_pt_funcs[SP_AS][SM_N][SC_AA][DP][CPU_MMX] = _op_blend_pt_pas_caa_dp_mmx;
151 op_blend_pt_funcs[SP_AN][SM_N][SC_AA][DP][CPU_MMX] = _op_blend_pt_pan_caa_dp_mmx;
152
153 op_blend_pt_funcs[SP][SM_N][SC][DP_AN][CPU_MMX] = _op_blend_pt_p_c_dpan_mmx;
154 op_blend_pt_funcs[SP_AS][SM_N][SC][DP_AN][CPU_MMX] = _op_blend_pt_pas_c_dpan_mmx;
155 op_blend_pt_funcs[SP_AN][SM_N][SC][DP_AN][CPU_MMX] = _op_blend_pt_pan_c_dpan_mmx;
156 op_blend_pt_funcs[SP][SM_N][SC_AN][DP_AN][CPU_MMX] = _op_blend_pt_p_can_dpan_mmx;
157 op_blend_pt_funcs[SP_AS][SM_N][SC_AN][DP_AN][CPU_MMX] = _op_blend_pt_pas_can_dpan_mmx;
158 op_blend_pt_funcs[SP_AN][SM_N][SC_AN][DP_AN][CPU_MMX] = _op_blend_pt_pan_can_dpan_mmx;
159 op_blend_pt_funcs[SP][SM_N][SC_AA][DP_AN][CPU_MMX] = _op_blend_pt_p_caa_dpan_mmx;
160 op_blend_pt_funcs[SP_AS][SM_N][SC_AA][DP_AN][CPU_MMX] = _op_blend_pt_pas_caa_dpan_mmx;
161 op_blend_pt_funcs[SP_AN][SM_N][SC_AA][DP_AN][CPU_MMX] = _op_blend_pt_pan_caa_dpan_mmx;
162}
163#endif
164
165/*-----*/
166
167/* blend_rel pixel x color -> dst */
168
169#ifdef BUILD_MMX
170
/* MMX blend_rel span names, DP_AN only: each forwards to the non-rel
 * _op_blend_*_dpan_mmx name with the same source/color suffix. */
#define _op_blend_rel_p_c_dpan_mmx _op_blend_p_c_dpan_mmx
#define _op_blend_rel_p_can_dpan_mmx _op_blend_p_can_dpan_mmx
#define _op_blend_rel_p_caa_dpan_mmx _op_blend_p_caa_dpan_mmx

#define _op_blend_rel_pas_c_dpan_mmx _op_blend_pas_c_dpan_mmx
#define _op_blend_rel_pas_can_dpan_mmx _op_blend_pas_can_dpan_mmx
#define _op_blend_rel_pas_caa_dpan_mmx _op_blend_pas_caa_dpan_mmx

#define _op_blend_rel_pan_c_dpan_mmx _op_blend_pan_c_dpan_mmx
#define _op_blend_rel_pan_can_dpan_mmx _op_blend_pan_can_dpan_mmx
#define _op_blend_rel_pan_caa_dpan_mmx _op_blend_pan_caa_dpan_mmx
180
181static void
182init_blend_rel_pixel_color_span_funcs_mmx(void)
183{
184 op_blend_rel_span_funcs[SP][SM_N][SC][DP_AN][CPU_MMX] = _op_blend_rel_p_c_dpan_mmx;
185 op_blend_rel_span_funcs[SP_AS][SM_N][SC][DP_AN][CPU_MMX] = _op_blend_rel_pas_c_dpan_mmx;
186 op_blend_rel_span_funcs[SP_AN][SM_N][SC][DP_AN][CPU_MMX] = _op_blend_rel_pan_c_dpan_mmx;
187 op_blend_rel_span_funcs[SP][SM_N][SC_AN][DP_AN][CPU_MMX] = _op_blend_rel_p_can_dpan_mmx;
188 op_blend_rel_span_funcs[SP_AS][SM_N][SC_AN][DP_AN][CPU_MMX] = _op_blend_rel_pas_can_dpan_mmx;
189 op_blend_rel_span_funcs[SP_AN][SM_N][SC_AN][DP_AN][CPU_MMX] = _op_blend_rel_pan_can_dpan_mmx;
190 op_blend_rel_span_funcs[SP][SM_N][SC_AA][DP_AN][CPU_MMX] = _op_blend_rel_p_caa_dpan_mmx;
191 op_blend_rel_span_funcs[SP_AS][SM_N][SC_AA][DP_AN][CPU_MMX] = _op_blend_rel_pas_caa_dpan_mmx;
192 op_blend_rel_span_funcs[SP_AN][SM_N][SC_AA][DP_AN][CPU_MMX] = _op_blend_rel_pan_caa_dpan_mmx;
193}
194#endif
195
196#ifdef BUILD_MMX
197
/* MMX blend_rel point names, DP_AN only: each forwards to the non-rel
 * _op_blend_pt_*_dpan_mmx name with the same source/color suffix. */
#define _op_blend_rel_pt_p_c_dpan_mmx _op_blend_pt_p_c_dpan_mmx
#define _op_blend_rel_pt_p_can_dpan_mmx _op_blend_pt_p_can_dpan_mmx
#define _op_blend_rel_pt_p_caa_dpan_mmx _op_blend_pt_p_caa_dpan_mmx

#define _op_blend_rel_pt_pas_c_dpan_mmx _op_blend_pt_pas_c_dpan_mmx
#define _op_blend_rel_pt_pas_can_dpan_mmx _op_blend_pt_pas_can_dpan_mmx
#define _op_blend_rel_pt_pas_caa_dpan_mmx _op_blend_pt_pas_caa_dpan_mmx

#define _op_blend_rel_pt_pan_c_dpan_mmx _op_blend_pt_pan_c_dpan_mmx
#define _op_blend_rel_pt_pan_can_dpan_mmx _op_blend_pt_pan_can_dpan_mmx
#define _op_blend_rel_pt_pan_caa_dpan_mmx _op_blend_pt_pan_caa_dpan_mmx
207
208static void
209init_blend_rel_pixel_color_pt_funcs_mmx(void)
210{
211 op_blend_rel_pt_funcs[SP][SM_N][SC][DP_AN][CPU_MMX] = _op_blend_rel_pt_p_c_dpan_mmx;
212 op_blend_rel_pt_funcs[SP_AS][SM_N][SC][DP_AN][CPU_MMX] = _op_blend_rel_pt_pas_c_dpan_mmx;
213 op_blend_rel_pt_funcs[SP_AN][SM_N][SC][DP_AN][CPU_MMX] = _op_blend_rel_pt_pan_c_dpan_mmx;
214 op_blend_rel_pt_funcs[SP][SM_N][SC_AN][DP_AN][CPU_MMX] = _op_blend_rel_pt_p_can_dpan_mmx;
215 op_blend_rel_pt_funcs[SP_AS][SM_N][SC_AN][DP_AN][CPU_MMX] = _op_blend_rel_pt_pas_can_dpan_mmx;
216 op_blend_rel_pt_funcs[SP_AN][SM_N][SC_AN][DP_AN][CPU_MMX] = _op_blend_rel_pt_pan_can_dpan_mmx;
217 op_blend_rel_pt_funcs[SP][SM_N][SC_AA][DP_AN][CPU_MMX] = _op_blend_rel_pt_p_caa_dpan_mmx;
218 op_blend_rel_pt_funcs[SP_AS][SM_N][SC_AA][DP_AN][CPU_MMX] = _op_blend_rel_pt_pas_caa_dpan_mmx;
219 op_blend_rel_pt_funcs[SP_AN][SM_N][SC_AA][DP_AN][CPU_MMX] = _op_blend_rel_pt_pan_caa_dpan_mmx;
220}
221#endif
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c
new file mode 100644
index 0000000000..765f55878e
--- /dev/null
+++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c
@@ -0,0 +1,570 @@
1/* blend pixel x color --> dst */
2#ifdef BUILD_NEON
3/* Note: Optimisation is based on keeping _dest_ aligned: else it's a pair of
4 * reads, then two writes, a miss on read is 'just' two reads */
5static void
6_op_blend_p_c_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
7#define AP "blend_p_c_dp_"
8 asm volatile (
9 ".fpu neon \n\t"
10 // Load 'c'
11 "vdup.u32 q7, %[c] \n\t"
12 "vmov.i8 q6, #1 \n\t"
13
14 // Choose a loop
15 "andS %[tmp], %[d], $0xf \n\t"
16 "beq "AP"quadstart \n\t"
17
18 "andS %[tmp],%[d], $0x4 \n\t"
19 "beq "AP"dualloop \n\t"
20
21 AP"singleloop:"
22 "vld1.32 d0[0], [%[s]]! \n\t"
23 "vld1.32 d2[0], [%[d]] \n\t"
24 // Mulitply s * c (= sc)
25 "vmull.u8 q4, d0,d14 \n\t"
26 // sc in d8
27 "vqrshrn.u16 d4, q4, #8 \n\t"
28
29 // sca in d9
30 "vmvn.u32 d6, d4 \n\t"
31 "vshr.u32 d6, d6, #24 \n\t"
32
33 "vmul.u32 d6, d12, d6 \n\t"
34
35 /* d * alpha */
36 "vmull.u8 q4, d6, d2 \n\t"
37 "vqrshrn.u16 d0, q4, #8 \n\t"
38
39 "vqadd.u8 d2, d0, d4 \n\t"
40
41 // Save dsc + sc
42 "vst1.32 d2[0], [%[d]]! \n\t"
43
44 // Now where?
45 // Can we go the fast path?
46 "andS %[tmp], %[d],$0xf \n\t"
47 "beq "AP"quadstart \n\t"
48
49 AP"dualloop: \n\t"
50 // Check we have enough to bother with!
51 "sub %[tmp], %[e], %[d] \n\t"
52 "cmp %[tmp], #16 \n\t"
53 "blt "AP"loopout \n\t"
54
55 // load 's' -> q0, 'd' -> q1
56 "vldm %[s]!, {d0} \n\t"
57 "vldm %[d], {d2} \n\t"
58 // Mulitply s * c (= sc)
59 "vmull.u8 q4, d0,d14 \n\t"
60 // sc in d8
61 "vqrshrn.u16 d4, q4, #8 \n\t"
62
63 // sca in d9
64 "vmvn.u32 d6, d4 \n\t"
65 "vshr.u32 d6, d6, #24 \n\t"
66
67 "vmul.u32 d6, d12, d6 \n\t"
68
69 /* d * alpha */
70 "vmull.u8 q4, d6, d2 \n\t"
71 "vqrshrn.u16 d0, q4, #8 \n\t"
72
73 "vqadd.u8 d2, d0, d4 \n\t"
74
75 // Save dsc + sc
76 "vst1.32 d2, [%[d]]! \n\t"
77
78 AP"quadstart: \n\t"
79 "sub %[tmp], %[e], %[d] \n\t"
80 "cmp %[tmp], #16 \n\t"
81 "blt "AP"loopout \n\t"
82
83 "sub %[tmp], %[e], #15 \n\t"
84
85 AP"quadloop:\n\t"
86 // load 's' -> q0, 'd' -> q1
87 "vldm %[s]!, {d0,d1} \n\t"
88 "vldm %[d], {d2,d3} \n\t"
89 // Mulitply s * c (= sc)
90 "vmull.u8 q4, d0,d14 \n\t"
91 "vmull.u8 q5, d1,d14 \n\t"
92
93 // Get sc & sc alpha
94 "vqrshrn.u16 d4, q4, #8 \n\t"
95 "vqrshrn.u16 d5, q5, #8 \n\t"
96 // sc is now in q2, 8bpp
97 // Shift out, then spread alpha for q2
98 "vmvn.u32 q3, q2 \n\t"
99 "vshr.u32 q3, q3, $0x18 \n\t"
100 "vmul.u32 q3, q6,q3 \n\t"
101
102 // Multiply 'd' by sc.alpha (dsca)
103 "vmull.u8 q4, d6,d2 \n\t"
104 "vmull.u8 q5, d7,d3 \n\t"
105
106 "vqrshrn.u16 d0, q4, #8 \n\t"
107 "vqrshrn.u16 d1, q5, #8 \n\t"
108
109 "vqadd.u8 q1, q0, q2 \n\t"
110
111 // Save dsc + sc
112 "vstm %[d]!, {d2,d3} \n\t"
113
114 "cmp %[tmp], %[d] \n\t"
115
116 "bhi "AP"quadloop \n\t"
117
118 /* Trailing stuff */
119 AP"loopout: \n\t"
120
121 "cmp %[d], %[e] \n\t"
122 "beq "AP"done\n\t"
123 "sub %[tmp],%[e], %[d] \n\t"
124 "cmp %[tmp],$0x04 \n\t"
125 "beq "AP"singleloop2 \n\t"
126
127 "sub %[tmp], %[e], #7 \n\t"
128 /* Dual loop */
129 AP"dualloop2: \n\t"
130 "vldm %[s]!, {d0} \n\t"
131 "vldm %[d], {d2} \n\t"
132 // Mulitply s * c (= sc)
133 "vmull.u8 q4, d0,d14 \n\t"
134 // sc in d8
135 "vqrshrn.u16 d4, q4, #8 \n\t"
136
137 // sca in d9
138 // XXX: I can probably squash one of these 3
139 "vmvn.u32 d6, d4 \n\t"
140 "vshr.u32 d6, d6, #24 \n\t"
141 "vmul.u32 d6, d6, d12 \n\t"
142
143 /* d * alpha */
144 "vmull.u8 q4, d6, d2 \n\t"
145 "vqrshrn.u16 d0, q4, #8 \n\t"
146
147 "vqadd.u8 d2, d0, d4 \n\t"
148
149 // Save dsc + sc
150 "vstm %[d]!, {d2} \n\t"
151
152 "cmp %[tmp], %[d] \n\t"
153 "bhi "AP"dualloop2 \n\t"
154
155 "cmp %[d], %[e] \n\t"
156 "beq "AP"done \n\t"
157
158 AP"singleloop2: \n\t"
159 "vld1.32 d0[0], [%[s]]! \n\t"
160 "vld1.32 d2[0], [%[d]] \n\t"
161 // Mulitply s * c (= sc)
162 "vmull.u8 q4, d0,d14 \n\t"
163 // sc in d8
164 "vqrshrn.u16 d4, q4, #8 \n\t"
165
166 // sca in d6
167 "vmvn.u32 d6, d4 \n\t"
168 "vshr.u32 d6, d6, #24 \n\t"
169 "vmul.u32 d6, d12,d6 \n\t"
170
171 /* d * alpha */
172 "vmull.u8 q4, d6, d2 \n\t"
173 "vqrshrn.u16 d0, q4, #8 \n\t"
174
175 "vqadd.u8 d2, d0, d4 \n\t"
176
177 // Save dsc + sc
178 "vst1.32 d2[0], [%[d]]! \n\t"
179
180
181 AP"done:"
182 : // No output
183 //
184 : [s] "r" (s), [e] "r" (d + l), [d] "r" (d), [c] "r" (c),
185 [tmp] "r" (12)
186 : "q0","q1","q2","q3","q4","q5","q6","q7","memory"
187 );
188#undef AP
189}
190
191static void
192_op_blend_pan_can_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
193 DATA32 *e;
194 UNROLL8_PLD_WHILE(d, l, e,
195 {
196 *d++ = 0xff000000 + MUL3_SYM(c, *s);
197 s++;
198 });
199}
200
201static void
202_op_blend_pan_caa_dp_neon(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
203#if 1
204 DATA32 *e;
205 int alpha;
206 c = 1 + (c & 0xff);
207 UNROLL8_PLD_WHILE(d, l, e,
208 {
209 DATA32 sc = MUL_256(c, *s);
210 alpha = 256 - (sc >> 24);
211 *d = sc + MUL_256(alpha, *d);
212 d++;
213 s++;
214 });
215#else // the below neon is buggy!! misses rendering of spans, i think with alignment. quick - just disable this.
216#define AP "_op_blend_pan_caa_dp_"
217 DATA32 *e = d + l, *tmp = (void*)73;
218 asm volatile (
219 ".fpu neon \n\t"
220 /* Set up 'c' */
221 "vdup.u8 d14, %[c] \n\t"
222 "vmov.i8 d15, #1 \n\t"
223 "vaddl.u8 q15, d14, d15 \n\t"
224 "vshr.u8 q15,#1 \n\t"
225
226 // Pick a loop
227 "andS %[tmp], %[d], $0xf \n\t"
228 "beq "AP"quadstart \n\t"
229
230 "andS %[tmp], %[d], $0x4 \n\t"
231 "beq "AP"dualstart \n\t"
232
233 AP"singleloop: \n\t"
234 "vld1.32 d4[0], [%[d]] \n\t"
235 "vld1.32 d0[0], [%[s]]! \n\t"
236
237 // Long version of 'd'
238 "vmovl.u8 q8, d4 \n\t"
239
240 // Long version of 's'
241 "vmovl.u8 q6, d0 \n\t"
242
243 // d8 = s -d
244 "vsub.s16 d8, d12, d16 \n\t"
245
246 // Multiply
247 "vmul.s16 d8, d8, d30 \n\t"
248
249 // Shift down
250 "vshr.s16 d8, #7 \n\t"
251
252 // Add 'd'
253 "vqadd.s16 d8, d8, d16 \n\t"
254
255 // Shrink to save
256 "vqmovun.s16 d0, q4 \n\t"
257 "vst1.32 d0[0], [%[d]]! \n\t"
258
259 // Now where?
260 "andS %[tmp], %[d], $0xf \n\t"
261 "beq "AP"quadstart \n\t"
262
263 AP"dualstart: \n\t"
264 // Check we have enough
265 "sub %[tmp], %[e], %[d] \n\t"
266 "cmp %[tmp], #16 \n\t"
267 "blt "AP"loopout \n\t"
268
269 AP"dualloop:"
270 "vldm %[d], {d4} \n\t"
271 "vldm %[s]!, {d0} \n\t"
272
273 // Long version of d
274 "vmovl.u8 q8, d4 \n\t"
275
276 // Long version of s
277 "vmovl.u8 q6, d0 \n\t"
278
279 // q4/q5 = s-d
280 "vsub.s16 q4, q6, q8 \n\t"
281
282 // Multiply
283 "vmul.s16 q4, q4,q15 \n\t"
284
285 // Shift down
286 "vshr.s16 q4, #7 \n\t"
287
288 // Add d
289 "vqadd.s16 q4, q4, q8 \n\t"
290
291 // Shrink to save
292 "vqmovun.s16 d0, q4 \n\t"
293
294 "vstm %[d]!, {d0} \n\t"
295 AP"quadstart: \n\t"
296 "sub %[tmp], %[e], %[d] \n\t"
297 "cmp %[tmp], #16 \n\t"
298 "blt "AP"loopout \n\t"
299
300 "sub %[tmp], %[e], #15 \n\t"
301
302 AP"quadloop: \n\t"
303 // load 's' -> q0, 'd' -> q2
304 "vldm %[d], {d4,d5} \n\t"
305 "vldm %[s]!, {d0,d1} \n\t"
306
307 // Long version of d
308 "vmovl.u8 q8, d4 \n\t"
309 "vmovl.u8 q9, d5 \n\t"
310
311 // Long version of s
312 "vmovl.u8 q6, d0 \n\t"
313 "vmovl.u8 q7, d1 \n\t"
314
315 // q4/q5 = s-d
316 "vsub.s16 q4, q6, q8 \n\t"
317 "vsub.s16 q5, q7, q9 \n\t"
318
319 // Multiply
320 "vmul.s16 q4, q4,q15 \n\t"
321 "vmul.s16 q5, q5,q15 \n\t"
322
323 // Shift down
324 "vshr.s16 q4, #7 \n\t"
325 "vshr.s16 q5, #7 \n\t"
326
327 // Add d
328 "vqadd.s16 q4, q4, q8 \n\t"
329 "vqadd.s16 q5, q5, q9 \n\t"
330
331 // Shrink to save
332 "vqmovun.s16 d0, q4 \n\t"
333 "vqmovun.s16 d1, q5 \n\t"
334 "vstm %[d]!, {d0,d1} \n\t"
335 "cmp %[tmp], %[d] \n\t"
336
337 "bhi "AP"quadloop\n\t"
338
339
340 "b "AP"done\n\t"
341 AP"loopout: \n\t"
342 "cmp %[d], %[e] \n\t"
343 "beq "AP"done\n\t"
344 "sub %[tmp],%[e], %[d] \n\t"
345 "cmp %[tmp],$0x04 \n\t"
346 "beq "AP"singleloop2 \n\t"
347
348 AP"dualloop2: \n\t"
349 "vldm %[d], {d4} \n\t"
350 "vldm %[s]!, {d0} \n\t"
351
352 // Long version of d
353 "vmovl.u8 q8, d4 \n\t"
354
355 // Long version of s
356 "vmovl.u8 q6, d0 \n\t"
357
358 // q4/q5 = s-d
359 "vsub.s16 q4, q6, q8 \n\t"
360
361 // Multiply
362 "vmul.s16 q4, q4,q15 \n\t"
363
364 // Shift down
365 "vshr.s16 q4, #7 \n\t"
366
367 // Add d
368 "vqadd.s16 q4, q4, q8 \n\t"
369
370 // Shrink to save
371 "vqmovun.s16 d0, q4 \n\t"
372
373 "vstm %[d]!, {d0} \n\t"
374
375 "cmp %[d], %[e] \n\t"
376 "beq "AP"done \n\t"
377
378 AP"singleloop2: \n\t"
379 "vld1.32 d4[0], [%[d]] \n\t"
380 "vld1.32 d0[0], [%[s]]! \n\t"
381
382 // Long version of 'd'
383 "vmovl.u8 q8, d4 \n\t"
384
385 // Long version of 's'
386 "vmovl.u8 q6, d0 \n\t"
387
388 // d8 = s -d
389 "vsub.s16 d8, d12, d16 \n\t"
390
391 // Multiply
392 "vmul.s16 d8, d8, d30 \n\t"
393
394 // Shift down
395 "vshr.s16 d8, #7 \n\t"
396
397 // Add 'd'
398 "vqadd.s16 d8, d8, d16 \n\t"
399
400 // Shrink to save
401 "vqmovun.s16 d0, q4 \n\t"
402
403 "vst1.32 d0[0], [%[d]] \n\t"
404
405
406 AP"done: \n\t"
407
408 // No output
409 :
410 // Input
411 : [s] "r" (s), [d] "r" (d), [e] "r" (e), [c] "r" (c), [tmp] "r" (tmp)
412 // Clobbered
413 : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "memory"
414 );
415#undef AP
416#endif
417}
418
/* Span variants without a dedicated NEON routine: each name below is an
 * alias for the general _op_blend_p_c_dp_neon. */
419#define _op_blend_pas_c_dp_neon _op_blend_p_c_dp_neon
420#define _op_blend_pan_c_dp_neon _op_blend_p_c_dp_neon
421#define _op_blend_p_can_dp_neon _op_blend_p_c_dp_neon
422#define _op_blend_pas_can_dp_neon _op_blend_p_c_dp_neon
423#define _op_blend_p_caa_dp_neon _op_blend_p_c_dp_neon
424#define _op_blend_pas_caa_dp_neon _op_blend_p_c_dp_neon
425
/* Every "dpan" span variant is an alias for the "dp" variant of the same
 * pixel/colour class. */
426#define _op_blend_p_c_dpan_neon _op_blend_p_c_dp_neon
427#define _op_blend_pas_c_dpan_neon _op_blend_pas_c_dp_neon
428#define _op_blend_pan_c_dpan_neon _op_blend_pan_c_dp_neon
429#define _op_blend_p_can_dpan_neon _op_blend_p_can_dp_neon
430#define _op_blend_pas_can_dpan_neon _op_blend_pas_can_dp_neon
431#define _op_blend_pan_can_dpan_neon _op_blend_pan_can_dp_neon
432#define _op_blend_p_caa_dpan_neon _op_blend_p_caa_dp_neon
433#define _op_blend_pas_caa_dpan_neon _op_blend_pas_caa_dp_neon
434#define _op_blend_pan_caa_dpan_neon _op_blend_pan_caa_dp_neon
435
436
437static void
438init_blend_pixel_color_span_funcs_neon(void)
439{
440 op_blend_span_funcs[SP][SM_N][SC][DP][CPU_NEON] = _op_blend_p_c_dp_neon;
441 op_blend_span_funcs[SP_AS][SM_N][SC][DP][CPU_NEON] = _op_blend_pas_c_dp_neon;
442 op_blend_span_funcs[SP_AN][SM_N][SC][DP][CPU_NEON] = _op_blend_pan_c_dp_neon;
443 op_blend_span_funcs[SP][SM_N][SC_AN][DP][CPU_NEON] = _op_blend_p_can_dp_neon;
444 op_blend_span_funcs[SP_AS][SM_N][SC_AN][DP][CPU_NEON] = _op_blend_pas_can_dp_neon;
445 op_blend_span_funcs[SP_AN][SM_N][SC_AN][DP][CPU_NEON] = _op_blend_pan_can_dp_neon;
446 op_blend_span_funcs[SP][SM_N][SC_AA][DP][CPU_NEON] = _op_blend_p_caa_dp_neon;
447 op_blend_span_funcs[SP_AS][SM_N][SC_AA][DP][CPU_NEON] = _op_blend_pas_caa_dp_neon;
448 op_blend_span_funcs[SP_AN][SM_N][SC_AA][DP][CPU_NEON] = _op_blend_pan_caa_dp_neon;
449
450 op_blend_span_funcs[SP][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_p_c_dpan_neon;
451 op_blend_span_funcs[SP_AS][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_pas_c_dpan_neon;
452 op_blend_span_funcs[SP_AN][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_pan_c_dpan_neon;
453 op_blend_span_funcs[SP][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_p_can_dpan_neon;
454 op_blend_span_funcs[SP_AS][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_pas_can_dpan_neon;
455 op_blend_span_funcs[SP_AN][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_pan_can_dpan_neon;
456 op_blend_span_funcs[SP][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_p_caa_dpan_neon;
457 op_blend_span_funcs[SP_AS][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_pas_caa_dpan_neon;
458 op_blend_span_funcs[SP_AN][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_pan_caa_dpan_neon;
459}
460#endif
461
462#ifdef BUILD_NEON
463static void
464_op_blend_pt_p_c_dp_neon(DATA32 s, DATA8 m EINA_UNUSED, DATA32 c, DATA32 *d) {
465 s = MUL4_SYM(c, s);
466 c = 256 - (s >> 24);
467 *d = s + MUL_256(c, *d);
468}
469
/* Point ops, "dp" destination: every pixel/colour class is an alias for the
 * single general routine _op_blend_pt_p_c_dp_neon. */
470#define _op_blend_pt_pas_c_dp_neon _op_blend_pt_p_c_dp_neon
471#define _op_blend_pt_pan_c_dp_neon _op_blend_pt_p_c_dp_neon
472#define _op_blend_pt_p_can_dp_neon _op_blend_pt_p_c_dp_neon
473#define _op_blend_pt_pas_can_dp_neon _op_blend_pt_p_c_dp_neon
474#define _op_blend_pt_pan_can_dp_neon _op_blend_pt_p_c_dp_neon
475#define _op_blend_pt_p_caa_dp_neon _op_blend_pt_p_c_dp_neon
476#define _op_blend_pt_pas_caa_dp_neon _op_blend_pt_p_c_dp_neon
477#define _op_blend_pt_pan_caa_dp_neon _op_blend_pt_p_c_dp_neon
478
/* Point ops, "dpan" destination: likewise all aliases for
 * _op_blend_pt_p_c_dp_neon. */
479#define _op_blend_pt_p_c_dpan_neon _op_blend_pt_p_c_dp_neon
480#define _op_blend_pt_pas_c_dpan_neon _op_blend_pt_p_c_dp_neon
481#define _op_blend_pt_pan_c_dpan_neon _op_blend_pt_p_c_dp_neon
482#define _op_blend_pt_p_can_dpan_neon _op_blend_pt_p_c_dp_neon
483#define _op_blend_pt_pas_can_dpan_neon _op_blend_pt_p_c_dp_neon
484#define _op_blend_pt_pan_can_dpan_neon _op_blend_pt_p_c_dp_neon
485#define _op_blend_pt_p_caa_dpan_neon _op_blend_pt_p_c_dp_neon
486#define _op_blend_pt_pas_caa_dpan_neon _op_blend_pt_p_c_dp_neon
487#define _op_blend_pt_pan_caa_dpan_neon _op_blend_pt_p_c_dp_neon
488
489static void
490init_blend_pixel_color_pt_funcs_neon(void)
491{
492 op_blend_pt_funcs[SP][SM_N][SC][DP][CPU_NEON] = _op_blend_pt_p_c_dp_neon;
493 op_blend_pt_funcs[SP_AS][SM_N][SC][DP][CPU_NEON] = _op_blend_pt_pas_c_dp_neon;
494 op_blend_pt_funcs[SP_AN][SM_N][SC][DP][CPU_NEON] = _op_blend_pt_pan_c_dp_neon;
495 op_blend_pt_funcs[SP][SM_N][SC_AN][DP][CPU_NEON] = _op_blend_pt_p_can_dp_neon;
496 op_blend_pt_funcs[SP_AS][SM_N][SC_AN][DP][CPU_NEON] = _op_blend_pt_pas_can_dp_neon;
497 op_blend_pt_funcs[SP_AN][SM_N][SC_AN][DP][CPU_NEON] = _op_blend_pt_pan_can_dp_neon;
498 op_blend_pt_funcs[SP][SM_N][SC_AA][DP][CPU_NEON] = _op_blend_pt_p_caa_dp_neon;
499 op_blend_pt_funcs[SP_AS][SM_N][SC_AA][DP][CPU_NEON] = _op_blend_pt_pas_caa_dp_neon;
500 op_blend_pt_funcs[SP_AN][SM_N][SC_AA][DP][CPU_NEON] = _op_blend_pt_pan_caa_dp_neon;
501
502 op_blend_pt_funcs[SP][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_pt_p_c_dpan_neon;
503 op_blend_pt_funcs[SP_AS][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_pt_pas_c_dpan_neon;
504 op_blend_pt_funcs[SP_AN][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_pt_pan_c_dpan_neon;
505 op_blend_pt_funcs[SP][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_pt_p_can_dpan_neon;
506 op_blend_pt_funcs[SP_AS][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_pt_pas_can_dpan_neon;
507 op_blend_pt_funcs[SP_AN][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_pt_pan_can_dpan_neon;
508 op_blend_pt_funcs[SP][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_pt_p_caa_dpan_neon;
509 op_blend_pt_funcs[SP_AS][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_pt_pas_caa_dpan_neon;
510 op_blend_pt_funcs[SP_AN][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_pt_pan_caa_dpan_neon;
511}
512#endif
513
514/*-----*/
515
516/* blend_rel pixel x color -> dst */
517
518#ifdef BUILD_NEON
519
/* blend_rel span ops, NEON: each _rel_ name below is a plain alias for the
 * non-rel "dpan" span op of the same pixel/colour class.
 * NOTE(review): presumably valid because the destination class is "dpan";
 * confirm against the generic C implementation. */
520#define _op_blend_rel_p_c_dpan_neon _op_blend_p_c_dpan_neon
521#define _op_blend_rel_pas_c_dpan_neon _op_blend_pas_c_dpan_neon
522#define _op_blend_rel_pan_c_dpan_neon _op_blend_pan_c_dpan_neon
523#define _op_blend_rel_p_can_dpan_neon _op_blend_p_can_dpan_neon
524#define _op_blend_rel_pas_can_dpan_neon _op_blend_pas_can_dpan_neon
525#define _op_blend_rel_pan_can_dpan_neon _op_blend_pan_can_dpan_neon
526#define _op_blend_rel_p_caa_dpan_neon _op_blend_p_caa_dpan_neon
527#define _op_blend_rel_pas_caa_dpan_neon _op_blend_pas_caa_dpan_neon
528#define _op_blend_rel_pan_caa_dpan_neon _op_blend_pan_caa_dpan_neon
529
530static void
531init_blend_rel_pixel_color_span_funcs_neon(void)
532{
533 op_blend_rel_span_funcs[SP][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_rel_p_c_dpan_neon;
534 op_blend_rel_span_funcs[SP_AS][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_rel_pas_c_dpan_neon;
535 op_blend_rel_span_funcs[SP_AN][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_rel_pan_c_dpan_neon;
536 op_blend_rel_span_funcs[SP][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_rel_p_can_dpan_neon;
537 op_blend_rel_span_funcs[SP_AS][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_rel_pas_can_dpan_neon;
538 op_blend_rel_span_funcs[SP_AN][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_rel_pan_can_dpan_neon;
539 op_blend_rel_span_funcs[SP][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_rel_p_caa_dpan_neon;
540 op_blend_rel_span_funcs[SP_AS][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_rel_pas_caa_dpan_neon;
541 op_blend_rel_span_funcs[SP_AN][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_rel_pan_caa_dpan_neon;
542}
543#endif
544
545#ifdef BUILD_NEON
546
/* blend_rel point ops, NEON: each _rel_ name below is a plain alias for the
 * non-rel "dpan" point op of the same pixel/colour class.
 * NOTE(review): presumably valid because the destination class is "dpan";
 * confirm against the generic C implementation. */
547#define _op_blend_rel_pt_p_c_dpan_neon _op_blend_pt_p_c_dpan_neon
548#define _op_blend_rel_pt_pas_c_dpan_neon _op_blend_pt_pas_c_dpan_neon
549#define _op_blend_rel_pt_pan_c_dpan_neon _op_blend_pt_pan_c_dpan_neon
550#define _op_blend_rel_pt_p_can_dpan_neon _op_blend_pt_p_can_dpan_neon
551#define _op_blend_rel_pt_pas_can_dpan_neon _op_blend_pt_pas_can_dpan_neon
552#define _op_blend_rel_pt_pan_can_dpan_neon _op_blend_pt_pan_can_dpan_neon
553#define _op_blend_rel_pt_p_caa_dpan_neon _op_blend_pt_p_caa_dpan_neon
554#define _op_blend_rel_pt_pas_caa_dpan_neon _op_blend_pt_pas_caa_dpan_neon
555#define _op_blend_rel_pt_pan_caa_dpan_neon _op_blend_pt_pan_caa_dpan_neon
556
557static void
558init_blend_rel_pixel_color_pt_funcs_neon(void)
559{
560 op_blend_rel_pt_funcs[SP][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_rel_pt_p_c_dpan_neon;
561 op_blend_rel_pt_funcs[SP_AS][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_rel_pt_pas_c_dpan_neon;
562 op_blend_rel_pt_funcs[SP_AN][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_rel_pt_pan_c_dpan_neon;
563 op_blend_rel_pt_funcs[SP][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_rel_pt_p_can_dpan_neon;
564 op_blend_rel_pt_funcs[SP_AS][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_rel_pt_pas_can_dpan_neon;
565 op_blend_rel_pt_funcs[SP_AN][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_rel_pt_pan_can_dpan_neon;
566 op_blend_rel_pt_funcs[SP][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_rel_pt_p_caa_dpan_neon;
567 op_blend_rel_pt_funcs[SP_AS][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_rel_pt_pas_caa_dpan_neon;
568 op_blend_rel_pt_funcs[SP_AN][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_rel_pt_pan_caa_dpan_neon;
569}
570#endif
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_sse3.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_sse3.c
new file mode 100644
index 0000000000..b0c2b84f8e
--- /dev/null
+++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_sse3.c
@@ -0,0 +1,543 @@
1/* blend pixel x color --> dst */
2
3#ifdef BUILD_SSE3
4
5static void
6_op_blend_p_c_dp_sse3(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
7
8 DATA32 alpha;
9
10 const __m128i c_packed = _mm_set_epi32(c, c, c, c);
11
12 LOOP_ALIGNED_U1_A48_SSE3(d, l,
13 { /* UOP */
14
15 DATA32 sc = MUL4_SYM(c, *s);
16 alpha = 256 - (sc >> 24);
17 *d = sc + MUL_256(alpha, *d);
18 d++; s++; l--;
19 },
20 { /* A4OP */
21
22 __m128i s0 = _mm_lddqu_si128((__m128i *)s);
23 __m128i d0 = _mm_load_si128((__m128i *)d);
24
25 __m128i sc0 = mul4_sym_sse3(c_packed, s0);
26 __m128i a0 = sub4_alpha_sse3(sc0);
27 __m128i mul0 = mul_256_sse3(a0, d0);
28
29 d0 = _mm_add_epi32(sc0, mul0);
30
31 _mm_store_si128((__m128i *)d, d0);
32
33 d += 4; s += 4; l -= 4;
34 },
35 { /* A8OP */
36
37 __m128i s0 = _mm_lddqu_si128((__m128i *)s);
38 __m128i d0 = _mm_load_si128((__m128i *)d);
39
40 __m128i s1 = _mm_lddqu_si128((__m128i *)(s+4));
41 __m128i d1 = _mm_load_si128((__m128i *)(d+4));
42
43 __m128i sc0 = mul4_sym_sse3(c_packed, s0);
44 __m128i sc1 = mul4_sym_sse3(c_packed, s1);
45
46 __m128i a0 = sub4_alpha_sse3(sc0);
47 __m128i a1 = sub4_alpha_sse3(sc1);
48
49 __m128i mul0 = mul_256_sse3(a0, d0);
50 __m128i mul1 = mul_256_sse3(a1, d1);
51
52 d0 = _mm_add_epi32(sc0, mul0);
53 d1 = _mm_add_epi32(sc1, mul1);
54
55 _mm_store_si128((__m128i *)d, d0);
56 _mm_store_si128((__m128i *)(d+4), d1);
57
58 d += 8; s += 8; l -= 8;
59 })
60}
61
62static void
63_op_blend_pan_c_dp_sse3(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) {
64
65 DATA32 c_a = c & 0xFF000000;
66 DATA32 alpha = 256 - (c >> 24);
67
68 const __m128i c_packed = _mm_set_epi32(c, c, c, c);
69 const __m128i c_alpha = _mm_set_epi32(c_a, c_a, c_a, c_a);
70 const __m128i a0 = _mm_set_epi32(alpha, alpha, alpha, alpha);
71
72 LOOP_ALIGNED_U1_A48_SSE3(d, l,
73 { /* UOP */
74
75 *d = ((c & 0xff000000) + MUL3_SYM(c, *s)) + MUL_256(alpha, *d);
76 d++; s++; l--;
77 },
78 { /* A4OP */
79
80 __m128i s0 = _mm_lddqu_si128((__m128i *)s);
81 __m128i d0 = _mm_load_si128((__m128i *)d);
82
83 __m128i r0 = _mm_add_epi32(mul3_sym_sse3(c_packed, s0),