summary | refs | log | tree | commit | diff
path: root/src/lib/evas/common/evas_op_blend
diff options
context:
space:
mode:
author: Carsten Haitzler (Rasterman) <raster@rasterman.com> 2013-11-15 19:16:03 +0900
committer: Carsten Haitzler (Rasterman) <raster@rasterman.com> 2013-11-15 19:17:01 +0900
commit: deec62c9b66c45705918bf659ce1d2107dbc6831 (patch)
tree: 5f0e60fd97b768951b29b7b575704d71514fc359 /src/lib/evas/common/evas_op_blend
parent: 0e4b25747f8b86e2a923d078c6124fbeb9590c11 (diff)
evas - fix neon blend code used for text rendering to not leave dirty end
Diffstat (limited to 'src/lib/evas/common/evas_op_blend')
-rw-r--r-- src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c | 279
1 files changed, 105 insertions, 174 deletions
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
index da7cd3e24d..252f276ba8 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
@@ -19,180 +19,111 @@
19#ifdef BUILD_NEON 19#ifdef BUILD_NEON
20static void 20static void
21_op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) { 21_op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
22 DATA32 *e; 22 // main loop process data in pairs, so we need count to be even
23 23 DATA32 *e = d + l - (l % 2);
24 DEBUG_FNCOUNT(""); 24
25 25 // everything we can do only once per cycle
26#define AP "blend_mas_c_dp_" 26 // loading of 'c', initialization of some registers
27 asm volatile ( 27 __asm__ __volatile__
28 ".fpu neon \n\t" 28 (
29 " vdup.i32 q15, %[c] \n\t" 29 ".fpu neon \n\t"
30 " vmov.i8 q14, #1 \n\t" 30 " vmov.32 d30[0], %[c] \n\t"
31 31 " vmov.i16 q10, #255 \n\t"
32 // If aligned already - straight to quads 32 " vmov.i16 q11, #256 \n\t"
33 " andS %[tmp], %[d],$0xf \n\t" 33 " veor d29, d29, d29 \n\t"
34 " beq "AP"quadloops \n\t" 34 " vzip.8 d30, d29 \n\t"
35 35 " vmov d31, d30 \n\t"
36 " andS %[tmp], %[d],$0x4 \n\t" 36 :
37 " beq "AP"dualloop \n\t" 37 : [c] "r" (c)
38 38 : "q10", "q11", "q15", "d29"
39 AP"singleloop: \n\t" 39 );
40 " vld1.8 d0[0], [%[m]]! \n\t" 40 while (d < e)
41 " vld1.32 d4[0], [%[d]] \n\t" 41 {
42 " vdup.u8 d0, d0[0] \n\t" 42 // main cycle
43 " vmull.u8 q4, d0, d30 \n\t" 43 __asm__ __volatile__
44 " vqrshrn.u16 d12, q4, #8 \n\t" 44 (
45 " vmvn.u16 d14, d12 \n\t" 45 // load pair '*d' and '*(d+1)' into vector register
46 " vshr.u32 d16, d14, #24 \n\t" 46 " vldm %[d], {d4} \n\t"
47 " vmul.u32 d16, d16, d28 \n\t" 47
48 " vmull.u8 q7, d16, d4 \n\t" 48 // load '*m' and '*(m+1)'
49 " vqrshrn.u16 d0, q7, #8 \n\t" 49 " veor q0, q0, q0 \n\t"
50 " vqadd.u8 d0, d0, d12 \n\t" 50 " vld1.8 d0[0], [%[m]]! \n\t"
51 " vst1.32 d0[0], [%[d]]! \n\t" 51 " vld1.8 d1[0], [%[m]]! \n\t"
52 52
53 // Can we go the fast path? 53 // spread values from d in vector registers so for each
54 " andS %[tmp], %[d],$0xf \n\t" 54 // 8 bit channel data we have 8 bit of zeros
55 " beq "AP"quadloops \n\t" 55 // so each 32bit value occupies now one 64 bit register
56 56 " veor d5, d5, d5 \n\t"
57 AP"dualloop: \n\t" 57 " vzip.8 d4, d5 \n\t"
58 " sub %[tmp], %[e], %[d] \n\t" 58
59 " cmp %[tmp], #16 \n\t" 59 // copy *m values in corresponding registers
60 " blt "AP"loopout \n\t" 60 " vdup.u16 d0, d0[0] \n\t"
61 61 " vdup.u16 d1, d1[0] \n\t"
62 " vld1.16 d0[0], [%[m]]! \n\t" 62
63 " vldm %[d], {d4} \n\t" 63 // multiply a * c
64 " vmovl.u8 q0, d0 \n\t" 64 " vmul.u16 q13, q0, q15 \n\t"
65 " vmovl.u8 q0, d0 \n\t" 65 " vadd.i16 q13, q13, q10 \n\t"
66 " vmul.u32 q0, q14 \n\t" 66 " vsri.16 q13, q13, #8 \n\t"
67 " vmull.u8 q4, d0, d30 \n\t" 67 " vand q13, q13, q10 \n\t"
68 " vqrshrn.u16 d12, q4, #8 \n\t" 68
69 " vmvn.u16 d14, d12 \n\t" 69 // extract negated alpha
70 " vshr.u32 d16, d14, #24 \n\t" 70 " vdup.u16 d24, d26[3] \n\t"
71 " vmul.u32 d16, d16, d28 \n\t" 71 " vdup.u16 d25, d27[3] \n\t"
72 " vmull.u8 q7, d16, d4 \n\t" 72 " vsub.i16 q12, q11, q12 \n\t"
73 " vqrshrn.u16 d0, q7, #8 \n\t" 73
74 " vqadd.u8 q0, q0, q6 \n\t" 74 // multiply alpha * (*d) and add a*c
75 " vstm %[d]!, {d0} \n\t" 75 " vmul.u16 q2, q2, q12 \n\t"
76 76 " vsri.16 q2, q2, #8 \n\t"
77 AP"quadloops: \n\t" 77 " vand q2, q2, q10 \n\t"
78 " sub %[tmp], %[e], %[d] \n\t" 78 " vadd.i16 q2, q2, q13 \n\t"
79 " cmp %[tmp], #16 \n\t" 79 " vand q2, q2, q10 \n\t"
80 " blt "AP"loopout \n\t" 80
81 81 // save results
82 82 " vqmovn.u16 d4, q2 \n\t"
83 " sub %[tmp], %[e], #15 \n\t" 83 " vstm %[d]!, {d4} \n\t"
84 84 : [d] "+r" (d), [m] "+r" (m)
85 " sub %[d], #16 \n\t" 85 : [c] "r" (c)
86 AP"fastloop:" 86 : "q0", "q2", "q15", "q13", "q12", "q11", "q10",
87 " add %[d], #16 \n\t" 87 "memory"
88 " cmp %[tmp], %[d] \n\t" 88 );
89 " ble "AP"loopout \n\t" 89 }
90 AP"quadloopint: \n\t" 90 if (l % 2)
91 " ldr %[x], [%[m]] \n\t" 91 {
92 " add %[m], #4 \n\t" 92 // do analogue of main loop for last element, if needed
93 " cmp %[x], #0 \n\t" 93 __asm__ __volatile__
94 " beq "AP"fastloop \n\t" 94 (
95 " vmov.32 d0[0], %[x] \n\t" 95 " vld1.32 d4[0], [%[d]] \n\t"
96 " vldm %[d], {d4,d5} \n\t" 96
97 97 " veor d0, d0, d0 \n\t"
98 // Expand M: Fixme: Can we do this quicker? 98 " vld1.8 d0[0], [%[m]]! \n\t"
99 " vmovl.u8 q0, d0 \n\t" 99
100 " vmovl.u8 q0, d0 \n\t" 100 " veor d5, d5, d5 \n\t"
101 " vmul.u32 q0, q14 \n\t" 101 " vzip.8 d4, d5 \n\t"
102 102
103 // Multiply a * c 103 " vdup.u16 d0, d0[0] \n\t"
104 " vmull.u8 q4, d0, d30 \n\t" 104
105 " vmull.u8 q5, d1, d31 \n\t" 105 " vmul.u16 d26, d0, d30 \n\t"
106 106 " vadd.i16 d26, d26, d20 \n\t"
107 // Shorten 107 " vsri.16 d26, d26, #8 \n\t"
108 " vqrshrn.u16 d12, q4, #8 \n\t" 108 " vand d26, d26, d20 \n\t"
109 " vqrshrn.u16 d13, q5, #8 \n\t" 109
110 110 " vdup.u16 d24, d26[3] \n\t"
111 // extract negated alpha 111
112 " vmvn.u16 q7, q6 \n\t" 112 " vsub.i16 d24, d22, d24 \n\t"
113 " vshr.u32 q8, q7, #24 \n\t" 113 " vmul.u16 d4, d4, d24 \n\t"
114 " vmul.u32 q8, q8, q14 \n\t" 114 " vsri.16 d4, d4, #8 \n\t"
115 115 " vand d4, d4, d20 \n\t"
116 // Multiply 116 " vadd.i16 d4, d4, d26 \n\t"
117 " vmull.u8 q7, d16, d4 \n\t" 117 " vand d4, d4, d20 \n\t"
118 " vmull.u8 q8, d17, d5 \n\t" 118
119 119 " vqmovn.u16 d4, q2 \n\t"
120 " vqrshrn.u16 d0, q7, #8 \n\t" 120 " vst1.32 {d4[0]}, [%[d]]! \n\t"
121 " vqrshrn.u16 d1, q8, #8 \n\t" 121 : [d] "+r" (d), [m] "+r" (m)
122 122 : [c] "r" (c)
123 // Add 123 : "q0", "q2", "q15", "q13", "q12", "q11", "q10",
124 " vqadd.u8 q0, q0, q6 \n\t" 124 "memory"
125 125 );
126 " vstm %[d]!, {d0,d1} \n\t" 126 }
127
128 " cmp %[tmp], %[d] \n\t"
129 " bhi "AP"quadloopint \n\t"
130
131 AP"loopout: \n\t"
132#if NEONDEBUG
133 "cmp %[d], %[e] \n\t"
134 "ble "AP"foo \n\t"
135 "cmp %[tmp], %[m] \n\t"
136 "sub %[x], %[x] \n\t"
137 "vst1.32 d0[0], [%[x]] \n\t"
138 AP"foo: \n\t"
139#endif
140
141 " cmp %[d], %[e] \n\t"
142 " beq "AP"done \n\t"
143 " sub %[tmp],%[e], %[d] \n\t"
144 " cmp %[tmp],#4 \n\t"
145 " beq "AP"singleout \n\t"
146
147 AP "dualloop2: \n\t"
148 "sub %[tmp],%[e],$0x8 \n\t"
149 " vld1.16 d0[0], [%[m]]! \n\t"
150 " vldm %[d], {d4} \n\t"
151 " vmovl.u8 q0, d0 \n\t"
152 " vmovl.u8 q0, d0 \n\t"
153 " vmul.u32 q0, q14 \n\t"
154 " vmull.u8 q4, d0, d30 \n\t"
155 " vqrshrn.u16 d12, q4, #8 \n\t"
156 " vmvn.u16 d14, d12 \n\t"
157 " vshr.u32 d16, d14, #24 \n\t"
158 " vmul.u32 d16, d16, d28 \n\t"
159 " vmull.u8 q7, d16, d4 \n\t"
160 " vqrshrn.u16 d0, q7, #8 \n\t"
161 " vqadd.u8 q0, q0, q6 \n\t"
162 " vstm %[d]!, {d0} \n\t"
163
164 " cmp %[e], %[d] \n\t"
165 " beq "AP"done \n\t"
166
167 AP"singleout: \n\t"
168 " vld1.8 d0[0], [%[m]]! \n\t"
169 " vld1.32 d4[0], [%[d]] \n\t"
170 " vdup.u8 d0, d0[0] \n\t"
171 " vmull.u8 q4, d0, d30 \n\t"
172 " vqrshrn.u16 d12, q4, #8 \n\t"
173 " vmvn.u16 d14, d12 \n\t"
174 " vshr.u32 d16, d14, #24 \n\t"
175 " vmul.u32 d16, d16, d28 \n\t"
176 " vmull.u8 q7, d16, d4 \n\t"
177 " vqrshrn.u16 d0, q7, #8 \n\t"
178 " vqadd.u8 q0, q0, q6 \n\t"
179 " vst1.32 d0[0], [%[d]]! \n\t"
180
181 AP"done: \n\t"
182#if NEONDEBUG
183 "cmp %[d], %[e] \n\t"
184 "beq "AP"reallydone \n\t"
185 "sub %[tmp], %[tmp] \n\t"
186 "vst1.32 d0[0], [%[tmp]] \n\t"
187 AP"reallydone:"
188#endif
189 : // Out
190 : [e] "r" (d + l), [d] "r" (d), [c] "r" (c),
191 [tmp] "r" (7), [m] "r" (m), [x] "r" (0)
192 : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q8","q14","q15",
193 "memory" // clobbered
194 );
195#undef AP
196} 127}
197#endif 128#endif
198 129