summaryrefslogtreecommitdiff
path: root/src/static_libs/draw/draw_main_neon.c
blob: 24b3d0146ab3835d569942c8fa8e0418f08a2951 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "draw_private.h"

/* NOTE(review): BUILD_NEON is force-undefined right before it is tested, so
 * every BUILD_NEON-guarded section in this file is compiled out and
 * efl_draw_neon_init() has an empty body whatever the build configuration
 * says. Presumably a deliberate kill switch for the NEON path -- confirm that
 * the guarded code builds and is correct before removing this line. */
#undef BUILD_NEON
#ifdef BUILD_NEON
#include <arm_neon.h>

static void
comp_func_solid_source_over_neon(uint32_t * __restrict dest, int length, uint32_t color, uint32_t const_alpha)
{
   uint16x8_t temp00_16x8;
   uint16x8_t temp01_16x8;
   uint16x8_t temp10_16x8;
   uint16x8_t temp11_16x8;
   uint32x4_t temp0_32x4;
   uint32x4_t temp1_32x4;
   uint32x4_t c_32x4;
   uint32x4_t d0_32x4;
   uint32x4_t d1_32x4;
   uint8x16_t d0_8x16;
   uint8x16_t d1_8x16;
   uint8x16_t temp0_8x16;
   uint8x16_t temp1_8x16;
   uint8x8_t alpha_8x8;
   uint8x8_t d00_8x8;
   uint8x8_t d01_8x8;
   uint8x8_t d10_8x8;
   uint8x8_t d11_8x8;
   uint8x8_t temp00_8x8;
   uint8x8_t temp01_8x8;
   uint8x8_t temp10_8x8;
   uint8x8_t temp11_8x8;

   if (const_alpha != 255)
     color = DRAW_BYTE_MUL(color, const_alpha);

   // alpha can only be 0 if color is 0x0. In that case we can just return.
   // Otherwise we can assume alpha != 0. This allows more optimization in
   // NEON code.
   if (!color)
     return;

   DATA32 *start = dest;
   int size = length;
   DATA32 *end = start + (size & ~7);

   unsigned char alpha;
   alpha = ~(color >> 24) + 1;
   alpha_8x8 = vdup_n_u8(alpha);

   c_32x4 = vdupq_n_u32(color);

   while (start < end)
     {
        d0_32x4 = vld1q_u32(start);
        d1_32x4 = vld1q_u32(start+4);
        d0_8x16 = vreinterpretq_u8_u32(d0_32x4);
        d1_8x16 = vreinterpretq_u8_u32(d1_32x4);

        d00_8x8 = vget_low_u8(d0_8x16);
        d01_8x8 = vget_high_u8(d0_8x16);
        d10_8x8 = vget_low_u8(d1_8x16);
        d11_8x8 = vget_high_u8(d1_8x16);

        temp00_16x8 = vmull_u8(alpha_8x8, d00_8x8);
        temp01_16x8 = vmull_u8(alpha_8x8, d01_8x8);
        temp10_16x8 = vmull_u8(alpha_8x8, d10_8x8);
        temp11_16x8 = vmull_u8(alpha_8x8, d11_8x8);

        temp00_8x8 = vshrn_n_u16(temp00_16x8,8);
        temp01_8x8 = vshrn_n_u16(temp01_16x8,8);
        temp10_8x8 = vshrn_n_u16(temp10_16x8,8);
        temp11_8x8 = vshrn_n_u16(temp11_16x8,8);

        temp0_8x16 = vcombine_u8(temp00_8x8, temp01_8x8);
        temp1_8x16 = vcombine_u8(temp10_8x8, temp11_8x8);

        temp0_32x4 = vreinterpretq_u32_u8(temp0_8x16);
        temp1_32x4 = vreinterpretq_u32_u8(temp1_8x16);

        d0_32x4 = vaddq_u32(c_32x4, temp0_32x4);
        d1_32x4 = vaddq_u32(c_32x4, temp1_32x4);

        vst1q_u32(start, d0_32x4);
        vst1q_u32(start+4, d1_32x4);
        start+=8;
     }

   end += (size & 7);
   while (start <  end)
     {
        *start = color + MUL_256(alpha, *start);
        start++;
     }
}

/*
 * Blend `length` source pixels, each multiplied by `color`, over `dest`.
 *
 * color is first scaled by const_alpha through DRAW_BYTE_MUL unless
 * const_alpha is 255. The vector loop then handles four pixels per iteration:
 *
 *    sc   = (src * color + 255) >> 8       per byte
 *    ia   = (256 - (sc >> 24)) & 0xff      replicated into all four bytes
 *    dest = sc + ((dest * ia) >> 8)        product per byte, sum per 32-bit lane
 *
 * ia wraps to 0 when the top byte of sc is 0 (the real factor would be 256),
 * so for those pixels dest is used unscaled instead. The remaining
 * (length & 3) pixels go through the scalar MUL4_SYM / MUL_256 macros.
 *
 * @param dest        destination pixels, read and written in place
 * @param src         source pixels, read only
 * @param length      number of pixels to process
 * @param color       colour multiplied into every source pixel
 * @param const_alpha extra factor applied to color; 255 leaves it unchanged
 */
static void
comp_func_source_over_neon(uint32_t * __restrict dest, const uint32_t * __restrict src, int length, uint32_t color, uint32_t const_alpha)
{
   uint16x8_t ad0_16x8;
   uint16x8_t ad1_16x8;
   uint16x8_t sc0_16x8;
   uint16x8_t sc1_16x8;
   uint16x8_t x255_16x8;
   uint32x2_t c_32x2;
   uint32x4_t ad_32x4;
   uint32x4_t alpha_32x4;
   uint32x4_t cond_32x4;
   uint32x4_t d_32x4;
   uint32x4_t s_32x4;
   uint32x4_t sc_32x4;
   uint32x4_t x0_32x4;
   uint32x4_t x1_32x4;
   uint8x16_t ad_8x16;
   uint8x16_t alpha_8x16;
   uint8x16_t d_8x16;
   uint8x16_t s_8x16;
   uint8x16_t sc_8x16;
   uint8x16_t x0_8x16;
   uint8x16_t x1_8x16;
   uint8x8_t ad0_8x8;
   uint8x8_t ad1_8x8;
   uint8x8_t alpha0_8x8;
   uint8x8_t alpha1_8x8;
   uint8x8_t c_8x8;
   uint8x8_t d0_8x8;
   uint8x8_t d1_8x8;
   uint8x8_t s0_8x8;
   uint8x8_t s1_8x8;
   uint8x8_t sc0_8x8;
   uint8x8_t sc1_8x8;
   int size;
   uint32_t *start;
   uint32_t *end;

   if (const_alpha != 255)
     color = DRAW_BYTE_MUL(color, const_alpha);

   // Loop-invariant vectors: the colour (two copies, as bytes), the rounding
   // term 0xff, all-zero bytes and all-one bytes (0x01010101 per lane).
   c_32x2 = vdup_n_u32(color);
   c_8x8 = vreinterpret_u8_u32(c_32x2);
   x255_16x8 = vdupq_n_u16(0xff);
   x0_8x16 = vdupq_n_u8(0x0);
   x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
   x1_8x16 = vdupq_n_u8(0x1);
   x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
   start = dest;
   size = length;
   end = start + (size & ~3);

   while (start < end)
     {
        s_32x4 = vld1q_u32(src);
        s_8x16 = vreinterpretq_u8_u32(s_32x4);

        d_32x4 = vld1q_u32(start);
        d_8x16 = vreinterpretq_u8_u32(d_32x4);
        d0_8x8 = vget_low_u8(d_8x16);
        d1_8x8 = vget_high_u8(d_8x16);

        s0_8x8 = vget_low_u8(s_8x16);
        s1_8x8 = vget_high_u8(s_8x16);

        // sc = (src * color + 255) >> 8, per byte
        sc0_16x8 = vmull_u8(s0_8x8, c_8x8);
        sc1_16x8 = vmull_u8(s1_8x8, c_8x8);
        sc0_16x8 = vaddq_u16(sc0_16x8, x255_16x8);
        sc1_16x8 = vaddq_u16(sc1_16x8, x255_16x8);
        sc0_8x8 = vshrn_n_u16(sc0_16x8, 8);
        sc1_8x8 = vshrn_n_u16(sc1_16x8, 8);
        sc_8x16 = vcombine_u8(sc0_8x8, sc1_8x8);

        // Take the top byte of each sc pixel, replicate it into all four
        // bytes (multiply by 0x01010101), then negate per byte to obtain
        // (256 - alpha) & 0xff.
        alpha_32x4 = vreinterpretq_u32_u8(sc_8x16);
        alpha_32x4 = vshrq_n_u32(alpha_32x4, 24);
        alpha_32x4 = vmulq_u32(x1_32x4, alpha_32x4);
        alpha_8x16 = vreinterpretq_u8_u32(alpha_32x4);
        alpha_8x16 = vsubq_u8(x0_8x16, alpha_8x16);
        alpha0_8x8 = vget_low_u8(alpha_8x16);
        alpha1_8x8 = vget_high_u8(alpha_8x16);

        // ad = (dest * inverse alpha) >> 8, per byte
        ad0_16x8 = vmull_u8(alpha0_8x8, d0_8x8);
        ad1_16x8 = vmull_u8(alpha1_8x8, d1_8x8);
        ad0_8x8 = vshrn_n_u16(ad0_16x8,8);
        ad1_8x8 = vshrn_n_u16(ad1_16x8,8);
        ad_8x16 = vcombine_u8(ad0_8x8, ad1_8x8);
        ad_32x4 = vreinterpretq_u32_u8(ad_8x16);

        // An inverse alpha of 0 here means the 8-bit value wrapped from 256:
        // keep dest unscaled for those lanes.
        alpha_32x4 = vreinterpretq_u32_u8(alpha_8x16);
        cond_32x4 = vceqq_u32(alpha_32x4, x0_32x4);
        ad_32x4 = vbslq_u32(cond_32x4, d_32x4 , ad_32x4);

        sc_32x4 = vreinterpretq_u32_u8(sc_8x16);
        d_32x4 = vaddq_u32(sc_32x4, ad_32x4);

        vst1q_u32(start, d_32x4);

        src+=4;
        start+=4;
     }

   // Scalar tail for the last (length & 3) pixels
   end += (size & 3);
   while (start <  end)
     {
        uint32_t sc = MUL4_SYM(color, *src);
        uint32_t alpha = 256 - (sc >> 24);
        *start = sc + MUL_256(alpha, *start);
        start++;
        src++;
     }
}
#endif

/*
 * Point the BLEND entries of the compositing function tables at the NEON
 * implementations when the running CPU reports NEON support.
 *
 * Does nothing when BUILD_NEON is not defined or the CPU lacks NEON. Only the
 * EFL_GFX_RENDER_OP_BLEND slots are written; the COPY slots are left as they
 * are.
 */
void
efl_draw_neon_init(void)
{
#ifdef BUILD_NEON
   if (!(eina_cpu_features_get() & EINA_CPU_NEON))
     return;

   // solid colour compositing
   func_for_mode_solid[EFL_GFX_RENDER_OP_BLEND] = comp_func_solid_source_over_neon;

   // source data compositing
   func_for_mode[EFL_GFX_RENDER_OP_BLEND] = comp_func_source_over_neon;
#endif
}