author     Yury Usishchev <y.usishchev@samsung.com>    2015-04-16 19:23:29 +0200
committer  Cedric BAIL <cedric@osg.samsung.com>        2015-05-07 09:53:08 +0200
commit     a0d0c9883995e0e04979f5382fc8954941b19edc (patch)
tree       c1e9c433b0b0f21a1fc6ff30e020146952c65c64 /src/lib/evas/common
parent     68343123c1c5710da165bb9b1184b2ea53dd078b (diff)
evas: improve _op_blend_mas_c_dp_neon intrinsics implementation.
Summary:
Use vceqq and vbsl instead of twice as many vmovl and vadd instructions.
Replace vaddq_u8 with vaddq_u32. This allows the NEON code to behave exactly
like the C version.

Reviewers: cedric, raster

Projects: #efl

Differential Revision: https://phab.enlightenment.org/D2362

Signed-off-by: Cedric BAIL <cedric@osg.samsung.com>
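The change boils down to a per-lane select: vceqq builds an all-ones mask in the lanes where the computed per-pixel factor is zero, and vbsl then picks either the untouched destination lane or the a*d product, with the final add done in 32-bit lanes via vaddq_u32. A minimal sketch of that select pattern, not code from the tree (helper and variable names are made up):

#include <arm_neon.h>

/* Hypothetical helper: where the per-pixel factor 'a' is zero, keep the
 * destination pixel as-is; otherwise take the blended candidate. */
static inline uint32x4_t
blend_select(uint32x4_t a_32x4, uint32x4_t d_32x4, uint32x4_t blended_32x4)
{
   /* all-ones in lanes where a == 0, all-zeros elsewhere */
   uint32x4_t cond_32x4 = vceqq_u32(a_32x4, vdupq_n_u32(0));
   /* bit-select: destination where cond is set, blended value otherwise */
   return vbslq_u32(cond_32x4, d_32x4, blended_32x4);
}

In the patch below, this replaces the old path that widened the destination with vmovl and added it back in 16-bit lanes before narrowing.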
Diffstat (limited to 'src/lib/evas/common')
-rw-r--r--  src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c  37
1 file changed, 18 insertions, 19 deletions
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
index 0bc8c5c..a09277e 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
@@ -25,8 +25,6 @@
 static void
 _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
 #ifdef BUILD_NEON_INTRINSICS
-   uint16x8_t d0_16x8;
-   uint16x8_t d1_16x8;
    uint16x8_t m_16x8;
    uint16x8_t mc0_16x8;
    uint16x8_t mc1_16x8;
@@ -36,14 +34,20 @@ _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, in
    uint32x2_t c_32x2;
    uint32x2_t m_32x2;
    uint32x4_t a_32x4;
+   uint32x4_t ad_32x4;
+   uint32x4_t cond_32x4;
    uint32x4_t d_32x4;
    uint32x4_t m_32x4;
+   uint32x4_t temp_32x4;
+   uint32x4_t mc_32x4;
+   uint32x4_t x0_32x4;
    uint32x4_t x1_32x4;
    uint8x16_t a_8x16;
    uint8x16_t d_8x16;
    uint8x16_t m_8x16;
    uint8x16_t mc_8x16;
    uint8x16_t temp_8x16;
+   uint8x16_t x0_8x16;
    uint8x16_t x1_8x16;
    uint8x8_t a0_8x8;
    uint8x8_t a1_8x8;
@@ -59,6 +63,8 @@ _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, in
    uint8x8_t temp1_8x8;
 
    x1_8x16 = vdupq_n_u8(0x1);
+   x0_8x16 = vdupq_n_u8(0x0);
+   x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
    x255_16x8 = vdupq_n_u16(0xff);
    x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
    c_32x2 = vdup_n_u32(c);
@@ -66,7 +72,7 @@ _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, in
 
    DATA32 *start = d;
    int size = l;
-   DATA32 *end = start + (size & ~7);
+   DATA32 *end = start + (size & ~3);
    while (start < end) {
       int k = *((int *)m);
       if (k == 0)
@@ -77,7 +83,6 @@ _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, in
         }
 
       m_32x2 = vld1_lane_u32((DATA32*)m, m_32x2, 0);
-
       d_32x4 = vld1q_u32(start);
 
       m_8x8 = vreinterpret_u8_u32(m_32x2);
@@ -94,15 +99,15 @@ _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, in
 
       mc0_16x8 = vmull_u8(m0_8x8, c_8x8);
       mc1_16x8 = vmull_u8(m1_8x8, c_8x8);
-
       mc0_16x8 = vaddq_u16(mc0_16x8, x255_16x8);
       mc1_16x8 = vaddq_u16(mc1_16x8, x255_16x8);
 
       mc0_8x8 = vshrn_n_u16(mc0_16x8, 8);
       mc1_8x8 = vshrn_n_u16(mc1_16x8, 8);
-
       mc_8x16 = vcombine_u8(mc0_8x8, mc1_8x8);
-      a_8x16 = vmvnq_u8(mc_8x16);
+
+      a_8x16 = vsubq_u8(x0_8x16, mc_8x16);
+
       a_32x4 = vreinterpretq_u32_u8(a_8x16);
       a_32x4 = vshrq_n_u32(a_32x4, 24);
       a_32x4 = vmulq_u32(a_32x4, x1_32x4);
@@ -112,35 +117,29 @@ _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, in
       a1_8x8 = vget_high_u8(a_8x16);
 
       d_8x16 = vreinterpretq_u8_u32(d_32x4);
-
       d0_8x8 = vget_low_u8(d_8x16);
       d1_8x8 = vget_high_u8(d_8x16);
 
-      d0_16x8 = vmovl_u8(d0_8x8);
-      d1_16x8 = vmovl_u8(d1_8x8);
-
       temp0_16x8 = vmull_u8(a0_8x8, d0_8x8);
       temp1_16x8 = vmull_u8(a1_8x8, d1_8x8);
-
-      temp0_16x8 = vaddq_u16(temp0_16x8, d0_16x8);
-      temp1_16x8 = vaddq_u16(temp1_16x8, d1_16x8);
-
       temp0_8x8 = vshrn_n_u16(temp0_16x8,8);
       temp1_8x8 = vshrn_n_u16(temp1_16x8,8);
 
       temp_8x16 = vcombine_u8(temp0_8x8, temp1_8x8);
+      temp_32x4 = vreinterpretq_u32_u8(temp_8x16);
 
-      d_8x16 = vaddq_u8(mc_8x16, temp_8x16);
+      cond_32x4 = vceqq_u32(a_32x4, x0_32x4);
+      ad_32x4 = vbslq_u32(cond_32x4, d_32x4, temp_32x4);
 
-      d_32x4 = vreinterpretq_u32_u8(d_8x16);
+      mc_32x4 = vreinterpretq_u32_u8(mc_8x16);
+      d_32x4 = vaddq_u32(mc_32x4, ad_32x4);
 
       vst1q_u32(start, d_32x4);
 
       start+=4;
       m+=4;
-
    }
-   end += (size & 7);
+   end += (size & 3);
    while (start < end) {
       DATA32 a = *m;
       DATA32 mc = MUL_SYM(a, c);