author     Yury Usishchev <y.usishchev@samsung.com>             2013-12-29 13:27:54 +0900
committer  Carsten Haitzler (Rasterman) <raster@rasterman.com>  2013-12-29 13:29:28 +0900
commit     011fb2d10aed57c286c8df6cd6f7bde73dec0601 (patch)
tree       2599a59bcf50b9deff502d5481e47b70e02125b2 /src/lib/evas/common/evas_op_blend
parent     d16f0ceaf6454e6cf9b7f6c10cd4ba5ffebbf808 (diff)
Blending function rework and speedup
Summary:
_op_blend_mas_c_dp_neon rework: the main loop now processes 4 pixels per
iteration, with a fast path when *m == 0.

Reviewers: raster

Reviewed By: raster

CC: cedric

Differential Revision: https://phab.enlightenment.org/D418
Diffstat (limited to 'src/lib/evas/common/evas_op_blend')
-rw-r--r--  src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c  208
1 file changed, 113 insertions, 95 deletions
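
For readers who don't speak NEON: a scalar C sketch of the per-pixel operation these loops implement, reconstructed from the assembly in the diff below (illustrative names, not the Evas reference code; the '(x + 255) >> 8' step mirrors the vmull/vadd/vshrn rounding the assembly uses):

#include <stdint.h>

/* Hedged scalar sketch of what _op_blend_mas_c_dp_neon computes for one
 * destination pixel: multiply the ARGB color 'c' by the 8-bit mask 'm',
 * then blend the result over 'd' scaled by the inverse of its alpha. */
static inline uint32_t
blend_mas_c_dp_scalar(uint8_t m, uint32_t c, uint32_t d)
{
   uint32_t mc = 0, out = 0, na, i;

   /* mc = m * c per channel, using the (x + 255) >> 8 approximation of
    * x / 255 implemented by the vmull/vadd/vshrn sequence */
   for (i = 0; i < 32; i += 8)
     {
        uint32_t ch = (c >> i) & 0xff;
        mc |= (((m * ch + 255) >> 8) & 0xff) << i;
     }

   /* vmvn + vshr #24: inverse alpha of the masked color */
   na = 255 - (mc >> 24);

   /* per channel: d * (na + 1) / 256, then add mc (vadd.u8 wraps per lane) */
   for (i = 0; i < 32; i += 8)
     {
        uint32_t dc = (d >> i) & 0xff;
        uint32_t bc = ((dc + na * dc) >> 8) & 0xff;
        out |= ((bc + ((mc >> i) & 0xff)) & 0xff) << i;
     }
   return out;
}
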
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
index 252f276ba8..99f4b38625 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
@@ -19,111 +19,129 @@
 #ifdef BUILD_NEON
 static void
 _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
-   // main loop process data in pairs, so we need count to be even
-   DATA32 *e = d + l - (l % 2);
+   DATA32 *e = d + l;
 
    // everything we can do only once per cycle
    // loading of 'c', initialization of some registers
-   __asm__ __volatile__
-     (
-       ".fpu neon                    \n\t"
-       " vmov.32 d30[0], %[c]        \n\t"
-       " vmov.i16 q10, #255          \n\t"
-       " vmov.i16 q11, #256          \n\t"
-       " veor d29, d29, d29          \n\t"
-       " vzip.8 d30, d29             \n\t"
-       " vmov d31, d30               \n\t"
-       :
-       : [c] "r" (c)
-       : "q10", "q11", "q15", "d29"
+   asm volatile (
+     " .fpu neon                     \n\t"
+     " vdup.i32 q15, %[c]            \n\t"
+     " vmov.i8 q14, #1               \n\t"
+     " vmov.i16 q12, #255            \n\t"
+
+     :
+     : [c] "r" (c)
+     : "q12", "q14", "q15"
     );
-   while (d < e)
+   //here we do unaligned part of 'd'
+   while (((int)d & 0xf) && (d < e))
     {
-       // main cycle
-       __asm__ __volatile__
-         (
-          // load pair '*d' and '*(d+1)' into vector register
-          " vldm %[d], {d4}           \n\t"
-
-          // load '*m' and '*(m+1)'
-          " veor q0, q0, q0           \n\t"
-          " vld1.8 d0[0], [%[m]]!     \n\t"
-          " vld1.8 d1[0], [%[m]]!     \n\t"
-
-          // spread values from d in vector registers so for each
-          // 8 bit channel data we have 8 bit of zeros
-          // so each 32bit value occupies now one 64 bit register
-          " veor d5, d5, d5           \n\t"
-          " vzip.8 d4, d5             \n\t"
-
-          // copy *m values in corresponding registers
-          " vdup.u16 d0, d0[0]        \n\t"
-          " vdup.u16 d1, d1[0]        \n\t"
-
-          // multiply a * c
-          " vmul.u16 q13, q0, q15     \n\t"
-          " vadd.i16 q13, q13, q10    \n\t"
-          " vsri.16 q13, q13, #8      \n\t"
-          " vand q13, q13, q10        \n\t"
-
-          // extract negated alpha
-          " vdup.u16 d24, d26[3]      \n\t"
-          " vdup.u16 d25, d27[3]      \n\t"
-          " vsub.i16 q12, q11, q12    \n\t"
-
-          // multiply alpha * (*d) and add a*c
-          " vmul.u16 q2, q2, q12      \n\t"
-          " vsri.16 q2, q2, #8        \n\t"
-          " vand q2, q2, q10          \n\t"
-          " vadd.i16 q2, q2, q13      \n\t"
-          " vand q2, q2, q10          \n\t"
-
-          // save results
-          " vqmovn.u16 d4, q2         \n\t"
-          " vstm %[d]!, {d4}          \n\t"
-          : [d] "+r" (d), [m] "+r" (m)
-          : [c] "r" (c)
-          : "q0", "q2", "q15", "q13", "q12", "q11", "q10",
-            "memory"
+       asm volatile (
+         " vld1.8 d0[0], [%[m]]!      \n\t"
+         " vld1.32 d4[0], [%[d]]      \n\t"
+         " vdup.u8 d0, d0[0]          \n\t"
+         " vmull.u8 q4, d0, d30       \n\t"
+         " vadd.u16 q4, q4, q12       \n\t"
+         " vshrn.u16 d12, q4, #8      \n\t"
+         " vmvn.u16 d14, d12          \n\t"
+         " vshr.u32 d16, d14, #24     \n\t"
+         " vmul.u32 d16, d16, d28     \n\t"
+         " vmovl.u8 q9, d4            \n\t"
+         " vmull.u8 q7, d16, d4       \n\t"
+         " vadd.u16 q7, q9, q7        \n\t"
+         " vshrn.u16 d0, q7, #8       \n\t"
+         " vadd.u8 d0, d0, d12        \n\t"
+         " vst1.32 d0[0], [%[d]]!     \n\t"
+
+         : [d] "+r" (d), [m] "+r" (m)
+         : [c] "r" (c)
+         : "q0", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
+           "q10", "q15", "q14", "memory"
        );
     }
-   if (l % 2)
+   //here e - d should be divisible by 4
+   while ((unsigned int)d < ((unsigned int)e & 0xfffffff0))
     {
-       // do analogue of main loop for last element, if needed
-       __asm__ __volatile__
-         (
-          " vld1.32 d4[0], [%[d]]     \n\t"
-
-          " veor d0, d0, d0           \n\t"
-          " vld1.8 d0[0], [%[m]]!     \n\t"
-
-          " veor d5, d5, d5           \n\t"
-          " vzip.8 d4, d5             \n\t"
-
-          " vdup.u16 d0, d0[0]        \n\t"
-
-          " vmul.u16 d26, d0, d30     \n\t"
-          " vadd.i16 d26, d26, d20    \n\t"
-          " vsri.16 d26, d26, #8      \n\t"
-          " vand d26, d26, d20        \n\t"
-
-          " vdup.u16 d24, d26[3]      \n\t"
-
-          " vsub.i16 d24, d22, d24    \n\t"
-          " vmul.u16 d4, d4, d24      \n\t"
-          " vsri.16 d4, d4, #8        \n\t"
-          " vand d4, d4, d20          \n\t"
-          " vadd.i16 d4, d4, d26      \n\t"
-          " vand d4, d4, d20          \n\t"
-
-          " vqmovn.u16 d4, q2         \n\t"
-          " vst1.32 {d4[0]}, [%[d]]!  \n\t"
-          : [d] "+r" (d), [m] "+r" (m)
-          : [c] "r" (c)
-          : "q0", "q2", "q15", "q13", "q12", "q11", "q10",
-            "memory"
+       //check if all 4 *m values are zeros
+       int k = *((int *)m);
+       if (k == 0)
+         {
+            m+=4;
+            d+=4;
+            continue;
+         }
+
+       asm volatile (
+           // load pair '*d' and '*(d+1)' into vector register
+         " vld1.32 d0[0], [%[m]]!     \n\t"
+         " vldm %[d], {q2}            \n\t"
+         " vmovl.u8 q0, d0            \n\t"
+         " vmovl.u8 q0, d0            \n\t"
+         " vmul.u32 q0, q14           \n\t"
+
+           // Multiply a * c
+         " vmull.u8 q4, d0, d30       \n\t"
+         " vadd.u16 q4, q4, q12       \n\t"
+         " vmull.u8 q5, d1, d31       \n\t"
+         " vadd.u16 q5, q5, q12       \n\t"
+
+           // Shorten
+         " vshrn.u16 d12, q4, #8      \n\t"
+         " vshrn.u16 d13, q5, #8      \n\t"
+
+           // extract negated alpha
+         " vmvn.u16 q7, q6            \n\t"
+         " vshr.u32 q8, q7, #24       \n\t"
+         " vmul.u32 q8, q8, q14       \n\t"
+
+           // Multiply
+         " vmovl.u8 q9, d4            \n\t"
+         " vmull.u8 q7, d16, d4       \n\t"
+         " vadd.u16 q7, q9, q7        \n\t"
+         " vmovl.u8 q10, d5           \n\t"
+         " vmull.u8 q8, d17, d5       \n\t"
+         " vadd.u16 q8, q10, q8       \n\t"
+
+         " vshrn.u16 d0, q7, #8       \n\t"
+         " vshrn.u16 d1, q8, #8       \n\t"
+
+           // Add
+         " vadd.u8 q0, q0, q6         \n\t"
+
+         " vstm %[d]!, {d0,d1}        \n\t"
+
+         : [d] "+r" (d), [m] "+r" (m)
+         : [c] "r" (c), [x] "r" (42)
+         : "q0", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
+           "q10", "q15", "q14", "memory"
        );
     }
+   //do the remaining part
+   while (d < e)
+    {
+       asm volatile (
+         " vld1.8 d0[0], [%[m]]!      \n\t"
+         " vld1.32 d4[0], [%[d]]      \n\t"
+         " vdup.u8 d0, d0[0]          \n\t"
+         " vmull.u8 q4, d0, d30       \n\t"
+         " vadd.u16 q4, q4, q12       \n\t"
+         " vshrn.u16 d12, q4, #8      \n\t"
+         " vmvn.u16 d14, d12          \n\t"
+         " vshr.u32 d16, d14, #24     \n\t"
+         " vmul.u32 d16, d16, d28     \n\t"
+         " vmovl.u8 q9, d4            \n\t"
+         " vmull.u8 q7, d16, d4       \n\t"
+         " vadd.u16 q7, q9, q7        \n\t"
+         " vshrn.u16 d0, q7, #8       \n\t"
+         " vadd.u8 d0, d0, d12        \n\t"
+         " vst1.32 d0[0], [%[d]]!     \n\t"
+
+         : [d] "+r" (d), [m] "+r" (m)
+         : [c] "r" (c)
+         : "q0", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
+           "q10", "q15", "q14", "memory"
+       );
+    }
 }
 #endif
 
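
The restructuring described in the summary (align the destination, process 4 pixels per iteration, skip groups whose mask bytes are all zero, finish with a scalar tail) can be read straight from the new loops above. A hedged C outline of that control flow, with blend_one() as an illustrative stand-in for the single-pixel assembly block and simplified /255 rounding:

#include <stdint.h>

/* Crude per-pixel stand-in for the single-pixel asm block above;
 * rounding differs slightly from the NEON code. */
static void
blend_one(uint8_t m, uint32_t c, uint32_t *d)
{
   uint32_t mc = 0, out = 0, na, i;

   for (i = 0; i < 32; i += 8)                 /* mc = m * c per channel */
     mc |= ((m * ((c >> i) & 0xff)) / 255) << i;
   na = 255 - (mc >> 24);                      /* inverse alpha of mc    */
   for (i = 0; i < 32; i += 8)                 /* d = mc + d * na        */
     out |= (((((*d >> i) & 0xff) * na) / 255 + ((mc >> i) & 0xff)) & 0xff) << i;
   *d = out;
}

/* Hypothetical outline of the new control flow: an unaligned head, a
 * 16-byte-aligned body handling 4 pixels per iteration with the
 * '*m == 0' fast path, and a scalar tail. */
static void
blend_mask_color_outline(const uint8_t *m, uint32_t c, uint32_t *d, int l)
{
   uint32_t *e = d + l;

   /* head: single pixels until 'd' is 16-byte aligned */
   while (((uintptr_t)d & 0xf) && (d < e))
     blend_one(*m++, c, d++);

   /* body: 4 pixels per pass; skip the whole group when all four
    * mask bytes are zero */
   while ((uintptr_t)d < ((uintptr_t)e & ~(uintptr_t)0xf))
     {
        if (*(const uint32_t *)m == 0) { m += 4; d += 4; continue; }
        blend_one(m[0], c, d);
        blend_one(m[1], c, d + 1);
        blend_one(m[2], c, d + 2);
        blend_one(m[3], c, d + 3);
        m += 4; d += 4;
     }

   /* tail: whatever remains */
   while (d < e)
     blend_one(*m++, c, d++);
}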