Neon improvements for ARM Cortex.

Samsung Electronics has just allowed me to release the first set of ARM NEON
patches under the Evas licence.  They have been quietly helping improve EFL,
and this is another product of that help.

These patches have been tested on a Cortex-A8 and show consistent improvement
across the board.  In expedite, some tests improve by up to 100%, and practical
real-world tests show that rendering-limited applications see similar gains:
for instance, one application went from 17fps to 30fps and another from 40fps
to 63fps.

The patches are pure NEON assembly (intrinsics tend to generate worse code).
To build them you will need a recent GCC and the following C flags:
	-mfloat-abi=softfp -mfpu=neon
I also recommend -O2 and -ffast-math.
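For example, a cross build could be configured roughly like this (the toolchain
prefix and configure invocation are illustrative only, adjust for your setup):
	CC=arm-linux-gnueabi-gcc \
	CFLAGS="-O2 -ffast-math -mfloat-abi=softfp -mfpu=neon" ./configure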

If you have any problems please let me know.
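
For anyone reading the assembly below: each blend loop uses the same
destination-alignment dispatch - peel off a single pixel and then pixel pairs
until the destination pointer is 16- or 32-byte aligned, run a wide aligned
main loop, then finish with a dual/single tail.  A rough C sketch of that
control flow (illustrative only, not code from the patch; blend_one_pixel()
is a hypothetical stand-in for the per-pixel operation):

	static void
	blend_skeleton(DATA32 *d, int l, void (*blend_one_pixel)(DATA32 *p))
	{
	   DATA32 *e = d + l;

	   /* singleloop: one pixel if the destination is only 4-byte aligned */
	   if ((((unsigned long)d) & 0x4) && (d < e)) blend_one_pixel(d++);
	   /* dualloop: pixel pairs until the destination is 32-byte aligned */
	   while ((((unsigned long)d) & 0x1f) && ((e - d) >= 2))
	     { blend_one_pixel(d++); blend_one_pixel(d++); }
	   /* quadloop: the wide aligned main loop (NEON, 4 or 8 pixels at a time) */
	   while ((e - d) >= 8)
	     {
	        int i;
	        for (i = 0; i < 8; i++) blend_one_pixel(d++);
	     }
	   /* dualloop2 / singleloop2: the tail */
	   while ((e - d) >= 2) { blend_one_pixel(d++); blend_one_pixel(d++); }
	   if (d < e) blend_one_pixel(d);
	}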


SVN revision: 48733
Brett Nash 2010-05-10 09:24:11 +00:00
parent b923e09da0
commit bd6b2d5cb1
5 changed files with 1343 additions and 229 deletions


@@ -4,12 +4,129 @@
#ifdef BUILD_NEON
static void
_op_blend_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l) {
DATA32 *e, a = 256 - (c >> 24);
UNROLL8_PLD_WHILE(d, l, e,
{
*d = c + MUL_256(a, *d);
d++;
});
DATA32 *e, *tmp = 0;
#define AP "B_C_DP"
asm volatile (
"vdup.u32 q6, %[c] \n\t"
"vmov.i8 q5, #1 \n\t"
"vmvn.u8 q7,q6 \n\t"
"vshr.u32 q7, q7, $0x18 \n\t"
"vmul.u32 q7,q5, q7 \n\t"
"bic %[e], #3 \n\t"
"bic %[d], #3 \n\t"
AP "loopchoose: \n\t"
// If aligned already - straight to quads
"andS %[tmp], %[d],$0x1f \n\t"
"beq "AP"quadloops \n\t"
"andS %[tmp], %[d],$0x4 \n\t"
"beq "AP"dualloop \n\t"
// Only ever executes once, fall through to dual
AP "singleloop: \n\t"
// Use 'tmp' not 'd'
"vld1.32 d0[0], [%[d]] \n\t"
// Only touch d1
"vmull.u8 q0, d0, d14 \n\t"
"vshrn.u16 d0, q0, #8 \n\t"
"vadd.u8 d0, d12, d0 \n\t"
"vst1.32 d0[0], [%[d]] \n\t"
"add %[d], #4 \n\t"
// Can we go the fast path?
"andS %[tmp], %[d],$0x1f \n\t"
"beq "AP"quadloops \n\t"
AP "dualloop: \n\t"
"sub %[tmp], %[e], %[d] \n\t"
"cmp %[tmp], #32 \n\t"
"blt "AP"loopout \n\t"
AP "dualloopint: \n\t"
"vldr.32 d0, [%[d]] \n\t"
"vmull.u8 q1, d0, d14 \n\t"
"vshrn.u16 d0, q1, #8 \n\t"
"vqadd.u8 d0, d0, d12 \n\t"
"vstm %[d]!, {d0} \n\t"
"ands %[tmp], %[d], $0x1f \n\t"
"bne "AP"dualloopint \n\t"
AP "quadloops: \n\t"
"sub %[tmp], %[e], %[d] \n\t"
"cmp %[tmp], #32 \n\t"
"blt "AP"loopout \n\t"
"sub %[tmp],%[e],#31 \n\t"
AP "quadloopint:\n\t"
"vldm %[d], {d0,d1,d2,d3} \n\t"
"vmull.u8 q2, d0, d14 \n\t"
"vmull.u8 q3, d1, d15 \n\t"
"vmull.u8 q4, d2, d14 \n\t"
"vmull.u8 q5, d3, d15 \n\t"
"vshrn.u16 d0, q2, #8 \n\t"
"vshrn.u16 d1, q3, #8 \n\t"
"vshrn.u16 d2, q4, #8 \n\t"
"vshrn.u16 d3, q5, #8 \n\t"
"vqadd.u8 q0, q6, q0 \n\t"
"vqadd.u8 q1, q6, q1 \n\t"
"vstm %[d]!, {d0,d1,d2,d3} \n\t"
"cmp %[tmp], %[d]\n\t"
"bhi "AP"quadloopint\n\t"
AP "loopout: \n\t"
"cmp %[d], %[e]\n\t"
"beq "AP"done\n\t"
"sub %[tmp],%[e], %[d] \n\t"
"cmp %[tmp],#8 \n\t"
"blt "AP"singleloop2 \n\t"
AP "dualloop2: \n\t"
"sub %[tmp],%[e],$0x7 \n\t"
AP "dualloop2int: \n\t"
"vldr.64 d0, [%[d]] \n\t"
"vmull.u8 q1, d0, d14 \n\t"
"vshrn.u16 d0, q1, #8 \n\t"
"vqadd.u8 d0, d0, d12 \n\t"
"vstr.64 d0, [%[d]] \n\t"
"add %[d], #8 \n\t"
"cmp %[tmp], %[d] \n\t"
"bhi "AP"dualloop2int \n\t"
// Single ??
"cmp %[e], %[d] \n\t"
"beq "AP"done \n\t"
AP "singleloop2: \n\t"
"vld1.32 d0[0], [%[d]] \n\t"
"vmull.u8 q1, d0, d14 \n\t"
"vshrn.u16 d0, q1, #8 \n\t"
"vqadd.u8 d0, d0, d12 \n\t"
"vst1.32 d0[0], [%[d]] \n\t"
AP "done:\n\t"
: // output regs
// Input
: [e] "r" (e = d + l), [d] "r" (d), [c] "r" (c), [tmp] "r" (tmp)
: "q0", "q1", "q2","q3", "q4","q5","q6", "q7","memory" // clobbered
);
#undef AP
}
#define _op_blend_caa_dp_neon _op_blend_c_dp_neon
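
(Reading aid, not part of the patch: each vmull.u8 / vshrn.u16 #8 pair above is
the vector form of a per-channel multiply by an 8-bit factor, keeping the high
byte of each 16-bit product - i.e. roughly the MUL_256() step of the scalar
path.  A minimal scalar illustration, with a hypothetical helper name:)

	/* Illustrative only: what one vmull.u8 + vshrn.u16 #8 pair computes for a
	 * single 32-bit pixel, where 'a' is the 8-bit per-channel factor. */
	static inline DATA32
	mul_channels_shift8(unsigned int a, DATA32 p)
	{
	   DATA32 r = 0;
	   int i;

	   for (i = 0; i < 4; i++)
	     {
	        unsigned int ch = (p >> (i * 8)) & 0xff;
	        r |= ((ch * a) >> 8) << (i * 8);
	     }
	   return r;
	}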


@@ -6,51 +6,367 @@ static void
_op_blend_mas_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int l) {
DATA32 *e;
int alpha = 256 - (c >> 24);
UNROLL8_PLD_WHILE(d, l, e,
{
DATA32 a = *m;
switch(a)
{
case 0:
break;
case 255:
*d = c + MUL_256(alpha, *d);
break;
default:
{
DATA32 mc = MUL_SYM(a, c);
a = 256 - (mc >> 24);
*d = mc + MUL_256(a, *d);
}
break;
}
m++; d++;
});
}
#define AP "blend_mas_c_dp_"
asm volatile (
" vdup.i32 q15, %[c] \n\t"
" vmov.i8 q14, #1 \n\t"
// If aligned already - straight to quads
" andS %[tmp], %[d],$0xf \n\t"
" beq "AP"quadloops \n\t"
" andS %[tmp], %[d],$0x4 \n\t"
" beq "AP"dualloop \n\t"
AP"singleloop: \n\t"
" vld1.8 d0[0], [%[m]]! \n\t"
" vld1.32 d4[0], [%[d]] \n\t"
" vdup.u8 d0, d0[0] \n\t"
" vmull.u8 q4, d0, d30 \n\t"
" vshrn.u16 d12, q4, #8 \n\t"
" vmvn.u16 d14, d12 \n\t"
" vshr.u32 d16, d14, #24 \n\t"
" vmul.u32 d16, d16, d28 \n\t"
" vmull.u8 q7, d16, d4 \n\t"
" vshrn.u16 d0, q7, #8 \n\t"
" vqadd.u8 d0, d0, d12 \n\t"
" vst1.32 d0[0], [%[d]]! \n\t"
// Can we go the fast path?
" andS %[tmp], %[d],$0xf \n\t"
" beq "AP"quadloops \n\t"
AP"dualloop: \n\t"
" sub %[tmp], %[e], %[d] \n\t"
" cmp %[tmp], #16 \n\t"
" blt "AP"loopout \n\t"
" vld1.16 d0[0], [%[m]]! \n\t"
" vldm %[d], {d4} \n\t"
" vmovl.u8 q0, d0 \n\t"
" vmovl.u8 q0, d0 \n\t"
" vmul.u32 q0, q14 \n\t"
" vmull.u8 q4, d0, d30 \n\t"
" vshrn.u16 d12, q4, #8 \n\t"
" vmvn.u16 d14, d12 \n\t"
" vshr.u32 d16, d14, #24 \n\t"
" vmul.u32 d16, d16, d28 \n\t"
" vmull.u8 q7, d16, d4 \n\t"
" vshrn.u16 d0, q7, #8 \n\t"
" vqadd.u8 q0, q0, q6 \n\t"
" vstm %[d]!, {d0} \n\t"
AP"quadloops: \n\t"
" sub %[tmp], %[e], %[d] \n\t"
" cmp %[tmp], #16 \n\t"
" blt "AP"loopout \n\t"
" sub %[tmp], %[e], #15 \n\t"
" sub %[d], #16 \n\t"
AP"fastloop:"
" add %[d], #16 \n\t"
" cmp %[tmp], %[d] \n\t"
" ble "AP"loopout \n\t"
AP"quadloopint: \n\t"
// " vld1.32 d0[0], [%[m]]! \n\t"
" ldr.32 %[x], [%[m]] \n\t"
" add %[m], #4 \n\t"
" cmp %[x], #0 \n\t"
" beq "AP"fastloop \n\t"
" vmov.32 d0[0], %[x] \n\t"
" vldm %[d], {d4,d5} \n\t"
// Expand M: Fixme: Can we do this quicker?
" vmovl.u8 q0, d0 \n\t"
" vmovl.u8 q0, d0 \n\t"
" vmul.u32 q0, q14 \n\t"
// Multiply a * c
" vmull.u8 q4, d0, d30 \n\t"
" vmull.u8 q5, d1, d31 \n\t"
// Shorten
" vshrn.u16 d12, q4, #8 \n\t"
" vshrn.u16 d13, q5, #8 \n\t"
// extract negated alpha
" vmvn.u16 q7, q6 \n\t"
" vshr.u32 q8, q7, #24 \n\t"
" vmul.u32 q8, q8, q14 \n\t"
// Multiply
" vmull.u8 q7, d16, d4 \n\t"
" vmull.u8 q8, d17, d5 \n\t"
" vshrn.u16 d0, q7, #8 \n\t"
" vshrn.u16 d1, q8, #8 \n\t"
// Add
" vqadd.u8 q0, q0, q6 \n\t"
" vstm %[d]!, {d0,d1} \n\t"
" cmp %[tmp], %[d] \n\t"
" bhi "AP"quadloopint \n\t"
AP"loopout: \n\t"
" cmp %[d], %[e] \n\t"
" beq "AP"done \n\t"
" sub %[tmp],%[e], %[d] \n\t"
" cmp %[tmp],#4 \n\t"
" beq "AP"singleout \n\t"
AP "dualloop2: \n\t"
"sub %[tmp],%[e],$0x7 \n\t"
" vld1.16 d0[0], [%[m]]! \n\t"
" vldm %[d], {d4} \n\t"
" vmovl.u8 q0, d0 \n\t"
" vmovl.u8 q0, d0 \n\t"
" vmul.u32 q0, q14 \n\t"
" vmull.u8 q4, d0, d30 \n\t"
" vshrn.u16 d12, q4, #8 \n\t"
" vmvn.u16 d14, d12 \n\t"
" vshr.u32 d16, d14, #24 \n\t"
" vmul.u32 d16, d16, d28 \n\t"
" vmull.u8 q7, d16, d4 \n\t"
" vshrn.u16 d0, q7, #8 \n\t"
" vqadd.u8 q0, q0, q6 \n\t"
" vstm %[d]!, {d0} \n\t"
" cmp %[e], %[d] \n\t"
" beq "AP"done \n\t"
AP"singleout: \n\t"
" vld1.8 d0[0], [%[m]]! \n\t"
" vld1.32 d4[0], [%[d]] \n\t"
" vdup.u8 d0, d0[0] \n\t"
" vmull.u8 q4, d0, d30 \n\t"
" vshrn.u16 d12, q4, #8 \n\t"
" vmvn.u16 d14, d12 \n\t"
" vshr.u32 d16, d14, #24 \n\t"
" vmul.u32 d16, d16, d28 \n\t"
" vmull.u8 q7, d16, d4 \n\t"
" vshrn.u16 d0, q7, #8 \n\t"
" vqadd.u8 q0, q0, q6 \n\t"
" vst1.32 d0[0], [%[d]]! \n\t"
AP"done: \n\t"
: // Out
: [e] "r" (d + l), [d] "r" (d), [c] "r" (c),
[tmp] "r" (7), [m] "r" (m), [x] "r" (0)
: "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q8","q14","q15",
"memory" // clobbered
);
#undef AP
}
#endif
#ifdef BUILD_NEON
static void
_op_blend_mas_can_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int l) {
DATA32 *e;
DATA32 *e,*tmp;
int alpha;
UNROLL8_PLD_WHILE(d, l, e,
{
alpha = *m;
switch(alpha)
{
case 0:
break;
case 255:
*d = c;
break;
default:
alpha++;
*d = INTERP_256(alpha, c, *d);
break;
}
m++; d++;
});
}
#define AP "_blend_mas_can_dp_neon_"
asm volatile (
"vdup.u32 q9, %[c] \n\t"
"vmov.i8 q15, #1 \n\t"
"vmov.i8 q14, #0 \n\t"
// Make C 16 bit (C in q3/q2)
"vmovl.u8 q3, d19 \n\t"
"vmovl.u8 q2, d18 \n\t"
// Which loop to start
" andS %[tmp], %[d],$0xf \n\t"
" beq "AP"quadloop \n\t"
" andS %[tmp], %[d], #4 \n\t"
" beq "AP"dualloop \n\t"
AP"singleloop: \n\t"
" vld1.8 d0[0], [%[m]]! \n\t"
" vld1.32 d8[0], [%[d]] \n\t"
" vdup.u8 d0, d0[0] \n\t"
" vshr.u8 d0, d0, #1 \n\t"
" vmovl.u8 q0, d0 \n\t"
" vmovl.u8 q4, d8 \n\t"
" vsub.s16 q6, q2, q4 \n\t"
" vmul.s16 q6, q0 \n\t"
" vshr.s16 q6, #7 \n\t"
" vadd.s16 q6, q4 \n\t"
" vqmovun.s16 d2, q6 \n\t"
" vst1.32 d2[0], [%[d]]! \n\t"
" andS %[tmp], %[d], #15 \n\t"
" beq "AP"quadloop \n\t"
AP"dualloop: \n\t"
" vld1.16 d0[0], [%[m]]! \n\t"
" vldm %[d], {d8} \n\t"
" vmovl.u8 q0, d0 \n\t"
" vmovl.u8 q0, d0 \n\t"
" vmul.u32 d0, d0, d30 \n\t"
" vshr.u8 d0, d0, #1 \n\t"
" vmovl.u8 q0, d0 \n\t"
" vmovl.u8 q4, d8 \n\t"
" vsub.s16 q6, q2, q4 \n\t"
" vmul.s16 q6, q0 \n\t"
" vshr.s16 q6, #7 \n\t"
" vadd.s16 q6, q4 \n\t"
" vqmovun.s16 d2, q6 \n\t"
" vstm %[d]!, {d2} \n\t"
AP"quadloop: \n\t"
" sub %[tmp], %[e], %[d] \n\t"
" cmp %[tmp], #16 \n\t"
" blt "AP"loopout \n\t"
" sub %[tmp], %[e], #15 \n\t"
" sub %[d], #16 \n\t"
AP"fastloop: \n\t"
" add %[d], #16 \n\t"
" cmp %[tmp], %[d] \n\t"
" ble "AP"loopout \n\t"
AP"quadloopint: \n\t"
// Load the mask: 4 bytes: It has d0/d1
" ldr.32 %[x], [%[m]] \n\t"
" add %[m], #4 \n\t"
" cmp %[x], #0 \n\t"
" beq "AP"fastloop \n\t"
" vmov.32 d0[0], %[x] \n\t"
// Load d into d8/d9 q4
" vldm %[d], {d8,d9} \n\t"
" cmp %[x], $0xffffffff \n\t"
" beq "AP"quadstore \n\t"
// Get the alpha channel ready (m)
" vmovl.u8 q0, d0 \n\t"
" vmovl.u8 q0, d0 \n\t"
" vmul.u32 q0, q0,q15 \n\t"
// Lop a bit off to prevent overflow
" vshr.u8 q0, q0, #1 \n\t"
// Now make it 16 bit
" vmovl.u8 q1, d1 \n\t"
" vmovl.u8 q0, d0 \n\t"
// 16 bit 'd'
" vmovl.u8 q5, d9 \n\t"
" vmovl.u8 q4, d8 \n\t"
// Diff 'd' & 'c'
" vsub.s16 q7, q3, q5 \n\t"
" vsub.s16 q6, q2, q4 \n\t"
" vmul.s16 q7, q1 \n\t"
" vmul.s16 q6, q0 \n\t"
// Shift results a bit
" vshr.s16 q7, #7 \n\t"
" vshr.s16 q6, #7 \n\t"
// Add 'd'
" vadd.s16 q7, q5 \n\t"
" vadd.s16 q6, q4 \n\t"
// Make sure none are negative
" vqmovun.s16 d9, q7 \n\t"
" vqmovun.s16 d8, q6 \n\t"
" vstm %[d]!, {d8,d9} \n\t"
" cmp %[tmp], %[d] \n\t"
" bhi "AP"quadloopint \n\t"
" b "AP"loopout \n\t"
AP"quadstore: \n\t"
" vstm %[d]!, {d18,d19} \n\t"
" cmp %[tmp], %[d] \n\t"
" bhi "AP"quadloopint \n\t"
AP"loopout: \n\t"
" cmp %[e], %[d] \n\t"
" beq "AP"done \n\t"
" sub %[tmp],%[e], %[d] \n\t"
" cmp %[tmp],#8 \n\t"
" blt "AP"onebyte \n\t"
// Load the mask: 2 bytes: It has d0
" vld1.16 d0[0], [%[m]]! \n\t"
// Load d into d8/d9 q4
" vldm %[d], {d8} \n\t"
// Get the alpha channel ready (m)
" vmovl.u8 q0, d0 \n\t"
" vmovl.u8 q0, d0 \n\t"
" vmul.u32 d0, d0, d30 \n\t"
// Lop a bit off to prevent overflow
" vshr.u8 d0, d0, #1 \n\t"
// Now make it 16 bit
" vmovl.u8 q0, d0 \n\t"
// 16 bit 'd'
" vmovl.u8 q4, d8 \n\t"
// Diff 'd' & 'c'
" vsub.s16 q6, q2, q4 \n\t"
" vmul.s16 q6, q0 \n\t"
// Shift results a bit
" vshr.s16 q6, #7 \n\t"
// Add 'd'
"vadd.s16 q6, q4 \n\t"
// Make sure none are negative
"vqmovun.s16 d2, q6 \n\t"
"vstm %[d]!, {d2} \n\t"
"cmp %[e], %[d] \n\t"
"beq "AP"done \n\t"
AP"onebyte: \n\t"
"vld1.8 d0[0], [%[m]]! \n\t"
"vld1.32 d8[0], [%[d]] \n\t"
"vdup.u8 d0, d0[0] \n\t"
"vshr.u8 d0, d0, #1 \n\t"
"vmovl.u8 q0, d0 \n\t"
"vmovl.u8 q4, d8 \n\t"
"vsub.s16 q6, q2, q4 \n\t"
"vmul.s16 q6, q0 \n\t"
"vshr.s16 q6, #7 \n\t"
"vadd.s16 q6, q4 \n\t"
"vqmovun.s16 d2, q6 \n\t"
"vst1.32 d2[0], [%[d]]! \n\t"
AP"done: \n\t"
: // output regs
// Input
: [e] "r" (e = d + l), [d] "r" (d), [c] "r" (c),
[m] "r" (m), [tmp] "r" (7), [x] "r" (33)
: "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q14","q15",
"memory" // clobbered
);
#undef AP
}
#endif
#ifdef BUILD_NEON
#define _op_blend_mas_cn_dp_neon _op_blend_mas_can_dp_neon
#define _op_blend_mas_caa_dp_neon _op_blend_mas_c_dp_neon


@@ -1,19 +1,191 @@
/* blend pixel x color --> dst */
#ifdef BUILD_NEON
/* Note: Optimisation is based on keeping _dest_ aligned: else it's a pair of
* reads, then two writes, a miss on read is 'just' two reads */
static void
_op_blend_p_c_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l) {
DATA32 *e;
int alpha;
UNROLL8_PLD_WHILE(d, l, e,
{
DATA32 sc = MUL4_SYM(c, *s);
alpha = 256 - (sc >> 24);
*d = sc + MUL_256(alpha, *d);
d++;
s++;
});
#define AP "blend_p_c_dp_"
asm volatile (
// Load 'c'
"vdup.u32 q7, %[c] \n\t"
"vmov.i8 q6, #1 \n\t"
// Choose a loop
"andS %[tmp], %[d], $0xf \n\t"
"beq "AP"quadstart \n\t"
"andS %[tmp],%[d], $0x4 \n\t"
"beq "AP"dualloop \n\t"
AP"singleloop:"
"vld1.32 d0[0], [%[s]]! \n\t"
"vld1.32 d2[0], [%[d]] \n\t"
// Multiply s * c (= sc)
"vmull.u8 q4, d0,d14 \n\t"
// sc in d8
"vshrn.u16 d4, q4, #8 \n\t"
// sca in d9
"vmvn.u32 d6, d4 \n\t"
"vshr.u32 d6, d6, #24 \n\t"
"vmul.u32 d6, d12, d6 \n\t"
/* d * alpha */
"vmull.u8 q4, d6, d2 \n\t"
"vshrn.u16 d0, q4, #8 \n\t"
"vqadd.u8 d2, d0, d4 \n\t"
// Save dsc + sc
"vst1.32 d2[0], [%[d]]! \n\t"
// Now where?
// Can we go the fast path?
"andS %[tmp], %[d],$0xf \n\t"
"beq "AP"quadstart \n\t"
AP"dualloop: \n\t"
// Check we have enough to bother with!
"sub %[tmp], %[e], %[d] \n\t"
"cmp %[tmp], #16 \n\t"
"blt "AP"loopout \n\t"
// load 's' -> q0, 'd' -> q1
"vldm %[s]!, {d0} \n\t"
"vldm %[d], {d2} \n\t"
// Multiply s * c (= sc)
"vmull.u8 q4, d0,d14 \n\t"
// sc in d8
"vshrn.u16 d4, q4, #8 \n\t"
// sca in d9
"vmvn.u32 d6, d4 \n\t"
"vshr.u32 d6, d6, #24 \n\t"
"vmul.u32 d6, d12, d6 \n\t"
/* d * alpha */
"vmull.u8 q4, d6, d2 \n\t"
"vshrn.u16 d0, q4, #8 \n\t"
"vqadd.u8 d2, d0, d4 \n\t"
// Save dsc + sc
"vst1.32 d2, [%[d]]! \n\t"
AP"quadstart: \n\t"
"sub %[tmp], %[e], %[d] \n\t"
"cmp %[tmp], #16 \n\t"
"blt "AP"loopout \n\t"
"sub %[tmp], %[e], #15 \n\t"
AP"quadloop:\n\t"
// load 's' -> q0, 'd' -> q1
"vldm %[s]!, {d0,d1} \n\t"
"vldm %[d], {d2,d3} \n\t"
// Multiply s * c (= sc)
"vmull.u8 q4, d0,d14 \n\t"
"vmull.u8 q5, d1,d14 \n\t"
// Get sc & sc alpha
"vshrn.u16 d4, q4, #8 \n\t"
"vshrn.u16 d5, q5, #8 \n\t"
// sc is now in q2, 8bpp
// Shift out, then spread alpha for q2
"vmvn.u32 q3, q2 \n\t"
"vshr.u32 q3, q3, $0x18 \n\t"
"vmul.u32 q3, q6,q3 \n\t"
// Multiply 'd' by sc.alpha (dsca)
"vmull.u8 q4, d6,d2 \n\t"
"vmull.u8 q5, d7,d3 \n\t"
"vshrn.u16 d0, q4, #8 \n\t"
"vshrn.u16 d1, q5, #8 \n\t"
"vqadd.u8 q1, q0, q2 \n\t"
// Save dsc + sc
"vstm %[d]!, {d2,d3} \n\t"
"cmp %[tmp], %[d] \n\t"
"bhi "AP"quadloop \n\t"
/* Trailing stuff */
AP"loopout: \n\t"
"cmp %[d], %[e] \n\t"
"beq "AP"done\n\t"
"sub %[tmp],%[e], %[d] \n\t"
"cmp %[tmp],$0x04 \n\t"
"beq "AP"singleloop2 \n\t"
"sub %[tmp], %[e], #7 \n\t"
/* Dual loop */
AP"dualloop2: \n\t"
"vldm %[s]!, {d0} \n\t"
"vldm %[d], {d2} \n\t"
// Multiply s * c (= sc)
"vmull.u8 q4, d0,d14 \n\t"
// sc in d8
"vshrn.u16 d4, q4, #8 \n\t"
// sca in d9
// XXX: I can probably squash one of these 3
"vmvn.u32 d6, d4 \n\t"
"vshr.u32 d6, d6, #24 \n\t"
"vmul.u32 d6, d6, d12 \n\t"
/* d * alpha */
"vmull.u8 q4, d6, d2 \n\t"
"vshrn.u16 d0, q4, #8 \n\t"
"vqadd.u8 d2, d0, d4 \n\t"
// Save dsc + sc
"vstm %[d]!, {d2} \n\t"
"cmp %[tmp], %[d] \n\t"
"bhi "AP"dualloop2 \n\t"
"cmp %[d], %[e] \n\t"
"beq "AP"done \n\t"
AP"singleloop2: \n\t"
"vld1.32 d0[0], [%[s]]! \n\t"
"vld1.32 d2[0], [%[d]] \n\t"
// Multiply s * c (= sc)
"vmull.u8 q4, d0,d14 \n\t"
// sc in d8
"vshrn.u16 d4, q4, #8 \n\t"
// sca in d6
"vmvn.u32 d6, d4 \n\t"
"vshr.u32 d6, d6, #24 \n\t"
"vmul.u32 d6, d12,d6 \n\t"
/* d * alpha */
"vmull.u8 q4, d6, d2 \n\t"
"vshrn.u16 d0, q4, #8 \n\t"
"vqadd.u8 d2, d0, d4 \n\t"
// Save dsc + sc
"vst1.32 d2[0], [%[d]]! \n\t"
AP"done:"
: // No output
//
: [s] "r" (s), [e] "r" (d + l), [d] "r" (d), [c] "r" (c),
[tmp] "r" (12)
: "q0","q1","q2","q3","q4","q5","q6","q7","memory"
);
#undef AP
}
static void
@@ -28,14 +200,205 @@ _op_blend_pan_can_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, i
static void
_op_blend_pan_caa_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l) {
DATA32 *e;
c = 1 + (c & 0xff);
UNROLL8_PLD_WHILE(d, l, e,
{
*d = INTERP_256(c, *s, *d);
d++;
s++;
});
#define AP "_op_blend_pan_caa_dp_"
DATA32 *e = d + l, *tmp = (void*)73;
asm volatile (
/* Set up 'c' */
"vdup.u8 d14, %[c] \n\t"
"vmov.i8 d15, #1 \n\t"
"vaddl.u8 q15, d14, d15 \n\t"
"vshr.u8 q15,#1 \n\t"
// Pick a loop
"andS %[tmp], %[d], $0xf \n\t"
"beq "AP"quadstart \n\t"
"andS %[tmp], %[d], $0x4 \n\t"
"beq "AP"dualstart \n\t"
AP"singleloop: \n\t"
"vld1.32 d4[0], [%[d]] \n\t"
"vld1.32 d0[0], [%[s]]! \n\t"
// Long version of 'd'
"vmovl.u8 q8, d4 \n\t"
// Long version of 's'
"vmovl.u8 q6, d0 \n\t"
// d8 = s -d
"vsub.s16 d8, d12, d16 \n\t"
// Multiply
"vmul.s16 d8, d8, d30 \n\t"
// Shift down
"vshr.s16 d8, #7 \n\t"
// Add 'd'
"vqadd.s16 d8, d8, d16 \n\t"
// Shrink to save
"vqmovun.s16 d0, q4 \n\t"
"vst1.32 d0[0], [%[d]]! \n\t"
// Now where?
"andS %[tmp], %[d], $0xf \n\t"
"beq "AP"quadstart \n\t"
AP"dualstart: \n\t"
// Check we have enough
"sub %[tmp], %[e], %[d] \n\t"
"cmp %[tmp], #16 \n\t"
"blt "AP"loopout \n\t"
AP"dualloop:"
"vldm %[d], {d4} \n\t"
"vldm %[s]!, {d0} \n\t"
// Long version of d
"vmovl.u8 q8, d4 \n\t"
// Long version of s
"vmovl.u8 q6, d0 \n\t"
// q4/q5 = s-d
"vsub.s16 q4, q6, q8 \n\t"
// Multiply
"vmul.s16 q4, q4,q15 \n\t"
// Shift down
"vshr.s16 q4, #7 \n\t"
// Add d
"vqadd.s16 q4, q4, q8 \n\t"
// Shrink to save
"vqmovun.s16 d0, q4 \n\t"
"vstm %[d]!, {d0} \n\t"
AP"quadstart: \n\t"
"sub %[tmp], %[e], %[d] \n\t"
"cmp %[tmp], #16 \n\t"
"blt "AP"loopout \n\t"
"sub %[tmp], %[e], #15 \n\t"
AP"quadloop: \n\t"
// load 's' -> q0, 'd' -> q2
"vldm %[d], {d4,d5} \n\t"
"vldm %[s]!, {d0,d1} \n\t"
// Long version of d
"vmovl.u8 q8, d4 \n\t"
"vmovl.u8 q9, d5 \n\t"
// Long version of s
"vmovl.u8 q6, d0 \n\t"
"vmovl.u8 q7, d1 \n\t"
// q4/q5 = s-d
"vsub.s16 q4, q6, q8 \n\t"
"vsub.s16 q5, q7, q9 \n\t"
// Multiply
"vmul.s16 q4, q4,q15 \n\t"
"vmul.s16 q5, q5,q15 \n\t"
// Shift down
"vshr.s16 q4, #7 \n\t"
"vshr.s16 q5, #7 \n\t"
// Add d
"vqadd.s16 q4, q4, q8 \n\t"
"vqadd.s16 q5, q5, q9 \n\t"
// Shrink to save
"vqmovun.s16 d0, q4 \n\t"
"vqmovun.s16 d1, q5 \n\t"
"vstm %[d]!, {d0,d1} \n\t"
"cmp %[tmp], %[d] \n\t"
"bhi "AP"quadloop\n\t"
"b "AP"done\n\t"
AP"loopout: \n\t"
"cmp %[d], %[e] \n\t"
"beq "AP"done\n\t"
"sub %[tmp],%[e], %[d] \n\t"
"cmp %[tmp],$0x04 \n\t"
"beq "AP"singleloop2 \n\t"
AP"dualloop2: \n\t"
"vldm %[d], {d4} \n\t"
"vldm %[s]!, {d0} \n\t"
// Long version of d
"vmovl.u8 q8, d4 \n\t"
// Long version of s
"vmovl.u8 q6, d0 \n\t"
// q4/q5 = s-d
"vsub.s16 q4, q6, q8 \n\t"
// Multiply
"vmul.s16 q4, q4,q15 \n\t"
// Shift down
"vshr.s16 q4, #7 \n\t"
// Add d
"vqadd.s16 q4, q4, q8 \n\t"
// Shrink to save
"vqmovun.s16 d0, q4 \n\t"
"vstm %[d]!, {d0} \n\t"
"cmp %[d], %[e] \n\t"
"beq "AP"done \n\t"
AP"singleloop2: \n\t"
"vld1.32 d4[0], [%[d]] \n\t"
"vld1.32 d0[0], [%[s]]! \n\t"
// Long version of 'd'
"vmovl.u8 q8, d4 \n\t"
// Long version of 's'
"vmovl.u8 q6, d0 \n\t"
// d8 = s -d
"vsub.s16 d8, d12, d16 \n\t"
// Multiply
"vmul.s16 d8, d8, d30 \n\t"
// Shift down
"vshr.s16 d8, #7 \n\t"
// Add 'd'
"vqadd.s16 d8, d8, d16 \n\t"
// Shrink to save
"vqmovun.s16 d0, q4 \n\t"
"vst1.32 d0[0], [%[d]] \n\t"
AP"done: \n\t"
// No output
:
// Input
: [s] "r" (s), [d] "r" (d), [e] "r" (e), [c] "r" (c), [tmp] "r" (tmp)
// Clobbered
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "memory"
);
#undef AP
}
#define _op_blend_pas_c_dp_neon _op_blend_p_c_dp_neon


@@ -4,122 +4,411 @@
#ifdef BUILD_NEON
static void
_op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
DATA32 *e = d + l;
while (d < e) {
l = 256 - (*s >> 24);
*d = *s++ + MUL_256(l, *d);
d++;
}
#define AP "blend_p_dp_"
asm volatile (
//** init
"vmov.i8 q8, $0x1 \n\t"
AP "loopchoose: \n\t"
// If aligned already - straight to octs
"andS %[tmp], %[d],$0x1f \n\t"
"beq "AP"octloops \n\t"
"andS %[tmp], %[d],$0xf \n\t"
"beq "AP"quadloops \n\t"
"andS %[tmp], %[d],$0x4 \n\t"
"beq "AP"dualloop \n\t"
// Only ever executes once, fall through to dual
AP "singleloop: \n\t"
"vld1.32 d0[0], [%[s]]! \n\t"
"vld1.32 d4[0], [%[d]] \n\t"
"vmvn.u8 d8, d0 \n\t"
"vshr.u32 d8, d8, #24 \n\t"
"vmul.u32 d8, d16, d8 \n\t"
"vmull.u8 q6, d4,d8 \n\t"
"vshrn.u16 d8, q6, #8 \n\t"
// Add to 's'
"vqadd.u8 q2, q4,q0 \n\t"
"vst1.32 d4[0], [%[d]] \n\t"
"add %[d], #4 \n\t"
// Can we go the fast path?
"andS %[tmp], %[d],$0x1f \n\t"
"beq "AP"octloops \n\t"
"andS %[tmp], %[d],$0x0f \n\t"
"beq "AP"quadloops \n\t"
AP "dualloop: \n\t"
"sub %[tmp], %[e], %[d] \n\t"
"cmp %[tmp], #32 \n\t"
"blt "AP"loopout \n\t"
AP "dualloopint: \n\t"
//** Dual Loop
"vldm %[s]!, {d0} \n\t"
"vldr d4, [%[d]] \n\t"
"vmvn.u8 d8, d0 \n\t"
"vshr.u32 d8, d8, #24 \n\t"
"vmul.u32 d8, d16, d8 \n\t"
"vmull.u8 q6, d4,d8 \n\t"
"vshrn.u16 d8, q6, #8 \n\t"
// Add to 's'
"vqadd.u8 d4, d8,d0 \n\t"
"vstr d4, [%[d]] \n\t"
"add %[d], #8 \n\t"
"ands %[tmp], %[d], $0x1f \n\t"
"beq "AP"octloops \n\t"
AP"quadloops: \n\t"
"sub %[tmp], %[e], %[d] \n\t"
"cmp %[tmp], #32 \n\t"
"blt "AP"loopout \n\t"
"vldm %[s]!, {d0,d1) \n\t"
"vldm %[d], {d4,d5} \n\t"
// Copy s.a into q2 (>> 24) & subtract from 255
"vmvn.u8 q4, q0 \n\t"
"vshr.u32 q4, q4,$0x18 \n\t"
// Multiply into all fields
"vmul.u32 q4, q8,q4 \n\t"
// a * d (clobbering 'd'/q7)
"vmull.u8 q6, d4,d8 \n\t"
"vmull.u8 q2, d5,d9 \n\t"
// Shift & narrow it
"vshrn.u16 d8, q6, #8 \n\t"
"vshrn.u16 d9, q2, #8 \n\t"
// Add to s
"vqadd.u8 q2, q4,q0 \n\t"
// Write it
"vstm %[d]!, {d4,d5} \n\t"
AP "octloops: \n\t"
"sub %[tmp], %[e], %[d] \n\t"
"cmp %[tmp], #32 \n\t"
"ble "AP"loopout \n\t"
"sub %[tmp],%[e],#64 \n\t"
AP "octloopint:\n\t"
//** Oct loop
"vldm %[s]!, {d0,d1,d2,d3) \n\t"
"vldm %[d], {d4,d5,d6,d7} \n\t"
// Copy s.a into q2 (>> 24) & subtract from 255
"vmvn.u8 q4, q0 \n\t"
"vmvn.u8 q5, q1 \n\t"
"vshr.u32 q4, q4,$0x18 \n\t"
"vshr.u32 q5, q5,$0x18\n\t"
// Multiply into all fields
"vmul.u32 q4, q8,q4 \n\t"
"vmul.u32 q5, q8,q5 \n\t"
// a * d (clobbering 'd'/q7)
"vmull.u8 q6, d4,d8 \n\t"
"vmull.u8 q2, d5,d9 \n\t"
"vmull.u8 q7, d6,d10 \n\t"
"vmull.u8 q3, d7,d11 \n\t"
"cmp %[tmp], %[d]\n\t"
// Shift & narrow it
"vshrn.u16 d8, q6, #8 \n\t"
"vshrn.u16 d9, q2, #8 \n\t"
"vshrn.u16 d10, q7, #8 \n\t"
"vshrn.u16 d11, q3, #8 \n\t"
// Add to s
"vqadd.u8 q2, q4,q0 \n\t"
"vqadd.u8 q3, q5,q1 \n\t"
// Write it
"vstm %[d]!, {d4,d5,d6,d7} \n\t"
"bhi "AP"octloopint\n\t"
AP "loopout: \n\t"
//"sub %[tmp], %[d], #4\n\t"
//"vmov.i16 d0, $0xff00 \n\t"
//"vst1.32 d0[0], [%[tmp]] \n\t"
"cmp %[d], %[e]\n\t"
"beq "AP"done\n\t"
"sub %[tmp],%[e], %[d] \n\t"
"cmp %[tmp],$0x04 \n\t"
"ble "AP"singleloop2 \n\t"
AP "dualloop2: \n\t"
"sub %[tmp],%[e],$0x7 \n\t"
AP "dualloop2int: \n\t"
//** Trailing double
"vldm %[s]!, {d0} \n\t"
"vldm %[d], {d4} \n\t"
"vmvn.u8 d8, d0 \n\t"
"vshr.u32 d8, d8, #24 \n\t"
"vmul.u32 d8, d16, d8 \n\t"
"vmull.u8 q6, d4,d8 \n\t"
"vshrn.u16 d8, q6, #8 \n\t"
// Add to 's'
"vqadd.u8 d4, d8,d0 \n\t"
"vstr.32 d4, [%[d]] \n\t"
"add %[d], #8 \n\t"
"cmp %[tmp], %[d] \n\t"
"bhi "AP"dualloop2int \n\t"
// Single ??
"cmp %[e], %[d] \n\t"
"beq "AP"done \n\t"
AP"singleloop2: \n\t"
"vld1.32 d0[0], [%[s]] \n\t"
"vld1.32 d4[0], [%[d]] \n\t"
"vmvn.u8 d8, d0 \n\t"
"vshr.u32 d8, d8, #24 \n\t"
"vmul.u32 d8, d8, d16 \n\t"
"vmull.u8 q6, d8,d4 \n\t"
"vshrn.u16 d8, q6, #8 \n\t"
// Add to 's'
"vqadd.u8 d0, d0,d8 \n\t"
"vst1.32 d0[0], [%[d]] \n\t"
//** Trailing single
AP"done:\n\t"
//"sub %[tmp], %[e], #4 \n\t"
//"vmov.i32 d0, $0xffff0000 \n\t"
//"vst1.32 d0[0], [%[tmp]] \n\t"
: // output regs
// Input
: [e] "r" (d + l), [d] "r" (d), [s] "r" (s), [c] "r" (c),
[tmp] "r" (7)
: "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q8","memory" // clobbered
);
#undef AP
}
static void
_op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
#if 0
#ifdef NEON_INSTRINSICS_OK
#else
DATA32 *e = d + l;
#if 1
if (l >= 4)
{
e -= 4;
asm volatile (
// "vmov.i32 q3, $0xff000000\n\t"
// "asmloop3:\n\t"
// "vld1.32 {d0-d1}, [%[s]]!\n\t"
// "vmov.32 q2, q0\n\t"
// "vand.32 q2, q2, q3\n\t"
// "vceq.i32 q2, q2, #0\n\t"
// "beq blank\n\t"
// "vmov.32 d3, d0\n\t"
// "vmovl.u8 q0, d1\n\t"
// "vmovl.u8 q1, d3\n\t"
// "\n\t"
// "vmovn.u16 d1, q0\n\t"
// "vmovn.u16 d3, q1\n\t"
// "vmov.32 d0, d3\n\t"
// "\n\t"
// "vst1.32 {d0-d1}, [%[d]]!\n\t"
// "cmp %[e], %[d]\n\t" // if d < e ...
// "bhi asmloop3\n\t" // (if d < e) ... goto asmloop3
// "b done\n\t"
// "blank:\n\t"
// "add %[s], %[s], #16\n\t"
// "add %[d], %[d], #16\n\t"
// "cmp %[e], %[d]\n\t" // if d < e ...
// "bhi asmloop3\n\t" // (if d < e) ... goto asmloop3
// "done:\n\t"
"asmloop3:\n\t"
"vld4.8 {d0-d3}, [%[s]]\n\t" // d0-d3 = s
"vld4.8 {d4-d7}, [%[d]]\n\t" // d4-d7 = d
"vmvn.8 d31, d3\n\t" // d31 = 255 - s.a
"vmull.u8 q4, d31, d4\n\t"
"vmull.u8 q5, d31, d5\n\t"
"vmull.u8 q6, d31, d6\n\t"
"vmull.u8 q7, d31, d7\n\t"
"vrshr.u16 q8, q4, #8\n\t"
"vrshr.u16 q9, q5, #8\n\t"
"vraddhn.u16 d20, q4, q8\n\t"
"vrshr.u16 q8, q6, #8\n\t"
"vraddhn.u16 d21, q5, q9\n\t"
"vrshr.u16 q9, q7, #8\n\t"
"vraddhn.u16 d22, q6, q8\n\t"
"vraddhn.u16 d23, q7, q9\n\t"
"vqadd.u8 d20, d0, d20\n\t"
"vqadd.u8 d21, d1, d21\n\t"
"vqadd.u8 d22, d2, d22\n\t"
"vqadd.u8 d23, d3, d23\n\t"
"vst4.8 {d20-d23}, [%[d]]!\n\t"
"vst4.8 {d20-d23}, [%[d]]\n\t"
"add %[s], %[s], #4\n\t" // s++
"add %[d], %[d], #4\n\t" // d++
"cmp %[e], %[d]\n\t" // if d < e ...
"bhi asmloop3\n\t" // (if d < e) ... goto asmloop3
: // output regs
: [s] "r" (s), [e] "r" (e), [d] "r" (d) // input
: "d0", "d1", "memory" // clobbered
);
e += 4;
}
#endif
while (d < e)
{
switch (*s & 0xff000000)
{
case 0:
break;
case 0xff000000:
*d = *s;
break;
default :
l = 256 - (*s >> 24);
*d = *s + MUL_256(l, *d);
break;
}
s++; d++;
}
#endif
#else
DATA32 *e = d + l;
while (d < e)
{
switch (*s & 0xff000000)
{
case 0:
break;
case 0xff000000:
*d = *s;
break;
default :
l = 256 - (*s >> 24);
*d = *s + MUL_256(l, *d);
break;
}
s++; d++;
}
#endif
#define AP "blend_pas_dp_"
DATA32 *e = d + l,*tmp = e + 32,*pl=(void*)912;
asm volatile (
"vmov.i8 q8, #1 \n\t"
AP"loopchoose: \n\t"
// If aligned - go as fast as we can
"andS %[tmp], %[d], #31 \n\t"
"beq "AP"quadstart \n\t"
// See if we can at least do our double loop
"andS %[tmp], %[d], $0x7 \n\t"
"beq "AP"dualstart \n\t"
// Ugly single word version
AP "singleloop: \n\t"
"vld1.32 d0[0], [%[s]]! \n\t"
"vld1.32 d4[0], [%[d]] \n\t"
"vmvn.u8 d8, d0 \n\t"
"vshr.u32 d8, d8,$0x18 \n\t"
// Multiply into all fields
"vmul.u32 d8, d8, d16 \n\t"
// Multiply out
"vmull.u8 q6, d8, d4 \n\t"
"vshrn.u16 d8, q6, #8 \n\t"
// Add to s
"vqadd.u8 d0, d0,d8 \n\t"
"vst1.32 d0[0], [%[d]]! \n\t"
AP"dualstart: \n\t"
"sub %[tmp], %[e], %[d] \n\t"
"cmp %[tmp], #32 \n\t"
"blt "AP"loopout \n\t"
// If aligned - go as fast as we can
"andS %[tmp], %[d], #31 \n\t"
"beq "AP"quadstart \n\t"
AP"dualloop: \n\t"
"vldm %[s]!, {d0) \n\t"
"vldm %[d], {d4} \n\t"
// Subtract from 255 (ie negate) and extract alpha channel
"vmvn.u8 d8, d0 \n\t"
"vshr.u32 d8, d8,$0x18 \n\t"
// Multiply into all fields
"vmul.u32 d8, d8, d16 \n\t"
// Multiply out
"vmull.u8 q6, d8, d4 \n\t"
"vshrn.u16 d8, q6, #8 \n\t"
// Add to s
"vqadd.u8 d0, d0,d8 \n\t"
"vstm %[d]!, {d0} \n\t"
"andS %[tmp], %[d], $0x1f \n\t"
"bne "AP"dualloop \n\t"
AP"quadstart: \n\t"
"sub %[tmp], %[e], %[d] \n\t"
"cmp %[tmp], #32 \n\t"
"blt "AP"loopout \n\t"
"sub %[tmp], %[e], #31 \n\t"
AP"quadloop:\n\t"
"vldm %[s]!, {d0,d1,d2,d3) \n\t"
"vldm %[d], {d4,d5,d6,d7} \n\t"
// Subtract from 255 (ie negate) and extract alpha channel
"vmvn.u8 q4, q0 \n\t"
"vmvn.u8 q5, q1 \n\t"
"vshr.u32 q4, q4,$0x18 \n\t"
"vshr.u32 q5, q5,$0x18 \n\t"
// Prepare to preload
"add %[pl], %[s], #32\n\t"
// Multiply into all fields
"vmul.u32 q4, q4, q8 \n\t"
"vmul.u32 q5, q5, q8 \n\t"
"pld [%[pl]]\n\t"
// Multiply out
"vmull.u8 q6, d8, d4 \n\t"
"vmull.u8 q7, d10, d6 \n\t"
"vmull.u8 q2, d9, d5 \n\t"
"vmull.u8 q3, d11, d7 \n\t"
"add %[pl], %[d], #32\n\t"
"vshrn.u16 d8, q6, #8 \n\t"
"vshrn.u16 d10, q7, #8 \n\t"
"vshrn.u16 d9, q2, #8 \n\t"
"vshrn.u16 d11, q3, #8 \n\t"
"pld [%[pl]]\n\t"
"cmp %[tmp], %[pl] \n\t"
// Add to s
"vqadd.u8 q0, q0,q4 \n\t"
"vqadd.u8 q1, q1,q5 \n\t"
"vstm %[d]!, {d0,d1,d2,d3} \n\t"
"bhi "AP"quadloop \n\t"
AP "loopout: \n\t"
"cmp %[d], %[e] \n\t"
"beq "AP"done \n\t"
"sub %[tmp],%[e], %[d] \n\t"
"cmp %[tmp],$0x04 \n\t"
"beq "AP"singleloop2 \n\t"
"sub %[tmp],%[e],$0x7 \n\t"
AP"dualloop2: \n\t"
"vldm %[s]!, {d0) \n\t"
"vldm %[d], {d4} \n\t"
// Subtract from 255 (ie negate) and extract alpha channel
"vmvn.u8 d8, d0 \n\t"
"vshr.u32 d8, d8,$0x18 \n\t"
// Multiply into all fields
"vmul.u32 d8, d8, d16 \n\t"
// Multiply out
"vmull.u8 q6, d8, d4 \n\t"
"vshrn.u16 d8, q6, #8 \n\t"
// Add to s
"vqadd.u8 d0, d0,d8 \n\t"
"vstm %[d]!, {d0} \n\t"
"cmp %[tmp], %[d] \n\t"
"bhi "AP"dualloop2 \n\t"
// Single ??
"cmp %[e], %[d] \n\t"
"beq "AP"done \n\t"
AP "singleloop2: \n\t"
"vld1.32 d0[0], [%[s]] \n\t"
"vld1.32 d4[0], [%[d]] \n\t"
"vmvn.u8 d8, d0 \n\t"
"vshr.u32 d8, d8,$0x18 \n\t"
// Multiply into all fields
"vmul.u32 d8, d8, d16 \n\t"
// Multiply out
"vmull.u8 q6, d8, d4 \n\t"
"vshrn.u16 d8, q6, #8 \n\t"
// Add to s
"vqadd.u8 d0, d0,d8 \n\t"
"vst1.32 d0[0], [%[d]] \n\t"
AP "done:\n\t"
: /* Out */
: /* In */ [s] "r" (s), [e] "r" (e), [d] "r" (d), [tmp] "r" (tmp),
[pl] "r" (pl)
: /* Clobbered */
"q0","q1","q2","q3","q4","q5","q6","q7","q8","memory"
);
#undef AP
}
#define _op_blend_pan_dp_neon NULL


@@ -4,58 +4,87 @@
#ifdef BUILD_NEON
static void
_op_copy_c_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
uint32_t *e;
uint32_t dalign = ((uint32_t)d) & 0xf; // get alignment
// handle unaligned stores - stores not aligned to 16bytes may suck
if (dalign > 0)
{
dalign = (16 - dalign) >> 2;
if (l < dalign) dalign = l;
l -= dalign;
e = d + dalign;
for (; d < e; d++) {
*d = c; // OP
}
if (l <= 0) return;
}
e = d + l;
#ifdef NEON_INSTRINSICS_OK
e -= 15;
// expand the color in c to a 128 bit register as "cccc" i.e 4 pixels of c
uint32x4_t col = vdupq_n_u32(c);
// fill a run of 4x4 (16) pixels with the color
for (; d < e; d += 16) {
vst1q_u32(d+0, col); // OP
vst1q_u32(d+4, col); // OP
vst1q_u32(d+8, col); // OP
vst1q_u32(d+12, col); // OP
}
e += 15;
#else
if ((e - d) >= 16)
{
e -= 31;
asm volatile (
"vdup.32 q8, %[c]\n\t"
"asmloop1:\n\t"
// "pld [%[d], #128]\n\t"
"cmp %[e], %[d]\n\t"
"vst1.32 {d16-d17}, [%[d],:128]!\n\t"
"vst1.32 {d16-d17}, [%[d],:128]!\n\t"
"vst1.32 {d16-d17}, [%[d],:128]!\n\t"
"vst1.32 {d16-d17}, [%[d],:128]!\n\t"
"bhi asmloop1\n\t"
: // output regs
: [c] "r" (c), [e] "r" (e), [d] "r" (d) // input
: "q8", "d16", "d17", "memory" // clobbered
);
e += 31;
}
#endif
// fixup any leftover pixels in the run
for (; d < e; d++) {
*d = c; // OP
}
#define AP "COPY_C_DP_"
uint32_t *e = d + l,*tmp;
asm volatile (
"vdup.i32 q0, %[c] \n\t"
// Can we do 32 byte?
"andS %[tmp], %[d], $0x1f \n\t"
"beq "AP"quadstart \n\t"
// Can we do at least 16 byte?
"andS %[tmp], %[d], $0x4 \n\t"
"beq "AP"dualstart \n\t"
// Only once
AP"singleloop: \n\t"
"vst1.32 d0[0], [%[d]] \n\t"
"add %[d], #4 \n\t"
// Up to 3 times
AP"dualstart: \n\t"
"sub %[tmp], %[e], %[d] \n\t"
"cmp %[tmp], #32 \n\t"
"blt "AP"loopout \n\t"
AP"dualloop: \n\t"
"vstr.32 d0, [%[d]] \n\t"
"add %[d], #8 \n\t"
"andS %[tmp], %[d], $0x1f \n\t"
"bne "AP"dualloop \n\t"
AP"quadstart: \n\t"
"sub %[tmp], %[e], %[d] \n\t"
"cmp %[tmp], #32 \n\t"
"blt "AP"loopout \n\t"
"vmov q1, q0 \n\t"
"sub %[tmp],%[e],#31 \n\t"
AP "quadloop: \n\t"
"vstm %[d]!, {d0,d1,d2,d3} \n\t"
"cmp %[tmp], %[d] \n\t"
"bhi "AP"quadloop \n\t"
AP "loopout: \n\t"
"cmp %[d], %[e] \n\t"
"beq "AP"done \n\t"
"sub %[tmp],%[e], %[d] \n\t"
"cmp %[tmp],$0x04 \n\t"
"beq "AP"singleloop2 \n\t"
AP "dualloop2: \n\t"
"sub %[tmp],%[e],#7 \n\t"
AP "dualloop2int: \n\t"
"vstr.64 d0, [%[d]] \n\t"
"add %[d], #8 \n\t"
"cmp %[tmp], %[d] \n\t"
"bhi "AP"dualloop2int \n\t"
// Single ??
"cmp %[e], %[d] \n\t"
"beq "AP"done \n\t"
AP "singleloop2: \n\t"
"vst1.32 d0[0], [%[d]] \n\t"
AP "done:\n\t"
: // No output regs
// Input
: [c] "r" (c), [e] "r" (e), [d] "r" (d),[tmp] "r" (tmp)
// Clobbered
: "q0","q1","memory"
);
}
#define _op_copy_cn_dp_neon _op_copy_c_dp_neon