forked from enlightenment/efl
fix slow path in neon - memcpy is actually faster! :)
SVN revision: 56240
This commit is contained in:
parent
88a8651acb
commit
ecd007c049
|
@ -3,61 +3,47 @@
|
||||||
#ifdef BUILD_NEON
|
#ifdef BUILD_NEON
|
||||||
static void
|
static void
|
||||||
_op_copy_p_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c __UNUSED__, DATA32 *d, int l) {
|
_op_copy_p_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c __UNUSED__, DATA32 *d, int l) {
|
||||||
uint32_t *e;
|
DATA32 *e;
|
||||||
e = d + l;
|
// if (((unsigned long)s & 0xf) || ((unsigned long)d & 0xf))
|
||||||
//#ifdef NEON_INSTRINSICS_OK
|
// {
|
||||||
#if 0
|
memcpy(d, s, l * sizeof(DATA32));
|
||||||
// odd this is faster than the below asm... :(
|
// return;
|
||||||
e -= 15;
|
// }
|
||||||
uint32x4_t col1, col2, col3, col4;
|
/*
|
||||||
// fill a run of 4x4 (16) pixels with the color
|
e = d + l - 23;
|
||||||
for (; d < e; d += 16, s += 16) {
|
if (e > d)
|
||||||
col1 = vld1q_u32(s+0); // OP
|
|
||||||
col2 = vld1q_u32(s+4); // OP
|
|
||||||
col3 = vld1q_u32(s+8); // OP
|
|
||||||
col4 = vld1q_u32(s+12); // OP
|
|
||||||
vst1q_u32(d+0, col1); // OP
|
|
||||||
vst1q_u32(d+4, col2); // OP
|
|
||||||
vst1q_u32(d+8, col3); // OP
|
|
||||||
vst1q_u32(d+12, col4); // OP
|
|
||||||
}
|
|
||||||
e += 15;
|
|
||||||
#else
|
|
||||||
if ((e - d) >= 16)
|
|
||||||
{
|
{
|
||||||
DATA32 *d2, *d3, *d4;
|
int dl;
|
||||||
DATA32 *s2, *s3, *s4;
|
|
||||||
e -= 31;
|
|
||||||
d2 = d + 4;
|
|
||||||
d3 = d + 8;
|
|
||||||
d4 = d + 12;
|
|
||||||
s2 = s + 4;
|
|
||||||
s3 = s + 8;
|
|
||||||
s4 = s + 12;
|
|
||||||
asm volatile (
|
asm volatile (
|
||||||
".fpu neon \n\t"
|
".fpu neon \n\t"
|
||||||
"asmloop2:\n\t"
|
"asmloop2: \n\t"
|
||||||
"cmp %[e], %[d]\n\t"
|
"cmp %[e], %[d] \n\t" // compare current and end ptr
|
||||||
"vld1.32 {d16-d17}, [%[s]]!\n\t"
|
"pld [%[s], #64] \n\t" // preload 64 bytes ahead
|
||||||
"vld1.32 {d18-d19}, [%[s2]]!\n\t"
|
"pld [%[s], #256] \n\t" // preload 256 bytes ahead
|
||||||
"vld1.32 {d20-d21}, [%[s3]]!\n\t"
|
"pld [%[s], #320] \n\t" // preload 320 bytes ahead
|
||||||
"vld1.32 {d22-d23}, [%[s4]]!\n\t"
|
"vld1.64 {d0-d3}, [%[s]]! \n\t" // load 256bits (32 bytes 8 pix)
|
||||||
"vst1.32 {d16-d17}, [%[d]]!\n\t"
|
"vld1.64 {d4-d7} , [%[s]]! \n\t" // load 256bits (32 bytes 8 pix)
|
||||||
"vst1.32 {d18-d19}, [%[d2]]!\n\t"
|
"vld1.64 {d8-d11}, [%[s]]! \n\t" // load 256bits (32 bytes 8 pix)
|
||||||
"vst1.32 {d20-d21}, [%[d3]]!\n\t"
|
"vst1.64 {d0-d3}, [%[d]]! \n\t" // store 256bits (32 bytes 8 pix)
|
||||||
"vst1.32 {d22-d23}, [%[d4]]!\n\t"
|
"vst1.64 {d4-d7}, [%[d]]! \n\t" // store 256bits (32 bytes 8 pix)
|
||||||
"bhi asmloop2\n\t"
|
"vst1.64 {d8-d11}, [%[d]]! \n\t" // store 256bits (32 bytes 8 pix)
|
||||||
|
"bhi asmloop2 \n\t"
|
||||||
: // output regs
|
: // output regs
|
||||||
: [s] "r" (s), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4), [e] "r" (e), [d] "r" (d), [d2] "r" (d2), [d3] "r" (d3), [d4] "r" (d4) // input
|
: [s] "r" (s), [e] "r" (e), [d] "r" (d) // input
|
||||||
: "q8", "q9", "q10", "q11", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "memory" // clobbered
|
: "q0", "q1", "q2", "q3", "q4", "q5",
|
||||||
);
|
"d0", "d1", "d2", "d3", "d4", "d5",
|
||||||
e += 31;
|
"d6", "d7", "d8", "d9", "d10", "d11",
|
||||||
|
"memory" // clobbered
|
||||||
|
);
|
||||||
|
e = d + l;
|
||||||
|
dl = l - (l % 24);
|
||||||
|
s = s + dl;
|
||||||
|
d = d + dl;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
// fixup any leftover pixels in the run
|
|
||||||
for (; d < e; d++, s++) {
|
for (; d < e; d++, s++) {
|
||||||
*d = *s; // OP
|
*d = *s;
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
#define _op_copy_pan_dp_neon _op_copy_p_dp_neon
|
#define _op_copy_pan_dp_neon _op_copy_p_dp_neon
|
||||||
|
|
Loading…
Reference in New Issue