Well, the NEON asm was wrongly documented - and it was wrong before anyway.
It's disabled for now and the function uses memcpy instead. Bizarre stuff
occurs if I try to use it, though.



SVN revision: 56304
This commit is contained in:
Carsten Haitzler 2011-01-26 08:45:11 +00:00
parent 80b71b0a05
commit c472771a84
1 changed file with 35 additions and 35 deletions

View File

@ -3,47 +3,47 @@
#ifdef BUILD_NEON #ifdef BUILD_NEON
static void static void
_op_copy_p_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c __UNUSED__, DATA32 *d, int l) { _op_copy_p_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c __UNUSED__, DATA32 *d, int l) {
//#define USENEON 1
#ifndef USENEON
memcpy(d, s, l * sizeof(DATA32));
return;
#else
DATA32 *e; DATA32 *e;
// if (((unsigned long)s & 0xf) || ((unsigned long)d & 0xf))
// {
memcpy(d, s, l * sizeof(DATA32));
// return;
// }
/*
e = d + l - 23; e = d + l - 23;
if (e > d) if (e > d)
{ {
int dl; int dl;
asm volatile (
".fpu neon \n\t" asm volatile
"asmloop2: \n\t" (".fpu neon \n\t"
"cmp %[e], %[d] \n\t" // compare current and end ptr "_op_copy_p_dp_neon_asmloop: \n\t"
"pld [%[s], #64] \n\t" // preload 64 bytes ahead "pld [%[s], #192] \n\t" // preload 256 bytes ahead
"pld [%[s], #256] \n\t" // preload 256 bytes ahead "pld [%[s], #320] \n\t" // preload 320 bytes ahead
"pld [%[s], #320] \n\t" // preload 320 bytes ahead "vld1.32 {d0-d3}, [%[s]]! \n\t" // load 256bits (32 bytes 8 pix), 32bit aligned
"vld1.64 {d0-d3}, [%[s]]! \n\t" // load 256bits (32 bytes 8 pix) "vld1.32 {d4-d7} , [%[s]]! \n\t" // load 256bits (32 bytes 8 pix), 32bit aligned
"vld1.64 {d4-d7} , [%[s]]! \n\t" // load 256bits (32 bytes 8 pix) "vld1.32 {d8-d11}, [%[s]]! \n\t" // load 256bits (32 bytes 8 pix), 32bit aligned
"vld1.64 {d8-d11}, [%[s]]! \n\t" // load 256bits (32 bytes 8 pix) "vst1.32 {d0-d3}, [%[d]]! \n\t" // store 256bits (32 bytes 8 pix), 32bit aligned
"vst1.64 {d0-d3}, [%[d]]! \n\t" // store 256bits (32 bytes 8 pix) "vst1.32 {d4-d7}, [%[d]]! \n\t" // store 256bits (32 bytes 8 pix), 32bit aligned
"vst1.64 {d4-d7}, [%[d]]! \n\t" // store 256bits (32 bytes 8 pix) "vst1.32 {d8-d11}, [%[d]]! \n\t" // store 256bits (32 bytes 8 pix), 32bit aligned
"vst1.64 {d8-d11}, [%[d]]! \n\t" // store 256bits (32 bytes 8 pix) "cmp %[e], %[d] \n\t" // compare current and end ptr
"bhi asmloop2 \n\t" "bgt _op_copy_p_dp_neon_asmloop \n\t"
: // output regs : /*out*/
: [s] "r" (s), [e] "r" (e), [d] "r" (d) // input : /*in */ [s] "r" (s), [e] "r" (e), [d] "r" (d)
: "q0", "q1", "q2", "q3", "q4", "q5", : /*clobber*/
"d0", "d1", "d2", "d3", "d4", "d5", "q0", "q1", "q2","q3", "q4", "q5", "q6",
"d6", "d7", "d8", "d9", "d10", "d11", "d0", "d1", "d2", "d3",
"memory" // clobbered "d4", "d5", "d6", "d7",
); "d8", "d9", "d10", "d11",
e = d + l; "memory" // clobbered
dl = l - (l % 24); );
s = s + dl; dl = l % 24; // dl is how many pixels at end that is not a multiple of 24
d = d + dl; l = l - dl; // jump to there at the end of the run?
s = s + l;
d = d + l;
} }
for (; d < e; d++, s++) { e += 23;
*d = *s; for (;d < e; d++, s++) *d = *s;
} #endif
*/
} }
#define _op_copy_pan_dp_neon _op_copy_p_dp_neon #define _op_copy_pan_dp_neon _op_copy_p_dp_neon