Evas: Neon: More .fpu neon flags

Also clean up some ugly code.

SVN revision: 55314
This commit is contained in:
Brett Nash 2010-12-06 04:57:54 +00:00
parent ee03254a3e
commit 157bf3a573
6 changed files with 32 additions and 33 deletions

View File

@ -136,6 +136,7 @@ evas_common_copy_pixels_rev_neon(DATA32 *src, DATA32 *dst, int len)
uint32_t *tmp = (void *)37;
#define AP "evas_common_copy_rev_pixels_neon_"
asm volatile (
".fpu neon \n\t"
// Can we do 32 byte?
"andS %[tmp], %[d], $0x1f \n\t"
"beq "AP"quadstart \n\t"
@ -334,6 +335,7 @@ evas_common_copy_pixels_neon(DATA32 *src, DATA32 *dst, int len){
e = dst + len;
#define AP "evas_common_copy_pixels_neon_"
asm volatile (
".fpu neon \n\t"
// Can we do 32 byte?
"andS %[tmp], %[d], $0x1f \n\t"
"beq "AP"quadstart \n\t"

View File

@ -102,6 +102,7 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_90 (DATA32 *src, DATA8 *dst, int
} else {
#define AP "convert_rgba32_rot_90_"
asm volatile (
".fpu neon \n\t"
" mov %[s1], %[src] \n\t"
" add %[s1], %[h],lsl #2 \n\t"
" sub %[s1], #8 \n\t"

View File

@ -151,12 +151,8 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
"bhi "AP"octloopint\n\t"
AP "loopout: \n\t"
//"sub %[tmp], %[d], #4\n\t"
//"vmov.i16 d0, $0xff00 \n\t"
//"vst1.32 d0[0], [%[tmp]] \n\t"
"cmp %[d], %[e]\n\t"
"beq "AP"done\n\t"
"cmp %[d], %[e] \n\t"
"beq "AP"done \n\t"
"sub %[tmp],%[e], %[d] \n\t"
"cmp %[tmp],$0x04 \n\t"
"ble "AP"singleloop2 \n\t"
@ -312,12 +308,12 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
"vshr.u32 q5, q5,$0x18 \n\t"
// Prepare to preload
"add %[pl], %[s], #32\n\t"
"add %[pl], %[s], #32 \n\t"
// Multiply into all fields
"vmul.u32 q4, q4, q8 \n\t"
"vmul.u32 q5, q5, q8 \n\t"
"pld [%[pl]]\n\t"
"pld [%[pl]] \n\t"
// Multiply out
"vmull.u8 q6, d8, d4 \n\t"
@ -325,13 +321,13 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
"vmull.u8 q2, d9, d5 \n\t"
"vmull.u8 q3, d11, d7 \n\t"
"add %[pl], %[d], #32\n\t"
"add %[pl], %[d], #32 \n\t"
"vqrshrn.u16 d8, q6, #8 \n\t"
"vqrshrn.u16 d10, q7, #8 \n\t"
"vqrshrn.u16 d9, q2, #8 \n\t"
"vqrshrn.u16 d11, q3, #8 \n\t"
"pld [%[pl]]\n\t"
"pld [%[pl]] \n\t"
"cmp %[tmp], %[pl] \n\t"
// Add to s

View File

@ -6,7 +6,7 @@ _op_copy_p_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c __UNUSED__, DATA32 *
uint32_t *e;
e = d + l;
//#ifdef NEON_INSTRINSICS_OK
#if 1
#if 0
// Oddly, this is faster than the asm below... :(
e -= 15;
uint32x4_t col1, col2, col3, col4;