forked from enlightenment/efl
Evas: Neon: More .fpu neon flags
Also clean up some ugly code. SVN revision: 55314
This commit is contained in:
parent
ee03254a3e
commit
157bf3a573
|
@ -136,6 +136,7 @@ evas_common_copy_pixels_rev_neon(DATA32 *src, DATA32 *dst, int len)
|
||||||
uint32_t *tmp = (void *)37;
|
uint32_t *tmp = (void *)37;
|
||||||
#define AP "evas_common_copy_rev_pixels_neon_"
|
#define AP "evas_common_copy_rev_pixels_neon_"
|
||||||
asm volatile (
|
asm volatile (
|
||||||
|
".fpu neon \n\t"
|
||||||
// Can we do 32 byte?
|
// Can we do 32 byte?
|
||||||
"andS %[tmp], %[d], $0x1f \n\t"
|
"andS %[tmp], %[d], $0x1f \n\t"
|
||||||
"beq "AP"quadstart \n\t"
|
"beq "AP"quadstart \n\t"
|
||||||
|
@ -334,6 +335,7 @@ evas_common_copy_pixels_neon(DATA32 *src, DATA32 *dst, int len){
|
||||||
e = dst + len;
|
e = dst + len;
|
||||||
#define AP "evas_common_copy_pixels_neon_"
|
#define AP "evas_common_copy_pixels_neon_"
|
||||||
asm volatile (
|
asm volatile (
|
||||||
|
".fpu neon \n\t"
|
||||||
// Can we do 32 byte?
|
// Can we do 32 byte?
|
||||||
"andS %[tmp], %[d], $0x1f \n\t"
|
"andS %[tmp], %[d], $0x1f \n\t"
|
||||||
"beq "AP"quadstart \n\t"
|
"beq "AP"quadstart \n\t"
|
||||||
|
|
|
@ -102,6 +102,7 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_90 (DATA32 *src, DATA8 *dst, int
|
||||||
} else {
|
} else {
|
||||||
#define AP "convert_rgba32_rot_90_"
|
#define AP "convert_rgba32_rot_90_"
|
||||||
asm volatile (
|
asm volatile (
|
||||||
|
".fpu neon \n\t"
|
||||||
" mov %[s1], %[src] \n\t"
|
" mov %[s1], %[src] \n\t"
|
||||||
" add %[s1], %[h],lsl #2 \n\t"
|
" add %[s1], %[h],lsl #2 \n\t"
|
||||||
" sub %[s1], #8 \n\t"
|
" sub %[s1], #8 \n\t"
|
||||||
|
|
|
@ -25,7 +25,7 @@ _op_blend_mas_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int
|
||||||
|
|
||||||
#define AP "blend_mas_c_dp_"
|
#define AP "blend_mas_c_dp_"
|
||||||
asm volatile (
|
asm volatile (
|
||||||
".fpu neon \n\t"
|
".fpu neon \n\t"
|
||||||
" vdup.i32 q15, %[c] \n\t"
|
" vdup.i32 q15, %[c] \n\t"
|
||||||
" vmov.i8 q14, #1 \n\t"
|
" vmov.i8 q14, #1 \n\t"
|
||||||
|
|
||||||
|
@ -206,7 +206,7 @@ _op_blend_mas_can_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, i
|
||||||
|
|
||||||
#define AP "_blend_mas_can_dp_neon_"
|
#define AP "_blend_mas_can_dp_neon_"
|
||||||
asm volatile (
|
asm volatile (
|
||||||
".fpu neon \n\t"
|
".fpu neon \n\t"
|
||||||
"vdup.u32 q9, %[c] \n\t"
|
"vdup.u32 q9, %[c] \n\t"
|
||||||
"vmov.i8 q15, #1 \n\t"
|
"vmov.i8 q15, #1 \n\t"
|
||||||
"vmov.i8 q14, #0 \n\t"
|
"vmov.i8 q14, #0 \n\t"
|
||||||
|
|
|
@ -6,10 +6,10 @@ static void
|
||||||
_op_blend_p_c_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l) {
|
_op_blend_p_c_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l) {
|
||||||
#define AP "blend_p_c_dp_"
|
#define AP "blend_p_c_dp_"
|
||||||
asm volatile (
|
asm volatile (
|
||||||
".fpu neon \n\t"
|
".fpu neon \n\t"
|
||||||
// Load 'c'
|
// Load 'c'
|
||||||
"vdup.u32 q7, %[c] \n\t"
|
"vdup.u32 q7, %[c] \n\t"
|
||||||
"vmov.i8 q6, #1 \n\t"
|
"vmov.i8 q6, #1 \n\t"
|
||||||
|
|
||||||
// Choose a loop
|
// Choose a loop
|
||||||
"andS %[tmp], %[d], $0xf \n\t"
|
"andS %[tmp], %[d], $0xf \n\t"
|
||||||
|
|
|
@ -7,18 +7,18 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
|
||||||
asm volatile (
|
asm volatile (
|
||||||
".fpu neon \n\t"
|
".fpu neon \n\t"
|
||||||
//** init
|
//** init
|
||||||
"vmov.i8 q8, $0x1 \n\t"
|
"vmov.i8 q8, $0x1 \n\t"
|
||||||
|
|
||||||
AP "loopchoose: \n\t"
|
AP "loopchoose: \n\t"
|
||||||
// If aligned already - straight to octs
|
// If aligned already - straight to octs
|
||||||
"andS %[tmp], %[d],$0x1f \n\t"
|
"andS %[tmp], %[d],$0x1f \n\t"
|
||||||
"beq "AP"octloops \n\t"
|
"beq "AP"octloops \n\t"
|
||||||
|
|
||||||
"andS %[tmp], %[d],$0xf \n\t"
|
"andS %[tmp], %[d],$0xf \n\t"
|
||||||
"beq "AP"quadloops \n\t"
|
"beq "AP"quadloops \n\t"
|
||||||
|
|
||||||
"andS %[tmp], %[d],$0x4 \n\t"
|
"andS %[tmp], %[d],$0x4 \n\t"
|
||||||
"beq "AP"dualloop \n\t"
|
"beq "AP"dualloop \n\t"
|
||||||
|
|
||||||
// Only ever executes once, fall through to dual
|
// Only ever executes once, fall through to dual
|
||||||
AP "singleloop: \n\t"
|
AP "singleloop: \n\t"
|
||||||
|
@ -106,7 +106,7 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
|
||||||
"cmp %[tmp], #32 \n\t"
|
"cmp %[tmp], #32 \n\t"
|
||||||
"ble "AP"loopout \n\t"
|
"ble "AP"loopout \n\t"
|
||||||
|
|
||||||
"sub %[tmp],%[e],#64 \n\t"
|
"sub %[tmp],%[e],#64 \n\t"
|
||||||
|
|
||||||
|
|
||||||
AP "octloopint:\n\t"
|
AP "octloopint:\n\t"
|
||||||
|
@ -151,12 +151,8 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
|
||||||
"bhi "AP"octloopint\n\t"
|
"bhi "AP"octloopint\n\t"
|
||||||
|
|
||||||
AP "loopout: \n\t"
|
AP "loopout: \n\t"
|
||||||
//"sub %[tmp], %[d], #4\n\t"
|
"cmp %[d], %[e] \n\t"
|
||||||
//"vmov.i16 d0, $0xff00 \n\t"
|
"beq "AP"done \n\t"
|
||||||
//"vst1.32 d0[0], [%[tmp]] \n\t"
|
|
||||||
|
|
||||||
"cmp %[d], %[e]\n\t"
|
|
||||||
"beq "AP"done\n\t"
|
|
||||||
"sub %[tmp],%[e], %[d] \n\t"
|
"sub %[tmp],%[e], %[d] \n\t"
|
||||||
"cmp %[tmp],$0x04 \n\t"
|
"cmp %[tmp],$0x04 \n\t"
|
||||||
"ble "AP"singleloop2 \n\t"
|
"ble "AP"singleloop2 \n\t"
|
||||||
|
@ -183,7 +179,7 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
|
||||||
"add %[d], #8 \n\t"
|
"add %[d], #8 \n\t"
|
||||||
|
|
||||||
"cmp %[tmp], %[d] \n\t"
|
"cmp %[tmp], %[d] \n\t"
|
||||||
"bhi "AP"dualloop2int \n\t"
|
"bhi "AP"dualloop2int \n\t"
|
||||||
|
|
||||||
// Single ??
|
// Single ??
|
||||||
"cmp %[e], %[d] \n\t"
|
"cmp %[e], %[d] \n\t"
|
||||||
|
@ -227,11 +223,11 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
|
||||||
#define AP "blend_pas_dp_"
|
#define AP "blend_pas_dp_"
|
||||||
DATA32 *e = d + l,*tmp = e + 32,*pl=(void*)912;
|
DATA32 *e = d + l,*tmp = e + 32,*pl=(void*)912;
|
||||||
asm volatile (
|
asm volatile (
|
||||||
".fpu neon \n\t"
|
".fpu neon \n\t"
|
||||||
"vmov.i8 q8, #1 \n\t"
|
"vmov.i8 q8, #1 \n\t"
|
||||||
AP"loopchoose: \n\t"
|
AP"loopchoose: \n\t"
|
||||||
// If aligned - go as fast as we can
|
// If aligned - go as fast as we can
|
||||||
"andS %[tmp], %[d], #31 \n\t"
|
"andS %[tmp], %[d], #31 \n\t"
|
||||||
"beq "AP"quadstart \n\t"
|
"beq "AP"quadstart \n\t"
|
||||||
|
|
||||||
// See if we can at least do our double loop
|
// See if we can at least do our double loop
|
||||||
|
@ -312,12 +308,12 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
|
||||||
"vshr.u32 q5, q5,$0x18 \n\t"
|
"vshr.u32 q5, q5,$0x18 \n\t"
|
||||||
|
|
||||||
// Prepare to preload
|
// Prepare to preload
|
||||||
"add %[pl], %[s], #32\n\t"
|
"add %[pl], %[s], #32 \n\t"
|
||||||
|
|
||||||
// Multiply into all fields
|
// Multiply into all fields
|
||||||
"vmul.u32 q4, q4, q8 \n\t"
|
"vmul.u32 q4, q4, q8 \n\t"
|
||||||
"vmul.u32 q5, q5, q8 \n\t"
|
"vmul.u32 q5, q5, q8 \n\t"
|
||||||
"pld [%[pl]]\n\t"
|
"pld [%[pl]] \n\t"
|
||||||
|
|
||||||
// Multiply out
|
// Multiply out
|
||||||
"vmull.u8 q6, d8, d4 \n\t"
|
"vmull.u8 q6, d8, d4 \n\t"
|
||||||
|
@ -325,13 +321,13 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
|
||||||
"vmull.u8 q2, d9, d5 \n\t"
|
"vmull.u8 q2, d9, d5 \n\t"
|
||||||
"vmull.u8 q3, d11, d7 \n\t"
|
"vmull.u8 q3, d11, d7 \n\t"
|
||||||
|
|
||||||
"add %[pl], %[d], #32\n\t"
|
"add %[pl], %[d], #32 \n\t"
|
||||||
|
|
||||||
"vqrshrn.u16 d8, q6, #8 \n\t"
|
"vqrshrn.u16 d8, q6, #8 \n\t"
|
||||||
"vqrshrn.u16 d10, q7, #8 \n\t"
|
"vqrshrn.u16 d10, q7, #8 \n\t"
|
||||||
"vqrshrn.u16 d9, q2, #8 \n\t"
|
"vqrshrn.u16 d9, q2, #8 \n\t"
|
||||||
"vqrshrn.u16 d11, q3, #8 \n\t"
|
"vqrshrn.u16 d11, q3, #8 \n\t"
|
||||||
"pld [%[pl]]\n\t"
|
"pld [%[pl]] \n\t"
|
||||||
|
|
||||||
"cmp %[tmp], %[pl] \n\t"
|
"cmp %[tmp], %[pl] \n\t"
|
||||||
// Add to s
|
// Add to s
|
||||||
|
@ -350,11 +346,11 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
|
||||||
"cmp %[tmp],$0x04 \n\t"
|
"cmp %[tmp],$0x04 \n\t"
|
||||||
"beq "AP"singleloop2 \n\t"
|
"beq "AP"singleloop2 \n\t"
|
||||||
|
|
||||||
"sub %[tmp],%[e],$0x7 \n\t"
|
"sub %[tmp],%[e],$0x7 \n\t"
|
||||||
|
|
||||||
AP"dualloop2: \n\t"
|
AP"dualloop2: \n\t"
|
||||||
"vldm %[s]!, {d0) \n\t"
|
"vldm %[s]!, {d0) \n\t"
|
||||||
"vldm %[d], {d4} \n\t"
|
"vldm %[d], {d4} \n\t"
|
||||||
|
|
||||||
// Subtract from 255 (ie negate) and extract alpha channel
|
// Subtract from 255 (ie negate) and extract alpha channel
|
||||||
"vmvn.u8 d8, d0 \n\t"
|
"vmvn.u8 d8, d0 \n\t"
|
||||||
|
@ -377,8 +373,8 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
|
||||||
"bhi "AP"dualloop2 \n\t"
|
"bhi "AP"dualloop2 \n\t"
|
||||||
|
|
||||||
// Single ??
|
// Single ??
|
||||||
"cmp %[e], %[d] \n\t"
|
"cmp %[e], %[d] \n\t"
|
||||||
"beq "AP"done \n\t"
|
"beq "AP"done \n\t"
|
||||||
|
|
||||||
AP "singleloop2: \n\t"
|
AP "singleloop2: \n\t"
|
||||||
"vld1.32 d0[0], [%[s]] \n\t"
|
"vld1.32 d0[0], [%[s]] \n\t"
|
||||||
|
|
|
@ -6,7 +6,7 @@ _op_copy_p_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c __UNUSED__, DATA32 *
|
||||||
uint32_t *e;
|
uint32_t *e;
|
||||||
e = d + l;
|
e = d + l;
|
||||||
//#ifdef NEON_INSTRINSICS_OK
|
//#ifdef NEON_INSTRINSICS_OK
|
||||||
#if 1
|
#if 0
|
||||||
// odd this is faster than the below asm... :(
|
// odd this is faster than the below asm... :(
|
||||||
e -= 15;
|
e -= 15;
|
||||||
uint32x4_t col1, col2, col3, col4;
|
uint32x4_t col1, col2, col3, col4;
|
||||||
|
@ -35,7 +35,7 @@ _op_copy_p_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c __UNUSED__, DATA32 *
|
||||||
s3 = s + 8;
|
s3 = s + 8;
|
||||||
s4 = s + 12;
|
s4 = s + 12;
|
||||||
asm volatile (
|
asm volatile (
|
||||||
".fpu neon \n\t"
|
".fpu neon \n\t"
|
||||||
"asmloop2:\n\t"
|
"asmloop2:\n\t"
|
||||||
"cmp %[e], %[d]\n\t"
|
"cmp %[e], %[d]\n\t"
|
||||||
"vld1.32 {d16-d17}, [%[s]]!\n\t"
|
"vld1.32 {d16-d17}, [%[s]]!\n\t"
|
||||||
|
|
Loading…
Reference in New Issue