diff --git a/legacy/evas/src/lib/engines/common/evas_blit_main.c b/legacy/evas/src/lib/engines/common/evas_blit_main.c index 9322ffe373..4c077b3fe2 100644 --- a/legacy/evas/src/lib/engines/common/evas_blit_main.c +++ b/legacy/evas/src/lib/engines/common/evas_blit_main.c @@ -136,6 +136,7 @@ evas_common_copy_pixels_rev_neon(DATA32 *src, DATA32 *dst, int len) uint32_t *tmp = (void *)37; #define AP "evas_common_copy_rev_pixels_neon_" asm volatile ( + ".fpu neon \n\t" // Can we do 32 byte? "andS %[tmp], %[d], $0x1f \n\t" "beq "AP"quadstart \n\t" @@ -334,6 +335,7 @@ evas_common_copy_pixels_neon(DATA32 *src, DATA32 *dst, int len){ e = dst + len; #define AP "evas_common_copy_pixels_neon_" asm volatile ( + ".fpu neon \n\t" // Can we do 32 byte? "andS %[tmp], %[d], $0x1f \n\t" "beq "AP"quadstart \n\t" diff --git a/legacy/evas/src/lib/engines/common/evas_convert_rgb_32.c b/legacy/evas/src/lib/engines/common/evas_convert_rgb_32.c index aba2c4a855..41dac6f0ff 100644 --- a/legacy/evas/src/lib/engines/common/evas_convert_rgb_32.c +++ b/legacy/evas/src/lib/engines/common/evas_convert_rgb_32.c @@ -102,6 +102,7 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_90 (DATA32 *src, DATA8 *dst, int } else { #define AP "convert_rgba32_rot_90_" asm volatile ( + ".fpu neon \n\t" " mov %[s1], %[src] \n\t" " add %[s1], %[h],lsl #2 \n\t" " sub %[s1], #8 \n\t" diff --git a/legacy/evas/src/lib/engines/common/evas_op_blend/op_blend_mask_color_neon.c b/legacy/evas/src/lib/engines/common/evas_op_blend/op_blend_mask_color_neon.c index 46929f3cc4..f5eb480cdc 100644 --- a/legacy/evas/src/lib/engines/common/evas_op_blend/op_blend_mask_color_neon.c +++ b/legacy/evas/src/lib/engines/common/evas_op_blend/op_blend_mask_color_neon.c @@ -25,7 +25,7 @@ _op_blend_mas_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int #define AP "blend_mas_c_dp_" asm volatile ( - ".fpu neon \n\t" + ".fpu neon \n\t" " vdup.i32 q15, %[c] \n\t" " vmov.i8 q14, #1 \n\t" @@ -206,7 +206,7 @@ _op_blend_mas_can_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, i #define AP "_blend_mas_can_dp_neon_" asm volatile ( - ".fpu neon \n\t" + ".fpu neon \n\t" "vdup.u32 q9, %[c] \n\t" "vmov.i8 q15, #1 \n\t" "vmov.i8 q14, #0 \n\t" diff --git a/legacy/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_color_neon.c b/legacy/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_color_neon.c index 51925c33de..a57052c8fb 100644 --- a/legacy/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_color_neon.c +++ b/legacy/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_color_neon.c @@ -6,10 +6,10 @@ static void _op_blend_p_c_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l) { #define AP "blend_p_c_dp_" asm volatile ( - ".fpu neon \n\t" + ".fpu neon \n\t" // Load 'c' - "vdup.u32 q7, %[c] \n\t" - "vmov.i8 q6, #1 \n\t" + "vdup.u32 q7, %[c] \n\t" + "vmov.i8 q6, #1 \n\t" // Choose a loop "andS %[tmp], %[d], $0xf \n\t" diff --git a/legacy/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_neon.c b/legacy/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_neon.c index 5fcae8d938..cba9c66fcb 100644 --- a/legacy/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_neon.c +++ b/legacy/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_neon.c @@ -7,18 +7,18 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { asm volatile ( ".fpu neon \n\t" //** init - "vmov.i8 q8, $0x1 \n\t" + "vmov.i8 q8, $0x1 \n\t" AP "loopchoose: \n\t" // If aligned already - straight to octs - "andS %[tmp], %[d],$0x1f \n\t" - "beq "AP"octloops \n\t" + "andS %[tmp], %[d],$0x1f \n\t" + "beq "AP"octloops \n\t" - "andS %[tmp], %[d],$0xf \n\t" - "beq "AP"quadloops \n\t" + "andS %[tmp], %[d],$0xf \n\t" + "beq "AP"quadloops \n\t" - "andS %[tmp], %[d],$0x4 \n\t" - "beq "AP"dualloop \n\t" + "andS %[tmp], %[d],$0x4 \n\t" + "beq "AP"dualloop \n\t" // Only ever executes once, fall through to dual AP "singleloop: \n\t" @@ -106,7 +106,7 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { "cmp %[tmp], #32 \n\t" "ble "AP"loopout \n\t" - "sub %[tmp],%[e],#64 \n\t" + "sub %[tmp],%[e],#64 \n\t" AP "octloopint:\n\t" @@ -151,12 +151,8 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { "bhi "AP"octloopint\n\t" AP "loopout: \n\t" -//"sub %[tmp], %[d], #4\n\t" -//"vmov.i16 d0, $0xff00 \n\t" -//"vst1.32 d0[0], [%[tmp]] \n\t" - - "cmp %[d], %[e]\n\t" - "beq "AP"done\n\t" + "cmp %[d], %[e] \n\t" + "beq "AP"done \n\t" "sub %[tmp],%[e], %[d] \n\t" "cmp %[tmp],$0x04 \n\t" "ble "AP"singleloop2 \n\t" @@ -183,7 +179,7 @@ _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { "add %[d], #8 \n\t" "cmp %[tmp], %[d] \n\t" - "bhi "AP"dualloop2int \n\t" + "bhi "AP"dualloop2int \n\t" // Single ?? "cmp %[e], %[d] \n\t" @@ -227,11 +223,11 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { #define AP "blend_pas_dp_" DATA32 *e = d + l,*tmp = e + 32,*pl=(void*)912; asm volatile ( - ".fpu neon \n\t" + ".fpu neon \n\t" "vmov.i8 q8, #1 \n\t" AP"loopchoose: \n\t" // If aliged - go as fast we can - "andS %[tmp], %[d], #31 \n\t" + "andS %[tmp], %[d], #31 \n\t" "beq "AP"quadstart \n\t" // See if we can at least do our double loop @@ -312,12 +308,12 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { "vshr.u32 q5, q5,$0x18 \n\t" // Prepare to preload - "add %[pl], %[s], #32\n\t" + "add %[pl], %[s], #32 \n\t" // Mulitply into all fields "vmul.u32 q4, q4, q8 \n\t" "vmul.u32 q5, q5, q8 \n\t" - "pld [%[pl]]\n\t" + "pld [%[pl]] \n\t" // Multiply out "vmull.u8 q6, d8, d4 \n\t" @@ -325,13 +321,13 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { "vmull.u8 q2, d9, d5 \n\t" "vmull.u8 q3, d11, d7 \n\t" - "add %[pl], %[d], #32\n\t" + "add %[pl], %[d], #32 \n\t" "vqrshrn.u16 d8, q6, #8 \n\t" "vqrshrn.u16 d10, q7, #8 \n\t" "vqrshrn.u16 d9, q2, #8 \n\t" "vqrshrn.u16 d11, q3, #8 \n\t" - "pld [%[pl]]\n\t" + "pld [%[pl]] \n\t" "cmp %[tmp], %[pl] \n\t" // Add to s @@ -350,11 +346,11 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { "cmp %[tmp],$0x04 \n\t" "beq "AP"singleloop2 \n\t" - "sub %[tmp],%[e],$0x7 \n\t" + "sub %[tmp],%[e],$0x7 \n\t" AP"dualloop2: \n\t" - "vldm %[s]!, {d0) \n\t" - "vldm %[d], {d4} \n\t" + "vldm %[s]!, {d0) \n\t" + "vldm %[d], {d4} \n\t" // Subtract from 255 (ie negate) and extract alpha channel "vmvn.u8 d8, d0 \n\t" @@ -377,8 +373,8 @@ _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { "bhi "AP"dualloop2 \n\t" // Single ?? - "cmp %[e], %[d] \n\t" - "beq "AP"done \n\t" + "cmp %[e], %[d] \n\t" + "beq "AP"done \n\t" AP "singleloop2: \n\t" "vld1.32 d0[0], [%[s]] \n\t" diff --git a/legacy/evas/src/lib/engines/common/evas_op_copy/op_copy_pixel_neon.c b/legacy/evas/src/lib/engines/common/evas_op_copy/op_copy_pixel_neon.c index bc3748d119..b55f7a352f 100644 --- a/legacy/evas/src/lib/engines/common/evas_op_copy/op_copy_pixel_neon.c +++ b/legacy/evas/src/lib/engines/common/evas_op_copy/op_copy_pixel_neon.c @@ -6,7 +6,7 @@ _op_copy_p_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c __UNUSED__, DATA32 * uint32_t *e; e = d + l; //#ifdef NEON_INSTRINSICS_OK -#if 1 +#if 0 // odd this is faster than the below asm... :( e -= 15; uint32x4_t col1, col2, col3, col4; @@ -35,7 +35,7 @@ _op_copy_p_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c __UNUSED__, DATA32 * s3 = s + 8; s4 = s + 12; asm volatile ( - ".fpu neon \n\t" + ".fpu neon \n\t" "asmloop2:\n\t" "cmp %[e], %[d]\n\t" "vld1.32 {d16-d17}, [%[s]]!\n\t"