From 8e17290f1903d6ec51af517e4ab67d6e959a1843 Mon Sep 17 00:00:00 2001 From: "Snacker (Vladimir)" Date: Fri, 28 Feb 2014 07:04:52 +0900 Subject: [PATCH] @feature - Apply NEON intrinsics improvement to rotation --- configure.ac | 3 + src/Makefile_Evas.am | 16 ++- src/lib/evas/common/evas_convert_rgb_32.c | 153 ++++++++++++++++++---- 3 files changed, 142 insertions(+), 30 deletions(-) diff --git a/configure.ac b/configure.ac index a0baf94d65..de6222b3b9 100644 --- a/configure.ac +++ b/configure.ac @@ -453,6 +453,7 @@ build_cpu_neon="no" SSE3_CFLAGS="" ALTIVEC_CFLAGS="" +NEON_CFLAGS="" case $host_cpu in i*86|x86_64|amd64) @@ -524,6 +525,7 @@ case $host_cpu in AC_MSG_RESULT([yes]) AC_DEFINE([BUILD_NEON], [1], [Build NEON Code]) build_cpu_neon="yes" + NEON_CFLAGS="-mfpu=neon" ], [ AC_MSG_RESULT([no]) @@ -535,6 +537,7 @@ esac AC_SUBST([ALTIVEC_CFLAGS]) AC_SUBST([SSE3_CFLAGS]) +AC_SUBST([NEON_CFLAGS]) #### Checks for linker characteristics diff --git a/src/Makefile_Evas.am b/src/Makefile_Evas.am index 644391b056..1fa48564ef 100644 --- a/src/Makefile_Evas.am +++ b/src/Makefile_Evas.am @@ -138,7 +138,6 @@ lib/evas/common/evas_convert_gry_8.c \ lib/evas/common/evas_convert_main.c \ lib/evas/common/evas_convert_rgb_16.c \ lib/evas/common/evas_convert_rgb_24.c \ -lib/evas/common/evas_convert_rgb_32.c \ lib/evas/common/evas_convert_rgb_8.c \ lib/evas/common/evas_convert_grypal_6.c \ lib/evas/common/evas_convert_yuv.c \ @@ -230,13 +229,28 @@ $(lib_evas_libevas_la_CPPFLAGS) \ lib_evas_common_libevas_op_blend_sse3_la_LIBADD = @EVAS_LIBS@ lib_evas_common_libevas_op_blend_sse3_la_DEPENDENCIES = @EVAS_INTERNAL_LIBS@ +# maybe neon, maybe not +noinst_LTLIBRARIES += lib/evas/common/libevas_convert_rgb_32.la + +lib_evas_common_libevas_convert_rgb_32_la_SOURCES = \ +lib/evas/common/evas_convert_rgb_32.c + +lib_evas_common_libevas_convert_rgb_32_la_CPPFLAGS = -I$(top_builddir)/src/lib/efl \ +$(lib_evas_libevas_la_CPPFLAGS) \ +@NEON_CFLAGS@ + +lib_evas_common_libevas_convert_rgb_32_la_LIBADD 
= @EVAS_LIBS@ +lib_evas_common_libevas_convert_rgb_32_la_DEPENDENCIES = @EVAS_INTERNAL_LIBS@ + lib_evas_libevas_la_CXXFLAGS = lib_evas_libevas_la_LIBADD = \ lib/evas/common/libevas_op_blend_sse3.la \ +lib/evas/common/libevas_convert_rgb_32.la \ @EVAS_LIBS@ lib_evas_libevas_la_DEPENDENCIES = \ lib/evas/common/libevas_op_blend_sse3.la \ +lib/evas/common/libevas_convert_rgb_32.la \ @EVAS_INTERNAL_LIBS@ lib_evas_libevas_la_LDFLAGS = @EFL_LTLIBRARY_FLAGS@ diff --git a/src/lib/evas/common/evas_convert_rgb_32.c b/src/lib/evas/common/evas_convert_rgb_32.c index 11c47e26b0..aae9d37e12 100644 --- a/src/lib/evas/common/evas_convert_rgb_32.c +++ b/src/lib/evas/common/evas_convert_rgb_32.c @@ -1,5 +1,8 @@ #include "evas_common_private.h" #include "evas_convert_rgb_32.h" +#ifdef BUILD_NEON +#include <arm_neon.h> +#endif void evas_common_convert_rgba_to_32bpp_rgb_8888 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED) @@ -41,51 +44,143 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_180 (DATA32 *src, DATA8 *dst, int } #ifdef TILE_ROTATE +#ifdef BUILD_NEON +#define ROT90_QUAD_COPY_LOOP \ + if (evas_common_cpu_has_feature(CPU_FEATURE_NEON)) \ + { \ + if((w%4) == 0) \ + { \ + int klght = 4 * src_stride; \ + for(y = 0; y < h; y++) \ + { \ + const pix_type *s = &(src[(h - y - 1)]); \ + pix_type *d = &(dst[(dst_stride * y)]); \ + pix_type *ptr1 = s; \ + pix_type *ptr2 = ptr1 + src_stride; \ + pix_type *ptr3 = ptr2 + src_stride; \ + pix_type *ptr4 = ptr3 + src_stride; \ + for(x = 0; x < w; x+=4) \ + { \ + pix_type s_array[4] = {*ptr1, *ptr2, *ptr3, *ptr4}; \ + vst1q_s32(d, vld1q_s32(s_array)); \ + d += 4; \ + ptr1 += klght; \ + ptr2 += klght; \ + ptr3 += klght; \ + ptr4 += klght; \ + } \ + } \ + } \ + else \ + { \ + for (y = 0; y < h; y++) \ + { \ + const pix_type *s = &(src[(h - y - 1)]); \ + pix_type *d = &(dst[(dst_stride * y)]); \ + for (x = 0; x < w; x++) \ + { \ + *d++ = *s; \ + s += src_stride; \ + } \ 
+ } \ + } \ + } \ + else +#define ROT270_QUAD_COPY_LOOP \ + if (evas_common_cpu_has_feature(CPU_FEATURE_NEON)) \ + { \ + if((w%4) == 0) \ + { \ + int klght = 4 * src_stride; \ + for(y = 0; y < h; y++) \ + { \ + const pix_type *s = &(src[(src_stride * (w - 1)) + y]); \ + pix_type *d = &(dst[(dst_stride * y)]); \ + pix_type *ptr1 = s; \ + pix_type *ptr2 = ptr1 + src_stride; \ + pix_type *ptr3 = ptr2 + src_stride; \ + pix_type *ptr4 = ptr3 + src_stride; \ + for(x = 0; x < w; x+=4) \ + { \ + pix_type s_array[4] = {*ptr1, *ptr2, *ptr3, *ptr4}; \ + vst1q_s32(d, vld1q_s32(s_array)); \ + d += 4; \ + ptr1 += klght; \ + ptr2 += klght; \ + ptr3 += klght; \ + ptr4 += klght; \ + } \ + } \ + } \ + else \ + { \ + for (y = 0; y < h; y++) \ + { \ + const pix_type *s = &(src[(src_stride * (w - 1)) + y]); \ + pix_type *d = &(dst[(dst_stride * y)]); \ + for (x = 0; x < w; x++) \ + { \ + *d++ = *s; \ + s += src_stride; \ + } \ + } \ + } \ + } \ + else +#else +#define ROT90_QUAD_COPY_LOOP +#define ROT270_QUAD_COPY_LOOP +#endif #define FAST_SIMPLE_ROTATE(suffix, pix_type) \ static void \ - blt_rotated_90_trivial_##suffix(pix_type *dst, \ + blt_rotated_90_trivial_##suffix(pix_type * restrict dst, \ int dst_stride, \ - const pix_type *src, \ + const pix_type * restrict src, \ int src_stride, \ int w, \ int h) \ { \ int x, y; \ - for (y = 0; y < h; y++) \ - { \ - const pix_type *s = src + (h - y - 1); \ - pix_type *d = dst + (dst_stride * y); \ - for (x = 0; x < w; x++) \ - { \ - *d++ = *s; \ - s += src_stride; \ - } \ - } \ + ROT90_QUAD_COPY_LOOP \ + { \ + for (y = 0; y < h; y++) \ + { \ + const pix_type *s = &(src[(h - y - 1)]); \ + pix_type *d = &(dst[(dst_stride * y)]); \ + for (x = 0; x < w; x++) \ + { \ + *d++ = *s; \ + s += src_stride; \ + } \ + } \ + } \ } \ static void \ - blt_rotated_270_trivial_##suffix(pix_type *dst, \ + blt_rotated_270_trivial_##suffix(pix_type * restrict dst, \ int dst_stride, \ - const pix_type *src, \ + const pix_type * restrict src, \ int src_stride, \ int w, \ 
int h) \ { \ int x, y; \ - for (y = 0; y < h; y++) \ + ROT270_QUAD_COPY_LOOP \ + { \ + for(y = 0; y < h; y++) \ { \ - const pix_type *s = src + (src_stride * (w - 1)) + y; \ - pix_type *d = dst + (dst_stride * y); \ + const pix_type *s = &(src[(src_stride * (w - 1)) + y]); \ + pix_type *d = &(dst[(dst_stride * y)]); \ for (x = 0; x < w; x++) \ - { \ - *d++ = *s; \ - s -= src_stride; \ - } \ + { \ + *d++ = *s; \ + s -= src_stride; \ + } \ } \ + } \ } \ static void \ - blt_rotated_90_##suffix(pix_type *dst, \ + blt_rotated_90_##suffix(pix_type * restrict dst, \ int dst_stride, \ - const pix_type *src, \ + const pix_type * restrict src, \ int src_stride, \ int w, \ int h) \ @@ -120,7 +215,7 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_180 (DATA32 *src, DATA8 *dst, int { \ blt_rotated_90_trivial_##suffix(dst + x, \ dst_stride, \ - src + (src_stride * x), \ + &(src[(src_stride * x)]), \ src_stride, \ TILE_SIZE, \ h); \ @@ -128,15 +223,15 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_180 (DATA32 *src, DATA8 *dst, int if (trailing_pixels) \ blt_rotated_90_trivial_##suffix(dst + w, \ dst_stride, \ - src + (w * src_stride), \ + &(src[(w * src_stride)]), \ src_stride, \ trailing_pixels, \ h); \ } \ static void \ - blt_rotated_270_##suffix(pix_type *dst, \ + blt_rotated_270_##suffix(pix_type * restrict dst, \ int dst_stride, \ - const pix_type *src, \ + const pix_type * restrict src, \ int src_stride, \ int w, \ int h) \ @@ -151,7 +246,7 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_180 (DATA32 *src, DATA8 *dst, int leading_pixels = w; \ blt_rotated_270_trivial_##suffix(dst, \ dst_stride, \ - src + (src_stride * (w - leading_pixels)), \ + &(src[(src_stride * (w - leading_pixels))]), \ src_stride, \ leading_pixels, \ h); \ @@ -171,7 +266,7 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_180 (DATA32 *src, DATA8 *dst, int { \ blt_rotated_270_trivial_##suffix(dst + x, \ dst_stride, \ - src + (src_stride * (w - x - TILE_SIZE)), \ + &(src[(src_stride * (w - x - 
TILE_SIZE))]), \ src_stride, \ TILE_SIZE, \ h); \