evas common: NEON version of evas_common_convert_argb_premul.

This patch reduces power consumption by around 18mA in certain scenarios (music player list scroll, My Files sound list scroll) by making evas_common_convert_argb_premul() ~60% faster (6.2msec->2.6msec). Take the music-player application, make 100 copies of the standard “Over the Horizon” song, and scroll up and down to see those downscaled-from-720x720 thumbnails enter and leave the screen. Every time a list item enters the screen, the image is re-read (as the evas image cache is not large enough to store more than two pictures of that size), and one call of evas_common_convert_argb_premul() occurs, taking ~6.2msec (which is not much compared to the ~60msec spent in libpng->libz, the biggest bottleneck here, but still noticeable). A similar power consumption improvement is observed while scrolling the sounds list of the same files in the My Files application (just with the idle level ~100mA lower). We also checked the new code for correctness on random input data. All tests were performed on a Tizen device. Signed-Off-By: Artem Dergachev <dergachev.a@samsung.com>
2015-04-07 23:02:15 +09:00 · 2015-04-07 23:02:15 +09:00 · 51d60e649c
parent 2bbdc17bea
commit 51d60e649c
1 changed files with 40 additions and 0 deletions
--- a/src/lib/evas/common/evas_convert_color.c
+++ b/src/lib/evas/common/evas_convert_color.c
@@ -1,6 +1,10 @@
 #include "evas_common_private.h"
 #include "evas_convert_color.h"

+#ifdef BUILD_NEON
+#include <arm_neon.h>
+#endif
+
 EAPI DATA32
 evas_common_convert_ag_premul(DATA16 *data, unsigned int len)
 {
@@ -28,6 +32,42 @@ evas_common_convert_argb_premul(DATA32 *data, unsigned int len)
   DATA32 *de = data + len;
   DATA32 nas = 0;

+   #ifdef BUILD_NEON
+   if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+     {
+        uint8x8_t mask_0x00 = vdup_n_u8(0);
+        uint8x8_t mask_0x01 = vdup_n_u8(1);
+        uint8x8_t mask_0xff = vdup_n_u8(255);
+        uint8x8_t cmp;
+
+        while (data <= de - 8)
+          {
+
+             uint8x8x4_t rgba = vld4_u8(data);
+
+             cmp = vand_u8(vorr_u8(
+               vceq_u8(rgba.val[3], mask_0xff),
+               vceq_u8(rgba.val[3], mask_0x00)
+             ), mask_0x01);
+             nas += vpaddl_u32(vpaddl_u16(vpaddl_u8(cmp)));
+
+             uint16x8x4_t lrgba;
+
+             lrgba.val[0] = vmovl_u8(rgba.val[0]);
+             lrgba.val[1] = vmovl_u8(rgba.val[1]);
+             lrgba.val[2] = vmovl_u8(rgba.val[2]);
+
+             rgba.val[0] = vshrn_n_u16(vmlal_u8(lrgba.val[0], rgba.val[0], rgba.val[3]), 8);
+             rgba.val[1] = vshrn_n_u16(vmlal_u8(lrgba.val[1], rgba.val[1], rgba.val[3]), 8);
+             rgba.val[2] = vshrn_n_u16(vmlal_u8(lrgba.val[2], rgba.val[2], rgba.val[3]), 8);
+
+             vst4_u8(data, rgba);
+             data += 8;
+
+          }
+     }
+   #endif
+
   while (data < de)
     {
 	DATA32  a = 1 + (*data >> 24);