Evas TGV: Optimize loader using NEON intrinsics

This can speed up loading slightly (the exact benchmark numbers were not kept).
Patch by Vladimir Kuramshin.
This commit is contained in:
Jean-Philippe Andre 2014-06-12 17:47:56 +09:00
parent 29770922cf
commit ef80047ac9
1 changed file with 28 additions and 1 deletion

View File

@ -15,6 +15,9 @@
#include "rg_etc1.h"
#include "Evas_Loader.h"
#ifdef BUILD_NEON
#include <arm_neon.h>
#endif
/**************************************************************
* The TGV file format is oriented around compression mechanisms
* that hardware is good at decompressing. We do still provide
@ -381,7 +384,7 @@ evas_image_load_file_data_tgv(void *loader_data,
for (j = 0; j < loader->block.width; j += 4, it += etc_block_size)
{
Eina_Rectangle current_etc;
unsigned int temporary[4 * 4] = { 0 };
unsigned int temporary[4 * 4];
unsigned int offset_x, offset_y;
int k;
@ -414,6 +417,30 @@ evas_image_load_file_data_tgv(void *loader_data,
offset_x = current_etc.x - x - j;
offset_y = current_etc.y - y - i;
#ifdef BUILD_NEON
if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
{
uint32_t *dst = &p[current_etc.x - 1 + (current_etc.y - 1) * master.w];
uint32_t *src = &temporary[offset_x + offset_y * 4];
for (k = 0; k < current_etc.h; k++)
{
if (current_etc.w == 4)
vst1q_u32(dst, vld1q_u32(src));
else if (current_etc.w == 3)
{
vst1_u32(dst, vld1_u32(src));
*(dst + 2) = *(src + 2);
}
else if (current_etc.w == 2)
vst1_u32(dst, vld1_u32(src));
else
*dst = *src;
dst += master.w;
src += 4;
}
}
else
#endif
for (k = 0; k < current_etc.h; k++)
{
memcpy(&p[current_etc.x - 1 +