Span lists were a failure: slower than the current brute force method.

Ooh... found I was fucking up the memcpys. Fixed :) and much faster too :)
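For context on the memcpy fix below: the copy loops chunk each blit into fixed-size groups of dwords, and the diff changes that chunk from 10 to 16 everywhere at once. A minimal sketch (hypothetical names, not the actual evas code) of the invariant involved - the rounding of len, the per-iteration move, and the pointer advance must all use the same chunk size, or the unrolled loop and its scalar tail disagree about where the copied region ends:

/* Hypothetical sketch - not the actual evas code. An unrolled copy is
 * only correct when the rounding, the per-iteration move, and the
 * pointer advance all use the same chunk size. */
#include <stdint.h>

typedef uint32_t DATA32;

#define CHUNK 16 /* dwords moved per unrolled iteration */

static void
copy_dwords(const DATA32 *src, DATA32 *dst, int len)
{
   const DATA32 *src_ptr = src;
   DATA32 *dst_ptr = dst;
   DATA32 *dst_end = dst + len;
   /* round len down to whole chunks with the SAME constant the loop
    * advances by, or the two loops disagree about the boundary */
   DATA32 *dst_end_pre = dst + ((len / CHUNK) * CHUNK);

   while (dst_ptr < dst_end_pre)
     {
        for (int i = 0; i < CHUNK; i++) /* stands in for MOVE_16DWORDS_MMX */
          dst_ptr[i] = src_ptr[i];
        src_ptr += CHUNK; /* advance by exactly what was copied */
        dst_ptr += CHUNK;
     }
   while (dst_ptr < dst_end) *dst_ptr++ = *src_ptr++; /* scalar tail */
}

A power-of-two chunk is also cheaper: (len / 16) * 16 reduces to len & ~15, which the old factor of 10 never could.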


SVN revision: 13103

Carsten Haitzler  2005-01-27 10:05:41 +00:00
commit 44f0d70286
parent 2b34d43044

6 changed files with 51 additions and 130 deletions


@@ -204,13 +204,13 @@ evas_common_copy_pixels_rgba_to_rgba_mmx(DATA32 *src, DATA32 *dst, int len)
    src_ptr = src;
    dst_ptr = dst;
    dst_end_ptr = dst + len;
-   dst_end_ptr_pre = dst + ((len / 10) * 10);
+   dst_end_ptr_pre = dst + ((len / 16) * 16);
 
    while (dst_ptr < dst_end_ptr_pre)
      {
-	MOVE_10DWORDS_MMX(src_ptr, dst_ptr);
-	src_ptr+=10;
-	dst_ptr+=10;
+	MOVE_16DWORDS_MMX(src_ptr, dst_ptr);
+	src_ptr+=16;
+	dst_ptr+=16;
      }
    while (dst_ptr < dst_end_ptr)
      {
@@ -310,15 +310,14 @@ evas_common_copy_pixels_rgba_to_rgba_sse(DATA32 *src, DATA32 *dst, int len)
    src_ptr = src;
    dst_ptr = dst;
    dst_end_ptr = dst + len;
-   dst_end_ptr_pre = dst + ((len / 10) * 10);
+   dst_end_ptr_pre = dst + ((len / 16) * 16);
 
    while (dst_ptr < dst_end_ptr_pre)
      {
-	prefetch(&src_ptr[128]);
-	prefetch(&dst_ptr[128]);
-	MOVE_10DWORDS_MMX(src_ptr, dst_ptr);
-	src_ptr+=10;
-	dst_ptr+=10;
+	prefetch(&src_ptr[16]);
+	MOVE_16DWORDS_MMX(src_ptr, dst_ptr);
+	src_ptr+=16;
+	dst_ptr+=16;
      }
    while (dst_ptr < dst_end_ptr)
      {
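Worth noting in the SSE hunk above: the rewritten loop prefetches only the source, one 16-dword chunk (64 bytes) ahead, and drops the destination prefetch, since dst is only written here. A hedged sketch of that pattern using GCC's __builtin_prefetch in place of the evas prefetch() macro:

#include <stdint.h>

typedef uint32_t DATA32;

#define CHUNK 16 /* dwords; 64 bytes per iteration */

/* Sketch only: __builtin_prefetch stands in for the evas prefetch()
 * macro, and the inner loop for MOVE_16DWORDS_MMX. */
static void
copy_chunk_prefetched(const DATA32 *s, DATA32 *d)
{
   __builtin_prefetch(&s[CHUNK]); /* start pulling the NEXT source chunk */
   for (int i = 0; i < CHUNK; i++)
     d[i] = s[i];                 /* dst is write-only: no dst prefetch */
}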
@@ -373,21 +372,21 @@ evas_common_copy_pixels_rev_rgba_to_rgba_mmx(DATA32 *src, DATA32 *dst, int len)
 {
    DATA32 *src_ptr, *dst_ptr, *dst_end_ptr, *dst_end_ptr_pre;
 
-   src_ptr = src + len - 10;
-   dst_ptr = dst + len - 10;
+   src_ptr = src + len - 16;
+   dst_ptr = dst + len - 16;
    dst_end_ptr = dst;
-   dst_end_ptr_pre = dst + len - ((len / 10) * 10);
-   if (len >= 10)
+   dst_end_ptr_pre = dst + len - ((len / 16) * 16);
+   if (len >= 16)
      {
	while (dst_ptr >= dst_end_ptr_pre)
	  {
-	     MOVE_10DWORDS_MMX(src_ptr, dst_ptr);
-	     src_ptr-=10;
-	     dst_ptr-=10;
+	     MOVE_16DWORDS_MMX(src_ptr, dst_ptr);
+	     src_ptr-=16;
+	     dst_ptr-=16;
	  }
-	src_ptr+=9;
-	dst_ptr+=9;
+	src_ptr+=15;
+	dst_ptr+=15;
	while (dst_ptr >= dst_end_ptr)
	  {
	     *dst_ptr = *src_ptr;
@@ -415,23 +414,22 @@ evas_common_copy_pixels_rev_rgba_to_rgba_sse(DATA32 *src, DATA32 *dst, int len)
 {
    DATA32 *src_ptr, *dst_ptr, *dst_end_ptr, *dst_end_ptr_pre;
 
-   src_ptr = src + len - 10;
-   dst_ptr = dst + len - 10;
+   src_ptr = src + len - 16;
+   dst_ptr = dst + len - 16;
    dst_end_ptr = dst;
-   dst_end_ptr_pre = dst + len - ((len / 10) * 10);
-   if (len >= 10)
+   dst_end_ptr_pre = dst + len - ((len / 16) * 16);
+   if (len >= 16)
      {
	while (dst_ptr >= dst_end_ptr_pre)
	  {
-	     prefetch(&src_ptr[-128]);
-	     prefetch(&dst_ptr[-128]);
+	     prefetch(&src_ptr[-16]);
	     MOVE_10DWORDS_MMX(src_ptr, dst_ptr);
-	     src_ptr-=10;
-	     dst_ptr-=10;
+	     src_ptr-=16;
+	     dst_ptr-=16;
	  }
-	src_ptr+=9;
-	dst_ptr+=9;
+	src_ptr+=15;
+	dst_ptr+=15;
	while (dst_ptr >= dst_end_ptr)
	  {
	     *dst_ptr = *src_ptr;
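The two reverse-direction hunks apply the same 10-to-16 change to copies that walk downward (needed when src and dst overlap with dst at the higher address). One subtlety: after the chunked loop undershoots, the pointers sit a full chunk below the last uncopied pixel, hence the += 15 (chunk size minus one) before the scalar tail. (The SSE reverse hunk appears to keep MOVE_10DWORDS_MMX while stepping by 16; by the invariant above the move and the stride ought to agree.) A self-contained sketch of the pattern, with hypothetical names, mirroring the original's pointer style:

#include <stdint.h>

typedef uint32_t DATA32;

#define CHUNK 16

/* Hypothetical reverse copy: copy whole chunks from the top down, then
 * fix up the remaining len % CHUNK pixels one by one. */
static void
copy_dwords_rev(const DATA32 *src, DATA32 *dst, int len)
{
   const DATA32 *s = src + len - CHUNK; /* start of the last chunk */
   DATA32 *d = dst + len - CHUNK;
   DATA32 *d_end = dst;
   DATA32 *d_end_pre = dst + (len % CHUNK); /* == dst + len - ((len / CHUNK) * CHUNK) */

   if (len >= CHUNK)
     {
        while (d >= d_end_pre)
          {
             for (int i = CHUNK - 1; i >= 0; i--) /* ~MOVE_16DWORDS_MMX */
               d[i] = s[i];
             s -= CHUNK;
             d -= CHUNK;
          }
        /* d now sits one whole chunk below the last uncopied pixel;
         * step up CHUNK - 1 so it points exactly at it */
        s += CHUNK - 1;
        d += CHUNK - 1;
        while (d >= d_end) *d-- = *s--; /* scalar tail, descending */
     }
   else
     {
        s = src + len - 1;
        d = dst + len - 1;
        while (d >= d_end) *d-- = *s--;
     }
}

Quick check with len = 20: the chunked loop copies pixels 4..19, the pointers land at offset -12, stepping up 15 puts them at offset 3, and the tail copies 3, 2, 1, 0.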


@@ -76,22 +76,13 @@ evas_common_image_shutdown(void)
 #endif
 }
 
-/* alpha tiles! - asctually span lists - need to do it as span lists */
+#if 0
 void
 evas_common_image_surface_alpha_tiles_calc(RGBA_Surface *is, int tsize)
 {
    int x, y;
    DATA32 *ptr;
 
-#if 1
-   return;
-#endif
-   /* hmm i only get about a 15% speedup on my "best cases". the complexity
-    * imho isn't worth the small gain, so i have disabled it here :( (this
-    * is best case scenario - average case will be much less gain)
-    *
-    * thought for now the only case is
-    */
    if (is->spans) return;
    if (!(is->im->flags & RGBA_IMAGE_HAS_ALPHA)) return;
    /* FIXME: dont handle alpha only images yet */
@@ -149,26 +140,7 @@ evas_common_image_surface_alpha_tiles_calc(RGBA_Surface *is, int tsize)
	  }
      }
 }
+#endif
 
-void
-evas_common_image_surface_alpha_tiles_free(RGBA_Surface *is)
-{
-   int i;
-
-   if (!is->spans) return;
-   for (i = 0; i < is->h; i++)
-     {
-	while (is->spans[i])
-	  {
-	     RGBA_Image_Span *sp;
-
-	     sp = is->spans[i];
-	     is->spans[i] = evas_object_list_remove(sp, sp);
-	     free(sp);
-	  }
-     }
-   free(is->spans);
-}
-
 RGBA_Surface *
 evas_common_image_surface_new(RGBA_Image *im)
@@ -219,7 +191,6 @@ evas_common_image_surface_dealloc(RGBA_Surface *is)
	free(is->data);
	is->data = NULL;
      }
-   evas_common_image_surface_alpha_tiles_free(is);
 }
 
 RGBA_Image *
@@ -524,7 +495,6 @@ evas_common_image_dirty(RGBA_Image *im)
 {
    int i;
 
-   if (im->image) evas_common_image_surface_alpha_tiles_free(im->image);
    evas_common_image_unstore(im);
    im->flags |= RGBA_IMAGE_IS_DIRTY;
 }


@@ -209,8 +209,6 @@ SCALE_FUNC(RGBA_Image *src, RGBA_Image *dst,
     * -:-
     *
     */
 
-   /* 8x8 tiles - this will incurr about a < 2% memory overhead */
-   evas_common_image_surface_alpha_tiles_calc(src->image, 8);
    /* if 1:1 scale */
    if ((dst_region_w == src_region_w) &&


@@ -32,63 +32,11 @@
    Gfx_Func_Blend_Src_Dst func;
 
    func = evas_common_draw_func_blend_get(src, dst, dst_clip_w);
-#if 0
-   /* part of the spans experiemnt. doesnt seem to help much on top of
-    * what we already have
-    */
-   if (src->image->spans)
+   for (y = 0; y < dst_clip_h; y++)
      {
-	int x2, y2;
-	int xoff, woff;
-	RGBA_Image_Flags pflags;
-	Gfx_Func_Blend_Src_Dst func_solid;
-
-	pflags = src->flags;
-	src->flags &= ~RGBA_IMAGE_HAS_ALPHA;
-	func_solid = evas_common_draw_func_blend_get(src, dst, dst_clip_w);
-	src->flags = pflags;
-	x2 = (dst_clip_x - dst_region_x) + src_region_x;
-	y2 = (dst_clip_y - dst_region_y) + src_region_y;
-	for (y = 0; y < dst_clip_h; y++, y2++)
-	  {
-	     Evas_Object_List *l;
-
-	     for (l = src->image->spans[y2]; l; l = l->next)
-	       {
-		  RGBA_Image_Span *sp;
-
-		  sp = l;
-		  if ((sp->x + sp->w) > x2)
-		    {
-		       xoff = sp->x - x2;
-		       woff = sp->w;
-		       if (xoff < 0)
-			 {
-			    woff += xoff;
-			    xoff = 0;
-			 }
-		       if ((xoff + woff) > (dst_clip_w))
-			 woff += (dst_clip_w) - (xoff + woff);
-		       if (sp->v == 2)
-			 func_solid(ptr + xoff, dst_ptr + xoff, woff);
-		       else
-			 func(ptr + xoff, dst_ptr + xoff, woff);
-		    }
-	       }
-	     ptr += src_w;
-	     dst_ptr += dst_w;
-	  }
-     }
-   else
-#endif
-     {
-	for (y = 0; y < dst_clip_h; y++)
-	  {
-	     func(ptr, dst_ptr, dst_clip_w);
-	     ptr += src_w;
-	     dst_ptr += dst_w;
-	  }
+	func(ptr, dst_ptr, dst_clip_w);
+	ptr += src_w;
+	dst_ptr += dst_w;
      }
  }
 }
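For the record, this is the experiment being deleted: each surface row carried a list of horizontal spans (x, w, plus a flag v where 2 marked a fully opaque run), letting the blender skip transparent gaps and take a cheaper solid blend for opaque runs. A reduced reconstruction of the idea, with the evas types replaced by self-contained stand-ins and runs assumed x-sorted:

/* Reduced sketch of the removed span-list experiment. Types and names
 * are simplified stand-ins for the evas ones. */
typedef struct Span Span;
struct Span
{
   Span *next;
   int   x, w; /* horizontal extent of the run */
   int   v;    /* 2 == fully opaque, else needs alpha blending */
};

typedef void (*Blend_Func)(const unsigned int *src, unsigned int *dst, int len);

static void
blend_row_spans(const Span *row, const unsigned int *src, unsigned int *dst,
                int clip_x, int clip_w, Blend_Func blend, Blend_Func solid)
{
   for (const Span *sp = row; sp; sp = sp->next)
     {
        int x = sp->x - clip_x, w = sp->w;

        if (x + w <= 0) continue;          /* run entirely left of clip */
        if (x < 0) { w += x; x = 0; }      /* clamp to clip start */
        if (x >= clip_w) break;            /* runs assumed x-sorted; done */
        if (x + w > clip_w) w = clip_w - x;
        (sp->v == 2 ? solid : blend)(src + x, dst + x, w);
     }
}

Per the comment deleted from evas_common_image_surface_alpha_tiles_calc, this bought roughly 15% in the best case, which did not justify the per-span bookkeeping over a plain full-row blend.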


@@ -291,14 +291,6 @@ struct _RGBA_Surface
    DATA32        *data;
    char           no_free : 1;
    RGBA_Image    *im;
-   RGBA_Image_Span **spans;
-};
-
-struct _RGBA_Image_Span
-{
-   Evas_Object_List _list_data;
-   int x, w;
-   int v;
 };
 
 struct _RGBA_Image
@@ -769,9 +761,6 @@ void evas_common_scale_rgba_in_to_out_clip_sample (RGBA_Image *src, RGBA_Im
 /****/
 void evas_common_image_init (void);
 void evas_common_image_shutdown (void);
 
-void evas_common_image_surface_alpha_tiles_calc(RGBA_Surface *is, int tsize);
-void evas_common_image_surface_alpha_tiles_free(RGBA_Surface *is);
-
 RGBA_Surface *evas_common_image_surface_new (RGBA_Image *im);
 void evas_common_image_surface_free (RGBA_Surface *is);


@@ -573,6 +573,24 @@ typedef union {
 	              : \
 	              : "r" (var) \
 	             );
+#define prefetch0(var) \
+	__asm__ __volatile__ ( \
+	"prefetcht0 (%0) \n" \
+	              : \
+	              : "r" (var) \
+	             );
+#define prefetch1(var) \
+	__asm__ __volatile__ ( \
+	"prefetcht1 (%0) \n" \
+	              : \
+	              : "r" (var) \
+	             );
+#define prefetch2(var) \
+	__asm__ __volatile__ ( \
+	"prefetcht2 (%0) \n" \
+	              : \
+	              : "r" (var) \
+	             );
 #define pshufw(r1, r2, imm) \
 	__asm__ __volatile__ ( \
 	"pshufw $" #imm ", %" #r1 ", %" #r2 " \n" \