ok.. mmx asm for routines again.. and this time... they seem to not segv :)

SVN revision: 2538
2000-04-26 19:36:20 +00:00 · 2000-04-26 19:36:20 +00:00 · bf98465ea4
parent 7c3d60e75b
commit bf98465ea4
15 changed files with 2370 additions and 735 deletions
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -20,7 +20,7 @@ libImlib2_la_SOURCES = rend.c ximage.c scale.c rgba.c image.c color.c grab.c \
 		       color.h draw.h rend.h ximage.h colormod.h file.h \
 		       rgba.h common.h grab.h rgbadraw.h font.h format.h \
 		       rotate.h grad.h filter.h \
-		       asm_blend.S filter.c asm_rgba.S
+		       asm_blend.S filter.c asm_rgba.S asm_scale.S asm_rotate.S
 libImlib2_la_LIBADD   = @DLLDFLAGS@ $(top_builddir)/libltdl/libltdlc.la \
                       -lX11 -lXext -lttf $(LDFLAGS)
 libImlib2_la_DEPENDENCIES = $(top_builddir)/config.h
--- a/src/api.c
+++ b/src/api.c
@ -5,8 +5,8 @@
 #include <string.h>
 #include "common.h"
 #include "colormod.h"
-#include "scale.h"
 #include "image.h"
+#include "scale.h"
 #include "context.h"
 #include "rgba.h"
 #include "color.h"
@ -2201,8 +2201,14 @@ imlib_create_rotated_image(double angle)
   
   if (ctxt_anti_alias) 
     {
-	__imlib_RotateAA(im_old->data, im->data, im_old->w,
-			 im_old->w, im_old->h, im->w, sz, sz, x, y, dx, dy);
+#ifdef DO_MMX_ASM
+        if (__imlib_get_cpuid() & CPUID_MMX)
+	   __imlib_mmx_RotateAA(im_old->data, im->data, im_old->w,
+			    im_old->w, im_old->h, im->w, sz, sz, x, y, dx, dy);
+	else
+#endif
+	   __imlib_RotateAA(im_old->data, im->data, im_old->w,
+			    im_old->w, im_old->h, im->w, sz, sz, x, y, dx, dy);
     } else 
     {
 	__imlib_RotateSample(im_old->data, im->data, im_old->w,
--- a/src/asm_rgba.S
+++ b/src/asm_rgba.S
@ -32,6 +32,9 @@
 .global __imlib_mmx_bgr555_fast
 	.type __imlib_mmx_bgr555_fast,@function

+.global __imlib_get_cpuid
+	.type __imlib_get_cpuid,@function
+
 .bss
 .text
 .align 8
@ -240,4 +243,34 @@ __imlib_mmx_rgb555_fast:
 	LOOP_END
 	LEAVE

+
+__imlib_get_cpuid:	/*\ Probe the CPU with the cpuid instruction.
+|*| Takes no arguments; the result is returned in %eax:
+|*|   0 if the cpuid instruction is not available or only leaf 0 exists,
+|*|   otherwise the leaf-1 feature word from %edx with bits 8-11 replaced
+|*|   by bits 8-11 of the leaf-1 %eax value (the family field).
+|*| %ebx and %edx are saved and restored; %ecx is overwritten by cpuid.
+\*/
+	pushl %ebx	/*\ cpuid overwrites %ebx \*/
+	pushl %edx	/*\ cpuid overwrites %edx \*/
+
+	pushf
+	popl %eax	/*\ %eax = current EFLAGS \*/
+	movl %eax, %ebx	/*\ keep a copy for the comparison below \*/
+	xorl $0x200000, %eax	/*\ flip bit 21 (the ID flag) \*/
+	pushl %eax
+	popf		/*\ try to write the flipped value back \*/
+	pushf
+	popl %eax	/*\ read EFLAGS again \*/
+	xorl %ebx, %eax	/*\ bits that actually changed \*/
+	andl $0x200000, %eax	/*\ nonzero only if bit 21 could be toggled \*/
+	jz 1f		/*\ no cpuid instruction: return 0 \*/
+	xorl %eax, %eax	/*\ leaf 0: %eax returns the highest supported leaf \*/
+	cpuid
+	testl %eax, %eax
+	jz 1f		/*\ leaf 1 not supported: return 0 \*/
+	movl $1, %eax	/*\ leaf 1: %eax = signature, %edx = feature flags \*/
+	cpuid
+	and $0x00000f00, %eax	/*\ keep only bits 8-11 of the signature \*/
+	and $0xfffff0ff, %edx	/*\ make room for them in the feature word \*/
+	orl %edx, %eax	/*\ result = features | family bits \*/
+1:
+	popl %edx
+	popl %ebx
+	ret
+
 #endif
--- a/src/asm_rotate.S
+++ b/src/asm_rotate.S
@ -0,0 +1,446 @@
+#include <config.h>
+
+/*\ 
+|*| MMX assembly rotation routine for Imlib2
+|*| Written by Willem Monsuwe <willem@stack.nl>
+\*/
+
+.global __imlib_mmx_RotateAA
+	.type __imlib_mmx_RotateAA,@function
+
+.bss
+.text
+.align 8
+
+/*\ Prototype: __imlib_mmx_RotateAA(DATA32 *src, DATA32 *dest, int sow, int sw,
+|*|	int sh, int dow, int dw, int dh, int x, int y, int dx, int dy)
+\*/
+
+#define src	8(%ebp)
+#define dest	12(%ebp)
+#define sow	16(%ebp)
+#define sw	20(%ebp)
+#define sh	24(%ebp)
+#define dow	28(%ebp)
+#define dw	32(%ebp)
+#define dh	36(%ebp)
+#define x	40(%ebp)
+#define y	44(%ebp)
+#define dx	48(%ebp)
+#define dy	52(%ebp)
+
+/*\ Local variables \*/
+#define j	-4(%ebp)
+#define dly	-8(%ebp)
+#define dlx	-12(%ebp)
+#define sht	-16(%ebp)
+#define swt	-20(%ebp)
+#define m0fffh	-24(%ebp)
+#define m0fff	-28(%ebp)
+#define mulsow	-32(%ebp)
+
+#ifdef DO_MMX_ASM
+
+/*\ __imlib_mmx_RotateAA: rotate with bilinear interpolation using MMX.
+|*|
+|*| Arguments are taken from the stack through the macros defined above.
+|*| x, y, dx and dy are fixed point with 12 fraction bits: the pixel index
+|*| is (value >> 12) and the interpolation weight is (value & 0xfff).
+|*|
+|*| For every destination pixel the source position moves by (dx, dy).
+|*| At the end of a row (dlx, dly) is added, so that the next row starts
+|*| at (-dy, +dx) relative to the start of the previous row.
+|*|
+|*| Register usage inside the loops:
+|*|   %mm6 = current source position (low dword x, high dword y)
+|*|   %mm7 = 0, used to unpack bytes to words
+|*|   %edi = end of the current destination row
+|*|   %ecx = column index, running from -dw up to 0
+|*|   %ebx = sow, %edx = src, %esi = address of the upper left source pixel
+|*|
+|*| The following stack slots are read as 64 bit pairs and must stay
+|*| adjacent: x/y, dx/dy, dlx/dly and m0fff/m0fffh.  pmaddwd also reads
+|*| m0fff as the high half of mulsow, but only the low dword of that
+|*| result is used.
+|*|
+|*| .rotate_inside runs without per-pixel bounds checks and is only taken
+|*| when all four corners of the sampled area pass the test below.
+|*| .rotate_outside checks every pixel, writes 0 for positions completely
+|*| outside the source and handles the edges in .interp_rgb_a0 and
+|*| .interp_a000.
+|*|
+|*| NOTE(review): the corner test compares the fixed point coordinates
+|*| with sw and sh directly, while .rotate_outside compares them with
+|*| (sw - 1) << 12 and (sh - 1) << 12.  As written the test is stricter
+|*| than needed (safe, but the fast loop is rarely used) - confirm that
+|*| this is intended.
+\*/
+__imlib_mmx_RotateAA:
+	pushl %ebp
+	movl %esp, %ebp
+	subl $40, %esp
+	pushl %ebx
+	pushl %ecx
+	pushl %edx
+	pushl %edi
+	pushl %esi
+
+	/*\ Check (dw > 0) && (dh > 0) \*/
+	cmpl $0, dw
+	jle .rotate_leave
+	cmpl $0, dh
+	jle .rotate_leave
+
+	pxor %mm7, %mm7
+	/*\ mulsow = (sow << 16) | 1: pmaddwd then gives x * 1 + y * sow \*/
+	movl sow, %eax
+	sall $16, %eax
+	orl $1, %eax
+	movl %eax, mulsow
+	movl $0x0fff, %eax
+	movl %eax, m0fff
+	movl %eax, m0fffh
+
+	/*\ mm6 = x, y \*/
+	movq x, %mm6
+
+	/*\ edi = dest + dw \*/
+	movl dest, %edi
+	movl dw, %eax
+	leal (%edi, %eax, 4), %edi
+
+	/*\ dlx = -dy - dw * dx \*/
+	movl dw, %eax
+	imull dx, %eax
+	negl %eax
+	subl dy, %eax
+	movl %eax, dlx
+
+	/*\ dly = dx - dw * dy \*/
+	movl dw, %eax
+	imull dy, %eax
+	negl %eax
+	addl dx, %eax
+	movl %eax, dly
+
+	/*\ j = dh \*/
+	movl dh, %eax
+	movl %eax, j
+
+	/*\ Check if all coordinates will be inside the source \*/
+	/*\ The compares are unsigned, so negative values fail as well \*/
+	/*\ x < sw \*/
+	movl sw, %edx
+	movl x, %ecx
+	cmpl %edx, %ecx
+	jae .rotate_outside
+	/*\ x + dx * dw < sw \*/
+	movl dx, %ebx
+	imull dw, %ebx
+	addl %ebx, %ecx
+	cmpl %edx, %ecx
+	jae .rotate_outside
+	/*\ x + dx * dw - dy * dh < sw \*/
+	movl dy, %eax
+	imull dh, %eax
+	subl %eax, %ecx
+	cmpl %edx, %ecx
+	jae .rotate_outside
+	/*\ x - dy * dh < sw \*/
+	subl %ebx, %ecx
+	cmpl %edx, %ecx
+	jae .rotate_outside
+
+	/*\ y < sh \*/
+	movl sh, %edx
+	movl y, %ecx
+	cmpl %edx, %ecx
+	jae .rotate_outside
+	/*\ y + dy * dw < sh  (y moves by dy per pixel, so use dy here) \*/
+	movl dy, %ebx
+	imull dw, %ebx
+	addl %ebx, %ecx
+	cmpl %edx, %ecx
+	jae .rotate_outside
+	/*\ y + dy * dw + dx * dh < sh  (y moves by dx per row) \*/
+	movl dx, %eax
+	imull dh, %eax
+	addl %eax, %ecx
+	cmpl %edx, %ecx
+	jae .rotate_outside
+	/*\ y + dx * dh < sh \*/
+	subl %ebx, %ecx
+	cmpl %edx, %ecx
+	jae .rotate_outside
+
+.rotate_inside:
+	movl sow, %ebx
+	movl src, %edx
+.inside_loop_y:
+
+	/*\ i = -dw \*/
+	movl dw, %ecx
+	negl %ecx
+.inside_loop_x:
+	/*\ esi = src + x >> 12 + (y >> 12) * sow \*/
+	movq %mm6, %mm0
+	psrad $12, %mm0
+	packssdw %mm0, %mm0
+	pmaddwd mulsow, %mm0
+	movd %mm0, %eax
+	leal (%edx, %eax, 4), %esi
+
+	/*\ x and y \*/
+	movq %mm6, %mm0
+	pand m0fff, %mm0
+	movq %mm0, %mm1
+	/*\ mm0 = x & 0xfff \*/
+	punpcklwd %mm0, %mm0
+	punpckldq %mm0, %mm0
+	/*\ mm1 = y & 0xfff \*/
+	punpckhwd %mm1, %mm1
+	punpckldq %mm1, %mm1
+
+	/*\ Load and unpack four pixels in parallel
+	|*| %mm2 = ptr[0],   %mm3 = ptr[1]
+	|*| %mm4 = ptr[sow], %mm5 = ptr[sow + 1]
+	\*/
+	movq (%esi), %mm2
+	movq (%esi, %ebx, 4), %mm4
+	movq %mm2, %mm3
+	movq %mm4, %mm5
+	punpcklbw %mm7, %mm2
+	punpcklbw %mm7, %mm4
+	punpckhbw %mm7, %mm3
+	punpckhbw %mm7, %mm5
+
+	/*\ X interpolation: r = l + (r - l) * xap \*/
+	psubw %mm2, %mm3
+	psubw %mm4, %mm5
+	psllw $4, %mm3
+	psllw $4, %mm5
+	pmulhw %mm0, %mm3
+	pmulhw %mm0, %mm5
+	paddw %mm2, %mm3
+	paddw %mm4, %mm5
+
+	/*\ Y interpolation: d = u + (d - u) * yap \*/
+	psubw %mm3, %mm5
+	psllw $4, %mm5
+	pmulhw %mm1, %mm5
+	paddw %mm3, %mm5
+	packuswb %mm5, %mm5
+	movd %mm5, (%edi, %ecx, 4)
+
+	/*\ x += dx; y += dy  (dx and dy are read as one 64 bit pair) \*/
+	paddd dx, %mm6
+
+	incl %ecx
+	jnz .inside_loop_x
+
+	/*\ x += dlx; y += dly  (read as one 64 bit pair) \*/
+	paddd dlx, %mm6
+	movl dow, %ecx
+	leal (%edi, %ecx, 4), %edi
+	decl j
+	jnz .inside_loop_y
+
+	jmp .rotate_leave
+
+.rotate_outside:
+	/*\ swt = (sw - 1) << 12; sht = (sh - 1) << 12 \*/
+	movl sw, %eax
+	decl %eax
+	sall $12, %eax
+	movl %eax, swt
+	movl sh, %eax
+	decl %eax
+	sall $12, %eax
+	movl %eax, sht
+
+	movl sow, %ebx
+	movl src, %edx
+.outside_loop_y:
+
+	/*\ i = -dw \*/
+	movl dw, %ecx
+	negl %ecx
+.outside_loop_x:
+	/*\ esi = src + x >> 12 + (y >> 12) * sow \*/
+	movq %mm6, %mm0
+	psrad $12, %mm0
+	packssdw %mm0, %mm0
+	pmaddwd mulsow, %mm0
+	movd %mm0, %eax
+	leal (%edx, %eax, 4), %esi
+
+	/*\ x & 0xfff and y & 0xfff \*/
+	movq %mm6, %mm0
+	pand m0fff, %mm0
+	movq %mm0, %mm1
+
+	/*\ x < swt \*/
+	movq %mm6, %mm2
+	psrlq $32, %mm2
+	movd %mm6, %eax
+	cmpl swt, %eax
+	jae 2f
+
+	/*\ y < sht \*/
+	movd %mm2, %eax
+	cmpl sht, %eax
+	jae 1f
+	/*\ 1234 \*/
+.interp_argb:
+	/*\ Unpack x and y \*/
+	punpcklwd %mm0, %mm0
+	punpckldq %mm0, %mm0
+	punpckhwd %mm1, %mm1
+	punpckldq %mm1, %mm1
+	/*\ Load and unpack four pixels in parallel
+	|*| %mm2 = ptr[0],   %mm3 = ptr[1]
+	|*| %mm4 = ptr[sow], %mm5 = ptr[sow + 1]
+	\*/
+	movq (%esi), %mm2
+	movq (%esi, %ebx, 4), %mm4
+	movq %mm2, %mm3
+	movq %mm4, %mm5
+	punpcklbw %mm7, %mm2
+	punpcklbw %mm7, %mm4
+	punpckhbw %mm7, %mm3
+	punpckhbw %mm7, %mm5
+
+	/*\ X interpolation: r = l + (r - l) * xap \*/
+	psubw %mm2, %mm3
+	psubw %mm4, %mm5
+	psllw $4, %mm3
+	psllw $4, %mm5
+	pmulhw %mm0, %mm3
+	pmulhw %mm0, %mm5
+	paddw %mm2, %mm3
+	paddw %mm4, %mm5
+
+	/*\ Y interpolation: d = u + (d - u) * yap \*/
+	psubw %mm3, %mm5
+	psllw $4, %mm5
+	pmulhw %mm1, %mm5
+	paddw %mm3, %mm5
+	packuswb %mm5, %mm5
+	movd %mm5, (%edi, %ecx, 4)
+	jmp .outside_il_end
+1:
+	/*\ (-y-1) < 4096 \*/
+	notl %eax
+	cmpl $4095, %eax
+	ja 1f
+	/*\ ..34 \*/
+	pxor m0fff, %mm1
+	movd (%esi, %ebx, 4), %mm2
+	movd 4(%esi, %ebx, 4), %mm4
+
+.interp_rgb_a0:
+	/*\ Unpack x and y \*/
+	punpcklwd %mm0, %mm0
+	punpckldq %mm0, %mm0
+	punpckhwd %mm1, %mm1
+	/*\ Unpack two pixels \*/
+	punpcklbw %mm7, %mm2
+	punpcklbw %mm7, %mm4
+	/*\ Interpolate \*/
+	psubw %mm2, %mm4
+	psllw $4, %mm4
+	pmulhw %mm0, %mm4
+	paddw %mm2, %mm4
+	/*\ Separate out alpha, multiply with mm1, and subtract \*/
+	movq %mm4, %mm2
+	psllq $48, %mm1
+	psllw $4, %mm4
+	pmulhw %mm1, %mm4
+	psubw %mm4, %mm2
+	packuswb %mm2, %mm2
+	movd %mm2, (%edi, %ecx, 4)
+	jmp .outside_il_end
+1:
+	/*\ (y - sht) < 4096 \*/
+	notl %eax
+	subl sht, %eax
+	cmpl $4095, %eax
+	ja .outside_il_0
+	/*\ 12.. \*/
+	movd (%esi), %mm2
+	movd 4(%esi), %mm4
+	jmp .interp_rgb_a0
+2:
+	/*\ Switch x and y \*/
+	psrlq $32, %mm0
+	psllq $32, %mm1
+	/*\ -x-1 < 4096 \*/
+	notl %eax
+	cmpl $4095, %eax
+	ja 2f
+
+	pxor m0fff, %mm1
+	/*\ y < sht \*/
+	movd %mm2, %eax
+	cmpl sht, %eax
+	jae 1f
+	/*\ .2.4 \*/
+	movd 4(%esi), %mm2
+	movd 4(%esi, %ebx, 4), %mm4
+	jmp .interp_rgb_a0
+1:
+	/*\ (-y-1) < 4096 \*/
+	notl %eax
+	cmpl $4095, %eax
+	ja 1f
+	/*\ ...4 \*/
+	movd 4(%esi, %ebx, 4), %mm2
+.interp_a000:
+	/*\ Separate out alpha, multiply with mm0 and mm1 \*/
+	pxor m0fff, %mm1
+	punpcklbw %mm7, %mm2
+	movq %mm2, %mm3
+	psllq $2, %mm0
+	psrlq $30, %mm1
+	pmulhw %mm0, %mm1
+	pxor m0fff, %mm1
+	psllq $48, %mm1
+	psllw $4, %mm3
+	pmulhw %mm1, %mm3
+	psubw %mm3, %mm2
+	packuswb %mm2, %mm2
+	movd %mm2, (%edi, %ecx, 4)
+	jmp .outside_il_end
+1:
+	/*\ (y - sht) < 4096 \*/
+	notl %eax
+	subl sht, %eax
+	cmpl $4095, %eax
+	ja .outside_il_0
+	/*\ .2.. \*/
+	pxor m0fff, %mm0
+	movd 4(%esi), %mm2
+	jmp .interp_a000
+2:
+	/*\ (x - swt) < 4096 \*/
+	notl %eax
+	subl swt, %eax
+	cmpl $4095, %eax
+	ja .outside_il_0
+
+	/*\ y < sht \*/
+	movd %mm2, %eax
+	cmpl sht, %eax
+	jae 1f
+	/*\ 1.3. \*/
+	movd (%esi), %mm2
+	movd (%esi, %ebx, 4), %mm4
+	jmp .interp_rgb_a0
+1:
+	/*\ (-y-1) < 4096 \*/
+	notl %eax
+	cmpl $4095, %eax
+	ja 1f
+	/*\ ..3. \*/
+	movd (%esi, %ebx, 4), %mm2
+	jmp .interp_a000
+1:
+	/*\ (y - sht) < 4096 \*/
+	notl %eax
+	subl sht, %eax
+	cmpl $4095, %eax
+	ja .outside_il_0
+	/*\ 1... \*/
+	pxor m0fff, %mm0
+	movd (%esi), %mm2
+	jmp .interp_a000
+
+.outside_il_0:
+	/*\ Completely outside the source: store 0 \*/
+	movl $0, %eax
+	movl %eax, (%edi, %ecx, 4)
+
+.outside_il_end:
+	/*\ x += dx; y += dy \*/
+	paddd dx, %mm6
+
+	incl %ecx
+	jnz .outside_loop_x
+
+	/*\ x += dlx; y += dly \*/
+	paddd dlx, %mm6
+	movl dow, %ecx
+	leal (%edi, %ecx, 4), %edi
+	decl j
+	jnz .outside_loop_y
+
+.rotate_leave:
+	/*\ Leave MMX state so that the FPU can be used again \*/
+	emms
+	popl %esi
+	popl %edi
+	popl %edx
+	popl %ecx
+	popl %ebx
+	movl %ebp, %esp
+	popl %ebp
+	ret
+#endif
+
--- a/src/asm_scale.S
+++ b/src/asm_scale.S
@ -0,0 +1,791 @@
+#include <config.h>
+
+/*\ 
+|*| MMX assembly scaling routine for Imlib2
+|*| Written by Willem Monsuwe <willem@stack.nl>
+\*/
+
+.global __imlib_Scale_mmx_AARGBA
+	.type __imlib_Scale_mmx_AARGBA,@function
+
+.bss
+.text
+.align 8
+
+/*\ Prototype: __imlib_Scale_mmx_AARGBA(ImlibScaleInfo *isi, DATA32 *dest,
+|*|	int dxx, int dyy, int dx, int dy, int dw, int dh, int dow, int sow)
+\*/
+
+#define isi	8(%ebp)
+#define dest	12(%ebp)
+#define dxx	16(%ebp)
+#define dyy	20(%ebp)
+#define dx	24(%ebp)
+#define dy	28(%ebp)
+#define dw	32(%ebp)
+#define dh	36(%ebp)
+#define dow	40(%ebp)
+#define sow	44(%ebp)
+
+/*\ Local variables that didn't fit in registers \*/
+#define y	-4(%ebp)
+#define yp	-8(%ebp)
+#define yap	-12(%ebp)
+#define xp	-16(%ebp)
+#define xap	-20(%ebp)
+#define Cx	-24(%ebp)
+#define Mx	-28(%ebp)
+#define Cy	-32(%ebp)
+#define My	-36(%ebp)
+#define sow_4	-40(%ebp)
+
+/*\ When %edx points to ImlibScaleInfo, these are the members \*/
+#define xpoints		(%edx)
+#define ypoints		4(%edx)
+#define xapoints	8(%edx)
+#define yapoints	12(%edx)
+#define xup_yup		16(%edx)
+
+#ifdef DO_MMX_ASM
+
+__imlib_Scale_mmx_AARGBA:	/*\ Anti-aliased RGBA scaling using MMX.
+|*| Arguments are taken from the stack through the macros defined above.
+|*| The low two bits of xup_yup select one of four loops: bit 0 is
+|*| tested first (x direction), then bit 1 (y direction); a set bit
+|*| selects the "up" variant for that direction.
+|*|
+|*| Register usage shared by all four loops:
+|*|   %edi = dest + dow * dy + dx + dw, advanced by dow after each row
+|*|   %ecx = x, running from -dw up to 0 (indexes xp, xap and %edi)
+|*|   %esi = source pixel address, *yp + xp[x]
+|*|   %mm7 = 0, used to unpack bytes to words
+|*|
+|*| Scaling up: the xap / yap entry is an interpolation weight w and the
+|*| result is a + (b - a) * w / 256.
+|*| Scaling down: the low 16 bits of the entry (Mx / My) are the weight
+|*| of the first source pixel and the high 16 bits (Cx / Cy) the weight
+|*| of each following pixel; pixels are added until the weights sum up
+|*| to 0x4000.
+|*|
+|*| All registers pushed below are restored at .scale_leave, which also
+|*| executes emms.
+\*/
+	pushl %ebp
+	movl %esp, %ebp
+	subl $40, %esp
+	pushl %ebx
+	pushl %ecx
+	pushl %edx
+	pushl %edi
+	pushl %esi
+	movl isi, %edx
+
+	/*\ Check (dw > 0) && (dh > 0) \*/
+	cmpl $0, dw
+	jle .scale_leave
+	cmpl $0, dh
+	jle .scale_leave
+
+	/*\ X-based array pointers point to the end; we're looping up to 0 \*/
+	/*\ %edi = dest + dow * dy + dx + dw \*/
+	movl dow, %eax
+	imull dy, %eax
+	addl dx, %eax
+	addl dw, %eax
+	movl dest, %edi
+	leal (%edi, %eax, 4), %edi
+	/*\ xp = xpoints + dxx + dw \*/
+	movl dxx, %ebx
+	addl dw, %ebx
+	movl xpoints, %eax
+	leal (%eax, %ebx, 4), %eax
+	movl %eax, xp
+	/*\ xap = xapoints + dxx + dw \*/
+	movl xapoints, %eax
+	leal (%eax, %ebx, 4), %eax
+	movl %eax, xap
+	/*\ y = dh \*/
+	movl dh, %eax
+	movl %eax, y
+	/*\ yp = ypoints + dyy \*/
+	movl dyy, %ebx
+	movl ypoints, %eax
+	leal (%eax, %ebx, 4), %eax
+	movl %eax, yp
+	/*\ yap = yapoints + dyy \*/
+	movl yapoints, %eax
+	leal (%eax, %ebx, 4), %eax
+	movl %eax, yap
+
+	pxor %mm7, %mm7
+
+	/*\ Test xup bit \*/
+	movl xup_yup, %eax
+	sarl $1, %eax	/*\ bit 0 is shifted into the carry flag \*/
+	jnc .scale_x_down
+
+.scale_x_up:
+	/*\ Test yup bit \*/
+	sarl $1, %eax	/*\ former bit 1 is shifted into the carry flag \*/
+	jnc .scale_x_up_y_down
+
+
+/*\ Scaling up both ways \*/
+
+.scale_x_up_y_up:
+	movl sow, %ebx
+
+.up_up_loop_y:
+
+	/*\ x = -dw \*/
+	movl dw, %ecx
+	negl %ecx
+
+	/*\ %eax = *yap << 4 \*/
+	movl yap, %eax
+	movl (%eax), %eax
+	sall $4, %eax
+	jz .up_up_yap_0	/*\ weight 0: the row below is not read at all \*/
+	movd %eax, %mm1
+	punpcklwd %mm1, %mm1
+	punpckldq %mm1, %mm1
+
+.up_up_loop1_x:
+	/*\ %esi = *yp + xp[x] \*/
+	movl yp, %eax
+	movl (%eax), %esi
+	movl xp, %eax
+	movl (%eax, %ecx, 4), %eax
+	leal (%esi, %eax, 4), %esi
+
+	/*\ %eax = xap[x] << 4 \*/
+	movl xap, %eax
+	movl (%eax, %ecx, 4), %eax
+	sall $4, %eax
+	jz .up_up_xap_0	/*\ weight 0: the pixel to the right is not read \*/
+
+	/*\ %mm0 = xap[x] << 4 \*/
+	movd %eax, %mm0
+	punpcklwd %mm0, %mm0
+	punpckldq %mm0, %mm0
+
+	/*\ Load and unpack four pixels in parallel
+	|*| %mm2 = ptr[0],   %mm3 = ptr[1]
+	|*| %mm4 = ptr[sow], %mm5 = ptr[sow + 1]
+	\*/
+	movq (%esi), %mm2
+	movq (%esi, %ebx, 4), %mm4
+	movq %mm2, %mm3
+	movq %mm4, %mm5
+	punpcklbw %mm7, %mm2
+	punpcklbw %mm7, %mm4
+	punpckhbw %mm7, %mm3
+	punpckhbw %mm7, %mm5
+
+	/*\ X interpolation: r = l + (r - l) * xap \*/
+	psubw %mm2, %mm3
+	psubw %mm4, %mm5
+	psllw $4, %mm3
+	psllw $4, %mm5
+	pmulhw %mm0, %mm3
+	pmulhw %mm0, %mm5
+	paddw %mm2, %mm3
+	paddw %mm4, %mm5
+	/*\ Now %mm3 = I(ptr[0], ptr[1]), %mm5 = I(ptr[sow], ptr[sow + 1]) \*/
+	jmp .up_up_common
+.up_up_xap_0:
+	/*\ Load and unpack two pixels
+	|*| %mm3 = ptr[0], %mm5 = ptr[sow]
+	\*/
+	movd (%esi), %mm3
+	movd (%esi, %ebx, 4), %mm5
+	punpcklbw %mm7, %mm3
+	punpcklbw %mm7, %mm5
+.up_up_common:
+	/*\ Y interpolation: d = u + (d - u) * yap \*/
+	psubw %mm3, %mm5
+	psllw $4, %mm5
+	pmulhw %mm1, %mm5
+	paddw %mm3, %mm5
+	packuswb %mm5, %mm5
+	movd %mm5, (%edi, %ecx, 4)
+
+	/*\ while (++x) \*/
+	incl %ecx
+	jnz .up_up_loop1_x
+	jmp .up_up_yap_end
+.up_up_yap_0:
+
+.up_up_loop2_x:
+	/*\ %esi = *yp + xp[x] \*/
+	movl yp, %eax
+	movl (%eax), %esi
+	movl xp, %eax
+	movl (%eax, %ecx, 4), %eax
+	leal (%esi, %eax, 4), %esi
+
+	/*\ %eax = xap[x] << 4 \*/
+	movl xap, %eax
+	movl (%eax, %ecx, 4), %eax
+	sall $4, %eax
+	jz .up_up_0
+
+	/*\ %mm0 = xap[x] << 4 \*/
+	movd %eax, %mm0
+	punpcklwd %mm0, %mm0
+	punpckldq %mm0, %mm0
+
+	/*\ Load and unpack two pixels in parallel
+	|*| %mm2 = ptr[0], %mm3 = ptr[1]
+	\*/
+	movq (%esi), %mm2
+	movq %mm2, %mm3
+	punpcklbw %mm7, %mm2
+	punpckhbw %mm7, %mm3
+
+	/*\ X interpolation: r = l + (r - l) * xap \*/
+	psubw %mm2, %mm3
+	psllw $4, %mm3
+	pmulhw %mm0, %mm3
+	paddw %mm2, %mm3
+	packuswb %mm3, %mm3
+	movd %mm3, (%edi, %ecx, 4)
+	jmp .up_up_1
+.up_up_0:
+	/*\ dptr[x] = *sptr \*/
+	movl (%esi), %eax
+	movl %eax, (%edi, %ecx, 4)
+.up_up_1:
+	incl %ecx
+	jnz .up_up_loop2_x
+
+.up_up_yap_end:
+	/*\ dptr += dow \*/
+	movl dow, %eax
+	leal (%edi, %eax, 4), %edi
+	/*\ yap++; yp++ \*/
+	addl $4, yap
+	addl $4, yp
+	/*\ while (y--) \*/
+	decl y
+	jnz .up_up_loop_y
+
+	jmp .scale_leave
+
+
+/*\ Scaling down vertically \*/
+
+.scale_x_up_y_down:
+	/*\ sow_4 = sow * 4 \*/
+	movl sow, %eax
+	sall $2, %eax
+	movl %eax, sow_4
+
+.up_down_loop_y:
+
+	/*\ Setup My and Cy \*/
+	movl yap, %eax
+	movzwl (%eax), %ebx	/*\ My = low 16 bits of *yap \*/
+	movl %ebx, My
+	movzwl 2(%eax), %eax	/*\ Cy = high 16 bits of *yap \*/
+	movl %eax, Cy
+
+	/*\ mm4 = Cy \*/
+	movd %eax, %mm4
+	punpcklwd %mm4, %mm4
+	punpckldq %mm4, %mm4
+	/*\ mm5 = My \*/
+	movd %ebx, %mm5
+	punpcklwd %mm5, %mm5
+	punpckldq %mm5, %mm5
+
+	/*\ x = -dw \*/
+	movl dw, %ecx
+	negl %ecx
+.up_down_loop_x:
+	/*\ %esi = *yp + xp[x] \*/
+	movl yp, %eax
+	movl (%eax), %esi
+	movl xp, %eax
+	movl (%eax, %ecx, 4), %eax
+	leal (%esi, %eax, 4), %esi
+
+	movl %esi, %eax
+	/*\ v = (*p * My) >> 10 \*/
+	movd (%eax), %mm0
+	punpcklbw %mm7, %mm0
+	psllw $6, %mm0
+	pmulhw %mm5, %mm0
+	
+	/*\ i = 0x4000 - My \*/
+	movl $0x4000, %ebx
+	subl My, %ebx
+	jbe 5f		/*\ My >= 0x4000: no further rows are added \*/
+	jmp 2f
+1:
+	/*\ p += sow; v += (*p * Cy) >> 10 \*/
+	addl sow_4, %eax
+	movd (%eax), %mm1
+	punpcklbw %mm7, %mm1
+	psllw $6, %mm1
+	pmulhw %mm4, %mm1
+	paddw %mm1, %mm0
+	
+	/*\ i -= Cy; while (i > Cy) \*/
+	subl Cy, %ebx
+2:
+	cmpl Cy, %ebx
+	jg 1b
+	
+	/*\ mm6 = i \*/
+	movd %ebx, %mm6
+	punpcklwd %mm6, %mm6
+	punpckldq %mm6, %mm6
+	
+	/*\ p += sow; v += (*p * i) >> 10 \*/
+	addl sow_4, %eax
+	movd (%eax), %mm1
+	punpcklbw %mm7, %mm1
+	psllw $6, %mm1
+	pmulhw %mm6, %mm1
+	paddw %mm1, %mm0
+5:
+	/*\ %eax = xap[x] << 5 \*/
+	movl xap, %eax
+	movl (%eax, %ecx, 4), %eax
+	sall $5, %eax
+	jz 6f		/*\ weight 0: the column to the right is not read \*/
+	/*\ mm3 = xap[x] << 5 \*/
+	movd %eax, %mm3
+	punpcklwd %mm3, %mm3
+	punpckldq %mm3, %mm3
+	
+	/*\ p + 1 \*/
+	movl %esi, %eax
+	addl $4, %eax
+	/*\ vv = (*p * My) >> 10 \*/
+	movd (%eax), %mm2
+	punpcklbw %mm7, %mm2
+	psllw $6, %mm2
+	pmulhw %mm5, %mm2
+	
+	/*\ i = 0x4000 - My \*/
+	movl $0x4000, %ebx
+	subl My, %ebx
+	jbe 5f
+	jmp 2f
+1:
+	/*\ p += sow; vv += (*p * Cy) >> 10 \*/
+	addl sow_4, %eax
+	movd (%eax), %mm1
+	punpcklbw %mm7, %mm1
+	psllw $6, %mm1
+	pmulhw %mm4, %mm1
+	paddw %mm1, %mm2
+	
+	/*\ i -= Cy; while (i > Cy) \*/
+	subl Cy, %ebx
+2:
+	cmpl Cy, %ebx
+	jg 1b
+	
+	/*\ p += sow; vv += (*p * i) >> 10  (mm6 = i is still set from the first column) \*/
+	addl sow_4, %eax
+	movd (%eax), %mm1
+	punpcklbw %mm7, %mm1
+	psllw $6, %mm1
+	pmulhw %mm6, %mm1
+	paddw %mm1, %mm2
+5:
+	/*\ v = v + (vv - v) * xap \*/
+	psubw %mm0, %mm2
+	psllw $3, %mm2
+	pmulhw %mm3, %mm2
+	paddw %mm2, %mm0
+6:
+	/*\ dest[x] = v >> 4 \*/
+	psrlw $4, %mm0
+	packuswb %mm0, %mm0
+	movd %mm0, (%edi, %ecx, 4)
+
+	/*\ while (++x) \*/
+	incl %ecx
+	jnz .up_down_loop_x
+
+	/*\ dptr += dow \*/
+	movl dow, %eax
+	leal (%edi, %eax, 4), %edi
+	/*\ yap++; yp++ \*/
+	addl $4, yap
+	addl $4, yp
+	/*\ while (y--) \*/
+	decl y
+	jnz .up_down_loop_y
+
+	jmp .scale_leave
+
+.scale_x_down:
+	/*\ Test yup bit \*/
+	sarl $1, %eax	/*\ former bit 1 is shifted into the carry flag \*/
+	jnc .scale_x_down_y_down
+
+
+/*\ Scaling down horizontally \*/
+
+.scale_x_down_y_up:
+	/*\ sow_4 = sow * 4 \*/
+	movl sow, %eax
+	sall $2, %eax
+	movl %eax, sow_4
+
+.down_up_loop_y:
+
+	/*\ %eax = *yap << 5 \*/
+	movl yap, %eax
+	movl (%eax), %eax
+	sall $5, %eax
+	/*\ mm3 = *yap << 5 \*/
+	movd %eax, %mm3
+	punpcklwd %mm3, %mm3
+	punpckldq %mm3, %mm3
+	
+	/*\ x = -dw \*/
+	movl dw, %ecx
+	negl %ecx
+.down_up_loop_x:
+	/*\ %esi = *yp + xp[x] \*/
+	movl yp, %eax
+	movl (%eax), %esi
+	movl xp, %eax
+	movl (%eax, %ecx, 4), %eax
+	leal (%esi, %eax, 4), %esi
+
+	/*\ Setup Mx and Cx \*/
+	movl xap, %eax
+	movzwl (%eax, %ecx, 4), %ebx	/*\ Mx = low 16 bits of xap[x] \*/
+	movl %ebx, Mx
+	movzwl 2(%eax, %ecx, 4), %eax	/*\ Cx = high 16 bits of xap[x] \*/
+	movl %eax, Cx
+
+	/*\ mm4 = Cx \*/
+	movd %eax, %mm4
+	punpcklwd %mm4, %mm4
+	punpckldq %mm4, %mm4
+	/*\ mm5 = Mx \*/
+	movd %ebx, %mm5
+	punpcklwd %mm5, %mm5
+	punpckldq %mm5, %mm5
+
+	movl %esi, %eax
+	/*\ v = (*p * Mx) >> 10 \*/
+	movd (%eax), %mm0
+	punpcklbw %mm7, %mm0
+	psllw $6, %mm0
+	pmulhw %mm5, %mm0
+	
+	/*\ i = 0x4000 - Mx \*/
+	movl $0x4000, %ebx
+	subl Mx, %ebx
+	jbe 5f		/*\ Mx >= 0x4000: no further pixels are added \*/
+	jmp 2f
+1:
+	/*\ p++; v += (*p * Cx) >> 10 \*/
+	addl $4, %eax
+	movd (%eax), %mm1
+	punpcklbw %mm7, %mm1
+	psllw $6, %mm1
+	pmulhw %mm4, %mm1
+	paddw %mm1, %mm0
+	
+	/*\ i -= Cx; while (i > Cx) \*/
+	subl Cx, %ebx
+2:
+	cmpl Cx, %ebx
+	jg 1b
+	
+	/*\ mm6 = i \*/
+	movd %ebx, %mm6
+	punpcklwd %mm6, %mm6
+	punpckldq %mm6, %mm6
+	
+	/*\ p++; v += (*p * i) >> 10 \*/
+	addl $4, %eax
+	movd (%eax), %mm1
+	punpcklbw %mm7, %mm1
+	psllw $6, %mm1
+	pmulhw %mm6, %mm1
+	paddw %mm1, %mm0
+5:
+	movd %mm3, %eax	/*\ low dword of the replicated *yap << 5 \*/
+	testl %eax, %eax
+	jz 6f		/*\ weight 0: the row below is not read \*/
+	/*\ p + sow \*/
+	movl %esi, %eax
+	addl sow_4, %eax
+	/*\ vv = (*p * Mx) >> 10 \*/
+	movd (%eax), %mm2
+	punpcklbw %mm7, %mm2
+	psllw $6, %mm2
+	pmulhw %mm5, %mm2
+	
+	/*\ i = 0x4000 - Mx \*/
+	movl $0x4000, %ebx
+	subl Mx, %ebx
+	jbe 5f
+	jmp 2f
+1:
+	/*\ p++; vv += (*p * Cx) >> 10 \*/
+	addl $4, %eax
+	movd (%eax), %mm1
+	punpcklbw %mm7, %mm1
+	psllw $6, %mm1
+	pmulhw %mm4, %mm1
+	paddw %mm1, %mm2
+	
+	/*\ i -= Cx; while (i > Cx) \*/
+	subl Cx, %ebx
+2:
+	cmpl Cx, %ebx
+	jg 1b
+	
+	/*\ p++; vv += (*p * i) >> 10  (mm6 = i is still set from the first row) \*/
+	addl $4, %eax
+	movd (%eax), %mm1
+	punpcklbw %mm7, %mm1
+	psllw $6, %mm1
+	pmulhw %mm6, %mm1
+	paddw %mm1, %mm2
+5:
+	/*\ v = v + (vv - v) * yap \*/
+	psubw %mm0, %mm2
+	psllw $3, %mm2
+	pmulhw %mm3, %mm2
+	paddw %mm2, %mm0
+6:
+	/*\ dest[x] = v >> 4 \*/
+	psrlw $4, %mm0
+	packuswb %mm0, %mm0
+	movd %mm0, (%edi, %ecx, 4)
+
+	/*\ while (++x) \*/
+	incl %ecx
+	jnz .down_up_loop_x
+
+	/*\ dptr += dow \*/
+	movl dow, %eax
+	leal (%edi, %eax, 4), %edi
+	/*\ yap++; yp++ \*/
+	addl $4, yap
+	addl $4, yp
+	/*\ while (y--) \*/
+	decl y
+	jnz .down_up_loop_y
+
+	jmp .scale_leave
+
+
+/*\ Scaling down both ways \*/
+
+.scale_x_down_y_down:
+	/*\ sow_4 = sow * 4 \*/
+	movl sow, %eax
+	sall $2, %eax
+	movl %eax, sow_4
+
+.down_down_loop_y:
+
+	/*\ Setup My and Cy \*/
+	movl yap, %eax
+	movzwl (%eax), %ebx	/*\ My = low 16 bits of *yap \*/
+	movl %ebx, My
+	movzwl 2(%eax), %eax	/*\ Cy = high 16 bits of *yap \*/
+	movl %eax, Cy
+
+	/*\ x = -dw \*/
+	movl dw, %ecx
+	negl %ecx
+.down_down_loop_x:
+	/*\ %esi = *yp + xp[x] \*/
+	movl yp, %eax
+	movl (%eax), %esi
+	movl xp, %eax
+	movl (%eax, %ecx, 4), %eax
+	leal (%esi, %eax, 4), %esi
+
+	/*\ Setup Mx and Cx \*/
+	movl xap, %eax
+	movzwl (%eax, %ecx, 4), %ebx
+	movl %ebx, Mx
+	movzwl 2(%eax, %ecx, 4), %eax
+	movl %eax, Cx
+
+	/*\ mm3 = Cx \*/
+	movd %eax, %mm3
+	punpcklwd %mm3, %mm3
+	punpckldq %mm3, %mm3
+	/*\ mm5 = Mx \*/
+	movd %ebx, %mm5
+	punpcklwd %mm5, %mm5
+	punpckldq %mm5, %mm5
+	
+	/*\ p = sptr; v = (*p * Mx) >> 9 \*/
+	movl %esi, %eax
+	movd (%eax), %mm0
+	punpcklbw %mm7, %mm0
+	psllw $7, %mm0
+	pmulhw %mm5, %mm0
+	
+	/*\ i = 0x4000 - Mx \*/
+	movl $0x4000, %ebx
+	subl Mx, %ebx
+	jbe 5f
+	jmp 2f
+1:
+	/*\ v += (*++p * Cx) >> 9 \*/
+	addl $4, %eax
+	movd (%eax), %mm1
+	punpcklbw %mm7, %mm1
+	psllw $7, %mm1
+	pmulhw %mm3, %mm1
+	paddw %mm1, %mm0
+	
+	/*\ i -= Cx; while (i > Cx) \*/
+	subl Cx, %ebx
+2:
+	cmpl Cx, %ebx
+	jg 1b
+	
+	/*\ mm6 = i \*/
+	movd %ebx, %mm6
+	punpcklwd %mm6, %mm6
+	punpckldq %mm6, %mm6
+	
+	/*\ v += (*++p * i) >> 9 \*/
+	addl $4, %eax
+	movd (%eax), %mm1
+	punpcklbw %mm7, %mm1
+	psllw $7, %mm1
+	pmulhw %mm6, %mm1
+	paddw %mm1, %mm0
+5:
+	/*\ v *= My \*/
+	movd My, %mm4
+	punpcklwd %mm4, %mm4
+	punpckldq %mm4, %mm4
+	psllw $2, %mm0
+	pmulhw %mm4, %mm0
+	
+	/*\ j = 0x4000 - My \*/
+	movl $0x4000, %edx	/*\ %edx is reused as j; the isi pointer is not read in this loop \*/
+	subl My, %edx
+	jbe 6f
+	jmp 4f
+3:
+	/*\ sptr += sow; p = sptr \*/
+	addl sow_4, %esi
+	movl %esi, %eax
+	/*\ vx = (*p * Mx) >> 9 \*/
+	movd (%eax), %mm1
+	punpcklbw %mm7, %mm1
+	psllw $7, %mm1
+	pmulhw %mm5, %mm1
+	
+	/*\ i = 0x4000 - Mx \*/
+	movl $0x4000, %ebx
+	subl Mx, %ebx
+	jbe 5f
+	jmp 2f
+1:
+	/*\ vx += (*++p * Cx) >> 9 \*/
+	addl $4, %eax
+	movd (%eax), %mm2
+	punpcklbw %mm7, %mm2
+	psllw $7, %mm2
+	pmulhw %mm3, %mm2
+	paddw %mm2, %mm1
+	
+	/*\ i -= Cx; while (i > Cx) \*/
+	subl Cx, %ebx
+2:
+	cmpl Cx, %ebx
+	jg 1b
+	
+	/*\ vx += (*++p * i) >> 9 \*/
+	addl $4, %eax
+	movd (%eax), %mm2
+	punpcklbw %mm7, %mm2
+	psllw $7, %mm2
+	pmulhw %mm6, %mm2
+	paddw %mm2, %mm1
+5:
+	/*\ v += (vx * Cy) >> 14 \*/
+	movd Cy, %mm4
+	punpcklwd %mm4, %mm4
+	punpckldq %mm4, %mm4
+	psllw $2, %mm1
+	pmulhw %mm4, %mm1
+	paddw %mm1, %mm0
+	
+	/*\ j -= Cy; while (j > Cy) \*/
+	subl Cy, %edx
+4:
+	cmpl Cy, %edx
+	jg 3b
+	
+	/*\ sptr += sow; p = sptr \*/
+	addl sow_4, %esi
+	movl %esi, %eax
+	/*\ vx = (*p * Mx) >> 9 \*/
+	movd (%eax), %mm1
+	punpcklbw %mm7, %mm1
+	psllw $7, %mm1
+	pmulhw %mm5, %mm1
+	
+	/*\ i = 0x4000 - Mx \*/
+	movl $0x4000, %ebx
+	subl Mx, %ebx
+	jbe 5f
+	jmp 2f
+1:
+	/*\ vx += (*++p * Cx) >> 9 \*/
+	addl $4, %eax
+	movd (%eax), %mm2
+	punpcklbw %mm7, %mm2
+	psllw $7, %mm2
+	pmulhw %mm3, %mm2
+	paddw %mm2, %mm1
+	
+	/*\ i -= Cx; while (i > Cx) \*/
+	subl Cx, %ebx
+2:
+	cmpl Cx, %ebx
+	jg 1b
+	
+	/*\ vx += (*++p * i) >> 9 \*/
+	addl $4, %eax
+	movd (%eax), %mm2
+	punpcklbw %mm7, %mm2
+	psllw $7, %mm2
+	pmulhw %mm6, %mm2
+	paddw %mm2, %mm1
+5:
+	/*\ v += (vx * j) >> 14 \*/
+	movd %edx, %mm4
+	punpcklwd %mm4, %mm4
+	punpckldq %mm4, %mm4
+	psllw $2, %mm1
+	pmulhw %mm4, %mm1
+	paddw %mm1, %mm0
+6:
+	/*\ dptr[x] = mm0 >> 5 \*/
+	psrlw $5, %mm0
+	packuswb %mm0, %mm0
+	movd %mm0, (%edi, %ecx, 4)
+
+	/*\ while (++x) \*/
+	incl %ecx
+	jnz .down_down_loop_x
+
+	/*\ dptr += dow \*/
+	movl dow, %eax
+	leal (%edi, %eax, 4), %edi
+	/*\ yap++; yp++ \*/
+	addl $4, yap
+	addl $4, yp
+	/*\ while (y--) \*/
+	decl y
+	jnz .down_down_loop_y
+
+	jmp .scale_leave
+
+.scale_leave:
+	emms		/*\ leave MMX state so that the FPU can be used again \*/
+	popl %esi
+	popl %edi
+	popl %edx
+	popl %ecx
+	popl %ebx
+	movl %ebp, %esp
+	popl %ebp
+	ret
+
+#endif
--- a/src/blend.c
+++ b/src/blend.c
@ -693,165 +693,170 @@ __imlib_GetBlendFunction(ImlibOp op, char blend, char merge_alpha, char rgb_src,
   else
     {
 #ifdef DO_MMX_ASM
-	switch(op)
+	if (__imlib_get_cpuid() & CPUID_MMX)
 	  {
-	  case OP_COPY:
-	     if (merge_alpha)
+	     switch(op)
 	       {
-		  if (rgb_src)
+	       case OP_COPY:
+		  if (merge_alpha)
 		    {
-		       blender = __imlib_mmx_copy_rgb_to_rgba;
-		    }
-		  else
-		    {
-		       if (blend)
-			  blender = __imlib_mmx_blend_rgba_to_rgba;
-		       else
-			  blender = __imlib_mmx_copy_rgba_to_rgba;
-		    }
-	       }
-	     else
-	       {
-		  if (blend)
-		     blender = __imlib_mmx_blend_rgba_to_rgb;
-		  else
-		     blender = __imlib_mmx_copy_rgba_to_rgb;
-	       }
-	     break;
-	  case OP_ADD:
-	     if (merge_alpha)
-	       {
-		  if (blend)
-		     blender = __imlib_mmx_add_blend_rgba_to_rgba;
-		  else
-		     blender = __imlib_mmx_add_copy_rgba_to_rgba;
-	       }
-	     else
-	       {
-		  if (blend)
-		     blender = __imlib_mmx_add_blend_rgba_to_rgb;
-		  else
-		     blender = __imlib_mmx_add_copy_rgba_to_rgb;
-	       }
-	     break;
-	  case OP_SUBTRACT:
-	     if (merge_alpha)
-	       {
-		  if (blend)
-		     blender = __imlib_mmx_subtract_blend_rgba_to_rgba;
-		  else
-		     blender = __imlib_mmx_subtract_copy_rgba_to_rgba;
-	       }
-	     else
-	       {
-		  if (blend)
-		     blender = __imlib_mmx_subtract_blend_rgba_to_rgb;
-		  else
-		     blender = __imlib_mmx_subtract_copy_rgba_to_rgb;
-	       }
-	     break;
-	  case OP_RESHADE:
-	     if (merge_alpha)
-	       {
-		  if (blend)
-		     blender = __imlib_mmx_reshade_blend_rgba_to_rgba;
-		  else
-		     blender = __imlib_mmx_reshade_copy_rgba_to_rgba;
-	       }
-	     else
-	       {
-		  if (blend)
-		     blender = __imlib_mmx_reshade_blend_rgba_to_rgb;
-		  else
-		     blender = __imlib_mmx_reshade_copy_rgba_to_rgb;
-	       }
-	     break;
-	  default:
-	     break;
-	  }
-#else
-	switch(op)
-	  {
-	  case OP_COPY:
-	     if (merge_alpha)
-	       {
-		  if (rgb_src)
-		    {
-		       if (blend)
+		       if (rgb_src)
 			 {
-			    blender = __imlib_BlendRGBToRGBA;
+			    blender = __imlib_mmx_copy_rgb_to_rgba;
 			 }
 		       else
-			  blender = __imlib_CopyRGBAToRGBA;
+			 {
+			    if (blend)
+			       blender = __imlib_mmx_blend_rgba_to_rgba;
+			    else
+			       blender = __imlib_mmx_copy_rgba_to_rgba;
+			 }
 		    }
 		  else
 		    {
 		       if (blend)
-			  blender = __imlib_BlendRGBAToRGBA;
+			  blender = __imlib_mmx_blend_rgba_to_rgb;
 		       else
-			  blender = __imlib_CopyRGBAToRGBA;
+			  blender = __imlib_mmx_copy_rgba_to_rgb;
+		    }
+		  break;
+	       case OP_ADD:
+		  if (merge_alpha)
+		    {
+		       if (blend)
+			  blender = __imlib_mmx_add_blend_rgba_to_rgba;
+		       else
+			  blender = __imlib_mmx_add_copy_rgba_to_rgba;
 		    }
-	       }
-             else
-	       {
-		  if (blend)
-		     blender = __imlib_BlendRGBAToRGB;
 		  else
-		     blender = __imlib_CopyRGBAToRGB;
-	       }
-	     break;
-	  case OP_ADD:
-	     if (merge_alpha)
-	       {
-		  if (blend)
-		     blender = __imlib_AddBlendRGBAToRGBA;
+		    {
+		       if (blend)
+			  blender = __imlib_mmx_add_blend_rgba_to_rgb;
+		       else
+			  blender = __imlib_mmx_add_copy_rgba_to_rgb;
+		    }
+		  break;
+	       case OP_SUBTRACT:
+		  if (merge_alpha)
+		    {
+		       if (blend)
+			  blender = __imlib_mmx_subtract_blend_rgba_to_rgba;
+		       else
+			  blender = __imlib_mmx_subtract_copy_rgba_to_rgba;
+		    }
 		  else
-		     blender = __imlib_AddCopyRGBAToRGBA;
-	       }
-             else
-	       {
-		  if (blend)
-		     blender = __imlib_AddBlendRGBAToRGB;
+		    {
+		       if (blend)
+			  blender = __imlib_mmx_subtract_blend_rgba_to_rgb;
+		       else
+			  blender = __imlib_mmx_subtract_copy_rgba_to_rgb;
+		    }
+		  break;
+	       case OP_RESHADE:
+		  if (merge_alpha)
+		    {
+		       if (blend)
+			  blender = __imlib_mmx_reshade_blend_rgba_to_rgba;
+		       else
+			  blender = __imlib_mmx_reshade_copy_rgba_to_rgba;
+		    }
 		  else
-		     blender = __imlib_AddCopyRGBAToRGB;
+		    {
+		       if (blend)
+			  blender = __imlib_mmx_reshade_blend_rgba_to_rgb;
+		       else
+			  blender = __imlib_mmx_reshade_copy_rgba_to_rgb;
+		    }
+		  break;
+	       default:
+		  break;
 	       }
-	     break;
-	  case OP_SUBTRACT:
-	     if (merge_alpha)
-	       {
-		  if (blend)
-		     blender = __imlib_SubBlendRGBAToRGBA;
-		  else
-		     blender = __imlib_SubCopyRGBAToRGBA;
-	       }
-             else
-	       {
-		  if (blend)
-		     blender = __imlib_SubBlendRGBAToRGB;
-		  else
-		     blender = __imlib_SubCopyRGBAToRGB;
-	       }
-	     break;
-	  case OP_RESHADE:
-	     if (merge_alpha)
-	       {
-		  if (blend)
-		     blender = __imlib_ReBlendRGBAToRGBA;
-		  else
-		     blender = __imlib_ReCopyRGBAToRGBA;
-	       }
-             else
-	       {
-		  if (blend)
-		     blender = __imlib_ReBlendRGBAToRGB;
-		  else
-		     blender = __imlib_ReCopyRGBAToRGB;
-	       }
-	     break;
-	  default:
-	     break;
 	  }
+	else
 #endif
+	  {
+	     switch(op)
+	       {
+	       case OP_COPY:
+		  if (merge_alpha)
+		    {
+		       if (rgb_src)
+			 {
+			    if (blend)
+			      {
+				 blender = __imlib_BlendRGBToRGBA;
+			      }
+			    else
+			       blender = __imlib_CopyRGBAToRGBA;
+			 }
+		       else
+			 {
+			    if (blend)
+			       blender = __imlib_BlendRGBAToRGBA;
+			    else
+			       blender = __imlib_CopyRGBAToRGBA;
+			 }
+		    }
+		  else
+		    {
+		       if (blend)
+			  blender = __imlib_BlendRGBAToRGB;
+		       else
+			  blender = __imlib_CopyRGBAToRGB;
+		    }
+		  break;
+	       case OP_ADD:
+		  if (merge_alpha)
+		    {
+		       if (blend)
+			  blender = __imlib_AddBlendRGBAToRGBA;
+		       else
+			  blender = __imlib_AddCopyRGBAToRGBA;
+		    }
+		  else
+		    {
+		       if (blend)
+			  blender = __imlib_AddBlendRGBAToRGB;
+		       else
+			  blender = __imlib_AddCopyRGBAToRGB;
+		    }
+		  break;
+	       case OP_SUBTRACT:
+		  if (merge_alpha)
+		    {
+		       if (blend)
+			  blender = __imlib_SubBlendRGBAToRGBA;
+		       else
+			  blender = __imlib_SubCopyRGBAToRGBA;
+		    }
+		  else
+		    {
+		       if (blend)
+			  blender = __imlib_SubBlendRGBAToRGB;
+		       else
+			  blender = __imlib_SubCopyRGBAToRGB;
+		    }
+		  break;
+	       case OP_RESHADE:
+		  if (merge_alpha)
+		    {
+		       if (blend)
+			  blender = __imlib_ReBlendRGBAToRGBA;
+		       else
+			  blender = __imlib_ReCopyRGBAToRGBA;
+		    }
+		  else
+		    {
+		       if (blend)
+			  blender = __imlib_ReBlendRGBAToRGB;
+		       else
+			  blender = __imlib_ReCopyRGBAToRGB;
+		    }
+		  break;
+	       default:
+		  break;
+	       }
+	  }
     }

   return blender;
@ -955,15 +960,12 @@ __imlib_BlendImageToImage(ImlibImage *im_src, ImlibImage *im_dst,
     }
   else
     {
-	DATA32  **ypoints = NULL;
-	int      *xpoints = NULL;
-	int      *yapoints = NULL;
-	int      *xapoints = NULL;
+	ImlibScaleInfo *scaleinfo = NULL;
 	DATA32   *buf = NULL;
-	int       sx, sy, sw, sh, dx, dy, dw, dh, dxx, dyy, scw, sch, y2, x2;
+	int       sx, sy, sw, sh, dx, dy, dw, dh, dxx, dyy, y2, x2;
 	int       psx, psy, psw, psh;
-	char      xup = 0, yup = 0;
 	int       y, h, hh;
+	int       do_mmx;
 	sx = ssx;
 	sy = ssy;
 	sw = ssw;
@ -1030,49 +1032,11 @@ __imlib_BlendImageToImage(ImlibImage *im_src, ImlibImage *im_dst,
 	  {
 	     return;
 	  }
-	/* calculate the scaling factors of width and height for a whole image */
-	scw = (ddw * im_src->w) / ssw;
-	sch = (ddh * im_src->h) / ssh;
 	/* if we are scaling the image at all make a scaling buffer */
 	if (!((sw == dw) && (sh == dh)))
 	  {
-	     /* need to calculate ypoitns and xpoints array */
-	     ypoints = __imlib_CalcYPoints(im_src->data, im_src->w, im_src->h,
-					   sch, im_src->border.top,
-					   im_src->border.bottom);
-	     if (!ypoints)
-		return;
-	     xpoints = __imlib_CalcXPoints(im_src->w, scw,
-					   im_src->border.left,
-					   im_src->border.right);
-	     if (!xpoints)
-	       {
-		  free(ypoints);
-		  return;
-	       }
-	     /* calculate aliasing counts */
-	     if (aa)
-	       {
-		  yapoints = __imlib_CalcApoints(im_src->h, sch,
-						 im_src->border.top,
-						 im_src->border.bottom);
-		  if (!yapoints)
-		    {
-		       free(ypoints);
-		       free(xpoints);
-		       return;
-		    }
-		  xapoints = __imlib_CalcApoints(im_src->w, scw,
-						 im_src->border.left,
-						 im_src->border.right);
-		  if (!xapoints)
-		    {
-		       free(yapoints);
-		       free(ypoints);
-		       free(xpoints);
-		       return;
-		    }
-	       }
+	     scaleinfo = __imlib_CalcScaleInfo(im_src, ssw, ssh, ddw, ddh, aa);
+	     if (!scaleinfo) return;
 	  }
 	else
 	  {
@ -1098,21 +1062,11 @@ __imlib_BlendImageToImage(ImlibImage *im_src, ImlibImage *im_dst,
 	buf = malloc(dw * LINESIZE * sizeof(DATA32));
 	if (!buf)
 	  {
-	     if (aa)
-	       {
-		  free(xapoints);
-		  free(yapoints);
-	       }
-	     free(ypoints);
-	     free(xpoints);
+	     __imlib_FreeScaleInfo(scaleinfo);
+	     return;
 	  }
 	/* setup h */
 	h = dh;
-	/* set our scaling up in x / y dir flags */
-	if (dw > sw)
-	   xup = 1;
-	if (dh > sh)
-	   yup = 1;
 	if (!IMAGE_HAS_ALPHA(im_dst))
 	   merge_alpha = 0;
 	if (!IMAGE_HAS_ALPHA(im_src))
@ -1124,6 +1078,9 @@ __imlib_BlendImageToImage(ImlibImage *im_src, ImlibImage *im_dst,
 		blend = 0;
 	  }
 	/* scale in LINESIZE Y chunks and convert to depth*/
+#ifdef DO_MMX_ASM
+	do_mmx = __imlib_get_cpuid() & CPUID_MMX;
+#endif
 	for (y = 0; y < dh; y += LINESIZE)
 	  {
 	     hh = LINESIZE;
@ -1132,17 +1089,21 @@ __imlib_BlendImageToImage(ImlibImage *im_src, ImlibImage *im_dst,
 	     /* scale the imagedata for this LINESIZE lines chunk of image */
 	     if (aa)
 	       {
+#ifdef DO_MMX_ASM
+		  if (do_mmx)
+		     __imlib_Scale_mmx_AARGBA(scaleinfo, buf, dxx, dyy + y,
+					      0, 0, dw, hh, dw, im_src->w);
+		  else
+#endif
 		  if (IMAGE_HAS_ALPHA(im_src))
-		     __imlib_ScaleAARGBA(ypoints, xpoints, buf, xapoints,
-					 yapoints, xup, yup, dxx, dyy + y,
+		     __imlib_ScaleAARGBA(scaleinfo, buf, dxx, dyy + y,
 					 0, 0, dw, hh, dw, im_src->w);
 		  else
-		     __imlib_ScaleAARGB(ypoints, xpoints, buf, xapoints,
-					yapoints, xup, yup, dxx, dyy + y,
+		     __imlib_ScaleAARGB(scaleinfo, buf, dxx, dyy + y,
 					0, 0, dw, hh, dw, im_src->w);
 	       }
 	     else
-		__imlib_ScaleSampleRGBA(ypoints, xpoints, buf, dxx, dyy + y,
+		__imlib_ScaleSampleRGBA(scaleinfo, buf, dxx, dyy + y,
 					0, 0, dw, hh, dw);

 	     __imlib_BlendRGBAToData(buf, dw, hh,
@ -1153,16 +1114,7 @@ __imlib_BlendImageToImage(ImlibImage *im_src, ImlibImage *im_dst,
 	     h -= LINESIZE;
 	  }
 	/* free up our buffers and point tables */
-	if (buf)
-	  {
-	     free(buf);
-	     free(ypoints);
-	     free(xpoints);
-	  }
-	if (aa)
-	  {
-	     free(yapoints);
-	     free(xapoints);
-	  }
+	free(buf);
+	__imlib_FreeScaleInfo(scaleinfo);
     }
 }
--- a/src/common.h
+++ b/src/common.h
@ -15,4 +15,10 @@
 #define DATA16  unsigned short
 #define DATA8   unsigned char

+#ifdef DO_MMX_ASM
+int __imlib_get_cpuid(void); /* CPU feature word; callers AND it with a CPUID_* mask */
+#define CPUID_MMX (1 << 23) /* tested before any __imlib_*mmx* routine is used */
+#define CPUID_XMM (1 << 25) /* NOTE(review): presumably the SSE feature bit - confirm */
+#endif
+
 #endif
--- a/src/loaderpath.h
+++ b/src/loaderpath.h
@ -0,0 +1,2 @@
+#define SYS_LOADERS_PATH "/usr/local/lib/loaders"
+#define USER_LOADERS_PATH ".loaders"
--- a/src/rend.c
+++ b/src/rend.c
@ -2,8 +2,8 @@
 #include <X11/extensions/XShm.h>
 #include "common.h"
 #include "colormod.h"
-#include "scale.h"
 #include "image.h"
+#include "scale.h"
 #include "ximage.h"
 #include "context.h"
 #include "rgba.h"
@ -39,16 +39,16 @@ __imlib_RenderImage(Display *d, ImlibImage *im,
   static GC gc = 0;
   static GC gcm = 0;
   XGCValues gcv;
-   DATA32  **ypoints = NULL;
-   int      *xpoints = NULL;
-   int      *yapoints = NULL;
-   int      *xapoints = NULL;
-   int       scw, sch;
+   ImlibScaleInfo *scaleinfo = NULL;
   int       psx, psy, psw, psh;
   int       actual_depth = 0;
-   char      xup = 0, yup = 0;
   char      shm = 0, bgr = 0;
   ImlibRGBAFunction rgbaer, masker;
+   ImlibBlendFunction blender = NULL;
+   int       do_mmx;
+
+   blender = __imlib_GetBlendFunction(op, 1, 0,
+				      (!(im->flags & F_HAS_ALPHA)), NULL);
   
   /* dont do anything if we have a 0 widht or height image to render */
   if ((dw <= 0) || (dh <= 0))
@ -84,41 +84,11 @@ __imlib_RenderImage(Display *d, ImlibImage *im,
   /* if the output is too big (8k arbitary limit here) dont bother */
   if ((dw > 8192) || (dh > 8192))
      return;
-   /* calculate the scaling factors of width and height for a whole image */
-   scw = dw * im->w / sw;
-   sch = dh * im->h / sh;
   /* if we are scaling the image at all make a scaling buffer */
   if (!((sw == dw) && (sh == dh)))
     {
-	/* need to calculate ypoitns and xpoints array */
-	ypoints = __imlib_CalcYPoints(im->data, im->w, im->h, sch, im->border.top, im->border.bottom);
-	if (!ypoints)
-	   return;
-	xpoints = __imlib_CalcXPoints(im->w, scw, im->border.left, im->border.right);
-	if (!xpoints)
-	  {
-	     free(ypoints);
-	     return;
-	  }
-	/* calculate aliasing counts */
-	if (antialias)
-	  {
-	     yapoints = __imlib_CalcApoints(im->h, sch, im->border.top, im->border.bottom);
-	     if (!yapoints)
-	       {
-		  free(ypoints);
-		  free(xpoints);
-		  return;
-	       }
-	     xapoints = __imlib_CalcApoints(im->w, scw, im->border.left, im->border.right);
-	     if (!xapoints)
-	       {
-		  free(yapoints);
-		  free(ypoints);
-		  free(xpoints);
-		  return;
-	       }
-	  }
+	scaleinfo = __imlib_CalcScaleInfo(im, sw, sh, dw, dh, antialias);
+	if (!scaleinfo) return;
     }
   ct = __imlib_GetContext(d, v, cm, depth);
   actual_depth = depth;
@ -140,15 +110,8 @@ __imlib_RenderImage(Display *d, ImlibImage *im,
   xim = __imlib_ProduceXImage(d, v, depth, dw, dh, &shm);
   if (!xim)
     {
-	if (antialias)
-	  {
-	     free(xapoints);
-	     free(yapoints);
-	  }
-	free(ypoints);
-	free(xpoints);
-	if (back)
-	   free(back);
+	__imlib_FreeScaleInfo(scaleinfo);
+	free(back);
 	return;
     }
   /* do a double check in 24/32bpp */
@ -160,20 +123,13 @@ __imlib_RenderImage(Display *d, ImlibImage *im,
 	if (!mxim)
 	  {
 	     __imlib_ConsumeXImage(d, xim);
-	     if (antialias)
-	       {
-		  free(xapoints);
-		  free(yapoints);
-	       }
-	     free(ypoints);
-	     free(xpoints);
-	     if (back)
-		free(back);
+	     __imlib_FreeScaleInfo(scaleinfo);
+	     free(back);
 	     return;
 	  }
     }
   /* if we are scaling the image at all make a scaling buffer */
-   if (!((sw == dw) && (sh == dh)))
+   if (scaleinfo)
     {
 	/* allocate a buffer to render scaled RGBA data into */
 	buf = malloc(dw * LINESIZE * sizeof(DATA32));
@ -182,54 +138,51 @@ __imlib_RenderImage(Display *d, ImlibImage *im,
 	     __imlib_ConsumeXImage(d, xim);
 	     if (m)
 		__imlib_ConsumeXImage(d, mxim);
-	     if (antialias)
-	       {
-		  free(xapoints);
-		  free(yapoints);
-	       }
-	     free(ypoints);
-	     free(xpoints);
-	     if (back)
-		free(back);
+	     __imlib_FreeScaleInfo(scaleinfo);
+	     free(back);
 	     return;
 	  }
     }
   /* setup h */
   h = dh;
-   /* set our scaling up in x / y dir flags */
-   if (dw > sw)
-      xup = 1;
-   if (dh > sh)
-      yup = 1;
   /* scale in LINESIZE Y chunks and convert to depth*/
   /*\ Get rgba and mask functions for XImage rendering \*/
   rgbaer = __imlib_GetRGBAFunction(actual_depth, bgr, hiq, ct->palette_type);
   if (m) masker = __imlib_GetMaskFunction(dither_mask);
+#ifdef DO_MMX_ASM
+   do_mmx = __imlib_get_cpuid() & CPUID_MMX;
+#endif
   for (y = 0; y < dh; y += LINESIZE)
     {
 	hh = LINESIZE;
 	if (h < LINESIZE)
 	   hh = h;
 	/* if we're scaling it */
-	if (ypoints)
+	if (scaleinfo)
 	  {
 	     /* scale the imagedata for this LINESIZE lines chunk of image data */
 	     if (antialias)
 	       {
+#ifdef DO_MMX_ASM
+		  if (do_mmx)
+		     __imlib_Scale_mmx_AARGBA(scaleinfo, buf,
+					      ((sx * dw) / sw),
+					      ((sy * dh) / sh) + y, 
+					      0, 0, dw, hh, dw, im->w);
+		  else
+#endif
 		  if (IMAGE_HAS_ALPHA(im))
-		     __imlib_ScaleAARGBA(ypoints, xpoints, buf, xapoints, 
-					 yapoints, xup, yup, 
-					 ((sx * dw) / sw), ((sy * dh) / sh) + y, 
+		     __imlib_ScaleAARGBA(scaleinfo, buf, ((sx * dw) / sw),
+					 ((sy * dh) / sh) + y, 
 					 0, 0, dw, hh, dw, im->w);
 		  else
-		     __imlib_ScaleAARGB(ypoints, xpoints, buf, xapoints, 
-					yapoints, xup, yup, 
-					((sx * dw) / sw), ((sy * dh) / sh) + y, 
+		     __imlib_ScaleAARGB(scaleinfo, buf, ((sx * dw) / sw),
+					((sy * dh) / sh) + y, 
 					0, 0, dw, hh, dw, im->w);
 	       }
 	     else
-		__imlib_ScaleSampleRGBA(ypoints, xpoints, buf, 
-					 ((sx * dw) / sw), ((sy * dh) / sh) + y, 
+		__imlib_ScaleSampleRGBA(scaleinfo, buf, ((sx * dw) / sw),
+					((sy * dh) / sh) + y, 
 					0, 0, dw, hh, dw);
 	     jump = 0;
 	     pointer = buf;
@ -257,12 +210,6 @@ __imlib_RenderImage(Display *d, ImlibImage *im,
 	/* if we have a back buffer - we're blending to the bg */
 	if (back)
 	  {
-	     char rgb_src = 0;
-	     ImlibBlendFunction blender = NULL;
-	     
-	     if (!(im->flags & F_HAS_ALPHA))
-		rgb_src = 1;
-	     blender = __imlib_GetBlendFunction(op, 1, 0, rgb_src, NULL);
 	     blender(pointer, jump + dw, back + (y * dw), dw, dw, hh, NULL);
 	     pointer = back + (y * dw);
 	     jump = 0;
@ -278,19 +225,9 @@ __imlib_RenderImage(Display *d, ImlibImage *im,
 	h -= LINESIZE;
     }
   /* free up our buffers and poit tables */
-   if (buf)
-     {
-	free(buf);
-	free(ypoints);
-	free(xpoints);
-     }
-   if (antialias)
-     {
-	if (yapoints) free(yapoints);
-	if (xapoints) free(xapoints);
-     }
-   if (back)
-      free(back);
+   free(buf);
+   __imlib_FreeScaleInfo(scaleinfo);
+   free(back);
   /* if we didnt have a gc... create it */
   if (!gc)
     {
--- a/src/rgba.c
+++ b/src/rgba.c
@ -3502,7 +3502,8 @@ __imlib_GetRGBAFunction(int depth, char bgr, char hiq, DATA8 palette_type)
   /*\ Boolean sanity \*/
   bgr = bgr ? 1 : 0; hiq = hiq ? 1 : 0;
 #ifdef DO_MMX_ASM
-   return mmx_functions[did][bgr][hiq];
+   if (__imlib_get_cpuid() & CPUID_MMX) /*\ bitwise: test the MMX flag only \*/
+      return mmx_functions[did][bgr][hiq];
 #endif
   return functions[did][bgr][hiq];
 }
--- a/src/rgbadraw.c
+++ b/src/rgbadraw.c
@ -1,8 +1,8 @@
 #include <X11/Xlib.h>
 #include "common.h"
 #include "colormod.h"
-#include "scale.h"
 #include "image.h"
+#include "scale.h"
 #include "context.h"
 #include "rgba.h"
 #include "blend.h"
--- a/src/rotate.c
+++ b/src/rotate.c
@ -5,7 +5,7 @@
 /*\ Linear interpolation functions \*/
 /*\ Between two values \*/
 #define INTERP(v1, v2, f) \
-	((v1) * (_ROTATE_PREC_MAX - (f)) + (v2) * (f))
+	(((v1) << _ROTATE_PREC) + (((v2) - (v1)) * (f)))

 /*\ Between two colour bytes \*/
 #define INTERP_VAL1(x_VAL, dest, l, r, x) \
@ -22,106 +22,31 @@
 			      INTERP(x_VAL(ll), x_VAL(lr), (x)),	\
 			     (y)) >> (2 * _ROTATE_PREC))

-#if defined(DO_MMX_ASM) && defined(__GNUC__)
-/*\ MMX asm version, TODO: insn order for PMMX pairing \*/
-#define INTERP_ARGB(dest, src, sow, x, y) __asm__ (\
-	"pxor %%mm6, %%mm6\n\t"			\
-	"movd %3, %%mm0\n\t"			\
-	"movd %4, %%mm1\n\t"			\
-	"punpcklwd %%mm0, %%mm0\n\t"		\
-	"punpcklwd %%mm1, %%mm1\n\t"		\
-	"punpckldq %%mm0, %%mm0\n\t"		\
-	"punpckldq %%mm1, %%mm1\n\t"		\
-	"movq (%1), %%mm2\n\t"			\
-	"movq (%1, %2, 4), %%mm4\n\t"		\
-	"movq %%mm2, %%mm3\n\t"			\
-	"movq %%mm4, %%mm5\n\t"			\
-	"punpcklbw %%mm6, %%mm2\n\t"		\
-	"punpcklbw %%mm6, %%mm4\n\t"		\
-	"punpckhbw %%mm6, %%mm3\n\t"		\
-	"punpckhbw %%mm6, %%mm5\n\t"		\
-	"psubw %%mm2, %%mm3\n\t"		\
-	"psubw %%mm4, %%mm5\n\t"		\
-	"psllw %5, %%mm3\n\t"			\
-	"psllw %5, %%mm5\n\t"			\
-	"pmulhw %%mm0, %%mm3\n\t"		\
-	"pmulhw %%mm0, %%mm5\n\t"		\
-	"paddw %%mm2, %%mm3\n\t"		\
-	"paddw %%mm4, %%mm5\n\t"		\
-	"psubw %%mm3, %%mm5\n\t"		\
-	"psllw %5, %%mm5\n\t"			\
-	"pmulhw %%mm1, %%mm5\n\t"		\
-	"paddw %%mm3, %%mm5\n\t"		\
-	"packuswb %%mm5, %%mm5\n\t"		\
-	"movd %%mm5, (%0)"			\
-	: /*\ No outputs \*/			\
-	: "r" (dest), "r" (src), "r" (sow),	\
-	  "g" ((x) & _ROTATE_PREC_BITS),	\
-	  "g" ((y) & _ROTATE_PREC_BITS),	\
-	  "I" (16 - _ROTATE_PREC))
-
-#define INTERP_RGB_A0(dest, v1, v2, f, f2) __asm__(\
-	"pxor %%mm6, %%mm6\n\t"			\
-	"movd %3, %%mm0\n\t"			\
-	"movd %4, %%mm1\n\t"			\
-	"punpcklwd %%mm0, %%mm0\n\t"		\
-	"punpcklwd %%mm1, %%mm1\n\t"		\
-	"punpckldq %%mm0, %%mm0\n\t"		\
-	"punpckldq %%mm1, %%mm1\n\t"		\
-	"movd (%1), %%mm2\n\t"			\
-	"movd (%2), %%mm4\n\t"			\
-	"punpcklbw %%mm6, %%mm2\n\t"		\
-	"punpcklbw %%mm6, %%mm4\n\t"		\
-	"psubw %%mm2, %%mm4\n\t"		\
-	"psllw %5, %%mm4\n\t"			\
-	"pmulhw %%mm0, %%mm4\n\t"		\
-	"paddw %%mm2, %%mm4\n\t"		\
-	"movq %%mm4, %%mm2\n\t"			\
-	"psllq $48, %%mm1\n\t"			\
-	"psllw %5, %%mm4\n\t"			\
-	"pmulhw %%mm1, %%mm4\n\t"		\
-	"psubw %%mm4, %%mm2\n\t"		\
-	"packuswb %%mm2, %%mm2\n\t"		\
-	"movd %%mm2, (%0)"			\
-	: /*\ No outputs \*/			\
-	: "r" (dest), "r" (v1), "r" (v2),	\
-	  "g" ((f) & _ROTATE_PREC_BITS),	\
-	  "g" (_ROTATE_PREC_MAX - ((f2) & _ROTATE_PREC_BITS)),	\
-	  "I" (16 - _ROTATE_PREC))
-
-#define EMMS() __asm__ ("emms" : : )
-#endif
 /*\ Functions used in rotation routines.
 |*| The do { } while(0) construction is to make it one statement.
 \*/
 /*\ Between four colours \*/
-#ifndef INTERP_ARGB
 #define INTERP_ARGB(dest, src, sow, x, y) do { \
 	INTERP_VAL2(R_VAL, (dest), (src), (src) + 1, (src) + (sow), (src) + (sow) + 1, (x) & _ROTATE_PREC_BITS, (y) & _ROTATE_PREC_BITS);	\
 	INTERP_VAL2(G_VAL, (dest), (src), (src) + 1, (src) + (sow), (src) + (sow) + 1, (x) & _ROTATE_PREC_BITS, (y) & _ROTATE_PREC_BITS);	\
 	INTERP_VAL2(B_VAL, (dest), (src), (src) + 1, (src) + (sow), (src) + (sow) + 1, (x) & _ROTATE_PREC_BITS, (y) & _ROTATE_PREC_BITS);	\
 	INTERP_VAL2(A_VAL, (dest), (src), (src) + 1, (src) + (sow), (src) + (sow) + 1, (x) & _ROTATE_PREC_BITS, (y) & _ROTATE_PREC_BITS);	\
 	} while (0)
-#endif

 /*\ Between two colours, alpha between two values and zeroes \*/
-#ifndef INTERP_RGB_A0
 #define INTERP_RGB_A0(dest, v1, v2, f, f2) do { \
 	INTERP_VAL1(R_VAL, (dest), (v1), (v2), (f) & _ROTATE_PREC_BITS); \
 	INTERP_VAL1(G_VAL, (dest), (v1), (v2), (f) & _ROTATE_PREC_BITS); \
 	INTERP_VAL1(B_VAL, (dest), (v1), (v2), (f) & _ROTATE_PREC_BITS); \
 	INTERP_VAL1_A0(dest, (v1), (v2), (f) & _ROTATE_PREC_BITS, (f2) & _ROTATE_PREC_BITS);	\
 	} while (0)
-#endif

 /*\ One colour, alpha between one value and three zeroes \*/
-#ifndef INTERP_A000
 #define INTERP_A000(dest, v, f1, f2) do {	\
 	*(dest) = *(v);				\
 	A_VAL(dest) = (A_VAL(dest) *		\
 		((f1) & _ROTATE_PREC_BITS) * ((f2) & _ROTATE_PREC_BITS)) >> (2 * _ROTATE_PREC);	\
 	} while (0)
-#endif

 /*\ Rotate by pixel sampling only, target inside source \*/
 static void
@ -175,9 +100,6 @@ __imlib_RotateAAInside(DATA32 *src, DATA32 *dest, int sow, int dow,
      y += dx - dw * dy;
      dest += (dow - dw);
   }
-#ifdef EMMS
-   EMMS();
-#endif
 }

 /*\ NOTE: To check if v is in [b .. t) ((v >= b) && (v < t))
@ -185,26 +107,25 @@ __imlib_RotateAAInside(DATA32 *src, DATA32 *dest, int sow, int dow,
 |*|  as negative values, cast to unsigned, become large positive
 |*|  values, and fall through the compare.
 |*|  v in [0 .. t) is a special case: ((unsigned)v < t)
-|*|  v in [-t .. 0) is also special, as its the same as -v-1 in [0 .. t),
-|*|  and (-v-1) translates to one asm instruction, namely 'not'
+|*|  v in [-t .. 0) is also special, as its the same as ~v in [0 .. t)
 \*/
 static int
 __check_inside_coords(int x, int y, int dx, int dy,
-		      int dw, int dh, int sow, int soh)
+		      int dw, int dh, int sw, int sh)
 {
-   sow <<= _ROTATE_PREC;
-   soh <<= _ROTATE_PREC;
+   sw <<= _ROTATE_PREC;
+   sh <<= _ROTATE_PREC;
   
-   if (((unsigned)x >= sow) || ((unsigned)y >= sow))
+   if (((unsigned)x >= sw) || ((unsigned)y >= sh))
      return 0;
   x += dx * dw; y += dy * dw;
-   if (((unsigned)x >= sow) || ((unsigned)y >= sow))
+   if (((unsigned)x >= sw) || ((unsigned)y >= sh))
      return 0;
   x -= dy * dh; y += dx * dh;
-   if (((unsigned)x >= sow) || ((unsigned)y >= sow))
+   if (((unsigned)x >= sw) || ((unsigned)y >= sh))
      return 0;
   x -= dx * dw; y -= dy * dw;
-   if (((unsigned)x >= sow) || ((unsigned)y >= sow))
+   if (((unsigned)x >= sw) || ((unsigned)y >= sh))
      return 0;
   
   return 1;
@ -232,6 +153,7 @@ __imlib_RotateSample(DATA32 *src, DATA32 *dest, int sow, int sw, int sh,
      do {
 	 if (((unsigned)x < sw) && ((unsigned)y < sh))
 	    *dest = src[(x >> _ROTATE_PREC) + ((y >> _ROTATE_PREC) * sow)];
+	 else *dest = 0;
 	 /*\ RIGHT; \*/
 	 x += dx;
 	 y += dy;
@ -285,34 +207,31 @@ __imlib_RotateAA(DATA32 *src, DATA32 *dest, int sow, int sw, int sh,
 	       /*\  12
 	       |*|  ..
 	       \*/
-	       INTERP_RGB_A0(dest, src_x_y, src_x_y + 1, x,
-			     (_ROTATE_PREC_MAX - y));
-	    } else if ((unsigned)(-y-1) < _ROTATE_PREC_MAX) {
+	       INTERP_RGB_A0(dest, src_x_y, src_x_y + 1, x, ~y);
+	    } else if ((unsigned)(~y) < _ROTATE_PREC_MAX) {
 	       /*\  ..
 	       |*|  34
 	       \*/
 	       INTERP_RGB_A0(dest, src_x_y + sow, src_x_y + sow + 1, x, y);
-	    }
+	    } else *dest = 0;
 	 } else if ((unsigned)(x - sw) < (_ROTATE_PREC_MAX)) {
 	    if ((unsigned)y < sh) {
 	       /*\  1.
 	       |*|  3.
 	       \*/
-	       INTERP_RGB_A0(dest, src_x_y, src_x_y + sow, y,
-			     (_ROTATE_PREC_MAX - x));
+	       INTERP_RGB_A0(dest, src_x_y, src_x_y + sow, y, ~x);
 	    } else if ((unsigned)(y - sh) < _ROTATE_PREC_MAX) {
 	       /*\  1.
 	       |*|  ..
 	       \*/
-	       INTERP_A000(dest, src_x_y, (_ROTATE_PREC_MAX - x),
-			   (_ROTATE_PREC_MAX - y));
-	    } else if ((unsigned)(-y-1) < _ROTATE_PREC_MAX) {
+	       INTERP_A000(dest, src_x_y, ~x, ~y);
+	    } else if ((unsigned)(~y) < _ROTATE_PREC_MAX) {
 	       /*\  ..
 	       |*|  3.
 	       \*/
-	       INTERP_A000(dest, src_x_y + sow, (_ROTATE_PREC_MAX - x), y);
-	    }
-	 } else if ((unsigned)(-x-1) < _ROTATE_PREC_MAX) {
+	       INTERP_A000(dest, src_x_y + sow, ~x, y);
+	    } else *dest = 0;
+	 } else if ((unsigned)(~x) < _ROTATE_PREC_MAX) {
 	    if ((unsigned)y < sh) {
 	       /*\  .2
 	       |*|  .4
@ -322,14 +241,14 @@ __imlib_RotateAA(DATA32 *src, DATA32 *dest, int sow, int sw, int sh,
 	       /*\  .2
 	       |*|  ..
 	       \*/
-	       INTERP_A000(dest, src_x_y + 1, x, (_ROTATE_PREC_MAX - y));
-	    } else if ((unsigned)(-y-1) < _ROTATE_PREC_MAX) {
+	       INTERP_A000(dest, src_x_y + 1, x, ~y);
+	    } else if ((unsigned)(~y) < _ROTATE_PREC_MAX) {
 	       /*\  ..
 	       |*|  .4
 	       \*/
 	       INTERP_A000(dest, src_x_y + sow + 1, x, y);
-	    }
-	 }
+	    } else *dest = 0;
+	 } else *dest = 0;
 	 /*\ RIGHT; \*/
 	 x += dx;
 	 y += dy;
@ -343,9 +262,6 @@ __imlib_RotateAA(DATA32 *src, DATA32 *dest, int sow, int sw, int sh,
      dest += (dow - dw);

   }
-#ifdef EMMS
-   EMMS();
-#endif
 }

 /*\ Should this be in blend.c ?? \*/
@ -362,6 +278,7 @@ __imlib_BlendImageToImageAtAngle(ImlibImage *im_src, ImlibImage *im_dst,
   int ddw, ddh, x, y, dx, dy, i;
   double xy2;
   DATA32 *data, *src;
+   int do_mmx;
   
   if ((ssw < 0) || (ssh < 0))
      return;
@ -424,6 +341,9 @@ __imlib_BlendImageToImageAtAngle(ImlibImage *im_src, ImlibImage *im_dst,
      x += _ROTATE_PREC_MAX;
      y += _ROTATE_PREC_MAX;
   }
+#ifdef DO_MMX_ASM
+   do_mmx = __imlib_get_cpuid() & CPUID_MMX;
+#endif
   for (i = 0; i < im_dst->h; i += LINESIZE) {
      int x2, y2, w, h, l, r;
      
@ -496,16 +416,22 @@ __imlib_BlendImageToImageAtAngle(ImlibImage *im_src, ImlibImage *im_dst,
      
      w = r - l;
      h = MIN(LINESIZE, im_dst->h - i);
-      memset(data, 0, h * w * sizeof(DATA32));
      x += l * dx;
      y += l * dy;
      if (aa) {
-	 __imlib_RotateAA(src, data, im_src->w, ssw, ssh, w, w, h,
-			  x - _ROTATE_PREC_MAX, y - _ROTATE_PREC_MAX, dx, dy);
+	 x -= _ROTATE_PREC_MAX; y -= _ROTATE_PREC_MAX;
+#ifdef DO_MMX_ASM
+	 if (do_mmx)
+	    __imlib_mmx_RotateAA(src, data, im_src->w, ssw, ssh, w, w, h,
+				 x, y, dx, dy);
+	 else
+#endif
+	    __imlib_RotateAA(src, data, im_src->w, ssw, ssh, w, w, h,
+			     x, y, dx, dy);
 	 
      } else {
-	 __imlib_RotateSample(src, data, im_src->w,
-			      ssw, ssh, w, w, h, x, y, dx, dy);
+	 __imlib_RotateSample(src, data, im_src->w, ssw, ssh, w, w, h,
+			      x, y, dx, dy);
 	 
      }
      __imlib_BlendRGBAToData(data, w, h, im_dst->data,
--- a/src/rotate.h
+++ b/src/rotate.h
@ -21,4 +21,8 @@ void __imlib_BlendImageToImageAtAngle(ImlibImage *im_src, ImlibImage *im_dst,
 				      int ddx, int ddy, int ddw, int ddh,
 				      ImlibColorModifier *cm, ImlibOp op);

+#ifdef DO_MMX_ASM
+void __imlib_mmx_RotateAA(DATA32 *src, DATA32 *dest, int sow, int sw, int sh,
+		      int dow, int dw, int dh, int x, int y, int dx, int dy);
+#endif
 #endif
--- a/src/scale.c
+++ b/src/scale.c
--- a/src/scale.h
+++ b/src/scale.h
@ -1,21 +1,22 @@
 #ifndef __SCALE
 #define __SCALE 1

-DATA32 **
-__imlib_CalcYPoints(DATA32 *src, int sw, int sh, int dh, int b1, int b2);
-int *
-__imlib_CalcXPoints(int sw, int dw, int b1, int b2);
-int *
-__imlib_CalcApoints(int s, int d, int b1, int b2);
+typedef struct _imlib_scale_info ImlibScaleInfo;
+
+ImlibScaleInfo *
+__imlib_CalcScaleInfo(ImlibImage *im, int sw, int sh, int dw, int dh, char aa);
+ImlibScaleInfo *
+__imlib_FreeScaleInfo(ImlibScaleInfo *isi);
 void
-__imlib_ScaleSampleRGBA(DATA32 **ypoints, int *xpoints, DATA32 *dest,
-		int dxx, int dyy, int dx, int dy, int dw, int dh, int dow);
+__imlib_ScaleSampleRGBA(ImlibScaleInfo *isi, DATA32 *dest, int dxx, int dyy,
+			int dx, int dy, int dw, int dh, int dow);
 void
-__imlib_ScaleAARGBA(DATA32 **ypoints, int *xpoints, DATA32 *dest,
-	    int *xapoints, int *yapoints, char xup, char yup,
-	    int dxx, int dyy, int dx, int dy, int dw, int dh, int dow, int sow);
+__imlib_ScaleAARGBA(ImlibScaleInfo *isi, DATA32 *dest, int dxx, int dyy,
+		    int dx, int dy, int dw, int dh, int dow, int sow);
 void
-__imlib_ScaleAARGB(DATA32 **ypoints, int *xpoints, DATA32 *dest,
-	   int *xapoints, int *yapoints, char xup, char yup,
-	   int dxx, int dyy, int dx, int dy, int dw, int dh, int dow, int sow);
+__imlib_ScaleAARGB(ImlibScaleInfo *isi, DATA32 *dest, int dxx, int dyy,
+		   int dx, int dy, int dw, int dh, int dow, int sow);
+void
+__imlib_Scale_mmx_AARGBA(ImlibScaleInfo *isi, DATA32 *dest, int dxx, int dyy,
+			 int dx, int dy, int dw, int dh, int dow, int sow);
 #endif