From 2bf270814494e38ffa3d98e3e8f6c10bac9249bd Mon Sep 17 00:00:00 2001
From: Carsten Haitzler <raster@rasterman.com>
Date: Fri, 15 Apr 2005 07:00:40 +0000
Subject: [PATCH] John Slaten's amd64 mmx patch

SVN revision: 14207
---
 configure.in               |     2 +
 src/lib/Makefile.am        |     3 +-
 src/lib/amd64_blend_cmod.S | 16825 +++++++++++++++++++++++++++++++++++
 src/lib/blend.c            |    52 +-
 src/lib/blend.h            |    85 +
 5 files changed, 16950 insertions(+), 17 deletions(-)
 create mode 100644 src/lib/amd64_blend_cmod.S
diff --git a/configure.in b/configure.in
index 83c0f87..c310942 100644
--- a/configure.in
+++ b/configure.in
@@ -104,6 +104,8 @@ AC_ARG_ENABLE(mmx,[  --enable-mmx           attempt compiling using mmx assembly
 [
   if test x$enableval = xyes; then
     mmx=yes
+    # Cannot compile with both options enabled
+    amd64=no
     AC_MSG_RESULT(enabling mmx support)
   else
     mmx=no
diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am
index fb896c7..d191a04 100644
--- a/src/lib/Makefile.am
+++ b/src/lib/Makefile.am
@@ -75,7 +75,8 @@ asm_rotate.S \
 asm_scale.S
 
 AMD64_SRCS = \
-amd64_blend.S
+amd64_blend.S \
+amd64_blend_cmod.S
 
 MMX_OBJS = $(MMX_SRCS:.S=.lo)
 AMD64_OBJS = $(AMD64_SRCS:.S=.lo)
diff --git a/src/lib/amd64_blend_cmod.S b/src/lib/amd64_blend_cmod.S
new file mode 100644
index 0000000..46a95f6
--- /dev/null
+++ b/src/lib/amd64_blend_cmod.S
@@ -0,0 +1,16825 @@
+#include <config.h>
+
+#ifdef __EMX__
+/* Due to strange behaviour of as.exe we use these macros (three leading underscores) */
+/* For all OS/2 coders - please use PGCC to compile this code */
+#define PR_(foo) ___##foo /* assembler symbol name for foo */
+#define PT_(foo,func) ___##foo,##func /* "symbol,type" operand pair for .type */
+#define SIZE(sym)                              \
+	.___end_##sym:;                        \
+	.size ___##sym,.___end_##sym-___##sym; \
+	.align 16;
+#else /* not __EMX__: same macros with two leading underscores */
+#define PR_(foo) __##foo /* assembler symbol name for foo */
+#define PT_(foo,func) __##foo,##func /* "symbol,type" operand pair for .type */
+#define SIZE(sym)                           \
+	.__end_##sym:;                      \
+	.size __##sym,.__end_##sym-__##sym; \
+	.align 16;
+#endif /* __EMX__ */
+
+#ifdef DO_AMD64_ASM
+
+/*\ 
+|*| AMD64 SSE2 assembly blending routines for Imlib2
+|*| Written by John Slaten <zartheenumerator@comcast.net>
+|*| Based on MMX routines written by Willem Monsuwe <willem@stack.nl>
+\*/
+
+/*\ All functions have the same calling convention:
+|*|  __imlib_amd64_<op>_rgba_to_rgb[A](void *src, int sw, void *dst, int dw,
+|*|			               int w, int h, ImlibColorModifier *cm)
+|*| AMD64 GCC passes the first six parameters in registers (cm arrives on the stack), so no aliases exist in this version.
+\*/
+
+.text
+        .align 16
+.globl PR_(imlib_amd64_blend_rgba_to_rgb_cmod)
+	.type PT_(imlib_amd64_blend_rgba_to_rgb_cmod,@function)
+.globl PR_(imlib_amd64_blend_rgba_to_rgba_cmod)
+	.type PT_(imlib_amd64_blend_rgba_to_rgba_cmod,@function)
+.globl PR_(imlib_amd64_blend_rgb_to_rgba_cmod)
+	.type PT_(imlib_amd64_blend_rgb_to_rgba_cmod,@function)
+.globl PR_(imlib_amd64_blend_rgb_to_rgb_cmod)
+	.type PT_(imlib_amd64_blend_rgb_to_rgb_cmod,@function)
+.globl PR_(imlib_amd64_copy_rgba_to_rgb_cmod)
+	.type PT_(imlib_amd64_copy_rgba_to_rgb_cmod,@function)
+.globl PR_(imlib_amd64_copy_rgba_to_rgba_cmod)
+	.type PT_(imlib_amd64_copy_rgba_to_rgba_cmod,@function)
+.globl PR_(imlib_amd64_copy_rgb_to_rgba_cmod)
+	.type PT_(imlib_amd64_copy_rgb_to_rgba_cmod,@function)
+
+.globl PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod)
+	.type PT_(imlib_amd64_add_blend_rgba_to_rgb_cmod,@function)
+.globl PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod)
+	.type PT_(imlib_amd64_add_blend_rgba_to_rgba_cmod,@function)
+.globl PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod)
+	.type PT_(imlib_amd64_add_blend_rgb_to_rgba_cmod,@function)
+.globl PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod)
+	.type PT_(imlib_amd64_add_blend_rgb_to_rgb_cmod,@function)
+.globl PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod)
+	.type PT_(imlib_amd64_add_copy_rgba_to_rgb_cmod,@function)
+.globl PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod)
+	.type PT_(imlib_amd64_add_copy_rgba_to_rgba_cmod,@function)
+.globl PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod)
+	.type PT_(imlib_amd64_add_copy_rgb_to_rgba_cmod,@function)
+
+.globl PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod)
+	.type PT_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod,@function)
+.globl PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod)
+	.type PT_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod,@function)
+.globl PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod)
+	.type PT_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod,@function)
+.globl PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod)
+	.type PT_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod,@function)
+.globl PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod)
+	.type PT_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod,@function)
+.globl PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod)
+	.type PT_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod,@function)
+.globl PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod)
+	.type PT_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod,@function)
+
+.globl PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod)
+	.type PT_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod,@function)
+.globl PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod)
+	.type PT_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod,@function)
+.globl PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod)
+	.type PT_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod,@function)
+.globl PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod)
+	.type PT_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod,@function)
+.globl PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod)
+	.type PT_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod,@function)
+.globl PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod)
+	.type PT_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod,@function)
+.globl PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod)
+	.type PT_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod,@function)
+
+.extern pow_lut
+	
+/*\ Some useful masks; each one is 16 bytes: two rows of 8 .byte values (c1: 8 words) \*/
+m0X000000: .byte   0,   0,   0,   0,   0,   0, 255,   0
+	   .byte   0,   0,   0,   0,   0,   0, 255,   0
+m10000000: .byte   0,   0,   0,   0,   0,   0,   0,   1
+	   .byte   0,   0,   0,   0,   0,   0,   0,   1
+m00XXXXXX: .byte 255, 255, 255, 255, 255, 255,   0,   0 /* words 0-2 set, word 3 clear, per 8-byte half */
+	   .byte 255, 255, 255, 255, 255, 255,   0,   0
+mVX000000: .byte   0,   0,   0,   0,   0,   0, 255, 127
+	   .byte   0,   0,   0,   0,   0,   0, 255, 127
+mV0000000: .byte   0,   0,   0,   0,   0,   0,   0, 128
+	   .byte   0,   0,   0,   0,   0,   0,   0, 128
+mX000X000:         .byte   0,   0,   0,   0,   0,   0, 255, 255
+		   .byte   0,   0,   0,   0,   0,   0, 255, 255
+m0XXX0XXX0XXX0XXX: .byte 255, 255, 255,   0, 255, 255, 255,   0
+		   .byte 255, 255, 255,   0, 255, 255, 255,   0
+m0XXX0XXX00000000: .byte 255, 255, 255,   0, 255, 255, 255,   0
+		   .byte   0,   0,   0,   0,   0,   0,   0,   0
+m0XXX000000000000: .byte 255, 255, 255,   0,   0,   0,   0,   0
+		   .byte   0,   0,   0,   0,   0,   0,   0,   0
+mX000X000X000X000: .byte   0,   0,   0, 255,   0,   0,   0, 255 /* byte 3 of each of four 32-bit pixels */
+		   .byte   0,   0,   0, 255,   0,   0,   0, 255
+mX000X00000000000: .byte   0,   0,   0, 255,   0,   0,   0, 255
+		   .byte   0,   0,   0, 255,   0,   0,   0, 255 /* NOTE(review): same bytes as mX000X000X000X000; by analogy with m0XXX0XXX00000000 this row looks like it should be zero - confirm against the users */
+mX000000000000000: .byte   0,   0,   0, 255,   0,   0,   0, 255
+		   .byte   0,   0,   0, 255,   0,   0,   0, 255 /* NOTE(review): also identical to mX000X000X000X000; the name suggests one pixel only - confirm */
+m1000100010001000: .byte   0,   0,   0,   1,   0,   0,   0,   1
+		   .byte   0,   0,   0,   1,   0,   0,   0,   1
+m000V0V0V000V0V0V: .byte 127,   0, 127,   0, 127,   0,   0,   0
+		   .byte 127,   0, 127,   0, 127,   0,   0,   0
+mI0000000I0000000: .byte   0,   0,   0,   0,   0,   0,   0,  64
+		   .byte   0,   0,   0,   0,   0,   0,   0,  64
+m0VVV0VVV0VVV0VVV: .byte 127, 127, 127,   0, 127, 127, 127,   0
+		   .byte 127, 127, 127,   0, 127, 127, 127,   0
+c1: .word 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1 /* 1 in each of the eight words */
+
+/*\ SSE register use:
+|*| %xmm1 = Source value
+|*| %xmm2 = Destination value
+|*| %xmm3 = Alpha value
+|*| %xmm4 = 0
+|*| %xmm5-%xmm7 = masks
+\*/
+
+/*\ Variables:
+|*| %rsi = src
+|*| %rdi = dst
+|*| %r8d = w
+|*| %r9d = h
+|*| %r10d = sw
+|*| %r11d = dw
+\*/
+	
+
+
+
+
+
+
+
+
+#define ENTER		\
+	pushq %rbp	; \
+	movq %rsp, %rbp	; \
+	pushq %rbx	; /* %rbx, %r13, %r14 are callee-saved, used as scratch */ \
+	pushq %r13	; \
+	pushq %r14	; \
+	movslq %esi, %r10 ; /* sw is an int: upper half of %rsi is unspecified */ \
+	movslq %ecx, %r11 ; /* dw, likewise sign-extended to 64 bits */ \
+	movq %rdi, %rsi	; /* %rsi = src */ \
+	movq %rdx, %rdi	; /* %rdi = dst */ \
+	movq 16(%rbp), %r14 ; /* %r14 = cm, the 7th argument, passed on the stack */ \
+	movslq %r8d, %r8 ; /* w, sign-extended before 64-bit index arithmetic */ \
+	movslq %r9d, %r9 ; /* h, sign-extended before it is used as a row counter */ \
+	testq %r8, %r8	; /* param sanity check: leave unless w > 0 and h > 0 */ \
+	jle 9f		; \
+	testq %r9, %r9	; \
+	jle 9f
+	
+#define LEAVE		\
+	/* undo ENTER: pop the saved registers in reverse order, drop the frame */ \
+	popq %r14	; \
+	popq %r13	; \
+	popq %rbx	; \
+	leave		; \
+	ret
+
+
+PR_(imlib_amd64_blend_rgba_to_rgb_cmod): /* blend colour-modified src over dst; dst byte 3 (alpha) is kept */
+	ENTER
+
+	pxor %xmm4, %xmm4 /* %xmm4 = 0, the partner for byte<->word (un)packing */
+	movdqa c1(%rip), %xmm5 /* %xmm5 = 1 in every word: the rounding term added after the shift left */
+	movdqa m00XXXXXX(%rip), %xmm6 /* %xmm6 keeps words 0-2 and clears word 3 of each pixel */
+
+	/* Point src/dst just past the first row; a negative index in */ 
+	/* %rcx then walks each row left to right in two pixel chunks */ 
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* Back up one pixel so the last pixel of a row is at %rcx = 0 */ 
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			
+0:					/* start of a row */
+	movq %r8, %rcx			
+					
+	incq %rcx			/* %rcx = 1 - w, index of the first pixel */
+
+	/* prefetch the start of the row; prefetches leave the flags of incq intact */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	
+1:
+	/* main loop: 8 unrolled copies of the two-pixel blend, 64 bytes per pass */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod (each byte goes through a table in cm) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax /* same as a rotate left by 8: byte 7 (top byte of 2nd pixel) lands in %al */
+	movzbq %al, %rbx /* %rbx = table index; only %bl is rewritten below, upper bits stay 0 */
+	movzbq 0x300(%r14, %rbx), %rdx /* byte 3 of a pixel maps through the table at cm + 0x300 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl /* byte 2 maps through cm + 0x000 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl /* byte 1 maps through cm + 0x100 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl /* byte 0 maps through cm + 0x200 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl /* now the same four lookups for the 1st pixel */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1 /* %xmm1 = two colour-modified src pixels */
+	movq (%rdi, %rcx, 4), %xmm2 /* %xmm2 = the two dst pixels */
+	/* Get alpha from source and unpack to words
+	 * The result range is [0, 0x7fff]; it is mapped to fixed
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * pmulhw is a signed multiply, so alpha is shifted right one
+	 *  here and the other factor is shifted left to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3 /* every byte doubled into a word */
+	pshufhw $0xFF, %xmm3, %xmm3 /* word 7 (2nd pixel, byte 3) copied over the high four words */
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero the alpha-word multiplier so that the blend adds nothing
+	 * to the destination alpha, leaving it unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f /* index 0: exactly one pixel is left in the row */
+	jns 3f				/* index > 0: the row is finished */
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * The result range is [0, 0x7fff]; it is mapped to fixed
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * pmulhw is a signed multiply, so alpha is shifted right one
+	 *  here and the other factor is shifted left to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero the alpha-word multiplier so that the blend adds nothing
+	 * to the destination alpha, leaving it unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * The result range is [0, 0x7fff]; it is mapped to fixed
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * pmulhw is a signed multiply, so alpha is shifted right one
+	 *  here and the other factor is shifted left to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero the alpha-word multiplier so that the blend adds nothing
+	 * to the destination alpha, leaving it unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * The result range is [0, 0x7fff]; it is mapped to fixed
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * pmulhw is a signed multiply, so alpha is shifted right one
+	 *  here and the other factor is shifted left to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero the alpha-word multiplier so that the blend adds nothing
+	 * to the destination alpha, leaving it unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * The result range is [0, 0x7fff]; it is mapped to fixed
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * pmulhw is a signed multiply, so alpha is shifted right one
+	 *  here and the other factor is shifted left to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero the alpha-word multiplier so that the blend adds nothing
+	 * to the destination alpha, leaving it unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * The result range is [0, 0x7fff]; it is mapped to fixed
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * pmulhw is a signed multiply, so alpha is shifted right one
+	 *  here and the other factor is shifted left to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero the alpha-word multiplier so that the blend adds nothing
+	 * to the destination alpha, leaving it unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * The result range is [0, 0x7fff]; it is mapped to fixed
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * pmulhw is a signed multiply, so alpha is shifted right one
+	 *  here and the other factor is shifted left to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero the alpha-word multiplier so that the blend adds nothing
+	 * to the destination alpha, leaving it unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * The result range is [0, 0x7fff]; it is mapped to fixed
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * pmulhw is a signed multiply, so alpha is shifted right one
+	 *  here and the other factor is shifted left to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero the alpha-word multiplier so that the blend adds nothing
+	 * to the destination alpha, leaving it unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * The result range is [0, 0x7fff]; it is mapped to fixed
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * pmulhw is a signed multiply, so alpha is shifted right one
+	 *  here and the other factor is shifted left to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero the alpha-word multiplier so that the blend adds nothing
+	 * to the destination alpha, leaving it unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				/* still two or more pixels left: next pass */
+	jnz 3f				/* index > 0: row finished; index 0 falls into the tail */
+2:
+	/* Grab 1 pixel from src, with colormod */
+	movl (%rsi, %rcx, 4), %eax
+	ror $24, %eax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1
+	movd (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * The result range is [0, 0x7fff]; it is mapped to fixed
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * pmulhw is a signed multiply, so alpha is shifted right one
+	 *  here and the other factor is shifted left to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero the alpha-word multiplier so that the blend adds nothing
+	 * to the destination alpha, leaving it unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)
+3:					/* end of row */
+	leaq (%rsi, %r10, 4), %rsi	/* src advances by sw pixels */
+	leaq (%rdi, %r11, 4), %rdi	/* dst advances by dw pixels */
+	decq %r9			/* one row fewer to do */
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_blend_rgba_to_rgb_cmod)
+PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
+	ENTER
+
+	pxor %xmm4, %xmm4
+	movdqa c1(%rip), %xmm5
+	xorq %rax, %rax
+	movdqa mX000X000X000X000(%rip), %xmm6
+	movq pow_lut@GOTPCREL(%rip), %r13
+
+	/* Move right to left across each line, */ 
+	/* processing in two pixel chunks */ 
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* Last instruction is %rcx = 0 */ 
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			
+0:					
+	movq %r8, %rcx			
+					
+	incq %rcx			
+
+	/* prefetch a couple cache lines ahead */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	
+1:
+	/* main loop, unrolled to work on 64 byte chunks */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* override source alpha to 255 */
+	por %xmm6, %xmm1
+
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* unpack source and dest */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* override source alpha to 255 */
+	por %xmm6, %xmm1
+
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* unpack source and dest */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* override source alpha to 255 */
+	por %xmm6, %xmm1
+
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* unpack source and dest */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* override source alpha to 255 */
+	por %xmm6, %xmm1
+
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* unpack source and dest */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* override source alpha to 255 */
+	por %xmm6, %xmm1
+
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* unpack source and dest */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* override source alpha to 255 */
+	por %xmm6, %xmm1
+
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* unpack source and dest */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* override source alpha to 255 */
+	por %xmm6, %xmm1
+
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* unpack source and dest */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* override source alpha to 255 */
+	por %xmm6, %xmm1
+
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* unpack source and dest */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				
+	jnz 3f				
+2:
+	/* Grab 1 pixel from src, with colormod */
+	movl (%rsi, %rcx, 4), %eax
+	ror $24, %eax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1
+	movd (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	roll $16, %edx
+	andl $0x0000ff00, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	movd %eax, %xmm3
+	/* override source alpha to 255 */
+	por %xmm6, %xmm1
+
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* unpack source and dest */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* repack new pixels */
+	packuswb %xmm4, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)
+3:					
+	leaq (%rsi, %r10, 4), %rsi	
+	leaq (%rdi, %r11, 4), %rdi	
+	decq %r9			
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_blend_rgba_to_rgba_cmod)
+PR_(imlib_amd64_blend_rgb_to_rgba_cmod):	/* Blend colour-modified source pixels onto an RGBA destination; the source alpha byte is never read, modifier table 0x300 is indexed with the constant 255 instead */
+	ENTER	/* macro defined outside this view; NOTE(review): presumably saves registers and sets up %rsi, %rdi, %r8-%r11 and %r14 - confirm */
+	/* Registers as used below: %rsi = src, %rdi = dst, %r8 = pixels per line, %r9 = lines left, %r10 / %r11 = src / dst step to the next line (in pixels), %r14 = base of four 256-byte modifier tables, %r13 = pow_lut */
+	pxor %xmm4, %xmm4	/* xmm4 = 0, used to unpack bytes to words and to repack */
+	movdqa c1(%rip), %xmm5	/* xmm5 = constant c1 (defined elsewhere); added before the multiply, presumably the 0.5 rounding term */
+	xorq %rax, %rax	/* clear rax; it is reloaded from the source before its first use */
+	movdqa mX000X000X000X000(%rip), %xmm6	/* mask OR-ed into the source pixels to force their alpha (see "override source alpha") */
+	movq pow_lut@GOTPCREL(%rip), %r13	/* r13 = address of pow_lut, fetched through the GOT */
+
+	/* Point src and dst at the end of the line and index them with a */
+	/* negative pixel offset that counts up to 0, two pixels per step */
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* Back up one pixel so that the last pixel of the line is at %rcx = 0 */
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8	/* r8 = -width; reloaded into rcx at the start of every line */
+0:	/* start of one line */
+	movq %r8, %rcx			
+					
+	incq %rcx	/* rcx = -(width - 1): offset of the first pixel of the line */
+
+	/* prefetch a couple cache lines ahead (the prefetches leave the
+	 * flags of the incq above untouched for the jz below) */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line: rcx is already 0 */
+1:
+	/* main loop, unrolled to work on 64 byte chunks */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Unrolled copy 1 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax	/* rax = two source pixels (8 bytes) */
+	rorq $48, %rax	/* al = byte 6 of the pair, i.e. byte 2 of the second pixel */
+	movq $0x000000FF, %rbx	/* rbx = 255: fixed index used in place of the source alpha */
+	movzbq 0x300(%r14, %rbx), %rdx	/* rdx = entry 255 of table 0x300 (also clears the rest of rdx) */
+	shlq $8, %rdx	/* make room for the next byte of the result */
+	movb %al, %bl	/* rbx = pixel byte as table index (upper bits of rbx stay 0) */
+	movb 0x000(%r14, %rbx), %dl	/* table 0x000 is indexed by pixel byte 2 */
+	shlq $8, %rdx
+	rolq $8, %rax	/* al = byte 5 (byte 1 of the second pixel) */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* table 0x100 is indexed by pixel byte 1 */
+	shlq $8, %rdx
+	rolq $8, %rax	/* al = byte 4 (byte 0 of the second pixel) */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* table 0x200 is indexed by pixel byte 0 */
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx	/* again index 255 for the alpha of the first pixel */
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax	/* skip byte 3 (source alpha, unused); al = byte 2 of the first pixel */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax	/* al = byte 1 of the first pixel */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax	/* al = byte 0 of the first pixel */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1	/* xmm1 = the two modified source pixels, same byte order as in memory */
+	movq (%rdi, %rcx, 4), %xmm2	/* xmm2 = the two destination pixels */
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax	/* work on a copy; al and ah of each half are replaced below */
+	andl $0xff000000, %edx	/* keep only the alpha byte of the first pixel */
+	roll $16, %edx	/* edx = alpha << 8, so dh = modified source alpha */
+	movb 3(%rdi, %rcx, 4), %dl	/* dl = byte 3 (alpha) of the first destination pixel */
+	movb (%r13, %rdx), %al	/* al = pow_lut byte at offset (source alpha * 256 + dest alpha) */
+	movb %dh, %ah	/* ah = modified source alpha */
+
+	rolq $32, %rax	/* swap halves to handle the second pixel the same way */
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl	/* byte 3 (alpha) of the second destination pixel */
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax	/* back to the original pixel order */
+	movd %rax, %xmm3	/* per pixel: byte 0 = pow_lut value, byte 1 = source alpha */
+	/* override source alpha to 255 */
+	por %xmm6, %xmm1
+
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3	/* every byte doubled into a word */
+	pshufhw $0x40, %xmm3, %xmm3	/* second pixel: words 4-6 = pow_lut value, word 7 = source alpha */
+	pshuflw $0x40, %xmm3, %xmm3	/* first pixel: words 0-2 = pow_lut value, word 3 = source alpha */
+	psrlw $1, %xmm3	/* halve so the factors are non-negative for the signed pmulhw */
+
+	/* unpack source and dest */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1	/* s - d */
+	psllw $1, %xmm1	/* doubled, compensating for the halved factor in xmm3 */
+	paddw %xmm5, %xmm1	/* add constant c1 */
+	pmulhw %xmm3, %xmm1	/* keep the high word of each signed product */
+	paddsw %xmm1, %xmm2	/* d += result, with signed saturation */
+
+	/* repack new pixels */
+	packuswb %xmm4, %xmm2	/* words back to bytes with unsigned saturation */
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx	/* advanced by two pixels; the flags tested below come from this incq */
+	jz 2f	/* rcx == 0: exactly one pixel left on this line */
+	jns 3f	/* rcx > 0: line finished */
+
+	/* Unrolled copy 2 of 8 (same sequence as copy 1): grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* override source alpha to 255 */
+	por %xmm6, %xmm1
+
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* unpack source and dest */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 3 of 8 (same sequence as copy 1): grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* override source alpha to 255 */
+	por %xmm6, %xmm1
+
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* unpack source and dest */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 4 of 8 (same sequence as copy 1): grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* override source alpha to 255 */
+	por %xmm6, %xmm1
+
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* unpack source and dest */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 5 of 8 (same sequence as copy 1): grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* override source alpha to 255 */
+	por %xmm6, %xmm1
+
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* unpack source and dest */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 6 of 8 (same sequence as copy 1): grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* override source alpha to 255 */
+	por %xmm6, %xmm1
+
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* unpack source and dest */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 7 of 8 (same sequence as copy 1): grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* override source alpha to 255 */
+	por %xmm6, %xmm1
+
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* unpack source and dest */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 8 of 8 (same sequence as copy 1): grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* override source alpha to 255 */
+	por %xmm6, %xmm1
+
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* unpack source and dest */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b	/* rcx < 0: at least two pixels left, run the unrolled body again */
+	jnz 3f	/* rcx > 0: line finished; rcx == 0 falls through to the single pixel */
+2:
+	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
+	movl (%rsi, %rcx, 4), %eax	/* eax = last source pixel of the line (rcx is 0 here) */
+	ror $16, %eax	/* al = pixel byte 2 */
+	movq $0x000000FF, %rbx	/* rbx = 255: fixed index used in place of the source alpha */
+	movzbq 0x300(%r14, %rbx), %rdx	/* rdx = entry 255 of table 0x300 */
+	shl $8, %edx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* table 0x000 indexed by pixel byte 2 */
+	shl $8, %edx
+	rol $8, %eax	/* al = pixel byte 1 */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* table 0x100 indexed by pixel byte 1 */
+	shl $8, %edx
+	rol $8, %eax	/* al = pixel byte 0 */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* table 0x200 indexed by pixel byte 0 */
+	movd %edx, %xmm1	/* xmm1 = the modified source pixel */
+	movd (%rdi, %rcx, 4), %xmm2	/* xmm2 = the destination pixel */
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	roll $16, %edx	/* bring the alpha byte (bits 31-24) down to dh */
+	andl $0x0000ff00, %edx	/* edx = modified source alpha << 8 */
+	movb 3(%rdi, %rcx, 4), %dl	/* dl = byte 3 (alpha) of the destination pixel */
+	movb (%r13, %rdx), %al	/* al = pow_lut byte at offset (source alpha * 256 + dest alpha) */
+	movb %dh, %ah	/* ah = modified source alpha */
+	movd %eax, %xmm3
+	/* override source alpha to 255 */
+	por %xmm6, %xmm1
+
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	/* words 0-2 = pow_lut value, word 3 = source alpha */
+	psrlw $1, %xmm3
+
+	/* unpack source and dest */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* repack new pixels */
+	packuswb %xmm4, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)	/* store only the single pixel */
+3:	/* end of line */
+	leaq (%rsi, %r10, 4), %rsi	/* step src by r10 pixels; NOTE(review): presumably the source row pitch - confirm */
+	leaq (%rdi, %r11, 4), %rdi	/* step dst by r11 pixels; NOTE(review): presumably the destination row pitch - confirm */
+	decq %r9	/* one more line done */
+	jnz 0b
+
+9:	/* not referenced in the visible body; NOTE(review): presumably an early-exit target for the ENTER macro - confirm */
+	LEAVE
+SIZE(imlib_amd64_blend_rgb_to_rgba_cmod)
+PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
+	ENTER
+
+	pxor %xmm4, %xmm4
+	movdqa c1(%rip), %xmm5
+	movdqa m00XXXXXX(%rip), %xmm6
+
+	/* Move right to left across each line, */ 
+	/* processing in two pixel chunks */ 
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* Last instruction is %rcx = 0 */ 
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			
+0:					
+	movq %r8, %rcx			
+					
+	incq %rcx			
+
+	/* prefetch a couple cache lines ahead */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	
+1:
+	/* main loop, unrolled to work on 64 byte chunks */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				
+	jnz 3f				
+2:
+	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
+	movl (%rsi, %rcx, 4), %eax
+	ror $16, %eax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1
+	movd (%rdi, %rcx, 4), %xmm2 
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)
+3:					
+	leaq (%rsi, %r10, 4), %rsi	
+	leaq (%rdi, %r11, 4), %rdi	
+	decq %r9			
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_blend_rgb_to_rgb_cmod)
+PR_(imlib_amd64_copy_rgba_to_rgb_cmod):
+	ENTER
+	/* Per pixel: dst bytes 2,1,0 = lookups of src bytes 2,1,0 in the 256-byte tables at 0x000,0x100,0x200(%r14); dst bits under the %r13 mask are kept. */
+	movq mX000X000X000X000(%rip), %r13	/* dst bits to preserve; presumably byte 3 (alpha) of each pixel - confirm against the constant */
+	/* Registers as used below (set up before this point, presumably by ENTER - confirm): %rsi = src, %rdi = dst, %r8 = pixels per row, %r9 = rows, %r10/%r11 = src/dst row advance in pixels. */
+	/* Address rows relative to their end: %rsi/%rdi + 4 * %r8 is one past the last pixel. */ 
+	/* %rcx then runs from -(%r8 - 1) up to 0, i.e. in ascending address order, two pixels per step. */ 
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* Step back one pixel so the last pixel of the row is at %rcx = 0 */ 
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			/* %r8 = -(pixels per row) from here on */
+0:					/* start of one row */
+	movq %r8, %rcx			
+					
+	incq %rcx			/* %rcx = index of the first pixel; ZF is set when the row has exactly one pixel */
+
+	/* prefetch a couple cache lines ahead */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	
+1:
+	/* main loop, unrolled to work on 64 byte chunks */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod, with a = 0 */
+	movq (%rsi, %rcx, 4), %rax	/* low dword = first pixel, high dword = second pixel */
+	rorq $48, %rax	/* %al = byte 2 of the second pixel */
+	movzbq %al, %rbx	/* zero-extended table index; the later movb %al, %bl only replace its low byte */
+	movzbq 0x000(%r14, %rbx), %rdx	/* table at 0x000; upper bytes of %rdx start out as 0 */
+	shlq $8, %rdx
+	rolq $8, %rax	/* %al = byte 1 of the second pixel */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* table at 0x100 */
+	shlq $8, %rdx
+	rolq $8, %rax	/* %al = byte 0 of the second pixel */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* table at 0x200 */
+	shlq $16, %rdx	/* extra 8 bits leave a zero byte that ends up as byte 3 of the first pixel */
+	rolq $16, %rax	/* skip byte 3 of the first pixel; %al = byte 2 of the first pixel */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax	/* %al = byte 1 of the first pixel */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax	/* %al = byte 0 of the first pixel */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq (%rdi, %rcx, 4), %rax	/* current dst pixels */
+	andq %r13, %rax	/* keep only the dst bits selected by the mask */
+	orq %rax, %rdx	/* merge them with the remapped src bytes */
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f	/* %rcx = 0: exactly one pixel left in the row */
+	jns 3f				/* %rcx > 0: row finished */
+
+	/* Grab 2 pixels from src, with colormod, with a = 0 */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movzbq %al, %rbx
+	movzbq 0x000(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $16, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq (%rdi, %rcx, 4), %rax
+	andq %r13, %rax
+	orq %rax, %rdx
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = 0 */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movzbq %al, %rbx
+	movzbq 0x000(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $16, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq (%rdi, %rcx, 4), %rax
+	andq %r13, %rax
+	orq %rax, %rdx
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = 0 */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movzbq %al, %rbx
+	movzbq 0x000(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $16, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq (%rdi, %rcx, 4), %rax
+	andq %r13, %rax
+	orq %rax, %rdx
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = 0 */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movzbq %al, %rbx
+	movzbq 0x000(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $16, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq (%rdi, %rcx, 4), %rax
+	andq %r13, %rax
+	orq %rax, %rdx
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = 0 */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movzbq %al, %rbx
+	movzbq 0x000(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $16, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq (%rdi, %rcx, 4), %rax
+	andq %r13, %rax
+	orq %rax, %rdx
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = 0 */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movzbq %al, %rbx
+	movzbq 0x000(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $16, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq (%rdi, %rcx, 4), %rax
+	andq %r13, %rax
+	orq %rax, %rdx
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = 0 */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movzbq %al, %rbx
+	movzbq 0x000(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $16, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq (%rdi, %rcx, 4), %rax
+	andq %r13, %rax
+	orq %rax, %rdx
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				/* %rcx < 0: at least two pixels left, run another 64 byte chunk */
+	jnz 3f				/* %rcx > 0: row finished; %rcx = 0 falls through to the last pixel */
+2:
+	/* Grab 1 pixel from src, with colormod, with a = 0 */
+	movl (%rsi, %rcx, 4), %eax
+	ror $16, %eax	/* %al = byte 2 of the pixel */
+	movzbq %al, %rbx
+	movzbq 0x000(%r14, %rbx), %rdx
+	shl $8, %edx
+	rol $8, %eax	/* %al = byte 1 */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax	/* %al = byte 0 */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movl (%rdi, %rcx, 4), %eax	/* current dst pixel; the 32 bit load clears the upper half of %rax */
+	andq %r13, %rax
+	orq %rax, %rdx
+	movl %edx, (%rdi, %rcx, 4)
+3:					/* end of row */
+	leaq (%rsi, %r10, 4), %rsi	/* advance src by %r10 pixels */
+	leaq (%rdi, %r11, 4), %rdi	/* advance dst by %r11 pixels */
+	decq %r9			/* one row done */
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_copy_rgba_to_rgb_cmod)
+PR_(imlib_amd64_copy_rgba_to_rgba_cmod):
+	ENTER
+	/* Per pixel: dst byte 3 = lookup of src byte 3 in the table at 0x300(%r14); dst bytes 2,1,0 = lookups of src bytes 2,1,0 at 0x000,0x100,0x200(%r14). */
+	/* Registers as used below (set up before this point, presumably by ENTER - confirm): %rsi = src, %rdi = dst, %r8 = pixels per row, %r9 = rows, %r10/%r11 = src/dst row advance in pixels. */
+	/* Address rows relative to their end: %rsi/%rdi + 4 * %r8 is one past the last pixel. */ 
+	/* %rcx then runs from -(%r8 - 1) up to 0, i.e. in ascending address order, two pixels per step. */ 
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* Step back one pixel so the last pixel of the row is at %rcx = 0 */ 
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			/* %r8 = -(pixels per row) from here on */
+0:					/* start of one row */
+	movq %r8, %rcx			
+					
+	incq %rcx			/* %rcx = index of the first pixel; ZF is set when the row has exactly one pixel */
+
+	/* prefetch a couple cache lines ahead */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	
+1:
+	/* main loop, unrolled to work on 64 byte chunks */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax	/* low dword = first pixel, high dword = second pixel */
+	rorq $56, %rax	/* %al = byte 3 of the second pixel */
+	movzbq %al, %rbx	/* zero-extended table index; the later movb %al, %bl only replace its low byte */
+	movzbq 0x300(%r14, %rbx), %rdx	/* table at 0x300; upper bytes of %rdx start out as 0 */
+	shlq $8, %rdx
+	rolq $8, %rax	/* %al = byte 2 of the second pixel */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* table at 0x000 */
+	shlq $8, %rdx
+	rolq $8, %rax	/* %al = byte 1 of the second pixel */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* table at 0x100 */
+	shlq $8, %rdx
+	rolq $8, %rax	/* %al = byte 0 of the second pixel */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* table at 0x200 */
+	shlq $8, %rdx
+	rolq $8, %rax	/* %al = byte 3 of the first pixel */
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax	/* %al = byte 2 of the first pixel */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax	/* %al = byte 1 of the first pixel */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax	/* %al = byte 0 of the first pixel */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)	/* all eight dst bytes are overwritten */
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f	/* %rcx = 0: exactly one pixel left in the row */
+	jns 3f				/* %rcx > 0: row finished */
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				/* %rcx < 0: at least two pixels left, run another 64 byte chunk */
+	jnz 3f				/* %rcx > 0: row finished; %rcx = 0 falls through to the last pixel */
+2:
+	/* Grab 1 pixel from src, with colormod */
+	movl (%rsi, %rcx, 4), %eax
+	ror $24, %eax	/* %al = byte 3 of the pixel */
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	rol $8, %eax	/* %al = byte 2 */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax	/* %al = byte 1 */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax	/* %al = byte 0 */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movl %edx, (%rdi, %rcx, 4)
+3:					/* end of row */
+	leaq (%rsi, %r10, 4), %rsi	/* advance src by %r10 pixels */
+	leaq (%rdi, %r11, 4), %rdi	/* advance dst by %r11 pixels */
+	decq %r9			/* one row done */
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_copy_rgba_to_rgba_cmod)
+PR_(imlib_amd64_copy_rgb_to_rgba_cmod):
+	ENTER
+	/* Per pixel: dst byte 3 = entry 255 of the table at 0x300(%r14) (src byte 3 is ignored); dst bytes 2,1,0 = lookups of src bytes 2,1,0 at 0x000,0x100,0x200(%r14). */
+	/* Registers as used below (set up before this point, presumably by ENTER - confirm): %rsi = src, %rdi = dst, %r8 = pixels per row, %r9 = rows, %r10/%r11 = src/dst row advance in pixels. */
+	/* Address rows relative to their end: %rsi/%rdi + 4 * %r8 is one past the last pixel. */ 
+	/* %rcx then runs from -(%r8 - 1) up to 0, i.e. in ascending address order, two pixels per step. */ 
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* Step back one pixel so the last pixel of the row is at %rcx = 0 */ 
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			/* %r8 = -(pixels per row) from here on */
+0:					/* start of one row */
+	movq %r8, %rcx			
+					
+	incq %rcx			/* %rcx = index of the first pixel; ZF is set when the row has exactly one pixel */
+
+	/* prefetch a couple cache lines ahead */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	
+1:
+	/* main loop, unrolled to work on 64 byte chunks */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax	/* low dword = first pixel, high dword = second pixel */
+	rorq $48, %rax	/* %al = byte 2 of the second pixel; its byte 3 is never used */
+	movq $0x000000FF, %rbx	/* fixed index 255; also clears the upper bytes of %rbx for the movb %al, %bl below */
+	movzbq 0x300(%r14, %rbx), %rdx	/* entry 255 of the table at 0x300 becomes byte 3 of the second pixel */
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* table at 0x000 */
+	shlq $8, %rdx
+	rolq $8, %rax	/* %al = byte 1 of the second pixel */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* table at 0x100 */
+	shlq $8, %rdx
+	rolq $8, %rax	/* %al = byte 0 of the second pixel */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* table at 0x200 */
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx	/* index 255 again, for byte 3 of the first pixel */
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax	/* skip byte 3 of the first pixel; %al = byte 2 of the first pixel */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax	/* %al = byte 1 of the first pixel */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax	/* %al = byte 0 of the first pixel */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)	/* all eight dst bytes are overwritten */
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f	/* %rcx = 0: exactly one pixel left in the row */
+	jns 3f				/* %rcx > 0: row finished */
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				/* %rcx < 0: at least two pixels left, run another 64 byte chunk */
+	jnz 3f				/* %rcx > 0: row finished; %rcx = 0 falls through to the last pixel */
+2:
+	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
+	movl (%rsi, %rcx, 4), %eax
+	ror $16, %eax	/* %al = byte 2 of the pixel; byte 3 is never used */
+	movq $0x000000FF, %rbx	/* fixed index 255; also clears the upper bytes of %rbx */
+	movzbq 0x300(%r14, %rbx), %rdx	/* entry 255 of the table at 0x300 becomes byte 3 */
+	shl $8, %edx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax	/* %al = byte 1 */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax	/* %al = byte 0 */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movl %edx, (%rdi, %rcx, 4)
+3:					/* end of row */
+	leaq (%rsi, %r10, 4), %rsi	/* advance src by %r10 pixels */
+	leaq (%rdi, %r11, 4), %rdi	/* advance dst by %r11 pixels */
+	decq %r9			/* one row done */
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_copy_rgb_to_rgba_cmod)
+PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
+	ENTER
+
+	pxor %xmm4, %xmm4
+	movdqa m00XXXXXX(%rip), %xmm6
+
+	/* Move right to left across each line, */ 
+	/* processing in two pixel chunks */ 
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* Last instruction is %rcx = 0 */ 
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			
+0:					
+	movq %r8, %rcx			
+					
+	incq %rcx			
+
+	/* prefetch a couple cache lines ahead */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	
+1:
+	/* main loop, unrolled to work on 64 byte chunks */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * s) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * s) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * s) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * s) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * s) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * s) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * s) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * s) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				
+	jnz 3f				
+2:
+	/* Grab 1 pixel from src, with colormod */
+	movl (%rsi, %rcx, 4), %eax
+	ror $24, %eax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1
+	movd (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * s) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+ 	movd %xmm2, (%rdi, %rcx, 4)
+3:					
+	leaq (%rsi, %r10, 4), %rsi	
+	leaq (%rdi, %r11, 4), %rdi	
+	decq %r9			
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_add_blend_rgba_to_rgb_cmod)
+
+PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):	/* additive blend of colour-modified 32-bit src pixels into 32-bit dst pixels that carry alpha */
+	ENTER
+	/* Roles visible below: %rsi/%rdi = src/dst pixel pointers, %r8 = pixels per row, %r9 = rows, %r10/%r11 = per-row pointer steps in pixels, %r14 = base of four 256-byte colour-modifier tables (set outside this block) */
+	pxor %xmm4, %xmm4	/* xmm4 = 0, the zero operand for the byte<->word (un)packing below */
+	movdqa c1(%rip), %xmm5	/* NOTE(review): %xmm5 is not referenced again in this routine */
+	xorq %rax, %rax	/* %rax is fully reloaded from src before its first use below */
+	movdqa mX000X000X000X000(%rip), %xmm6	/* xmm6 = mask constant defined elsewhere; used below to isolate each pixel's alpha byte */
+	movq pow_lut@GOTPCREL(%rip), %r13	/* r13 = address of pow_lut, indexed below with (src alpha << 8) | dst alpha */
+
+	/* Point %rsi/%rdi at the end of the first row; every row is then walked */ 
+	/* in ascending address order, two pixels per step, with a negative index */ 
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* Back up one pixel so that index %rcx = 0 addresses the last pixel of a row */ 
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			/* r8 = -(pixels per row), kept for every row */
+0:					/* per-row loop */
+	movq %r8, %rcx			
+					
+	incq %rcx			/* rcx = 1 - pixels per row; ZF is set when the row is one pixel wide */
+
+	/* prefetch a couple cache lines ahead (prefetches leave the flags from incq untouched) */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	
+1:
+	/* main loop: 8 unrolled copies x 2 pixels x 4 bytes = 64 bytes per pass */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Unrolled copy 1 of 8: grab 2 pixels from src, with colormod (each byte is replaced by a table lookup) */
+	movq (%rsi, %rcx, 4), %rax	/* rax = 8 source bytes = two 32-bit pixels */
+	rorq $56, %rax	/* bring source byte 7 (alpha of the second pixel) into %al */
+	movzbq %al, %rbx	/* rbx = table index; only %bl is rewritten from here on */
+	movzbq 0x300(%r14, %rbx), %rdx	/* rdx = byte 7 mapped through the table at %r14 + 0x300 */
+	shlq $8, %rdx	/* shift the result up to make room for the next byte */
+	rolq $8, %rax	/* next lower source byte (6) into %al */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* byte 6 through the table at +0x000 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* byte 5 through the table at +0x100 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* byte 4 through the table at +0x200 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl	/* byte 3 (alpha of the first pixel) through +0x300 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* byte 2 through +0x000 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* byte 1 through +0x100 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* byte 0 through +0x200 */
+	movd %rdx, %xmm1	/* xmm1 = the two colour-modified source pixels */
+	movq (%rdi, %rcx, 4), %xmm2	/* xmm2 = the two destination pixels */
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax	/* rax = working copy of both modified pixels */
+	andl $0xff000000, %edx	/* keep only the first pixel's alpha (the 32-bit op also clears rdx bits 32-63) */
+	roll $16, %edx	/* move it to bits 8-15, i.e. %dh */
+	movb 3(%rdi, %rcx, 4), %dl	/* dl = alpha byte of the first destination pixel */
+	movb (%r13, %rdx), %al	/* al = pow_lut byte at index (src alpha << 8) | dst alpha */
+	movb %dh, %ah	/* ah = modified source alpha */
+
+	rolq $32, %rax	/* swap halves so the second pixel sits in %eax */
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl	/* alpha byte of the second destination pixel */
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax	/* swap back: first pixel in the low half again */
+	movd %rax, %xmm3	/* per pixel: byte 0 = pow_lut alpha, byte 1 = source alpha */
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3	/* duplicate every byte into a 16-bit word */
+	pshufhw $0x40, %xmm3, %xmm3	/* in each 4-word half: words 0-2 <- word 0 (pow_lut alpha), word 3 <- word 1 (src alpha) */
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3	/* halve so the factors are non-negative for the signed pmulhw */
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1	/* NOTE(review): assumes the %xmm6 mask is 0xff in each pixel's alpha byte only - confirm */
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1	/* bytes -> words by interleaving with the zero register */
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1	/* compensates for the psrlw $1 applied to the alpha factors */
+	pmulhw %xmm3, %xmm1	/* high word of the product: approximately s * alpha / 255 */
+	paddsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2	/* words -> bytes with unsigned saturation */
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			/* advance the index by two pixels; */
+	incq %rcx			/* the branches below test the flags of this second incq */
+	jz 2f	/* rcx == 0: exactly one pixel left in the row */
+	jns 3f				/* rcx > 0: row finished */
+
+	/* Unrolled copy 2 of 8: grab 2 pixels from src, with colormod (same steps as copy 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 3 of 8: grab 2 pixels from src, with colormod (same steps as copy 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 4 of 8: grab 2 pixels from src, with colormod (same steps as copy 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 5 of 8: grab 2 pixels from src, with colormod (same steps as copy 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 6 of 8: grab 2 pixels from src, with colormod (same steps as copy 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 7 of 8: grab 2 pixels from src, with colormod (same steps as copy 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 8 of 8: grab 2 pixels from src, with colormod (same steps as copy 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				/* index still negative: at least two pixels left, run another pass */
+	jnz 3f				/* positive: row finished; zero falls through to the single-pixel tail */
+2:
+	/* Tail: grab the 1 remaining pixel from src, with colormod (32-bit version of the steps above) */
+	movl (%rsi, %rcx, 4), %eax
+	ror $24, %eax	/* alpha byte (byte 3) into %al */
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax	/* ror 24 followed by three rol 8: %eax holds the original pixel again */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1	/* xmm1 = colour-modified source pixel */
+	movd (%rdi, %rcx, 4), %xmm2	/* xmm2 = destination pixel */
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	roll $16, %edx	/* modified alpha to bits 8-15 ... */
+	andl $0x0000ff00, %edx	/* ... and clear every other bit */
+	movb 3(%rdi, %rcx, 4), %dl	/* dl = destination alpha byte */
+	movb (%r13, %rdx), %al	/* al = pow_lut byte at index (src alpha << 8) | dst alpha */
+	movb %dh, %ah	/* ah = modified source alpha */
+	movd %eax, %xmm3	/* only bytes 0 and 1 of %eax are picked up by the shuffles below */
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)	/* store just the one 32-bit pixel */
+3:					/* end of row */
+	leaq (%rsi, %r10, 4), %rsi	/* step src by %r10 pixels (4 bytes each) */
+	leaq (%rdi, %r11, 4), %rdi	/* step dst by %r11 pixels */
+	decq %r9			/* one row fewer to go */
+	jnz 0b
+
+9:	/* NOTE(review): not targeted from inside this routine; presumably an early-exit label used by ENTER - confirm */
+	LEAVE
+SIZE(imlib_amd64_add_blend_rgba_to_rgba_cmod)
+
+PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
+	ENTER
+
+	pxor %xmm4, %xmm4
+	movdqa c1(%rip), %xmm5
+	xorq %rax, %rax
+	movdqa mX000X000X000X000(%rip), %xmm6
+	movq pow_lut@GOTPCREL(%rip), %r13
+
+	/* Move right to left across each line, */ 
+	/* processing in two pixel chunks */ 
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* Last instruction is %rcx = 0 */ 
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			
+0:					
+	movq %r8, %rcx			
+					
+	incq %rcx			
+
+	/* prefetch a couple cache lines ahead */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	
+1:
+	/* main loop, unrolled to work on 64 byte chunks */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				
+	jnz 3f				
+2:
+	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
+	movl (%rsi, %rcx, 4), %eax
+	ror $16, %eax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1
+	movd (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	roll $16, %edx
+	andl $0x0000ff00, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	movd %eax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)
+3:					
+	leaq (%rsi, %r10, 4), %rsi	
+	leaq (%rdi, %r11, 4), %rdi	
+	decq %r9			
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_add_blend_rgb_to_rgba_cmod)
+
+PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):	/* add colour-modified src pixels, scaled by a constant alpha, onto dst */
+	ENTER				/* macro not shown here; NOTE(review): presumably saves registers and loads the arguments - confirm */
+	/* Register use below: %rsi = src, %rdi = dst, %r8 = pixels per line, %r9 = line count, %r10/%r11 = pixel distance from one line to the next in src/dst, %r14 = base of byte-indexed lookup tables at 0x000, 0x100, 0x200 and 0x300 */
+	pxor %xmm4, %xmm4		/* %xmm4 = 0, supplies the high bytes for the byte to word unpacks */
+	movdqa m00XXXXXX(%rip), %xmm6	/* mask applied to the alpha words below; the constant is defined elsewhere */
+
+	/* Index each line from its end with a negative %rcx that counts */
+	/* up to 0 (ascending addresses), processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* Bias by one pixel so that %rcx = 0 is the last pixel of the line */
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			/* %r8 = -(pixels per line) */
+0:					/* start of one line */
+	movq %r8, %rcx			
+					
+	incq %rcx			/* %rcx = 1 - (pixels per line), index of the first pixel */
+
+	/* prefetch a couple cache lines ahead */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line; uses the zero flag left by the incq above */
+1:
+	/* main loop, unrolled to work on 64 byte chunks (8 copies x 2 pixels) */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Copy 1 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax	/* two 32 bit src pixels in one 64 bit load */
+	rorq $48, %rax			/* byte 2 of the upper pixel is now in %al */
+	movq $0x000000FF, %rbx		/* index 255; also clears the bytes of %rbx above %bl */
+	movzbq 0x300(%r14, %rbx), %rdx	/* %rdx = entry 255 of the table at 0x300 (amod[255]) */
+	shlq $8, %rdx			/* make room in %dl for the next byte */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* pixel byte 2 mapped through the table at 0x000 */
+	shlq $8, %rdx
+	rolq $8, %rax			/* bring the next lower pixel byte into %al */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* pixel byte 1 mapped through the table at 0x100 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* pixel byte 0 mapped through the table at 0x200 */
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl	/* alpha byte for the lower pixel, again entry 255 */
+	shlq $8, %rdx
+	rolq $16, %rax			/* skip the alpha byte of the lower pixel; its byte 2 is now in %al */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1		/* %xmm1 = both colour-modified src pixels */
+	movq (%rdi, %rcx, 4), %xmm2	/* %xmm2 = the two dst pixels */
+	/* Get alpha from source and broadcast it as words.
+	 * The result range is [0, 0x7fff]; it acts as a fixed
+	 *  point fraction in [0.0, 1.0) because only the high word
+	 *  of the 32 bit multiplication result is kept.
+	 * pmulhw is a signed multiply, so the alpha is shifted right
+	 *  one here and the other factor is shifted left to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3		/* every byte doubled into a 16 bit word */
+	pshufhw $0xFF, %xmm3, %xmm3	/* upper pixel: alpha word copied to all four words */
+	pshuflw $0xFF, %xmm3, %xmm3 	/* lower pixel: likewise */
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1		/* %xmm4 is zero, so bytes are zero extended */
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * s) */
+	psllw $1, %xmm1			/* compensates the psrlw on the alpha words */
+	pmulhw %xmm3, %xmm1		/* keep the high word of each product */
+	paddsw %xmm1, %xmm2		/* signed saturating add onto dst */
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2		/* words back to bytes with unsigned saturation */
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			/* advance two pixels; flags reflect the new %rcx */
+	jz 2f				/* %rcx = 0: exactly one pixel left */
+	jns 3f				/* %rcx > 0: line finished */
+
+	/* Copy 2 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and broadcast it as words.
+	 * The result range is [0, 0x7fff]; it acts as a fixed
+	 *  point fraction in [0.0, 1.0) because only the high word
+	 *  of the 32 bit multiplication result is kept.
+	 * pmulhw is a signed multiply, so the alpha is shifted right
+	 *  one here and the other factor is shifted left to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * s) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Copy 3 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and broadcast it as words.
+	 * The result range is [0, 0x7fff]; it acts as a fixed
+	 *  point fraction in [0.0, 1.0) because only the high word
+	 *  of the 32 bit multiplication result is kept.
+	 * pmulhw is a signed multiply, so the alpha is shifted right
+	 *  one here and the other factor is shifted left to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * s) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Copy 4 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and broadcast it as words.
+	 * The result range is [0, 0x7fff]; it acts as a fixed
+	 *  point fraction in [0.0, 1.0) because only the high word
+	 *  of the 32 bit multiplication result is kept.
+	 * pmulhw is a signed multiply, so the alpha is shifted right
+	 *  one here and the other factor is shifted left to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * s) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Copy 5 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and broadcast it as words.
+	 * The result range is [0, 0x7fff]; it acts as a fixed
+	 *  point fraction in [0.0, 1.0) because only the high word
+	 *  of the 32 bit multiplication result is kept.
+	 * pmulhw is a signed multiply, so the alpha is shifted right
+	 *  one here and the other factor is shifted left to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * s) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Copy 6 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and broadcast it as words.
+	 * The result range is [0, 0x7fff]; it acts as a fixed
+	 *  point fraction in [0.0, 1.0) because only the high word
+	 *  of the 32 bit multiplication result is kept.
+	 * pmulhw is a signed multiply, so the alpha is shifted right
+	 *  one here and the other factor is shifted left to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * s) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Copy 7 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and broadcast it as words.
+	 * The result range is [0, 0x7fff]; it acts as a fixed
+	 *  point fraction in [0.0, 1.0) because only the high word
+	 *  of the 32 bit multiplication result is kept.
+	 * pmulhw is a signed multiply, so the alpha is shifted right
+	 *  one here and the other factor is shifted left to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * s) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Copy 8 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and broadcast it as words.
+	 * The result range is [0, 0x7fff]; it acts as a fixed
+	 *  point fraction in [0.0, 1.0) because only the high word
+	 *  of the 32 bit multiplication result is kept.
+	 * pmulhw is a signed multiply, so the alpha is shifted right
+	 *  one here and the other factor is shifted left to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * s) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				/* %rcx < 0: at least two pixels remain, run another 64 byte chunk */
+	jnz 3f				/* %rcx > 0: line finished; %rcx = 0 falls into the one pixel tail */
+2:					/* reached with %rcx = 0: the last pixel of the line */
+	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
+	movl (%rsi, %rcx, 4), %eax
+	ror $16, %eax			/* byte 2 of the pixel is now in %al */
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx	/* %rdx = entry 255 of the table at 0x300 (amod[255]) */
+	shl $8, %edx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1		/* %xmm1 = the colour-modified src pixel */
+	movd (%rdi, %rcx, 4), %xmm2	/* %xmm2 = the dst pixel */
+	/* Get alpha from source and broadcast it as words.
+	 * The result range is [0, 0x7fff]; it acts as a fixed
+	 *  point fraction in [0.0, 1.0) because only the high word
+	 *  of the 32 bit multiplication result is kept.
+	 * pmulhw is a signed multiply, so the alpha is shifted right
+	 *  one here and the other factor is shifted left to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * s) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+ 	movd %xmm2, (%rdi, %rcx, 4)	/* store only the one 32 bit pixel */
+3:					/* end of line */
+	leaq (%rsi, %r10, 4), %rsi	/* step src to the next line (%r10 pixels on) */
+	leaq (%rdi, %r11, 4), %rdi	/* step dst to the next line (%r11 pixels on) */
+	decq %r9			/* one line fewer to go */
+	jnz 0b
+
+9:					/* NOTE(review): not referenced in the body above; presumably a target used by ENTER - confirm */
+	LEAVE				/* macro not shown here; NOTE(review): presumably restores registers and returns - confirm */
+SIZE(imlib_amd64_add_blend_rgb_to_rgb_cmod)
+
+PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod):
+	ENTER
+
+	movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5
+
+	/* Move right to left across each line, */ 
+	/* processing in two pixel chunks */ 
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* Last instruction is %rcx = 0 */ 
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			
+0:					
+	movq %r8, %rcx			
+					
+	incq %rcx			
+
+	/* prefetch a couple cache lines ahead */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	
+1:
+	/* main loop, unrolled to work on 64 byte chunks */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = d + (s & 0x00ffffff) */
+	pand %xmm5, %xmm1
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = d + (s & 0x00ffffff) */
+	pand %xmm5, %xmm1
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = d + (s & 0x00ffffff) */
+	pand %xmm5, %xmm1
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = d + (s & 0x00ffffff) */
+	pand %xmm5, %xmm1
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = d + (s & 0x00ffffff) */
+	pand %xmm5, %xmm1
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = d + (s & 0x00ffffff) */
+	pand %xmm5, %xmm1
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = d + (s & 0x00ffffff) */
+	pand %xmm5, %xmm1
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = d + (s & 0x00ffffff) */
+	pand %xmm5, %xmm1
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				
+	jnz 3f				
+2:
+	/* Grab 1 pixel from src, with colormod */
+	movl (%rsi, %rcx, 4), %eax
+	ror $24, %eax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1
+	movd (%rdi, %rcx, 4), %xmm2
+	/* d = d + (s & 0x00ffffff) */
+	pand %xmm5, %xmm1
+	paddusb %xmm1, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)
+3:					
+	leaq (%rsi, %r10, 4), %rsi	
+	leaq (%rdi, %r11, 4), %rdi	
+	decq %r9			
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_add_copy_rgba_to_rgb_cmod)
+
+PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod):
+	ENTER
+	/* Per pixel: map every src byte through a 256-byte table at %r14, then d = (d & mask) + s with byte saturation. */
+	movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5
+	/* Used below: %rsi = src, %rdi = dst, %r8 = pixels per line, %r9 = lines, %r10/%r11 = src/dst line pitch in pixels (presumably set up by ENTER - TODO confirm). */
+	/* Point %rsi/%rdi at the last pixel of the line and index with a */
+	/* negative pixel offset in %rcx that counts up to 0, 2 pixels per step */
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* The last pixel of the line is at index %rcx = 0 */
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			/* r8 = -width; copied into rcx at the start of every line */
+0:					/* per-line loop */
+	movq %r8, %rcx			
+					/* NOTE(review): width 0 would give rcx = 1 and fall into the loop; presumably ENTER rejects it - confirm */
+	incq %rcx			/* rcx = 1 - width = index of the first pixel; ZF set when width == 1 */
+
+	/* prefetch a couple cache lines ahead (prefetches leave the flags from incq untouched) */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line: ZF still comes from the incq above */	
+1:
+	/* main loop, unrolled 8x: one pass handles up to 16 pixels (64 bytes) */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod: each of the 8 bytes is replaced by a table lookup, top byte first */
+	movq (%rsi, %rcx, 4), %rax	/* rax = 2 adjacent src pixels (bytes 0-3 = 1st, bytes 4-7 = 2nd) */
+	rorq $56, %rax			/* same as rotating left by 8: al = byte 7 */
+	movzbq %al, %rbx		/* rbx = table index; bits 8-63 cleared so later movb %al,%bl is enough */
+	movzbq 0x300(%r14, %rbx), %rdx	/* rdx = table 0x300 entry for byte 7 */
+	shlq $8, %rdx			/* make room for the next result byte */
+	rolq $8, %rax			/* al = byte 6 */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* byte 6 via table 0x000 */
+	shlq $8, %rdx
+	rolq $8, %rax			/* al = byte 5 */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* byte 5 via table 0x100 */
+	shlq $8, %rdx
+	rolq $8, %rax			/* al = byte 4 */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* byte 4 via table 0x200 */
+	shlq $8, %rdx
+	rolq $8, %rax			/* al = byte 3 (top byte of the 1st pixel) */
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl	/* byte 3 via table 0x300 */
+	shlq $8, %rdx
+	rolq $8, %rax			/* al = byte 2 */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* byte 2 via table 0x000 */
+	shlq $8, %rdx
+	rolq $8, %rax			/* al = byte 1 */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* byte 1 via table 0x100 */
+	shlq $8, %rdx
+	rolq $8, %rax			/* al = byte 0; rax is back to its loaded value */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* byte 0 via table 0x200 */
+	movd %rdx, %xmm1		/* xmm1 = both modified src pixels, in the original byte order */
+	movq (%rdi, %rcx, 4), %xmm2	/* xmm2 = the 2 matching dst pixels */
+	/* d = (d & 0x00ffffff) + s   (top dst byte is replaced by the modified src top byte) */
+	pand %xmm5, %xmm2
+	paddusb %xmm1, %xmm2		/* unsigned saturating add, byte by byte */
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			/* rcx += 2; the branches below test this result */
+	jz 2f				/* rcx == 0: exactly one pixel left */
+	jns 3f				/* rcx > 0: line complete */
+
+	/* Grab 2 pixels from src, with colormod (unrolled step 2 of 8, same sequence as step 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = (d & 0x00ffffff) + s, saturating per byte */
+	pand %xmm5, %xmm2
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f				/* one pixel left */
+	jns 3f				/* line complete */
+
+	/* Grab 2 pixels from src, with colormod (unrolled step 3 of 8, same sequence as step 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = (d & 0x00ffffff) + s, saturating per byte */
+	pand %xmm5, %xmm2
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f				/* one pixel left */
+	jns 3f				/* line complete */
+
+	/* Grab 2 pixels from src, with colormod (unrolled step 4 of 8, same sequence as step 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = (d & 0x00ffffff) + s, saturating per byte */
+	pand %xmm5, %xmm2
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f				/* one pixel left */
+	jns 3f				/* line complete */
+
+	/* Grab 2 pixels from src, with colormod (unrolled step 5 of 8, same sequence as step 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = (d & 0x00ffffff) + s, saturating per byte */
+	pand %xmm5, %xmm2
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f				/* one pixel left */
+	jns 3f				/* line complete */
+
+	/* Grab 2 pixels from src, with colormod (unrolled step 6 of 8, same sequence as step 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = (d & 0x00ffffff) + s, saturating per byte */
+	pand %xmm5, %xmm2
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f				/* one pixel left */
+	jns 3f				/* line complete */
+
+	/* Grab 2 pixels from src, with colormod (unrolled step 7 of 8, same sequence as step 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = (d & 0x00ffffff) + s, saturating per byte */
+	pand %xmm5, %xmm2
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f				/* one pixel left */
+	jns 3f				/* line complete */
+
+	/* Grab 2 pixels from src, with colormod (unrolled step 8 of 8, same sequence as step 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = (d & 0x00ffffff) + s, saturating per byte */
+	pand %xmm5, %xmm2
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				/* rcx < 0: at least 2 pixels left, run another pass */
+	jnz 3f				/* rcx > 0: line complete; rcx == 0 falls through to the last pixel */
+2:
+	/* Grab 1 pixel from src, with colormod (reached with %rcx = 0, i.e. the last pixel of the line) */
+	movl (%rsi, %rcx, 4), %eax
+	ror $24, %eax			/* same as rotating left by 8: al = byte 3 (top byte) */
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx	/* top byte via table 0x300 */
+	shl $8, %edx
+	rol $8, %eax			/* al = byte 2 */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax			/* al = byte 1 */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax			/* al = byte 0 */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1		/* xmm1 = the modified src pixel */
+	movd (%rdi, %rcx, 4), %xmm2
+	/* d = (d & 0x00ffffff) + s, saturating per byte */
+	pand %xmm5, %xmm2
+	paddusb %xmm1, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)
+3:					/* end of line */
+	leaq (%rsi, %r10, 4), %rsi	/* src += r10 pixels: same position on the next line */
+	leaq (%rdi, %r11, 4), %rdi	/* dst += r11 pixels */
+	decq %r9			/* one line fewer to go */
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_add_copy_rgba_to_rgba_cmod)
+
+PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod):
+	ENTER
+	/* Per pixel: map the 3 low src bytes through tables at %r14, force the top byte to entry 255 of table 0x300, then d = (d & mask) + s with byte saturation. */
+	movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5
+	/* Used below: %rsi = src, %rdi = dst, %r8 = pixels per line, %r9 = lines, %r10/%r11 = src/dst line pitch in pixels (presumably set up by ENTER - TODO confirm). */
+	/* Point %rsi/%rdi at the last pixel of the line and index with a */
+	/* negative pixel offset in %rcx that counts up to 0, 2 pixels per step */
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* The last pixel of the line is at index %rcx = 0 */
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			/* r8 = -width; copied into rcx at the start of every line */
+0:					/* per-line loop */
+	movq %r8, %rcx			
+					/* NOTE(review): width 0 would give rcx = 1 and fall into the loop; presumably ENTER rejects it - confirm */
+	incq %rcx			/* rcx = 1 - width = index of the first pixel; ZF set when width == 1 */
+
+	/* prefetch a couple cache lines ahead (prefetches leave the flags from incq untouched) */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line: ZF still comes from the incq above */	
+1:
+	/* main loop, unrolled 8x: one pass handles up to 16 pixels (64 bytes) */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] (the src top bytes 7 and 3 are never looked up) */
+	movq (%rsi, %rcx, 4), %rax	/* rax = 2 adjacent src pixels (bytes 0-3 = 1st, bytes 4-7 = 2nd) */
+	rorq $48, %rax			/* same as rotating left by 16: al = byte 6, skipping byte 7 */
+	movq $0x000000FF, %rbx		/* fixed index 255; also clears bits 8-63 of rbx */
+	movzbq 0x300(%r14, %rbx), %rdx	/* rdx = entry 255 of table 0x300; NOTE(review): loop-invariant, could be hoisted */
+	shlq $8, %rdx			/* make room for the next result byte */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* byte 6 via table 0x000 */
+	shlq $8, %rdx
+	rolq $8, %rax			/* al = byte 5 */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* byte 5 via table 0x100 */
+	shlq $8, %rdx
+	rolq $8, %rax			/* al = byte 4 */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* byte 4 via table 0x200 */
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx		/* index 255 again, for the 1st pixel's top byte */
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax			/* skip byte 3: al = byte 2 */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* byte 2 via table 0x000 */
+	shlq $8, %rdx
+	rolq $8, %rax			/* al = byte 1 */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* byte 1 via table 0x100 */
+	shlq $8, %rdx
+	rolq $8, %rax			/* al = byte 0; rax is back to its loaded value */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* byte 0 via table 0x200 */
+	movd %rdx, %xmm1		/* xmm1 = both modified src pixels, in the original byte order */
+	movq (%rdi, %rcx, 4), %xmm2	/* xmm2 = the 2 matching dst pixels */
+	pand %xmm5, %xmm2		/* d &= mask (presumably 0x00ffffff per pixel, going by the mask name - confirm) */
+	paddusb %xmm1, %xmm2		/* d += s, unsigned saturating per byte */
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			/* rcx += 2; the branches below test this result */
+	jz 2f				/* rcx == 0: exactly one pixel left */
+	jns 3f				/* rcx > 0: line complete */
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] (unrolled step 2 of 8, same sequence as step 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	pand %xmm5, %xmm2		/* d = (d & mask) + s, as in step 1 */
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f				/* one pixel left */
+	jns 3f				/* line complete */
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] (unrolled step 3 of 8, same sequence as step 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	pand %xmm5, %xmm2		/* d = (d & mask) + s, as in step 1 */
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f				/* one pixel left */
+	jns 3f				/* line complete */
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] (unrolled step 4 of 8, same sequence as step 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	pand %xmm5, %xmm2		/* d = (d & mask) + s, as in step 1 */
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f				/* one pixel left */
+	jns 3f				/* line complete */
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] (unrolled step 5 of 8, same sequence as step 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	pand %xmm5, %xmm2		/* d = (d & mask) + s, as in step 1 */
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f				/* one pixel left */
+	jns 3f				/* line complete */
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] (unrolled step 6 of 8, same sequence as step 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	pand %xmm5, %xmm2		/* d = (d & mask) + s, as in step 1 */
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f				/* one pixel left */
+	jns 3f				/* line complete */
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] (unrolled step 7 of 8, same sequence as step 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	pand %xmm5, %xmm2		/* d = (d & mask) + s, as in step 1 */
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f				/* one pixel left */
+	jns 3f				/* line complete */
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] (unrolled step 8 of 8, same sequence as step 1) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	pand %xmm5, %xmm2		/* d = (d & mask) + s, as in step 1 */
+	paddusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				/* rcx < 0: at least 2 pixels left, run another pass */
+	jnz 3f				/* rcx > 0: line complete; rcx == 0 falls through to the last pixel */
+2:
+	/* Grab 1 pixel from src, with colormod, with a = amod[255] (reached with %rcx = 0, the last pixel of the line) */
+	movl (%rsi, %rcx, 4), %eax
+	ror $16, %eax			/* al = byte 2; the src top byte (byte 3) is never looked up */
+	movq $0x000000FF, %rbx		/* fixed index 255; also clears bits 8-63 of rbx */
+	movzbq 0x300(%r14, %rbx), %rdx	/* top result byte = entry 255 of table 0x300 */
+	shl $8, %edx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* byte 2 via table 0x000 */
+	shl $8, %edx
+	rol $8, %eax			/* al = byte 1 */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax			/* al = byte 0 */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1		/* xmm1 = the modified src pixel */
+	movd (%rdi, %rcx, 4), %xmm2
+	pand %xmm5, %xmm2		/* d = (d & mask) + s, saturating per byte */
+	paddusb %xmm1, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)
+3:					/* end of line */
+	leaq (%rsi, %r10, 4), %rsi	/* src += r10 pixels: same position on the next line */
+	leaq (%rdi, %r11, 4), %rdi	/* dst += r11 pixels */
+	decq %r9			/* one line fewer to go */
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_add_copy_rgb_to_rgba_cmod)
+
+PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
+	ENTER
+
+	pxor %xmm4, %xmm4
+	movdqa m00XXXXXX(%rip), %xmm6
+
+	/* Move right to left across each line, */ 
+	/* processing in two pixel chunks */ 
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* Last instruction is %rcx = 0 */ 
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			
+0:					
+	movq %r8, %rcx			
+					
+	incq %rcx			
+
+	/* prefetch a couple cache lines ahead */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	
+1:
+	/* main loop, unrolled to work on 64 byte chunks */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - (s * a) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	psubsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - (s * a) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	psubsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - (s * a) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	psubsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - (s * a) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	psubsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - (s * a) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	psubsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - (s * a) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	psubsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - (s * a) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	psubsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - (s * a) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	psubsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				
+	jnz 3f				
+2:
+	/* Grab 1 pixel from src, with colormod */
+	movl (%rsi, %rcx, 4), %eax
+	ror $24, %eax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1
+	movd (%rdi, %rcx, 4), %xmm2
+	/* Get alpha from source and unpack to words
+	 * Result ranges is [0, 0x7fff], and is mapped to
+	 *  point values in [0.0, 1.0) by using the high word
+	 *  of the 32 bit multiplication result.
+	 * Because we want the unsigned value, we shift right one 
+	 *  here and also shift left the other factors to compensate.
+	 */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero out the alpha channel of the source to leave the
+	 * destination alpha unchanged.
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - (s * a) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	psubsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)
+3:					
+	leaq (%rsi, %r10, 4), %rsi	
+	leaq (%rdi, %r11, 4), %rdi	
+	decq %r9			
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_subtract_blend_rgba_to_rgb_cmod)
+PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
+	ENTER
+	/* Subtract-blend colour-modified RGBA source pixels into an RGBA destination. Uses rsi=src, rdi=dst, r8=pixels per line, r9=line count, r10/r11=src/dst line step in pixels, r14=colormod tables; presumably all set up by ENTER (not visible here) - confirm. */
+	movq pow_lut@GOTPCREL(%rip), %r13	/* r13 = address of pow_lut (via GOT); indexed below with (src alpha << 8) | dst alpha */
+	pxor %xmm4, %xmm4	/* xmm4 = 0: zero operand for byte->word unpack and for packuswb */
+	movdqa c1(%rip), %xmm5	/* NOTE(review): xmm5 is never read in this routine */
+	movdqa mX000X000X000X000(%rip), %xmm6	/* mask applied to packed bytes; presumably selects each pixel's alpha byte - confirm against its definition */
+	movdqa mX000X000(%rip), %xmm7	/* mask applied to unpacked words; presumably selects each pixel's alpha word - confirm */
+	xorq %rax, %rax	/* NOTE(review): redundant, both paths below reload rax/eax before reading it */
+
+	/* Point rsi/rdi just past the end of the first line; each line is then */
+	/* walked with a negative pixel index (rcx) that counts up, two pixels per step */
+	leaq (%rsi, %r8, 4), %rsi	/* 4 bytes per pixel */
+	leaq (%rdi, %r8, 4), %rdi	
+	/* (the pointers are then pulled back by one pixel, see below) */
+	/* Bias by one pixel so the last pixel of a line sits at index rcx = 0 */
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			/* r8 = -(pixels per line), kept for every line */
+0:					/* ---- start of one line ---- */
+	movq %r8, %rcx			
+	/* after the incq below: rcx = 1 - pixels per line = index of the first pixel */
+	incq %rcx			/* ZF set here iff the line is a single pixel wide */
+
+	/* prefetch a couple cache lines ahead */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+	/* the prefetches leave the flags untouched, so the jz below still tests the incq above */
+	jz 2f /* one pixel line */	
+1:
+	/* main loop, unrolled 8 times: each copy handles 2 pixels (8 bytes), 64 bytes per pass */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+	/* Per copy: look up every source byte in its colormod table, derive blend factors from pow_lut, then d = d - s * factor */
+	/* Copy 1 of 8: grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax	/* rax = two packed 32-bit source pixels */
+	rorq $56, %rax	/* same as rotate left by 8: al = byte 7 (alpha of the higher-addressed pixel) */
+	movzbq %al, %rbx	/* rbx = zero-extended table index; only bl is rewritten from here on */
+	movzbq 0x300(%r14, %rbx), %rdx	/* rdx = alpha looked up in the table at r14+0x300 */
+	shlq $8, %rdx	/* make room for the next output byte */
+	rolq $8, %rax	/* al = next lower source byte */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* byte 6 through the table at r14+0x000 (presumably red) */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* byte 5 through the table at r14+0x100 (presumably green) */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* byte 4 through the table at r14+0x200 (presumably blue) */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl	/* byte 3: alpha of the lower-addressed pixel */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* byte 2 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* byte 1 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* byte 0; rdx now holds both colour-modified pixels */
+	movd %rdx, %xmm1	/* xmm1 = modified source pair */
+	movq (%rdi, %rcx, 4), %xmm2	/* xmm2 = destination pair */
+
+	/* For each pixel, look up pow_lut at (modified src alpha << 8) | dst alpha.
+	 * Afterwards byte 0 of each 32-bit half of rax is that value, byte 1 the src alpha */
+	movq %rdx, %rax	/* keep the modified pair in rax; edx becomes the table index */
+	andl $0xff000000, %edx	/* isolate low pixel alpha (32-bit op also clears the top half of rdx) */
+	roll $16, %edx	/* alpha now in bits 8..15, i.e. dh */
+	movb 3(%rdi, %rcx, 4), %dl	/* dl = destination alpha of the low pixel */
+	movb (%r13, %rdx), %al	/* al = pow_lut byte for this src/dst alpha pair */
+	movb %dh, %ah	/* ah = modified source alpha */
+
+	rolq $32, %rax	/* swap halves: now work on the high pixel */
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl	/* destination alpha of the high pixel */
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax	/* restore the original pixel order */
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3	/* duplicate each byte into a word (value * 0x101) */
+	pshufhw $0x40, %xmm3, %xmm3	/* selector 0x40: words 0,0,0,1 -> pow_lut value x3, then src alpha */
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3	/* halve so the factor is non-negative for the signed pmulhw */
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0	/* xmm0 = dst masked by xmm6 */
+	por %xmm6, %xmm1	/* set every masked bit in src */
+	psubusb %xmm0, %xmm1	/* unsigned saturating byte subtract */
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - ((s * a) ^ 0xff000000) */
+	psllw $1, %xmm1	/* s * 2 compensates for the halved factor */
+	pmulhw %xmm3, %xmm1	/* high word of the signed product */
+	pxor %xmm7, %xmm1	/* flip the bits selected by xmm7 */
+	psubsw %xmm1, %xmm2	/* signed saturating word subtract */
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2	/* words -> bytes with unsigned saturation */
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			/* rcx += 2; flags come from this second incq */
+	jz 2f	/* rcx == 0: exactly one pixel left on this line */
+	jns 3f				/* rcx > 0: line finished */
+
+	/* Copy 2 of 8 (same steps as copy 1): grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - ((s * a) ^ 0xff000000) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	pxor %xmm7, %xmm1
+	psubsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			/* rcx += 2 */
+	jz 2f	/* one pixel left */
+	jns 3f				/* line finished */
+
+	/* Copy 3 of 8 (same steps as copy 1): grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - ((s * a) ^ 0xff000000) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	pxor %xmm7, %xmm1
+	psubsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			/* rcx += 2 */
+	jz 2f	/* one pixel left */
+	jns 3f				/* line finished */
+
+	/* Copy 4 of 8 (same steps as copy 1): grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - ((s * a) ^ 0xff000000) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	pxor %xmm7, %xmm1
+	psubsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			/* rcx += 2 */
+	jz 2f	/* one pixel left */
+	jns 3f				/* line finished */
+
+	/* Copy 5 of 8 (same steps as copy 1): grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - ((s * a) ^ 0xff000000) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	pxor %xmm7, %xmm1
+	psubsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			/* rcx += 2 */
+	jz 2f	/* one pixel left */
+	jns 3f				/* line finished */
+
+	/* Copy 6 of 8 (same steps as copy 1): grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - ((s * a) ^ 0xff000000) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	pxor %xmm7, %xmm1
+	psubsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			/* rcx += 2 */
+	jz 2f	/* one pixel left */
+	jns 3f				/* line finished */
+
+	/* Copy 7 of 8 (same steps as copy 1): grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - ((s * a) ^ 0xff000000) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	pxor %xmm7, %xmm1
+	psubsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			/* rcx += 2 */
+	jz 2f	/* one pixel left */
+	jns 3f				/* line finished */
+
+	/* Copy 8 of 8 (same steps as copy 1): grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - ((s * a) ^ 0xff000000) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	pxor %xmm7, %xmm1
+	psubsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			/* rcx += 2 */
+	js 1b				/* rcx < 0: at least two pixels remain, run another pass */
+	jnz 3f				/* rcx > 0: line finished; rcx == 0 falls through to the last pixel */
+2:
+	/* Last (or only) pixel of the line: grab 1 pixel from src, with colormod */
+	movl (%rsi, %rcx, 4), %eax	/* reached only with rcx == 0 */
+	ror $24, %eax	/* same as rotate left by 8: al = alpha byte */
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx	/* alpha through the table at r14+0x300 */
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* byte 2 */
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* byte 1 */
+	shl $8, %edx
+	rol $8, %eax	/* four 8-bit rotations in total: eax is the original pixel again */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* byte 0; edx = colour-modified pixel */
+	movd %edx, %xmm1
+	movd (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending (rotate then mask: same index as in the 2 pixel path) */
+	roll $16, %edx
+	andl $0x0000ff00, %edx	/* edx = modified src alpha << 8 */
+	movb 3(%rdi, %rcx, 4), %dl	/* dl = destination alpha */
+	movb (%r13, %rdx), %al	/* al = pow_lut byte */
+	movb %dh, %ah	/* ah = modified src alpha */
+	movd %eax, %xmm3	/* bytes 2..3 still hold source data; the 0x40 shuffles below never select them */
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - ((s * a) ^ 0xff000000) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	pxor %xmm7, %xmm1
+	psubsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)	/* store a single 32-bit pixel */
+3:					/* ---- end of line ---- */
+	leaq (%rsi, %r10, 4), %rsi	/* step src by r10 pixels to the same position in the next line */
+	leaq (%rdi, %r11, 4), %rdi	/* step dst by r11 pixels */
+	decq %r9			/* one line done */
+	jnz 0b
+
+9:	/* not referenced in the visible body; presumably the early-exit target used by ENTER - confirm */
+	LEAVE
+SIZE(imlib_amd64_subtract_blend_rgba_to_rgba_cmod)
+PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
+	ENTER
+
+	movq pow_lut@GOTPCREL(%rip), %r13
+	pxor %xmm4, %xmm4
+	movdqa c1(%rip), %xmm5
+	movdqa mX000X000X000X000(%rip), %xmm6
+	movdqa mX000X000(%rip), %xmm7
+	xorq %rax, %rax
+
+	/* Move right to left across each line, */ 
+	/* processing in two pixel chunks */ 
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* Last instruction is %rcx = 0 */ 
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			
+0:					
+	movq %r8, %rcx			
+					
+	incq %rcx			
+
+	/* prefetch a couple cache lines ahead */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	
+1:
+	/* main loop, unrolled to work on 64 byte chunks */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - ((s * a) ^ 0xff000000) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	pxor %xmm7, %xmm1
+	psubsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - ((s * a) ^ 0xff000000) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	pxor %xmm7, %xmm1
+	psubsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - ((s * a) ^ 0xff000000) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	pxor %xmm7, %xmm1
+	psubsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - ((s * a) ^ 0xff000000) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	pxor %xmm7, %xmm1
+	psubsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - ((s * a) ^ 0xff000000) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	pxor %xmm7, %xmm1
+	psubsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - ((s * a) ^ 0xff000000) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	pxor %xmm7, %xmm1
+	psubsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - ((s * a) ^ 0xff000000) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	pxor %xmm7, %xmm1
+	psubsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - ((s * a) ^ 0xff000000) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	pxor %xmm7, %xmm1
+	psubsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				
+	jnz 3f				
+2:
+	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
+	movl (%rsi, %rcx, 4), %eax
+	ror $16, %eax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1
+	movd (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	roll $16, %edx
+	andl $0x0000ff00, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	movd %eax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - ((s * a) ^ 0xff000000) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	pxor %xmm7, %xmm1
+	psubsw %xmm1, %xmm2
+	
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)
+3:					
+	leaq (%rsi, %r10, 4), %rsi	
+	leaq (%rdi, %r11, 4), %rdi	
+	decq %r9			
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_subtract_blend_rgb_to_rgba_cmod)
+/*
+ * Building blocks for imlib_amd64_subtract_blend_rgb_to_rgb_cmod.
+ * Both macros expand to straight-line code and define no labels, so
+ * the numeric labels of the routine below stay usable around them.
+ */
+
+/*
+ * sub_blend_rgb_rgb_cmod_math
+ *   in:  xmm1 = colour-modified source pixel(s), alpha in byte 3
+ *        xmm2 = destination pixel(s)
+ *        xmm4 = all zero
+ *        xmm6 = mask loaded from m00XXXXXX
+ *   out: xmm2 = packed result bytes; xmm1 and xmm3 are overwritten
+ */
+.macro sub_blend_rgb_rgb_cmod_math
+	/* Build the per-pixel multiplier: double every byte into a word,
+	 * broadcast word 3 of each half (the alpha word) to the other
+	 * three words of that half, then halve it so the value lies in
+	 * [0, 0x7fff] and stays positive for the signed multiply. */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3
+	psrlw $1, %xmm3
+
+	/* Mask the multiplier with xmm6 (presumably clears the alpha
+	 * lane so that destination alpha is not changed; the mask itself
+	 * is defined outside this routine). */
+	pand %xmm6, %xmm3
+
+	/* Widen source and destination bytes to words. */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d - (s * a).  The source is doubled to make up for the
+	 * halved multiplier; pmulhw keeps the high word of the product. */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	psubsw %xmm1, %xmm2
+
+	/* Back to bytes with unsigned saturation. */
+	packuswb %xmm4, %xmm2
+.endm
+
+/*
+ * sub_blend_rgb_rgb_cmod_pair
+ *   Blend the two pixels at index %rcx and %rcx + 1.
+ *   Bytes 2, 1 and 0 of each source pixel are looked up in the
+ *   256-byte tables at 0x000, 0x100 and 0x200 off %r14; byte 3 is
+ *   ignored and replaced by entry 255 of the table at 0x300
+ *   (a = amod[255]).
+ *   Overwrites rax, rbx, rdx, xmm1, xmm2, xmm3.
+ */
+.macro sub_blend_rgb_rgb_cmod_pair
+	/* After the rotate, byte 2 of the upper pixel is in %al. */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	/* Upper pixel: alpha first, then bytes 2, 1, 0. */
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	/* Lower pixel: same order; skip over both source alpha bytes. */
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	/* rdx now holds both modified pixels in memory order. */
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	sub_blend_rgb_rgb_cmod_math
+
+	movq %xmm2, (%rdi, %rcx, 4)
+.endm
+
+/*
+ * imlib_amd64_subtract_blend_rgb_to_rgb_cmod
+ *
+ * For every pixel: d = saturate(d - cmod(s) * a), a = amod[255].
+ *
+ * Registers as used by the code below (ENTER and LEAVE are macros
+ * defined elsewhere in this file; presumably they save registers and
+ * supply the early exit to label 9 - confirm there):
+ *   rsi = source pixels          rdi = destination pixels
+ *   r8  = pixels per row         r9  = number of rows
+ *   r10 = pixels added to rsi after each row
+ *   r11 = pixels added to rdi after each row
+ *   r14 = base of the lookup tables
+ *   rcx = pixel index, runs from 1 - r8 up to 0
+ */
+PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
+	ENTER
+
+	pxor %xmm4, %xmm4
+	movdqa m00XXXXXX(%rip), %xmm6
+
+	/* Point rsi/rdi at the last pixel of the first row and negate
+	 * the width.  Rows are then walked towards increasing addresses
+	 * with a negative index; the last pixel of a row is at rcx = 0. */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+	subq $4, %rsi
+	subq $4, %rdi
+	negq %r8
+0:
+	/* Start of a row: rcx = 1 - width. */
+	movq %r8, %rcx
+	incq %rcx
+
+	/* Prefetch the first two cache lines of the row.  Prefetches do
+	 * not touch the flags, so the jz below still tests the incq. */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */
+1:
+	/* Main loop: eight two-pixel steps, 64 bytes per pass. */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Steps 1-7.  After each one: rcx == 0 means a single pixel is
+	 * left (label 2), rcx > 0 means the row is finished (label 3). */
+.rept 7
+	sub_blend_rgb_rgb_cmod_pair
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+.endr
+
+	/* Step 8: loop while rcx is negative, otherwise fall into the
+	 * single pixel tail (rcx == 0) or finish the row (rcx > 0). */
+	sub_blend_rgb_rgb_cmod_pair
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
+2:
+	/* Single pixel tail, same lookups on 32-bit registers. */
+	movl (%rsi, %rcx, 4), %eax
+	ror $16, %eax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1
+	movd (%rdi, %rcx, 4), %xmm2
+
+	sub_blend_rgb_rgb_cmod_math
+
+	movd %xmm2, (%rdi, %rcx, 4)
+3:
+	/* Next row: advance both pointers, count the row down. */
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_subtract_blend_rgb_to_rgb_cmod)
+/*
+ * Building blocks for imlib_amd64_subtract_copy_rgba_to_rgb_cmod.
+ * Both macros expand to straight-line code and define no labels.
+ */
+
+/*
+ * sub_copy_rgba_rgb_cmod_next tbl
+ *   Make room in rdx for one more byte, rotate the next source byte
+ *   into %al and store its entry from the 256-byte table at offset
+ *   tbl off %r14 in %dl.  Bits 8 and up of rbx must be zero.
+ */
+.macro sub_copy_rgba_rgb_cmod_next tbl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb \tbl(%r14, %rbx), %dl
+.endm
+
+/*
+ * sub_copy_rgba_rgb_cmod_pair
+ *   Process the two pixels at index %rcx and %rcx + 1.
+ *   Bytes 3, 2, 1 and 0 of each source pixel are looked up in the
+ *   tables at 0x300, 0x000, 0x100 and 0x200 off %r14.
+ *   Overwrites rax, rbx, rdx, xmm1, xmm2.
+ */
+.macro sub_copy_rgba_rgb_cmod_pair
+	/* After the rotate, byte 3 of the upper pixel is in %al. */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	/* Remaining bytes of the upper pixel ... */
+	sub_copy_rgba_rgb_cmod_next 0x000
+	sub_copy_rgba_rgb_cmod_next 0x100
+	sub_copy_rgba_rgb_cmod_next 0x200
+	/* ... followed by all four bytes of the lower pixel. */
+	sub_copy_rgba_rgb_cmod_next 0x300
+	sub_copy_rgba_rgb_cmod_next 0x000
+	sub_copy_rgba_rgb_cmod_next 0x100
+	sub_copy_rgba_rgb_cmod_next 0x200
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* d = d - (s & 0x00ffffff), unsigned saturating per byte. */
+	pand %xmm5, %xmm1
+	psubusb %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+.endm
+
+/*
+ * imlib_amd64_subtract_copy_rgba_to_rgb_cmod
+ *
+ * For every pixel: d = d - (cmod(s) & 0x00ffffff) with unsigned
+ * saturation on each byte.  The modified source alpha is looked up
+ * and then masked away, which leaves the alpha byte of d as it was.
+ *
+ * Registers as used by the code below (ENTER and LEAVE are macros
+ * defined elsewhere in this file; presumably they save registers and
+ * supply the early exit to label 9 - confirm there):
+ *   rsi = source pixels          rdi = destination pixels
+ *   r8  = pixels per row         r9  = number of rows
+ *   r10 = pixels added to rsi after each row
+ *   r11 = pixels added to rdi after each row
+ *   r14 = base of the lookup tables
+ *   rcx = pixel index, runs from 1 - r8 up to 0
+ */
+PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod):
+	ENTER
+
+	movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5
+
+	/* Point rsi/rdi at the last pixel of the first row and negate
+	 * the width.  Rows are then walked towards increasing addresses
+	 * with a negative index; the last pixel of a row is at rcx = 0. */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+	subq $4, %rsi
+	subq $4, %rdi
+	negq %r8
+0:
+	/* Start of a row: rcx = 1 - width. */
+	movq %r8, %rcx
+	incq %rcx
+
+	/* Prefetch the first two cache lines of the row.  Prefetches do
+	 * not touch the flags, so the jz below still tests the incq. */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */
+1:
+	/* Main loop: eight two-pixel steps, 64 bytes per pass. */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Steps 1-7.  After each one: rcx == 0 means a single pixel is
+	 * left (label 2), rcx > 0 means the row is finished (label 3). */
+.rept 7
+	sub_copy_rgba_rgb_cmod_pair
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+.endr
+
+	/* Step 8: loop while rcx is negative, otherwise fall into the
+	 * single pixel tail (rcx == 0) or finish the row (rcx > 0). */
+	sub_copy_rgba_rgb_cmod_pair
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
+2:
+	/* Single pixel tail, same lookups on 32-bit registers. */
+	movl (%rsi, %rcx, 4), %eax
+	ror $24, %eax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1
+	movd (%rdi, %rcx, 4), %xmm2
+
+	/* d = d - (s & 0x00ffffff) */
+	pand %xmm5, %xmm1
+	psubusb %xmm1, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)
+3:
+	/* Next row: advance both pointers, count the row down. */
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_subtract_copy_rgba_to_rgb_cmod)
+PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod):
+	ENTER
+
+	movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5
+	movdqa mX000X000X000X000(%rip), %xmm6
+
+	/* Move right to left across each line, */ 
+	/* processing in two pixel chunks */ 
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* Last instruction is %rcx = 0 */ 
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			
+0:					
+	movq %r8, %rcx			
+					
+	incq %rcx			
+
+	/* prefetch a couple cache lines ahead */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	
+1:
+	/* main loop, unrolled to work on 64 byte chunks */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = d - s, d alpha = s alpha */
+	psubusb %xmm1, %xmm2
+	pand %xmm6, %xmm1
+	pand %xmm5, %xmm2
+	por %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = d - s, d alpha = s alpha */
+	psubusb %xmm1, %xmm2
+	pand %xmm6, %xmm1
+	pand %xmm5, %xmm2
+	por %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = d - s, d alpha = s alpha */
+	psubusb %xmm1, %xmm2
+	pand %xmm6, %xmm1
+	pand %xmm5, %xmm2
+	por %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = d - s, d alpha = s alpha */
+	psubusb %xmm1, %xmm2
+	pand %xmm6, %xmm1
+	pand %xmm5, %xmm2
+	por %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = d - s, d alpha = s alpha */
+	psubusb %xmm1, %xmm2
+	pand %xmm6, %xmm1
+	pand %xmm5, %xmm2
+	por %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = d - s, d alpha = s alpha */
+	psubusb %xmm1, %xmm2
+	pand %xmm6, %xmm1
+	pand %xmm5, %xmm2
+	por %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = d - s, d alpha = s alpha */
+	psubusb %xmm1, %xmm2
+	pand %xmm6, %xmm1
+	pand %xmm5, %xmm2
+	por %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = d - s, d alpha = s alpha */
+	psubusb %xmm1, %xmm2
+	pand %xmm6, %xmm1
+	pand %xmm5, %xmm2
+	por %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				
+	jnz 3f				
+2:
+	/* Grab 1 pixel from src, with colormod */
+	movl (%rsi, %rcx, 4), %eax
+	ror $24, %eax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1
+	movd (%rdi, %rcx, 4), %xmm2
+	/* d = d - s, d alpha = s alpha */
+	psubusb %xmm1, %xmm2
+	pand %xmm6, %xmm1
+	pand %xmm5, %xmm2
+	por %xmm1, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)
+3:					
+	leaq (%rsi, %r10, 4), %rsi	
+	leaq (%rdi, %r11, 4), %rdi	
+	decq %r9			
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_subtract_copy_rgba_to_rgba_cmod)
+PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod):
+	ENTER
+	/* Per pixel: dst = saturate(dst - cmod(src)); src byte 3 is ignored and entry 255 of the 0x300 table is used in its place */
+	movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5	/* xmm5 = mask ANDed with the (d - s) result below */
+	movdqa mX000X000X000X000(%rip), %xmm6	/* xmm6 = mask ANDed with the modified src below */
+	/* Used below: %rsi = src, %rdi = dst, %r8 = pixels per line, %r9 = line count, %r10/%r11 = per-line advance in pixels, %r14 = lookup tables (presumably all set up by ENTER, which is not visible here) */
+	/* Walk each line from low to high addresses with a negative index, */
+	/* processing in two pixel chunks */ 
+	leaq (%rsi, %r8, 4), %rsi	/* rsi = one past the end of the first src line */
+	leaq (%rdi, %r8, 4), %rdi	/* rdi = one past the end of the first dst line */
+					
+	/* Back up one pixel so the last pixel of a line sits at %rcx = 0 */
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			/* r8 = -(pixels per line) */
+0:					/* start of one line */
+	movq %r8, %rcx			
+					
+	incq %rcx			/* rcx = 1 - width = index of the first pixel; ZF is set for a one pixel line */
+
+	/* prefetch a couple cache lines ahead (prefetches leave the flags from incq untouched) */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	
+1:
+	/* main loop, unrolled to work on 64 byte chunks (8 copies x 2 pixels x 4 bytes) */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax	/* rax = two adjacent 32-bit src pixels */
+	rorq $48, %rax			/* al = byte 2 of the high pixel */
+	movq $0x000000FF, %rbx		/* index 255; also zeroes the bits above %bl */
+	movzbq 0x300(%r14, %rbx), %rdx	/* rdx = entry 255 of the 0x300 (alpha) table */
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* byte 2 mapped through the 0x000 table (presumably red) */
+	shlq $8, %rdx
+	rolq $8, %rax			/* al = byte 1 of the high pixel */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* byte 1 mapped through the 0x100 table (presumably green) */
+	shlq $8, %rdx
+	rolq $8, %rax			/* al = byte 0 of the high pixel */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* byte 0 mapped through the 0x200 table (presumably blue) */
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx		/* index 255 again, now for the low pixel */
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax			/* skip byte 3 of the low pixel; al = its byte 2 */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax			/* rotations now total 48 left: rax is back to its loaded value */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1		/* xmm1 = both modified pixels, same byte layout as src */
+	movq (%rdi, %rcx, 4), %xmm2	/* xmm2 = the two dst pixels */
+	/* d = (d - s), per byte, unsigned, saturating at 0 */
+	psubusb %xmm1, %xmm2
+
+	/* Merge: (d - s) bytes selected by xmm5, modified src bytes selected by xmm6 (judging by the mask names: top byte from src, i.e. amod[255]) */
+	pand %xmm5, %xmm2
+	pand %xmm6, %xmm1
+	por %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f				/* exactly one pixel left in this line */
+	jns 3f				/* index went past 0: line finished */
+
+	/* Unrolled copy 2 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = (d - s) */
+	psubusb %xmm1, %xmm2
+
+	/* Preserve source alpha */
+	pand %xmm5, %xmm2
+	pand %xmm6, %xmm1
+	por %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 3 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = (d - s) */
+	psubusb %xmm1, %xmm2
+
+	/* Preserve source alpha */
+	pand %xmm5, %xmm2
+	pand %xmm6, %xmm1
+	por %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 4 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = (d - s) */
+	psubusb %xmm1, %xmm2
+
+	/* Preserve source alpha */
+	pand %xmm5, %xmm2
+	pand %xmm6, %xmm1
+	por %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 5 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = (d - s) */
+	psubusb %xmm1, %xmm2
+
+	/* Preserve source alpha */
+	pand %xmm5, %xmm2
+	pand %xmm6, %xmm1
+	por %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 6 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = (d - s) */
+	psubusb %xmm1, %xmm2
+
+	/* Preserve source alpha */
+	pand %xmm5, %xmm2
+	pand %xmm6, %xmm1
+	por %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 7 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = (d - s) */
+	psubusb %xmm1, %xmm2
+
+	/* Preserve source alpha */
+	pand %xmm5, %xmm2
+	pand %xmm6, %xmm1
+	por %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 8 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* d = (d - s) */
+	psubusb %xmm1, %xmm2
+
+	/* Preserve source alpha */
+	pand %xmm5, %xmm2
+	pand %xmm6, %xmm1
+	por %xmm1, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				/* index still negative: at least two pixels left, run another 64 byte chunk */
+	jnz 3f				/* index positive: line finished; index 0 falls through to the last pixel */
+2:
+	/* Grab 1 pixel from src (the one at %rcx = 0), with colormod, with a = amod[255] */
+	movl (%rsi, %rcx, 4), %eax
+	ror $16, %eax			/* al = byte 2 of the pixel */
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1
+	movd (%rdi, %rcx, 4), %xmm2
+	/* d = (d - s) */
+	psubusb %xmm1, %xmm2
+
+	/* Preserve source alpha */
+	pand %xmm5, %xmm2
+	pand %xmm6, %xmm1
+	por %xmm1, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)	/* 32-bit store: only this one pixel is written */
+3:					/* end of line */
+	leaq (%rsi, %r10, 4), %rsi	/* advance src by r10 pixels to the last pixel of the next line */
+	leaq (%rdi, %r11, 4), %rdi	/* advance dst by r11 pixels likewise */
+	decq %r9			/* one line done */
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_subtract_copy_rgb_to_rgba_cmod)
+PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
+	ENTER
+	/* Per pixel: run all four src bytes through the lookup tables at %r14, then d = d + (2 * a * (s - 127)) with the modified alpha a; see the per-step comments below */
+	pxor %xmm4, %xmm4			/* xmm4 = 0, used to widen bytes to words and to repack */
+	movdqa m000V0V0V000V0V0V(%rip), %xmm6	/* xmm6 = per-channel bias subtracted from the widened src below */
+	movdqa m00XXXXXX(%rip), %xmm7		/* xmm7 = mask ANDed with the broadcast alpha below */
+	/* Used below: %rsi = src, %rdi = dst, %r8 = pixels per line, %r9 = line count, %r10/%r11 = per-line advance in pixels, %r14 = lookup tables (presumably all set up by ENTER, which is not visible here) */
+	/* Walk each line from low to high addresses with a negative index, */
+	/* processing in two pixel chunks */ 
+	leaq (%rsi, %r8, 4), %rsi	/* rsi = one past the end of the first src line */
+	leaq (%rdi, %r8, 4), %rdi	/* rdi = one past the end of the first dst line */
+					
+	/* Back up one pixel so the last pixel of a line sits at %rcx = 0 */
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			/* r8 = -(pixels per line) */
+0:					/* start of one line */
+	movq %r8, %rcx			
+					
+	incq %rcx			/* rcx = 1 - width = index of the first pixel; ZF is set for a one pixel line */
+
+	/* prefetch a couple cache lines ahead (prefetches leave the flags from incq untouched) */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	
+1:
+	/* main loop, unrolled to work on 64 byte chunks (8 copies x 2 pixels x 4 bytes) */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax	/* rax = two adjacent 32-bit src pixels */
+	rorq $56, %rax			/* al = byte 3 of the high pixel */
+	movzbq %al, %rbx		/* rbx = that byte, upper bits zeroed for use as an index */
+	movzbq 0x300(%r14, %rbx), %rdx	/* byte 3 mapped through the 0x300 table */
+	shlq $8, %rdx
+	rolq $8, %rax			/* al = byte 2 of the high pixel */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* byte 2 mapped through the 0x000 table */
+	shlq $8, %rdx
+	rolq $8, %rax			/* al = byte 1 of the high pixel */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* byte 1 mapped through the 0x100 table */
+	shlq $8, %rdx
+	rolq $8, %rax			/* al = byte 0 of the high pixel */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* byte 0 mapped through the 0x200 table */
+	shlq $8, %rdx
+	rolq $8, %rax			/* al = byte 3 of the low pixel; same sequence repeats for it */
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax			/* rotations now total 56 left: rax is back to its loaded value */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1		/* xmm1 = both modified pixels, same byte layout as src */
+	movq (%rdi, %rcx, 4), %xmm2	/* xmm2 = the two dst pixels */
+	/* Unpack alpha: broadcast each pixel's modified byte 3 across that pixel's four words */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3		/* every byte b becomes the word b * 0x101 */
+	pshufhw $0xFF, %xmm3, %xmm3	/* high pixel: all four words = its byte 3 word */
+	pshuflw $0xFF, %xmm3, %xmm3 	/* low pixel: likewise */
+	psrlw $1, %xmm3			/* clear the sign bit: pmulhw below is a signed multiply */
+
+	/* Zero blending alpha (mask contents are defined outside this view) */
+	pand %xmm7, %xmm3
+
+	/* Unpack src and dst: widen each byte to a 16-bit word */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (2 * a * (s - 127)) */
+	psubw %xmm6, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1		/* keeps the high 16 bits of each signed product */
+	paddsw %xmm1, %xmm2		/* signed saturating add */
+
+	/* Repack new pixels: words back to bytes with unsigned saturation */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f				/* exactly one pixel left in this line */
+	jns 3f				/* index went past 0: line finished */
+
+	/* Unrolled copy 2 of 8: grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Unpack alpha */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero blending alpha */
+	pand %xmm7, %xmm3
+
+	/* Unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (2 * a * (s - 127)) */
+	psubw %xmm6, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 3 of 8: grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Unpack alpha */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero blending alpha */
+	pand %xmm7, %xmm3
+
+	/* Unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (2 * a * (s - 127)) */
+	psubw %xmm6, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 4 of 8: grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Unpack alpha */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero blending alpha */
+	pand %xmm7, %xmm3
+
+	/* Unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (2 * a * (s - 127)) */
+	psubw %xmm6, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 5 of 8: grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Unpack alpha */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero blending alpha */
+	pand %xmm7, %xmm3
+
+	/* Unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (2 * a * (s - 127)) */
+	psubw %xmm6, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 6 of 8: grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Unpack alpha */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero blending alpha */
+	pand %xmm7, %xmm3
+
+	/* Unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (2 * a * (s - 127)) */
+	psubw %xmm6, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 7 of 8: grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Unpack alpha */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero blending alpha */
+	pand %xmm7, %xmm3
+
+	/* Unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (2 * a * (s - 127)) */
+	psubw %xmm6, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Unrolled copy 8 of 8: grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Unpack alpha */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero blending alpha */
+	pand %xmm7, %xmm3
+
+	/* Unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (2 * a * (s - 127)) */
+	psubw %xmm6, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				/* index still negative: at least two pixels left, run another 64 byte chunk */
+	jnz 3f				/* index positive: line finished; index 0 falls through to the last pixel */
+2:
+	/* Grab 1 pixel from src (the one at %rcx = 0), with colormod */
+	movl (%rsi, %rcx, 4), %eax
+	ror $24, %eax			/* al = byte 3 of the pixel */
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1
+	movd (%rdi, %rcx, 4), %xmm2
+	/* Unpack alpha */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero blending alpha */
+	pand %xmm7, %xmm3
+
+	/* Unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (2 * a * (s - 127)) */
+	psubw %xmm6, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels; the 32-bit store writes only this one pixel */
+	packuswb %xmm4, %xmm2
+ 	movd %xmm2, (%rdi, %rcx, 4)
+3:					/* end of line */
+	leaq (%rsi, %r10, 4), %rsi	/* advance src by r10 pixels to the last pixel of the next line */
+	leaq (%rdi, %r11, 4), %rdi	/* advance dst by r11 pixels likewise */
+	decq %r9			/* one line done */
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_reshade_blend_rgba_to_rgb_cmod)
+
+PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
+	ENTER
+
+	movq pow_lut@GOTPCREL(%rip), %r13
+	pxor %xmm4, %xmm4
+	movdqa c1(%rip), %xmm5
+	movdqa mX000X000X000X000(%rip), %xmm6
+	movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm7
+	movdqa m000V0V0V000V0V0V(%rip), %xmm8
+	xorq %rax, %rax
+
+	/* Move right to left across each line, */ 
+	/* processing in two pixel chunks */ 
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* Last instruction is %rcx = 0 */ 
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			
+0:					
+	movq %r8, %rcx			
+					
+	incq %rcx			
+
+	/* prefetch a couple cache lines ahead */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	
+1:
+	/* main loop, unrolled to work on 64 byte chunks */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending, specialized for reshade by shifting the source alpha
+	 * right by one */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	psubw %xmm8, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending, specialized for reshade by shifting the source alpha
+	 * right by one */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	psubw %xmm8, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending, specialized for reshade by shifting the source alpha
+	 * right by one */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	psubw %xmm8, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending, specialized for reshade by shifting the source alpha
+	 * right by one */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	psubw %xmm8, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending, specialized for reshade by shifting the source alpha
+	 * right by one */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	psubw %xmm8, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending, specialized for reshade by shifting the source alpha
+	 * right by one */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	psubw %xmm8, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending, specialized for reshade by shifting the source alpha
+	 * right by one */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	psubw %xmm8, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending, specialized for reshade by shifting the source alpha
+	 * right by one */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	psubw %xmm8, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				
+	jnz 3f				
+2:
+	/* Grab 1 pixel from src, with colormod */
+	movl (%rsi, %rcx, 4), %eax
+	ror $24, %eax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1
+	movd (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending, specialized for reshade by shifting the source alpha
+	 * right by one */
+	roll $16, %edx
+	andl $0x0000ff00, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+	movd %eax, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	psubw %xmm8, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	packuswb %xmm4, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)
+3:					
+	leaq (%rsi, %r10, 4), %rsi	
+	leaq (%rdi, %r11, 4), %rdi	
+	decq %r9			
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_reshade_blend_rgba_to_rgba_cmod)
+
+PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): /* Reshade-blend colour-modified source pixels into the destination; the source alpha byte is never read, the 0x300-table entry 255 is used instead */
+	ENTER /* NOTE(review): ENTER is defined elsewhere; presumably it saves registers and loads the colour-modifier table base into %r14 -- confirm */
+
+	pxor %xmm4, %xmm4 /* xmm4 = 0, used as the zero half when unpacking and repacking */
+	movdqa m000V0V0V000V0V0V(%rip), %xmm6 /* bias subtracted from the unpacked source; presumably 127 per colour word and 0 for alpha (constant defined elsewhere) */
+	movdqa m00XXXXXX(%rip), %xmm7 /* mask applied to the per-word alpha multipliers (constant defined elsewhere) */
+
+	/* Index each line with a negative offset from its last pixel, */ 
+	/* processing in two pixel chunks */ 
+	leaq (%rsi, %r8, 4), %rsi	/* rsi = src + 4 * r8 (one past the line) */
+	leaq (%rdi, %r8, 4), %rdi	/* rdi = dst + 4 * r8 (one past the line) */
+					/* %r8 = number of pixels handled per line */
+	/* The last pixel of a line sits at %rcx = 0 */ 
+	subq $4, %rsi			/* rsi -> last source pixel of the line */
+	subq $4, %rdi			/* rdi -> last destination pixel of the line */
+					/* rsi/rdi stay fixed within a line; only %rcx moves */
+	negq %r8			/* r8 = -width, reloaded into %rcx for every line */
+0:					/* start of one line */
+	movq %r8, %rcx			
+					/* rcx = 1 - width: index of the first pixel relative to the last */
+	incq %rcx			/* sets ZF when the line is a single pixel wide */
+
+	/* prefetch a couple cache lines ahead */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	/* flags still come from the incq above; prefetches do not modify them */
+1:
+	/* main loop, unrolled to work on 64 byte chunks */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Chunk 1 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax	/* rax = two source pixels (8 bytes) */
+	rorq $48, %rax	/* al = byte 2 of the upper pixel (red, assuming ARGB32 layout -- TODO confirm) */
+	movq $0x000000FF, %rbx	/* rbx = 255; also clears the upper bits so %bl writes give a clean 0..255 index */
+	movzbq 0x300(%r14, %rbx), %rdx	/* rdx = entry 255 of the table at offset 0x300 (alpha) */
+	shlq $8, %rdx	/* make room for the next byte in %dl */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* byte 2 mapped through the table at offset 0x000 */
+	shlq $8, %rdx
+	rolq $8, %rax	/* al = byte 1 of the upper pixel */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* byte 1 mapped through the table at offset 0x100 */
+	shlq $8, %rdx
+	rolq $8, %rax	/* al = byte 0 of the upper pixel */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* byte 0 mapped through the table at offset 0x200 */
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx	/* index back to 255 for the lower pixel's alpha */
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax	/* step over byte 3 (never read); al = byte 2 of the lower pixel */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1	/* xmm1 (low 64 bits) = both colour-modified source pixels */
+	movq (%rdi, %rcx, 4), %xmm2	/* xmm2 (low 64 bits) = the two destination pixels */
+	/* Unpack alpha */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3	/* duplicate every byte into a 16-bit word (v * 0x101) */
+	pshufhw $0xFF, %xmm3, %xmm3	/* upper pixel: copy its alpha word into all four words */
+	pshuflw $0xFF, %xmm3, %xmm3 	/* lower pixel: same */
+	psrlw $1, %xmm3	/* halve so the multiplier stays non-negative for the signed pmulhw */
+
+	/* Zero blending alpha */
+	pand %xmm7, %xmm3
+
+	/* Unpack src and dst */
+	punpcklbw %xmm4, %xmm1	/* bytes -> zero-extended 16-bit words */
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (2 * a * (s - 127)) */
+	psubw %xmm6, %xmm1	/* s - bias */
+	psllw $2, %xmm1	/* scale by 4 ahead of the high-half multiply */
+	pmulhw %xmm3, %xmm1	/* keep the high 16 bits of the signed product */
+	paddsw %xmm1, %xmm2	/* signed saturating add onto the destination */
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2	/* clamp each word to 0..255 and pack back to bytes */
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			/* advance two pixels; the branches below test this result */
+	jz 2f	/* rcx == 0: exactly one pixel left */
+	jns 3f				/* rcx > 0: line finished */
+
+	/* Chunk 2 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Unpack alpha */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero blending alpha */
+	pand %xmm7, %xmm3
+
+	/* Unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (2 * a * (s - 127)) */
+	psubw %xmm6, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Chunk 3 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Unpack alpha */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero blending alpha */
+	pand %xmm7, %xmm3
+
+	/* Unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (2 * a * (s - 127)) */
+	psubw %xmm6, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Chunk 4 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Unpack alpha */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero blending alpha */
+	pand %xmm7, %xmm3
+
+	/* Unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (2 * a * (s - 127)) */
+	psubw %xmm6, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Chunk 5 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Unpack alpha */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero blending alpha */
+	pand %xmm7, %xmm3
+
+	/* Unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (2 * a * (s - 127)) */
+	psubw %xmm6, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Chunk 6 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Unpack alpha */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero blending alpha */
+	pand %xmm7, %xmm3
+
+	/* Unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (2 * a * (s - 127)) */
+	psubw %xmm6, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Chunk 7 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Unpack alpha */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero blending alpha */
+	pand %xmm7, %xmm3
+
+	/* Unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (2 * a * (s - 127)) */
+	psubw %xmm6, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Chunk 8 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* Unpack alpha */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero blending alpha */
+	pand %xmm7, %xmm3
+
+	/* Unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (2 * a * (s - 127)) */
+	psubw %xmm6, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				/* rcx < 0: at least two pixels remain, run another 64 byte pass */
+	jnz 3f				/* rcx > 0: line finished; rcx == 0 falls through to the last pixel */
+2:
+	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
+	movl (%rsi, %rcx, 4), %eax	/* only reached with %rcx = 0, i.e. the last pixel of the line */
+	ror $16, %eax	/* al = byte 2 of the pixel */
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1	/* xmm1 (low 32 bits) = the colour-modified source pixel */
+	movd (%rdi, %rcx, 4), %xmm2	/* xmm2 (low 32 bits) = the destination pixel */
+	/* Unpack alpha */
+	movq %xmm1, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0xFF, %xmm3, %xmm3
+	pshuflw $0xFF, %xmm3, %xmm3 
+	psrlw $1, %xmm3
+
+	/* Zero blending alpha */
+	pand %xmm7, %xmm3
+
+	/* Unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (2 * a * (s - 127)) */
+	psubw %xmm6, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+ 	movd %xmm2, (%rdi, %rcx, 4)	/* store only the single pixel */
+3:					/* end of line */
+	leaq (%rsi, %r10, 4), %rsi	/* move src to the last pixel of the next line (%r10 pixels further) */
+	leaq (%rdi, %r11, 4), %rdi	/* move dst to the last pixel of the next line (%r11 pixels further) */
+	decq %r9			/* %r9 = lines still to process */
+	jnz 0b
+
+9: /* NOTE(review): not referenced inside this routine; presumably the early-exit target used by ENTER -- confirm */
+	LEAVE
+SIZE(imlib_amd64_reshade_blend_rgb_to_rgb_cmod)
+
+PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
+	ENTER
+
+	movq pow_lut@GOTPCREL(%rip), %r13
+	pxor %xmm4, %xmm4
+	movdqa c1(%rip), %xmm5
+	movdqa mX000X000X000X000(%rip), %xmm6
+	movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm7
+	movdqa m000V0V0V000V0V0V(%rip), %xmm8
+	xorq %rax, %rax
+
+	/* Move right to left across each line, */ 
+	/* processing in two pixel chunks */ 
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* Last instruction is %rcx = 0 */ 
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			
+0:					
+	movq %r8, %rcx			
+					
+	incq %rcx			
+
+	/* prefetch a couple cache lines ahead */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	
+1:
+	/* main loop, unrolled to work on 64 byte chunks */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending, specialized for reshade by shifting the source alpha
+	 * right by one */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	psubw %xmm8, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending, specialized for reshade by shifting the source alpha
+	 * right by one */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	psubw %xmm8, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending, specialized for reshade by shifting the source alpha
+	 * right by one */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	psubw %xmm8, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending, specialized for reshade by shifting the source alpha
+	 * right by one */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	psubw %xmm8, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending, specialized for reshade by shifting the source alpha
+	 * right by one */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	psubw %xmm8, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending, specialized for reshade by shifting the source alpha
+	 * right by one */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	psubw %xmm8, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending, specialized for reshade by shifting the source alpha
+	 * right by one */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	psubw %xmm8, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending, specialized for reshade by shifting the source alpha
+	 * right by one */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	psubw %xmm8, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				
+	jnz 3f				
+2:
+	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
+	movl (%rsi, %rcx, 4), %eax
+	ror $16, %eax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1
+	movd (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending, specialized for reshade by shifting the source alpha
+	 * right by one */
+	roll $16, %edx
+	andl $0x0000ff00, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+	shrb $1, %ah
+	movd %eax, %xmm3
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3	
+	psrlw $1, %xmm3
+
+	movdqa %xmm2, %xmm0
+	pand %xmm6, %xmm0
+	por %xmm6, %xmm1
+	psubusb %xmm0, %xmm1
+
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	psubw %xmm8, %xmm1
+	psllw $2, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+	
+	packuswb %xmm4, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)
+3:					
+	leaq (%rsi, %r10, 4), %rsi	
+	leaq (%rdi, %r11, 4), %rdi	
+	decq %r9			
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_reshade_blend_rgb_to_rgba_cmod)
+
+PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod):
+	ENTER
+	/* Reshade dst from colour-modified src; dst alpha is meant to stay as is */
+	movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5	/* mask ANDed onto both deltas below */
+	movdqa m0VVV0VVV0VVV0VVV(%rip), %xmm6	/* bias subtracted from / added to s */
+	/* NOTE(review): movdqa needs 16-byte aligned constants (siblings use movdqu) - confirm */
+	/* Point %rsi/%rdi at the last pixel of the row; pixels are then */
+	/* addressed with a negative index in %rcx that counts up to 0 */
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* Back up one pixel (4 bytes): the last pixel is at %rcx = 0 */
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			/* %r8 = -(pixels per row) from here on */
+0:					/* start of one row */
+	movq %r8, %rcx			
+					
+	incq %rcx			/* %rcx = 1 - width; ZF set for a 1-pixel row */
+
+	/* prefetch a couple cache lines ahead (flags from incq are kept) */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	
+1:
+	/* main loop: 8 unrolled 2-pixel copies = 64 bytes per pass */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 1 of 8) */
+	movq (%rsi, %rcx, 4), %rax	/* two 32-bit src pixels */
+	rorq $56, %rax			/* highest byte (top byte of 2nd pixel) -> %al */
+	movzbq %al, %rbx		/* %rbx = zero-extended table index */
+	movzbq 0x300(%r14, %rbx), %rdx	/* top byte of a pixel uses the table at +0x300 */
+	shlq $8, %rdx			/* make room for the next result byte */
+	rolq $8, %rax			/* next lower src byte -> %al */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* 2nd-highest byte: table at +0x000 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* 3rd byte: table at +0x100 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* lowest byte: table at +0x200 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl	/* same four lookups for the 1st pixel */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1		/* %xmm1 = modified pixels, src byte order */
+	movq (%rdi, %rcx, 4), %xmm2	/* %xmm2 = the two dst pixels */
+	/* To take advantage of saturation and be able to do 8 bytes
+	 *  at a time, we divide reshading into two separate steps:
+	 *  adding values above 128, and subtracting values below 128
+	 * These values go into %xmm1 and %xmm3 respectively
+	 * - %xmm1 becomes (2 * (s - 127)), saturating at 0 and 255
+	 * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s))
+	 */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3		/* bit-inverts the bytes selected by the mask */
+	paddusb %xmm3, %xmm3
+
+	/* dest alpha should not be changed in this func */
+	pand %xmm5, %xmm1
+	pand %xmm5, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			/* two pixels done */
+	jz 2f				/* exactly one pixel left in the row */
+	jns 3f				/* index went positive: row finished */
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 2 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* To take advantage of saturation and be able to do 8 bytes
+	 *  at a time, we divide reshading into two separate steps:
+	 *  adding values above 128, and subtracting values below 128
+	 * These values go into %xmm1 and %xmm3 respectively
+	 * - %xmm1 becomes (2 * (s - 127)), saturating at 0 and 255
+	 * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s))
+	 */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* dest alpha should not be changed in this func */
+	pand %xmm5, %xmm1
+	pand %xmm5, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 3 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* To take advantage of saturation and be able to do 8 bytes
+	 *  at a time, we divide reshading into two separate steps:
+	 *  adding values above 128, and subtracting values below 128
+	 * These values go into %xmm1 and %xmm3 respectively
+	 * - %xmm1 becomes (2 * (s - 127)), saturating at 0 and 255
+	 * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s))
+	 */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* dest alpha should not be changed in this func */
+	pand %xmm5, %xmm1
+	pand %xmm5, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 4 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* To take advantage of saturation and be able to do 8 bytes
+	 *  at a time, we divide reshading into two separate steps:
+	 *  adding values above 128, and subtracting values below 128
+	 * These values go into %xmm1 and %xmm3 respectively
+	 * - %xmm1 becomes (2 * (s - 127)), saturating at 0 and 255
+	 * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s))
+	 */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* dest alpha should not be changed in this func */
+	pand %xmm5, %xmm1
+	pand %xmm5, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 5 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* To take advantage of saturation and be able to do 8 bytes
+	 *  at a time, we divide reshading into two separate steps:
+	 *  adding values above 128, and subtracting values below 128
+	 * These values go into %xmm1 and %xmm3 respectively
+	 * - %xmm1 becomes (2 * (s - 127)), saturating at 0 and 255
+	 * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s))
+	 */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* dest alpha should not be changed in this func */
+	pand %xmm5, %xmm1
+	pand %xmm5, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 6 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* To take advantage of saturation and be able to do 8 bytes
+	 *  at a time, we divide reshading into two separate steps:
+	 *  adding values above 128, and subtracting values below 128
+	 * These values go into %xmm1 and %xmm3 respectively
+	 * - %xmm1 becomes (2 * (s - 127)), saturating at 0 and 255
+	 * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s))
+	 */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* dest alpha should not be changed in this func */
+	pand %xmm5, %xmm1
+	pand %xmm5, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 7 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* To take advantage of saturation and be able to do 8 bytes
+	 *  at a time, we divide reshading into two separate steps:
+	 *  adding values above 128, and subtracting values below 128
+	 * These values go into %xmm1 and %xmm3 respectively
+	 * - %xmm1 becomes (2 * (s - 127)), saturating at 0 and 255
+	 * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s))
+	 */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* dest alpha should not be changed in this func */
+	pand %xmm5, %xmm1
+	pand %xmm5, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 8 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* To take advantage of saturation and be able to do 8 bytes
+	 *  at a time, we divide reshading into two separate steps:
+	 *  adding values above 128, and subtracting values below 128
+	 * These values go into %xmm1 and %xmm3 respectively
+	 * - %xmm1 becomes (2 * (s - 127)), saturating at 0 and 255
+	 * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s))
+	 */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* dest alpha should not be changed in this func */
+	pand %xmm5, %xmm1
+	pand %xmm5, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				/* still negative: at least two pixels left */
+	jnz 3f				/* positive: row finished; zero falls into 2: */
+2:
+	/* Grab 1 pixel from src, with colormod (%rcx is 0 here: last pixel of the row) */
+	movl (%rsi, %rcx, 4), %eax
+	ror $24, %eax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1
+	movd (%rdi, %rcx, 4), %xmm2
+	/* To take advantage of saturation and be able to do 8 bytes
+	 *  at a time, we divide reshading into two separate steps:
+	 *  adding values above 128, and subtracting values below 128
+	 * These values go into %xmm1 and %xmm3 respectively
+	 * - %xmm1 becomes (2 * (s - 127)), saturating at 0 and 255
+	 * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s))
+	 */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* dest alpha should not be changed in this func */
+	pand %xmm5, %xmm1
+	pand %xmm5, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)
+3:					/* row done */
+	leaq (%rsi, %r10, 4), %rsi	/* src moves on by %r10 pixels */
+	leaq (%rdi, %r11, 4), %rdi	/* dst moves on by %r11 pixels */
+	decq %r9			/* %r9 counts the rows still to do */
+	jnz 0b
+
+9:	/* NOTE(review): no branch to 9 in this body; presumably used by ENTER - confirm */
+	LEAVE
+SIZE(imlib_amd64_reshade_copy_rgba_to_rgb_cmod)
+
+PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod):
+	ENTER
+	/* Reshade dst from colour-modified src; dst alpha is replaced by the modified src alpha */
+	movdqu m0XXX0XXX0XXX0XXX(%rip), %xmm5	/* mask that keeps the reshaded bytes of dst */
+	movdqu m0VVV0VVV0VVV0VVV(%rip), %xmm6	/* bias subtracted from / added to s */
+	movdqu mX000X000X000X000(%rip), %xmm7	/* mask that keeps the alpha taken from src */
+
+	/* Point %rsi/%rdi at the last pixel of the row; pixels are then */
+	/* addressed with a negative index in %rcx that counts up to 0 */
+	leaq (%rsi, %r8, 4), %rsi	
+	leaq (%rdi, %r8, 4), %rdi	
+					
+	/* Back up one pixel (4 bytes): the last pixel is at %rcx = 0 */
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			/* %r8 = -(pixels per row) from here on */
+0:					/* start of one row */
+	movq %r8, %rcx			
+					
+	incq %rcx			/* %rcx = 1 - width; ZF set for a 1-pixel row */
+
+	/* prefetch a couple cache lines ahead (flags from incq are kept) */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	
+1:
+	/* main loop: 8 unrolled 2-pixel copies = 64 bytes per pass */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 1 of 8) */
+	movq (%rsi, %rcx, 4), %rax	/* two 32-bit src pixels */
+	rorq $56, %rax			/* highest byte (top byte of 2nd pixel) -> %al */
+	movzbq %al, %rbx		/* %rbx = zero-extended table index */
+	movzbq 0x300(%r14, %rbx), %rdx	/* top byte of a pixel uses the table at +0x300 */
+	shlq $8, %rdx			/* make room for the next result byte */
+	rolq $8, %rax			/* next lower src byte -> %al */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* 2nd-highest byte: table at +0x000 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* 3rd byte: table at +0x100 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* lowest byte: table at +0x200 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl	/* same four lookups for the 1st pixel */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1		/* %xmm1 = modified pixels, src byte order */
+	movq (%rdi, %rcx, 4), %xmm2	/* %xmm2 = the two dst pixels */
+	/* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */
+	movdqa %xmm1, %xmm3		/* %xmm3 will become the amount to subtract */
+	psubusb %xmm6, %xmm1		/* %xmm1 = s - bias, floored at 0 */
+	movdqa %xmm1, %xmm0		/* copy taken after the bias step; only its %xmm7 bytes are used */
+	paddusb %xmm1, %xmm1		/* doubled, capped at 255: amount to add */
+	paddusb %xmm6, %xmm3		/* s + bias, capped at 255 */
+	pxor %xmm5, %xmm3		/* bit-inverts the bytes selected by the mask */
+	paddusb %xmm3, %xmm3		/* doubled, capped at 255 */
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	
+	/* d alpha = s alpha */
+	pand %xmm5, %xmm2		/* drop whatever the reshade left outside the mask */
+	pand %xmm7, %xmm0		/* keep only the saved src bytes under %xmm7 */
+	por %xmm0, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			/* two pixels done */
+	jz 2f				/* exactly one pixel left in the row */
+	jns 3f				/* index went positive: row finished */
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 2 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	movdqa %xmm1, %xmm0
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	
+	/* d alpha = s alpha */
+	pand %xmm5, %xmm2
+	pand %xmm7, %xmm0
+	por %xmm0, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 3 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	movdqa %xmm1, %xmm0
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	
+	/* d alpha = s alpha */
+	pand %xmm5, %xmm2
+	pand %xmm7, %xmm0
+	por %xmm0, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 4 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	movdqa %xmm1, %xmm0
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	
+	/* d alpha = s alpha */
+	pand %xmm5, %xmm2
+	pand %xmm7, %xmm0
+	por %xmm0, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 5 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	movdqa %xmm1, %xmm0
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	
+	/* d alpha = s alpha */
+	pand %xmm5, %xmm2
+	pand %xmm7, %xmm0
+	por %xmm0, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 6 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	movdqa %xmm1, %xmm0
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	
+	/* d alpha = s alpha */
+	pand %xmm5, %xmm2
+	pand %xmm7, %xmm0
+	por %xmm0, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 7 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	movdqa %xmm1, %xmm0
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	
+	/* d alpha = s alpha */
+	pand %xmm5, %xmm2
+	pand %xmm7, %xmm0
+	por %xmm0, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 8 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	movdqa %xmm1, %xmm0
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	
+	/* d alpha = s alpha */
+	pand %xmm5, %xmm2
+	pand %xmm7, %xmm0
+	por %xmm0, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				/* still negative: at least two pixels left */
+	jnz 3f				/* positive: row finished; zero falls into 2: */
+2:
+	/* Grab 1 pixel from src, with colormod (%rcx is 0 here: last pixel of the row) */
+	movl (%rsi, %rcx, 4), %eax
+	ror $24, %eax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1
+	movd (%rdi, %rcx, 4), %xmm2
+	/* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	movdqa %xmm1, %xmm0
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	
+	/* d alpha = s alpha */
+	pand %xmm5, %xmm2
+	pand %xmm7, %xmm0
+	por %xmm0, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)
+3:					/* row done */
+	leaq (%rsi, %r10, 4), %rsi	/* src moves on by %r10 pixels */
+	leaq (%rdi, %r11, 4), %rdi	/* dst moves on by %r11 pixels */
+	decq %r9			/* %r9 counts the rows still to do */
+	jnz 0b
+
+9:	/* NOTE(review): no branch to 9 in this body; presumably used by ENTER - confirm */
+	LEAVE
+SIZE(imlib_amd64_reshade_copy_rgba_to_rgba_cmod)
+
+PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod):
+	ENTER
+	/* Reshade dst by src after mapping src's three colour bytes through cm's lookup tables; src alpha is ignored and amod[255] becomes dst alpha */
+	movdqu m0XXX0XXX0XXX0XXX(%rip), %xmm5	/* mask used to invert s2 and to keep d's colour bytes (colour-byte mask, judging by its name) */
+	movdqu m0VVV0VVV0VVV0VVV(%rip), %xmm6	/* bias subtracted from / added to s; its value is defined outside this chunk */
+	movdqu mX000X000X000X000(%rip), %xmm7	/* mask applied to %xmm0 before merging alpha (alpha-byte mask, judging by its name) */
+	/* NOTE(review): ENTER/LEAVE are defined outside this chunk. The code relies on %rsi=src, %rdi=dst, %r8=w, %r9=h, %r10/%r11=src/dst row pitch in pixels, %r14=cm, and on %rbx being saved/restored - confirm */
+	/* Each row is walked from its first pixel to its last with a negative */
+	/* index that counts up to zero, two pixels per step */
+	leaq (%rsi, %r8, 4), %rsi	/* %rsi = one past the last src pixel of the row */
+	leaq (%rdi, %r8, 4), %rdi	/* %rdi = one past the last dst pixel of the row */
+					
+	/* Step back one pixel so that the row's last pixel sits at index %rcx = 0 */
+	subq $4, %rsi			
+	subq $4, %rdi			
+					
+	negq %r8			/* %r8 = -w, reloaded into %rcx for every row */
+0:					/* per-row loop */
+	movq %r8, %rcx			
+					
+	incq %rcx			/* %rcx = 1 - w: index of the row's first pixel; ZF is set when w == 1 */
+
+	/* prefetch the start of the row; prefetches leave the flags from incq untouched */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */	
+1:
+	/* main loop: eight unrolled copies of a two-pixel step (64 bytes per full pass) */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Copy 1 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax	/* %rax = two src pixels; the pixel at the higher address is in the upper dword */
+	rorq $48, %rax	/* %al = byte 2 of the upper pixel */
+	movq $0x000000FF, %rbx	/* index 255; also clears the upper bits of %rbx for the movb-built indices below */
+	movzbq 0x300(%r14, %rbx), %rdx	/* %rdx = table byte at cm + 0x300 + 255, i.e. amod[255] */
+	shlq $8, %rdx	/* make room for the next output byte */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* byte 2 mapped through the table at cm + 0x000 */
+	shlq $8, %rdx
+	rolq $8, %rax	/* %al = byte 1 of the upper pixel */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* byte 1 mapped through the table at cm + 0x100 */
+	shlq $8, %rdx
+	rolq $8, %rax	/* %al = byte 0 of the upper pixel */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* byte 0 mapped through the table at cm + 0x200 */
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx	/* index 255 again for the lower pixel's alpha */
+	movb 0x300(%r14, %rbx), %dl	/* amod[255] */
+	shlq $8, %rdx
+	rolq $16, %rax	/* skip the lower pixel's own alpha byte; %al = its byte 2 */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax	/* %al = byte 1 of the lower pixel */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax	/* %al = byte 0 of the lower pixel */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1	/* %xmm1 = s: the two colour-modified pixels */
+	movq (%rdi, %rcx, 4), %xmm2	/* %xmm2 = d: the two dst pixels */
+	/* %xmm0 keeps a copy of s whose alpha byte is merged into dst at the end */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1	/* s - bias, saturating at 0 */
+	movdqa %xmm1, %xmm0	/* copy taken after the subtraction; only its %xmm7-masked bytes are used */
+	paddusb %xmm1, %xmm1	/* s1 = 2 * (s - bias), saturating at 255 */
+	paddusb %xmm6, %xmm3	/* s + bias, saturating at 255 */
+	pxor %xmm5, %xmm3	/* invert the bytes selected by %xmm5 */
+	paddusb %xmm3, %xmm3	/* s2 = twice that, saturating at 255 */
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	
+	/* d alpha = s alpha, where s alpha is the amod[255] byte built above */
+	pand %xmm5, %xmm2
+	pand %xmm7, %xmm0
+	por %xmm0, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			/* two pixels done; flags now describe the new index */
+	jz 2f	/* index == 0: exactly one pixel left */
+	jns 3f				/* index > 0: row finished */
+
+	/* Copy 2 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* %xmm0 keeps a copy of s whose alpha byte is merged into dst at the end */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	movdqa %xmm1, %xmm0
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	
+	/* d alpha = s alpha (amod[255]) */
+	pand %xmm5, %xmm2
+	pand %xmm7, %xmm0
+	por %xmm0, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Copy 3 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* %xmm0 keeps a copy of s whose alpha byte is merged into dst at the end */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	movdqa %xmm1, %xmm0
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	
+	/* d alpha = s alpha (amod[255]) */
+	pand %xmm5, %xmm2
+	pand %xmm7, %xmm0
+	por %xmm0, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Copy 4 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* %xmm0 keeps a copy of s whose alpha byte is merged into dst at the end */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	movdqa %xmm1, %xmm0
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	
+	/* d alpha = s alpha (amod[255]) */
+	pand %xmm5, %xmm2
+	pand %xmm7, %xmm0
+	por %xmm0, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Copy 5 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* %xmm0 keeps a copy of s whose alpha byte is merged into dst at the end */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	movdqa %xmm1, %xmm0
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	
+	/* d alpha = s alpha (amod[255]) */
+	pand %xmm5, %xmm2
+	pand %xmm7, %xmm0
+	por %xmm0, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Copy 6 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* %xmm0 keeps a copy of s whose alpha byte is merged into dst at the end */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	movdqa %xmm1, %xmm0
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	
+	/* d alpha = s alpha (amod[255]) */
+	pand %xmm5, %xmm2
+	pand %xmm7, %xmm0
+	por %xmm0, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Copy 7 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* %xmm0 keeps a copy of s whose alpha byte is merged into dst at the end */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	movdqa %xmm1, %xmm0
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	
+	/* d alpha = s alpha (amod[255]) */
+	pand %xmm5, %xmm2
+	pand %xmm7, %xmm0
+	por %xmm0, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	jz 2f
+	jns 3f				
+
+	/* Copy 8 of 8: grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+	/* %xmm0 keeps a copy of s whose alpha byte is merged into dst at the end */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	movdqa %xmm1, %xmm0
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	
+	/* d alpha = s alpha (amod[255]) */
+	pand %xmm5, %xmm2
+	pand %xmm7, %xmm0
+	por %xmm0, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx			
+	incq %rcx			
+	js 1b				/* index still negative: at least two pixels left, run another pass */
+	jnz 3f				/* index > 0: row finished; index == 0 falls through to the single-pixel tail */
+2:
+	/* Grab 1 pixel from src, with colormod, with a = amod[255] (32-bit version of the two-pixel step) */
+	movl (%rsi, %rcx, 4), %eax
+	ror $16, %eax	/* %al = byte 2 of the pixel */
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx	/* amod[255] */
+	shl $8, %edx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax	/* %al = byte 1 */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax	/* %al = byte 0 */
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1
+	movd (%rdi, %rcx, 4), %xmm2
+	/* %xmm0 keeps a copy of s whose alpha byte is merged into dst at the end */
+	movdqa %xmm1, %xmm3
+	psubusb %xmm6, %xmm1
+	movdqa %xmm1, %xmm0
+	paddusb %xmm1, %xmm1
+	paddusb %xmm6, %xmm3
+	pxor %xmm5, %xmm3
+	paddusb %xmm3, %xmm3
+
+	/* d = d + s1 - s2, unsigned saturation */
+	paddusb %xmm1, %xmm2
+	psubusb %xmm3, %xmm2
+	
+	/* d alpha = s alpha (amod[255]) */
+	pand %xmm5, %xmm2
+	pand %xmm7, %xmm0
+	por %xmm0, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)
+3:					/* end of row */
+	leaq (%rsi, %r10, 4), %rsi	/* advance src by %r10 pixels to the same position in the next row */
+	leaq (%rdi, %r11, 4), %rdi	/* advance dst by %r11 pixels likewise */
+	decq %r9			/* one row fewer to go */
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_reshade_copy_rgb_to_rgba_cmod)
+
+
+#endif
diff --git a/src/lib/blend.c b/src/lib/blend.c
index 706e55b..541ea6e 100644
--- a/src/lib/blend.c
+++ b/src/lib/blend.c
@@ -1402,6 +1402,7 @@ __imlib_ReCopyRGBToRGBACmod(DATA32 * src, int srcw, DATA32 * dst, int dstw,
 #define __imlib_amd64_blend_rgb_to_rgba		__imlib_amd64_copy_rgb_to_rgba
 #define __imlib_CopyRGBToRGBCmod		__imlib_CopyRGBAToRGBCmod
 #define __imlib_mmx_copy_rgb_to_rgb_cmod	__imlib_mmx_copy_rgba_to_rgb_cmod
+#define __imlib_amd64_copy_rgb_to_rgb_cmod	__imlib_amd64_copy_rgba_to_rgb_cmod
 
 #define __imlib_AddCopyRGBToRGB			__imlib_AddCopyRGBAToRGB
 #define __imlib_AddBlendRGBToRGB		__imlib_AddCopyRGBToRGB
@@ -1414,6 +1415,7 @@ __imlib_ReCopyRGBToRGBACmod(DATA32 * src, int srcw, DATA32 * dst, int dstw,
 #define __imlib_amd64_add_blend_rgb_to_rgba	__imlib_amd64_add_copy_rgb_to_rgba
 #define __imlib_AddCopyRGBToRGBCmod		__imlib_AddCopyRGBAToRGBCmod
 #define __imlib_mmx_add_copy_rgb_to_rgb_cmod	__imlib_mmx_add_copy_rgb_to_rgba_cmod
+#define __imlib_amd64_add_copy_rgb_to_rgb_cmod	__imlib_amd64_add_copy_rgb_to_rgba_cmod
 
 #define __imlib_SubCopyRGBToRGB			__imlib_SubCopyRGBAToRGB
 #define __imlib_SubBlendRGBToRGB		__imlib_SubCopyRGBToRGB
@@ -1427,6 +1429,7 @@ __imlib_ReCopyRGBToRGBACmod(DATA32 * src, int srcw, DATA32 * dst, int dstw,
 #define __imlib_amd64_subtract_blend_rgb_to_rgba	__imlib_amd64_subtract_copy_rgb_to_rgba
 #define __imlib_SubCopyRGBToRGBCmod		__imlib_SubCopyRGBAToRGBCmod
 #define __imlib_mmx_subtract_copy_rgb_to_rgb_cmod	__imlib_mmx_subtract_copy_rgb_to_rgba_cmod
+#define __imlib_amd64_subtract_copy_rgb_to_rgb_cmod	__imlib_amd64_subtract_copy_rgb_to_rgba_cmod
 
 #define __imlib_ReCopyRGBToRGB			__imlib_ReCopyRGBAToRGB
 #define __imlib_ReBlendRGBToRGB			__imlib_ReCopyRGBToRGB
@@ -1440,6 +1443,7 @@ __imlib_ReCopyRGBToRGBACmod(DATA32 * src, int srcw, DATA32 * dst, int dstw,
 #define __imlib_amd64_reshade_blend_rgb_to_rgba	__imlib_amd64_reshade_copy_rgb_to_rgba
 #define __imlib_ReCopyRGBToRGBCmod		__imlib_ReCopyRGBAToRGBCmod
 #define __imlib_mmx_reshade_copy_rgb_to_rgb_cmod	__imlib_mmx_reshade_copy_rgb_to_rgba_cmod
+#define __imlib_amd64_reshade_copy_rgb_to_rgb_cmod	__imlib_amd64_reshade_copy_rgb_to_rgba_cmod
 
 
 ImlibBlendFunction
@@ -1570,10 +1574,14 @@ __imlib_GetBlendFunction(ImlibOp op, char blend, char merge_alpha, char rgb_src,
          {{__imlib_amd64_copy_rgba_to_rgba, __imlib_amd64_blend_rgba_to_rgba },
 	  {__imlib_amd64_copy_rgb_to_rgba, __imlib_amd64_blend_rgb_to_rgba}}},
 
-        {{{__imlib_CopyRGBAToRGBCmod, __imlib_BlendRGBAToRGBCmod},
-          {__imlib_CopyRGBToRGBCmod, __imlib_BlendRGBToRGBCmod}},
-         {{__imlib_CopyRGBAToRGBACmod, __imlib_BlendRGBAToRGBACmod},
-	 {__imlib_CopyRGBToRGBACmod, __imlib_BlendRGBToRGBACmod}}}},
+        {{{__imlib_amd64_copy_rgba_to_rgb_cmod,
+	   __imlib_amd64_blend_rgba_to_rgb_cmod},
+          {__imlib_amd64_copy_rgb_to_rgb_cmod,
+	   __imlib_amd64_blend_rgb_to_rgb_cmod}},
+         {{__imlib_amd64_copy_rgba_to_rgba_cmod,
+	   __imlib_amd64_blend_rgba_to_rgba_cmod},
+	  {__imlib_amd64_copy_rgb_to_rgba_cmod,
+	   __imlib_amd64_blend_rgb_to_rgba_cmod}}}},
        /*\ OP_ADD \ */
        {{{{__imlib_amd64_add_copy_rgba_to_rgb, 
 	   __imlib_amd64_add_blend_rgba_to_rgb},
@@ -1584,10 +1592,14 @@ __imlib_GetBlendFunction(ImlibOp op, char blend, char merge_alpha, char rgb_src,
           {__imlib_amd64_add_copy_rgb_to_rgba, 
 	  __imlib_amd64_add_blend_rgb_to_rgba}}},
 
-        {{{__imlib_AddCopyRGBAToRGBCmod, __imlib_AddBlendRGBAToRGBCmod},
-          {__imlib_AddCopyRGBToRGBCmod, __imlib_AddBlendRGBToRGBCmod}},
-         {{__imlib_AddCopyRGBAToRGBACmod, __imlib_AddBlendRGBAToRGBACmod},
-          {__imlib_AddCopyRGBToRGBACmod, __imlib_AddBlendRGBToRGBACmod}}}},
+        {{{__imlib_amd64_add_copy_rgba_to_rgb_cmod, 
+	   __imlib_amd64_add_blend_rgba_to_rgb_cmod},
+          {__imlib_amd64_add_copy_rgb_to_rgb_cmod, 
+	   __imlib_amd64_add_blend_rgb_to_rgb_cmod}},
+         {{__imlib_amd64_add_copy_rgba_to_rgba_cmod, 
+	   __imlib_amd64_add_blend_rgba_to_rgba_cmod},
+          {__imlib_amd64_add_copy_rgb_to_rgba_cmod, 
+	   __imlib_amd64_add_blend_rgb_to_rgba_cmod}}}},
        /*\ OP_SUBTRACT \ */
        {{{{__imlib_amd64_subtract_copy_rgba_to_rgb,
            __imlib_amd64_subtract_blend_rgba_to_rgb},
@@ -1598,10 +1610,14 @@ __imlib_GetBlendFunction(ImlibOp op, char blend, char merge_alpha, char rgb_src,
           {__imlib_amd64_subtract_copy_rgb_to_rgba,
 	  __imlib_amd64_subtract_blend_rgb_to_rgba}}},
 
-        {{{__imlib_SubCopyRGBAToRGBCmod, __imlib_SubBlendRGBAToRGBCmod},
-          {__imlib_SubCopyRGBToRGBCmod, __imlib_SubBlendRGBToRGBCmod}},
-         {{__imlib_SubCopyRGBAToRGBACmod, __imlib_SubBlendRGBAToRGBACmod},
-          {__imlib_SubCopyRGBToRGBACmod, __imlib_SubBlendRGBToRGBACmod}}}},
+        {{{__imlib_amd64_subtract_copy_rgba_to_rgb_cmod,
+           __imlib_amd64_subtract_blend_rgba_to_rgb_cmod},
+          {__imlib_amd64_subtract_copy_rgb_to_rgb_cmod,
+           __imlib_amd64_subtract_blend_rgb_to_rgb_cmod}},
+         {{__imlib_amd64_subtract_copy_rgba_to_rgba_cmod,
+           __imlib_amd64_subtract_blend_rgba_to_rgba_cmod},
+          {__imlib_amd64_subtract_copy_rgb_to_rgba_cmod,
+	  __imlib_amd64_subtract_blend_rgb_to_rgba_cmod}}}},
        /*\ OP_RESHADE \ */
        {{{{__imlib_amd64_reshade_copy_rgba_to_rgb,
            __imlib_amd64_reshade_blend_rgba_to_rgb},
@@ -1612,10 +1628,14 @@ __imlib_GetBlendFunction(ImlibOp op, char blend, char merge_alpha, char rgb_src,
           {__imlib_amd64_reshade_copy_rgb_to_rgba,
 	  __imlib_amd64_reshade_blend_rgb_to_rgba}}},
 
-        {{{__imlib_ReCopyRGBAToRGBCmod, __imlib_ReBlendRGBAToRGBCmod},
-          {__imlib_ReCopyRGBToRGBCmod, __imlib_ReBlendRGBToRGBCmod}},
-         {{__imlib_ReCopyRGBAToRGBACmod, __imlib_ReBlendRGBAToRGBACmod},
-          {__imlib_ReCopyRGBToRGBACmod, __imlib_ReBlendRGBToRGBACmod}}}}},
+        {{{__imlib_amd64_reshade_copy_rgba_to_rgb_cmod,
+           __imlib_amd64_reshade_blend_rgba_to_rgb_cmod},
+          {__imlib_amd64_reshade_copy_rgb_to_rgb_cmod,
+           __imlib_amd64_reshade_blend_rgb_to_rgb_cmod}},
+         {{__imlib_amd64_reshade_copy_rgba_to_rgba_cmod,
+           __imlib_amd64_reshade_blend_rgba_to_rgba_cmod},
+          {__imlib_amd64_reshade_copy_rgb_to_rgba_cmod,
+	   __imlib_amd64_reshade_blend_rgb_to_rgba_cmod}}}}},
 #endif
    };
 
diff --git a/src/lib/blend.h b/src/lib/blend.h
index 20144ac..9458f3b 100644
--- a/src/lib/blend.h
+++ b/src/lib/blend.h
@@ -613,5 +613,90 @@ void
 __imlib_amd64_reshade_copy_rgb_to_rgba(DATA32 *src, int sw, DATA32 *dst,
 				      int dw, int w, int h, ImlibColorModifier *cm);
 
+/* amd64 blend routines taking a colour modifier (cm); __imlib_GetBlendFunction in blend.c selects among them. Presumably all implemented in amd64_blend_cmod.S - confirm */
+void
+__imlib_amd64_blend_rgba_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst,
+                              int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_blend_rgba_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst,
+			       int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_blend_rgb_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst,
+                              int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_blend_rgb_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst,
+			       int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_copy_rgba_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst,
+			     int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_copy_rgba_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst,
+                              int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_copy_rgb_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst,
+			     int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_add_blend_rgba_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst,
+				  int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_add_blend_rgba_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst,
+				   int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_add_blend_rgb_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst,
+				  int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_add_blend_rgb_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst,
+				   int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_add_copy_rgba_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst,
+				 int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_add_copy_rgba_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst,
+				  int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_add_copy_rgb_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst,
+				  int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_subtract_blend_rgba_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst,
+				       int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_subtract_blend_rgba_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst,
+					int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_subtract_blend_rgb_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst,
+				       int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_subtract_blend_rgb_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst,
+					int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_subtract_copy_rgba_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst,
+				      int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_subtract_copy_rgba_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst,
+				       int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_subtract_copy_rgb_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst,
+				       int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_reshade_blend_rgba_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst,
+				      int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_reshade_blend_rgba_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst,
+				       int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_reshade_blend_rgb_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst,
+				      int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_reshade_blend_rgb_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst,
+				       int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_reshade_copy_rgba_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst,
+				     int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_reshade_copy_rgba_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst,
+				      int dw, int w, int h, ImlibColorModifier *cm);
+void
+__imlib_amd64_reshade_copy_rgb_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst,
+				      int dw, int w, int h, ImlibColorModifier *cm);
 #endif
 #endif