diff --git a/src/lib/amd64_blend.S b/src/lib/amd64_blend.S
index 5eabfc4..5721913 100644
--- a/src/lib/amd64_blend.S
+++ b/src/lib/amd64_blend.S
@@ -1,9 +1,7 @@
 #include <config.h>
 #include "asm.h"
 
-#ifdef DO_AMD64_ASM
-
-/*\ 
+/*\
 |*| AMD64 SSE2 assembly blending routines for Imlib2
 |*| Written by John Slaten <zartheenumerator@comcast.net>
 |*| Based on MMX routines written by Willem Monsuwe <willem@stack.nl>
@@ -79,7 +77,7 @@ FN_(imlib_amd64_reshade_copy_rgba_to_rgba)
 FN_(imlib_amd64_reshade_copy_rgb_to_rgba)
 
 .extern pow_lut
-	
+
 /*\ SSE register use:
 |*| %xmm1 = Source value
 |*| %xmm2 = Destination value
@@ -96,9 +94,6 @@ FN_(imlib_amd64_reshade_copy_rgb_to_rgba)
 |*| %r10d = sw
 |*| %r11d = dw
 \*/
-	
-
-
 
 
 #define ENTER		\
@@ -118,7 +113,7 @@ FN_(imlib_amd64_reshade_copy_rgb_to_rgba)
 	jz 9f		; \
 	testq %r9, %r9	; \
 	jz 9f
-	
+
 #define LEAVE		\
 	popq %r14	; \
 	popq %r13	; \
@@ -135,20 +130,20 @@ PR_(imlib_amd64_blend_rgba_to_rgb):
 	movdqu c1(%rip), %xmm5
 	movdqu m00XXXXXX(%rip), %xmm6
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Walk each line left to right with a negative index, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Back up one pixel so the final pixel sits at %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -156,7 +151,7 @@ PR_(imlib_amd64_blend_rgba_to_rgb):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -168,13 +163,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -197,10 +192,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -208,13 +203,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -237,10 +232,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -248,13 +243,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -277,10 +272,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -288,13 +283,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -317,10 +312,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -328,13 +323,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -357,10 +352,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -368,13 +363,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -397,10 +392,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -408,13 +403,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -437,10 +432,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -448,13 +443,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -477,10 +472,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	movd (%rsi, %rcx, 4), %xmm1
 	movd (%rdi, %rcx, 4), %xmm2
@@ -488,13 +483,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -516,10 +511,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb):
 	/* Repack new pixels */
 	packuswb %xmm4, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -534,20 +529,20 @@ PR_(imlib_amd64_blend_rgba_to_rgba):
 	movdqu mX000X000X000X000(%rip), %xmm6
 	movq pow_lut@GOTPCREL(%rip), %r13
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Walk each line left to right with a negative index, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Back up one pixel so the final pixel sits at %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -555,7 +550,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -563,8 +558,8 @@ PR_(imlib_amd64_blend_rgba_to_rgba):
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -575,14 +570,14 @@ PR_(imlib_amd64_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* override source alpha to 255 */
 	por %xmm6, %xmm1
 
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -600,15 +595,15 @@ PR_(imlib_amd64_blend_rgba_to_rgba):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -619,14 +614,14 @@ PR_(imlib_amd64_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* override source alpha to 255 */
 	por %xmm6, %xmm1
 
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -644,15 +639,15 @@ PR_(imlib_amd64_blend_rgba_to_rgba):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -663,14 +658,14 @@ PR_(imlib_amd64_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* override source alpha to 255 */
 	por %xmm6, %xmm1
 
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -688,15 +683,15 @@ PR_(imlib_amd64_blend_rgba_to_rgba):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -707,14 +702,14 @@ PR_(imlib_amd64_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* override source alpha to 255 */
 	por %xmm6, %xmm1
 
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -732,15 +727,15 @@ PR_(imlib_amd64_blend_rgba_to_rgba):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -751,14 +746,14 @@ PR_(imlib_amd64_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* override source alpha to 255 */
 	por %xmm6, %xmm1
 
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -776,15 +771,15 @@ PR_(imlib_amd64_blend_rgba_to_rgba):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -795,14 +790,14 @@ PR_(imlib_amd64_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* override source alpha to 255 */
 	por %xmm6, %xmm1
 
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -820,15 +815,15 @@ PR_(imlib_amd64_blend_rgba_to_rgba):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -839,14 +834,14 @@ PR_(imlib_amd64_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* override source alpha to 255 */
 	por %xmm6, %xmm1
 
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -864,15 +859,15 @@ PR_(imlib_amd64_blend_rgba_to_rgba):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -883,14 +878,14 @@ PR_(imlib_amd64_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* override source alpha to 255 */
 	por %xmm6, %xmm1
 
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -908,28 +903,28 @@ PR_(imlib_amd64_blend_rgba_to_rgba):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	movd (%rsi, %rcx, 4), %xmm1
 	movd (%rdi, %rcx, 4), %xmm2
-	/* Load one pixel as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load one pixel as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 3(%rdi, %rcx, 4), %rdx
 	movb 3(%rsi, %rcx, 4), %dh
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %eax, %xmm3 
+	movd %eax, %xmm3
 	/* override source alpha to 255 */
 	por %xmm6, %xmm1
 
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -946,10 +941,10 @@ PR_(imlib_amd64_blend_rgba_to_rgba):
 	/* repack new pixels */
 	packuswb %xmm4, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -1361,20 +1356,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb):
 	pxor %xmm4, %xmm4
 	movdqu m00XXXXXX(%rip), %xmm6
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Walk each line left to right with a negative index, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Back up one pixel so the final pixel sits at %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -1382,7 +1377,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -1394,13 +1389,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -1421,10 +1416,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -1432,13 +1427,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -1459,10 +1454,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -1470,13 +1465,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -1497,10 +1492,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -1508,13 +1503,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -1535,10 +1530,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -1546,13 +1541,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -1573,10 +1568,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -1584,13 +1579,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -1611,10 +1606,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -1622,13 +1617,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -1649,10 +1644,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -1660,13 +1655,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -1687,10 +1682,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	movd (%rsi, %rcx, 4), %xmm1
 	movd (%rdi, %rcx, 4), %xmm2
@@ -1698,13 +1693,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -1724,10 +1719,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb):
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -1743,20 +1738,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba):
 	movdqu mX000X000X000X000(%rip), %xmm6
 	movq pow_lut@GOTPCREL(%rip), %r13
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Walk each line left to right with a negative index, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Back up one pixel so the final pixel sits at %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -1764,7 +1759,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -1772,8 +1767,8 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba):
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -1784,11 +1779,11 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -1805,20 +1800,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -1829,11 +1824,11 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -1850,20 +1845,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -1874,11 +1869,11 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -1895,20 +1890,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -1919,11 +1914,11 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -1940,20 +1935,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -1964,11 +1959,11 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -1985,20 +1980,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -2009,11 +2004,11 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -2030,20 +2025,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -2054,11 +2049,11 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -2075,20 +2070,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -2099,11 +2094,11 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -2120,30 +2115,30 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	movd (%rsi, %rcx, 4), %xmm1
 	movd (%rdi, %rcx, 4), %xmm2
-	/* Load one pixel as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load one pixel as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 3(%rdi, %rcx, 4), %rdx
 	movb 3(%rsi, %rcx, 4), %dh
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %eax, %xmm3 
+	movd %eax, %xmm3
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -2160,14 +2155,14 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -2502,7 +2497,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba):
 
 	movd (%rsi, %rcx, 4), %xmm1
 	movd (%rdi, %rcx, 4), %xmm2
-	/* d = (d + s) | 0xff000000 */	
+	/* d = (d + s) | 0xff000000 */
 	paddusb %xmm1, %xmm2
 	por %xmm5, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
@@ -2528,7 +2523,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba):
 
 	movdqa (%rsi, %rcx, 4), %xmm1
 	movdqa (%rdi, %rcx, 4), %xmm2
-	/* d = (d + s) | 0xff000000 */	
+	/* d = (d + s) | 0xff000000 */
 	paddusb %xmm1, %xmm2
 	por %xmm5, %xmm2
 	movdqa %xmm2, (%rdi, %rcx, 4)
@@ -2537,7 +2532,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba):
 
 	movdqa (%rsi, %rcx, 4), %xmm1
 	movdqa (%rdi, %rcx, 4), %xmm2
-	/* d = (d + s) | 0xff000000 */	
+	/* d = (d + s) | 0xff000000 */
 	paddusb %xmm1, %xmm2
 	por %xmm5, %xmm2
 	movdqa %xmm2, (%rdi, %rcx, 4)
@@ -2546,7 +2541,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba):
 
 	movdqa (%rsi, %rcx, 4), %xmm1
 	movdqa (%rdi, %rcx, 4), %xmm2
-	/* d = (d + s) | 0xff000000 */	
+	/* d = (d + s) | 0xff000000 */
 	paddusb %xmm1, %xmm2
 	por %xmm5, %xmm2
 	movdqa %xmm2, (%rdi, %rcx, 4)
@@ -2555,7 +2550,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba):
 
 	movdqa (%rsi, %rcx, 4), %xmm1
 	movdqa (%rdi, %rcx, 4), %xmm2
-	/* d = (d + s) | 0xff000000 */	
+	/* d = (d + s) | 0xff000000 */
 	paddusb %xmm1, %xmm2
 	por %xmm5, %xmm2
 	movdqa %xmm2, (%rdi, %rcx, 4)
@@ -2570,7 +2565,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba):
 
 	movdqu (%rsi, %rcx, 4), %xmm1
 	movdqa (%rdi, %rcx, 4), %xmm2
-	/* d = (d + s) | 0xff000000 */	
+	/* d = (d + s) | 0xff000000 */
 	paddusb %xmm1, %xmm2
 	por %xmm5, %xmm2
 	movdqa %xmm2, (%rdi, %rcx, 4)
@@ -2579,7 +2574,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba):
 
 	movdqu (%rsi, %rcx, 4), %xmm1
 	movdqa (%rdi, %rcx, 4), %xmm2
-	/* d = (d + s) | 0xff000000 */	
+	/* d = (d + s) | 0xff000000 */
 	paddusb %xmm1, %xmm2
 	por %xmm5, %xmm2
 	movdqa %xmm2, (%rdi, %rcx, 4)
@@ -2588,7 +2583,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba):
 
 	movdqu (%rsi, %rcx, 4), %xmm1
 	movdqa (%rdi, %rcx, 4), %xmm2
-	/* d = (d + s) | 0xff000000 */	
+	/* d = (d + s) | 0xff000000 */
 	paddusb %xmm1, %xmm2
 	por %xmm5, %xmm2
 	movdqa %xmm2, (%rdi, %rcx, 4)
@@ -2597,7 +2592,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba):
 
 	movdqu (%rsi, %rcx, 4), %xmm1
 	movdqa (%rdi, %rcx, 4), %xmm2
-	/* d = (d + s) | 0xff000000 */	
+	/* d = (d + s) | 0xff000000 */
 	paddusb %xmm1, %xmm2
 	por %xmm5, %xmm2
 	movdqa %xmm2, (%rdi, %rcx, 4)
@@ -2610,7 +2605,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba):
 
 	movd (%rsi, %rcx, 4), %xmm1
 	movd (%rdi, %rcx, 4), %xmm2
-	/* d = (d + s) | 0xff000000 */	
+	/* d = (d + s) | 0xff000000 */
 	paddusb %xmm1, %xmm2
 	por %xmm5, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
@@ -2633,20 +2628,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
 	pxor %xmm4, %xmm4
 	movdqu m00XXXXXX(%rip), %xmm6
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -2654,7 +2649,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -2666,13 +2661,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -2693,10 +2688,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -2704,13 +2699,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -2731,10 +2726,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -2742,13 +2737,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -2769,10 +2764,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -2780,13 +2775,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -2807,10 +2802,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -2818,13 +2813,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -2845,10 +2840,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -2856,13 +2851,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -2883,10 +2878,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -2894,13 +2889,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -2921,10 +2916,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -2932,13 +2927,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -2959,10 +2954,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	movd (%rsi, %rcx, 4), %xmm1
 	movd (%rdi, %rcx, 4), %xmm2
@@ -2970,13 +2965,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -2996,10 +2991,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -3016,20 +3011,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
 	movdqu mX000X000(%rip), %xmm7
 	xorq %rax, %rax
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -3037,7 +3032,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -3045,8 +3040,8 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -3057,11 +3052,11 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -3079,20 +3074,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -3103,11 +3098,11 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -3125,20 +3120,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -3149,11 +3144,11 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -3171,20 +3166,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -3195,11 +3190,11 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -3217,20 +3212,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -3241,11 +3236,11 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -3263,20 +3258,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -3287,11 +3282,11 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -3309,20 +3304,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -3333,11 +3328,11 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -3355,20 +3350,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
-	/* Load two pixels as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load two pixels as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 7(%rdi, %rcx, 4), %rdx
 	movb 7(%rsi, %rcx, 4), %dh
@@ -3379,11 +3374,11 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
 	shlq $32, %rax
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -3401,30 +3396,30 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	movd (%rsi, %rcx, 4), %xmm1
 	movd (%rdi, %rcx, 4), %xmm2
-	/* Load one pixel as 00, 00, src alpha, combined alpha 
-	 * Combined alpha is derived from the pow_lut table in blend.c 
+	/* Load one pixel as 00, 00, src alpha, combined alpha
+	 * Combined alpha is derived from the pow_lut table in blend.c
 	 */
 	movzbq 3(%rdi, %rcx, 4), %rdx
 	movb 3(%rsi, %rcx, 4), %dh
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
-	movd %eax, %xmm3 
+	movd %eax, %xmm3
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -3442,14 +3437,14 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -3805,7 +3800,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba):
 
 	movd (%rsi, %rcx, 4), %xmm1
 	movd (%rdi, %rcx, 4), %xmm2
-	/* d = (d - s) | 0xff000000 */	
+	/* d = (d - s) | 0xff000000 */
 	psubusb %xmm1, %xmm2
 	por %xmm5, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
@@ -3831,7 +3826,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba):
 
 	movdqa (%rsi, %rcx, 4), %xmm1
 	movdqa (%rdi, %rcx, 4), %xmm2
-	/* d = (d - s) | 0xff000000 */	
+	/* d = (d - s) | 0xff000000 */
 	psubusb %xmm1, %xmm2
 	por %xmm5, %xmm2
 	movdqa %xmm2, (%rdi, %rcx, 4)
@@ -3840,7 +3835,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba):
 
 	movdqa (%rsi, %rcx, 4), %xmm1
 	movdqa (%rdi, %rcx, 4), %xmm2
-	/* d = (d - s) | 0xff000000 */	
+	/* d = (d - s) | 0xff000000 */
 	psubusb %xmm1, %xmm2
 	por %xmm5, %xmm2
 	movdqa %xmm2, (%rdi, %rcx, 4)
@@ -3849,7 +3844,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba):
 
 	movdqa (%rsi, %rcx, 4), %xmm1
 	movdqa (%rdi, %rcx, 4), %xmm2
-	/* d = (d - s) | 0xff000000 */	
+	/* d = (d - s) | 0xff000000 */
 	psubusb %xmm1, %xmm2
 	por %xmm5, %xmm2
 	movdqa %xmm2, (%rdi, %rcx, 4)
@@ -3858,7 +3853,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba):
 
 	movdqa (%rsi, %rcx, 4), %xmm1
 	movdqa (%rdi, %rcx, 4), %xmm2
-	/* d = (d - s) | 0xff000000 */	
+	/* d = (d - s) | 0xff000000 */
 	psubusb %xmm1, %xmm2
 	por %xmm5, %xmm2
 	movdqa %xmm2, (%rdi, %rcx, 4)
@@ -3873,7 +3868,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba):
 
 	movdqu (%rsi, %rcx, 4), %xmm1
 	movdqa (%rdi, %rcx, 4), %xmm2
-	/* d = (d - s) | 0xff000000 */	
+	/* d = (d - s) | 0xff000000 */
 	psubusb %xmm1, %xmm2
 	por %xmm5, %xmm2
 	movdqa %xmm2, (%rdi, %rcx, 4)
@@ -3882,7 +3877,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba):
 
 	movdqu (%rsi, %rcx, 4), %xmm1
 	movdqa (%rdi, %rcx, 4), %xmm2
-	/* d = (d - s) | 0xff000000 */	
+	/* d = (d - s) | 0xff000000 */
 	psubusb %xmm1, %xmm2
 	por %xmm5, %xmm2
 	movdqa %xmm2, (%rdi, %rcx, 4)
@@ -3891,7 +3886,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba):
 
 	movdqu (%rsi, %rcx, 4), %xmm1
 	movdqa (%rdi, %rcx, 4), %xmm2
-	/* d = (d - s) | 0xff000000 */	
+	/* d = (d - s) | 0xff000000 */
 	psubusb %xmm1, %xmm2
 	por %xmm5, %xmm2
 	movdqa %xmm2, (%rdi, %rcx, 4)
@@ -3900,7 +3895,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba):
 
 	movdqu (%rsi, %rcx, 4), %xmm1
 	movdqa (%rdi, %rcx, 4), %xmm2
-	/* d = (d - s) | 0xff000000 */	
+	/* d = (d - s) | 0xff000000 */
 	psubusb %xmm1, %xmm2
 	por %xmm5, %xmm2
 	movdqa %xmm2, (%rdi, %rcx, 4)
@@ -3913,7 +3908,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba):
 
 	movd (%rsi, %rcx, 4), %xmm1
 	movd (%rdi, %rcx, 4), %xmm2
-	/* d = (d - s) | 0xff000000 */	
+	/* d = (d - s) | 0xff000000 */
 	psubusb %xmm1, %xmm2
 	por %xmm5, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
@@ -3937,20 +3932,20 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
 	movdqu m000V0V0V000V0V0V(%rip), %xmm6
 	movdqu m00XXXXXX(%rip), %xmm7
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -3958,7 +3953,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -3970,7 +3965,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -3990,10 +3985,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -4001,7 +3996,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -4021,10 +4016,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -4032,7 +4027,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -4052,10 +4047,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -4063,7 +4058,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -4083,10 +4078,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -4094,7 +4089,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -4114,10 +4109,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -4125,7 +4120,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -4145,10 +4140,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -4156,7 +4151,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -4176,10 +4171,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -4187,7 +4182,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -4207,10 +4202,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	movd (%rsi, %rcx, 4), %xmm1
 	movd (%rdi, %rcx, 4), %xmm2
@@ -4218,7 +4213,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -4237,10 +4232,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
 	/* Repack new pixels */
 	packuswb %xmm4, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -4258,20 +4253,20 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
 	movdqu m000V0V0V000V0V0V(%rip), %xmm8
 	xorq %rax, %rax
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -4279,7 +4274,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -4298,10 +4293,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
 	shrb $1, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -4316,14 +4311,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -4338,10 +4333,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
 	shrb $1, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -4356,14 +4351,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -4378,10 +4373,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
 	shrb $1, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -4396,14 +4391,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -4418,10 +4413,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
 	shrb $1, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -4436,14 +4431,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -4458,10 +4453,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
 	shrb $1, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -4476,14 +4471,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -4498,10 +4493,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
 	shrb $1, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -4516,14 +4511,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -4538,10 +4533,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
 	shrb $1, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -4556,14 +4551,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	movq (%rsi, %rcx, 4), %xmm1
 	movq (%rdi, %rcx, 4), %xmm2
@@ -4578,10 +4573,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
 	shrb $1, %ah
-	movd %rax, %xmm3 
+	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -4596,14 +4591,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	movd (%rsi, %rcx, 4), %xmm1
 	movd (%rdi, %rcx, 4), %xmm2
@@ -4612,10 +4607,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
 	movb (%r13, %rdx), %al
 	movb %dh, %ah
 	shrb $1, %ah
-	movd %eax, %xmm3 
+	movd %eax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -4630,13 +4625,13 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -5015,7 +5010,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
@@ -5055,7 +5050,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
@@ -5078,7 +5073,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
@@ -5101,7 +5096,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
@@ -5124,7 +5119,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
@@ -5153,7 +5148,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
@@ -5176,7 +5171,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
@@ -5199,7 +5194,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
@@ -5222,7 +5217,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
@@ -5249,7 +5244,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
@@ -5521,8 +5516,6 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba):
 	LEAVE
 SIZE(imlib_amd64_reshade_copy_rgb_to_rgba)
 
-#endif
-
 #ifdef __ELF__
 .section .note.GNU-stack,"",@progbits
 #endif
diff --git a/src/lib/amd64_blend_cmod.S b/src/lib/amd64_blend_cmod.S
index 78e0847..e75b868 100644
--- a/src/lib/amd64_blend_cmod.S
+++ b/src/lib/amd64_blend_cmod.S
@@ -1,9 +1,7 @@
 #include <config.h>
 #include "asm.h"
 
-#ifdef DO_AMD64_ASM
-
-/*\ 
+/*\
 |*| AMD64 SSE2 assembly blending routines for Imlib2
 |*| Written by John Slaten <zartheenumerator@comcast.net>
 |*| Based on MMX routines written by Willem Monsuwe <willem@stack.nl>
@@ -87,7 +85,7 @@ FN_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod)
 FN_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod)
 
 .extern pow_lut
-	
+
 /*\ SSE register use:
 |*| %xmm1 = Source value
 |*| %xmm2 = Destination value
@@ -104,13 +102,6 @@ FN_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod)
 |*| %r10d = sw
 |*| %r11d = dw
 \*/
-	
-
-
-
-
-
-
 
 
 #define ENTER		\
@@ -130,7 +121,7 @@ FN_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod)
 	jz 9f		; \
 	testq %r9, %r9	; \
 	jz 9f
-	
+
 #define LEAVE		\
 	popq %r14	; \
 	popq %r13	; \
@@ -147,20 +138,20 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod):
 	movdqa c1(%rip), %xmm5
 	movdqa m00XXXXXX(%rip), %xmm6
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -168,7 +159,7 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -213,13 +204,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -242,10 +233,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -286,13 +277,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -315,10 +306,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -359,13 +350,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -388,10 +379,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -432,13 +423,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -461,10 +452,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -505,13 +496,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -534,10 +525,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -578,13 +569,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -607,10 +598,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -651,13 +642,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -680,10 +671,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -724,13 +715,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -753,10 +744,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod */
 	movl (%rsi, %rcx, 4), %eax
@@ -781,13 +772,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -809,10 +800,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod):
 	/* Repack new pixels */
 	packuswb %xmm4, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -827,20 +818,20 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
 	movdqa mX000X000X000X000(%rip), %xmm6
 	movq pow_lut@GOTPCREL(%rip), %r13
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -848,7 +839,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -916,7 +907,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -934,10 +925,10 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -1001,7 +992,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -1019,10 +1010,10 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -1086,7 +1077,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -1104,10 +1095,10 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -1171,7 +1162,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -1189,10 +1180,10 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -1256,7 +1247,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -1274,10 +1265,10 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -1341,7 +1332,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -1359,10 +1350,10 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -1426,7 +1417,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -1444,10 +1435,10 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -1511,7 +1502,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -1529,10 +1520,10 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod */
 	movl (%rsi, %rcx, 4), %eax
@@ -1568,7 +1559,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -1585,10 +1576,10 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod):
 	/* repack new pixels */
 	packuswb %xmm4, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -1603,20 +1594,20 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod):
 	movdqa mX000X000X000X000(%rip), %xmm6
 	movq pow_lut@GOTPCREL(%rip), %r13
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -1624,7 +1615,7 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -1690,7 +1681,7 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -1708,10 +1699,10 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -1773,7 +1764,7 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -1791,10 +1782,10 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -1856,7 +1847,7 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -1874,10 +1865,10 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -1939,7 +1930,7 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -1957,10 +1948,10 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -2022,7 +2013,7 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -2040,10 +2031,10 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -2105,7 +2096,7 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -2123,10 +2114,10 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -2188,7 +2179,7 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -2206,10 +2197,10 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -2271,7 +2262,7 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -2289,10 +2280,10 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
 	movl (%rsi, %rcx, 4), %eax
@@ -2327,7 +2318,7 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* unpack source and dest */
@@ -2344,10 +2335,10 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod):
 	/* repack new pixels */
 	packuswb %xmm4, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -2360,20 +2351,20 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
 	movdqa c1(%rip), %xmm5
 	movdqa m00XXXXXX(%rip), %xmm6
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -2381,7 +2372,7 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -2424,13 +2415,13 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -2453,10 +2444,10 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -2495,13 +2486,13 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -2524,10 +2515,10 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -2566,13 +2557,13 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -2595,10 +2586,10 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -2637,13 +2628,13 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -2666,10 +2657,10 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -2708,13 +2699,13 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -2737,10 +2728,10 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -2779,13 +2770,13 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -2808,10 +2799,10 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -2850,13 +2841,13 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -2879,10 +2870,10 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -2921,13 +2912,13 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -2950,10 +2941,10 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
 	movl (%rsi, %rcx, 4), %eax
@@ -2972,18 +2963,18 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
 	movb %al, %bl
 	movb 0x200(%r14, %rbx), %dl
 	movd %edx, %xmm1
-	movd (%rdi, %rcx, 4), %xmm2 
+	movd (%rdi, %rcx, 4), %xmm2
 	/* Get alpha from source and unpack to words
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -3005,10 +2996,10 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod):
 	/* Repack new pixels */
 	packuswb %xmm4, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -3019,20 +3010,20 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod):
 
 	movq mX000X000X000X000(%rip), %r13
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -3040,7 +3031,7 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -3076,10 +3067,10 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod):
 	orq %rax, %rdx
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = 0 */
 	movq (%rsi, %rcx, 4), %rax
@@ -3111,10 +3102,10 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod):
 	orq %rax, %rdx
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = 0 */
 	movq (%rsi, %rcx, 4), %rax
@@ -3146,10 +3137,10 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod):
 	orq %rax, %rdx
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = 0 */
 	movq (%rsi, %rcx, 4), %rax
@@ -3181,10 +3172,10 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod):
 	orq %rax, %rdx
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = 0 */
 	movq (%rsi, %rcx, 4), %rax
@@ -3216,10 +3207,10 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod):
 	orq %rax, %rdx
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = 0 */
 	movq (%rsi, %rcx, 4), %rax
@@ -3251,10 +3242,10 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod):
 	orq %rax, %rdx
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = 0 */
 	movq (%rsi, %rcx, 4), %rax
@@ -3286,10 +3277,10 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod):
 	orq %rax, %rdx
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = 0 */
 	movq (%rsi, %rcx, 4), %rax
@@ -3321,10 +3312,10 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod):
 	orq %rax, %rdx
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod, with a = 0 */
 	movl (%rsi, %rcx, 4), %eax
@@ -3343,10 +3334,10 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod):
 	andq %r13, %rax
 	orq %rax, %rdx
 	movl %edx, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -3356,20 +3347,20 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod):
 	ENTER
 
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -3377,7 +3368,7 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -3418,10 +3409,10 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod):
 	movb 0x200(%r14, %rbx), %dl
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -3458,10 +3449,10 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod):
 	movb 0x200(%r14, %rbx), %dl
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -3498,10 +3489,10 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod):
 	movb 0x200(%r14, %rbx), %dl
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -3538,10 +3529,10 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod):
 	movb 0x200(%r14, %rbx), %dl
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -3578,10 +3569,10 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod):
 	movb 0x200(%r14, %rbx), %dl
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -3618,10 +3609,10 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod):
 	movb 0x200(%r14, %rbx), %dl
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -3658,10 +3649,10 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod):
 	movb 0x200(%r14, %rbx), %dl
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -3698,10 +3689,10 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod):
 	movb 0x200(%r14, %rbx), %dl
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod */
 	movl (%rsi, %rcx, 4), %eax
@@ -3721,10 +3712,10 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod):
 	movb %al, %bl
 	movb 0x200(%r14, %rbx), %dl
 	movl %edx, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -3734,20 +3725,20 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod):
 	ENTER
 
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -3755,7 +3746,7 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -3794,10 +3785,10 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod):
 	movb 0x200(%r14, %rbx), %dl
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -3832,10 +3823,10 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod):
 	movb 0x200(%r14, %rbx), %dl
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -3870,10 +3861,10 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod):
 	movb 0x200(%r14, %rbx), %dl
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -3908,10 +3899,10 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod):
 	movb 0x200(%r14, %rbx), %dl
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -3946,10 +3937,10 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod):
 	movb 0x200(%r14, %rbx), %dl
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -3984,10 +3975,10 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod):
 	movb 0x200(%r14, %rbx), %dl
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -4022,10 +4013,10 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod):
 	movb 0x200(%r14, %rbx), %dl
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -4060,10 +4051,10 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod):
 	movb 0x200(%r14, %rbx), %dl
 	movq %rdx, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
 	movl (%rsi, %rcx, 4), %eax
@@ -4082,10 +4073,10 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod):
 	movb %al, %bl
 	movb 0x200(%r14, %rbx), %dl
 	movl %edx, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -4097,20 +4088,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
 	pxor %xmm4, %xmm4
 	movdqa m00XXXXXX(%rip), %xmm6
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -4118,7 +4109,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -4163,13 +4154,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -4190,10 +4181,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -4234,13 +4225,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -4261,10 +4252,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -4305,13 +4296,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -4332,10 +4323,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -4376,13 +4367,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -4403,10 +4394,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -4447,13 +4438,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -4474,10 +4465,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -4518,13 +4509,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -4545,10 +4536,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -4589,13 +4580,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -4616,10 +4607,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -4660,13 +4651,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -4687,10 +4678,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod */
 	movl (%rsi, %rcx, 4), %eax
@@ -4715,13 +4706,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -4741,10 +4732,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
  	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -4760,20 +4751,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
 	movdqa mX000X000X000X000(%rip), %xmm6
 	movq pow_lut@GOTPCREL(%rip), %r13
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -4781,7 +4772,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -4846,7 +4837,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -4863,15 +4854,15 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -4932,7 +4923,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -4949,15 +4940,15 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -5018,7 +5009,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -5035,15 +5026,15 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -5104,7 +5095,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -5121,15 +5112,15 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -5190,7 +5181,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -5207,15 +5198,15 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -5276,7 +5267,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -5293,15 +5284,15 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -5362,7 +5353,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -5379,15 +5370,15 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -5448,7 +5439,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -5465,15 +5456,15 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod */
 	movl (%rsi, %rcx, 4), %eax
@@ -5506,7 +5497,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -5523,14 +5514,14 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -5546,20 +5537,20 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
 	movdqa mX000X000X000X000(%rip), %xmm6
 	movq pow_lut@GOTPCREL(%rip), %r13
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -5567,7 +5558,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -5630,7 +5621,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -5647,15 +5638,15 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -5714,7 +5705,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -5731,15 +5722,15 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -5798,7 +5789,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -5815,15 +5806,15 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -5882,7 +5873,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -5899,15 +5890,15 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -5966,7 +5957,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -5983,15 +5974,15 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -6050,7 +6041,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -6067,15 +6058,15 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -6134,7 +6125,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -6151,15 +6142,15 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -6218,7 +6209,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -6235,15 +6226,15 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
 	movl (%rsi, %rcx, 4), %eax
@@ -6275,7 +6266,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -6292,14 +6283,14 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
 	psllw $1, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -6312,20 +6303,20 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):
 	pxor %xmm4, %xmm4
 	movdqa m00XXXXXX(%rip), %xmm6
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -6333,7 +6324,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -6376,13 +6367,13 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -6403,10 +6394,10 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -6445,13 +6436,13 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -6472,10 +6463,10 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -6514,13 +6505,13 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -6541,10 +6532,10 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -6583,13 +6574,13 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -6610,10 +6601,10 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -6652,13 +6643,13 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -6679,10 +6670,10 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -6721,13 +6712,13 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -6748,10 +6739,10 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -6790,13 +6781,13 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -6817,10 +6808,10 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -6859,13 +6850,13 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -6886,10 +6877,10 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
 	movl (%rsi, %rcx, 4), %eax
@@ -6913,13 +6904,13 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -6939,10 +6930,10 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod):
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
  	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -6954,20 +6945,20 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod):
 
 	movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -6975,7 +6966,7 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -7021,10 +7012,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -7066,10 +7057,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -7111,10 +7102,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -7156,10 +7147,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -7201,10 +7192,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -7246,10 +7237,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -7291,10 +7282,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -7336,10 +7327,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod */
 	movl (%rsi, %rcx, 4), %eax
@@ -7364,10 +7355,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod):
 	pand %xmm5, %xmm1
 	paddusb %xmm1, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -7379,20 +7370,20 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod):
 
 	movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -7400,7 +7391,7 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -7446,10 +7437,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -7491,10 +7482,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -7536,10 +7527,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -7581,10 +7572,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -7626,10 +7617,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -7671,10 +7662,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -7716,10 +7707,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -7761,10 +7752,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod */
 	movl (%rsi, %rcx, 4), %eax
@@ -7789,10 +7780,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod):
 	pand %xmm5, %xmm2
 	paddusb %xmm1, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -7804,20 +7795,20 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod):
 
 	movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -7825,7 +7816,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -7868,10 +7859,10 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -7910,10 +7901,10 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -7952,10 +7943,10 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -7994,10 +7985,10 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -8036,10 +8027,10 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -8078,10 +8069,10 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -8120,10 +8111,10 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -8162,10 +8153,10 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod):
 	paddusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
 	movl (%rsi, %rcx, 4), %eax
@@ -8188,10 +8179,10 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod):
 	pand %xmm5, %xmm2
 	paddusb %xmm1, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -8204,20 +8195,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
 	pxor %xmm4, %xmm4
 	movdqa m00XXXXXX(%rip), %xmm6
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -8225,7 +8216,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -8270,13 +8261,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -8297,10 +8288,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -8341,13 +8332,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -8368,10 +8359,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -8412,13 +8403,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -8439,10 +8430,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -8483,13 +8474,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -8510,10 +8501,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -8554,13 +8545,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -8581,10 +8572,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -8625,13 +8616,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -8652,10 +8643,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -8696,13 +8687,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -8723,10 +8714,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -8767,13 +8758,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -8794,10 +8785,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod */
 	movl (%rsi, %rcx, 4), %eax
@@ -8822,13 +8813,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -8848,10 +8839,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod):
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -8867,20 +8858,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
 	movdqa mX000X000(%rip), %xmm7
 	xorq %rax, %rax
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -8888,7 +8879,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -8953,7 +8944,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -8971,15 +8962,15 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -9040,7 +9031,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -9058,15 +9049,15 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -9127,7 +9118,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -9145,15 +9136,15 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -9214,7 +9205,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -9232,15 +9223,15 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -9301,7 +9292,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -9319,15 +9310,15 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -9388,7 +9379,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -9406,15 +9397,15 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -9475,7 +9466,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -9493,15 +9484,15 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -9562,7 +9553,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -9580,15 +9571,15 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod */
 	movl (%rsi, %rcx, 4), %eax
@@ -9621,7 +9612,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -9639,14 +9630,14 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -9662,20 +9653,20 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
 	movdqa mX000X000(%rip), %xmm7
 	xorq %rax, %rax
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -9683,7 +9674,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -9746,7 +9737,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -9764,15 +9755,15 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -9831,7 +9822,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -9849,15 +9840,15 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -9916,7 +9907,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -9934,15 +9925,15 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -10001,7 +9992,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -10019,15 +10010,15 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -10086,7 +10077,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -10104,15 +10095,15 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -10171,7 +10162,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -10189,15 +10180,15 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -10256,7 +10247,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -10274,15 +10265,15 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -10341,7 +10332,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -10359,15 +10350,15 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
 	movl (%rsi, %rcx, 4), %eax
@@ -10399,7 +10390,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
 	/* unpack alpha to src alpha, combined alpha x 3 */
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* src alpha = 255 - dst alpha */
@@ -10417,14 +10408,14 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod):
 	pmulhw %xmm3, %xmm1
 	pxor %xmm7, %xmm1
 	psubsw %xmm1, %xmm2
-	
+
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -10436,20 +10427,20 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
 	pxor %xmm4, %xmm4
 	movdqa m00XXXXXX(%rip), %xmm6
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -10457,7 +10448,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -10500,13 +10491,13 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -10527,10 +10518,10 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -10569,13 +10560,13 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -10596,10 +10587,10 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -10638,13 +10629,13 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -10665,10 +10656,10 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -10707,13 +10698,13 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -10734,10 +10725,10 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -10776,13 +10767,13 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -10803,10 +10794,10 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -10845,13 +10836,13 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -10872,10 +10863,10 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -10914,13 +10905,13 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -10941,10 +10932,10 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -10983,13 +10974,13 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -11010,10 +11001,10 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
 	movl (%rsi, %rcx, 4), %eax
@@ -11037,13 +11028,13 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
 	 * Result ranges is [0, 0x7fff], and is mapped to
 	 *  point values in [0.0, 1.0) by using the high word
 	 *  of the 32 bit multiplication result.
-	 * Because we want the unsigned value, we shift right one 
+	 * Because we want the unsigned value, we shift right one
 	 *  here and also shift left the other factors to compensate.
 	 */
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero out the alpha channel of the source to leave the
@@ -11063,10 +11054,10 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod):
 	/* pack new pixels */
 	packuswb %xmm4, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -11077,20 +11068,20 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod):
 
 	movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -11098,7 +11089,7 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -11144,10 +11135,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod):
 	psubusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -11189,10 +11180,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod):
 	psubusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -11234,10 +11225,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod):
 	psubusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -11279,10 +11270,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod):
 	psubusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -11324,10 +11315,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod):
 	psubusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -11369,10 +11360,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod):
 	psubusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -11414,10 +11405,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod):
 	psubusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -11459,10 +11450,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod):
 	psubusb %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod */
 	movl (%rsi, %rcx, 4), %eax
@@ -11487,10 +11478,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod):
 	pand %xmm5, %xmm1
 	psubusb %xmm1, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -11502,20 +11493,20 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod):
 	movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5
 	movdqa mX000X000X000X000(%rip), %xmm6
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -11523,7 +11514,7 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -11571,10 +11562,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod):
 	por %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -11618,10 +11609,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod):
 	por %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -11665,10 +11656,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod):
 	por %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -11712,10 +11703,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod):
 	por %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -11759,10 +11750,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod):
 	por %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -11806,10 +11797,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod):
 	por %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -11853,10 +11844,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod):
 	por %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -11900,10 +11891,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod):
 	por %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod */
 	movl (%rsi, %rcx, 4), %eax
@@ -11930,10 +11921,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod):
 	pand %xmm5, %xmm2
 	por %xmm1, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -11945,20 +11936,20 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod):
 	movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5
 	movdqa mX000X000X000X000(%rip), %xmm6
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -11966,7 +11957,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -12014,10 +12005,10 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod):
 	por %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -12061,10 +12052,10 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod):
 	por %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -12108,10 +12099,10 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod):
 	por %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -12155,10 +12146,10 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod):
 	por %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -12202,10 +12193,10 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod):
 	por %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -12249,10 +12240,10 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod):
 	por %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -12296,10 +12287,10 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod):
 	por %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -12343,10 +12334,10 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod):
 	por %xmm1, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
 	movl (%rsi, %rcx, 4), %eax
@@ -12374,10 +12365,10 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod):
 	pand %xmm6, %xmm1
 	por %xmm1, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -12390,20 +12381,20 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
 	movdqa m000V0V0V000V0V0V(%rip), %xmm6
 	movdqa m00XXXXXX(%rip), %xmm7
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -12411,7 +12402,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -12456,7 +12447,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -12476,10 +12467,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -12520,7 +12511,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -12540,10 +12531,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -12584,7 +12575,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -12604,10 +12595,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -12648,7 +12639,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -12668,10 +12659,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -12712,7 +12703,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -12732,10 +12723,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -12776,7 +12767,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -12796,10 +12787,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -12840,7 +12831,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -12860,10 +12851,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -12904,7 +12895,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -12924,10 +12915,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod */
 	movl (%rsi, %rcx, 4), %eax
@@ -12952,7 +12943,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -12971,10 +12962,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod):
 	/* Repack new pixels */
 	packuswb %xmm4, %xmm2
  	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -12992,20 +12983,20 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
 	movdqa m000V0V0V000V0V0V(%rip), %xmm8
 	xorq %rax, %rax
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -13013,7 +13004,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -13080,7 +13071,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
 	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -13095,14 +13086,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -13165,7 +13156,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
 	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -13180,14 +13171,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -13250,7 +13241,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
 	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -13265,14 +13256,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -13335,7 +13326,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
 	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -13350,14 +13341,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -13420,7 +13411,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
 	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -13435,14 +13426,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -13505,7 +13496,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
 	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -13520,14 +13511,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -13590,7 +13581,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
 	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -13605,14 +13596,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -13675,7 +13666,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
 	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -13690,14 +13681,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod */
 	movl (%rsi, %rcx, 4), %eax
@@ -13731,7 +13722,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
 	movd %eax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -13746,13 +13737,13 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -13766,20 +13757,20 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod):
 	movdqa m000V0V0V000V0V0V(%rip), %xmm6
 	movdqa m00XXXXXX(%rip), %xmm7
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -13787,7 +13778,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -13830,7 +13821,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -13850,10 +13841,10 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -13892,7 +13883,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -13912,10 +13903,10 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -13954,7 +13945,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -13974,10 +13965,10 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -14016,7 +14007,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -14036,10 +14027,10 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -14078,7 +14069,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -14098,10 +14089,10 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -14140,7 +14131,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -14160,10 +14151,10 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -14202,7 +14193,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -14222,10 +14213,10 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -14264,7 +14255,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -14284,10 +14275,10 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod):
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
 	movl (%rsi, %rcx, 4), %eax
@@ -14311,7 +14302,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod):
 	movq %xmm1, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0xFF, %xmm3, %xmm3
-	pshuflw $0xFF, %xmm3, %xmm3 
+	pshuflw $0xFF, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	/* Zero blending alpha */
@@ -14330,10 +14321,10 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod):
 	/* Repack new pixels */
 	packuswb %xmm4, %xmm2
  	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -14351,20 +14342,20 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
 	movdqa m000V0V0V000V0V0V(%rip), %xmm8
 	xorq %rax, %rax
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -14372,7 +14363,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -14437,7 +14428,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
 	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -14452,14 +14443,14 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -14520,7 +14511,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
 	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -14535,14 +14526,14 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -14603,7 +14594,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
 	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -14618,14 +14609,14 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -14686,7 +14677,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
 	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -14701,14 +14692,14 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -14769,7 +14760,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
 	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -14784,14 +14775,14 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -14852,7 +14843,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
 	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -14867,14 +14858,14 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -14935,7 +14926,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
 	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -14950,14 +14941,14 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -15018,7 +15009,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
 	movd %rax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -15033,14 +15024,14 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
 	movl (%rsi, %rcx, 4), %eax
@@ -15073,7 +15064,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
 	movd %eax, %xmm3
 	punpcklbw %xmm3, %xmm3
 	pshufhw $0x40, %xmm3, %xmm3
-	pshuflw $0x40, %xmm3, %xmm3	
+	pshuflw $0x40, %xmm3, %xmm3
 	psrlw $1, %xmm3
 
 	movdqa %xmm2, %xmm0
@@ -15088,13 +15079,13 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod):
 	psllw $2, %xmm1
 	pmulhw %xmm3, %xmm1
 	paddsw %xmm1, %xmm2
-	
+
 	packuswb %xmm4, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -15107,20 +15098,20 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod):
 	movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5
 	movdqa m0VVV0VVV0VVV0VVV(%rip), %xmm6
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -15128,7 +15119,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -15192,10 +15183,10 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod):
 	psubusb %xmm3, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -15255,10 +15246,10 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod):
 	psubusb %xmm3, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -15318,10 +15309,10 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod):
 	psubusb %xmm3, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -15381,10 +15372,10 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod):
 	psubusb %xmm3, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -15444,10 +15435,10 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod):
 	psubusb %xmm3, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -15507,10 +15498,10 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod):
 	psubusb %xmm3, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -15570,10 +15561,10 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod):
 	psubusb %xmm3, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -15633,10 +15624,10 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod):
 	psubusb %xmm3, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod */
 	movl (%rsi, %rcx, 4), %eax
@@ -15679,10 +15670,10 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod):
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -15696,20 +15687,20 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod):
 	movdqu m0VVV0VVV0VVV0VVV(%rip), %xmm6
 	movdqu mX000X000X000X000(%rip), %xmm7
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -15717,7 +15708,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -15770,17 +15761,17 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
 	por %xmm0, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -15829,17 +15820,17 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
 	por %xmm0, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -15888,17 +15879,17 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
 	por %xmm0, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -15947,17 +15938,17 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
 	por %xmm0, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -16006,17 +15997,17 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
 	por %xmm0, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -16065,17 +16056,17 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
 	por %xmm0, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -16124,17 +16115,17 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
 	por %xmm0, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod */
 	movq (%rsi, %rcx, 4), %rax
@@ -16183,17 +16174,17 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
 	por %xmm0, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod */
 	movl (%rsi, %rcx, 4), %eax
@@ -16226,16 +16217,16 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
 	por %xmm0, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
@@ -16249,20 +16240,20 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod):
 	movdqu m0VVV0VVV0VVV0VVV(%rip), %xmm6
 	movdqu mX000X000X000X000(%rip), %xmm7
 
-	/* Move right to left across each line, */ 
-	/* processing in two pixel chunks */ 
-	leaq (%rsi, %r8, 4), %rsi	
-	leaq (%rdi, %r8, 4), %rdi	
-					
-	/* Last instruction is %rcx = 0 */ 
-	subq $4, %rsi			
-	subq $4, %rdi			
-					
-	negq %r8			
-0:					
-	movq %r8, %rcx			
-					
-	incq %rcx			
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
 
 	/* prefetch a couple cache lines ahead */
 	prefetchnta (%rsi, %rcx, 4)
@@ -16270,7 +16261,7 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod):
 	prefetchnta 64(%rsi, %rcx, 4)
 	prefetcht0 64(%rdi, %rcx, 4)
 
-	jz 2f /* one pixel line */	
+	jz 2f /* one pixel line */
 1:
 	/* main loop, unrolled to work on 64 byte chunks */
 	prefetchnta 128(%rsi, %rcx, 4)
@@ -16321,17 +16312,17 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
 	por %xmm0, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -16378,17 +16369,17 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
 	por %xmm0, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -16435,17 +16426,17 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
 	por %xmm0, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -16492,17 +16483,17 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
 	por %xmm0, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -16549,17 +16540,17 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
 	por %xmm0, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -16606,17 +16597,17 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
 	por %xmm0, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -16663,17 +16654,17 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
 	por %xmm0, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
+	incq %rcx
+	incq %rcx
 	jz 2f
-	jns 3f				
+	jns 3f
 
 	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
 	movq (%rsi, %rcx, 4), %rax
@@ -16720,17 +16711,17 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
 	por %xmm0, %xmm2
 	movq %xmm2, (%rdi, %rcx, 4)
 
-	incq %rcx			
-	incq %rcx			
-	js 1b				
-	jnz 3f				
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
 2:
 	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
 	movl (%rsi, %rcx, 4), %eax
@@ -16762,24 +16753,22 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod):
 	/* d = d + s1 - s2, unsigned saturation */
 	paddusb %xmm1, %xmm2
 	psubusb %xmm3, %xmm2
-	
+
 	/* d alpha = s alpha */
 	pand %xmm5, %xmm2
 	pand %xmm7, %xmm0
 	por %xmm0, %xmm2
 	movd %xmm2, (%rdi, %rcx, 4)
-3:					
-	leaq (%rsi, %r10, 4), %rsi	
-	leaq (%rdi, %r11, 4), %rdi	
-	decq %r9			
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
 	jnz 0b
 
 9:
 	LEAVE
 SIZE(imlib_amd64_reshade_copy_rgb_to_rgba_cmod)
 
-#endif
-
 #ifdef __ELF__
 .section .note.GNU-stack,"",@progbits
 #endif
diff --git a/src/lib/asm_blend.S b/src/lib/asm_blend.S
index 5ce63f0..e598fde 100644
--- a/src/lib/asm_blend.S
+++ b/src/lib/asm_blend.S
@@ -1,9 +1,7 @@
 #include <config.h>
 #include "asm.h"
 
-#ifdef DO_MMX_ASM
-
-/*\ 
+/*\
 |*| MMX assembly blending routines for Imlib2
 |*| Written by Willem Monsuwe <willem@stack.nl>
 |*|
@@ -51,7 +49,7 @@ FN_(imlib_mmx_reshade_copy_rgb_to_rgba)
 
 #include "asm_loadimmq.S"
 
-	
+
 /*\ MMX register use:
 |*| %mm1 = Source value
 |*| %mm2 = Destination value
@@ -95,7 +93,6 @@ FN_(imlib_mmx_reshade_copy_rgb_to_rgba)
 	decl %edx			;\
 	jns 8b
 
-
 /*\ Unset MMX mode, reset registers, return \*/
 #define LEAVE			\
 9:				;\
@@ -281,7 +278,7 @@ PR_(imlib_mmx_copy_rgba_to_rgba):
 	/*\ Load source, save destination \*/
 	movq (%esi, %ecx, 4), %mm1
 	movq %mm1, (%edi, %ecx, 4)
-	
+
 	addl $2, %ecx
 	js 1b
 	jnz 3f
@@ -358,7 +355,7 @@ PR_(imlib_mmx_add_blend_rgba_to_rgb):
 	/*\ Pack into lower 4 bytes and save \*/
 	packuswb %mm4, %mm2
 	movd %mm2, (%edi, %ecx, 4)
-	
+
 	incl %ecx
 	js 1b
 
@@ -408,7 +405,7 @@ PR_(imlib_mmx_add_blend_rgba_to_rgba):
 	/*\ Pack into lower 4 bytes and save \*/
 	packuswb %mm4, %mm2
 	movd %mm2, (%edi, %ecx, 4)
-	
+
 	incl %ecx
 	js 1b
 
@@ -440,7 +437,7 @@ PR_(imlib_mmx_add_copy_rgba_to_rgb):
 	/*\ d = d + s, unsigned saturation, and save \*/
 	paddusb %mm1, %mm2
 	movq %mm2, (%edi, %ecx, 4)
-	
+
 	addl $2, %ecx
 	js 1b
 	jnz 3f
@@ -473,7 +470,7 @@ PR_(imlib_mmx_add_copy_rgba_to_rgba):
 	/*\ d = d + s, unsigned saturation, and save \*/
 	paddusb %mm1, %mm2
 	movq %mm2, (%edi, %ecx, 4)
-	
+
 	addl $2, %ecx
 	js 1b
 	jnz 3f
@@ -511,7 +508,7 @@ PR_(imlib_mmx_add_copy_rgb_to_rgba):
 	/*\ Make result alpha 0xff \*/
 	por %mm5, %mm2
 	movq %mm2, (%edi, %ecx, 4)
-	
+
 	addl $2, %ecx
 	js 1b
 	jnz 3f
@@ -559,7 +556,7 @@ PR_(imlib_mmx_subtract_blend_rgba_to_rgb):
 	/*\ Pack into lower 4 bytes and save \*/
 	packuswb %mm4, %mm2
 	movd %mm2, (%edi, %ecx, 4)
-	
+
 	incl %ecx
 	js 1b
 
@@ -609,7 +606,7 @@ PR_(imlib_mmx_subtract_blend_rgba_to_rgba):
 	/*\ Pack into lower 4 bytes and save \*/
 	packuswb %mm4, %mm2
 	movd %mm2, (%edi, %ecx, 4)
-	
+
 	incl %ecx
 	js 1b
 
@@ -641,7 +638,7 @@ PR_(imlib_mmx_subtract_copy_rgba_to_rgb):
 	/*\ d = d - s, unsigned saturation, and save \*/
 	psubusb %mm1, %mm2
 	movq %mm2, (%edi, %ecx, 4)
-	
+
 	addl $2, %ecx
 	js 1b
 	jnz 3f
@@ -683,7 +680,7 @@ PR_(imlib_mmx_subtract_copy_rgba_to_rgba):
 	/*\ Negate result alphas \*/
 	pxor %mm5, %mm2
 	movq %mm2, (%edi, %ecx, 4)
-	
+
 	addl $2, %ecx
 	js 1b
 	jnz 3f
@@ -723,7 +720,7 @@ PR_(imlib_mmx_subtract_copy_rgb_to_rgba):
 	/*\ Make result alpha 0xff \*/
 	por %mm5, %mm2
 	movq %mm2, (%edi, %ecx, 4)
-	
+
 	addl $2, %ecx
 	js 1b
 	jnz 3f
@@ -774,7 +771,7 @@ PR_(imlib_mmx_reshade_blend_rgba_to_rgb):
 	/*\ Pack into lower 4 bytes and save \*/
 	packuswb %mm4, %mm2
 	movd %mm2, (%edi, %ecx, 4)
-	
+
 	incl %ecx
 	js 1b
 
@@ -827,7 +824,7 @@ PR_(imlib_mmx_reshade_blend_rgba_to_rgba):
 	/*\ Pack into lower 4 bytes and save \*/
 	packuswb %mm4, %mm2
 	movd %mm2, (%edi, %ecx, 4)
-	
+
 	incl %ecx
 	js 1b
 
@@ -877,7 +874,7 @@ PR_(imlib_mmx_reshade_copy_rgba_to_rgb):
 	paddusb %mm1, %mm2
 	psubusb %mm3, %mm2
 	movq %mm2, (%edi, %ecx, 4)
-	
+
 	addl $2, %ecx
 	js 1b
 	jnz 3f
@@ -942,7 +939,7 @@ PR_(imlib_mmx_reshade_copy_rgba_to_rgba):
 	paddusb %mm1, %mm2
 	psubusb %mm3, %mm2
 	movq %mm2, (%edi, %ecx, 4)
-	
+
 	addl $2, %ecx
 	js 1b
 	jnz 3f
@@ -1001,7 +998,7 @@ PR_(imlib_mmx_reshade_copy_rgb_to_rgba):
 	/*\ Make result alpha 0xff \*/
 	por %mm7, %mm2
 	movq %mm2, (%edi, %ecx, 4)
-	
+
 	addl $2, %ecx
 	js 1b
 	jnz 3f
@@ -1024,8 +1021,6 @@ PR_(imlib_mmx_reshade_copy_rgb_to_rgba):
 
 SIZE(imlib_mmx_reshade_copy_rgb_to_rgba)
 
-#endif
-
 #ifdef __ELF__
 .section .note.GNU-stack,"",@progbits
 #endif
diff --git a/src/lib/asm_blend_cmod.S b/src/lib/asm_blend_cmod.S
index 23b2500..9fedb79 100644
--- a/src/lib/asm_blend_cmod.S
+++ b/src/lib/asm_blend_cmod.S
@@ -1,9 +1,7 @@
 #include <config.h>
 #include "asm.h"
 
-#ifdef DO_MMX_ASM
-
-/*\ 
+/*\
 |*| MMX assembly blending routines, with colour modding, for Imlib2
 |*| Written by Willem Monsuwe <willem@stack.nl>
 |*|
@@ -69,7 +67,7 @@ FN_(imlib_mmx_reshade_copy_rgba_to_rgba_cmod)
 FN_(imlib_mmx_reshade_copy_rgb_to_rgba_cmod)
 
 #include "asm_loadimmq.S"
-	
+
 /*\ MMX register use:
 |*| %mm1 = Source value
 |*| %mm2 = Destination value
@@ -113,7 +111,6 @@ FN_(imlib_mmx_reshade_copy_rgb_to_rgba_cmod)
 	decl %edx			;\
 	jns 8b
 
-
 /*\ Unset MMX mode, reset registers, return \*/
 #define LEAVE			\
 9:				;\
@@ -622,7 +619,7 @@ PR_(imlib_mmx_add_blend_rgba_to_rgb_cmod):
 	/*\ Pack into lower 4 bytes and save \*/
 	packuswb %mm4, %mm2
 	movd %mm2, (%edi, %ecx, 4)
-	
+
 	incl %ecx
 	js 1b
 
@@ -672,7 +669,7 @@ PR_(imlib_mmx_add_blend_rgba_to_rgba_cmod):
 	/*\ Pack into lower 4 bytes and save \*/
 	packuswb %mm4, %mm2
 	movd %mm2, (%edi, %ecx, 4)
-	
+
 	incl %ecx
 	js 1b
 
@@ -713,7 +710,7 @@ PR_(imlib_mmx_add_blend_rgb_to_rgb_cmod):
 	/*\ Pack into lower 4 bytes and save \*/
 	packuswb %mm4, %mm2
 	movd %mm2, (%edi, %ecx, 4)
-	
+
 	incl %ecx
 	js 1b
 
@@ -763,7 +760,7 @@ PR_(imlib_mmx_add_blend_rgb_to_rgba_cmod):
 	/*\ Pack into lower 4 bytes and save \*/
 	packuswb %mm4, %mm2
 	movd %mm2, (%edi, %ecx, 4)
-	
+
 	incl %ecx
 	js 1b
 
@@ -795,7 +792,7 @@ PR_(imlib_mmx_add_copy_rgba_to_rgb_cmod):
 	/*\ d = d + s, unsigned saturation, and save \*/
 	paddusb %mm1, %mm2
 	movq %mm2, (%edi, %ecx, 4)
-	
+
 	addl $2, %ecx
 	js 1b
 	jnz 3f
@@ -828,7 +825,7 @@ PR_(imlib_mmx_add_copy_rgba_to_rgba_cmod):
 	/*\ d = d + s, unsigned saturation, and save \*/
 	paddusb %mm1, %mm2
 	movq %mm2, (%edi, %ecx, 4)
-	
+
 	addl $2, %ecx
 	js 1b
 	jnz 3f
@@ -860,7 +857,7 @@ PR_(imlib_mmx_add_copy_rgb_to_rgba_cmod):
 	/*\ d = d + s, unsigned saturation, and save \*/
 	paddusb %mm1, %mm2
 	movq %mm2, (%edi, %ecx, 4)
-	
+
 	addl $2, %ecx
 	js 1b
 	jnz 3f
@@ -907,7 +904,7 @@ PR_(imlib_mmx_subtract_blend_rgba_to_rgb_cmod):
 	/*\ Pack into lower 4 bytes and save \*/
 	packuswb %mm4, %mm2
 	movd %mm2, (%edi, %ecx, 4)
-	
+
 	incl %ecx
 	js 1b
 
@@ -957,7 +954,7 @@ PR_(imlib_mmx_subtract_blend_rgba_to_rgba_cmod):
 	/*\ Pack into lower 4 bytes and save \*/
 	packuswb %mm4, %mm2
 	movd %mm2, (%edi, %ecx, 4)
-	
+
 	incl %ecx
 	js 1b
 
@@ -998,7 +995,7 @@ PR_(imlib_mmx_subtract_blend_rgb_to_rgb_cmod):
 	/*\ Pack into lower 4 bytes and save \*/
 	packuswb %mm4, %mm2
 	movd %mm2, (%edi, %ecx, 4)
-	
+
 	incl %ecx
 	js 1b
 
@@ -1048,7 +1045,7 @@ PR_(imlib_mmx_subtract_blend_rgb_to_rgba_cmod):
 	/*\ Pack into lower 4 bytes and save \*/
 	packuswb %mm4, %mm2
 	movd %mm2, (%edi, %ecx, 4)
-	
+
 	incl %ecx
 	js 1b
 
@@ -1080,7 +1077,7 @@ PR_(imlib_mmx_subtract_copy_rgba_to_rgb_cmod):
 	/*\ d = d - s, unsigned saturation, and save \*/
 	psubusb %mm1, %mm2
 	movq %mm2, (%edi, %ecx, 4)
-	
+
 	addl $2, %ecx
 	js 1b
 	jnz 3f
@@ -1122,7 +1119,7 @@ PR_(imlib_mmx_subtract_copy_rgba_to_rgba_cmod):
 	/*\ Negate result alphas \*/
 	pxor %mm5, %mm2
 	movq %mm2, (%edi, %ecx, 4)
-	
+
 	addl $2, %ecx
 	js 1b
 	jnz 3f
@@ -1161,7 +1158,7 @@ PR_(imlib_mmx_subtract_copy_rgb_to_rgba_cmod):
 	psubusb %mm1, %mm2
 	pxor %mm5, %mm2
 	movq %mm2, (%edi, %ecx, 4)
-	
+
 	addl $2, %ecx
 	js 1b
 	jnz 3f
@@ -1213,7 +1210,7 @@ PR_(imlib_mmx_reshade_blend_rgba_to_rgb_cmod):
 	/*\ Pack into lower 4 bytes and save \*/
 	packuswb %mm4, %mm2
 	movd %mm2, (%edi, %ecx, 4)
-	
+
 	incl %ecx
 	js 1b
 
@@ -1266,7 +1263,7 @@ PR_(imlib_mmx_reshade_blend_rgba_to_rgba_cmod):
 	/*\ Pack into lower 4 bytes and save \*/
 	packuswb %mm4, %mm2
 	movd %mm2, (%edi, %ecx, 4)
-	
+
 	incl %ecx
 	js 1b
 
@@ -1310,7 +1307,7 @@ PR_(imlib_mmx_reshade_blend_rgb_to_rgb_cmod):
 	/*\ Pack into lower 4 bytes and save \*/
 	packuswb %mm4, %mm2
 	movd %mm2, (%edi, %ecx, 4)
-	
+
 	incl %ecx
 	js 1b
 
@@ -1363,7 +1360,7 @@ PR_(imlib_mmx_reshade_blend_rgb_to_rgba_cmod):
 	/*\ Pack into lower 4 bytes and save \*/
 	packuswb %mm4, %mm2
 	movd %mm2, (%edi, %ecx, 4)
-	
+
 	incl %ecx
 	js 1b
 
@@ -1413,7 +1410,7 @@ PR_(imlib_mmx_reshade_copy_rgba_to_rgb_cmod):
 	paddusb %mm1, %mm2
 	psubusb %mm3, %mm2
 	movq %mm2, (%edi, %ecx, 4)
-	
+
 	addl $2, %ecx
 	js 1b
 	jnz 3f
@@ -1478,7 +1475,7 @@ PR_(imlib_mmx_reshade_copy_rgba_to_rgba_cmod):
 	paddusb %mm1, %mm2
 	psubusb %mm3, %mm2
 	movq %mm2, (%edi, %ecx, 4)
-	
+
 	addl $2, %ecx
 	js 1b
 	jnz 3f
@@ -1538,7 +1535,7 @@ PR_(imlib_mmx_reshade_copy_rgb_to_rgba_cmod):
 	paddusb %mm1, %mm2
 	psubusb %mm3, %mm2
 	movq %mm2, (%edi, %ecx, 4)
-	
+
 	addl $2, %ecx
 	js 1b
 	jnz 3f
@@ -1563,8 +1560,6 @@ PR_(imlib_mmx_reshade_copy_rgb_to_rgba_cmod):
 
 SIZE(imlib_mmx_reshade_copy_rgb_to_rgba_cmod)
 
-#endif
-
 #ifdef __ELF__
 .section .note.GNU-stack,"",@progbits
 #endif
diff --git a/src/lib/asm_rgba.S b/src/lib/asm_rgba.S
index 9903ae2..552756d 100644
--- a/src/lib/asm_rgba.S
+++ b/src/lib/asm_rgba.S
@@ -1,9 +1,7 @@
 #include <config.h>
 #include "asm.h"
 
-#ifdef DO_MMX_ASM
-
-/*\ 
+/*\
 |*| MMX assembly rgba rendering routines for Imlib2
 |*| Written by Willem Monsuwe <willem@stack.nl>
 |*|
@@ -81,7 +79,6 @@ FN_(imlib_get_cpuid)
 	ret
 
 
-
 PR_(imlib_mmx_bgr565_fast):
 	LOAD_IMMQ(mul_bgr565, %mm7)	/*\ This constant is the only difference \*/
 	CLEANUP_IMMQ_LOADS(1)
@@ -273,8 +270,6 @@ PR_(imlib_get_cpuid):
 
 SIZE(imlib_get_cpuid)
 
-#endif
-
 #ifdef __ELF__
 .section .note.GNU-stack,"",@progbits
 #endif
diff --git a/src/lib/asm_rotate.S b/src/lib/asm_rotate.S
index 2af05b3..6bda910 100644
--- a/src/lib/asm_rotate.S
+++ b/src/lib/asm_rotate.S
@@ -1,9 +1,7 @@
 #include <config.h>
 #include "asm.h"
 
-#ifdef DO_MMX_ASM
-
-/*\ 
+/*\
 |*| MMX assembly rotation routine for Imlib2
 |*| Written by Willem Monsuwe <willem@stack.nl>
 \*/
@@ -197,7 +195,7 @@ PR_(imlib_mmx_RotateAA):
 	paddw %mm3, %mm5
 	packuswb %mm5, %mm5
 	movd %mm5, (%edi, %ecx, 4)
-	
+
 	paddd dxh, %mm6
 
 	incl %ecx
@@ -220,7 +218,7 @@ PR_(imlib_mmx_RotateAA):
 	decl %eax
 	sall $12, %eax
 	movl %eax, sht
-	
+
 	movl sow, %ebx
 	movl src, %edx
 .outside_loop_y:
@@ -421,7 +419,7 @@ PR_(imlib_mmx_RotateAA):
 .outside_il_0:
 	movl $0, %eax
 	movl %eax, (%edi, %ecx, 4)
-	
+
 .outside_il_end:
 	paddd dxh, %mm6
 
@@ -447,8 +445,6 @@ PR_(imlib_mmx_RotateAA):
 
 SIZE(imlib_mmx_RotateAA)
 
-#endif
-
 #ifdef __ELF__
 .section .note.GNU-stack,"",@progbits
 #endif
diff --git a/src/lib/asm_scale.S b/src/lib/asm_scale.S
index b48737e..c82a05f 100644
--- a/src/lib/asm_scale.S
+++ b/src/lib/asm_scale.S
@@ -1,9 +1,7 @@
 #include <config.h>
 #include "asm.h"
 
-#ifdef DO_MMX_ASM
-
-/*\ 
+/*\
 |*| MMX assembly scaling routine for Imlib2
 |*| Written by Willem Monsuwe <willem@stack.nl>
 \*/
@@ -293,7 +291,7 @@ PR_(imlib_Scale_mmx_AARGBA):
 	punpcklbw %mm7, %mm0
 	psllw $6, %mm0
 	pmulhw %mm5, %mm0
-	
+
 	/*\ i = 0x4000 - My \*/
 	movl $0x4000, %ebx
 	subl My, %ebx
@@ -307,18 +305,18 @@ PR_(imlib_Scale_mmx_AARGBA):
 	psllw $6, %mm1
 	pmulhw %mm4, %mm1
 	paddw %mm1, %mm0
-	
+
 	/*\ i -= Cy; while (i > Cy) \*/
 	subl Cy, %ebx
 2:
 	cmpl Cy, %ebx
 	jg 1b
-	
+
 	/*\ mm6 = i \*/
 	movd %ebx, %mm6
 	punpcklwd %mm6, %mm6
 	punpckldq %mm6, %mm6
-	
+
 	/*\ p += sow; v += (*p * i) >> 10 \*/
 	addl sow_4, %eax
 	movd (%eax), %mm1
@@ -336,7 +334,7 @@ PR_(imlib_Scale_mmx_AARGBA):
 	movd %eax, %mm3
 	punpcklwd %mm3, %mm3
 	punpckldq %mm3, %mm3
-	
+
 	/*\ p + 1 \*/
 	movl %esi, %eax
 	addl $4, %eax
@@ -345,7 +343,7 @@ PR_(imlib_Scale_mmx_AARGBA):
 	punpcklbw %mm7, %mm2
 	psllw $6, %mm2
 	pmulhw %mm5, %mm2
-	
+
 	/*\ i = 0x4000 - My \*/
 	movl $0x4000, %ebx
 	subl My, %ebx
@@ -359,13 +357,13 @@ PR_(imlib_Scale_mmx_AARGBA):
 	psllw $6, %mm1
 	pmulhw %mm4, %mm1
 	paddw %mm1, %mm2
-	
+
 	/*\ i -= Cy; while (i > Cy) \*/
 	subl Cy, %ebx
 2:
 	cmpl Cy, %ebx
 	jg 1b
-	
+
 	/*\ p += sow; v += (*p * i) >> 10 \*/
 	addl sow_4, %eax
 	movd (%eax), %mm1
@@ -425,7 +423,7 @@ PR_(imlib_Scale_mmx_AARGBA):
 	movd %eax, %mm3
 	punpcklwd %mm3, %mm3
 	punpckldq %mm3, %mm3
-	
+
 	/*\ x = -dw \*/
 	movl dw, %ecx
 	negl %ecx
@@ -459,7 +457,7 @@ PR_(imlib_Scale_mmx_AARGBA):
 	punpcklbw %mm7, %mm0
 	psllw $6, %mm0
 	pmulhw %mm5, %mm0
-	
+
 	/*\ i = 0x4000 - Mx \*/
 	movl $0x4000, %ebx
 	subl Mx, %ebx
@@ -473,18 +471,18 @@ PR_(imlib_Scale_mmx_AARGBA):
 	psllw $6, %mm1
 	pmulhw %mm4, %mm1
 	paddw %mm1, %mm0
-	
+
 	/*\ i -= Cx; while (i > Cx) \*/
 	subl Cx, %ebx
 2:
 	cmpl Cx, %ebx
 	jg 1b
-	
+
 	/*\ mm6 = i \*/
 	movd %ebx, %mm6
 	punpcklwd %mm6, %mm6
 	punpckldq %mm6, %mm6
-	
+
 	/*\ p += sow; v += (*p * i) >> 10 \*/
 	addl $4, %eax
 	movd (%eax), %mm1
@@ -504,7 +502,7 @@ PR_(imlib_Scale_mmx_AARGBA):
 	punpcklbw %mm7, %mm2
 	psllw $6, %mm2
 	pmulhw %mm5, %mm2
-	
+
 	/*\ i = 0x4000 - Mx \*/
 	movl $0x4000, %ebx
 	subl Mx, %ebx
@@ -518,13 +516,13 @@ PR_(imlib_Scale_mmx_AARGBA):
 	psllw $6, %mm1
 	pmulhw %mm4, %mm1
 	paddw %mm1, %mm2
-	
+
 	/*\ i -= Cx; while (i > Cx) \*/
 	subl Cx, %ebx
 2:
 	cmpl Cx, %ebx
 	jg 1b
-	
+
 	/*\ p += sow; v += (*p * i) >> 10 \*/
 	addl $4, %eax
 	movd (%eax), %mm1
@@ -604,14 +602,14 @@ PR_(imlib_Scale_mmx_AARGBA):
 	movd %ebx, %mm5
 	punpcklwd %mm5, %mm5
 	punpckldq %mm5, %mm5
-	
+
 	/*\ p = sptr; v = (*p * Mx) >> 9 \*/
 	movl %esi, %eax
 	movd (%eax), %mm0
 	punpcklbw %mm7, %mm0
 	psllw $7, %mm0
 	pmulhw %mm5, %mm0
-	
+
 	/*\ i = 0x4000 - Mx \*/
 	movl $0x4000, %ebx
 	subl Mx, %ebx
@@ -625,18 +623,18 @@ PR_(imlib_Scale_mmx_AARGBA):
 	psllw $7, %mm1
 	pmulhw %mm3, %mm1
 	paddw %mm1, %mm0
-	
+
 	/*\ i -= Cx; while (i > Cx) \*/
 	subl Cx, %ebx
 2:
 	cmpl Cx, %ebx
 	jg 1b
-	
+
 	/*\ mm6 = i \*/
 	movd %ebx, %mm6
 	punpcklwd %mm6, %mm6
 	punpckldq %mm6, %mm6
-	
+
 	/*\ v += (*++p * i) >> 9 \*/
 	addl $4, %eax
 	movd (%eax), %mm1
@@ -651,7 +649,7 @@ PR_(imlib_Scale_mmx_AARGBA):
 	punpckldq %mm4, %mm4
 	psllw $2, %mm0
 	pmulhw %mm4, %mm0
-	
+
 	/*\ j = 0x4000 - My \*/
 	movl $0x4000, %edx
 	subl My, %edx
@@ -666,7 +664,7 @@ PR_(imlib_Scale_mmx_AARGBA):
 	punpcklbw %mm7, %mm1
 	psllw $7, %mm1
 	pmulhw %mm5, %mm1
-	
+
 	/*\ i = 0x4000 - Mx \*/
 	movl $0x4000, %ebx
 	subl Mx, %ebx
@@ -680,13 +678,13 @@ PR_(imlib_Scale_mmx_AARGBA):
 	psllw $7, %mm2
 	pmulhw %mm3, %mm2
 	paddw %mm2, %mm1
-	
+
 	/*\ i -= Cx; while (i > Cx) \*/
 	subl Cx, %ebx
 2:
 	cmpl Cx, %ebx
 	jg 1b
-	
+
 	/*\ vx += (*++p * i) >> 9 \*/
 	addl $4, %eax
 	movd (%eax), %mm2
@@ -702,13 +700,13 @@ PR_(imlib_Scale_mmx_AARGBA):
 	psllw $2, %mm1
 	pmulhw %mm4, %mm1
 	paddw %mm1, %mm0
-	
+
 	/*\ j -= Cy; while (j > Cy) \*/
 	subl Cy, %edx
 4:
 	cmpl Cy, %edx
 	jg 3b
-	
+
 	/*\ sptr += sow; p = sptr \*/
 	addl sow_4, %esi
 	movl %esi, %eax
@@ -717,7 +715,7 @@ PR_(imlib_Scale_mmx_AARGBA):
 	punpcklbw %mm7, %mm1
 	psllw $7, %mm1
 	pmulhw %mm5, %mm1
-	
+
 	/*\ i = 0x4000 - Mx \*/
 	movl $0x4000, %ebx
 	subl Mx, %ebx
@@ -731,13 +729,13 @@ PR_(imlib_Scale_mmx_AARGBA):
 	psllw $7, %mm2
 	pmulhw %mm3, %mm2
 	paddw %mm2, %mm1
-	
+
 	/*\ i -= Cx; while (i > Cx) \*/
 	subl Cx, %ebx
 2:
 	cmpl Cx, %ebx
 	jg 1b
-	
+
 	/*\ vx += (*++p * i) >> 9 \*/
 	addl $4, %eax
 	movd (%eax), %mm2
@@ -788,8 +786,6 @@ PR_(imlib_Scale_mmx_AARGBA):
 
 SIZE(imlib_Scale_mmx_AARGBA)
 
-#endif
-
 #ifdef __ELF__
 .section .note.GNU-stack,"",@progbits
 #endif