diff --git a/src/lib/amd64_blend.S b/src/lib/amd64_blend.S index 5eabfc4..5721913 100644 --- a/src/lib/amd64_blend.S +++ b/src/lib/amd64_blend.S @@ -1,9 +1,7 @@ #include #include "asm.h" -#ifdef DO_AMD64_ASM - -/*\ +/*\ |*| AMD64 SSE2 assembly blending routines for Imlib2 |*| Written by John Slaten |*| Based on MMX routines written by Willem Monsuwe @@ -79,7 +77,7 @@ FN_(imlib_amd64_reshade_copy_rgba_to_rgba) FN_(imlib_amd64_reshade_copy_rgb_to_rgba) .extern pow_lut - + /*\ SSE register use: |*| %xmm1 = Source value |*| %xmm2 = Destination value @@ -96,9 +94,6 @@ FN_(imlib_amd64_reshade_copy_rgb_to_rgba) |*| %r10d = sw |*| %r11d = dw \*/ - - - #define ENTER \ @@ -118,7 +113,7 @@ FN_(imlib_amd64_reshade_copy_rgb_to_rgba) jz 9f ; \ testq %r9, %r9 ; \ jz 9f - + #define LEAVE \ popq %r14 ; \ popq %r13 ; \ @@ -135,20 +130,20 @@ PR_(imlib_amd64_blend_rgba_to_rgb): movdqu c1(%rip), %xmm5 movdqu m00XXXXXX(%rip), %xmm6 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -156,7 +151,7 @@ PR_(imlib_amd64_blend_rgba_to_rgb): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -168,13 +163,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -197,10 +192,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -208,13 +203,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -237,10 +232,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -248,13 +243,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -277,10 +272,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -288,13 +283,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -317,10 +312,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -328,13 +323,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -357,10 +352,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -368,13 +363,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -397,10 +392,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -408,13 +403,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -437,10 +432,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -448,13 +443,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -477,10 +472,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: movd (%rsi, %rcx, 4), %xmm1 movd (%rdi, %rcx, 4), %xmm2 @@ -488,13 +483,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -516,10 +511,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb): /* Repack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -534,20 +529,20 @@ PR_(imlib_amd64_blend_rgba_to_rgba): movdqu mX000X000X000X000(%rip), %xmm6 movq pow_lut@GOTPCREL(%rip), %r13 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -555,7 +550,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -563,8 +558,8 @@ PR_(imlib_amd64_blend_rgba_to_rgba): movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -575,14 +570,14 @@ PR_(imlib_amd64_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -600,15 +595,15 @@ PR_(imlib_amd64_blend_rgba_to_rgba): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -619,14 +614,14 @@ PR_(imlib_amd64_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -644,15 +639,15 @@ PR_(imlib_amd64_blend_rgba_to_rgba): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -663,14 +658,14 @@ PR_(imlib_amd64_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -688,15 +683,15 @@ PR_(imlib_amd64_blend_rgba_to_rgba): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -707,14 +702,14 @@ PR_(imlib_amd64_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -732,15 +727,15 @@ PR_(imlib_amd64_blend_rgba_to_rgba): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -751,14 +746,14 @@ PR_(imlib_amd64_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -776,15 +771,15 @@ PR_(imlib_amd64_blend_rgba_to_rgba): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -795,14 +790,14 @@ PR_(imlib_amd64_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -820,15 +815,15 @@ PR_(imlib_amd64_blend_rgba_to_rgba): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -839,14 +834,14 @@ PR_(imlib_amd64_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -864,15 +859,15 @@ PR_(imlib_amd64_blend_rgba_to_rgba): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -883,14 +878,14 @@ PR_(imlib_amd64_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -908,28 +903,28 @@ PR_(imlib_amd64_blend_rgba_to_rgba): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: movd (%rsi, %rcx, 4), %xmm1 movd (%rdi, %rcx, 4), %xmm2 - /* Load one pixel as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load one pixel as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 3(%rdi, %rcx, 4), %rdx movb 3(%rsi, %rcx, 4), %dh movb (%r13, %rdx), %al movb %dh, %ah - movd %eax, %xmm3 + movd %eax, %xmm3 /* override source alpha to 255 */ por %xmm6, %xmm1 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -946,10 +941,10 @@ PR_(imlib_amd64_blend_rgba_to_rgba): /* repack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -1361,20 +1356,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb): pxor %xmm4, %xmm4 movdqu m00XXXXXX(%rip), %xmm6 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -1382,7 +1377,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -1394,13 +1389,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -1421,10 +1416,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -1432,13 +1427,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -1459,10 +1454,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -1470,13 +1465,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -1497,10 +1492,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -1508,13 +1503,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -1535,10 +1530,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -1546,13 +1541,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -1573,10 +1568,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -1584,13 +1579,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -1611,10 +1606,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -1622,13 +1617,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -1649,10 +1644,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -1660,13 +1655,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -1687,10 +1682,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: movd (%rsi, %rcx, 4), %xmm1 movd (%rdi, %rcx, 4), %xmm2 @@ -1698,13 +1693,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -1724,10 +1719,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb): /* pack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -1743,20 +1738,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba): movdqu mX000X000X000X000(%rip), %xmm6 movq pow_lut@GOTPCREL(%rip), %r13 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -1764,7 +1759,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -1772,8 +1767,8 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba): movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -1784,11 +1779,11 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -1805,20 +1800,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -1829,11 +1824,11 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -1850,20 +1845,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -1874,11 +1869,11 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -1895,20 +1890,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -1919,11 +1914,11 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -1940,20 +1935,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -1964,11 +1959,11 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -1985,20 +1980,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -2009,11 +2004,11 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -2030,20 +2025,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -2054,11 +2049,11 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -2075,20 +2070,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -2099,11 +2094,11 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -2120,30 +2115,30 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: movd (%rsi, %rcx, 4), %xmm1 movd (%rdi, %rcx, 4), %xmm2 - /* Load one pixel as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load one pixel as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 3(%rdi, %rcx, 4), %rdx movb 3(%rsi, %rcx, 4), %dh movb (%r13, %rdx), %al movb %dh, %ah - movd %eax, %xmm3 + movd %eax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -2160,14 +2155,14 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -2502,7 +2497,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba): movd (%rsi, %rcx, 4), %xmm1 movd (%rdi, %rcx, 4), %xmm2 - /* d = (d + s) | 0xff000000 */ + /* d = (d + s) | 0xff000000 */ paddusb %xmm1, %xmm2 por %xmm5, %xmm2 movd %xmm2, (%rdi, %rcx, 4) @@ -2528,7 +2523,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba): movdqa (%rsi, %rcx, 4), %xmm1 movdqa (%rdi, %rcx, 4), %xmm2 - /* d = (d + s) | 0xff000000 */ + /* d = (d + s) | 0xff000000 */ paddusb %xmm1, %xmm2 por %xmm5, %xmm2 movdqa %xmm2, (%rdi, %rcx, 4) @@ -2537,7 +2532,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba): movdqa (%rsi, %rcx, 4), %xmm1 movdqa (%rdi, %rcx, 4), %xmm2 - /* d = (d + s) | 0xff000000 */ + /* d = (d + s) | 0xff000000 */ paddusb %xmm1, %xmm2 por %xmm5, %xmm2 movdqa %xmm2, (%rdi, %rcx, 4) @@ -2546,7 +2541,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba): movdqa (%rsi, %rcx, 4), %xmm1 movdqa (%rdi, %rcx, 4), %xmm2 - /* d = (d + s) | 0xff000000 */ + /* d = (d + s) | 0xff000000 */ paddusb %xmm1, %xmm2 por %xmm5, %xmm2 movdqa %xmm2, (%rdi, %rcx, 4) @@ -2555,7 +2550,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba): movdqa (%rsi, %rcx, 4), %xmm1 movdqa (%rdi, %rcx, 4), %xmm2 - /* d = (d + s) | 0xff000000 */ + /* d = (d + s) | 0xff000000 */ paddusb %xmm1, %xmm2 por %xmm5, %xmm2 movdqa %xmm2, (%rdi, %rcx, 4) @@ -2570,7 +2565,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba): movdqu (%rsi, %rcx, 4), %xmm1 movdqa (%rdi, %rcx, 4), %xmm2 - /* d = (d + s) | 0xff000000 */ + /* d = (d + s) | 0xff000000 */ paddusb %xmm1, %xmm2 por %xmm5, %xmm2 movdqa %xmm2, (%rdi, %rcx, 4) @@ -2579,7 +2574,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba): movdqu (%rsi, %rcx, 4), %xmm1 movdqa (%rdi, %rcx, 4), %xmm2 - /* d = (d + s) | 0xff000000 */ + /* d = (d + s) | 0xff000000 */ paddusb %xmm1, %xmm2 por %xmm5, %xmm2 movdqa %xmm2, (%rdi, %rcx, 4) @@ -2588,7 +2583,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba): movdqu (%rsi, %rcx, 4), %xmm1 movdqa (%rdi, %rcx, 4), %xmm2 - /* d = (d + s) | 0xff000000 */ + /* d = (d + s) | 0xff000000 */ paddusb %xmm1, %xmm2 por %xmm5, %xmm2 movdqa %xmm2, (%rdi, %rcx, 4) @@ -2597,7 +2592,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba): movdqu (%rsi, %rcx, 4), %xmm1 movdqa (%rdi, %rcx, 4), %xmm2 - /* d = (d + s) | 0xff000000 */ + /* d = (d + s) | 0xff000000 */ paddusb %xmm1, %xmm2 por %xmm5, %xmm2 movdqa %xmm2, (%rdi, %rcx, 4) @@ -2610,7 +2605,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba): movd (%rsi, %rcx, 4), %xmm1 movd (%rdi, %rcx, 4), %xmm2 - /* d = (d + s) | 0xff000000 */ + /* d = (d + s) | 0xff000000 */ paddusb %xmm1, %xmm2 por %xmm5, %xmm2 movd %xmm2, (%rdi, %rcx, 4) @@ -2633,20 +2628,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb): pxor %xmm4, %xmm4 movdqu m00XXXXXX(%rip), %xmm6 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -2654,7 +2649,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -2666,13 +2661,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -2693,10 +2688,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -2704,13 +2699,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -2731,10 +2726,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -2742,13 +2737,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -2769,10 +2764,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -2780,13 +2775,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -2807,10 +2802,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -2818,13 +2813,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -2845,10 +2840,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -2856,13 +2851,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -2883,10 +2878,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -2894,13 +2889,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -2921,10 +2916,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -2932,13 +2927,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -2959,10 +2954,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: movd (%rsi, %rcx, 4), %xmm1 movd (%rdi, %rcx, 4), %xmm2 @@ -2970,13 +2965,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -2996,10 +2991,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb): /* pack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -3016,20 +3011,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba): movdqu mX000X000(%rip), %xmm7 xorq %rax, %rax - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -3037,7 +3032,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -3045,8 +3040,8 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba): movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -3057,11 +3052,11 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -3079,20 +3074,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -3103,11 +3098,11 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -3125,20 +3120,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -3149,11 +3144,11 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -3171,20 +3166,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -3195,11 +3190,11 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -3217,20 +3212,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -3241,11 +3236,11 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -3263,20 +3258,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -3287,11 +3282,11 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -3309,20 +3304,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -3333,11 +3328,11 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -3355,20 +3350,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 - /* Load two pixels as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load two pixels as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 7(%rdi, %rcx, 4), %rdx movb 7(%rsi, %rcx, 4), %dh @@ -3379,11 +3374,11 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba): shlq $32, %rax movb (%r13, %rdx), %al movb %dh, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -3401,30 +3396,30 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: movd (%rsi, %rcx, 4), %xmm1 movd (%rdi, %rcx, 4), %xmm2 - /* Load one pixel as 00, 00, src alpha, combined alpha - * Combined alpha is derived from the pow_lut table in blend.c + /* Load one pixel as 00, 00, src alpha, combined alpha + * Combined alpha is derived from the pow_lut table in blend.c */ movzbq 3(%rdi, %rcx, 4), %rdx movb 3(%rsi, %rcx, 4), %dh movb (%r13, %rdx), %al movb %dh, %ah - movd %eax, %xmm3 + movd %eax, %xmm3 /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -3442,14 +3437,14 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -3805,7 +3800,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba): movd (%rsi, %rcx, 4), %xmm1 movd (%rdi, %rcx, 4), %xmm2 - /* d = (d - s) | 0xff000000 */ + /* d = (d - s) | 0xff000000 */ psubusb %xmm1, %xmm2 por %xmm5, %xmm2 movd %xmm2, (%rdi, %rcx, 4) @@ -3831,7 +3826,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba): movdqa (%rsi, %rcx, 4), %xmm1 movdqa (%rdi, %rcx, 4), %xmm2 - /* d = (d - s) | 0xff000000 */ + /* d = (d - s) | 0xff000000 */ psubusb %xmm1, %xmm2 por %xmm5, %xmm2 movdqa %xmm2, (%rdi, %rcx, 4) @@ -3840,7 +3835,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba): movdqa (%rsi, %rcx, 4), %xmm1 movdqa (%rdi, %rcx, 4), %xmm2 - /* d = (d - s) | 0xff000000 */ + /* d = (d - s) | 0xff000000 */ psubusb %xmm1, %xmm2 por %xmm5, %xmm2 movdqa %xmm2, (%rdi, %rcx, 4) @@ -3849,7 +3844,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba): movdqa (%rsi, %rcx, 4), %xmm1 movdqa (%rdi, %rcx, 4), %xmm2 - /* d = (d - s) | 0xff000000 */ + /* d = (d - s) | 0xff000000 */ psubusb %xmm1, %xmm2 por %xmm5, %xmm2 movdqa %xmm2, (%rdi, %rcx, 4) @@ -3858,7 +3853,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba): movdqa (%rsi, %rcx, 4), %xmm1 movdqa (%rdi, %rcx, 4), %xmm2 - /* d = (d - s) | 0xff000000 */ + /* d = (d - s) | 0xff000000 */ psubusb %xmm1, %xmm2 por %xmm5, %xmm2 movdqa %xmm2, (%rdi, %rcx, 4) @@ -3873,7 +3868,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba): movdqu (%rsi, %rcx, 4), %xmm1 movdqa (%rdi, %rcx, 4), %xmm2 - /* d = (d - s) | 0xff000000 */ + /* d = (d - s) | 0xff000000 */ psubusb %xmm1, %xmm2 por %xmm5, %xmm2 movdqa %xmm2, (%rdi, %rcx, 4) @@ -3882,7 +3877,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba): movdqu (%rsi, %rcx, 4), %xmm1 movdqa (%rdi, %rcx, 4), %xmm2 - /* d = (d - s) | 0xff000000 */ + /* d = (d - s) | 0xff000000 */ psubusb %xmm1, %xmm2 por %xmm5, %xmm2 movdqa %xmm2, (%rdi, %rcx, 4) @@ -3891,7 +3886,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba): movdqu (%rsi, %rcx, 4), %xmm1 movdqa (%rdi, %rcx, 4), %xmm2 - /* d = (d - s) | 0xff000000 */ + /* d = (d - s) | 0xff000000 */ psubusb %xmm1, %xmm2 por %xmm5, %xmm2 movdqa %xmm2, (%rdi, %rcx, 4) @@ -3900,7 +3895,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba): movdqu (%rsi, %rcx, 4), %xmm1 movdqa (%rdi, %rcx, 4), %xmm2 - /* d = (d - s) | 0xff000000 */ + /* d = (d - s) | 0xff000000 */ psubusb %xmm1, %xmm2 por %xmm5, %xmm2 movdqa %xmm2, (%rdi, %rcx, 4) @@ -3913,7 +3908,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba): movd (%rsi, %rcx, 4), %xmm1 movd (%rdi, %rcx, 4), %xmm2 - /* d = (d - s) | 0xff000000 */ + /* d = (d - s) | 0xff000000 */ psubusb %xmm1, %xmm2 por %xmm5, %xmm2 movd %xmm2, (%rdi, %rcx, 4) @@ -3937,20 +3932,20 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb): movdqu m000V0V0V000V0V0V(%rip), %xmm6 movdqu m00XXXXXX(%rip), %xmm7 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -3958,7 +3953,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -3970,7 +3965,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -3990,10 +3985,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -4001,7 +3996,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -4021,10 +4016,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -4032,7 +4027,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -4052,10 +4047,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -4063,7 +4058,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -4083,10 +4078,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -4094,7 +4089,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -4114,10 +4109,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -4125,7 +4120,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -4145,10 +4140,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -4156,7 +4151,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -4176,10 +4171,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -4187,7 +4182,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -4207,10 +4202,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: movd (%rsi, %rcx, 4), %xmm1 movd (%rdi, %rcx, 4), %xmm2 @@ -4218,7 +4213,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -4237,10 +4232,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb): /* Repack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -4258,20 +4253,20 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba): movdqu m000V0V0V000V0V0V(%rip), %xmm8 xorq %rax, %rax - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -4279,7 +4274,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -4298,10 +4293,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba): movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -4316,14 +4311,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -4338,10 +4333,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba): movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -4356,14 +4351,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -4378,10 +4373,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba): movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -4396,14 +4391,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -4418,10 +4413,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba): movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -4436,14 +4431,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -4458,10 +4453,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba): movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -4476,14 +4471,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -4498,10 +4493,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba): movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -4516,14 +4511,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -4538,10 +4533,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba): movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -4556,14 +4551,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f movq (%rsi, %rcx, 4), %xmm1 movq (%rdi, %rcx, 4), %xmm2 @@ -4578,10 +4573,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba): movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah - movd %rax, %xmm3 + movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -4596,14 +4591,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: movd (%rsi, %rcx, 4), %xmm1 movd (%rdi, %rcx, 4), %xmm2 @@ -4612,10 +4607,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba): movb (%r13, %rdx), %al movb %dh, %ah shrb $1, %ah - movd %eax, %xmm3 + movd %eax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -4630,13 +4625,13 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -5015,7 +5010,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 @@ -5055,7 +5050,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 @@ -5078,7 +5073,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 @@ -5101,7 +5096,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 @@ -5124,7 +5119,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 @@ -5153,7 +5148,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 @@ -5176,7 +5171,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 @@ -5199,7 +5194,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 @@ -5222,7 +5217,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 @@ -5249,7 +5244,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 @@ -5521,8 +5516,6 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba): LEAVE SIZE(imlib_amd64_reshade_copy_rgb_to_rgba) -#endif - #ifdef __ELF__ .section .note.GNU-stack,"",@progbits #endif diff --git a/src/lib/amd64_blend_cmod.S b/src/lib/amd64_blend_cmod.S index 78e0847..e75b868 100644 --- a/src/lib/amd64_blend_cmod.S +++ b/src/lib/amd64_blend_cmod.S @@ -1,9 +1,7 @@ #include #include "asm.h" -#ifdef DO_AMD64_ASM - -/*\ +/*\ |*| AMD64 SSE2 assembly blending routines for Imlib2 |*| Written by John Slaten |*| Based on MMX routines written by Willem Monsuwe @@ -87,7 +85,7 @@ FN_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod) FN_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod) .extern pow_lut - + /*\ SSE register use: |*| %xmm1 = Source value |*| %xmm2 = Destination value @@ -104,13 +102,6 @@ FN_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod) |*| %r10d = sw |*| %r11d = dw \*/ - - - - - - - #define ENTER \ @@ -130,7 +121,7 @@ FN_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod) jz 9f ; \ testq %r9, %r9 ; \ jz 9f - + #define LEAVE \ popq %r14 ; \ popq %r13 ; \ @@ -147,20 +138,20 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod): movdqa c1(%rip), %xmm5 movdqa m00XXXXXX(%rip), %xmm6 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -168,7 +159,7 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -213,13 +204,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -242,10 +233,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -286,13 +277,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -315,10 +306,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -359,13 +350,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -388,10 +379,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -432,13 +423,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -461,10 +452,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -505,13 +496,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -534,10 +525,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -578,13 +569,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -607,10 +598,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -651,13 +642,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -680,10 +671,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -724,13 +715,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -753,10 +744,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax @@ -781,13 +772,13 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -809,10 +800,10 @@ PR_(imlib_amd64_blend_rgba_to_rgb_cmod): /* Repack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -827,20 +818,20 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod): movdqa mX000X000X000X000(%rip), %xmm6 movq pow_lut@GOTPCREL(%rip), %r13 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -848,7 +839,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -916,7 +907,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -934,10 +925,10 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -1001,7 +992,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -1019,10 +1010,10 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -1086,7 +1077,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -1104,10 +1095,10 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -1171,7 +1162,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -1189,10 +1180,10 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -1256,7 +1247,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -1274,10 +1265,10 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -1341,7 +1332,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -1359,10 +1350,10 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -1426,7 +1417,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -1444,10 +1435,10 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -1511,7 +1502,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -1529,10 +1520,10 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax @@ -1568,7 +1559,7 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -1585,10 +1576,10 @@ PR_(imlib_amd64_blend_rgba_to_rgba_cmod): /* repack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -1603,20 +1594,20 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod): movdqa mX000X000X000X000(%rip), %xmm6 movq pow_lut@GOTPCREL(%rip), %r13 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -1624,7 +1615,7 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -1690,7 +1681,7 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -1708,10 +1699,10 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -1773,7 +1764,7 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -1791,10 +1782,10 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -1856,7 +1847,7 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -1874,10 +1865,10 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -1939,7 +1930,7 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -1957,10 +1948,10 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -2022,7 +2013,7 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -2040,10 +2031,10 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -2105,7 +2096,7 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -2123,10 +2114,10 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -2188,7 +2179,7 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -2206,10 +2197,10 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -2271,7 +2262,7 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -2289,10 +2280,10 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax @@ -2327,7 +2318,7 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* unpack source and dest */ @@ -2344,10 +2335,10 @@ PR_(imlib_amd64_blend_rgb_to_rgba_cmod): /* repack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -2360,20 +2351,20 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod): movdqa c1(%rip), %xmm5 movdqa m00XXXXXX(%rip), %xmm6 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -2381,7 +2372,7 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -2424,13 +2415,13 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -2453,10 +2444,10 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -2495,13 +2486,13 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -2524,10 +2515,10 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -2566,13 +2557,13 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -2595,10 +2586,10 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -2637,13 +2628,13 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -2666,10 +2657,10 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -2708,13 +2699,13 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -2737,10 +2728,10 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -2779,13 +2770,13 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -2808,10 +2799,10 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -2850,13 +2841,13 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -2879,10 +2870,10 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -2921,13 +2912,13 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -2950,10 +2941,10 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax @@ -2972,18 +2963,18 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod): movb %al, %bl movb 0x200(%r14, %rbx), %dl movd %edx, %xmm1 - movd (%rdi, %rcx, 4), %xmm2 + movd (%rdi, %rcx, 4), %xmm2 /* Get alpha from source and unpack to words * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -3005,10 +2996,10 @@ PR_(imlib_amd64_blend_rgb_to_rgb_cmod): /* Repack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -3019,20 +3010,20 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod): movq mX000X000X000X000(%rip), %r13 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -3040,7 +3031,7 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -3076,10 +3067,10 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod): orq %rax, %rdx movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = 0 */ movq (%rsi, %rcx, 4), %rax @@ -3111,10 +3102,10 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod): orq %rax, %rdx movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = 0 */ movq (%rsi, %rcx, 4), %rax @@ -3146,10 +3137,10 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod): orq %rax, %rdx movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = 0 */ movq (%rsi, %rcx, 4), %rax @@ -3181,10 +3172,10 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod): orq %rax, %rdx movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = 0 */ movq (%rsi, %rcx, 4), %rax @@ -3216,10 +3207,10 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod): orq %rax, %rdx movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = 0 */ movq (%rsi, %rcx, 4), %rax @@ -3251,10 +3242,10 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod): orq %rax, %rdx movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = 0 */ movq (%rsi, %rcx, 4), %rax @@ -3286,10 +3277,10 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod): orq %rax, %rdx movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = 0 */ movq (%rsi, %rcx, 4), %rax @@ -3321,10 +3312,10 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod): orq %rax, %rdx movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = 0 */ movl (%rsi, %rcx, 4), %eax @@ -3343,10 +3334,10 @@ PR_(imlib_amd64_copy_rgba_to_rgb_cmod): andq %r13, %rax orq %rax, %rdx movl %edx, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -3356,20 +3347,20 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod): ENTER - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -3377,7 +3368,7 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -3418,10 +3409,10 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod): movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -3458,10 +3449,10 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod): movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -3498,10 +3489,10 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod): movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -3538,10 +3529,10 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod): movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -3578,10 +3569,10 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod): movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -3618,10 +3609,10 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod): movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -3658,10 +3649,10 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod): movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -3698,10 +3689,10 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod): movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax @@ -3721,10 +3712,10 @@ PR_(imlib_amd64_copy_rgba_to_rgba_cmod): movb %al, %bl movb 0x200(%r14, %rbx), %dl movl %edx, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -3734,20 +3725,20 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod): ENTER - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -3755,7 +3746,7 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -3794,10 +3785,10 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod): movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -3832,10 +3823,10 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod): movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -3870,10 +3861,10 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod): movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -3908,10 +3899,10 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod): movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -3946,10 +3937,10 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod): movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -3984,10 +3975,10 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod): movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -4022,10 +4013,10 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod): movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -4060,10 +4051,10 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod): movb 0x200(%r14, %rbx), %dl movq %rdx, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax @@ -4082,10 +4073,10 @@ PR_(imlib_amd64_copy_rgb_to_rgba_cmod): movb %al, %bl movb 0x200(%r14, %rbx), %dl movl %edx, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -4097,20 +4088,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): pxor %xmm4, %xmm4 movdqa m00XXXXXX(%rip), %xmm6 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -4118,7 +4109,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -4163,13 +4154,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -4190,10 +4181,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -4234,13 +4225,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -4261,10 +4252,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -4305,13 +4296,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -4332,10 +4323,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -4376,13 +4367,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -4403,10 +4394,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -4447,13 +4438,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -4474,10 +4465,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -4518,13 +4509,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -4545,10 +4536,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -4589,13 +4580,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -4616,10 +4607,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -4660,13 +4651,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -4687,10 +4678,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax @@ -4715,13 +4706,13 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -4741,10 +4732,10 @@ PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod): /* pack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -4760,20 +4751,20 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): movdqa mX000X000X000X000(%rip), %xmm6 movq pow_lut@GOTPCREL(%rip), %r13 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -4781,7 +4772,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -4846,7 +4837,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -4863,15 +4854,15 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -4932,7 +4923,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -4949,15 +4940,15 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -5018,7 +5009,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -5035,15 +5026,15 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -5104,7 +5095,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -5121,15 +5112,15 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -5190,7 +5181,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -5207,15 +5198,15 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -5276,7 +5267,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -5293,15 +5284,15 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -5362,7 +5353,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -5379,15 +5370,15 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -5448,7 +5439,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -5465,15 +5456,15 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax @@ -5506,7 +5497,7 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -5523,14 +5514,14 @@ PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -5546,20 +5537,20 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): movdqa mX000X000X000X000(%rip), %xmm6 movq pow_lut@GOTPCREL(%rip), %r13 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -5567,7 +5558,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -5630,7 +5621,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -5647,15 +5638,15 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -5714,7 +5705,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -5731,15 +5722,15 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -5798,7 +5789,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -5815,15 +5806,15 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -5882,7 +5873,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -5899,15 +5890,15 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -5966,7 +5957,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -5983,15 +5974,15 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -6050,7 +6041,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -6067,15 +6058,15 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -6134,7 +6125,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -6151,15 +6142,15 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -6218,7 +6209,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -6235,15 +6226,15 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax @@ -6275,7 +6266,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -6292,14 +6283,14 @@ PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod): psllw $1, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -6312,20 +6303,20 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): pxor %xmm4, %xmm4 movdqa m00XXXXXX(%rip), %xmm6 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -6333,7 +6324,7 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -6376,13 +6367,13 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -6403,10 +6394,10 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -6445,13 +6436,13 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -6472,10 +6463,10 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -6514,13 +6505,13 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -6541,10 +6532,10 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -6583,13 +6574,13 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -6610,10 +6601,10 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -6652,13 +6643,13 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -6679,10 +6670,10 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -6721,13 +6712,13 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -6748,10 +6739,10 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -6790,13 +6781,13 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -6817,10 +6808,10 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -6859,13 +6850,13 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -6886,10 +6877,10 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax @@ -6913,13 +6904,13 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -6939,10 +6930,10 @@ PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): /* pack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -6954,20 +6945,20 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod): movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -6975,7 +6966,7 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -7021,10 +7012,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -7066,10 +7057,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -7111,10 +7102,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -7156,10 +7147,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -7201,10 +7192,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -7246,10 +7237,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -7291,10 +7282,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -7336,10 +7327,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax @@ -7364,10 +7355,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod): pand %xmm5, %xmm1 paddusb %xmm1, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -7379,20 +7370,20 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod): movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -7400,7 +7391,7 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -7446,10 +7437,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -7491,10 +7482,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -7536,10 +7527,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -7581,10 +7572,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -7626,10 +7617,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -7671,10 +7662,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -7716,10 +7707,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -7761,10 +7752,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax @@ -7789,10 +7780,10 @@ PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod): pand %xmm5, %xmm2 paddusb %xmm1, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -7804,20 +7795,20 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod): movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -7825,7 +7816,7 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -7868,10 +7859,10 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -7910,10 +7901,10 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -7952,10 +7943,10 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -7994,10 +7985,10 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -8036,10 +8027,10 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -8078,10 +8069,10 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -8120,10 +8111,10 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -8162,10 +8153,10 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod): paddusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax @@ -8188,10 +8179,10 @@ PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod): pand %xmm5, %xmm2 paddusb %xmm1, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -8204,20 +8195,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): pxor %xmm4, %xmm4 movdqa m00XXXXXX(%rip), %xmm6 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -8225,7 +8216,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -8270,13 +8261,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -8297,10 +8288,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -8341,13 +8332,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -8368,10 +8359,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -8412,13 +8403,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -8439,10 +8430,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -8483,13 +8474,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -8510,10 +8501,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -8554,13 +8545,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -8581,10 +8572,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -8625,13 +8616,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -8652,10 +8643,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -8696,13 +8687,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -8723,10 +8714,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -8767,13 +8758,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -8794,10 +8785,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax @@ -8822,13 +8813,13 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -8848,10 +8839,10 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): /* pack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -8867,20 +8858,20 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): movdqa mX000X000(%rip), %xmm7 xorq %rax, %rax - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -8888,7 +8879,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -8953,7 +8944,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -8971,15 +8962,15 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -9040,7 +9031,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -9058,15 +9049,15 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -9127,7 +9118,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -9145,15 +9136,15 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -9214,7 +9205,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -9232,15 +9223,15 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -9301,7 +9292,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -9319,15 +9310,15 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -9388,7 +9379,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -9406,15 +9397,15 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -9475,7 +9466,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -9493,15 +9484,15 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -9562,7 +9553,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -9580,15 +9571,15 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax @@ -9621,7 +9612,7 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -9639,14 +9630,14 @@ PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -9662,20 +9653,20 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): movdqa mX000X000(%rip), %xmm7 xorq %rax, %rax - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -9683,7 +9674,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -9746,7 +9737,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -9764,15 +9755,15 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -9831,7 +9822,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -9849,15 +9840,15 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -9916,7 +9907,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -9934,15 +9925,15 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -10001,7 +9992,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -10019,15 +10010,15 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -10086,7 +10077,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -10104,15 +10095,15 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -10171,7 +10162,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -10189,15 +10180,15 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -10256,7 +10247,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -10274,15 +10265,15 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -10341,7 +10332,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -10359,15 +10350,15 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax @@ -10399,7 +10390,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): /* unpack alpha to src alpha, combined alpha x 3 */ punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 /* src alpha = 255 - dst alpha */ @@ -10417,14 +10408,14 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): pmulhw %xmm3, %xmm1 pxor %xmm7, %xmm1 psubsw %xmm1, %xmm2 - + /* pack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -10436,20 +10427,20 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): pxor %xmm4, %xmm4 movdqa m00XXXXXX(%rip), %xmm6 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -10457,7 +10448,7 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -10500,13 +10491,13 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -10527,10 +10518,10 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -10569,13 +10560,13 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -10596,10 +10587,10 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -10638,13 +10629,13 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -10665,10 +10656,10 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -10707,13 +10698,13 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -10734,10 +10725,10 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -10776,13 +10767,13 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -10803,10 +10794,10 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -10845,13 +10836,13 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -10872,10 +10863,10 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -10914,13 +10905,13 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -10941,10 +10932,10 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -10983,13 +10974,13 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -11010,10 +11001,10 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax @@ -11037,13 +11028,13 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): * Result ranges is [0, 0x7fff], and is mapped to * point values in [0.0, 1.0) by using the high word * of the 32 bit multiplication result. - * Because we want the unsigned value, we shift right one + * Because we want the unsigned value, we shift right one * here and also shift left the other factors to compensate. */ movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero out the alpha channel of the source to leave the @@ -11063,10 +11054,10 @@ PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): /* pack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -11077,20 +11068,20 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod): movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -11098,7 +11089,7 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -11144,10 +11135,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod): psubusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -11189,10 +11180,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod): psubusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -11234,10 +11225,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod): psubusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -11279,10 +11270,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod): psubusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -11324,10 +11315,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod): psubusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -11369,10 +11360,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod): psubusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -11414,10 +11405,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod): psubusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -11459,10 +11450,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod): psubusb %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax @@ -11487,10 +11478,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod): pand %xmm5, %xmm1 psubusb %xmm1, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -11502,20 +11493,20 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod): movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 movdqa mX000X000X000X000(%rip), %xmm6 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -11523,7 +11514,7 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -11571,10 +11562,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod): por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -11618,10 +11609,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod): por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -11665,10 +11656,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod): por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -11712,10 +11703,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod): por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -11759,10 +11750,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod): por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -11806,10 +11797,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod): por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -11853,10 +11844,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod): por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -11900,10 +11891,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod): por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax @@ -11930,10 +11921,10 @@ PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod): pand %xmm5, %xmm2 por %xmm1, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -11945,20 +11936,20 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod): movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 movdqa mX000X000X000X000(%rip), %xmm6 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -11966,7 +11957,7 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -12014,10 +12005,10 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod): por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -12061,10 +12052,10 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod): por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -12108,10 +12099,10 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod): por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -12155,10 +12146,10 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod): por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -12202,10 +12193,10 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod): por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -12249,10 +12240,10 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod): por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -12296,10 +12287,10 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod): por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -12343,10 +12334,10 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod): por %xmm1, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax @@ -12374,10 +12365,10 @@ PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod): pand %xmm6, %xmm1 por %xmm1, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -12390,20 +12381,20 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): movdqa m000V0V0V000V0V0V(%rip), %xmm6 movdqa m00XXXXXX(%rip), %xmm7 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -12411,7 +12402,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -12456,7 +12447,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -12476,10 +12467,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -12520,7 +12511,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -12540,10 +12531,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -12584,7 +12575,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -12604,10 +12595,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -12648,7 +12639,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -12668,10 +12659,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -12712,7 +12703,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -12732,10 +12723,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -12776,7 +12767,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -12796,10 +12787,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -12840,7 +12831,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -12860,10 +12851,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -12904,7 +12895,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -12924,10 +12915,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax @@ -12952,7 +12943,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -12971,10 +12962,10 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): /* Repack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -12992,20 +12983,20 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): movdqa m000V0V0V000V0V0V(%rip), %xmm8 xorq %rax, %rax - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -13013,7 +13004,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -13080,7 +13071,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -13095,14 +13086,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -13165,7 +13156,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -13180,14 +13171,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -13250,7 +13241,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -13265,14 +13256,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -13335,7 +13326,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -13350,14 +13341,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -13420,7 +13411,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -13435,14 +13426,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -13505,7 +13496,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -13520,14 +13511,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -13590,7 +13581,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -13605,14 +13596,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -13675,7 +13666,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -13690,14 +13681,14 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax @@ -13731,7 +13722,7 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): movd %eax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -13746,13 +13737,13 @@ PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -13766,20 +13757,20 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): movdqa m000V0V0V000V0V0V(%rip), %xmm6 movdqa m00XXXXXX(%rip), %xmm7 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -13787,7 +13778,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -13830,7 +13821,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -13850,10 +13841,10 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -13892,7 +13883,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -13912,10 +13903,10 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -13954,7 +13945,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -13974,10 +13965,10 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -14016,7 +14007,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -14036,10 +14027,10 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -14078,7 +14069,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -14098,10 +14089,10 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -14140,7 +14131,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -14160,10 +14151,10 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -14202,7 +14193,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -14222,10 +14213,10 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -14264,7 +14255,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -14284,10 +14275,10 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax @@ -14311,7 +14302,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): movq %xmm1, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0xFF, %xmm3, %xmm3 - pshuflw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 psrlw $1, %xmm3 /* Zero blending alpha */ @@ -14330,10 +14321,10 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): /* Repack new pixels */ packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -14351,20 +14342,20 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): movdqa m000V0V0V000V0V0V(%rip), %xmm8 xorq %rax, %rax - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -14372,7 +14363,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -14437,7 +14428,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -14452,14 +14443,14 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -14520,7 +14511,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -14535,14 +14526,14 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -14603,7 +14594,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -14618,14 +14609,14 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -14686,7 +14677,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -14701,14 +14692,14 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -14769,7 +14760,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -14784,14 +14775,14 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -14852,7 +14843,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -14867,14 +14858,14 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -14935,7 +14926,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -14950,14 +14941,14 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -15018,7 +15009,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): movd %rax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -15033,14 +15024,14 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax @@ -15073,7 +15064,7 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): movd %eax, %xmm3 punpcklbw %xmm3, %xmm3 pshufhw $0x40, %xmm3, %xmm3 - pshuflw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 psrlw $1, %xmm3 movdqa %xmm2, %xmm0 @@ -15088,13 +15079,13 @@ PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): psllw $2, %xmm1 pmulhw %xmm3, %xmm1 paddsw %xmm1, %xmm2 - + packuswb %xmm4, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -15107,20 +15098,20 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod): movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 movdqa m0VVV0VVV0VVV0VVV(%rip), %xmm6 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -15128,7 +15119,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -15192,10 +15183,10 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod): psubusb %xmm3, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -15255,10 +15246,10 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod): psubusb %xmm3, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -15318,10 +15309,10 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod): psubusb %xmm3, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -15381,10 +15372,10 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod): psubusb %xmm3, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -15444,10 +15435,10 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod): psubusb %xmm3, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -15507,10 +15498,10 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod): psubusb %xmm3, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -15570,10 +15561,10 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod): psubusb %xmm3, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -15633,10 +15624,10 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod): psubusb %xmm3, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax @@ -15679,10 +15670,10 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod): paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -15696,20 +15687,20 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod): movdqu m0VVV0VVV0VVV0VVV(%rip), %xmm6 movdqu mX000X000X000X000(%rip), %xmm7 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -15717,7 +15708,7 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -15770,17 +15761,17 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -15829,17 +15820,17 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -15888,17 +15879,17 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -15947,17 +15938,17 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -16006,17 +15997,17 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -16065,17 +16056,17 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -16124,17 +16115,17 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod */ movq (%rsi, %rcx, 4), %rax @@ -16183,17 +16174,17 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod */ movl (%rsi, %rcx, 4), %eax @@ -16226,16 +16217,16 @@ PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: @@ -16249,20 +16240,20 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod): movdqu m0VVV0VVV0VVV0VVV(%rip), %xmm6 movdqu mX000X000X000X000(%rip), %xmm7 - /* Move right to left across each line, */ - /* processing in two pixel chunks */ - leaq (%rsi, %r8, 4), %rsi - leaq (%rdi, %r8, 4), %rdi - - /* Last instruction is %rcx = 0 */ - subq $4, %rsi - subq $4, %rdi - - negq %r8 -0: - movq %r8, %rcx - - incq %rcx + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* prefetch a couple cache lines ahead */ prefetchnta (%rsi, %rcx, 4) @@ -16270,7 +16261,7 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod): prefetchnta 64(%rsi, %rcx, 4) prefetcht0 64(%rdi, %rcx, 4) - jz 2f /* one pixel line */ + jz 2f /* one pixel line */ 1: /* main loop, unrolled to work on 64 byte chunks */ prefetchnta 128(%rsi, %rcx, 4) @@ -16321,17 +16312,17 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -16378,17 +16369,17 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -16435,17 +16426,17 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -16492,17 +16483,17 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -16549,17 +16540,17 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -16606,17 +16597,17 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -16663,17 +16654,17 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx + incq %rcx + incq %rcx jz 2f - jns 3f + jns 3f /* Grab 2 pixels from src, with colormod, with a = amod[255] */ movq (%rsi, %rcx, 4), %rax @@ -16720,17 +16711,17 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movq %xmm2, (%rdi, %rcx, 4) - incq %rcx - incq %rcx - js 1b - jnz 3f + incq %rcx + incq %rcx + js 1b + jnz 3f 2: /* Grab 1 pixel from src, with colormod, with a = amod[255] */ movl (%rsi, %rcx, 4), %eax @@ -16762,24 +16753,22 @@ PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod): /* d = d + s1 - s2, unsigned saturation */ paddusb %xmm1, %xmm2 psubusb %xmm3, %xmm2 - + /* d alpha = s alpha */ pand %xmm5, %xmm2 pand %xmm7, %xmm0 por %xmm0, %xmm2 movd %xmm2, (%rdi, %rcx, 4) -3: - leaq (%rsi, %r10, 4), %rsi - leaq (%rdi, %r11, 4), %rdi - decq %r9 +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 jnz 0b 9: LEAVE SIZE(imlib_amd64_reshade_copy_rgb_to_rgba_cmod) -#endif - #ifdef __ELF__ .section .note.GNU-stack,"",@progbits #endif diff --git a/src/lib/asm_blend.S b/src/lib/asm_blend.S index 5ce63f0..e598fde 100644 --- a/src/lib/asm_blend.S +++ b/src/lib/asm_blend.S @@ -1,9 +1,7 @@ #include #include "asm.h" -#ifdef DO_MMX_ASM - -/*\ +/*\ |*| MMX assembly blending routines for Imlib2 |*| Written by Willem Monsuwe |*| @@ -51,7 +49,7 @@ FN_(imlib_mmx_reshade_copy_rgb_to_rgba) #include "asm_loadimmq.S" - + /*\ MMX register use: |*| %mm1 = Source value |*| %mm2 = Destination value @@ -95,7 +93,6 @@ FN_(imlib_mmx_reshade_copy_rgb_to_rgba) decl %edx ;\ jns 8b - /*\ Unset MMX mode, reset registers, return \*/ #define LEAVE \ 9: ;\ @@ -281,7 +278,7 @@ PR_(imlib_mmx_copy_rgba_to_rgba): /*\ Load source, save destination \*/ movq (%esi, %ecx, 4), %mm1 movq %mm1, (%edi, %ecx, 4) - + addl $2, %ecx js 1b jnz 3f @@ -358,7 +355,7 @@ PR_(imlib_mmx_add_blend_rgba_to_rgb): /*\ Pack into lower 4 bytes and save \*/ packuswb %mm4, %mm2 movd %mm2, (%edi, %ecx, 4) - + incl %ecx js 1b @@ -408,7 +405,7 @@ PR_(imlib_mmx_add_blend_rgba_to_rgba): /*\ Pack into lower 4 bytes and save \*/ packuswb %mm4, %mm2 movd %mm2, (%edi, %ecx, 4) - + incl %ecx js 1b @@ -440,7 +437,7 @@ PR_(imlib_mmx_add_copy_rgba_to_rgb): /*\ d = d + s, unsigned saturation, and save \*/ paddusb %mm1, %mm2 movq %mm2, (%edi, %ecx, 4) - + addl $2, %ecx js 1b jnz 3f @@ -473,7 +470,7 @@ PR_(imlib_mmx_add_copy_rgba_to_rgba): /*\ d = d + s, unsigned saturation, and save \*/ paddusb %mm1, %mm2 movq %mm2, (%edi, %ecx, 4) - + addl $2, %ecx js 1b jnz 3f @@ -511,7 +508,7 @@ PR_(imlib_mmx_add_copy_rgb_to_rgba): /*\ Make result alpha 0xff \*/ por %mm5, %mm2 movq %mm2, (%edi, %ecx, 4) - + addl $2, %ecx js 1b jnz 3f @@ -559,7 +556,7 @@ PR_(imlib_mmx_subtract_blend_rgba_to_rgb): /*\ Pack into lower 4 bytes and save \*/ packuswb %mm4, %mm2 movd %mm2, (%edi, %ecx, 4) - + incl %ecx js 1b @@ -609,7 +606,7 @@ PR_(imlib_mmx_subtract_blend_rgba_to_rgba): /*\ Pack into lower 4 bytes and save \*/ packuswb %mm4, %mm2 movd %mm2, (%edi, %ecx, 4) - + incl %ecx js 1b @@ -641,7 +638,7 @@ PR_(imlib_mmx_subtract_copy_rgba_to_rgb): /*\ d = d - s, unsigned saturation, and save \*/ psubusb %mm1, %mm2 movq %mm2, (%edi, %ecx, 4) - + addl $2, %ecx js 1b jnz 3f @@ -683,7 +680,7 @@ PR_(imlib_mmx_subtract_copy_rgba_to_rgba): /*\ Negate result alphas \*/ pxor %mm5, %mm2 movq %mm2, (%edi, %ecx, 4) - + addl $2, %ecx js 1b jnz 3f @@ -723,7 +720,7 @@ PR_(imlib_mmx_subtract_copy_rgb_to_rgba): /*\ Make result alpha 0xff \*/ por %mm5, %mm2 movq %mm2, (%edi, %ecx, 4) - + addl $2, %ecx js 1b jnz 3f @@ -774,7 +771,7 @@ PR_(imlib_mmx_reshade_blend_rgba_to_rgb): /*\ Pack into lower 4 bytes and save \*/ packuswb %mm4, %mm2 movd %mm2, (%edi, %ecx, 4) - + incl %ecx js 1b @@ -827,7 +824,7 @@ PR_(imlib_mmx_reshade_blend_rgba_to_rgba): /*\ Pack into lower 4 bytes and save \*/ packuswb %mm4, %mm2 movd %mm2, (%edi, %ecx, 4) - + incl %ecx js 1b @@ -877,7 +874,7 @@ PR_(imlib_mmx_reshade_copy_rgba_to_rgb): paddusb %mm1, %mm2 psubusb %mm3, %mm2 movq %mm2, (%edi, %ecx, 4) - + addl $2, %ecx js 1b jnz 3f @@ -942,7 +939,7 @@ PR_(imlib_mmx_reshade_copy_rgba_to_rgba): paddusb %mm1, %mm2 psubusb %mm3, %mm2 movq %mm2, (%edi, %ecx, 4) - + addl $2, %ecx js 1b jnz 3f @@ -1001,7 +998,7 @@ PR_(imlib_mmx_reshade_copy_rgb_to_rgba): /*\ Make result alpha 0xff \*/ por %mm7, %mm2 movq %mm2, (%edi, %ecx, 4) - + addl $2, %ecx js 1b jnz 3f @@ -1024,8 +1021,6 @@ PR_(imlib_mmx_reshade_copy_rgb_to_rgba): SIZE(imlib_mmx_reshade_copy_rgb_to_rgba) -#endif - #ifdef __ELF__ .section .note.GNU-stack,"",@progbits #endif diff --git a/src/lib/asm_blend_cmod.S b/src/lib/asm_blend_cmod.S index 23b2500..9fedb79 100644 --- a/src/lib/asm_blend_cmod.S +++ b/src/lib/asm_blend_cmod.S @@ -1,9 +1,7 @@ #include #include "asm.h" -#ifdef DO_MMX_ASM - -/*\ +/*\ |*| MMX assembly blending routines, with colour modding, for Imlib2 |*| Written by Willem Monsuwe |*| @@ -69,7 +67,7 @@ FN_(imlib_mmx_reshade_copy_rgba_to_rgba_cmod) FN_(imlib_mmx_reshade_copy_rgb_to_rgba_cmod) #include "asm_loadimmq.S" - + /*\ MMX register use: |*| %mm1 = Source value |*| %mm2 = Destination value @@ -113,7 +111,6 @@ FN_(imlib_mmx_reshade_copy_rgb_to_rgba_cmod) decl %edx ;\ jns 8b - /*\ Unset MMX mode, reset registers, return \*/ #define LEAVE \ 9: ;\ @@ -622,7 +619,7 @@ PR_(imlib_mmx_add_blend_rgba_to_rgb_cmod): /*\ Pack into lower 4 bytes and save \*/ packuswb %mm4, %mm2 movd %mm2, (%edi, %ecx, 4) - + incl %ecx js 1b @@ -672,7 +669,7 @@ PR_(imlib_mmx_add_blend_rgba_to_rgba_cmod): /*\ Pack into lower 4 bytes and save \*/ packuswb %mm4, %mm2 movd %mm2, (%edi, %ecx, 4) - + incl %ecx js 1b @@ -713,7 +710,7 @@ PR_(imlib_mmx_add_blend_rgb_to_rgb_cmod): /*\ Pack into lower 4 bytes and save \*/ packuswb %mm4, %mm2 movd %mm2, (%edi, %ecx, 4) - + incl %ecx js 1b @@ -763,7 +760,7 @@ PR_(imlib_mmx_add_blend_rgb_to_rgba_cmod): /*\ Pack into lower 4 bytes and save \*/ packuswb %mm4, %mm2 movd %mm2, (%edi, %ecx, 4) - + incl %ecx js 1b @@ -795,7 +792,7 @@ PR_(imlib_mmx_add_copy_rgba_to_rgb_cmod): /*\ d = d + s, unsigned saturation, and save \*/ paddusb %mm1, %mm2 movq %mm2, (%edi, %ecx, 4) - + addl $2, %ecx js 1b jnz 3f @@ -828,7 +825,7 @@ PR_(imlib_mmx_add_copy_rgba_to_rgba_cmod): /*\ d = d + s, unsigned saturation, and save \*/ paddusb %mm1, %mm2 movq %mm2, (%edi, %ecx, 4) - + addl $2, %ecx js 1b jnz 3f @@ -860,7 +857,7 @@ PR_(imlib_mmx_add_copy_rgb_to_rgba_cmod): /*\ d = d + s, unsigned saturation, and save \*/ paddusb %mm1, %mm2 movq %mm2, (%edi, %ecx, 4) - + addl $2, %ecx js 1b jnz 3f @@ -907,7 +904,7 @@ PR_(imlib_mmx_subtract_blend_rgba_to_rgb_cmod): /*\ Pack into lower 4 bytes and save \*/ packuswb %mm4, %mm2 movd %mm2, (%edi, %ecx, 4) - + incl %ecx js 1b @@ -957,7 +954,7 @@ PR_(imlib_mmx_subtract_blend_rgba_to_rgba_cmod): /*\ Pack into lower 4 bytes and save \*/ packuswb %mm4, %mm2 movd %mm2, (%edi, %ecx, 4) - + incl %ecx js 1b @@ -998,7 +995,7 @@ PR_(imlib_mmx_subtract_blend_rgb_to_rgb_cmod): /*\ Pack into lower 4 bytes and save \*/ packuswb %mm4, %mm2 movd %mm2, (%edi, %ecx, 4) - + incl %ecx js 1b @@ -1048,7 +1045,7 @@ PR_(imlib_mmx_subtract_blend_rgb_to_rgba_cmod): /*\ Pack into lower 4 bytes and save \*/ packuswb %mm4, %mm2 movd %mm2, (%edi, %ecx, 4) - + incl %ecx js 1b @@ -1080,7 +1077,7 @@ PR_(imlib_mmx_subtract_copy_rgba_to_rgb_cmod): /*\ d = d - s, unsigned saturation, and save \*/ psubusb %mm1, %mm2 movq %mm2, (%edi, %ecx, 4) - + addl $2, %ecx js 1b jnz 3f @@ -1122,7 +1119,7 @@ PR_(imlib_mmx_subtract_copy_rgba_to_rgba_cmod): /*\ Negate result alphas \*/ pxor %mm5, %mm2 movq %mm2, (%edi, %ecx, 4) - + addl $2, %ecx js 1b jnz 3f @@ -1161,7 +1158,7 @@ PR_(imlib_mmx_subtract_copy_rgb_to_rgba_cmod): psubusb %mm1, %mm2 pxor %mm5, %mm2 movq %mm2, (%edi, %ecx, 4) - + addl $2, %ecx js 1b jnz 3f @@ -1213,7 +1210,7 @@ PR_(imlib_mmx_reshade_blend_rgba_to_rgb_cmod): /*\ Pack into lower 4 bytes and save \*/ packuswb %mm4, %mm2 movd %mm2, (%edi, %ecx, 4) - + incl %ecx js 1b @@ -1266,7 +1263,7 @@ PR_(imlib_mmx_reshade_blend_rgba_to_rgba_cmod): /*\ Pack into lower 4 bytes and save \*/ packuswb %mm4, %mm2 movd %mm2, (%edi, %ecx, 4) - + incl %ecx js 1b @@ -1310,7 +1307,7 @@ PR_(imlib_mmx_reshade_blend_rgb_to_rgb_cmod): /*\ Pack into lower 4 bytes and save \*/ packuswb %mm4, %mm2 movd %mm2, (%edi, %ecx, 4) - + incl %ecx js 1b @@ -1363,7 +1360,7 @@ PR_(imlib_mmx_reshade_blend_rgb_to_rgba_cmod): /*\ Pack into lower 4 bytes and save \*/ packuswb %mm4, %mm2 movd %mm2, (%edi, %ecx, 4) - + incl %ecx js 1b @@ -1413,7 +1410,7 @@ PR_(imlib_mmx_reshade_copy_rgba_to_rgb_cmod): paddusb %mm1, %mm2 psubusb %mm3, %mm2 movq %mm2, (%edi, %ecx, 4) - + addl $2, %ecx js 1b jnz 3f @@ -1478,7 +1475,7 @@ PR_(imlib_mmx_reshade_copy_rgba_to_rgba_cmod): paddusb %mm1, %mm2 psubusb %mm3, %mm2 movq %mm2, (%edi, %ecx, 4) - + addl $2, %ecx js 1b jnz 3f @@ -1538,7 +1535,7 @@ PR_(imlib_mmx_reshade_copy_rgb_to_rgba_cmod): paddusb %mm1, %mm2 psubusb %mm3, %mm2 movq %mm2, (%edi, %ecx, 4) - + addl $2, %ecx js 1b jnz 3f @@ -1563,8 +1560,6 @@ PR_(imlib_mmx_reshade_copy_rgb_to_rgba_cmod): SIZE(imlib_mmx_reshade_copy_rgb_to_rgba_cmod) -#endif - #ifdef __ELF__ .section .note.GNU-stack,"",@progbits #endif diff --git a/src/lib/asm_rgba.S b/src/lib/asm_rgba.S index 9903ae2..552756d 100644 --- a/src/lib/asm_rgba.S +++ b/src/lib/asm_rgba.S @@ -1,9 +1,7 @@ #include #include "asm.h" -#ifdef DO_MMX_ASM - -/*\ +/*\ |*| MMX assembly rgba rendering routines for Imlib2 |*| Written by Willem Monsuwe |*| @@ -81,7 +79,6 @@ FN_(imlib_get_cpuid) ret - PR_(imlib_mmx_bgr565_fast): LOAD_IMMQ(mul_bgr565, %mm7) /*\ This constant is the only difference \*/ CLEANUP_IMMQ_LOADS(1) @@ -273,8 +270,6 @@ PR_(imlib_get_cpuid): SIZE(imlib_get_cpuid) -#endif - #ifdef __ELF__ .section .note.GNU-stack,"",@progbits #endif diff --git a/src/lib/asm_rotate.S b/src/lib/asm_rotate.S index 2af05b3..6bda910 100644 --- a/src/lib/asm_rotate.S +++ b/src/lib/asm_rotate.S @@ -1,9 +1,7 @@ #include #include "asm.h" -#ifdef DO_MMX_ASM - -/*\ +/*\ |*| MMX assembly rotation routine for Imlib2 |*| Written by Willem Monsuwe \*/ @@ -197,7 +195,7 @@ PR_(imlib_mmx_RotateAA): paddw %mm3, %mm5 packuswb %mm5, %mm5 movd %mm5, (%edi, %ecx, 4) - + paddd dxh, %mm6 incl %ecx @@ -220,7 +218,7 @@ PR_(imlib_mmx_RotateAA): decl %eax sall $12, %eax movl %eax, sht - + movl sow, %ebx movl src, %edx .outside_loop_y: @@ -421,7 +419,7 @@ PR_(imlib_mmx_RotateAA): .outside_il_0: movl $0, %eax movl %eax, (%edi, %ecx, 4) - + .outside_il_end: paddd dxh, %mm6 @@ -447,8 +445,6 @@ PR_(imlib_mmx_RotateAA): SIZE(imlib_mmx_RotateAA) -#endif - #ifdef __ELF__ .section .note.GNU-stack,"",@progbits #endif diff --git a/src/lib/asm_scale.S b/src/lib/asm_scale.S index b48737e..c82a05f 100644 --- a/src/lib/asm_scale.S +++ b/src/lib/asm_scale.S @@ -1,9 +1,7 @@ #include #include "asm.h" -#ifdef DO_MMX_ASM - -/*\ +/*\ |*| MMX assembly scaling routine for Imlib2 |*| Written by Willem Monsuwe \*/ @@ -293,7 +291,7 @@ PR_(imlib_Scale_mmx_AARGBA): punpcklbw %mm7, %mm0 psllw $6, %mm0 pmulhw %mm5, %mm0 - + /*\ i = 0x4000 - My \*/ movl $0x4000, %ebx subl My, %ebx @@ -307,18 +305,18 @@ PR_(imlib_Scale_mmx_AARGBA): psllw $6, %mm1 pmulhw %mm4, %mm1 paddw %mm1, %mm0 - + /*\ i -= Cy; while (i > Cy) \*/ subl Cy, %ebx 2: cmpl Cy, %ebx jg 1b - + /*\ mm6 = i \*/ movd %ebx, %mm6 punpcklwd %mm6, %mm6 punpckldq %mm6, %mm6 - + /*\ p += sow; v += (*p * i) >> 10 \*/ addl sow_4, %eax movd (%eax), %mm1 @@ -336,7 +334,7 @@ PR_(imlib_Scale_mmx_AARGBA): movd %eax, %mm3 punpcklwd %mm3, %mm3 punpckldq %mm3, %mm3 - + /*\ p + 1 \*/ movl %esi, %eax addl $4, %eax @@ -345,7 +343,7 @@ PR_(imlib_Scale_mmx_AARGBA): punpcklbw %mm7, %mm2 psllw $6, %mm2 pmulhw %mm5, %mm2 - + /*\ i = 0x4000 - My \*/ movl $0x4000, %ebx subl My, %ebx @@ -359,13 +357,13 @@ PR_(imlib_Scale_mmx_AARGBA): psllw $6, %mm1 pmulhw %mm4, %mm1 paddw %mm1, %mm2 - + /*\ i -= Cy; while (i > Cy) \*/ subl Cy, %ebx 2: cmpl Cy, %ebx jg 1b - + /*\ p += sow; v += (*p * i) >> 10 \*/ addl sow_4, %eax movd (%eax), %mm1 @@ -425,7 +423,7 @@ PR_(imlib_Scale_mmx_AARGBA): movd %eax, %mm3 punpcklwd %mm3, %mm3 punpckldq %mm3, %mm3 - + /*\ x = -dw \*/ movl dw, %ecx negl %ecx @@ -459,7 +457,7 @@ PR_(imlib_Scale_mmx_AARGBA): punpcklbw %mm7, %mm0 psllw $6, %mm0 pmulhw %mm5, %mm0 - + /*\ i = 0x4000 - Mx \*/ movl $0x4000, %ebx subl Mx, %ebx @@ -473,18 +471,18 @@ PR_(imlib_Scale_mmx_AARGBA): psllw $6, %mm1 pmulhw %mm4, %mm1 paddw %mm1, %mm0 - + /*\ i -= Cx; while (i > Cx) \*/ subl Cx, %ebx 2: cmpl Cx, %ebx jg 1b - + /*\ mm6 = i \*/ movd %ebx, %mm6 punpcklwd %mm6, %mm6 punpckldq %mm6, %mm6 - + /*\ p += sow; v += (*p * i) >> 10 \*/ addl $4, %eax movd (%eax), %mm1 @@ -504,7 +502,7 @@ PR_(imlib_Scale_mmx_AARGBA): punpcklbw %mm7, %mm2 psllw $6, %mm2 pmulhw %mm5, %mm2 - + /*\ i = 0x4000 - Mx \*/ movl $0x4000, %ebx subl Mx, %ebx @@ -518,13 +516,13 @@ PR_(imlib_Scale_mmx_AARGBA): psllw $6, %mm1 pmulhw %mm4, %mm1 paddw %mm1, %mm2 - + /*\ i -= Cx; while (i > Cx) \*/ subl Cx, %ebx 2: cmpl Cx, %ebx jg 1b - + /*\ p += sow; v += (*p * i) >> 10 \*/ addl $4, %eax movd (%eax), %mm1 @@ -604,14 +602,14 @@ PR_(imlib_Scale_mmx_AARGBA): movd %ebx, %mm5 punpcklwd %mm5, %mm5 punpckldq %mm5, %mm5 - + /*\ p = sptr; v = (*p * Mx) >> 9 \*/ movl %esi, %eax movd (%eax), %mm0 punpcklbw %mm7, %mm0 psllw $7, %mm0 pmulhw %mm5, %mm0 - + /*\ i = 0x4000 - Mx \*/ movl $0x4000, %ebx subl Mx, %ebx @@ -625,18 +623,18 @@ PR_(imlib_Scale_mmx_AARGBA): psllw $7, %mm1 pmulhw %mm3, %mm1 paddw %mm1, %mm0 - + /*\ i -= Cx; while (i > Cx) \*/ subl Cx, %ebx 2: cmpl Cx, %ebx jg 1b - + /*\ mm6 = i \*/ movd %ebx, %mm6 punpcklwd %mm6, %mm6 punpckldq %mm6, %mm6 - + /*\ v += (*++p * i) >> 9 \*/ addl $4, %eax movd (%eax), %mm1 @@ -651,7 +649,7 @@ PR_(imlib_Scale_mmx_AARGBA): punpckldq %mm4, %mm4 psllw $2, %mm0 pmulhw %mm4, %mm0 - + /*\ j = 0x4000 - My \*/ movl $0x4000, %edx subl My, %edx @@ -666,7 +664,7 @@ PR_(imlib_Scale_mmx_AARGBA): punpcklbw %mm7, %mm1 psllw $7, %mm1 pmulhw %mm5, %mm1 - + /*\ i = 0x4000 - Mx \*/ movl $0x4000, %ebx subl Mx, %ebx @@ -680,13 +678,13 @@ PR_(imlib_Scale_mmx_AARGBA): psllw $7, %mm2 pmulhw %mm3, %mm2 paddw %mm2, %mm1 - + /*\ i -= Cx; while (i > Cx) \*/ subl Cx, %ebx 2: cmpl Cx, %ebx jg 1b - + /*\ vx += (*++p * i) >> 9 \*/ addl $4, %eax movd (%eax), %mm2 @@ -702,13 +700,13 @@ PR_(imlib_Scale_mmx_AARGBA): psllw $2, %mm1 pmulhw %mm4, %mm1 paddw %mm1, %mm0 - + /*\ j -= Cy; while (j > Cy) \*/ subl Cy, %edx 4: cmpl Cy, %edx jg 3b - + /*\ sptr += sow; p = sptr \*/ addl sow_4, %esi movl %esi, %eax @@ -717,7 +715,7 @@ PR_(imlib_Scale_mmx_AARGBA): punpcklbw %mm7, %mm1 psllw $7, %mm1 pmulhw %mm5, %mm1 - + /*\ i = 0x4000 - Mx \*/ movl $0x4000, %ebx subl Mx, %ebx @@ -731,13 +729,13 @@ PR_(imlib_Scale_mmx_AARGBA): psllw $7, %mm2 pmulhw %mm3, %mm2 paddw %mm2, %mm1 - + /*\ i -= Cx; while (i > Cx) \*/ subl Cx, %ebx 2: cmpl Cx, %ebx jg 1b - + /*\ vx += (*++p * i) >> 9 \*/ addl $4, %eax movd (%eax), %mm2 @@ -788,8 +786,6 @@ PR_(imlib_Scale_mmx_AARGBA): SIZE(imlib_Scale_mmx_AARGBA) -#endif - #ifdef __ELF__ .section .note.GNU-stack,"",@progbits #endif