From 2bf270814494e38ffa3d98e3e8f6c10bac9249bd Mon Sep 17 00:00:00 2001 From: Carsten Haitzler Date: Fri, 15 Apr 2005 07:00:40 +0000 Subject: [PATCH] John Slaten's amd64 mmx patch SVN revision: 14207 --- configure.in | 2 + src/lib/Makefile.am | 3 +- src/lib/amd64_blend_cmod.S | 16825 +++++++++++++++++++++++++++++++++++ src/lib/blend.c | 52 +- src/lib/blend.h | 85 + 5 files changed, 16950 insertions(+), 17 deletions(-) create mode 100644 src/lib/amd64_blend_cmod.S diff --git a/configure.in b/configure.in index 83c0f87..c310942 100644 --- a/configure.in +++ b/configure.in @@ -104,6 +104,8 @@ AC_ARG_ENABLE(mmx,[ --enable-mmx attempt compiling using mmx assembly [ if test x$enableval = xyes; then mmx=yes + # Cannot compile with both options enabled + amd64=no AC_MSG_RESULT(enabling mmx support) else mmx=no diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am index fb896c7..d191a04 100644 --- a/src/lib/Makefile.am +++ b/src/lib/Makefile.am @@ -75,7 +75,8 @@ asm_rotate.S \ asm_scale.S AMD64_SRCS = \ -amd64_blend.S +amd64_blend.S \ +amd64_blend_cmod.S MMX_OBJS = $(MMX_SRCS:.S=.lo) AMD64_OBJS = $(AMD64_SRCS:.S=.lo) diff --git a/src/lib/amd64_blend_cmod.S b/src/lib/amd64_blend_cmod.S new file mode 100644 index 0000000..46a95f6 --- /dev/null +++ b/src/lib/amd64_blend_cmod.S @@ -0,0 +1,16825 @@ +#include <config.h> + +#ifdef __EMX__ +/* Due to strange behaviour of as.exe we use these macros: PR_ names a symbol, PT_ builds the .type operands, SIZE emits the end label, .size and .align 16 */ +/* For all OS/2 coders - please use PGCC to compile this code */ +#define PR_(foo) ___##foo +#define PT_(foo,func) ___##foo,##func +#define SIZE(sym) \ + .___end_##sym:; \ + .size ___##sym,.___end_##sym-___##sym; \ + .align 16; +#else +#define PR_(foo) __##foo +#define PT_(foo,func) __##foo,##func +#define SIZE(sym) \ + .__end_##sym:; \ + .size __##sym,.__end_##sym-__##sym; \ + .align 16; +#endif + +#ifdef DO_AMD64_ASM + +/*\ +|*| AMD64 SSE2 assembly blending routines for Imlib2 +|*| Written by John Slaten +|*| Based on MMX routines written by Willem Monsuwe +\*/ + +/*\ All functions have the 
same calling convention: +|*| __imlib_amd64__rgba_to_rgb[A](void *src, int sw, void *dst, int dw, +|*| int w, int h, ImlibColorModifier *cm) +|*| AMD64 GCC passes parameters by register (cm, the seventh, arrives on the stack), so no aliases exist in this version. +\*/ + +.text + .align 16 +.globl PR_(imlib_amd64_blend_rgba_to_rgb_cmod) + .type PT_(imlib_amd64_blend_rgba_to_rgb_cmod,@function) +.globl PR_(imlib_amd64_blend_rgba_to_rgba_cmod) + .type PT_(imlib_amd64_blend_rgba_to_rgba_cmod,@function) +.globl PR_(imlib_amd64_blend_rgb_to_rgba_cmod) + .type PT_(imlib_amd64_blend_rgb_to_rgba_cmod,@function) +.globl PR_(imlib_amd64_blend_rgb_to_rgb_cmod) + .type PT_(imlib_amd64_blend_rgb_to_rgb_cmod,@function) +.globl PR_(imlib_amd64_copy_rgba_to_rgb_cmod) + .type PT_(imlib_amd64_copy_rgba_to_rgb_cmod,@function) +.globl PR_(imlib_amd64_copy_rgba_to_rgba_cmod) + .type PT_(imlib_amd64_copy_rgba_to_rgba_cmod,@function) +.globl PR_(imlib_amd64_copy_rgb_to_rgba_cmod) + .type PT_(imlib_amd64_copy_rgb_to_rgba_cmod,@function) + +.globl PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod) + .type PT_(imlib_amd64_add_blend_rgba_to_rgb_cmod,@function) +.globl PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod) + .type PT_(imlib_amd64_add_blend_rgba_to_rgba_cmod,@function) +.globl PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod) + .type PT_(imlib_amd64_add_blend_rgb_to_rgba_cmod,@function) +.globl PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod) + .type PT_(imlib_amd64_add_blend_rgb_to_rgb_cmod,@function) +.globl PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod) + .type PT_(imlib_amd64_add_copy_rgba_to_rgb_cmod,@function) +.globl PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod) + .type PT_(imlib_amd64_add_copy_rgba_to_rgba_cmod,@function) +.globl PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod) + .type PT_(imlib_amd64_add_copy_rgb_to_rgba_cmod,@function) + +.globl PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod) + .type PT_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod,@function) +.globl PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod) + .type 
PT_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod,@function) +.globl PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod) + .type PT_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod,@function) +.globl PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod) + .type PT_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod,@function) +.globl PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod) + .type PT_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod,@function) +.globl PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod) + .type PT_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod,@function) +.globl PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod) + .type PT_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod,@function) + +.globl PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod) + .type PT_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod,@function) +.globl PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod) + .type PT_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod,@function) +.globl PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod) + .type PT_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod,@function) +.globl PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod) + .type PT_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod,@function) +.globl PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod) + .type PT_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod,@function) +.globl PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod) + .type PT_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod,@function) +.globl PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod) + .type PT_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod,@function) + +.extern pow_lut + +/*\ Some useful masks \*/ +m0X000000: .byte 0, 0, 0, 0, 0, 0, 255, 0 + .byte 0, 0, 0, 0, 0, 0, 255, 0 +m10000000: .byte 0, 0, 0, 0, 0, 0, 0, 1 + .byte 0, 0, 0, 0, 0, 0, 0, 1 +m00XXXXXX: .byte 255, 255, 255, 255, 255, 255, 0, 0 + .byte 255, 255, 255, 255, 255, 255, 0, 0 +mVX000000: .byte 0, 0, 0, 0, 0, 0, 255, 127 + .byte 0, 0, 0, 0, 0, 0, 255, 127 +mV0000000: .byte 0, 0, 0, 0, 0, 0, 0, 128 + .byte 0, 0, 0, 0, 0, 0, 0, 128 +mX000X000: .byte 0, 0, 
0, 0, 0, 0, 255, 255 + .byte 0, 0, 0, 0, 0, 0, 255, 255 +m0XXX0XXX0XXX0XXX: .byte 255, 255, 255, 0, 255, 255, 255, 0 + .byte 255, 255, 255, 0, 255, 255, 255, 0 +m0XXX0XXX00000000: .byte 255, 255, 255, 0, 255, 255, 255, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0 +m0XXX000000000000: .byte 255, 255, 255, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0 +mX000X000X000X000: .byte 0, 0, 0, 255, 0, 0, 0, 255 + .byte 0, 0, 0, 255, 0, 0, 0, 255 +mX000X00000000000: .byte 0, 0, 0, 255, 0, 0, 0, 255 /* NOTE(review): bytes identical to mX000X000X000X000; compare m0XXX0XXX00000000, whose second row is zero - confirm intended */ + .byte 0, 0, 0, 255, 0, 0, 0, 255 +mX000000000000000: .byte 0, 0, 0, 255, 0, 0, 0, 255 /* NOTE(review): also identical to mX000X000X000X000 - confirm intended */ + .byte 0, 0, 0, 255, 0, 0, 0, 255 +m1000100010001000: .byte 0, 0, 0, 1, 0, 0, 0, 1 + .byte 0, 0, 0, 1, 0, 0, 0, 1 +m000V0V0V000V0V0V: .byte 127, 0, 127, 0, 127, 0, 0, 0 + .byte 127, 0, 127, 0, 127, 0, 0, 0 +mI0000000I0000000: .byte 0, 0, 0, 0, 0, 0, 0, 64 + .byte 0, 0, 0, 0, 0, 0, 0, 64 +m0VVV0VVV0VVV0VVV: .byte 127, 127, 127, 0, 127, 127, 127, 0 + .byte 127, 127, 127, 0, 127, 127, 127, 0 +c1: .word 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1 + +/*\ SSE register use: +|*| %xmm1 = Source value +|*| %xmm2 = Destination value +|*| %xmm3 = Alpha value +|*| %xmm4 = 0 +|*| %xmm5-%xmm7 = masks +\*/ + +/*\ Variables (as set up by ENTER): +|*| %rsi = src +|*| %rdi = dst +|*| %r8 = w +|*| %r9 = h +|*| %r10 = sw +|*| %r11 = dw +\*/ +/*\ ENTER / LEAVE: prologue and epilogue shared by the routines below +|*| ENTER saves %rbp, %rbx, %r13 and %r14, moves src/dst into %rsi/%rdi, +|*| sign-extends the int arguments sw, dw, w and h to 64 bits (the upper +|*| register halves of int arguments are not guaranteed by the ABI) and +|*| loads cm, the first stack argument, into %r14. +|*| A non-positive w or h jumps to local label 9 (placed right before LEAVE). +|*| LEAVE restores the saved registers and returns. +\*/ + +#define ENTER \ + pushq %rbp ; \ + movq %rsp, %rbp ; \ + pushq %rbx ; \ + pushq %r13 ; \ + pushq %r14 ; \ + movslq %esi, %r10 ; \ + movslq %ecx, %r11 ; \ + movq %rdi, %rsi ; \ + movq %rdx, %rdi ; \ + movq 16(%rbp), %r14 ; \ + movslq %r8d, %r8 ; movslq %r9d, %r9 ; \ + /* param sanity check: nothing to do unless w > 0 and h > 0 */ ; \ + testq %r8, %r8 ; \ + jle 9f ; \ + testq %r9, %r9 ; \ + jle 9f + +#define LEAVE \ + popq %r14 ; \ + popq %r13 ; \ + popq %rbx ; \ + movq %rbp, %rsp ; \ + popq %rbp ; \ + ret + + +PR_(imlib_amd64_blend_rgba_to_rgb_cmod): + ENTER + + pxor %xmm4, %xmm4 + movdqa c1(%rip), %xmm5 + movdqa m00XXXXXX(%rip), %xmm6 + + /* Walk each line with a negative index that counts up to 0, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), 
%rdi + + /* Step back one pixel so the last pixel of a row sits at index %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 /* %r8 = -w */ +0: + movq %r8, %rcx + + incq %rcx /* %rcx = 1 - w, index of the first pixel; sets ZF for the jz below */ + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line */ +1: + /* main loop, unrolled to work on 64 byte chunks */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod: each byte, highest first, is replaced through one of the 256-byte tables at 0x300/0x000/0x100/0x200 off %r14 */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result range is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b /* index still negative: at least two pixels remain, run another unrolled pass */ + jnz 3f /* index positive: the row is finished */ +2: + /* Index is 0, exactly one pixel left: grab 1 pixel from src, with colormod */ + movl (%rsi, %rcx, 4), %eax + ror $24, %eax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result range is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: /* row done: advance src by sw pixels and dst by dw pixels */ + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 + jnz 0b /* repeat until the row counter in %r9 reaches 0 */ + +9: + LEAVE +SIZE(imlib_amd64_blend_rgba_to_rgb_cmod) +PR_(imlib_amd64_blend_rgba_to_rgba_cmod): + ENTER + + pxor %xmm4, %xmm4 + movdqa c1(%rip), %xmm5 + xorq %rax, %rax + movdqa mX000X000X000X000(%rip), %xmm6 + movq pow_lut@GOTPCREL(%rip), %r13 + + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line */ +1: + /* main loop, unrolled to work on 64 byte chunks */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* 
Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* override source alpha to 255 */ + por %xmm6, %xmm1 + + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* unpack source and dest */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll 
$16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* override source alpha to 255 */ + por %xmm6, %xmm1 + + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* unpack source and dest */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* override source alpha to 255 */ + por %xmm6, %xmm1 + + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, 
%xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* unpack source and dest */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* override source alpha to 255 */ + por %xmm6, %xmm1 + + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* unpack source and dest */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + 
+ /* repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* override source alpha to 255 */ + por %xmm6, %xmm1 + + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* unpack source and dest */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, 
%rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* override source alpha to 255 */ + por %xmm6, %xmm1 + + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* unpack source and dest */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 
0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* override source alpha to 255 */ + por %xmm6, %xmm1 + + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* unpack source and dest */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq 
(%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* override source alpha to 255 */ + por %xmm6, %xmm1 + + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* unpack source and dest */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b + jnz 3f +2: + /* Grab 1 pixel from src, with colormod */ + movl (%rsi, %rcx, 4), %eax + ror $24, %eax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + roll $16, %edx + andl $0x0000ff00, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + movd %eax, %xmm3 + /* override source alpha to 255 */ + por %xmm6, %xmm1 + + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* unpack source and dest */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 
+ psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* repack new pixels */ + packuswb %xmm4, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 + jnz 0b + +9: + LEAVE +SIZE(imlib_amd64_blend_rgba_to_rgba_cmod) +PR_(imlib_amd64_blend_rgb_to_rgba_cmod): /* Blend colour-modified source pixels whose own top byte is ignored onto a destination that carries alpha. Each source colour byte is mapped through the 256-entry tables at 0x000 / 0x100 / 0x200 off %r14, and the source alpha is taken as entry 255 of the table at 0x300. That alpha together with the destination alpha byte indexes pow_lut to give the weight applied to the three colour lanes; the alpha lane is blended towards 255 weighted by the source alpha itself. NOTE(review): ENTER / LEAVE are defined outside this excerpt; this body assumes they leave the source pointer in %rsi, the destination pointer in %rdi, the pixels-per-row count in %r8, the row count in %r9, the per-row pointer advances (in pixels) in %r10 / %r11 and the colour modifier tables in %r14 - confirm against the macro. */ + ENTER + + pxor %xmm4, %xmm4 + movdqa c1(%rip), %xmm5 + xorq %rax, %rax + movdqa mX000X000X000X000(%rip), %xmm6 + movq pow_lut@GOTPCREL(%rip), %r13 /* %r13 = address of pow_lut; indexed below with (source alpha << 8) | destination alpha */ + + /* Point %rsi / %rdi just past the end of the first line; a negative */ + /* index in %rcx then counts up towards zero, two pixels at a time */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Step back one pixel so the last pixel of a line sits at index %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 /* %r8 = -(pixels per row) */ +0: /* start of one row */ + movq %r8, %rcx + + incq %rcx + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line (ZF is still the one set by the incq above; the prefetches leave flags alone) */ +1: + /* main loop, unrolled eight times (two pixels per copy) to work on 64 byte chunks; the copies are identical apart from the final branch pair */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax /* %al = byte 6 of the pair, i.e. bits 16-23 of the second pixel; it is converted first so it ends up in the high half of %rdx */ + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax /* rotate past the first pixel's top byte, which is never read; %al = bits 16-23 of the first pixel */ + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), 
%dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* override source alpha to 255 */ + por %xmm6, %xmm1 + + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* unpack source and dest */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f /* exactly one pixel left in this row */ + jns 3f /* index went positive: no pixels left */ + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* override source alpha to 255 */ + por 
%xmm6, %xmm1 + + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* unpack source and dest */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* override source alpha to 255 */ + por %xmm6, %xmm1 + + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* unpack source and dest */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s 
- d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* override source alpha to 255 */ + por %xmm6, %xmm1 + + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* unpack source and dest */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, 
with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* override source alpha to 255 */ + por %xmm6, %xmm1 + + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* unpack source and dest */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), 
%dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* override source alpha to 255 */ + por %xmm6, %xmm1 + + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* unpack source and dest */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl 
+ movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* override source alpha to 255 */ + por %xmm6, %xmm1 + + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* unpack source and dest */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl 
$0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* override source alpha to 255 */ + por %xmm6, %xmm1 + + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* unpack source and dest */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b /* index still negative: at least two pixels left, run the unrolled body again */ + jnz 3f /* index positive: row done; zero falls through to handle the last pixel */ +2: + /* Grab 1 pixel from src, with colormod, with a = amod[255] */ + movl (%rsi, %rcx, 4), %eax + ror $16, %eax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + roll $16, %edx + andl $0x0000ff00, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + movd %eax, %xmm3 + /* override source alpha to 255 */ + por %xmm6, %xmm1 + + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* unpack source and dest */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* repack new pixels */ + packuswb 
%xmm4, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: /* row finished: advance the source by %r10 pixels and the destination by %r11 pixels, decrement the row counter in %r9 and repeat until it reaches zero */ + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 + jnz 0b + +9: + LEAVE +SIZE(imlib_amd64_blend_rgb_to_rgba_cmod) +PR_(imlib_amd64_blend_rgb_to_rgb_cmod): + ENTER + + pxor %xmm4, %xmm4 + movdqa c1(%rip), %xmm5 + movdqa m00XXXXXX(%rip), %xmm6 + + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line */ +1: + /* main loop, unrolled to work on 64 byte chunks */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. 
+ */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. + */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * ((s - d) + 0.5)) */ + psubw %xmm2, %xmm1 + psllw $1, %xmm1 + paddw %xmm5, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b + jnz 3f +2: + /* Grab 1 pixel from src, with colormod, with a = amod[255] */ + movl (%rsi, %rcx, 4), %eax + ror $16, %eax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+	 */
+	pand %xmm6, %xmm3
+
+	/* Unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * ((s - d) + 0.5)) */
+	psubw %xmm2, %xmm1
+	psllw $1, %xmm1
+	paddw %xmm5, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* Repack new pixels */
+	packuswb %xmm4, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_blend_rgb_to_rgb_cmod)
+/*
+ * imlib_amd64_copy_rgba_to_rgb_cmod
+ *
+ * Writes colour-modified source pixels over the destination while keeping
+ * the destination bits selected by the mX000X000X000X000 mask (defined
+ * outside this view; presumably byte 3, the alpha byte, of every pixel --
+ * TODO confirm).
+ * Per pixel: source byte 2 indexes the table at 0x000(%r14), byte 1 the
+ * table at 0x100(%r14) and byte 0 the table at 0x200(%r14); source byte 3
+ * is never looked up.
+ *
+ * Register roles as used below (set up by ENTER, which is not visible
+ * here -- TODO confirm): %rsi = src, %rdi = dst, %r8 = width in pixels,
+ * %r9 = row count, %r10/%r11 = per-row pointer step (in pixels) for
+ * src/dst, %r14 = base of the byte lookup tables.
+ *
+ * Loop shape: %rsi/%rdi are moved to the last pixel of the row and %rcx
+ * runs from -(width-1) up to 0.  Label 1 handles two pixels per step and
+ * is unrolled eight times; label 2 handles a single trailing pixel;
+ * label 3 advances both pointers to the next row.
+ * NOTE(review): although the comment below says "right to left", the
+ * index counts upwards, so addresses are visited in ascending order.
+ */
+PR_(imlib_amd64_copy_rgba_to_rgb_cmod):
+	ENTER
+
+	/* %r13 = mask and-ed with the destination before merging (see above) */
+	movq mX000X000X000X000(%rip), %r13
+
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	/* %r8 = -width, reloaded into %rcx at the start of every row */
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	/* %rcx = -(width - 1); it is zero only for a one pixel wide row */
+	incq %rcx
+
+	/* prefetch a couple cache lines ahead */
+	/* (the prefetches sit between incq and jz; the jz below still tests
+	 * the result of incq, as prefetch instructions do not modify flags) */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */
+1:
+	/* main loop, unrolled to work on 64 byte chunks */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod, with a = 0 */
+	movq (%rsi, %rcx, 4), %rax
+	/* bring byte 6 of the qword (byte 2 of the upper pixel) into %al */
+	rorq $48, %rax
+	movzbq %al, %rbx
+	movzbq 0x000(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	/* only %bl changes; the rest of %rbx stays zero from the movzbq */
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	/* shift by 16 so a zero byte is left in the result where byte 3 of
+	 * the lower pixel belongs, and rotate by 16 to step over byte 3 of
+	 * the lower source pixel */
+	shlq $16, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	/* merge in the destination bits selected by the %r13 mask */
+	movq (%rdi, %rcx, 4), %rax
+	andq %r13, %rax
+	orq %rax, %rdx
+	movq %rdx, (%rdi, %rcx, 4)
+
+	/* two pixels done: %rcx == 0 means exactly one pixel is left,
+	 * %rcx > 0 means the row is finished */
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod, with a = 0 */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movzbq %al, %rbx
+	movzbq 0x000(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $16, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq (%rdi, %rcx, 4), %rax
+	andq %r13, %rax
+	orq %rax, %rdx
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod, with a = 0 */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movzbq %al, %rbx
+	movzbq 0x000(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $16, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq (%rdi, %rcx, 4), %rax
+	andq %r13, %rax
+	orq %rax, %rdx
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod, with a = 0 */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movzbq %al, %rbx
+	movzbq 0x000(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $16, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq (%rdi, %rcx, 4), %rax
+	andq %r13, %rax
+	orq %rax, %rdx
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod, with a = 0 */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movzbq %al, %rbx
+	movzbq 0x000(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $16, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq (%rdi, %rcx, 4), %rax
+	andq %r13, %rax
+	orq %rax, %rdx
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod, with a = 0 */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movzbq %al, %rbx
+	movzbq 0x000(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $16, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq (%rdi, %rcx, 4), %rax
+	andq %r13, %rax
+	orq %rax, %rdx
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod, with a = 0 */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movzbq %al, %rbx
+	movzbq 0x000(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $16, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq (%rdi, %rcx, 4), %rax
+	andq %r13, %rax
+	orq %rax, %rdx
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod, with a = 0 */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movzbq %al, %rbx
+	movzbq 0x000(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $16, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq (%rdi, %rcx, 4), %rax
+	andq %r13, %rax
+	orq %rax, %rdx
+	movq %rdx, (%rdi, %rcx, 4)
+
+	/* end of the unrolled body: loop again while %rcx is still negative,
+	 * fall through to 2 when it is exactly 0, skip to 3 when positive */
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
+2:
+	/* Grab 1 pixel from src, with colormod, with a = 0 */
+	movl (%rsi, %rcx, 4), %eax
+	ror $16, %eax
+	movzbq %al, %rbx
+	movzbq 0x000(%r14, %rbx), %rdx
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	/* the 32-bit load clears the upper half of %rax, so the 64-bit
+	 * and/or below only affect the low dword that is stored */
+	movl (%rdi, %rcx, 4), %eax
+	andq %r13, %rax
+	orq %rax, %rdx
+	movl %edx, (%rdi, %rcx, 4)
+3:
+	/* next row: step both row pointers, count the row down */
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_copy_rgba_to_rgb_cmod)
+/*
+ * imlib_amd64_copy_rgba_to_rgba_cmod
+ *
+ * Overwrites every destination pixel with the colour-modified source
+ * pixel.  Per pixel: source byte 3 indexes the table at 0x300(%r14),
+ * byte 2 the table at 0x000(%r14), byte 1 the table at 0x100(%r14) and
+ * byte 0 the table at 0x200(%r14).  The destination is never read.
+ *
+ * Register roles as used below (set up by ENTER, which is not visible
+ * here -- TODO confirm): %rsi = src, %rdi = dst, %r8 = width in pixels,
+ * %r9 = row count, %r10/%r11 = per-row pointer step (in pixels) for
+ * src/dst, %r14 = base of the byte lookup tables.
+ *
+ * Loop shape: %rsi/%rdi are moved to the last pixel of the row and %rcx
+ * runs from -(width-1) up to 0.  Label 1 handles two pixels per step and
+ * is unrolled eight times; label 2 handles a single trailing pixel;
+ * label 3 advances both pointers to the next row.
+ * NOTE(review): although the comment below says "right to left", the
+ * index counts upwards, so addresses are visited in ascending order.
+ */
+PR_(imlib_amd64_copy_rgba_to_rgba_cmod):
+	ENTER
+
+
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	/* %r8 = -width, reloaded into %rcx at the start of every row */
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	/* %rcx = -(width - 1); it is zero only for a one pixel wide row */
+	incq %rcx
+
+	/* prefetch a couple cache lines ahead */
+	/* (the prefetches sit between incq and jz; the jz below still tests
+	 * the result of incq, as prefetch instructions do not modify flags) */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */
+1:
+	/* main loop, unrolled to work on 64 byte chunks */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	/* bring byte 7 of the qword (byte 3 of the upper pixel) into %al;
+	 * each following rolq $8 exposes the next lower source byte */
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	/* only %bl changes; the rest of %rbx stays zero from the movzbq */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	/* two pixels done: %rcx == 0 means exactly one pixel is left,
+	 * %rcx > 0 means the row is finished */
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	/* end of the unrolled body: loop again while %rcx is still negative,
+	 * fall through to 2 when it is exactly 0, skip to 3 when positive */
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
+2:
+	/* Grab 1 pixel from src, with colormod */
+	movl (%rsi, %rcx, 4), %eax
+	ror $24, %eax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movl %edx, (%rdi, %rcx, 4)
+3:
+	/* next row: step both row pointers, count the row down */
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_copy_rgba_to_rgba_cmod)
+/*
+ * imlib_amd64_copy_rgb_to_rgba_cmod
+ *
+ * Overwrites every destination pixel with the colour-modified source
+ * pixel, using a fixed value for byte 3: entry 255 of the table at
+ * 0x300(%r14) ("a = amod[255]" in the comments below).  Source byte 2
+ * indexes the table at 0x000(%r14), byte 1 the table at 0x100(%r14) and
+ * byte 0 the table at 0x200(%r14); source byte 3 is never looked up.
+ * The destination is never read.
+ *
+ * Register roles as used below (set up by ENTER, which is not visible
+ * here -- TODO confirm): %rsi = src, %rdi = dst, %r8 = width in pixels,
+ * %r9 = row count, %r10/%r11 = per-row pointer step (in pixels) for
+ * src/dst, %r14 = base of the byte lookup tables.
+ *
+ * Loop shape: %rsi/%rdi are moved to the last pixel of the row and %rcx
+ * runs from -(width-1) up to 0.  Label 1 handles two pixels per step and
+ * is unrolled eight times; label 2 handles a single trailing pixel;
+ * label 3 advances both pointers to the next row.
+ * NOTE(review): although the comment below says "right to left", the
+ * index counts upwards, so addresses are visited in ascending order.
+ */
+PR_(imlib_amd64_copy_rgb_to_rgba_cmod):
+	ENTER
+
+
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	/* %r8 = -width, reloaded into %rcx at the start of every row */
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	/* %rcx = -(width - 1); it is zero only for a one pixel wide row */
+	incq %rcx
+
+	/* prefetch a couple cache lines ahead */
+	/* (the prefetches sit between incq and jz; the jz below still tests
+	 * the result of incq, as prefetch instructions do not modify flags) */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */
+1:
+	/* main loop, unrolled to work on 64 byte chunks */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	/* bring byte 6 of the qword (byte 2 of the upper pixel) into %al */
+	rorq $48, %rax
+	/* index 255 into the table at 0x300 supplies byte 3 of the result */
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	/* only %bl changes; the rest of %rbx stays zero from the movq */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	/* same fixed byte 3 for the lower pixel; the 32-bit move also
+	 * clears the upper half of %rbx */
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	/* rotate by 16 to step over byte 3 of the lower source pixel */
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	/* two pixels done: %rcx == 0 means exactly one pixel is left,
+	 * %rcx > 0 means the row is finished */
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movq %rdx, (%rdi, %rcx, 4)
+
+	/* end of the unrolled body: loop again while %rcx is still negative,
+	 * fall through to 2 when it is exactly 0, skip to 3 when positive */
+	incq %rcx
+	incq %rcx
+	js 1b
+	jnz 3f
+2:
+	/* Grab 1 pixel from src, with colormod, with a = amod[255] */
+	movl (%rsi, %rcx, 4), %eax
+	ror $16, %eax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movl %edx, (%rdi, %rcx, 4)
+3:
+	/* next row: step both row pointers, count the row down */
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_copy_rgb_to_rgba_cmod)
+PR_(imlib_amd64_add_blend_rgba_to_rgb_cmod):
+	ENTER
+
+	pxor %xmm4, %xmm4
+	movdqa m00XXXXXX(%rip), %xmm6
+
+	/* Move right to left across each line, */
+	/* processing in two pixel
chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line */ +1: + /* main loop, unrolled to work on 64 byte chunks */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * s) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * s) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * s) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * s) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * s) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * s) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * s) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * s) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b + jnz 3f +2: + /* Grab 1 pixel from src, with colormod */ + movl (%rsi, %rcx, 4), %eax + ror $24, %eax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+	 */
+	pand %xmm6, %xmm3
+
+	/* unpack src and dst to words */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (a * s) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)
+3:
+	leaq (%rsi, %r10, 4), %rsi
+	leaq (%rdi, %r11, 4), %rdi
+	decq %r9
+	jnz 0b
+
+9:
+	LEAVE
+SIZE(imlib_amd64_add_blend_rgba_to_rgb_cmod)
+/* Additive blend with colour modifier, RGBA source onto RGBA destination: per pixel, d = d + (s * ca). */
+PR_(imlib_amd64_add_blend_rgba_to_rgba_cmod):
+	ENTER	/* macro defined outside this chunk; NOTE(review): presumably loads %r10, %r11 and %r14 - confirm */
+	/* As used below: %rsi/%rdi = src/dst cursors, %r8 = pixels per row, %r9 = rows left, %r14 = cmod tables. */
+	pxor %xmm4, %xmm4	/* %xmm4 = 0, the zero operand for the byte/word (un)packing below */
+	movdqa c1(%rip), %xmm5	/* NOTE(review): %xmm5 is not read again in this function */
+	xorq %rax, %rax	/* %rax = 0; it is reloaded from src before its first use */
+	movdqa mX000X000X000X000(%rip), %xmm6	/* mask defined elsewhere; presumably 0xff in byte 3 of each pixel - confirm */
+	movq pow_lut@GOTPCREL(%rip), %r13	/* %r13 = address of pow_lut, fetched through the GOT */
+
+	/* Point src and dst at the last pixel of the row; each pixel is */
+	/* then addressed with a negative index in %rcx counting up to 0 */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Back up one pixel so that index %rcx = 0 is the last pixel */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8	/* %r8 = -(pixels per row) */
+0:
+	movq %r8, %rcx
+
+	incq %rcx	/* %rcx = 1 - width = index of the first pixel; ZF set when the row is one pixel wide */
+
+	/* prefetch a couple cache lines ahead */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line (ZF still comes from the incq above) */
+1:
+	/* main loop, unrolled to work on 64 byte chunks */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 1 of 8) */
+	movq (%rsi, %rcx, 4), %rax	/* %rax = two adjacent 32-bit source pixels */
+	rorq $56, %rax	/* same as a left rotate by 8: byte 3 of the second pixel is now in %al */
+	movzbq %al, %rbx	/* zero-extended, so later writes to %bl leave %rbx a clean 0..255 index */
+	movzbq 0x300(%r14, %rbx), %rdx	/* alpha byte through the table at 0x300; %rdx collects the result */
+	shlq $8, %rdx	/* make room for the next looked-up byte */
+	rolq $8, %rax	/* bring the next lower source byte into %al */
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl	/* byte 2 through the table at 0x000 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl	/* byte 1 through the table at 0x100 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl	/* byte 0 through the table at 0x200 */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl	/* first pixel: same four tables, alpha byte first */
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1	/* %xmm1 = both colour-modified source pixels */
+	movq (%rdi, %rcx, 4), %xmm2	/* %xmm2 = the two destination pixels */
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending: lookup index = (cmod src alpha << 8) | byte 3 of dst */
+	movq %rdx, %rax	/* work on a copy of both modified pixels */
+	andl $0xff000000, %edx	/* keep the first pixel's alpha; the 32-bit op also clears the upper half of %rdx */
+	roll $16, %edx	/* move it from bits 24-31 to bits 8-15 (%dh) */
+	movb 3(%rdi, %rcx, 4), %dl	/* low index byte = byte 3 of the first dst pixel */
+	movb (%r13, %rdx), %al	/* byte 0 of the copy = pow_lut entry */
+	movb %dh, %ah	/* byte 1 of the copy = cmod src alpha */
+
+	rolq $32, %rax	/* swap halves: the second pixel is now in %eax */
+	movl %eax, %edx
+
+	andl $0xff000000, %edx	/* same steps again for the second pixel */
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl	/* byte 3 of the second dst pixel */
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax	/* swap back to the original pixel order */
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3	/* duplicate every byte into a word (b becomes b * 0x101) */
+	pshufhw $0x40, %xmm3, %xmm3	/* $0x40 picks words 0,0,0,1: pow_lut value three times, then src alpha */
+	pshuflw $0x40, %xmm3, %xmm3
+	psrlw $1, %xmm3	/* halve the factors so they stay non-negative for the signed pmulhw below */
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1	/* set the bytes selected by the %xmm6 mask in src */
+	pand %xmm6, %xmm0	/* keep only those bytes of dst */
+	psubusb %xmm0, %xmm1	/* unsigned saturating byte subtract */
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1	/* bytes to words, upper byte 0 from %xmm4 */
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1	/* s * 2, compensating for the psrlw applied to the factors */
+	pmulhw %xmm3, %xmm1	/* keep the high word of each signed 16x16 product */
+	paddsw %xmm1, %xmm2	/* signed saturating word add */
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2	/* words back to bytes with unsigned saturation */
+	movq %xmm2, (%rdi, %rcx, 4)	/* store both pixels */
+
+	incq %rcx
+	incq %rcx	/* index += 2; the two branches below test the flags of this second incq */
+	jz 2f	/* index == 0: exactly one pixel left */
+	jns 3f	/* index > 0: row finished */
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 2 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 3 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 4 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 5 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 6 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 7 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	jz 2f
+	jns 3f
+
+	/* Grab 2 pixels from src, with colormod (unrolled copy 8 of 8) */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $56, %rax
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movl %eax, %edx
+
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 7(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al
+	movb %dh, %ah
+
+	rolq $32, %rax
+	movd %rax, %xmm3
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movq %xmm2, (%rdi, %rcx, 4)
+
+	incq %rcx
+	incq %rcx
+	js 1b	/* index still negative: at least two pixels left, run the unrolled loop again */
+	jnz 3f	/* index > 0: row finished; index == 0 falls through to the single-pixel tail */
+2:
+	/* Grab 1 pixel from src, with colormod (same table order as above, 32-bit registers) */
+	movl (%rsi, %rcx, 4), %eax
+	ror $24, %eax	/* same as a left rotate by 8: byte 3 of the pixel is now in %al */
+	movzbq %al, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shl $8, %edx
+	rol $8, %eax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %edx, %xmm1	/* %xmm1 = the colour-modified source pixel */
+	movd (%rdi, %rcx, 4), %xmm2	/* %xmm2 = the destination pixel */
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending: lookup index = (cmod src alpha << 8) | byte 3 of dst */
+	roll $16, %edx	/* alpha moves from bits 24-31 to bits 8-15 */
+	andl $0x0000ff00, %edx	/* drop everything except that alpha byte */
+	movb 3(%rdi, %rcx, 4), %dl
+	movb (%r13, %rdx), %al	/* byte 0 = pow_lut entry */
+	movb %dh, %ah	/* byte 1 = cmod src alpha */
+	movd %eax, %xmm3	/* bytes 2-3 are leftovers of the source pixel; the $0x40 shuffles never select them */
+	/* unpack alpha to src alpha, combined alpha x 3 */
+	punpcklbw %xmm3, %xmm3
+	pshufhw $0x40, %xmm3, %xmm3
+	pshuflw $0x40, %xmm3, %xmm3
+	psrlw $1, %xmm3
+
+	/* src alpha = 255 - dst alpha */
+	movdqa %xmm2, %xmm0
+	por %xmm6, %xmm1
+	pand %xmm6, %xmm0
+	psubusb %xmm0, %xmm1
+
+	/* unpack src and dst */
+	punpcklbw %xmm4, %xmm1
+	punpcklbw %xmm4, %xmm2
+
+	/* d = d + (s * ca) */
+	psllw $1, %xmm1
+	pmulhw %xmm3, %xmm1
+	paddsw %xmm1, %xmm2
+
+	/* pack new pixels */
+	packuswb %xmm4, %xmm2
+	movd %xmm2, (%rdi, %rcx, 4)	/* store the single pixel */
+3:
+	leaq (%rsi, %r10, 4), %rsi	/* step src by %r10 pixels for the next row */
+	leaq (%rdi, %r11, 4), %rdi	/* step dst by %r11 pixels for the next row */
+	decq %r9	/* one row done */
+	jnz 0b	/* more rows: restart at the row setup */
+
+9:
+	LEAVE
+SIZE(imlib_amd64_add_blend_rgba_to_rgba_cmod)
+
+PR_(imlib_amd64_add_blend_rgb_to_rgba_cmod):
+	ENTER
+
+	pxor %xmm4, %xmm4
+	movdqa c1(%rip), %xmm5
+	xorq %rax, %rax
+	movdqa mX000X000X000X000(%rip), %xmm6
+	movq pow_lut@GOTPCREL(%rip), %r13
+
+	/* Move right to left across each line, */
+	/* processing in two pixel chunks */
+	leaq (%rsi, %r8, 4), %rsi
+	leaq (%rdi, %r8, 4), %rdi
+
+	/* Last instruction is %rcx = 0 */
+	subq $4, %rsi
+	subq $4, %rdi
+
+	negq %r8
+0:
+	movq %r8, %rcx
+
+	incq %rcx
+
+	/* prefetch a couple cache lines ahead */
+	prefetchnta (%rsi, %rcx, 4)
+	prefetcht0 (%rdi, %rcx, 4)
+	prefetchnta 64(%rsi, %rcx, 4)
+	prefetcht0 64(%rdi, %rcx, 4)
+
+	jz 2f /* one pixel line */
+1:
+	/* main loop, unrolled to work on 64 byte chunks */
+	prefetchnta 128(%rsi, %rcx, 4)
+	prefetcht0 128(%rdi, %rcx, 4)
+
+	/* Grab 2 pixels from src, with colormod, with a = amod[255] */
+	movq (%rsi, %rcx, 4), %rax
+	rorq $48, %rax
+	movq $0x000000FF, %rbx
+	movzbq 0x300(%r14, %rbx), %rdx
+	shlq $8, %rdx
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	shlq $8, %rdx
+	movl $0x000000FF, %ebx
+	movb 0x300(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $16, %rax
+	movb %al, %bl
+	movb 0x000(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x100(%r14, %rbx), %dl
+	shlq $8, %rdx
+	rolq $8, %rax
+	movb %al, %bl
+	movb 0x200(%r14, %rbx), %dl
+	movd %rdx, %xmm1
+	movq (%rdi, %rcx, 4), %xmm2
+
+	/* Convert the cmod alpha to the pow_lut alpha that will be used
+	 * for blending */
+	movq %rdx, %rax
+	andl $0xff000000, %edx
+	roll $16, %edx
+	movb 3(%rdi, %rcx, 4), %dl
+	movb
(%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + por %xmm6, %xmm1 + pand %xmm6, %xmm0 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (s * ca) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ 
+ punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + por %xmm6, %xmm1 + pand %xmm6, %xmm0 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (s * ca) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + por %xmm6, %xmm1 + pand %xmm6, %xmm0 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, 
%xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (s * ca) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + por %xmm6, %xmm1 + pand %xmm6, %xmm0 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (s * ca) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, 
with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + por %xmm6, %xmm1 + pand %xmm6, %xmm0 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (s * ca) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), 
%dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + por %xmm6, %xmm1 + pand %xmm6, %xmm0 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (s * ca) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, 
%bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + por %xmm6, %xmm1 + pand %xmm6, %xmm0 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (s * ca) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl 
$0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + por %xmm6, %xmm1 + pand %xmm6, %xmm0 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (s * ca) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b + jnz 3f +2: + /* Grab 1 pixel from src, with colormod, with a = amod[255] */ + movl (%rsi, %rcx, 4), %eax + ror $16, %eax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + roll $16, %edx + andl $0x0000ff00, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + movd %eax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + por %xmm6, %xmm1 + pand %xmm6, %xmm0 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (s * ca) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + 
packuswb %xmm4, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 + jnz 0b + +9: + LEAVE +SIZE(imlib_amd64_add_blend_rgb_to_rgba_cmod) + +PR_(imlib_amd64_add_blend_rgb_to_rgb_cmod): /* Additive blend of a colour-modified source whose own alpha is ignored: per colour channel d = d + a * s with a = amod[255], using a signed-saturating word add and an unsigned-saturating pack. Registers as used below: %rsi src, %rdi dst, %rcx pixel index, %r14 colour-modifier tables, %r8 pixels per line, %r9 lines left. NOTE(review): ENTER, LEAVE and the m00XXXXXX mask are defined outside this view; confirm the register set-up there. */ + ENTER + + pxor %xmm4, %xmm4 /* %xmm4 = 0, the high half when bytes are widened to words */ + movdqa m00XXXXXX(%rip), %xmm6 + + /* Bias src and dst by %r8 pixels so that a negative index in %rcx */ + /* can climb to 0; each line is processed in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Back up one pixel so that the last pixel sits at %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line (ZF comes from the incq above) */ +1: + /* main loop, unrolled to work on 64 byte chunks */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod, with a = amod[255] (src alpha bytes are skipped; the 0x300 table is read at index 0xFF) */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result range is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. 
+ */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. + */ + pand %xmm6, %xmm3 + + /* unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * s) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f /* exactly one pixel left */ + jns 3f /* line finished */ + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result range is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * s) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result range is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * s) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result range is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * s) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result range is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * s) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result range is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * s) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result range is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * s) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result range is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * s) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b /* two or more pixels left */ + jnz 3f /* line finished */ +2: /* single trailing pixel at %rcx = 0 */ + /* Grab 1 pixel from src, with colormod, with a = amod[255] */ + movl (%rsi, %rcx, 4), %eax + ror $16, %eax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result range is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (a * s) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 + jnz 0b + +9: + LEAVE +SIZE(imlib_amd64_add_blend_rgb_to_rgb_cmod) + +PR_(imlib_amd64_add_copy_rgba_to_rgb_cmod): /* Additive copy: each source byte, alpha included, goes through its colour-modifier table; the result is masked with %xmm5 and added to dst with unsigned byte saturation, d = d + (s & 0x00ffffff). Registers as used below: %rsi src, %rdi dst, %rcx pixel index, %r14 colour-modifier tables, %r8 pixels per line, %r9 lines left. NOTE(review): ENTER, LEAVE and the m0XXX0XXX0XXX0XXX mask are defined outside this view; confirm the register set-up there. */ + ENTER + + movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 + + /* Bias src and dst by %r8 pixels so that a negative index in %rcx */ + /* can climb to 0; each line is processed in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Back up one pixel so that the last pixel sits at %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line (ZF comes from the incq above) */ +1: + /* main loop, unrolled to work on 64 byte chunks */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod: each byte is rotated into %al, looked up in its table off %r14 and shifted into %rdx */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d + (s & 0x00ffffff), saturating per byte */ + pand %xmm5, %xmm1 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f /* exactly one pixel left */ + jns 3f /* line finished */ + + /* 
Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d + (s & 0x00ffffff) */ + pand %xmm5, %xmm1 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d + (s & 0x00ffffff) */ + pand %xmm5, %xmm1 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + 
shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d + (s & 0x00ffffff) */ + pand %xmm5, %xmm1 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d + (s & 0x00ffffff) */ + pand %xmm5, %xmm1 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 
0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d + (s & 0x00ffffff) */ + pand %xmm5, %xmm1 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d + (s & 0x00ffffff) */ + pand %xmm5, %xmm1 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb 
%al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d + (s & 0x00ffffff) */ + pand %xmm5, %xmm1 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b /* two or more pixels left */ + jnz 3f /* line finished */ +2: /* single trailing pixel at %rcx = 0 */ + /* Grab 1 pixel from src, with colormod */ + movl (%rsi, %rcx, 4), %eax + ror $24, %eax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + /* d = d + (s & 0x00ffffff) */ + pand %xmm5, %xmm1 + paddusb %xmm1, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: /* step src and dst by %r10 and %r11 pixels, loop while %r9 is non-zero */ + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 + jnz 0b + +9: + LEAVE +SIZE(imlib_amd64_add_copy_rgba_to_rgb_cmod) + +PR_(imlib_amd64_add_copy_rgba_to_rgba_cmod): + ENTER + + movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 + + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line */ +1: + /* main loop, unrolled to work on 64 byte chunks */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax 
+ movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = (d & 0x00ffffff) + s */ + pand %xmm5, %xmm2 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = (d & 0x00ffffff) + s */ + pand %xmm5, %xmm2 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + 
rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = (d & 0x00ffffff) + s */ + pand %xmm5, %xmm2 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = (d & 0x00ffffff) + s */ + pand %xmm5, %xmm2 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = (d & 0x00ffffff) + s */ + pand %xmm5, %xmm2 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + 
incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = (d & 0x00ffffff) + s */ + pand %xmm5, %xmm2 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = (d & 0x00ffffff) + s */ + pand %xmm5, %xmm2 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + 
movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = (d & 0x00ffffff) + s */ + pand %xmm5, %xmm2 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b + jnz 3f +2: + /* Grab 1 pixel from src, with colormod */ + movl (%rsi, %rcx, 4), %eax + ror $24, %eax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + /* d = (d & 0x00ffffff) + s */ + pand %xmm5, %xmm2 + paddusb %xmm1, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 + jnz 0b + +9: + LEAVE +SIZE(imlib_amd64_add_copy_rgba_to_rgba_cmod) + /* imlib_amd64_add_copy_rgb_to_rgba_cmod: per pixel, the three low source bytes are mapped through the byte-indexed tables at 0x000, 0x100 and 0x200(%r14) (highest of the three bytes first), the top byte of the result is the fixed entry 255 of the table at 0x300(%r14), and the result s is merged into the destination as d = (d & %xmm5) plus s with unsigned byte saturation (paddusb). The top byte of each source pixel is loaded but never used as a table index. NOTE(review): ENTER, LEAVE and the mask constant are defined outside this chunk; presumably ENTER leaves src in %rsi, dst in %rdi, the pixel count per line in %r8, the line count in %r9, the per-line pointer steps in %r10/%r11 and the colour modifier tables in %r14 - confirm. */ +PR_(imlib_amd64_add_copy_rgb_to_rgba_cmod): + ENTER + + movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 + + /* Point both pointers at the last pixel of the line and index back from it: */ + /* %rcx runs from -(n - 1) up to 0 (n = %r8 before negq), lowest address first, two pixels per step */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Back up one pixel so that index %rcx = 0 addresses the last pixel */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx /* sets ZF for the jz below; the prefetches in between leave the flags alone */ + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line */ +1: + /* main loop, unrolled to work on 64 byte chunks */ +
prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax /* %al = byte 2 of the higher-addressed pixel */ + movq $0x000000FF, %rbx /* fixed index 255 for the table at 0x300 */ + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax /* rotate past the lower pixel's top byte; it is never looked up */ + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + pand %xmm5, %xmm2 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f /* zero: exactly one pixel left, handled at 2; positive: line finished, go to 3 */ + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + pand %xmm5, %xmm2 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb
0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + pand %xmm5, %xmm2 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + pand %xmm5, %xmm2 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16,
%rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + pand %xmm5, %xmm2 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + pand %xmm5, %xmm2 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + pand %xmm5, %xmm2 +
paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + pand %xmm5, %xmm2 + paddusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b /* still negative: at least two pixels left, run the unrolled body again */ + jnz 3f +2: + /* Grab 1 pixel from src, with colormod, with a = amod[255] */ + movl (%rsi, %rcx, 4), %eax + ror $16, %eax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + pand %xmm5, %xmm2 + paddusb %xmm1, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: + leaq (%rsi, %r10, 4), %rsi /* step both pointers by %r10 / %r11 pixels; these values come from ENTER, not visible here */ + leaq (%rdi, %r11, 4), %rdi + decq %r9 + jnz 0b + +9: + LEAVE +SIZE(imlib_amd64_add_copy_rgb_to_rgba_cmod) + /* imlib_amd64_subtract_blend_rgba_to_rgb_cmod: per pixel, all four source bytes go through the tables at 0x300, 0x000, 0x100 and 0x200(%r14); each colour lane of the result is scaled by the looked-up alpha and subtracted from the destination (d = d - s * a), and the packed result is clamped to 0..255. %xmm4 is kept at zero for the byte-to-word unpacking and %xmm6 masks the multiplier. NOTE(review): ENTER, LEAVE and m00XXXXXX are defined outside this chunk; register roles after ENTER are presumably the same as in the sibling routines - confirm. */ +PR_(imlib_amd64_subtract_blend_rgba_to_rgb_cmod): + ENTER + + pxor %xmm4, %xmm4 + movdqa m00XXXXXX(%rip), %xmm6 + + /* Point both pointers at the last pixel of the line and index back from it: */ + /* %rcx runs from -(n - 1) up to 0 (n = %r8 before negq), lowest address first, two pixels per step */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Back up one pixel so that index %rcx = 0 addresses the last pixel */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx + + /* prefetch a
couple cache lines ahead */ /* Below: the unrolled body and line loop of imlib_amd64_subtract_blend_rgba_to_rgb_cmod. %rcx is a negative pixel index that counts up to 0; index 0 is the last pixel handled on a line (label 2). */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line */ +1: + /* main loop, unrolled to work on 64 byte chunks */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax /* %al = top (alpha) byte of the higher-addressed pixel */ + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result range is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged.
+ */ + pand %xmm6, %xmm3 /* the mask is applied to the multiplier in %xmm3, not to the pixel in %xmm1 */ + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - (s * a) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 /* unsigned-saturating pack: negative differences become 0 */ + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f /* zero: exactly one pixel left, handled at 2; positive: line finished, go to 3 */ + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result range is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged.
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - (s * a) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result range is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged.
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - (s * a) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result range is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged.
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - (s * a) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result range is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged.
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - (s * a) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result range is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged.
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - (s * a) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result range is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged.
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - (s * a) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result range is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged.
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - (s * a) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b /* still negative: at least two pixels left, run the unrolled body again */ + jnz 3f +2: + /* Grab 1 pixel from src, with colormod */ + movl (%rsi, %rcx, 4), %eax + ror $24, %eax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result range is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged.
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - (s * a) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: + leaq (%rsi, %r10, 4), %rsi /* step both pointers by %r10 / %r11 pixels; these values are set outside this chunk */ + leaq (%rdi, %r11, 4), %rdi + decq %r9 + jnz 0b + +9: + LEAVE +SIZE(imlib_amd64_subtract_blend_rgba_to_rgb_cmod) +PR_(imlib_amd64_subtract_blend_rgba_to_rgba_cmod): + ENTER + + movq pow_lut@GOTPCREL(%rip), %r13 + pxor %xmm4, %xmm4 + movdqa c1(%rip), %xmm5 + movdqa mX000X000X000X000(%rip), %xmm6 + movdqa mX000X000(%rip), %xmm7 + xorq %rax, %rax + + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line */ +1: + /* main loop, unrolled to work on 64 byte chunks */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert
the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - ((s * a) ^ 0xff000000) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + pxor %xmm7, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + 
andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - ((s * a) ^ 0xff000000) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + pxor %xmm7, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, 
%xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - ((s * a) ^ 0xff000000) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + pxor %xmm7, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + 
punpcklbw %xmm4, %xmm2 + + /* d = d - ((s * a) ^ 0xff000000) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + pxor %xmm7, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - ((s * a) ^ 0xff000000) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + pxor %xmm7, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq 
%rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - ((s * a) ^ 0xff000000) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + pxor %xmm7, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl 
+ shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - ((s * a) ^ 0xff000000) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + pxor %xmm7, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, 
%rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - ((s * a) ^ 0xff000000) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + pxor %xmm7, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b + jnz 3f +2: + /* Grab 1 pixel from src, with colormod */ + movl (%rsi, %rcx, 4), %eax + ror $24, %eax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + roll $16, %edx + andl $0x0000ff00, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + movd %eax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, 
%xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - ((s * a) ^ 0xff000000) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + pxor %xmm7, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 + jnz 0b + +9: + LEAVE +SIZE(imlib_amd64_subtract_blend_rgba_to_rgba_cmod) +PR_(imlib_amd64_subtract_blend_rgb_to_rgba_cmod): + ENTER + + movq pow_lut@GOTPCREL(%rip), %r13 + pxor %xmm4, %xmm4 + movdqa c1(%rip), %xmm5 + movdqa mX000X000X000X000(%rip), %xmm6 + movdqa mX000X000(%rip), %xmm7 + xorq %rax, %rax + + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line */ +1: + /* main loop, unrolled to work on 64 byte chunks */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq 
$8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - ((s * a) ^ 0xff000000) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + pxor %xmm7, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut 
alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - ((s * a) ^ 0xff000000) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + pxor %xmm7, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + 
roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - ((s * a) ^ 0xff000000) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + pxor %xmm7, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw 
$0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - ((s * a) ^ 0xff000000) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + pxor %xmm7, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + 
/* d = d - ((s * a) ^ 0xff000000) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + pxor %xmm7, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - ((s * a) ^ 0xff000000) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + pxor %xmm7, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f 
+ + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - ((s * a) ^ 0xff000000) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + pxor %xmm7, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq 
$8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + + rolq $32, %rax + movd %rax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - ((s * a) ^ 0xff000000) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + pxor %xmm7, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b + jnz 3f +2: + /* Grab 1 pixel from src, with colormod, with a = amod[255] */ + movl (%rsi, %rcx, 4), %eax + ror $16, %eax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut 
alpha that will be used + * for blending */ + roll $16, %edx + andl $0x0000ff00, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + movd %eax, %xmm3 + /* unpack alpha to src alpha, combined alpha x 3 */ + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* src alpha = 255 - dst alpha */ + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + /* unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - ((s * a) ^ 0xff000000) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + pxor %xmm7, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 + jnz 0b + +9: + LEAVE +SIZE(imlib_amd64_subtract_blend_rgb_to_rgba_cmod) +PR_(imlib_amd64_subtract_blend_rgb_to_rgb_cmod): + ENTER + + pxor %xmm4, %xmm4 + movdqa m00XXXXXX(%rip), %xmm6 + + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line */ +1: + /* main loop, unrolled to work on 64 byte chunks */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl 
$0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. + */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - (s * a) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result 
ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. + */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - (s * a) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. 
+ */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. + */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - (s * a) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - (s * a) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - (s * a) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - (s * a) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - (s * a) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - (s * a) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b + jnz 3f +2: + /* Grab 1 pixel from src, with colormod, with a = amod[255] */ + movl (%rsi, %rcx, 4), %eax + ror $16, %eax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + /* Get alpha from source and unpack to words + * Result ranges is [0, 0x7fff], and is mapped to + * point values in [0.0, 1.0) by using the high word + * of the 32 bit multiplication result. + * Because we want the unsigned value, we shift right one + * here and also shift left the other factors to compensate. + */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero out the alpha channel of the source to leave the + * destination alpha unchanged. 
+ */ + pand %xmm6, %xmm3 + + /* Unpack src and dst to words */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d - (s * a) */ + psllw $1, %xmm1 + pmulhw %xmm3, %xmm1 + psubsw %xmm1, %xmm2 + + /* pack new pixels */ + packuswb %xmm4, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 + jnz 0b + +9: + LEAVE +SIZE(imlib_amd64_subtract_blend_rgb_to_rgb_cmod) +PR_(imlib_amd64_subtract_copy_rgba_to_rgb_cmod): /* Subtractive copy of an RGBA source onto an RGB destination, with color modifier. + * Every source pixel is remapped one byte at a time through the lookup tables at + * %r14 (offset 0x300 for the top byte, then 0x000, 0x100 and 0x200 for the lower + * three), its top byte is masked off, and the result is subtracted from the + * destination with unsigned byte saturation (psubusb); the destination's top byte + * is therefore left as it was. + * Rows are handled in an 8x unrolled loop, two pixels per step (label 1), with a + * single-pixel tail (label 2) for rows of odd width; label 3 advances to the next row. + * NOTE(review): ENTER and LEAVE are macros defined outside this chunk. The code below + * assumes they leave %rsi = source, %rdi = destination, %r8 = width in pixels, + * %r9 = row count, %r10/%r11 = per-row pointer adjustments (in pixels) and + * %r14 = color modifier tables -- confirm against the macro definitions. + */ + ENTER + + movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 /* mask applied to the source below; constant defined elsewhere in the file */ + + /* Point %rsi/%rdi one pixel past the end of the row; each row is */ + /* then walked with a negative index in %rcx, two pixels per step */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Back up one pixel so that index %rcx = 0 is the last pixel of the row */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line: tests ZF left by the incq above, the prefetches in between do not change flags */ +1: + /* main loop, unrolled to work on 64 byte chunks */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d - (s & 0x00ffffff) */ + pand %xmm5, %xmm1 + psubusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f
+ + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d - (s & 0x00ffffff) */ + pand %xmm5, %xmm1 + psubusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d - (s & 0x00ffffff) */ + pand %xmm5, %xmm1 + psubusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), 
%dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d - (s & 0x00ffffff) */ + pand %xmm5, %xmm1 + psubusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d - (s & 0x00ffffff) */ + pand %xmm5, %xmm1 + psubusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 
0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d - (s & 0x00ffffff) */ + pand %xmm5, %xmm1 + psubusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d - (s & 0x00ffffff) */ + pand %xmm5, %xmm1 + psubusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb 
%al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d - (s & 0x00ffffff) */ + pand %xmm5, %xmm1 + psubusb %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b /* index still negative: at least two pixels left in this row */ + jnz 3f /* index past 0: row done; fall through when exactly one pixel remains */ +2: + /* Grab 1 pixel from src, with colormod */ + movl (%rsi, %rcx, 4), %eax + ror $24, %eax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + /* d = d - (s & 0x00ffffff) */ + pand %xmm5, %xmm1 + psubusb %xmm1, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: /* row finished: step both pointers by %r10/%r11 pixels and count the row off in %r9 */ + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 + jnz 0b + +9: + LEAVE +SIZE(imlib_amd64_subtract_copy_rgba_to_rgb_cmod) +PR_(imlib_amd64_subtract_copy_rgba_to_rgba_cmod): /* Subtractive copy of an RGBA source onto an RGBA destination, with color modifier. + * Every source pixel is remapped one byte at a time through the lookup tables at + * %r14 (offset 0x300 for the top byte, then 0x000, 0x100 and 0x200 for the lower + * three). The remapped pixel is subtracted from the destination with unsigned byte + * saturation (psubusb); the two masks are then used to combine the subtracted + * destination with the remapped source so that, per the comments below, the + * destination alpha becomes the source alpha. + * Rows are handled in an 8x unrolled loop, two pixels per step (label 1), with a + * single-pixel tail (label 2) for rows of odd width; label 3 advances to the next row. + * NOTE(review): ENTER and LEAVE are macros defined outside this chunk. The code below + * assumes they leave %rsi = source, %rdi = destination, %r8 = width in pixels, + * %r9 = row count, %r10/%r11 = per-row pointer adjustments (in pixels) and + * %r14 = color modifier tables -- confirm against the macro definitions. + */ + ENTER + + movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 /* mask applied to the subtracted destination below */ + movdqa mX000X000X000X000(%rip), %xmm6 /* mask applied to the remapped source below */ + + /* Point %rsi/%rdi one pixel past the end of the row; each row is */ + /* then walked with a negative index in %rcx, two pixels per step */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Back up one pixel so that index %rcx = 0 is the last pixel of the row */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line: tests ZF left by the incq above, the prefetches in between do not change flags */ +1: + /* main loop, unrolled to work on 64 byte chunks */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb
0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d - s, d alpha = s alpha */ + psubusb %xmm1, %xmm2 + pand %xmm6, %xmm1 + pand %xmm5, %xmm2 + por %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d - s, d alpha = s alpha */ + psubusb %xmm1, %xmm2 + pand %xmm6, %xmm1 + pand %xmm5, %xmm2 + por %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax 
+ movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d - s, d alpha = s alpha */ + psubusb %xmm1, %xmm2 + pand %xmm6, %xmm1 + pand %xmm5, %xmm2 + por %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d - s, d alpha = s alpha */ + psubusb %xmm1, %xmm2 + pand %xmm6, %xmm1 + pand %xmm5, %xmm2 + por %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq 
$8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d - s, d alpha = s alpha */ + psubusb %xmm1, %xmm2 + pand %xmm6, %xmm1 + pand %xmm5, %xmm2 + por %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d - s, d alpha = s alpha */ + psubusb %xmm1, %xmm2 + pand %xmm6, %xmm1 + pand %xmm5, %xmm2 + por %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d - 
s, d alpha = s alpha */ + psubusb %xmm1, %xmm2 + pand %xmm6, %xmm1 + pand %xmm5, %xmm2 + por %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = d - s, d alpha = s alpha */ + psubusb %xmm1, %xmm2 + pand %xmm6, %xmm1 + pand %xmm5, %xmm2 + por %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b /* index still negative: at least two pixels left in this row */ + jnz 3f /* index past 0: row done; fall through when exactly one pixel remains */ +2: + /* Grab 1 pixel from src, with colormod */ + movl (%rsi, %rcx, 4), %eax + ror $24, %eax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + /* d = d - s, d alpha = s alpha */ + psubusb %xmm1, %xmm2 + pand %xmm6, %xmm1 + pand %xmm5, %xmm2 + por %xmm1, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: /* row finished: step both pointers by %r10/%r11 pixels and count the row off in %r9 */ + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 + jnz 0b + +9: + LEAVE +SIZE(imlib_amd64_subtract_copy_rgba_to_rgba_cmod) +PR_(imlib_amd64_subtract_copy_rgb_to_rgba_cmod): /* Subtractive copy of an RGB source onto an RGBA destination, with color modifier. + * The source's own top byte is never looked up: the alpha used for every pixel is + * the table entry at 0x300 + 255 from %r14 (amod[255]); the lower three bytes are + * remapped through the tables at offsets 0x000, 0x100 and 0x200. The remapped pixel + * is subtracted from the destination with unsigned byte saturation (psubusb), and + * the two masks are then used to put the remapped alpha into the result. + * Rows are handled in an 8x unrolled loop, two pixels per step (label 1), with a + * single-pixel tail (label 2) for rows of odd width; label 3 advances to the next row. + * NOTE(review): ENTER and LEAVE are macros defined outside this chunk. The code below + * assumes they leave %rsi = source, %rdi = destination, %r8 = width in pixels, + * %r9 = row count, %r10/%r11 = per-row pointer adjustments (in pixels) and + * %r14 = color modifier tables -- confirm against the macro definitions. + */ + ENTER + + movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 /* mask applied to the subtracted destination below */ + movdqa mX000X000X000X000(%rip), %xmm6 /* mask applied to the remapped source below */ + + /* Point %rsi/%rdi one pixel past the end
of the row; each row is */ + /* then walked with a negative index in %rcx, two pixels per step */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Back up one pixel so that index %rcx = 0 is the last pixel of the row */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line: tests ZF left by the incq above, the prefetches in between do not change flags */ +1: + /* main loop, unrolled to work on 64 byte chunks */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = (d - s) */ + psubusb %xmm1, %xmm2 + + /* Take alpha from the remapped source (amod[255]), keep the subtracted color bytes */ + pand %xmm5, %xmm2 + pand %xmm6, %xmm1 + por %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16,
%rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = (d - s) */ + psubusb %xmm1, %xmm2 + + /* Preserve source alpha */ + pand %xmm5, %xmm2 + pand %xmm6, %xmm1 + por %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = (d - s) */ + psubusb %xmm1, %xmm2 + + /* Preserve source alpha */ + pand %xmm5, %xmm2 + pand %xmm6, %xmm1 + por %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, 
%bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = (d - s) */ + psubusb %xmm1, %xmm2 + + /* Preserve source alpha */ + pand %xmm5, %xmm2 + pand %xmm6, %xmm1 + por %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = (d - s) */ + psubusb %xmm1, %xmm2 + + /* Preserve source alpha */ + pand %xmm5, %xmm2 + pand %xmm6, %xmm1 + por %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 
0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = (d - s) */ + psubusb %xmm1, %xmm2 + + /* Preserve source alpha */ + pand %xmm5, %xmm2 + pand %xmm6, %xmm1 + por %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = (d - s) */ + psubusb %xmm1, %xmm2 + + /* Preserve source alpha */ + pand %xmm5, %xmm2 + pand %xmm6, %xmm1 + por %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* d = (d - s) */ + 
psubusb %xmm1, %xmm2 + + /* Preserve source alpha */ + pand %xmm5, %xmm2 + pand %xmm6, %xmm1 + por %xmm1, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b + jnz 3f +2: + /* Grab 1 pixel from src, with colormod, with a = amod[255] */ + movl (%rsi, %rcx, 4), %eax + ror $16, %eax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + /* d = (d - s) */ + psubusb %xmm1, %xmm2 + + /* Preserve source alpha */ + pand %xmm5, %xmm2 + pand %xmm6, %xmm1 + por %xmm1, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 + jnz 0b + +9: + LEAVE +SIZE(imlib_amd64_subtract_copy_rgb_to_rgba_cmod) +PR_(imlib_amd64_reshade_blend_rgba_to_rgb_cmod): /* Reshade blend of a colour-modified RGBA source onto an RGB destination. Each source byte is replaced by a lookup in one of four 256-byte tables based at %r14: offset 0x300 for a pixel's top byte (used below as alpha) and 0x000, 0x100, 0x200 for the three bytes beneath it. Colour channels are then updated as d = d + 2*a*(s - 127) and clamped to a byte; the multiplier is ANDed with m00XXXXXX, which per the existing comment zeroes its alpha lane. Register roles are inferred from their use in this body (%rsi source, %rdi destination, %r8 pixels per row, %r9 rows, %r10 and %r11 per-row pointer steps in pixels); ENTER and LEAVE are macros defined outside this view -- TODO confirm what they set up. */ + ENTER + + pxor %xmm4, %xmm4 /* %xmm4 = 0: zero operand for the byte/word unpack and pack steps below */ + movdqa m000V0V0V000V0V0V(%rip), %xmm6 /* %xmm6: per-word bias subtracted from the unpacked source below */ + movdqa m00XXXXXX(%rip), %xmm7 /* %xmm7: mask ANDed with the alpha multiplier below */ + + /* Address each line relative to its last pixel: the index in %rcx */ + /* starts negative and counts up to 0, two pixels per step */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Back up one pixel so that %rcx = 0 addresses the last pixel */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx /* row start: reload the negated %r8 */ + + incq %rcx /* ZF set here means a one-pixel row; the prefetches below leave the flags untouched for the jz */ + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line */ +1: + /* main loop, unrolled to work on 64 byte chunks */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod (unrolled copy 1 of 8): each source byte is rotated into %al, looked up in its table, and shifted into %rdx */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax +
movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Unpack alpha: double every byte into a word, copy word 3 of each pixel to all four of its words, then halve */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero blending alpha */ + pand %xmm7, %xmm3 + + /* Unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (2 * a * (s - 127)) */ + psubw %xmm6, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 /* keeps the high 16 bits of each signed product */ + paddsw %xmm1, %xmm2 /* signed saturating word add */ + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 /* words back to bytes with unsigned saturation */ + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f /* index reached 0: exactly one pixel left */ + jns 3f /* index went positive: row done */ + + /* Grab 2 pixels from src, with colormod (unrolled copy 2 of 8) */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Unpack alpha */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero blending alpha */ + pand %xmm7, %xmm3 + + /* Unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (2 * a * (s - 127)) */ + psubw %xmm6, %xmm1 +
psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod (unrolled copy 3 of 8) */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Unpack alpha */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero blending alpha */ + pand %xmm7, %xmm3 + + /* Unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (2 * a * (s - 127)) */ + psubw %xmm6, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod (unrolled copy 4 of 8) */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx +
rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Unpack alpha */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero blending alpha */ + pand %xmm7, %xmm3 + + /* Unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (2 * a * (s - 127)) */ + psubw %xmm6, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod (unrolled copy 5 of 8) */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Unpack alpha */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero blending alpha */ + pand %xmm7, %xmm3 + + /* Unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (2 * a * (s - 127)) */ + psubw %xmm6, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src,
with colormod (unrolled copy 6 of 8) */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Unpack alpha */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero blending alpha */ + pand %xmm7, %xmm3 + + /* Unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (2 * a * (s - 127)) */ + psubw %xmm6, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod (unrolled copy 7 of 8) */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Unpack alpha */ + movq
%xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero blending alpha */ + pand %xmm7, %xmm3 + + /* Unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (2 * a * (s - 127)) */ + psubw %xmm6, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod (unrolled copy 8 of 8) */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Unpack alpha */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero blending alpha */ + pand %xmm7, %xmm3 + + /* Unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (2 * a * (s - 127)) */ + psubw %xmm6, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b /* index still negative: at least two pixels left */ + jnz 3f /* positive: row done; otherwise fall into 2: with the index at exactly 0 */ +2: + /* Grab 1 pixel from src, with colormod (32-bit form of the lookups above) */ + movl (%rsi, %rcx, 4), %eax + ror $24, %eax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8,
%eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + /* Unpack alpha */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero blending alpha */ + pand %xmm7, %xmm3 + + /* Unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (2 * a * (s - 127)) */ + psubw %xmm6, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: /* row finished: step %rsi by %r10 pixels and %rdi by %r11 pixels, then count the row down in %r9 */ + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 + jnz 0b + +9: /* not targeted by any jump visible in this function; presumably used by ENTER -- TODO confirm */ + LEAVE +SIZE(imlib_amd64_reshade_blend_rgba_to_rgb_cmod) + +PR_(imlib_amd64_reshade_blend_rgba_to_rgba_cmod): /* Reshade blend of a colour-modified RGBA source onto an RGBA destination. Each source byte is replaced by a lookup in one of four 256-byte tables based at %r14 (offset 0x300 for a pixel's top byte, 0x000, 0x100, 0x200 for the three bytes beneath it). The colour multiplier is read from pow_lut at byte index (modified source alpha * 256 + destination top byte); the top-byte lane uses the modified source alpha shifted right by one instead. Register roles are inferred from their use in this body (%rsi source, %rdi destination, %r8 pixels per row, %r9 rows, %r10 and %r11 per-row pointer steps in pixels, %r13 pow_lut); ENTER and LEAVE are macros defined outside this view -- TODO confirm what they set up. */ + ENTER + + movq pow_lut@GOTPCREL(%rip), %r13 /* %r13 = address of pow_lut, fetched through the GOT */ + pxor %xmm4, %xmm4 /* %xmm4 = 0: zero operand for the byte/word unpack and pack steps below */ + movdqa c1(%rip), %xmm5 /* NOTE(review): %xmm5 is not referenced by any other instruction visible in this function */ + movdqa mX000X000X000X000(%rip), %xmm6 /* %xmm6: mask applied below to the destination (pand) and to the source (por) */ + movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm7 /* NOTE(review): %xmm7 is likewise not referenced again in the visible body */ + movdqa m000V0V0V000V0V0V(%rip), %xmm8 /* %xmm8: per-word bias subtracted from the unpacked source below */ + xorq %rax, %rax /* %rax is reloaded from the source before it is next read */ + + /* Address each line relative to its last pixel: the index in %rcx */ + /* starts negative and counts up to 0, two pixels per step */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Back up one pixel so that %rcx = 0 addresses the last pixel */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx /* row start: reload the negated %r8 */ + + incq %rcx /* ZF set here means a one-pixel row; the prefetches below leave the flags untouched for the jz */ + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line */ +1: + /* main loop, unrolled to work on 64 byte chunks */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod (unrolled copy 1 of 8): each source byte is rotated into %al, looked up in its table, and shifted into %rdx */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax +
movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending, specialized for reshade by shifting the source alpha + * right by one */ + movq %rdx, %rax /* keep both modified pixels in %rax */ + andl $0xff000000, %edx /* low pixel's alpha only; the 32-bit write also clears the upper half of %rdx */ + roll $16, %edx /* alpha now sits in %dh */ + movb 3(%rdi, %rcx, 4), %dl /* top byte of the first destination pixel */ + movb (%r13, %rdx), %al /* pow_lut byte at index (source alpha * 256 + destination top byte) */ + movb %dh, %ah + shrb $1, %ah /* %ah = modified source alpha shifted right by one */ + + rolq $32, %rax /* swap halves to work on the other pixel */ + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl /* top byte of the second destination pixel */ + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax /* restore the original pixel order */ + movd %rax, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 /* per pixel: words 0-2 take the doubled pow_lut byte, word 3 the doubled halved source alpha */ + psrlw $1, %xmm3 + + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 /* source with the %xmm6 bits forced on, minus the destination's %xmm6 bits (unsigned saturating) */ + + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + psubw %xmm8, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 /* keeps the high 16 bits of each signed product */ + paddsw %xmm1, %xmm2 /* signed saturating word add */ + + packuswb %xmm4, %xmm2 /* words back to bytes with unsigned saturation */ + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f /* index reached 0: exactly one pixel left */ + jns 3f /* index went positive: row done */ + + /* Grab 2 pixels from src, with colormod (unrolled copy 2 of 8) */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al,
%bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending, specialized for reshade by shifting the source alpha + * right by one */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movd %rax, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + psubw %xmm8, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod (unrolled copy 3 of 8) */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending, specialized for reshade by shifting the source alpha + * right by one */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx,
4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movd %rax, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + psubw %xmm8, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod (unrolled copy 4 of 8) */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending, specialized for reshade by shifting the source alpha + * right by one */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movd %rax, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 +
pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + psubw %xmm8, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod (unrolled copy 5 of 8) */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending, specialized for reshade by shifting the source alpha + * right by one */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movd %rax, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + psubw %xmm8, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4)
+ + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod (unrolled copy 6 of 8) */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending, specialized for reshade by shifting the source alpha + * right by one */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movd %rax, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + psubw %xmm8, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod (unrolled copy 7 of 8) */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb
0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending, specialized for reshade by shifting the source alpha + * right by one */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movd %rax, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + psubw %xmm8, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod (unrolled copy 8 of 8) */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14,
%rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending, specialized for reshade by shifting the source alpha + * right by one */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movd %rax, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + psubw %xmm8, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b /* index still negative: at least two pixels left */ + jnz 3f /* positive: row done; otherwise fall into 2: with the index at exactly 0 */ +2: + /* Grab 1 pixel from src, with colormod (32-bit form of the lookups above) */ + movl (%rsi, %rcx, 4), %eax + ror $24, %eax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending, specialized for reshade by shifting the source alpha + * right by one */ + roll $16, %edx + andl $0x0000ff00, %edx /* leaves only the modified alpha, in %dh */ + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + movd %eax, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + +
punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + psubw %xmm8, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + packuswb %xmm4, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: /* row finished: step %rsi by %r10 pixels and %rdi by %r11 pixels, then count the row down in %r9 */ + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 + jnz 0b + +9: /* not targeted by any jump visible in this function; presumably used by ENTER -- TODO confirm */ + LEAVE +SIZE(imlib_amd64_reshade_blend_rgba_to_rgba_cmod) + +PR_(imlib_amd64_reshade_blend_rgb_to_rgb_cmod): + ENTER + + pxor %xmm4, %xmm4 + movdqa m000V0V0V000V0V0V(%rip), %xmm6 + movdqa m00XXXXXX(%rip), %xmm7 + + /* Move right to left across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line */ +1: + /* main loop, unrolled to work on 64 byte chunks */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Unpack alpha */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero blending alpha */ + pand
%xmm7, %xmm3 + + /* Unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (2 * a * (s - 127)) */ + psubw %xmm6, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Unpack alpha */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero blending alpha */ + pand %xmm7, %xmm3 + + /* Unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (2 * a * (s - 127)) */ + psubw %xmm6, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq 
$8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Unpack alpha */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero blending alpha */ + pand %xmm7, %xmm3 + + /* Unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (2 * a * (s - 127)) */ + psubw %xmm6, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Unpack alpha */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero blending alpha */ + pand %xmm7, %xmm3 + + /* Unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (2 * a * (s - 127)) */ + psubw %xmm6, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, 
%xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Unpack alpha */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero blending alpha */ + pand %xmm7, %xmm3 + + /* Unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (2 * a * (s - 127)) */ + psubw %xmm6, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 
0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Unpack alpha */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero blending alpha */ + pand %xmm7, %xmm3 + + /* Unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (2 * a * (s - 127)) */ + psubw %xmm6, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Unpack alpha */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero blending alpha */ + pand %xmm7, %xmm3 + + /* Unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (2 * a * (s - 127)) */ + psubw %xmm6, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] 
*/ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* Unpack alpha */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero blending alpha */ + pand %xmm7, %xmm3 + + /* Unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (2 * a * (s - 127)) */ + psubw %xmm6, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b + jnz 3f +2: + /* Grab 1 pixel from src, with colormod, with a = amod[255] */ + movl (%rsi, %rcx, 4), %eax + ror $16, %eax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + /* Unpack alpha */ + movq %xmm1, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0xFF, %xmm3, %xmm3 + pshuflw $0xFF, %xmm3, %xmm3 + psrlw $1, %xmm3 + + /* Zero blending alpha */ + pand %xmm7, %xmm3 + + /* Unpack src and dst */ + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + /* d = d + (2 * a * (s - 127)) */ + psubw %xmm6, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, 
%xmm1 + paddsw %xmm1, %xmm2 + + /* Repack new pixels */ + packuswb %xmm4, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 + jnz 0b + +9: + LEAVE +SIZE(imlib_amd64_reshade_blend_rgb_to_rgb_cmod) + +PR_(imlib_amd64_reshade_blend_rgb_to_rgba_cmod): /* Reshade-blend for a source whose own alpha is ignored, onto a destination whose alpha byte is read for the blend. Per pixel, source bits 23-16, 15-8 and 7-0 are remapped through the 256-byte tables at offsets 0x000, 0x100 and 0x200 from %r14, and the source alpha is taken from entry 255 of the table at 0x300. The colour blend factor is the pow_lut byte at index (source alpha << 8) | destination alpha. Registers as used below: %rsi = source, %rdi = destination, %r8 = pixels per line, %r9 = number of lines, %r10 and %r11 = per-line pointer steps in pixels. NOTE(review): ENTER and LEAVE are macros defined outside this section; presumably they load these registers from the C arguments and save/restore callee-saved ones - confirm. */ + ENTER + + movq pow_lut@GOTPCREL(%rip), %r13 + pxor %xmm4, %xmm4 + movdqa c1(%rip), %xmm5 + movdqa mX000X000X000X000(%rip), %xmm6 + movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm7 + movdqa m000V0V0V000V0V0V(%rip), %xmm8 + xorq %rax, %rax + + /* Advance both pointers by %r8 pixels; each line is then walked with a */ + /* negative index in %rcx that counts up towards 0, two pixels per step */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Back up one pixel so the final pixel of a line is at index %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line */ +1: + /* main loop, unrolled to work on 64 byte chunks */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending, specialized for reshade by shifting the source
alpha + * right by one; the table index is (source alpha << 8) | destination alpha */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movd %rax, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 /* NOTE(review): if %xmm6 selects the alpha bytes, as the mask name suggests, source alpha becomes 255 - destination alpha here - confirm */ + + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + psubw %xmm8, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 /* same subtract, shift, multiply-high, saturating-add sequence that the rgb_to_rgb routine above labels d = d + (2 * a * (s - 127)), here with %xmm8 as the subtrahend */ + + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending, specialized for reshade by shifting the source alpha + * right by one */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh,
%ah + shrb $1, %ah + + rolq $32, %rax + movd %rax, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + psubw %xmm8, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending, specialized for reshade by shifting the source alpha + * right by one */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movd %rax, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + psubw %xmm8, %xmm1
+ psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending, specialized for reshade by shifting the source alpha + * right by one */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movd %rax, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + psubw %xmm8, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14,
%rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending, specialized for reshade by shifting the source alpha + * right by one */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movd %rax, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + psubw %xmm8, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx +
rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending, specialized for reshade by shifting the source alpha + * right by one */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movd %rax, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + psubw %xmm8, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha
that will be used + * for blending, specialized for reshade by shifting the source alpha + * right by one */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx + roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movd %rax, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + psubw %xmm8, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending, specialized for reshade by shifting the source alpha + * right by one */ + movq %rdx, %rax + andl $0xff000000, %edx + roll $16, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movl %eax, %edx + + andl $0xff000000, %edx +
roll $16, %edx + movb 7(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + + rolq $32, %rax + movd %rax, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + psubw %xmm8, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + packuswb %xmm4, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b + jnz 3f +2: + /* Grab 1 pixel from src, with colormod, with a = amod[255] */ + movl (%rsi, %rcx, 4), %eax + ror $16, %eax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + + /* Convert the cmod alpha to the pow_lut alpha that will be used + * for blending, specialized for reshade by shifting the source alpha + * right by one */ + roll $16, %edx + andl $0x0000ff00, %edx + movb 3(%rdi, %rcx, 4), %dl + movb (%r13, %rdx), %al + movb %dh, %ah + shrb $1, %ah + movd %eax, %xmm3 + punpcklbw %xmm3, %xmm3 + pshufhw $0x40, %xmm3, %xmm3 + pshuflw $0x40, %xmm3, %xmm3 + psrlw $1, %xmm3 + + movdqa %xmm2, %xmm0 + pand %xmm6, %xmm0 + por %xmm6, %xmm1 + psubusb %xmm0, %xmm1 + + punpcklbw %xmm4, %xmm1 + punpcklbw %xmm4, %xmm2 + + psubw %xmm8, %xmm1 + psllw $2, %xmm1 + pmulhw %xmm3, %xmm1 + paddsw %xmm1, %xmm2 + + packuswb %xmm4, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 + jnz 0b + +9: + LEAVE +SIZE(imlib_amd64_reshade_blend_rgb_to_rgba_cmod) + +PR_(imlib_amd64_reshade_copy_rgba_to_rgb_cmod): + ENTER + + movdqa m0XXX0XXX0XXX0XXX(%rip), %xmm5 + movdqa m0VVV0VVV0VVV0VVV(%rip), %xmm6 + + /* Move right to left 
across each line, */ + /* processing in two pixel chunks */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Last instruction is %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 +0: + movq %r8, %rcx + + incq %rcx + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line */ +1: + /* main loop, unrolled to work on 64 byte chunks */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* To take advantage of saturation and be able to do 8 bytes + * at a time, we divide reshading into two separate steps: + * adding values above 128, and subtracting values below 128 + * These values go into %mm1 and %mm3 respectively + * - %xmm1 becomes (2 * (s - 127)) + * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s)) + */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* dest alpha should not be changed in this func */ + pand %xmm5, %xmm1 + pand %xmm5, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq 
%rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* To take advantage of saturation and be able to do 8 bytes + * at a time, we divide reshading into two separate steps: + * adding values above 128, and subtracting values below 128 + * These values go into %mm1 and %mm3 respectively + * - %xmm1 becomes (2 * (s - 127)) + * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s)) + */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* dest alpha should not be changed in this func */ + pand %xmm5, %xmm1 + pand %xmm5, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, 
%rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* To take advantage of saturation and be able to do 8 bytes + * at a time, we divide reshading into two separate steps: + * adding values above 128, and subtracting values below 128 + * These values go into %mm1 and %mm3 respectively + * - %xmm1 becomes (2 * (s - 127)) + * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s)) + */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* dest alpha should not be changed in this func */ + pand %xmm5, %xmm1 + pand %xmm5, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* To take advantage of saturation and be able to do 8 bytes + * at a time, we divide reshading into two separate steps: + * adding values above 128, and subtracting values below 128 + * These values go into %mm1 and %mm3 respectively + * - %xmm1 becomes (2 * (s - 
127)) + * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s)) + */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* dest alpha should not be changed in this func */ + pand %xmm5, %xmm1 + pand %xmm5, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* To take advantage of saturation and be able to do 8 bytes + * at a time, we divide reshading into two separate steps: + * adding values above 128, and subtracting values below 128 + * These values go into %mm1 and %mm3 respectively + * - %xmm1 becomes (2 * (s - 127)) + * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s)) + */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* dest alpha should not be changed in this func */ + pand %xmm5, %xmm1 + pand %xmm5, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + 
movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* To take advantage of saturation and be able to do 8 bytes + * at a time, we divide reshading into two separate steps: + * adding values above 128, and subtracting values below 128 + * These values go into %mm1 and %mm3 respectively + * - %xmm1 becomes (2 * (s - 127)) + * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s)) + */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* dest alpha should not be changed in this func */ + pand %xmm5, %xmm1 + pand %xmm5, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + 
movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* To take advantage of saturation and be able to do 8 bytes + * at a time, we divide reshading into two separate steps: + * adding values above 128, and subtracting values below 128 + * These values go into %mm1 and %mm3 respectively + * - %xmm1 becomes (2 * (s - 127)) + * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s)) + */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* dest alpha should not be changed in this func */ + pand %xmm5, %xmm1 + pand %xmm5, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* To take advantage of saturation and be able to do 8 bytes + * at a time, we divide reshading into two separate steps: + * adding values above 128, and subtracting values below 128 + * These values go into %mm1 and %mm3 respectively + * - %xmm1 becomes (2 * (s - 127)) + * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s)) + */ + movdqa 
%xmm1, %xmm3 + psubusb %xmm6, %xmm1 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* dest alpha should not be changed in this func */ + pand %xmm5, %xmm1 + pand %xmm5, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b + jnz 3f +2: + /* Grab 1 pixel from src, with colormod */ + movl (%rsi, %rcx, 4), %eax + ror $24, %eax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + /* To take advantage of saturation and be able to do 8 bytes + * at a time, we divide reshading into two separate steps: + * adding values above 128, and subtracting values below 128 + * These values go into %xmm1 and %xmm3 respectively + * - %xmm1 becomes (2 * (s - 127)) + * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s)) + */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* dest alpha should not be changed in this func */ + pand %xmm5, %xmm1 + pand %xmm5, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: + leaq (%rsi, %r10, 4), %rsi + leaq (%rdi, %r11, 4), %rdi + decq %r9 + jnz 0b + +9: + LEAVE +SIZE(imlib_amd64_reshade_copy_rgba_to_rgb_cmod) + +PR_(imlib_amd64_reshade_copy_rgba_to_rgba_cmod): /* reshade dst with colormodded src; dst alpha := colormodded src alpha */ + ENTER + + movdqu m0XXX0XXX0XXX0XXX(%rip), %xmm5 + movdqu m0VVV0VVV0VVV0VVV(%rip), %xmm6 + movdqu mX000X000X000X000(%rip), %xmm7 + + /* Advance src/dst past the end of the line (%r8 is presumably the */ + /* width in pixels); the line is then walked with a negative index */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Step back one pixel so the last pixel sits at
%rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 /* %r8 = negated pixel count */ +0: + movq %r8, %rcx + + incq %rcx /* index of the first pixel of the line (<= 0) */ + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line; prefetches leave the incq flags intact */ +1: + /* main loop, unrolled 8x: 8 steps of 2 pixels = 64 bytes per pass */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax /* top byte (upper pixel's alpha) -> %al */ + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + movdqa %xmm1, %xmm0 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + + /* d alpha = s alpha */ + pand %xmm5, %xmm2 + pand %xmm7, %xmm0 + por %xmm0, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f /* exactly one pixel left on the line */ + jns 3f /* index went positive: line finished */ + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb
0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + movdqa %xmm1, %xmm0 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + + /* d alpha = s alpha */ + pand %xmm5, %xmm2 + pand %xmm7, %xmm0 + por %xmm0, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + movdqa %xmm1, %xmm0 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + + /* d alpha = s alpha */ + pand %xmm5, %xmm2 + pand %xmm7, %xmm0
+ por %xmm0, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + movdqa %xmm1, %xmm0 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + + /* d alpha = s alpha */ + pand %xmm5, %xmm2 + pand %xmm7, %xmm0 + por %xmm0, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14,
%rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + movdqa %xmm1, %xmm0 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + + /* d alpha = s alpha */ + pand %xmm5, %xmm2 + pand %xmm7, %xmm0 + por %xmm0, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + movdqa %xmm1, %xmm0 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + + /* d alpha = s alpha */ + pand %xmm5, %xmm2 + pand %xmm7, %xmm0 + por %xmm0, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq
$8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + movdqa %xmm1, %xmm0 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + + /* d alpha = s alpha */ + pand %xmm5, %xmm2 + pand %xmm7, %xmm0 + por %xmm0, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod */ + movq (%rsi, %rcx, 4), %rax + rorq $56, %rax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + movdqa %xmm1, %xmm0 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* d = d + s1 - s2, unsigned
saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + + /* d alpha = s alpha */ + pand %xmm5, %xmm2 + pand %xmm7, %xmm0 + por %xmm0, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b /* index still negative: at least 2 pixels left */ + jnz 3f /* positive: line finished; zero falls through to 2: */ +2: + /* Grab the 1 remaining pixel from src, with colormod */ + movl (%rsi, %rcx, 4), %eax + ror $24, %eax + movzbq %al, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + movdqa %xmm1, %xmm0 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + + /* d alpha = s alpha */ + pand %xmm5, %xmm2 + pand %xmm7, %xmm0 + por %xmm0, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: + leaq (%rsi, %r10, 4), %rsi /* step src by %r10 pixels for the next line */ + leaq (%rdi, %r11, 4), %rdi /* step dst by %r11 pixels for the next line */ + decq %r9 + jnz 0b + +9: + LEAVE +SIZE(imlib_amd64_reshade_copy_rgba_to_rgba_cmod) + +PR_(imlib_amd64_reshade_copy_rgb_to_rgba_cmod): /* reshade dst with colormodded src; src alpha ignored, dst alpha := amod[255] lookup */ + ENTER + + movdqu m0XXX0XXX0XXX0XXX(%rip), %xmm5 + movdqu m0VVV0VVV0VVV0VVV(%rip), %xmm6 + movdqu mX000X000X000X000(%rip), %xmm7 + + /* Advance src/dst past the end of the line (%r8 is presumably the */ + /* width in pixels); the line is then walked with a negative index */ + leaq (%rsi, %r8, 4), %rsi + leaq (%rdi, %r8, 4), %rdi + + /* Step back one pixel so the last pixel sits at %rcx = 0 */ + subq $4, %rsi + subq $4, %rdi + + negq %r8 /* %r8 = negated pixel count */ +0: + movq %r8, %rcx + + incq %rcx /* index of the first pixel of the line (<= 0) */ + + /* prefetch a couple cache lines ahead */ + prefetchnta (%rsi, %rcx, 4) + prefetcht0 (%rdi, %rcx, 4) + prefetchnta 64(%rsi, %rcx, 4) + prefetcht0 64(%rdi, %rcx, 4) + + jz 2f /* one pixel line; prefetches leave the incq flags intact */ +1: + /* main loop, unrolled 8x: 8 steps of 2 pixels = 64 bytes per pass */ + prefetchnta 128(%rsi, %rcx, 4) + prefetcht0 128(%rdi, %rcx, 4) + + /*
Grab 2 pixels from src, with colormod, with a = amod[255] (the src alpha bytes are never looked up) */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax /* %al = byte 6, the byte just below the upper pixel's alpha */ + movq $0x000000FF, %rbx /* constant index 255 into the table at 0x300 */ + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx /* index 255 again, for the lower pixel */ + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax /* rotate two bytes: skips the lower pixel's alpha byte */ + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + movdqa %xmm1, %xmm0 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + + /* d alpha = s alpha */ + pand %xmm5, %xmm2 + pand %xmm7, %xmm0 + por %xmm0, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f /* exactly one pixel left on the line */ + jns 3f /* index went positive: line finished */ + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* This time, dest alpha = src
alpha, so src alpha is saved in %xmm0 */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + movdqa %xmm1, %xmm0 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + + /* d alpha = s alpha */ + pand %xmm5, %xmm2 + pand %xmm7, %xmm0 + por %xmm0, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + movdqa %xmm1, %xmm0 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + + /* d alpha = s alpha */ + pand %xmm5, %xmm2 + pand %xmm7, %xmm0 + por %xmm0, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq
$8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + movdqa %xmm1, %xmm0 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + + /* d alpha = s alpha */ + pand %xmm5, %xmm2 + pand %xmm7, %xmm0 + por %xmm0, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + movdqa %xmm1, %xmm0 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + + /* d alpha = s
alpha */ + pand %xmm5, %xmm2 + pand %xmm7, %xmm0 + por %xmm0, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + movdqa %xmm1, %xmm0 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + + /* d alpha = s alpha */ + pand %xmm5, %xmm2 + pand %xmm7, %xmm0 + por %xmm0, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8,
%rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + movdqa %xmm1, %xmm0 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + + /* d alpha = s alpha */ + pand %xmm5, %xmm2 + pand %xmm7, %xmm0 + por %xmm0, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + jz 2f + jns 3f + + /* Grab 2 pixels from src, with colormod, with a = amod[255] */ + movq (%rsi, %rcx, 4), %rax + rorq $48, %rax + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx + shlq $8, %rdx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + shlq $8, %rdx + movl $0x000000FF, %ebx + movb 0x300(%r14, %rbx), %dl + shlq $8, %rdx + rolq $16, %rax + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shlq $8, %rdx + rolq $8, %rax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %rdx, %xmm1 + movq (%rdi, %rcx, 4), %xmm2 + /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + movdqa %xmm1, %xmm0 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + + /* d alpha = s alpha */ + pand %xmm5, %xmm2 + pand %xmm7, %xmm0 + por %xmm0, %xmm2 + movq %xmm2, (%rdi, %rcx, 4) + + incq %rcx + incq %rcx + js 1b /* index still negative: at least 2 pixels left */ + jnz 3f /* positive: line finished; zero falls through to 2: */ +2: + /* Grab the 1 remaining pixel from src, with colormod, with a = amod[255] */ + movl (%rsi, %rcx, 4), %eax + ror $16, %eax /* %al = byte 2, the byte just below the alpha byte */ + movq $0x000000FF, %rbx + movzbq 0x300(%r14, %rbx), %rdx
+ shl $8, %edx + movb %al, %bl + movb 0x000(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x100(%r14, %rbx), %dl + shl $8, %edx + rol $8, %eax + movb %al, %bl + movb 0x200(%r14, %rbx), %dl + movd %edx, %xmm1 + movd (%rdi, %rcx, 4), %xmm2 + /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */ + movdqa %xmm1, %xmm3 + psubusb %xmm6, %xmm1 + movdqa %xmm1, %xmm0 + paddusb %xmm1, %xmm1 + paddusb %xmm6, %xmm3 + pxor %xmm5, %xmm3 + paddusb %xmm3, %xmm3 + + /* d = d + s1 - s2, unsigned saturation */ + paddusb %xmm1, %xmm2 + psubusb %xmm3, %xmm2 + + /* d alpha = s alpha */ + pand %xmm5, %xmm2 + pand %xmm7, %xmm0 + por %xmm0, %xmm2 + movd %xmm2, (%rdi, %rcx, 4) +3: + leaq (%rsi, %r10, 4), %rsi /* step src by %r10 pixels for the next line */ + leaq (%rdi, %r11, 4), %rdi /* step dst by %r11 pixels for the next line */ + decq %r9 + jnz 0b + +9: + LEAVE +SIZE(imlib_amd64_reshade_copy_rgb_to_rgba_cmod) + + +#endif diff --git a/src/lib/blend.c b/src/lib/blend.c index 706e55b..541ea6e 100644 --- a/src/lib/blend.c +++ b/src/lib/blend.c @@ -1402,6 +1402,7 @@ __imlib_ReCopyRGBToRGBACmod(DATA32 * src, int srcw, DATA32 * dst, int dstw, #define __imlib_amd64_blend_rgb_to_rgba __imlib_amd64_copy_rgb_to_rgba #define __imlib_CopyRGBToRGBCmod __imlib_CopyRGBAToRGBCmod #define __imlib_mmx_copy_rgb_to_rgb_cmod __imlib_mmx_copy_rgba_to_rgb_cmod +#define __imlib_amd64_copy_rgb_to_rgb_cmod __imlib_amd64_copy_rgba_to_rgb_cmod #define __imlib_AddCopyRGBToRGB __imlib_AddCopyRGBAToRGB #define __imlib_AddBlendRGBToRGB __imlib_AddCopyRGBToRGB @@ -1414,6 +1415,7 @@ __imlib_ReCopyRGBToRGBACmod(DATA32 * src, int srcw, DATA32 * dst, int dstw, #define __imlib_amd64_add_blend_rgb_to_rgba __imlib_amd64_add_copy_rgb_to_rgba #define __imlib_AddCopyRGBToRGBCmod __imlib_AddCopyRGBAToRGBCmod #define __imlib_mmx_add_copy_rgb_to_rgb_cmod __imlib_mmx_add_copy_rgb_to_rgba_cmod +#define __imlib_amd64_add_copy_rgb_to_rgb_cmod __imlib_amd64_add_copy_rgb_to_rgba_cmod #define __imlib_SubCopyRGBToRGB __imlib_SubCopyRGBAToRGB #define __imlib_SubBlendRGBToRGB
__imlib_SubCopyRGBToRGB @@ -1427,6 +1429,7 @@ __imlib_ReCopyRGBToRGBACmod(DATA32 * src, int srcw, DATA32 * dst, int dstw, #define __imlib_amd64_subtract_blend_rgb_to_rgba __imlib_amd64_subtract_copy_rgb_to_rgba #define __imlib_SubCopyRGBToRGBCmod __imlib_SubCopyRGBAToRGBCmod #define __imlib_mmx_subtract_copy_rgb_to_rgb_cmod __imlib_mmx_subtract_copy_rgb_to_rgba_cmod +#define __imlib_amd64_subtract_copy_rgb_to_rgb_cmod __imlib_amd64_subtract_copy_rgb_to_rgba_cmod #define __imlib_ReCopyRGBToRGB __imlib_ReCopyRGBAToRGB #define __imlib_ReBlendRGBToRGB __imlib_ReCopyRGBToRGB @@ -1440,6 +1443,7 @@ __imlib_ReCopyRGBToRGBACmod(DATA32 * src, int srcw, DATA32 * dst, int dstw, #define __imlib_amd64_reshade_blend_rgb_to_rgba __imlib_amd64_reshade_copy_rgb_to_rgba #define __imlib_ReCopyRGBToRGBCmod __imlib_ReCopyRGBAToRGBCmod #define __imlib_mmx_reshade_copy_rgb_to_rgb_cmod __imlib_mmx_reshade_copy_rgb_to_rgba_cmod +#define __imlib_amd64_reshade_copy_rgb_to_rgb_cmod __imlib_amd64_reshade_copy_rgb_to_rgba_cmod ImlibBlendFunction @@ -1570,10 +1574,14 @@ __imlib_GetBlendFunction(ImlibOp op, char blend, char merge_alpha, char rgb_src, {{__imlib_amd64_copy_rgba_to_rgba, __imlib_amd64_blend_rgba_to_rgba }, {__imlib_amd64_copy_rgb_to_rgba, __imlib_amd64_blend_rgb_to_rgba}}}, - {{{__imlib_CopyRGBAToRGBCmod, __imlib_BlendRGBAToRGBCmod}, - {__imlib_CopyRGBToRGBCmod, __imlib_BlendRGBToRGBCmod}}, - {{__imlib_CopyRGBAToRGBACmod, __imlib_BlendRGBAToRGBACmod}, - {__imlib_CopyRGBToRGBACmod, __imlib_BlendRGBToRGBACmod}}}}, + {{{__imlib_amd64_copy_rgba_to_rgb_cmod, + __imlib_amd64_blend_rgba_to_rgb_cmod}, + {__imlib_amd64_copy_rgb_to_rgb_cmod, + __imlib_amd64_blend_rgb_to_rgb_cmod}}, + {{__imlib_amd64_copy_rgba_to_rgba_cmod, + __imlib_amd64_blend_rgba_to_rgba_cmod}, + {__imlib_amd64_copy_rgb_to_rgba_cmod, + __imlib_amd64_blend_rgb_to_rgba_cmod}}}}, /*\ OP_ADD \ */ {{{{__imlib_amd64_add_copy_rgba_to_rgb, __imlib_amd64_add_blend_rgba_to_rgb}, @@ -1584,10 +1592,14 @@ 
__imlib_GetBlendFunction(ImlibOp op, char blend, char merge_alpha, char rgb_src, {__imlib_amd64_add_copy_rgb_to_rgba, __imlib_amd64_add_blend_rgb_to_rgba}}}, - {{{__imlib_AddCopyRGBAToRGBCmod, __imlib_AddBlendRGBAToRGBCmod}, - {__imlib_AddCopyRGBToRGBCmod, __imlib_AddBlendRGBToRGBCmod}}, - {{__imlib_AddCopyRGBAToRGBACmod, __imlib_AddBlendRGBAToRGBACmod}, - {__imlib_AddCopyRGBToRGBACmod, __imlib_AddBlendRGBToRGBACmod}}}}, + {{{__imlib_amd64_add_copy_rgba_to_rgb_cmod, + __imlib_amd64_add_blend_rgba_to_rgb_cmod}, + {__imlib_amd64_add_copy_rgb_to_rgb_cmod, + __imlib_amd64_add_blend_rgb_to_rgb_cmod}}, + {{__imlib_amd64_add_copy_rgba_to_rgba_cmod, + __imlib_amd64_add_blend_rgba_to_rgba_cmod}, + {__imlib_amd64_add_copy_rgb_to_rgba_cmod, + __imlib_amd64_add_blend_rgb_to_rgba_cmod}}}}, /*\ OP_SUBTRACT \ */ {{{{__imlib_amd64_subtract_copy_rgba_to_rgb, __imlib_amd64_subtract_blend_rgba_to_rgb}, @@ -1598,10 +1610,14 @@ __imlib_GetBlendFunction(ImlibOp op, char blend, char merge_alpha, char rgb_src, {__imlib_amd64_subtract_copy_rgb_to_rgba, __imlib_amd64_subtract_blend_rgb_to_rgba}}}, - {{{__imlib_SubCopyRGBAToRGBCmod, __imlib_SubBlendRGBAToRGBCmod}, - {__imlib_SubCopyRGBToRGBCmod, __imlib_SubBlendRGBToRGBCmod}}, - {{__imlib_SubCopyRGBAToRGBACmod, __imlib_SubBlendRGBAToRGBACmod}, - {__imlib_SubCopyRGBToRGBACmod, __imlib_SubBlendRGBToRGBACmod}}}}, + {{{__imlib_amd64_subtract_copy_rgba_to_rgb_cmod, + __imlib_amd64_subtract_blend_rgba_to_rgb_cmod}, + {__imlib_amd64_subtract_copy_rgb_to_rgb_cmod, + __imlib_amd64_subtract_blend_rgb_to_rgb_cmod}}, + {{__imlib_amd64_subtract_copy_rgba_to_rgba_cmod, + __imlib_amd64_subtract_blend_rgba_to_rgba_cmod}, + {__imlib_amd64_subtract_copy_rgb_to_rgba_cmod, + __imlib_amd64_subtract_blend_rgb_to_rgba_cmod}}}}, /*\ OP_RESHADE \ */ {{{{__imlib_amd64_reshade_copy_rgba_to_rgb, __imlib_amd64_reshade_blend_rgba_to_rgb}, @@ -1612,10 +1628,14 @@ __imlib_GetBlendFunction(ImlibOp op, char blend, char merge_alpha, char rgb_src, 
{__imlib_amd64_reshade_copy_rgb_to_rgba, __imlib_amd64_reshade_blend_rgb_to_rgba}}}, - {{{__imlib_ReCopyRGBAToRGBCmod, __imlib_ReBlendRGBAToRGBCmod}, - {__imlib_ReCopyRGBToRGBCmod, __imlib_ReBlendRGBToRGBCmod}}, - {{__imlib_ReCopyRGBAToRGBACmod, __imlib_ReBlendRGBAToRGBACmod}, - {__imlib_ReCopyRGBToRGBACmod, __imlib_ReBlendRGBToRGBACmod}}}}}, + {{{__imlib_amd64_reshade_copy_rgba_to_rgb_cmod, + __imlib_amd64_reshade_blend_rgba_to_rgb_cmod}, + {__imlib_amd64_reshade_copy_rgb_to_rgb_cmod, + __imlib_amd64_reshade_blend_rgb_to_rgb_cmod}}, + {{__imlib_amd64_reshade_copy_rgba_to_rgba_cmod, + __imlib_amd64_reshade_blend_rgba_to_rgba_cmod}, + {__imlib_amd64_reshade_copy_rgb_to_rgba_cmod, + __imlib_amd64_reshade_blend_rgb_to_rgba_cmod}}}}}, #endif }; diff --git a/src/lib/blend.h b/src/lib/blend.h index 20144ac..9458f3b 100644 --- a/src/lib/blend.h +++ b/src/lib/blend.h @@ -613,5 +613,90 @@ void __imlib_amd64_reshade_copy_rgb_to_rgba(DATA32 *src, int sw, DATA32 *dst, int dw, int w, int h, ImlibColorModifier *cm); + +void +__imlib_amd64_blend_rgba_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_blend_rgba_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_blend_rgb_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_blend_rgb_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_copy_rgba_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_copy_rgba_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_copy_rgb_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_add_blend_rgba_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, 
int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_add_blend_rgba_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_add_blend_rgb_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_add_blend_rgb_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_add_copy_rgba_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_add_copy_rgba_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_add_copy_rgb_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_subtract_blend_rgba_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_subtract_blend_rgba_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_subtract_blend_rgb_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_subtract_blend_rgb_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_subtract_copy_rgba_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_subtract_copy_rgba_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_subtract_copy_rgb_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_reshade_blend_rgba_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_reshade_blend_rgba_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst, + 
int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_reshade_blend_rgb_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_reshade_blend_rgb_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_reshade_copy_rgba_to_rgb_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_reshade_copy_rgba_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); +void +__imlib_amd64_reshade_copy_rgb_to_rgba_cmod(DATA32 *src, int sw, DATA32 *dst, + int dw, int w, int h, ImlibColorModifier *cm); #endif #endif