#include <config.h>
#include "asm.h"

/*\
|*| AMD64 SSE2 assembly blending routines for Imlib2
|*| Written by John Slaten <zartheenumerator@comcast.net>
|*| Based on MMX routines written by Willem Monsuwe <willem@stack.nl>
\*/

/*\ Some useful masks \*/
.data
.align 16
m0X000000:         .byte 0, 0, 0, 0, 0, 0, 255, 0
                   .byte 0, 0, 0, 0, 0, 0, 255, 0
m10000000:         .byte 0, 0, 0, 0, 0, 0, 0, 1
                   .byte 0, 0, 0, 0, 0, 0, 0, 1
m00XXXXXX:         .byte 255, 255, 255, 255, 255, 255, 0, 0
                   .byte 255, 255, 255, 255, 255, 255, 0, 0
mVX000000:         .byte 0, 0, 0, 0, 0, 0, 255, 127
                   .byte 0, 0, 0, 0, 0, 0, 255, 127
mV0000000:         .byte 0, 0, 0, 0, 0, 0, 0, 128
                   .byte 0, 0, 0, 0, 0, 0, 0, 128
mX000X000:         .byte 0, 0, 0, 0, 0, 0, 255, 255
                   .byte 0, 0, 0, 0, 0, 0, 255, 255
m0XXX0XXX0XXX0XXX: .byte 255, 255, 255, 0, 255, 255, 255, 0
                   .byte 255, 255, 255, 0, 255, 255, 255, 0
m0XXX0XXX00000000: .byte 255, 255, 255, 0, 255, 255, 255, 0
                   .byte 0, 0, 0, 0, 0, 0, 0, 0
m0XXX000000000000: .byte 255, 255, 255, 0, 0, 0, 0, 0
                   .byte 0, 0, 0, 0, 0, 0, 0, 0
mX000X000X000X000: .byte 0, 0, 0, 255, 0, 0, 0, 255
                   .byte 0, 0, 0, 255, 0, 0, 0, 255
mX000X00000000000: .byte 0, 0, 0, 255, 0, 0, 0, 255
                   .byte 0, 0, 0, 255, 0, 0, 0, 255
mX000000000000000: .byte 0, 0, 0, 255, 0, 0, 0, 255
                   .byte 0, 0, 0, 255, 0, 0, 0, 255
m1000100010001000: .byte 0, 0, 0, 1, 0, 0, 0, 1
                   .byte 0, 0, 0, 1, 0, 0, 0, 1
m000V0V0V000V0V0V: .byte 127, 0, 127, 0, 127, 0, 0, 0
                   .byte 127, 0, 127, 0, 127, 0, 0, 0
mI0000000I0000000: .byte 0, 0, 0, 0, 0, 0, 0, 64
                   .byte 0, 0, 0, 0, 0, 0, 0, 64
m0VVV0VVV0VVV0VVV: .byte 127, 127, 127, 0, 127, 127, 127, 0
                   .byte 127, 127, 127, 0, 127, 127, 127, 0
c1:                .word 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1
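
/* Note on the mask names (a reading inferred from the values above, not
 * from the original author): each character after the leading 'm' stands
 * for one byte, most significant first, with X = 0xff, V = 0x7f, I = 0x40
 * and 1 = 0x01, and the two 8-byte .byte rows of each entry together form
 * one 16-byte XMM constant.  So m0XXX0XXX0XXX0XXX keeps the RGB bytes of
 * four pixels and clears their alpha, while mX000X000X000X000 keeps only
 * the four alpha bytes.
 */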

/*\ All functions have the same calling convention:
|*| __imlib_amd64_<op>_rgba_to_rgb[A](void *src, int sw, void *dst, int dw,
|*|                                   int w, int h, ImlibColorModifier *cm)
|*| AMD64 GCC passes parameters in registers, so no aliases exist in this version.
\*/
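
/*\ Under the System V AMD64 ABI that prototype arrives as:
|*| %rdi = src, %rsi = sw, %rdx = dst, %rcx = dw, %r8d = w, %r9d = h,
|*| with cm at 16(%rbp) once the frame is set up; the ENTER macro below
|*| shuffles these into the register layout documented next.
\*/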

.text
.align 16
FN_(imlib_amd64_blend_rgba_to_rgb)
FN_(imlib_amd64_blend_rgba_to_rgba)
FN_(imlib_amd64_copy_rgba_to_rgb)
FN_(imlib_amd64_copy_rgba_to_rgba)

FN_(imlib_amd64_copy_rgb_to_rgba)
FN_(imlib_amd64_add_blend_rgba_to_rgb)
FN_(imlib_amd64_add_blend_rgba_to_rgba)
FN_(imlib_amd64_add_copy_rgba_to_rgb)
FN_(imlib_amd64_add_copy_rgba_to_rgba)
FN_(imlib_amd64_add_copy_rgb_to_rgba)

FN_(imlib_amd64_subtract_blend_rgba_to_rgb)
FN_(imlib_amd64_subtract_blend_rgba_to_rgba)
FN_(imlib_amd64_subtract_copy_rgba_to_rgb)
FN_(imlib_amd64_subtract_copy_rgba_to_rgba)
FN_(imlib_amd64_subtract_copy_rgb_to_rgba)

FN_(imlib_amd64_reshade_blend_rgba_to_rgb)
FN_(imlib_amd64_reshade_blend_rgba_to_rgba)
FN_(imlib_amd64_reshade_copy_rgba_to_rgb)
FN_(imlib_amd64_reshade_copy_rgba_to_rgba)
FN_(imlib_amd64_reshade_copy_rgb_to_rgba)

.extern pow_lut

/*\ SSE register use:
|*| %xmm1 = Source value
|*| %xmm2 = Destination value
|*| %xmm3 = Alpha value
|*| %xmm4 = 0
|*| %xmm5-%xmm7 = masks
\*/

/*\ Variables:
|*| %rsi = src
|*| %rdi = dst
|*| %r8d = w
|*| %r9d = h
|*| %r10d = sw
|*| %r11d = dw
\*/
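
/*\ Loop bookkeeping used throughout: both pointers are advanced past the
|*| end of the current line and %rcx runs from a negative pixel count up
|*| to zero, so the incq/addq that steps the index also sets the flags
|*| that the js/jns/jz branches test, avoiding a separate compare.
\*/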

#define ENTER \
        pushq %rbp ; \
        movq %rsp, %rbp ; \
        pushq %rbx ; \
        pushq %r13 ; \
        pushq %r14 ; \
        movq %rsi, %r10 ; \
        movq %rcx, %r11 ; \
        movq %rdi, %rsi ; \
        movq %rdx, %rdi ; \
        movq 16(%rbp), %r14 ; \
        ; \
        /* param sanity check */ ; \
        testq %r8, %r8 ; \
        jz 9f ; \
        testq %r9, %r9 ; \
        jz 9f

#define LEAVE \
        popq %r14 ; \
        popq %r13 ; \
        popq %rbx ; \
        movq %rbp, %rsp ; \
        popq %rbp ; \
        ret
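
/* Sketch of the fixed-point blend math used below (explanatory note, not
 * from the original source): punpcklbw x,x doubles each alpha byte into a
 * word, so alpha a becomes a*257; psrlw $1 halves that into [0, 0x7fff] so
 * it stays non-negative for the signed pmulhw, which computes
 * (a' * x) >> 16.  With x = 2*(s - d) + 1 from psllw plus the c1 word,
 * the high word is approximately
 *     ((a*257/2) * (2*(s - d) + 1)) >> 16  ~=  a*(s - d)/255
 * which paddsw then adds to d, giving the usual d + a*(s - d) blend.
 */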

PR_(imlib_amd64_blend_rgba_to_rgb):
        ENTER

        pxor %xmm4, %xmm4
        movdqu c1(%rip), %xmm5
        movdqu m00XXXXXX(%rip), %xmm6

        /* Move right to left across each line, */
        /* processing in two pixel chunks */
        leaq (%rsi, %r8, 4), %rsi
        leaq (%rdi, %r8, 4), %rdi

        /* Last instruction is %rcx = 0 */
        subq $4, %rsi
        subq $4, %rdi

        negq %r8
0:
        movq %r8, %rcx

        incq %rcx

        /* prefetch a couple cache lines ahead */
        prefetchnta (%rsi, %rcx, 4)
        prefetcht0 (%rdi, %rcx, 4)
        prefetchnta 64(%rsi, %rcx, 4)
        prefetcht0 64(%rdi, %rcx, 4)

        jz 2f /* one pixel line */
1:
        /* main loop, unrolled to work on 64 byte chunks */
        prefetchnta 128(%rsi, %rcx, 4)
        prefetcht0 128(%rdi, %rcx, 4)

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Get alpha from source and unpack to words
         * Result range is [0, 0x7fff], and is mapped to
         * fixed-point values in [0.0, 1.0) by using the high word
         * of the 32 bit multiplication result.
         * Because we want the unsigned value, we shift right one
         * here and also shift left the other factors to compensate.
         */
        movq %xmm1, %xmm3
        punpcklbw %xmm3, %xmm3
        pshufhw $0xFF, %xmm3, %xmm3
        pshuflw $0xFF, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* Zero out the alpha channel of the source to leave the
         * destination alpha unchanged.
         */
        pand %xmm6, %xmm3

        /* Unpack src and dst to words */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * ((s - d) + 0.5)) */
        psubw %xmm2, %xmm1
        psllw $1, %xmm1
        paddw %xmm5, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* Repack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Get alpha from source and unpack to words
         * Result range is [0, 0x7fff], and is mapped to
         * fixed-point values in [0.0, 1.0) by using the high word
         * of the 32 bit multiplication result.
         * Because we want the unsigned value, we shift right one
         * here and also shift left the other factors to compensate.
         */
        movq %xmm1, %xmm3
        punpcklbw %xmm3, %xmm3
        pshufhw $0xFF, %xmm3, %xmm3
        pshuflw $0xFF, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* Zero out the alpha channel of the source to leave the
         * destination alpha unchanged.
         */
        pand %xmm6, %xmm3

        /* Unpack src and dst to words */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * ((s - d) + 0.5)) */
        psubw %xmm2, %xmm1
        psllw $1, %xmm1
        paddw %xmm5, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* Repack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Get alpha from source and unpack to words
         * Result range is [0, 0x7fff], and is mapped to
         * fixed-point values in [0.0, 1.0) by using the high word
         * of the 32 bit multiplication result.
         * Because we want the unsigned value, we shift right one
         * here and also shift left the other factors to compensate.
         */
        movq %xmm1, %xmm3
        punpcklbw %xmm3, %xmm3
        pshufhw $0xFF, %xmm3, %xmm3
        pshuflw $0xFF, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* Zero out the alpha channel of the source to leave the
         * destination alpha unchanged.
         */
        pand %xmm6, %xmm3

        /* Unpack src and dst to words */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * ((s - d) + 0.5)) */
        psubw %xmm2, %xmm1
        psllw $1, %xmm1
        paddw %xmm5, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* Repack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Get alpha from source and unpack to words
         * Result range is [0, 0x7fff], and is mapped to
         * fixed-point values in [0.0, 1.0) by using the high word
         * of the 32 bit multiplication result.
         * Because we want the unsigned value, we shift right one
         * here and also shift left the other factors to compensate.
         */
        movq %xmm1, %xmm3
        punpcklbw %xmm3, %xmm3
        pshufhw $0xFF, %xmm3, %xmm3
        pshuflw $0xFF, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* Zero out the alpha channel of the source to leave the
         * destination alpha unchanged.
         */
        pand %xmm6, %xmm3

        /* Unpack src and dst to words */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * ((s - d) + 0.5)) */
        psubw %xmm2, %xmm1
        psllw $1, %xmm1
        paddw %xmm5, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* Repack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Get alpha from source and unpack to words
         * Result range is [0, 0x7fff], and is mapped to
         * fixed-point values in [0.0, 1.0) by using the high word
         * of the 32 bit multiplication result.
         * Because we want the unsigned value, we shift right one
         * here and also shift left the other factors to compensate.
         */
        movq %xmm1, %xmm3
        punpcklbw %xmm3, %xmm3
        pshufhw $0xFF, %xmm3, %xmm3
        pshuflw $0xFF, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* Zero out the alpha channel of the source to leave the
         * destination alpha unchanged.
         */
        pand %xmm6, %xmm3

        /* Unpack src and dst to words */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * ((s - d) + 0.5)) */
        psubw %xmm2, %xmm1
        psllw $1, %xmm1
        paddw %xmm5, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* Repack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Get alpha from source and unpack to words
         * Result range is [0, 0x7fff], and is mapped to
         * fixed-point values in [0.0, 1.0) by using the high word
         * of the 32 bit multiplication result.
         * Because we want the unsigned value, we shift right one
         * here and also shift left the other factors to compensate.
         */
        movq %xmm1, %xmm3
        punpcklbw %xmm3, %xmm3
        pshufhw $0xFF, %xmm3, %xmm3
        pshuflw $0xFF, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* Zero out the alpha channel of the source to leave the
         * destination alpha unchanged.
         */
        pand %xmm6, %xmm3

        /* Unpack src and dst to words */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * ((s - d) + 0.5)) */
        psubw %xmm2, %xmm1
        psllw $1, %xmm1
        paddw %xmm5, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* Repack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Get alpha from source and unpack to words
         * Result range is [0, 0x7fff], and is mapped to
         * fixed-point values in [0.0, 1.0) by using the high word
         * of the 32 bit multiplication result.
         * Because we want the unsigned value, we shift right one
         * here and also shift left the other factors to compensate.
         */
        movq %xmm1, %xmm3
        punpcklbw %xmm3, %xmm3
        pshufhw $0xFF, %xmm3, %xmm3
        pshuflw $0xFF, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* Zero out the alpha channel of the source to leave the
         * destination alpha unchanged.
         */
        pand %xmm6, %xmm3

        /* Unpack src and dst to words */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * ((s - d) + 0.5)) */
        psubw %xmm2, %xmm1
        psllw $1, %xmm1
        paddw %xmm5, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* Repack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Get alpha from source and unpack to words
         * Result range is [0, 0x7fff], and is mapped to
         * fixed-point values in [0.0, 1.0) by using the high word
         * of the 32 bit multiplication result.
         * Because we want the unsigned value, we shift right one
         * here and also shift left the other factors to compensate.
         */
        movq %xmm1, %xmm3
        punpcklbw %xmm3, %xmm3
        pshufhw $0xFF, %xmm3, %xmm3
        pshuflw $0xFF, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* Zero out the alpha channel of the source to leave the
         * destination alpha unchanged.
         */
        pand %xmm6, %xmm3

        /* Unpack src and dst to words */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * ((s - d) + 0.5)) */
        psubw %xmm2, %xmm1
        psllw $1, %xmm1
        paddw %xmm5, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* Repack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        js 1b
        jnz 3f
2:
        movd (%rsi, %rcx, 4), %xmm1
        movd (%rdi, %rcx, 4), %xmm2
        /* Get alpha from source and unpack to words
         * Result range is [0, 0x7fff], and is mapped to
         * fixed-point values in [0.0, 1.0) by using the high word
         * of the 32 bit multiplication result.
         * Because we want the unsigned value, we shift right one
         * here and also shift left the other factors to compensate.
         */
        movq %xmm1, %xmm3
        punpcklbw %xmm3, %xmm3
        pshufhw $0xFF, %xmm3, %xmm3
        pshuflw $0xFF, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* Zero out the alpha channel of the source to leave the
         * destination alpha unchanged.
         */
        pand %xmm6, %xmm3

        /* Unpack src and dst to words */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * ((s - d) + 0.5)) */
        psubw %xmm2, %xmm1
        psllw $1, %xmm1
        paddw %xmm5, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* Repack new pixels */
        packuswb %xmm4, %xmm2
        movd %xmm2, (%rdi, %rcx, 4)
3:
        leaq (%rsi, %r10, 4), %rsi
        leaq (%rdi, %r11, 4), %rdi
        decq %r9
        jnz 0b

9:
        LEAVE
SIZE(imlib_amd64_blend_rgba_to_rgb)
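
/* The *_rgba variants below fetch their effective alpha from pow_lut
 * (built in blend.c): the movzbq/%dh pairs assemble
 * (src_alpha << 8) | dst_alpha in %rdx, so the byte loaded from
 * (%r13, %rdx) appears to be the table's combined alpha for that src/dst
 * pair, and %ax ends up holding src alpha in %ah and combined alpha
 * in %al.
 */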
PR_(imlib_amd64_blend_rgba_to_rgba):
        ENTER

        pxor %xmm4, %xmm4
        movdqu c1(%rip), %xmm5
        xorq %rax, %rax
        movdqu mX000X000X000X000(%rip), %xmm6
        movq pow_lut@GOTPCREL(%rip), %r13

        /* Move right to left across each line, */
        /* processing in two pixel chunks */
        leaq (%rsi, %r8, 4), %rsi
        leaq (%rdi, %r8, 4), %rdi

        /* Last instruction is %rcx = 0 */
        subq $4, %rsi
        subq $4, %rdi

        negq %r8
0:
        movq %r8, %rcx

        incq %rcx

        /* prefetch a couple cache lines ahead */
        prefetchnta (%rsi, %rcx, 4)
        prefetcht0 (%rdi, %rcx, 4)
        prefetchnta 64(%rsi, %rcx, 4)
        prefetcht0 64(%rdi, %rcx, 4)

        jz 2f /* one pixel line */
1:
        /* main loop, unrolled to work on 64 byte chunks */
        prefetchnta 128(%rsi, %rcx, 4)
        prefetcht0 128(%rdi, %rcx, 4)

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Load two pixels as 00, 00, src alpha, combined alpha
         * Combined alpha is derived from the pow_lut table in blend.c
         */
        movzbq 7(%rdi, %rcx, 4), %rdx
        movb 7(%rsi, %rcx, 4), %dh
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movzbq 3(%rdi, %rcx, 4), %rdx
        movb 3(%rsi, %rcx, 4), %dh
        shlq $32, %rax
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movd %rax, %xmm3
        /* override source alpha to 255 */
        por %xmm6, %xmm1

        /* unpack alpha to src alpha, combined alpha x 3 */
        punpcklbw %xmm3, %xmm3
        pshufhw $0x40, %xmm3, %xmm3
        pshuflw $0x40, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* unpack source and dest */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * ((s - d) + 0.5)) */
        psubw %xmm2, %xmm1
        psllw $1, %xmm1
        paddw %xmm5, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* repack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Load two pixels as 00, 00, src alpha, combined alpha
         * Combined alpha is derived from the pow_lut table in blend.c
         */
        movzbq 7(%rdi, %rcx, 4), %rdx
        movb 7(%rsi, %rcx, 4), %dh
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movzbq 3(%rdi, %rcx, 4), %rdx
        movb 3(%rsi, %rcx, 4), %dh
        shlq $32, %rax
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movd %rax, %xmm3
        /* override source alpha to 255 */
        por %xmm6, %xmm1

        /* unpack alpha to src alpha, combined alpha x 3 */
        punpcklbw %xmm3, %xmm3
        pshufhw $0x40, %xmm3, %xmm3
        pshuflw $0x40, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* unpack source and dest */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * ((s - d) + 0.5)) */
        psubw %xmm2, %xmm1
        psllw $1, %xmm1
        paddw %xmm5, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* repack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Load two pixels as 00, 00, src alpha, combined alpha
         * Combined alpha is derived from the pow_lut table in blend.c
         */
        movzbq 7(%rdi, %rcx, 4), %rdx
        movb 7(%rsi, %rcx, 4), %dh
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movzbq 3(%rdi, %rcx, 4), %rdx
        movb 3(%rsi, %rcx, 4), %dh
        shlq $32, %rax
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movd %rax, %xmm3
        /* override source alpha to 255 */
        por %xmm6, %xmm1

        /* unpack alpha to src alpha, combined alpha x 3 */
        punpcklbw %xmm3, %xmm3
        pshufhw $0x40, %xmm3, %xmm3
        pshuflw $0x40, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* unpack source and dest */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * ((s - d) + 0.5)) */
        psubw %xmm2, %xmm1
        psllw $1, %xmm1
        paddw %xmm5, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* repack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Load two pixels as 00, 00, src alpha, combined alpha
         * Combined alpha is derived from the pow_lut table in blend.c
         */
        movzbq 7(%rdi, %rcx, 4), %rdx
        movb 7(%rsi, %rcx, 4), %dh
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movzbq 3(%rdi, %rcx, 4), %rdx
        movb 3(%rsi, %rcx, 4), %dh
        shlq $32, %rax
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movd %rax, %xmm3
        /* override source alpha to 255 */
        por %xmm6, %xmm1

        /* unpack alpha to src alpha, combined alpha x 3 */
        punpcklbw %xmm3, %xmm3
        pshufhw $0x40, %xmm3, %xmm3
        pshuflw $0x40, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* unpack source and dest */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * ((s - d) + 0.5)) */
        psubw %xmm2, %xmm1
        psllw $1, %xmm1
        paddw %xmm5, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* repack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Load two pixels as 00, 00, src alpha, combined alpha
         * Combined alpha is derived from the pow_lut table in blend.c
         */
        movzbq 7(%rdi, %rcx, 4), %rdx
        movb 7(%rsi, %rcx, 4), %dh
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movzbq 3(%rdi, %rcx, 4), %rdx
        movb 3(%rsi, %rcx, 4), %dh
        shlq $32, %rax
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movd %rax, %xmm3
        /* override source alpha to 255 */
        por %xmm6, %xmm1

        /* unpack alpha to src alpha, combined alpha x 3 */
        punpcklbw %xmm3, %xmm3
        pshufhw $0x40, %xmm3, %xmm3
        pshuflw $0x40, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* unpack source and dest */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * ((s - d) + 0.5)) */
        psubw %xmm2, %xmm1
        psllw $1, %xmm1
        paddw %xmm5, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* repack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Load two pixels as 00, 00, src alpha, combined alpha
         * Combined alpha is derived from the pow_lut table in blend.c
         */
        movzbq 7(%rdi, %rcx, 4), %rdx
        movb 7(%rsi, %rcx, 4), %dh
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movzbq 3(%rdi, %rcx, 4), %rdx
        movb 3(%rsi, %rcx, 4), %dh
        shlq $32, %rax
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movd %rax, %xmm3
        /* override source alpha to 255 */
        por %xmm6, %xmm1

        /* unpack alpha to src alpha, combined alpha x 3 */
        punpcklbw %xmm3, %xmm3
        pshufhw $0x40, %xmm3, %xmm3
        pshuflw $0x40, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* unpack source and dest */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * ((s - d) + 0.5)) */
        psubw %xmm2, %xmm1
        psllw $1, %xmm1
        paddw %xmm5, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* repack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Load two pixels as 00, 00, src alpha, combined alpha
         * Combined alpha is derived from the pow_lut table in blend.c
         */
        movzbq 7(%rdi, %rcx, 4), %rdx
        movb 7(%rsi, %rcx, 4), %dh
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movzbq 3(%rdi, %rcx, 4), %rdx
        movb 3(%rsi, %rcx, 4), %dh
        shlq $32, %rax
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movd %rax, %xmm3
        /* override source alpha to 255 */
        por %xmm6, %xmm1

        /* unpack alpha to src alpha, combined alpha x 3 */
        punpcklbw %xmm3, %xmm3
        pshufhw $0x40, %xmm3, %xmm3
        pshuflw $0x40, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* unpack source and dest */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * ((s - d) + 0.5)) */
        psubw %xmm2, %xmm1
        psllw $1, %xmm1
        paddw %xmm5, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* repack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Load two pixels as 00, 00, src alpha, combined alpha
         * Combined alpha is derived from the pow_lut table in blend.c
         */
        movzbq 7(%rdi, %rcx, 4), %rdx
        movb 7(%rsi, %rcx, 4), %dh
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movzbq 3(%rdi, %rcx, 4), %rdx
        movb 3(%rsi, %rcx, 4), %dh
        shlq $32, %rax
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movd %rax, %xmm3
        /* override source alpha to 255 */
        por %xmm6, %xmm1

        /* unpack alpha to src alpha, combined alpha x 3 */
        punpcklbw %xmm3, %xmm3
        pshufhw $0x40, %xmm3, %xmm3
        pshuflw $0x40, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* unpack source and dest */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * ((s - d) + 0.5)) */
        psubw %xmm2, %xmm1
        psllw $1, %xmm1
        paddw %xmm5, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* repack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        js 1b
        jnz 3f
2:
        movd (%rsi, %rcx, 4), %xmm1
        movd (%rdi, %rcx, 4), %xmm2
        /* Load one pixel as 00, 00, src alpha, combined alpha
         * Combined alpha is derived from the pow_lut table in blend.c
         */
        movzbq 3(%rdi, %rcx, 4), %rdx
        movb 3(%rsi, %rcx, 4), %dh
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movd %eax, %xmm3
        /* override source alpha to 255 */
        por %xmm6, %xmm1

        /* unpack alpha to src alpha, combined alpha x 3 */
        punpcklbw %xmm3, %xmm3
        pshufhw $0x40, %xmm3, %xmm3
        pshuflw $0x40, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* unpack source and dest */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * ((s - d) + 0.5)) */
        psubw %xmm2, %xmm1
        psllw $1, %xmm1
        paddw %xmm5, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* repack new pixels */
        packuswb %xmm4, %xmm2
        movd %xmm2, (%rdi, %rcx, 4)
3:
        leaq (%rsi, %r10, 4), %rsi
        leaq (%rdi, %r11, 4), %rdi
        decq %r9
        jnz 0b

9:
        LEAVE
SIZE(imlib_amd64_blend_rgba_to_rgba)
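
/* The copy-style routines below all follow one alignment scheme: copy
 * single pixels until the destination is 16-byte aligned, then run the
 * unrolled body with movdqa stores, picking a movdqa or movdqu load path
 * depending on whether the source happens to be aligned too, and finish
 * the last few pixels of each line with movd again.
 */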
PR_(imlib_amd64_copy_rgba_to_rgb):
        ENTER

        movdqu m0XXX0XXX0XXX0XXX(%rip), %xmm5
        movdqu mX000X000X000X000(%rip), %xmm6

        leaq (%rsi, %r8, 4), %rsi
        leaq (%rdi, %r8, 4), %rdi

        subq $12, %rsi
        subq $12, %rdi

        negq %r8
0:
        movq %r8, %rcx

        /* if < 4 pixels left, goto end */
        addq $3, %rcx
        jns 4f
1:
        /* 16 byte align dst ptr */
        leaq (%rdi, %rcx, 4), %rdx
        test $0x0f, %rdx
        jz 1f

        movd (%rsi, %rcx, 4), %xmm1
        movd (%rdi, %rcx, 4), %xmm2
        /* d = (s & 0x00ffffff) | (d & 0xff000000) */
        pand %xmm5, %xmm1
        pand %xmm6, %xmm2
        por %xmm1, %xmm2
        movd %xmm2, (%rdi, %rcx, 4)
        incq %rcx
        jz 4f
        jmp 1b
1:
        /* prefetch a couple cache lines ahead */
        prefetchnta (%rsi, %rcx, 4)
        prefetcht0 (%rdi, %rcx, 4)
        prefetchnta 64(%rsi, %rcx, 4)
        prefetcht0 64(%rdi, %rcx, 4)

        /* test if 16 byte aligned src ptr */
        leaq (%rsi, %rcx, 4), %rdx
        test $0x0f, %rdx
        jnz 3f
2:
        /* main loop, unrolled to work on 64 byte chunks */
        /* aligned src, aligned dst */
        prefetchnta 128(%rsi, %rcx, 4)
        prefetcht0 128(%rdi, %rcx, 4)

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* d = (s & 0x00ffffff) | (d & 0xff000000) */
        pand %xmm5, %xmm1
        pand %xmm6, %xmm2
        por %xmm1, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* d = (s & 0x00ffffff) | (d & 0xff000000) */
        pand %xmm5, %xmm1
        pand %xmm6, %xmm2
        por %xmm1, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* d = (s & 0x00ffffff) | (d & 0xff000000) */
        pand %xmm5, %xmm1
        pand %xmm6, %xmm2
        por %xmm1, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* d = (s & 0x00ffffff) | (d & 0xff000000) */
        pand %xmm5, %xmm1
        pand %xmm6, %xmm2
        por %xmm1, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        js 2b
        jmp 4f
3:
        /* main loop, unrolled to work on 64 byte chunks */
        /* unaligned src, aligned dst */
        prefetchnta 128(%rsi, %rcx, 4)
        prefetcht0 128(%rdi, %rcx, 4)

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* d = (s & 0x00ffffff) | (d & 0xff000000) */
        pand %xmm5, %xmm1
        pand %xmm6, %xmm2
        por %xmm1, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* d = (s & 0x00ffffff) | (d & 0xff000000) */
        pand %xmm5, %xmm1
        pand %xmm6, %xmm2
        por %xmm1, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* d = (s & 0x00ffffff) | (d & 0xff000000) */
        pand %xmm5, %xmm1
        pand %xmm6, %xmm2
        por %xmm1, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* d = (s & 0x00ffffff) | (d & 0xff000000) */
        pand %xmm5, %xmm1
        pand %xmm6, %xmm2
        por %xmm1, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        js 3b
4:
        /* finish loop */
        cmp $2, %rcx
        jg 5f

        movd (%rsi, %rcx, 4), %xmm1
        movd (%rdi, %rcx, 4), %xmm2
        /* d = (s & 0x00ffffff) | (d & 0xff000000) */
        pand %xmm5, %xmm1
        pand %xmm6, %xmm2
        por %xmm1, %xmm2
        movd %xmm2, (%rdi, %rcx, 4)
        incq %rcx
        jmp 4b
5:
        /* finish line */
        leaq (%rsi, %r10, 4), %rsi
        leaq (%rdi, %r11, 4), %rdi
        decq %r9
        jnz 0b

9:
        LEAVE
SIZE(imlib_amd64_copy_rgba_to_rgb)
PR_(imlib_amd64_copy_rgba_to_rgba):
        ENTER

        leaq (%rsi, %r8, 4), %rsi
        leaq (%rdi, %r8, 4), %rdi

        subq $12, %rsi
        subq $12, %rdi

        negq %r8
0:
        movq %r8, %rcx

        /* if < 4 pixels left, goto end */
        addq $3, %rcx
        jns 4f
1:
        /* 16 byte align dst ptr */
        leaq (%rdi, %rcx, 4), %rdx
        test $0x0f, %rdx
        jz 1f

        movd (%rsi, %rcx, 4), %xmm1
        movd %xmm1, (%rdi, %rcx, 4)
        incq %rcx
        jz 4f
        jmp 1b
1:
        /* prefetch a couple cache lines ahead */
        prefetchnta (%rsi, %rcx, 4)
        prefetcht0 (%rdi, %rcx, 4)
        prefetchnta 64(%rsi, %rcx, 4)
        prefetcht0 64(%rdi, %rcx, 4)

        /* test if 16 byte aligned src ptr */
        leaq (%rsi, %rcx, 4), %rdx
        test $0x0f, %rdx
        jnz 3f
2:
        /* main loop, unrolled to work on 64 byte chunks */
        /* aligned src, aligned dst */
        prefetchnta 128(%rsi, %rcx, 4)
        prefetcht0 128(%rdi, %rcx, 4)

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa %xmm1, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa %xmm1, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa %xmm1, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa %xmm1, (%rdi, %rcx, 4)
        addq $4, %rcx
        js 2b
        jmp 4f
3:
        /* main loop, unrolled to work on 64 byte chunks */
        /* unaligned src, aligned dst */
        prefetchnta 128(%rsi, %rcx, 4)
        prefetcht0 128(%rdi, %rcx, 4)

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa %xmm1, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa %xmm1, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa %xmm1, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa %xmm1, (%rdi, %rcx, 4)
        addq $4, %rcx
        js 3b
4:
        /* finish loop */
        cmp $2, %rcx
        jg 5f

        movd (%rsi, %rcx, 4), %xmm1
        movd %xmm1, (%rdi, %rcx, 4)
        incq %rcx
        jmp 4b
5:
        /* finish line */
        leaq (%rsi, %r10, 4), %rsi
        leaq (%rdi, %r11, 4), %rdi
        decq %r9
        jnz 0b

9:
        LEAVE
SIZE(imlib_amd64_copy_rgba_to_rgba)
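
/* copy_rgb_to_rgba only needs to force the alpha channel: the por with
 * mX000X000X000X000 sets the top byte of each of the four pixels to 0xff
 * while the RGB bytes pass through from the source unchanged.
 */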
PR_(imlib_amd64_copy_rgb_to_rgba):
        ENTER

        movdqu mX000X000X000X000(%rip), %xmm5

        leaq (%rsi, %r8, 4), %rsi
        leaq (%rdi, %r8, 4), %rdi

        subq $12, %rsi
        subq $12, %rdi

        negq %r8
0:
        movq %r8, %rcx

        /* if < 4 pixels left, goto end */
        addq $3, %rcx
        jns 4f
1:
        /* 16 byte align dst ptr */
        leaq (%rdi, %rcx, 4), %rdx
        test $0x0f, %rdx
        jz 1f

        movd (%rsi, %rcx, 4), %xmm1
        /* d = s | 0xff000000 */
        por %xmm5, %xmm1
        movd %xmm1, (%rdi, %rcx, 4)
        incq %rcx
        jz 4f
        jmp 1b
1:
        /* prefetch a couple cache lines ahead */
        prefetchnta (%rsi, %rcx, 4)
        prefetcht0 (%rdi, %rcx, 4)
        prefetchnta 64(%rsi, %rcx, 4)
        prefetcht0 64(%rdi, %rcx, 4)

        /* test if 16 byte aligned src ptr */
        leaq (%rsi, %rcx, 4), %rdx
        test $0x0f, %rdx
        jnz 3f
2:
        /* main loop, unrolled to work on 64 byte chunks */
        /* aligned src, aligned dst */
        prefetchnta 128(%rsi, %rcx, 4)
        prefetcht0 128(%rdi, %rcx, 4)

        movdqa (%rsi, %rcx, 4), %xmm1
        /* d = s | 0xff000000 */
        por %xmm5, %xmm1
        movdqa %xmm1, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        /* d = s | 0xff000000 */
        por %xmm5, %xmm1
        movdqa %xmm1, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        /* d = s | 0xff000000 */
        por %xmm5, %xmm1
        movdqa %xmm1, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        /* d = s | 0xff000000 */
        por %xmm5, %xmm1
        movdqa %xmm1, (%rdi, %rcx, 4)
        addq $4, %rcx
        js 2b
        jmp 4f
3:
        /* main loop, unrolled to work on 64 byte chunks */
        /* unaligned src, aligned dst */
        prefetchnta 128(%rsi, %rcx, 4)
        prefetcht0 128(%rdi, %rcx, 4)

        movdqu (%rsi, %rcx, 4), %xmm1
        /* d = s | 0xff000000 */
        por %xmm5, %xmm1
        movdqa %xmm1, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        /* d = s | 0xff000000 */
        por %xmm5, %xmm1
        movdqa %xmm1, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        /* d = s | 0xff000000 */
        por %xmm5, %xmm1
        movdqa %xmm1, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        /* d = s | 0xff000000 */
        por %xmm5, %xmm1
        movdqa %xmm1, (%rdi, %rcx, 4)
        addq $4, %rcx
        js 3b
4:
        /* finish loop */
        cmp $2, %rcx
        jg 5f

        movd (%rsi, %rcx, 4), %xmm1
        /* d = s | 0xff000000 */
        por %xmm5, %xmm1
        movd %xmm1, (%rdi, %rcx, 4)
        incq %rcx
        jmp 4b
5:
        /* finish line */
        leaq (%rsi, %r10, 4), %rsi
        leaq (%rdi, %r11, 4), %rdi
        decq %r9
        jnz 0b

9:
        LEAVE
SIZE(imlib_amd64_copy_rgb_to_rgba)
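
/* The add-blend variants reuse the alpha extraction above but drop the
 * (s - d) difference: the source channels are scaled by alpha and added
 * to the destination with paddsw, so the sums saturate rather than wrap.
 */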
PR_(imlib_amd64_add_blend_rgba_to_rgb):
        ENTER

        pxor %xmm4, %xmm4
        movdqu m00XXXXXX(%rip), %xmm6

        /* Move right to left across each line, */
        /* processing in two pixel chunks */
        leaq (%rsi, %r8, 4), %rsi
        leaq (%rdi, %r8, 4), %rdi

        /* Last instruction is %rcx = 0 */
        subq $4, %rsi
        subq $4, %rdi

        negq %r8
0:
        movq %r8, %rcx

        incq %rcx

        /* prefetch a couple cache lines ahead */
        prefetchnta (%rsi, %rcx, 4)
        prefetcht0 (%rdi, %rcx, 4)
        prefetchnta 64(%rsi, %rcx, 4)
        prefetcht0 64(%rdi, %rcx, 4)

        jz 2f /* one pixel line */
1:
        /* main loop, unrolled to work on 64 byte chunks */
        prefetchnta 128(%rsi, %rcx, 4)
        prefetcht0 128(%rdi, %rcx, 4)

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Get alpha from source and unpack to words
         * Result range is [0, 0x7fff], and is mapped to
         * fixed-point values in [0.0, 1.0) by using the high word
         * of the 32 bit multiplication result.
         * Because we want the unsigned value, we shift right one
         * here and also shift left the other factors to compensate.
         */
        movq %xmm1, %xmm3
        punpcklbw %xmm3, %xmm3
        pshufhw $0xFF, %xmm3, %xmm3
        pshuflw $0xFF, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* Zero out the alpha channel of the source to leave the
         * destination alpha unchanged.
         */
        pand %xmm6, %xmm3

        /* unpack src and dst to words */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * s) */
        psllw $1, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* pack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Get alpha from source and unpack to words
         * Result range is [0, 0x7fff], and is mapped to
         * fixed-point values in [0.0, 1.0) by using the high word
         * of the 32 bit multiplication result.
         * Because we want the unsigned value, we shift right one
         * here and also shift left the other factors to compensate.
         */
        movq %xmm1, %xmm3
        punpcklbw %xmm3, %xmm3
        pshufhw $0xFF, %xmm3, %xmm3
        pshuflw $0xFF, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* Zero out the alpha channel of the source to leave the
         * destination alpha unchanged.
         */
        pand %xmm6, %xmm3

        /* unpack src and dst to words */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * s) */
        psllw $1, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* pack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Get alpha from source and unpack to words
         * Result range is [0, 0x7fff], and is mapped to
         * fixed-point values in [0.0, 1.0) by using the high word
         * of the 32 bit multiplication result.
         * Because we want the unsigned value, we shift right one
         * here and also shift left the other factors to compensate.
         */
        movq %xmm1, %xmm3
        punpcklbw %xmm3, %xmm3
        pshufhw $0xFF, %xmm3, %xmm3
        pshuflw $0xFF, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* Zero out the alpha channel of the source to leave the
         * destination alpha unchanged.
         */
        pand %xmm6, %xmm3

        /* unpack src and dst to words */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * s) */
        psllw $1, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* pack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Get alpha from source and unpack to words
         * Result range is [0, 0x7fff], and is mapped to
         * fixed-point values in [0.0, 1.0) by using the high word
         * of the 32 bit multiplication result.
         * Because we want the unsigned value, we shift right one
         * here and also shift left the other factors to compensate.
         */
        movq %xmm1, %xmm3
        punpcklbw %xmm3, %xmm3
        pshufhw $0xFF, %xmm3, %xmm3
        pshuflw $0xFF, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* Zero out the alpha channel of the source to leave the
         * destination alpha unchanged.
         */
        pand %xmm6, %xmm3

        /* unpack src and dst to words */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * s) */
        psllw $1, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* pack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Get alpha from source and unpack to words
         * Result range is [0, 0x7fff], and is mapped to
         * fixed-point values in [0.0, 1.0) by using the high word
         * of the 32 bit multiplication result.
         * Because we want the unsigned value, we shift right one
         * here and also shift left the other factors to compensate.
         */
        movq %xmm1, %xmm3
        punpcklbw %xmm3, %xmm3
        pshufhw $0xFF, %xmm3, %xmm3
        pshuflw $0xFF, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* Zero out the alpha channel of the source to leave the
         * destination alpha unchanged.
         */
        pand %xmm6, %xmm3

        /* unpack src and dst to words */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * s) */
        psllw $1, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* pack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Get alpha from source and unpack to words
         * Result range is [0, 0x7fff], and is mapped to
         * fixed-point values in [0.0, 1.0) by using the high word
         * of the 32 bit multiplication result.
         * Because we want the unsigned value, we shift right one
         * here and also shift left the other factors to compensate.
         */
        movq %xmm1, %xmm3
        punpcklbw %xmm3, %xmm3
        pshufhw $0xFF, %xmm3, %xmm3
        pshuflw $0xFF, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* Zero out the alpha channel of the source to leave the
         * destination alpha unchanged.
         */
        pand %xmm6, %xmm3

        /* unpack src and dst to words */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * s) */
        psllw $1, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* pack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Get alpha from source and unpack to words
         * Result range is [0, 0x7fff], and is mapped to
         * fixed-point values in [0.0, 1.0) by using the high word
         * of the 32 bit multiplication result.
         * Because we want the unsigned value, we shift right one
         * here and also shift left the other factors to compensate.
         */
        movq %xmm1, %xmm3
        punpcklbw %xmm3, %xmm3
        pshufhw $0xFF, %xmm3, %xmm3
        pshuflw $0xFF, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* Zero out the alpha channel of the source to leave the
         * destination alpha unchanged.
         */
        pand %xmm6, %xmm3

        /* unpack src and dst to words */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * s) */
        psllw $1, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* pack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Get alpha from source and unpack to words
         * Result range is [0, 0x7fff], and is mapped to
         * fixed-point values in [0.0, 1.0) by using the high word
         * of the 32 bit multiplication result.
         * Because we want the unsigned value, we shift right one
         * here and also shift left the other factors to compensate.
         */
        movq %xmm1, %xmm3
        punpcklbw %xmm3, %xmm3
        pshufhw $0xFF, %xmm3, %xmm3
        pshuflw $0xFF, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* Zero out the alpha channel of the source to leave the
         * destination alpha unchanged.
         */
        pand %xmm6, %xmm3

        /* unpack src and dst to words */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * s) */
        psllw $1, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* pack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        js 1b
        jnz 3f
2:
        movd (%rsi, %rcx, 4), %xmm1
        movd (%rdi, %rcx, 4), %xmm2
        /* Get alpha from source and unpack to words
         * Result range is [0, 0x7fff], and is mapped to
         * fixed-point values in [0.0, 1.0) by using the high word
         * of the 32 bit multiplication result.
         * Because we want the unsigned value, we shift right one
         * here and also shift left the other factors to compensate.
         */
        movq %xmm1, %xmm3
        punpcklbw %xmm3, %xmm3
        pshufhw $0xFF, %xmm3, %xmm3
        pshuflw $0xFF, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* Zero out the alpha channel of the source to leave the
         * destination alpha unchanged.
         */
        pand %xmm6, %xmm3

        /* unpack src and dst to words */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (a * s) */
        psllw $1, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* pack new pixels */
        packuswb %xmm4, %xmm2
        movd %xmm2, (%rdi, %rcx, 4)
3:
        leaq (%rsi, %r10, 4), %rsi
        leaq (%rdi, %r11, 4), %rdi
        decq %r9
        jnz 0b

9:
        LEAVE
SIZE(imlib_amd64_add_blend_rgba_to_rgb)
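
/* In the rgba variant below, "src alpha = 255 - dst alpha" is computed
 * branch-free: por forces the source alpha bytes to 0xff, pand isolates
 * the destination alpha bytes, and psubusb subtracts them with unsigned
 * saturation; the term later added to the destination alpha is therefore
 * non-negative, so alpha only accumulates towards opaque.
 */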

PR_(imlib_amd64_add_blend_rgba_to_rgba):
        ENTER

        pxor %xmm4, %xmm4
        movdqu c1(%rip), %xmm5
        xorq %rax, %rax
        movdqu mX000X000X000X000(%rip), %xmm6
        movq pow_lut@GOTPCREL(%rip), %r13

        /* Move right to left across each line, */
        /* processing in two pixel chunks */
        leaq (%rsi, %r8, 4), %rsi
        leaq (%rdi, %r8, 4), %rdi

        /* Last instruction is %rcx = 0 */
        subq $4, %rsi
        subq $4, %rdi

        negq %r8
0:
        movq %r8, %rcx

        incq %rcx

        /* prefetch a couple cache lines ahead */
        prefetchnta (%rsi, %rcx, 4)
        prefetcht0 (%rdi, %rcx, 4)
        prefetchnta 64(%rsi, %rcx, 4)
        prefetcht0 64(%rdi, %rcx, 4)

        jz 2f /* one pixel line */
1:
        /* main loop, unrolled to work on 64 byte chunks */
        prefetchnta 128(%rsi, %rcx, 4)
        prefetcht0 128(%rdi, %rcx, 4)

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Load two pixels as 00, 00, src alpha, combined alpha
         * Combined alpha is derived from the pow_lut table in blend.c
         */
        movzbq 7(%rdi, %rcx, 4), %rdx
        movb 7(%rsi, %rcx, 4), %dh
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movzbq 3(%rdi, %rcx, 4), %rdx
        movb 3(%rsi, %rcx, 4), %dh
        shlq $32, %rax
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movd %rax, %xmm3
        /* unpack alpha to src alpha, combined alpha x 3 */
        punpcklbw %xmm3, %xmm3
        pshufhw $0x40, %xmm3, %xmm3
        pshuflw $0x40, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* src alpha = 255 - dst alpha */
        movdqa %xmm2, %xmm0
        por %xmm6, %xmm1
        pand %xmm6, %xmm0
        psubusb %xmm0, %xmm1

        /* unpack src and dst */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (s * ca) */
        psllw $1, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* pack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Load two pixels as 00, 00, src alpha, combined alpha
         * Combined alpha is derived from the pow_lut table in blend.c
         */
        movzbq 7(%rdi, %rcx, 4), %rdx
        movb 7(%rsi, %rcx, 4), %dh
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movzbq 3(%rdi, %rcx, 4), %rdx
        movb 3(%rsi, %rcx, 4), %dh
        shlq $32, %rax
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movd %rax, %xmm3
        /* unpack alpha to src alpha, combined alpha x 3 */
        punpcklbw %xmm3, %xmm3
        pshufhw $0x40, %xmm3, %xmm3
        pshuflw $0x40, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* src alpha = 255 - dst alpha */
        movdqa %xmm2, %xmm0
        por %xmm6, %xmm1
        pand %xmm6, %xmm0
        psubusb %xmm0, %xmm1

        /* unpack src and dst */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (s * ca) */
        psllw $1, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* pack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Load two pixels as 00, 00, src alpha, combined alpha
         * Combined alpha is derived from the pow_lut table in blend.c
         */
        movzbq 7(%rdi, %rcx, 4), %rdx
        movb 7(%rsi, %rcx, 4), %dh
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movzbq 3(%rdi, %rcx, 4), %rdx
        movb 3(%rsi, %rcx, 4), %dh
        shlq $32, %rax
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movd %rax, %xmm3
        /* unpack alpha to src alpha, combined alpha x 3 */
        punpcklbw %xmm3, %xmm3
        pshufhw $0x40, %xmm3, %xmm3
        pshuflw $0x40, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* src alpha = 255 - dst alpha */
        movdqa %xmm2, %xmm0
        por %xmm6, %xmm1
        pand %xmm6, %xmm0
        psubusb %xmm0, %xmm1

        /* unpack src and dst */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (s * ca) */
        psllw $1, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* pack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Load two pixels as 00, 00, src alpha, combined alpha
         * Combined alpha is derived from the pow_lut table in blend.c
         */
        movzbq 7(%rdi, %rcx, 4), %rdx
        movb 7(%rsi, %rcx, 4), %dh
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movzbq 3(%rdi, %rcx, 4), %rdx
        movb 3(%rsi, %rcx, 4), %dh
        shlq $32, %rax
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movd %rax, %xmm3
        /* unpack alpha to src alpha, combined alpha x 3 */
        punpcklbw %xmm3, %xmm3
        pshufhw $0x40, %xmm3, %xmm3
        pshuflw $0x40, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* src alpha = 255 - dst alpha */
        movdqa %xmm2, %xmm0
        por %xmm6, %xmm1
        pand %xmm6, %xmm0
        psubusb %xmm0, %xmm1

        /* unpack src and dst */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (s * ca) */
        psllw $1, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* pack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Load two pixels as 00, 00, src alpha, combined alpha
         * Combined alpha is derived from the pow_lut table in blend.c
         */
        movzbq 7(%rdi, %rcx, 4), %rdx
        movb 7(%rsi, %rcx, 4), %dh
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movzbq 3(%rdi, %rcx, 4), %rdx
        movb 3(%rsi, %rcx, 4), %dh
        shlq $32, %rax
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movd %rax, %xmm3
        /* unpack alpha to src alpha, combined alpha x 3 */
        punpcklbw %xmm3, %xmm3
        pshufhw $0x40, %xmm3, %xmm3
        pshuflw $0x40, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* src alpha = 255 - dst alpha */
        movdqa %xmm2, %xmm0
        por %xmm6, %xmm1
        pand %xmm6, %xmm0
        psubusb %xmm0, %xmm1

        /* unpack src and dst */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (s * ca) */
        psllw $1, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* pack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Load two pixels as 00, 00, src alpha, combined alpha
         * Combined alpha is derived from the pow_lut table in blend.c
         */
        movzbq 7(%rdi, %rcx, 4), %rdx
        movb 7(%rsi, %rcx, 4), %dh
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movzbq 3(%rdi, %rcx, 4), %rdx
        movb 3(%rsi, %rcx, 4), %dh
        shlq $32, %rax
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movd %rax, %xmm3
        /* unpack alpha to src alpha, combined alpha x 3 */
        punpcklbw %xmm3, %xmm3
        pshufhw $0x40, %xmm3, %xmm3
        pshuflw $0x40, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* src alpha = 255 - dst alpha */
        movdqa %xmm2, %xmm0
        por %xmm6, %xmm1
        pand %xmm6, %xmm0
        psubusb %xmm0, %xmm1

        /* unpack src and dst */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (s * ca) */
        psllw $1, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* pack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Load two pixels as 00, 00, src alpha, combined alpha
         * Combined alpha is derived from the pow_lut table in blend.c
         */
        movzbq 7(%rdi, %rcx, 4), %rdx
        movb 7(%rsi, %rcx, 4), %dh
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movzbq 3(%rdi, %rcx, 4), %rdx
        movb 3(%rsi, %rcx, 4), %dh
        shlq $32, %rax
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movd %rax, %xmm3
        /* unpack alpha to src alpha, combined alpha x 3 */
        punpcklbw %xmm3, %xmm3
        pshufhw $0x40, %xmm3, %xmm3
        pshuflw $0x40, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* src alpha = 255 - dst alpha */
        movdqa %xmm2, %xmm0
        por %xmm6, %xmm1
        pand %xmm6, %xmm0
        psubusb %xmm0, %xmm1

        /* unpack src and dst */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (s * ca) */
        psllw $1, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* pack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        jz 2f
        jns 3f

        movq (%rsi, %rcx, 4), %xmm1
        movq (%rdi, %rcx, 4), %xmm2
        /* Load two pixels as 00, 00, src alpha, combined alpha
         * Combined alpha is derived from the pow_lut table in blend.c
         */
        movzbq 7(%rdi, %rcx, 4), %rdx
        movb 7(%rsi, %rcx, 4), %dh
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movzbq 3(%rdi, %rcx, 4), %rdx
        movb 3(%rsi, %rcx, 4), %dh
        shlq $32, %rax
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movd %rax, %xmm3
        /* unpack alpha to src alpha, combined alpha x 3 */
        punpcklbw %xmm3, %xmm3
        pshufhw $0x40, %xmm3, %xmm3
        pshuflw $0x40, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* src alpha = 255 - dst alpha */
        movdqa %xmm2, %xmm0
        por %xmm6, %xmm1
        pand %xmm6, %xmm0
        psubusb %xmm0, %xmm1

        /* unpack src and dst */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (s * ca) */
        psllw $1, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* pack new pixels */
        packuswb %xmm4, %xmm2
        movq %xmm2, (%rdi, %rcx, 4)

        incq %rcx
        incq %rcx
        js 1b
        jnz 3f
2:
        movd (%rsi, %rcx, 4), %xmm1
        movd (%rdi, %rcx, 4), %xmm2
        /* Load one pixel as 00, 00, src alpha, combined alpha
         * Combined alpha is derived from the pow_lut table in blend.c
         */
        movzbq 3(%rdi, %rcx, 4), %rdx
        movb 3(%rsi, %rcx, 4), %dh
        movb (%r13, %rdx), %al
        movb %dh, %ah
        movd %eax, %xmm3
        /* unpack alpha to src alpha, combined alpha x 3 */
        punpcklbw %xmm3, %xmm3
        pshufhw $0x40, %xmm3, %xmm3
        pshuflw $0x40, %xmm3, %xmm3
        psrlw $1, %xmm3

        /* src alpha = 255 - dst alpha */
        movdqa %xmm2, %xmm0
        por %xmm6, %xmm1
        pand %xmm6, %xmm0
        psubusb %xmm0, %xmm1

        /* unpack src and dst */
        punpcklbw %xmm4, %xmm1
        punpcklbw %xmm4, %xmm2

        /* d = d + (s * ca) */
        psllw $1, %xmm1
        pmulhw %xmm3, %xmm1
        paddsw %xmm1, %xmm2

        /* pack new pixels */
        packuswb %xmm4, %xmm2
        movd %xmm2, (%rdi, %rcx, 4)
3:
        leaq (%rsi, %r10, 4), %rsi
        leaq (%rdi, %r11, 4), %rdi
        decq %r9
        jnz 0b

9:
        LEAVE
SIZE(imlib_amd64_add_blend_rgba_to_rgba)
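
/* The add-copy routines need no per-pixel alpha math at all: after the
 * mask, a single paddusb adds all sixteen bytes with unsigned saturation,
 * which is the clamped d + s these operations implement.
 */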

PR_(imlib_amd64_add_copy_rgba_to_rgb):
        ENTER

        movdqu m0XXX0XXX0XXX0XXX(%rip), %xmm5

        leaq (%rsi, %r8, 4), %rsi
        leaq (%rdi, %r8, 4), %rdi

        subq $12, %rsi
        subq $12, %rdi

        negq %r8
0:
        movq %r8, %rcx

        /* if < 4 pixels left, goto end */
        addq $3, %rcx
        jns 4f
1:
        /* 16 byte align dst ptr */
        leaq (%rdi, %rcx, 4), %rdx
        test $0x0f, %rdx
        jz 1f

        movd (%rsi, %rcx, 4), %xmm1
        movd (%rdi, %rcx, 4), %xmm2
        /* d = d + (s & 0x00ffffff) */
        pand %xmm5, %xmm1
        paddusb %xmm1, %xmm2
        movd %xmm2, (%rdi, %rcx, 4)
        incq %rcx
        jz 4f
        jmp 1b
1:
        /* prefetch a couple cache lines ahead */
        prefetchnta (%rsi, %rcx, 4)
        prefetcht0 (%rdi, %rcx, 4)
        prefetchnta 64(%rsi, %rcx, 4)
        prefetcht0 64(%rdi, %rcx, 4)

        /* test if 16 byte aligned src ptr */
        leaq (%rsi, %rcx, 4), %rdx
        test $0x0f, %rdx
        jnz 3f
2:
        /* main loop, unrolled to work on 64 byte chunks */
        /* aligned src, aligned dst */
        prefetchnta 128(%rsi, %rcx, 4)
        prefetcht0 128(%rdi, %rcx, 4)

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* d = d + (s & 0x00ffffff) */
        pand %xmm5, %xmm1
        paddusb %xmm1, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* d = d + (s & 0x00ffffff) */
        pand %xmm5, %xmm1
        paddusb %xmm1, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* d = d + (s & 0x00ffffff) */
        pand %xmm5, %xmm1
        paddusb %xmm1, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* d = d + (s & 0x00ffffff) */
        pand %xmm5, %xmm1
        paddusb %xmm1, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        js 2b
        jmp 4f
3:
        /* main loop, unrolled to work on 64 byte chunks */
        /* unaligned src, aligned dst */
        prefetchnta 128(%rsi, %rcx, 4)
        prefetcht0 128(%rdi, %rcx, 4)

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* d = d + (s & 0x00ffffff) */
        pand %xmm5, %xmm1
        paddusb %xmm1, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* d = d + (s & 0x00ffffff) */
        pand %xmm5, %xmm1
        paddusb %xmm1, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* d = d + (s & 0x00ffffff) */
        pand %xmm5, %xmm1
        paddusb %xmm1, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* d = d + (s & 0x00ffffff) */
        pand %xmm5, %xmm1
        paddusb %xmm1, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        js 3b
4:
        /* finish loop */
        cmp $2, %rcx
        jg 5f

        movd (%rsi, %rcx, 4), %xmm1
        movd (%rdi, %rcx, 4), %xmm2
        /* d = d + (s & 0x00ffffff) */
        pand %xmm5, %xmm1
        paddusb %xmm1, %xmm2
        movd %xmm2, (%rdi, %rcx, 4)
        incq %rcx
        jmp 4b
5:
        /* finish line */
        leaq (%rsi, %r10, 4), %rsi
        leaq (%rdi, %r11, 4), %rdi
        decq %r9
        jnz 0b

9:
        LEAVE
SIZE(imlib_amd64_add_copy_rgba_to_rgb)
|
|
|
|
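/* Reference sketch (editor's addition, not part of imlib2): the scalar
 * operation each vector step above performs, with a pixel held in a
 * 32-bit ARGB word.  It is a per-byte saturating add of the color
 * channels only; the 0x00ffffff mask (m0XXX0XXX0XXX0XXX) drops the
 * source alpha so the destination alpha is preserved.
 *
 *   static unsigned int add_copy_to_rgb_px(unsigned int s, unsigned int d)
 *   {
 *       unsigned int r = d & 0xff000000u;           // keep dst alpha
 *       for (int b = 0; b < 24; b += 8) {
 *           unsigned int v = ((d >> b) & 0xff) + ((s >> b) & 0xff);
 *           r |= (v > 255 ? 255 : v) << b;          // paddusb saturates
 *       }
 *       return r;
 *   }
 */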
PR_(imlib_amd64_add_copy_rgba_to_rgba):
	ENTER

	movdqu m0XXX0XXX0XXX0XXX(%rip), %xmm5

	leaq (%rsi, %r8, 4), %rsi
	leaq (%rdi, %r8, 4), %rdi

	subq $12, %rsi
	subq $12, %rdi

	negq %r8
0:
	movq %r8, %rcx

	/* if < 4 pixels left, goto end */
	addq $3, %rcx
	jns 4f
1:
	/* 16 byte align dst ptr */
	leaq (%rdi, %rcx, 4), %rdx
	test $0x0f, %rdx
	jz 1f

	movd (%rsi, %rcx, 4), %xmm1
	movd (%rdi, %rcx, 4), %xmm2
	/* d = (d & 0x00ffffff) + s */
	pand %xmm5, %xmm2
	paddusb %xmm1, %xmm2
	movd %xmm2, (%rdi, %rcx, 4)
	incq %rcx
	jz 4f
	jmp 1b
1:
	/* prefetch a couple cache lines ahead */
	prefetchnta (%rsi, %rcx, 4)
	prefetcht0 (%rdi, %rcx, 4)
	prefetchnta 64(%rsi, %rcx, 4)
	prefetcht0 64(%rdi, %rcx, 4)

	/* test if 16 byte aligned src ptr */
	leaq (%rsi, %rcx, 4), %rdx
	test $0x0f, %rdx
	jnz 3f
2:
	/* main loop, unrolled to work on 64 byte chunks */
	/* aligned src, aligned dst */
	prefetchnta 128(%rsi, %rcx, 4)
	prefetcht0 128(%rdi, %rcx, 4)

	movdqa (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d & 0x00ffffff) + s */
	pand %xmm5, %xmm2
	paddusb %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqa (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d & 0x00ffffff) + s */
	pand %xmm5, %xmm2
	paddusb %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqa (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d & 0x00ffffff) + s */
	pand %xmm5, %xmm2
	paddusb %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqa (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d & 0x00ffffff) + s */
	pand %xmm5, %xmm2
	paddusb %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	js 2b
	jmp 4f
3:
	/* main loop, unrolled to work on 64 byte chunks */
	/* unaligned src, aligned dst */
	prefetchnta 128(%rsi, %rcx, 4)
	prefetcht0 128(%rdi, %rcx, 4)

	movdqu (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d & 0x00ffffff) + s */
	pand %xmm5, %xmm2
	paddusb %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqu (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d & 0x00ffffff) + s */
	pand %xmm5, %xmm2
	paddusb %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqu (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d & 0x00ffffff) + s */
	pand %xmm5, %xmm2
	paddusb %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqu (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d & 0x00ffffff) + s */
	pand %xmm5, %xmm2
	paddusb %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	js 3b
4:
	/* finish loop */
	cmp $2, %rcx
	jg 5f

	movd (%rsi, %rcx, 4), %xmm1
	movd (%rdi, %rcx, 4), %xmm2
	/* d = (d & 0x00ffffff) + s */
	pand %xmm5, %xmm2
	paddusb %xmm1, %xmm2
	movd %xmm2, (%rdi, %rcx, 4)
	incq %rcx
	jmp 4b
5:
	/* finish line */
	leaq (%rsi, %r10, 4), %rsi
	leaq (%rdi, %r11, 4), %rdi
	decq %r9
	jnz 0b

9:
	LEAVE
SIZE(imlib_amd64_add_copy_rgba_to_rgba)

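/* Reference sketch (editor's addition, not part of imlib2): the row
 * processing pattern shared by all the copy loops in this file, written
 * with SSE2 intrinsics.  A scalar head runs until dst is 16-byte
 * aligned, then a vector body (the asm additionally dispatches on src
 * alignment and unrolls four times; this sketch just uses an unaligned
 * load), then a scalar tail.  Shown for d = (d & 0x00ffffff) + s.
 *
 *   #include <emmintrin.h>
 *   #include <stdint.h>
 *
 *   static uint32_t add_copy_px(uint32_t s, uint32_t d)
 *   {
 *       uint32_t r = 0;
 *       d &= 0x00ffffffu;                      // pand: drop dst alpha
 *       for (int b = 0; b < 32; b += 8) {
 *           uint32_t v = ((d >> b) & 0xff) + ((s >> b) & 0xff);
 *           r |= (v > 255 ? 255 : v) << b;     // paddusb
 *       }
 *       return r;
 *   }
 *
 *   static void add_copy_row(const uint32_t *s, uint32_t *d, int w)
 *   {
 *       int i = 0;
 *       while (i < w && ((uintptr_t)(d + i) & 0x0f)) {  // align dst
 *           d[i] = add_copy_px(s[i], d[i]);
 *           i++;
 *       }
 *       const __m128i m = _mm_set1_epi32(0x00ffffff);
 *       for (; i + 4 <= w; i += 4) {                    // 4 px per step
 *           __m128i sv = _mm_loadu_si128((const __m128i *)(s + i));
 *           __m128i dv = _mm_load_si128((__m128i *)(d + i));
 *           dv = _mm_adds_epu8(_mm_and_si128(dv, m), sv);
 *           _mm_store_si128((__m128i *)(d + i), dv);
 *       }
 *       for (; i < w; i++)                              // tail pixels
 *           d[i] = add_copy_px(s[i], d[i]);
 *   }
 */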
PR_(imlib_amd64_add_copy_rgb_to_rgba):
	ENTER

	movdqu mX000X000X000X000(%rip), %xmm5

	leaq (%rsi, %r8, 4), %rsi
	leaq (%rdi, %r8, 4), %rdi

	subq $12, %rsi
	subq $12, %rdi

	negq %r8
0:
	movq %r8, %rcx

	/* if < 4 pixels left, goto end */
	addq $3, %rcx
	jns 4f
1:
	/* 16 byte align dst ptr */
	leaq (%rdi, %rcx, 4), %rdx
	test $0x0f, %rdx
	jz 1f

	movd (%rsi, %rcx, 4), %xmm1
	movd (%rdi, %rcx, 4), %xmm2
	/* d = (d + s) | 0xff000000 */
	paddusb %xmm1, %xmm2
	por %xmm5, %xmm2
	movd %xmm2, (%rdi, %rcx, 4)
	incq %rcx
	jz 4f
	jmp 1b
1:
	/* prefetch a couple cache lines ahead */
	prefetchnta (%rsi, %rcx, 4)
	prefetcht0 (%rdi, %rcx, 4)
	prefetchnta 64(%rsi, %rcx, 4)
	prefetcht0 64(%rdi, %rcx, 4)

	/* test if 16 byte aligned src ptr */
	leaq (%rsi, %rcx, 4), %rdx
	test $0x0f, %rdx
	jnz 3f
2:
	/* main loop, unrolled to work on 64 byte chunks */
	/* aligned src, aligned dst */
	prefetchnta 128(%rsi, %rcx, 4)
	prefetcht0 128(%rdi, %rcx, 4)

	movdqa (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d + s) | 0xff000000 */
	paddusb %xmm1, %xmm2
	por %xmm5, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqa (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d + s) | 0xff000000 */
	paddusb %xmm1, %xmm2
	por %xmm5, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqa (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d + s) | 0xff000000 */
	paddusb %xmm1, %xmm2
	por %xmm5, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqa (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d + s) | 0xff000000 */
	paddusb %xmm1, %xmm2
	por %xmm5, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	js 2b
	jmp 4f
3:
	/* main loop, unrolled to work on 64 byte chunks */
	/* unaligned src, aligned dst */
	prefetchnta 128(%rsi, %rcx, 4)
	prefetcht0 128(%rdi, %rcx, 4)

	movdqu (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d + s) | 0xff000000 */
	paddusb %xmm1, %xmm2
	por %xmm5, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqu (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d + s) | 0xff000000 */
	paddusb %xmm1, %xmm2
	por %xmm5, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqu (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d + s) | 0xff000000 */
	paddusb %xmm1, %xmm2
	por %xmm5, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqu (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d + s) | 0xff000000 */
	paddusb %xmm1, %xmm2
	por %xmm5, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	js 3b
4:
	/* finish loop */
	cmp $2, %rcx
	jg 5f

	movd (%rsi, %rcx, 4), %xmm1
	movd (%rdi, %rcx, 4), %xmm2
	/* d = (d + s) | 0xff000000 */
	paddusb %xmm1, %xmm2
	por %xmm5, %xmm2
	movd %xmm2, (%rdi, %rcx, 4)
	incq %rcx
	jmp 4b
5:
	/* finish line */
	leaq (%rsi, %r10, 4), %rsi
	leaq (%rdi, %r11, 4), %rdi
	decq %r9
	jnz 0b

9:
	LEAVE
SIZE(imlib_amd64_add_copy_rgb_to_rgba)

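/* Reference sketch (editor's addition, not part of imlib2): the rgb to
 * rgba variant above saturating-adds all four bytes and then forces the
 * result opaque with por mX000X000X000X000, so whatever the alpha add
 * produced is overwritten:
 *
 *   static unsigned int add_copy_rgb_px(unsigned int s, unsigned int d)
 *   {
 *       unsigned int r = 0;
 *       for (int b = 0; b < 24; b += 8) {
 *           unsigned int v = ((d >> b) & 0xff) + ((s >> b) & 0xff);
 *           r |= (v > 255 ? 255 : v) << b;
 *       }
 *       return r | 0xff000000u;        // destination becomes opaque
 *   }
 */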
PR_(imlib_amd64_subtract_blend_rgba_to_rgb):
	ENTER

	pxor %xmm4, %xmm4
	movdqu m00XXXXXX(%rip), %xmm6

	/* Move right to left across each line, */
	/* processing in two pixel chunks */
	leaq (%rsi, %r8, 4), %rsi
	leaq (%rdi, %r8, 4), %rdi

	/* Last instruction is %rcx = 0 */
	subq $4, %rsi
	subq $4, %rdi

	negq %r8
0:
	movq %r8, %rcx

	incq %rcx

	/* prefetch a couple cache lines ahead */
	prefetchnta (%rsi, %rcx, 4)
	prefetcht0 (%rdi, %rcx, 4)
	prefetchnta 64(%rsi, %rcx, 4)
	prefetcht0 64(%rdi, %rcx, 4)

	jz 2f /* one pixel line */
1:
	/* main loop, unrolled to work on 64 byte chunks */
	prefetchnta 128(%rsi, %rcx, 4)
	prefetcht0 128(%rdi, %rcx, 4)

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Get alpha from source and unpack to words.
	 * The result range is [0, 0x7fff]; it maps to fixed point
	 * values in [0.0, 1.0) because we use the high word of the
	 * 32 bit multiplication result.  Since we want the unsigned
	 * value, we shift the alpha right one here and shift the
	 * other factor left one to compensate.
	 */
	movq %xmm1, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0xFF, %xmm3, %xmm3
	pshuflw $0xFF, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* Zero out the alpha channel of the source to leave the
	 * destination alpha unchanged.
	 */
	pand %xmm6, %xmm3

	/* Unpack src and dst to words */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d - (s * a) */
	psllw $1, %xmm1
	pmulhw %xmm3, %xmm1
	psubsw %xmm1, %xmm2

	/* pack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Get alpha from source and unpack to words (see above) */
	movq %xmm1, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0xFF, %xmm3, %xmm3
	pshuflw $0xFF, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* Zero out the alpha channel of the source to leave the
	 * destination alpha unchanged.
	 */
	pand %xmm6, %xmm3

	/* Unpack src and dst to words */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d - (s * a) */
	psllw $1, %xmm1
	pmulhw %xmm3, %xmm1
	psubsw %xmm1, %xmm2

	/* pack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Get alpha from source and unpack to words (see above) */
	movq %xmm1, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0xFF, %xmm3, %xmm3
	pshuflw $0xFF, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* Zero out the alpha channel of the source to leave the
	 * destination alpha unchanged.
	 */
	pand %xmm6, %xmm3

	/* Unpack src and dst to words */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d - (s * a) */
	psllw $1, %xmm1
	pmulhw %xmm3, %xmm1
	psubsw %xmm1, %xmm2

	/* pack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Get alpha from source and unpack to words (see above) */
	movq %xmm1, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0xFF, %xmm3, %xmm3
	pshuflw $0xFF, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* Zero out the alpha channel of the source to leave the
	 * destination alpha unchanged.
	 */
	pand %xmm6, %xmm3

	/* Unpack src and dst to words */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d - (s * a) */
	psllw $1, %xmm1
	pmulhw %xmm3, %xmm1
	psubsw %xmm1, %xmm2

	/* pack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Get alpha from source and unpack to words (see above) */
	movq %xmm1, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0xFF, %xmm3, %xmm3
	pshuflw $0xFF, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* Zero out the alpha channel of the source to leave the
	 * destination alpha unchanged.
	 */
	pand %xmm6, %xmm3

	/* Unpack src and dst to words */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d - (s * a) */
	psllw $1, %xmm1
	pmulhw %xmm3, %xmm1
	psubsw %xmm1, %xmm2

	/* pack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Get alpha from source and unpack to words (see above) */
	movq %xmm1, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0xFF, %xmm3, %xmm3
	pshuflw $0xFF, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* Zero out the alpha channel of the source to leave the
	 * destination alpha unchanged.
	 */
	pand %xmm6, %xmm3

	/* Unpack src and dst to words */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d - (s * a) */
	psllw $1, %xmm1
	pmulhw %xmm3, %xmm1
	psubsw %xmm1, %xmm2

	/* pack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Get alpha from source and unpack to words (see above) */
	movq %xmm1, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0xFF, %xmm3, %xmm3
	pshuflw $0xFF, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* Zero out the alpha channel of the source to leave the
	 * destination alpha unchanged.
	 */
	pand %xmm6, %xmm3

	/* Unpack src and dst to words */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d - (s * a) */
	psllw $1, %xmm1
	pmulhw %xmm3, %xmm1
	psubsw %xmm1, %xmm2

	/* pack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Get alpha from source and unpack to words (see above) */
	movq %xmm1, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0xFF, %xmm3, %xmm3
	pshuflw $0xFF, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* Zero out the alpha channel of the source to leave the
	 * destination alpha unchanged.
	 */
	pand %xmm6, %xmm3

	/* Unpack src and dst to words */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d - (s * a) */
	psllw $1, %xmm1
	pmulhw %xmm3, %xmm1
	psubsw %xmm1, %xmm2

	/* pack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	js 1b
	jnz 3f
2:
	movd (%rsi, %rcx, 4), %xmm1
	movd (%rdi, %rcx, 4), %xmm2
	/* Get alpha from source and unpack to words (see above) */
	movq %xmm1, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0xFF, %xmm3, %xmm3
	pshuflw $0xFF, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* Zero out the alpha channel of the source to leave the
	 * destination alpha unchanged.
	 */
	pand %xmm6, %xmm3

	/* Unpack src and dst to words */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d - (s * a) */
	psllw $1, %xmm1
	pmulhw %xmm3, %xmm1
	psubsw %xmm1, %xmm2

	/* pack new pixels */
	packuswb %xmm4, %xmm2
	movd %xmm2, (%rdi, %rcx, 4)
3:
	leaq (%rsi, %r10, 4), %rsi
	leaq (%rdi, %r11, 4), %rdi
	decq %r9
	jnz 0b

9:
	LEAVE
SIZE(imlib_amd64_subtract_blend_rgba_to_rgb)

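/* Reference sketch (editor's addition, not part of imlib2): scalar form
 * of the subtractive blend above.  The source alpha byte is broadcast
 * to every word of the factor by punpcklbw plus the $0xFF shuffles, and
 * the m00XXXXXX mask zeroes its alpha word so the destination alpha
 * lane has nothing subtracted from it.
 *
 *   static void subtract_blend_px(const unsigned char s[4],
 *                                 unsigned char d[4])
 *   {
 *       int a = s[3];
 *       for (int i = 0; i < 3; i++) {
 *           int v = d[i] - (s[i] * a) / 255;  // pmulhw approximates /255
 *           d[i] = v < 0 ? 0 : v;             // psubsw/packuswb clamp at 0
 *       }
 *   }
 */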
PR_(imlib_amd64_subtract_blend_rgba_to_rgba):
	ENTER

	movq pow_lut@GOTPCREL(%rip), %r13
	pxor %xmm4, %xmm4
	movdqu c1(%rip), %xmm5
	movdqu mX000X000X000X000(%rip), %xmm6
	movdqu mX000X000(%rip), %xmm7
	xorq %rax, %rax

	/* Move right to left across each line, */
	/* processing in two pixel chunks */
	leaq (%rsi, %r8, 4), %rsi
	leaq (%rdi, %r8, 4), %rdi

	/* Last instruction is %rcx = 0 */
	subq $4, %rsi
	subq $4, %rdi

	negq %r8
0:
	movq %r8, %rcx

	incq %rcx

	/* prefetch a couple cache lines ahead */
	prefetchnta (%rsi, %rcx, 4)
	prefetcht0 (%rdi, %rcx, 4)
	prefetchnta 64(%rsi, %rcx, 4)
	prefetcht0 64(%rdi, %rcx, 4)

	jz 2f /* one pixel line */
1:
	/* main loop, unrolled to work on 64 byte chunks */
	prefetchnta 128(%rsi, %rcx, 4)
	prefetcht0 128(%rdi, %rcx, 4)

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Load two pixels as 00, 00, src alpha, combined alpha.
	 * Combined alpha is derived from the pow_lut table in blend.c
	 */
	movzbq 7(%rdi, %rcx, 4), %rdx
	movb 7(%rsi, %rcx, 4), %dh
	movb (%r13, %rdx), %al
	movb %dh, %ah
	movzbq 3(%rdi, %rcx, 4), %rdx
	movb 3(%rsi, %rcx, 4), %dh
	shlq $32, %rax
	movb (%r13, %rdx), %al
	movb %dh, %ah
	movd %rax, %xmm3
	/* unpack alpha to src alpha, combined alpha x 3 */
	punpcklbw %xmm3, %xmm3
	pshufhw $0x40, %xmm3, %xmm3
	pshuflw $0x40, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* src alpha = 255 - dst alpha */
	movdqa %xmm2, %xmm0
	pand %xmm6, %xmm0
	por %xmm6, %xmm1
	psubusb %xmm0, %xmm1

	/* unpack src and dst */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d - ((s * a) ^ 0xff000000) */
	psllw $1, %xmm1
	pmulhw %xmm3, %xmm1
	pxor %xmm7, %xmm1
	psubsw %xmm1, %xmm2

	/* pack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Load two pixels as 00, 00, src alpha, combined alpha.
	 * Combined alpha is derived from the pow_lut table in blend.c
	 */
	movzbq 7(%rdi, %rcx, 4), %rdx
	movb 7(%rsi, %rcx, 4), %dh
	movb (%r13, %rdx), %al
	movb %dh, %ah
	movzbq 3(%rdi, %rcx, 4), %rdx
	movb 3(%rsi, %rcx, 4), %dh
	shlq $32, %rax
	movb (%r13, %rdx), %al
	movb %dh, %ah
	movd %rax, %xmm3
	/* unpack alpha to src alpha, combined alpha x 3 */
	punpcklbw %xmm3, %xmm3
	pshufhw $0x40, %xmm3, %xmm3
	pshuflw $0x40, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* src alpha = 255 - dst alpha */
	movdqa %xmm2, %xmm0
	pand %xmm6, %xmm0
	por %xmm6, %xmm1
	psubusb %xmm0, %xmm1

	/* unpack src and dst */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d - ((s * a) ^ 0xff000000) */
	psllw $1, %xmm1
	pmulhw %xmm3, %xmm1
	pxor %xmm7, %xmm1
	psubsw %xmm1, %xmm2

	/* pack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Load two pixels as 00, 00, src alpha, combined alpha.
	 * Combined alpha is derived from the pow_lut table in blend.c
	 */
	movzbq 7(%rdi, %rcx, 4), %rdx
	movb 7(%rsi, %rcx, 4), %dh
	movb (%r13, %rdx), %al
	movb %dh, %ah
	movzbq 3(%rdi, %rcx, 4), %rdx
	movb 3(%rsi, %rcx, 4), %dh
	shlq $32, %rax
	movb (%r13, %rdx), %al
	movb %dh, %ah
	movd %rax, %xmm3
	/* unpack alpha to src alpha, combined alpha x 3 */
	punpcklbw %xmm3, %xmm3
	pshufhw $0x40, %xmm3, %xmm3
	pshuflw $0x40, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* src alpha = 255 - dst alpha */
	movdqa %xmm2, %xmm0
	pand %xmm6, %xmm0
	por %xmm6, %xmm1
	psubusb %xmm0, %xmm1

	/* unpack src and dst */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d - ((s * a) ^ 0xff000000) */
	psllw $1, %xmm1
	pmulhw %xmm3, %xmm1
	pxor %xmm7, %xmm1
	psubsw %xmm1, %xmm2

	/* pack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Load two pixels as 00, 00, src alpha, combined alpha.
	 * Combined alpha is derived from the pow_lut table in blend.c
	 */
	movzbq 7(%rdi, %rcx, 4), %rdx
	movb 7(%rsi, %rcx, 4), %dh
	movb (%r13, %rdx), %al
	movb %dh, %ah
	movzbq 3(%rdi, %rcx, 4), %rdx
	movb 3(%rsi, %rcx, 4), %dh
	shlq $32, %rax
	movb (%r13, %rdx), %al
	movb %dh, %ah
	movd %rax, %xmm3
	/* unpack alpha to src alpha, combined alpha x 3 */
	punpcklbw %xmm3, %xmm3
	pshufhw $0x40, %xmm3, %xmm3
	pshuflw $0x40, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* src alpha = 255 - dst alpha */
	movdqa %xmm2, %xmm0
	pand %xmm6, %xmm0
	por %xmm6, %xmm1
	psubusb %xmm0, %xmm1

	/* unpack src and dst */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d - ((s * a) ^ 0xff000000) */
	psllw $1, %xmm1
	pmulhw %xmm3, %xmm1
	pxor %xmm7, %xmm1
	psubsw %xmm1, %xmm2

	/* pack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Load two pixels as 00, 00, src alpha, combined alpha.
	 * Combined alpha is derived from the pow_lut table in blend.c
	 */
	movzbq 7(%rdi, %rcx, 4), %rdx
	movb 7(%rsi, %rcx, 4), %dh
	movb (%r13, %rdx), %al
	movb %dh, %ah
	movzbq 3(%rdi, %rcx, 4), %rdx
	movb 3(%rsi, %rcx, 4), %dh
	shlq $32, %rax
	movb (%r13, %rdx), %al
	movb %dh, %ah
	movd %rax, %xmm3
	/* unpack alpha to src alpha, combined alpha x 3 */
	punpcklbw %xmm3, %xmm3
	pshufhw $0x40, %xmm3, %xmm3
	pshuflw $0x40, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* src alpha = 255 - dst alpha */
	movdqa %xmm2, %xmm0
	pand %xmm6, %xmm0
	por %xmm6, %xmm1
	psubusb %xmm0, %xmm1

	/* unpack src and dst */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d - ((s * a) ^ 0xff000000) */
	psllw $1, %xmm1
	pmulhw %xmm3, %xmm1
	pxor %xmm7, %xmm1
	psubsw %xmm1, %xmm2

	/* pack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Load two pixels as 00, 00, src alpha, combined alpha.
	 * Combined alpha is derived from the pow_lut table in blend.c
	 */
	movzbq 7(%rdi, %rcx, 4), %rdx
	movb 7(%rsi, %rcx, 4), %dh
	movb (%r13, %rdx), %al
	movb %dh, %ah
	movzbq 3(%rdi, %rcx, 4), %rdx
	movb 3(%rsi, %rcx, 4), %dh
	shlq $32, %rax
	movb (%r13, %rdx), %al
	movb %dh, %ah
	movd %rax, %xmm3
	/* unpack alpha to src alpha, combined alpha x 3 */
	punpcklbw %xmm3, %xmm3
	pshufhw $0x40, %xmm3, %xmm3
	pshuflw $0x40, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* src alpha = 255 - dst alpha */
	movdqa %xmm2, %xmm0
	pand %xmm6, %xmm0
	por %xmm6, %xmm1
	psubusb %xmm0, %xmm1

	/* unpack src and dst */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d - ((s * a) ^ 0xff000000) */
	psllw $1, %xmm1
	pmulhw %xmm3, %xmm1
	pxor %xmm7, %xmm1
	psubsw %xmm1, %xmm2

	/* pack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Load two pixels as 00, 00, src alpha, combined alpha.
	 * Combined alpha is derived from the pow_lut table in blend.c
	 */
	movzbq 7(%rdi, %rcx, 4), %rdx
	movb 7(%rsi, %rcx, 4), %dh
	movb (%r13, %rdx), %al
	movb %dh, %ah
	movzbq 3(%rdi, %rcx, 4), %rdx
	movb 3(%rsi, %rcx, 4), %dh
	shlq $32, %rax
	movb (%r13, %rdx), %al
	movb %dh, %ah
	movd %rax, %xmm3
	/* unpack alpha to src alpha, combined alpha x 3 */
	punpcklbw %xmm3, %xmm3
	pshufhw $0x40, %xmm3, %xmm3
	pshuflw $0x40, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* src alpha = 255 - dst alpha */
	movdqa %xmm2, %xmm0
	pand %xmm6, %xmm0
	por %xmm6, %xmm1
	psubusb %xmm0, %xmm1

	/* unpack src and dst */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d - ((s * a) ^ 0xff000000) */
	psllw $1, %xmm1
	pmulhw %xmm3, %xmm1
	pxor %xmm7, %xmm1
	psubsw %xmm1, %xmm2

	/* pack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Load two pixels as 00, 00, src alpha, combined alpha.
	 * Combined alpha is derived from the pow_lut table in blend.c
	 */
	movzbq 7(%rdi, %rcx, 4), %rdx
	movb 7(%rsi, %rcx, 4), %dh
	movb (%r13, %rdx), %al
	movb %dh, %ah
	movzbq 3(%rdi, %rcx, 4), %rdx
	movb 3(%rsi, %rcx, 4), %dh
	shlq $32, %rax
	movb (%r13, %rdx), %al
	movb %dh, %ah
	movd %rax, %xmm3
	/* unpack alpha to src alpha, combined alpha x 3 */
	punpcklbw %xmm3, %xmm3
	pshufhw $0x40, %xmm3, %xmm3
	pshuflw $0x40, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* src alpha = 255 - dst alpha */
	movdqa %xmm2, %xmm0
	pand %xmm6, %xmm0
	por %xmm6, %xmm1
	psubusb %xmm0, %xmm1

	/* unpack src and dst */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d - ((s * a) ^ 0xff000000) */
	psllw $1, %xmm1
	pmulhw %xmm3, %xmm1
	pxor %xmm7, %xmm1
	psubsw %xmm1, %xmm2

	/* pack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	js 1b
	jnz 3f
2:
	movd (%rsi, %rcx, 4), %xmm1
	movd (%rdi, %rcx, 4), %xmm2
	/* Load one pixel as 00, 00, src alpha, combined alpha.
	 * Combined alpha is derived from the pow_lut table in blend.c
	 */
	movzbq 3(%rdi, %rcx, 4), %rdx
	movb 3(%rsi, %rcx, 4), %dh
	movb (%r13, %rdx), %al
	movb %dh, %ah
	movd %eax, %xmm3
	/* unpack alpha to src alpha, combined alpha x 3 */
	punpcklbw %xmm3, %xmm3
	pshufhw $0x40, %xmm3, %xmm3
	pshuflw $0x40, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* src alpha = 255 - dst alpha */
	movdqa %xmm2, %xmm0
	pand %xmm6, %xmm0
	por %xmm6, %xmm1
	psubusb %xmm0, %xmm1

	/* unpack src and dst */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d - ((s * a) ^ 0xff000000) */
	psllw $1, %xmm1
	pmulhw %xmm3, %xmm1
	pxor %xmm7, %xmm1
	psubsw %xmm1, %xmm2

	/* pack new pixels */
	packuswb %xmm4, %xmm2
	movd %xmm2, (%rdi, %rcx, 4)
3:
	leaq (%rsi, %r10, 4), %rsi
	leaq (%rdi, %r11, 4), %rdi
	decq %r9
	jnz 0b

9:
	LEAVE
SIZE(imlib_amd64_subtract_blend_rgba_to_rgba)

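/* Editor's note (not imlib2 text): the pxor with mX000X000 above is the
 * trick that lets a single psubsw both subtract the colors and
 * accumulate the alpha.  The product's alpha word is complemented, and
 * x - ~y equals x + y + 1 in two's complement, so the alpha lane
 * effectively adds (255 - dst alpha) * src alpha while the color lanes
 * subtract.  Scalar sketch, pow_lut indexed as in the loads above:
 *
 *   static void subtract_blend_rgba_px(const unsigned char s[4],
 *                                      unsigned char d[4],
 *                                      const unsigned char *pow_lut)
 *   {
 *       int sa = s[3], da = d[3];
 *       int ca = pow_lut[(sa << 8) | da];        // combined alpha
 *       for (int i = 0; i < 3; i++) {            // colors subtract
 *           int v = d[i] - (s[i] * ca) / 255;
 *           d[i] = v < 0 ? 0 : v;
 *       }
 *       int a = da + ((255 - da) * sa) / 255;    // alpha still adds
 *       d[3] = a > 255 ? 255 : a;
 *   }
 */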
PR_(imlib_amd64_subtract_copy_rgba_to_rgb):
	ENTER

	movdqu m0XXX0XXX0XXX0XXX(%rip), %xmm5

	leaq (%rsi, %r8, 4), %rsi
	leaq (%rdi, %r8, 4), %rdi

	subq $12, %rsi
	subq $12, %rdi

	negq %r8
0:
	movq %r8, %rcx

	/* if < 4 pixels left, goto end */
	addq $3, %rcx
	jns 4f
1:
	/* 16 byte align dst ptr */
	leaq (%rdi, %rcx, 4), %rdx
	test $0x0f, %rdx
	jz 1f

	movd (%rsi, %rcx, 4), %xmm1
	movd (%rdi, %rcx, 4), %xmm2
	/* d = d - (s & 0x00ffffff) */
	pand %xmm5, %xmm1
	psubusb %xmm1, %xmm2
	movd %xmm2, (%rdi, %rcx, 4)
	incq %rcx
	jz 4f
	jmp 1b
1:
	/* prefetch a couple cache lines ahead */
	prefetchnta (%rsi, %rcx, 4)
	prefetcht0 (%rdi, %rcx, 4)
	prefetchnta 64(%rsi, %rcx, 4)
	prefetcht0 64(%rdi, %rcx, 4)

	/* test if 16 byte aligned src ptr */
	leaq (%rsi, %rcx, 4), %rdx
	test $0x0f, %rdx
	jnz 3f
2:
	/* main loop, unrolled to work on 64 byte chunks */
	/* aligned src, aligned dst */
	prefetchnta 128(%rsi, %rcx, 4)
	prefetcht0 128(%rdi, %rcx, 4)

	movdqa (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = d - (s & 0x00ffffff) */
	pand %xmm5, %xmm1
	psubusb %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqa (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = d - (s & 0x00ffffff) */
	pand %xmm5, %xmm1
	psubusb %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqa (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = d - (s & 0x00ffffff) */
	pand %xmm5, %xmm1
	psubusb %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqa (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = d - (s & 0x00ffffff) */
	pand %xmm5, %xmm1
	psubusb %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	js 2b
	jmp 4f
3:
	/* main loop, unrolled to work on 64 byte chunks */
	/* unaligned src, aligned dst */
	prefetchnta 128(%rsi, %rcx, 4)
	prefetcht0 128(%rdi, %rcx, 4)

	movdqu (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = d - (s & 0x00ffffff) */
	pand %xmm5, %xmm1
	psubusb %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqu (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = d - (s & 0x00ffffff) */
	pand %xmm5, %xmm1
	psubusb %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqu (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = d - (s & 0x00ffffff) */
	pand %xmm5, %xmm1
	psubusb %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqu (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = d - (s & 0x00ffffff) */
	pand %xmm5, %xmm1
	psubusb %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	js 3b
4:
	/* finish loop */
	cmp $2, %rcx
	jg 5f

	movd (%rsi, %rcx, 4), %xmm1
	movd (%rdi, %rcx, 4), %xmm2
	/* d = d - (s & 0x00ffffff) */
	pand %xmm5, %xmm1
	psubusb %xmm1, %xmm2
	movd %xmm2, (%rdi, %rcx, 4)
	incq %rcx
	jmp 4b
5:
	/* finish line */
	leaq (%rsi, %r10, 4), %rsi
	leaq (%rdi, %r11, 4), %rdi
	decq %r9
	jnz 0b

9:
	LEAVE
SIZE(imlib_amd64_subtract_copy_rgba_to_rgb)

PR_(imlib_amd64_subtract_copy_rgba_to_rgba):
	ENTER

	movdqu m0XXX0XXX0XXX0XXX(%rip), %xmm5
	movdqu mX000X000X000X000(%rip), %xmm6

	leaq (%rsi, %r8, 4), %rsi
	leaq (%rdi, %r8, 4), %rdi

	subq $12, %rsi
	subq $12, %rdi

	negq %r8
0:
	movq %r8, %rcx

	/* if < 4 pixels left, goto end */
	addq $3, %rcx
	jns 4f
1:
	/* 16 byte align dst ptr */
	leaq (%rdi, %rcx, 4), %rdx
	test $0x0f, %rdx
	jz 1f

	movd (%rsi, %rcx, 4), %xmm1
	movd (%rdi, %rcx, 4), %xmm2
	/* d = d - s, d alpha = s alpha */
	psubusb %xmm1, %xmm2
	pand %xmm6, %xmm1
	pand %xmm5, %xmm2
	por %xmm1, %xmm2
	movd %xmm2, (%rdi, %rcx, 4)
	incq %rcx
	jz 4f
	jmp 1b
1:
	/* prefetch a couple cache lines ahead */
	prefetchnta (%rsi, %rcx, 4)
	prefetcht0 (%rdi, %rcx, 4)
	prefetchnta 64(%rsi, %rcx, 4)
	prefetcht0 64(%rdi, %rcx, 4)

	/* test if 16 byte aligned src ptr */
	leaq (%rsi, %rcx, 4), %rdx
	test $0x0f, %rdx
	jnz 3f
2:
	/* main loop, unrolled to work on 64 byte chunks */
	/* aligned src, aligned dst */
	prefetchnta 128(%rsi, %rcx, 4)
	prefetcht0 128(%rdi, %rcx, 4)

	movdqa (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = d - s, d alpha = s alpha */
	psubusb %xmm1, %xmm2
	pand %xmm6, %xmm1
	pand %xmm5, %xmm2
	por %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqa (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = d - s, d alpha = s alpha */
	psubusb %xmm1, %xmm2
	pand %xmm6, %xmm1
	pand %xmm5, %xmm2
	por %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqa (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = d - s, d alpha = s alpha */
	psubusb %xmm1, %xmm2
	pand %xmm6, %xmm1
	pand %xmm5, %xmm2
	por %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqa (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = d - s, d alpha = s alpha */
	psubusb %xmm1, %xmm2
	pand %xmm6, %xmm1
	pand %xmm5, %xmm2
	por %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	js 2b
	jmp 4f
3:
	/* main loop, unrolled to work on 64 byte chunks */
	/* unaligned src, aligned dst */
	prefetchnta 128(%rsi, %rcx, 4)
	prefetcht0 128(%rdi, %rcx, 4)

	movdqu (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = d - s, d alpha = s alpha */
	psubusb %xmm1, %xmm2
	pand %xmm6, %xmm1
	pand %xmm5, %xmm2
	por %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqu (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = d - s, d alpha = s alpha */
	psubusb %xmm1, %xmm2
	pand %xmm6, %xmm1
	pand %xmm5, %xmm2
	por %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqu (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = d - s, d alpha = s alpha */
	psubusb %xmm1, %xmm2
	pand %xmm6, %xmm1
	pand %xmm5, %xmm2
	por %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqu (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = d - s, d alpha = s alpha */
	psubusb %xmm1, %xmm2
	pand %xmm6, %xmm1
	pand %xmm5, %xmm2
	por %xmm1, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	js 3b
4:
	/* finish loop */
	cmp $2, %rcx
	jg 5f

	movd (%rsi, %rcx, 4), %xmm1
	movd (%rdi, %rcx, 4), %xmm2
	/* d = d - s, d alpha = s alpha */
	psubusb %xmm1, %xmm2
	pand %xmm6, %xmm1
	pand %xmm5, %xmm2
	por %xmm1, %xmm2
	movd %xmm2, (%rdi, %rcx, 4)
	incq %rcx
	jmp 4b
5:
	/* finish line */
	leaq (%rsi, %r10, 4), %rsi
	leaq (%rdi, %r11, 4), %rdi
	decq %r9
	jnz 0b

9:
	LEAVE
SIZE(imlib_amd64_subtract_copy_rgba_to_rgba)

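/* Reference sketch (editor's addition, not part of imlib2): the rgba to
 * rgba subtract copy above computes d = d - s with per-byte saturation,
 * then splices the source alpha into the result via the two pands and
 * the por:
 *
 *   static unsigned int subtract_copy_rgba_px(unsigned int s,
 *                                             unsigned int d)
 *   {
 *       unsigned int r = s & 0xff000000u;            // alpha from src
 *       for (int b = 0; b < 24; b += 8) {
 *           int v = (int)((d >> b) & 0xff) - (int)((s >> b) & 0xff);
 *           r |= (unsigned int)(v < 0 ? 0 : v) << b; // psubusb clamps
 *       }
 *       return r;
 *   }
 */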
PR_(imlib_amd64_subtract_copy_rgb_to_rgba):
	ENTER

	movdqu mX000X000X000X000(%rip), %xmm5

	leaq (%rsi, %r8, 4), %rsi
	leaq (%rdi, %r8, 4), %rdi

	subq $12, %rsi
	subq $12, %rdi

	negq %r8
0:
	movq %r8, %rcx

	/* if < 4 pixels left, goto end */
	addq $3, %rcx
	jns 4f
1:
	/* 16 byte align dst ptr */
	leaq (%rdi, %rcx, 4), %rdx
	test $0x0f, %rdx
	jz 1f

	movd (%rsi, %rcx, 4), %xmm1
	movd (%rdi, %rcx, 4), %xmm2
	/* d = (d - s) | 0xff000000 */
	psubusb %xmm1, %xmm2
	por %xmm5, %xmm2
	movd %xmm2, (%rdi, %rcx, 4)
	incq %rcx
	jz 4f
	jmp 1b
1:
	/* prefetch a couple cache lines ahead */
	prefetchnta (%rsi, %rcx, 4)
	prefetcht0 (%rdi, %rcx, 4)
	prefetchnta 64(%rsi, %rcx, 4)
	prefetcht0 64(%rdi, %rcx, 4)

	/* test if 16 byte aligned src ptr */
	leaq (%rsi, %rcx, 4), %rdx
	test $0x0f, %rdx
	jnz 3f
2:
	/* main loop, unrolled to work on 64 byte chunks */
	/* aligned src, aligned dst */
	prefetchnta 128(%rsi, %rcx, 4)
	prefetcht0 128(%rdi, %rcx, 4)

	movdqa (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d - s) | 0xff000000 */
	psubusb %xmm1, %xmm2
	por %xmm5, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqa (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d - s) | 0xff000000 */
	psubusb %xmm1, %xmm2
	por %xmm5, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqa (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d - s) | 0xff000000 */
	psubusb %xmm1, %xmm2
	por %xmm5, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqa (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d - s) | 0xff000000 */
	psubusb %xmm1, %xmm2
	por %xmm5, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	js 2b
	jmp 4f
3:
	/* main loop, unrolled to work on 64 byte chunks */
	/* unaligned src, aligned dst */
	prefetchnta 128(%rsi, %rcx, 4)
	prefetcht0 128(%rdi, %rcx, 4)

	movdqu (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d - s) | 0xff000000 */
	psubusb %xmm1, %xmm2
	por %xmm5, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqu (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d - s) | 0xff000000 */
	psubusb %xmm1, %xmm2
	por %xmm5, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqu (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d - s) | 0xff000000 */
	psubusb %xmm1, %xmm2
	por %xmm5, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	jns 4f

	movdqu (%rsi, %rcx, 4), %xmm1
	movdqa (%rdi, %rcx, 4), %xmm2
	/* d = (d - s) | 0xff000000 */
	psubusb %xmm1, %xmm2
	por %xmm5, %xmm2
	movdqa %xmm2, (%rdi, %rcx, 4)
	addq $4, %rcx
	js 3b
4:
	/* finish loop */
	cmp $2, %rcx
	jg 5f

	movd (%rsi, %rcx, 4), %xmm1
	movd (%rdi, %rcx, 4), %xmm2
	/* d = (d - s) | 0xff000000 */
	psubusb %xmm1, %xmm2
	por %xmm5, %xmm2
	movd %xmm2, (%rdi, %rcx, 4)
	incq %rcx
	jmp 4b
5:
	/* finish line */
	leaq (%rsi, %r10, 4), %rsi
	leaq (%rdi, %r11, 4), %rdi
	decq %r9
	jnz 0b

9:
	LEAVE
SIZE(imlib_amd64_subtract_copy_rgb_to_rgba)

PR_(imlib_amd64_reshade_blend_rgba_to_rgb):
	ENTER

	pxor %xmm4, %xmm4
	movdqu m000V0V0V000V0V0V(%rip), %xmm6
	movdqu m00XXXXXX(%rip), %xmm7

	/* Move right to left across each line, */
	/* processing in two pixel chunks */
	leaq (%rsi, %r8, 4), %rsi
	leaq (%rdi, %r8, 4), %rdi

	/* Last instruction is %rcx = 0 */
	subq $4, %rsi
	subq $4, %rdi

	negq %r8
0:
	movq %r8, %rcx

	incq %rcx

	/* prefetch a couple cache lines ahead */
	prefetchnta (%rsi, %rcx, 4)
	prefetcht0 (%rdi, %rcx, 4)
	prefetchnta 64(%rsi, %rcx, 4)
	prefetcht0 64(%rdi, %rcx, 4)

	jz 2f /* one pixel line */
1:
	/* main loop, unrolled to work on 64 byte chunks */
	prefetchnta 128(%rsi, %rcx, 4)
	prefetcht0 128(%rdi, %rcx, 4)

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Unpack alpha */
	movq %xmm1, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0xFF, %xmm3, %xmm3
	pshuflw $0xFF, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* Zero blending alpha */
	pand %xmm7, %xmm3

	/* Unpack src and dst */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d + (2 * a * (s - 127)) */
	psubw %xmm6, %xmm1
	psllw $2, %xmm1
	pmulhw %xmm3, %xmm1
	paddsw %xmm1, %xmm2

	/* Repack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Unpack alpha */
	movq %xmm1, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0xFF, %xmm3, %xmm3
	pshuflw $0xFF, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* Zero blending alpha */
	pand %xmm7, %xmm3

	/* Unpack src and dst */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d + (2 * a * (s - 127)) */
	psubw %xmm6, %xmm1
	psllw $2, %xmm1
	pmulhw %xmm3, %xmm1
	paddsw %xmm1, %xmm2

	/* Repack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Unpack alpha */
	movq %xmm1, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0xFF, %xmm3, %xmm3
	pshuflw $0xFF, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* Zero blending alpha */
	pand %xmm7, %xmm3

	/* Unpack src and dst */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d + (2 * a * (s - 127)) */
	psubw %xmm6, %xmm1
	psllw $2, %xmm1
	pmulhw %xmm3, %xmm1
	paddsw %xmm1, %xmm2

	/* Repack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Unpack alpha */
	movq %xmm1, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0xFF, %xmm3, %xmm3
	pshuflw $0xFF, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* Zero blending alpha */
	pand %xmm7, %xmm3

	/* Unpack src and dst */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d + (2 * a * (s - 127)) */
	psubw %xmm6, %xmm1
	psllw $2, %xmm1
	pmulhw %xmm3, %xmm1
	paddsw %xmm1, %xmm2

	/* Repack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Unpack alpha */
	movq %xmm1, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0xFF, %xmm3, %xmm3
	pshuflw $0xFF, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* Zero blending alpha */
	pand %xmm7, %xmm3

	/* Unpack src and dst */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d + (2 * a * (s - 127)) */
	psubw %xmm6, %xmm1
	psllw $2, %xmm1
	pmulhw %xmm3, %xmm1
	paddsw %xmm1, %xmm2

	/* Repack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Unpack alpha */
	movq %xmm1, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0xFF, %xmm3, %xmm3
	pshuflw $0xFF, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* Zero blending alpha */
	pand %xmm7, %xmm3

	/* Unpack src and dst */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d + (2 * a * (s - 127)) */
	psubw %xmm6, %xmm1
	psllw $2, %xmm1
	pmulhw %xmm3, %xmm1
	paddsw %xmm1, %xmm2

	/* Repack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Unpack alpha */
	movq %xmm1, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0xFF, %xmm3, %xmm3
	pshuflw $0xFF, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* Zero blending alpha */
	pand %xmm7, %xmm3

	/* Unpack src and dst */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d + (2 * a * (s - 127)) */
	psubw %xmm6, %xmm1
	psllw $2, %xmm1
	pmulhw %xmm3, %xmm1
	paddsw %xmm1, %xmm2

	/* Repack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Unpack alpha */
	movq %xmm1, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0xFF, %xmm3, %xmm3
	pshuflw $0xFF, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* Zero blending alpha */
	pand %xmm7, %xmm3

	/* Unpack src and dst */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d + (2 * a * (s - 127)) */
	psubw %xmm6, %xmm1
	psllw $2, %xmm1
	pmulhw %xmm3, %xmm1
	paddsw %xmm1, %xmm2

	/* Repack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	js 1b
	jnz 3f
2:
	movd (%rsi, %rcx, 4), %xmm1
	movd (%rdi, %rcx, 4), %xmm2
	/* Unpack alpha */
	movq %xmm1, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0xFF, %xmm3, %xmm3
	pshuflw $0xFF, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* Zero blending alpha */
	pand %xmm7, %xmm3

	/* Unpack src and dst */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d + (2 * a * (s - 127)) */
	psubw %xmm6, %xmm1
	psllw $2, %xmm1
	pmulhw %xmm3, %xmm1
	paddsw %xmm1, %xmm2

	/* Repack new pixels */
	packuswb %xmm4, %xmm2
	movd %xmm2, (%rdi, %rcx, 4)
3:
	leaq (%rsi, %r10, 4), %rsi
	leaq (%rdi, %r11, 4), %rdi
	decq %r9
	jnz 0b

9:
	LEAVE
SIZE(imlib_amd64_reshade_blend_rgba_to_rgb)

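/* Editor's note (not imlib2 text): reshading moves each channel toward
 * white or black depending on whether the source byte is above or below
 * the midpoint, scaled by the source alpha.  Scalar sketch of the
 * formula in the comments above:
 *
 *   static void reshade_blend_px(const unsigned char s[4],
 *                                unsigned char d[4])
 *   {
 *       int a = s[3];
 *       for (int i = 0; i < 3; i++) {
 *           int v = d[i] + (2 * a * (s[i] - 127)) / 255;
 *           d[i] = v < 0 ? 0 : (v > 255 ? 255 : v);
 *       }
 *   }
 *
 * The rgba variant below additionally halves the source alpha byte
 * (shrb $1) before building the factor, presumably so the doubled
 * reshade scale does not also double the destination alpha
 * accumulation.
 */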
PR_(imlib_amd64_reshade_blend_rgba_to_rgba):
	ENTER

	movq pow_lut@GOTPCREL(%rip), %r13
	pxor %xmm4, %xmm4
	movdqu c1(%rip), %xmm5
	movdqu mX000X000X000X000(%rip), %xmm6
	movdqu m0XXX0XXX0XXX0XXX(%rip), %xmm7
	movdqu m000V0V0V000V0V0V(%rip), %xmm8
	xorq %rax, %rax

	/* Move right to left across each line, */
	/* processing in two pixel chunks */
	leaq (%rsi, %r8, 4), %rsi
	leaq (%rdi, %r8, 4), %rdi

	/* Last instruction is %rcx = 0 */
	subq $4, %rsi
	subq $4, %rdi

	negq %r8
0:
	movq %r8, %rcx

	incq %rcx

	/* prefetch a couple cache lines ahead */
	prefetchnta (%rsi, %rcx, 4)
	prefetcht0 (%rdi, %rcx, 4)
	prefetchnta 64(%rsi, %rcx, 4)
	prefetcht0 64(%rdi, %rcx, 4)

	jz 2f /* one pixel line */
1:
	/* main loop, unrolled to work on 64 byte chunks */
	prefetchnta 128(%rsi, %rcx, 4)
	prefetcht0 128(%rdi, %rcx, 4)

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	/* Load two pixels as 00, 00, halved src alpha, combined alpha
	 * (from the pow_lut table in blend.c); the unrolled copies
	 * below repeat these same steps. */
	movzbq 7(%rdi, %rcx, 4), %rdx
	movb 7(%rsi, %rcx, 4), %dh
	movb (%r13, %rdx), %al
	movb %dh, %ah
	shrb $1, %ah
	movzbq 3(%rdi, %rcx, 4), %rdx
	movb 3(%rsi, %rcx, 4), %dh
	shlq $32, %rax
	movb (%r13, %rdx), %al
	movb %dh, %ah
	shrb $1, %ah
	movd %rax, %xmm3
	/* unpack alpha to src alpha, combined alpha x 3 */
	punpcklbw %xmm3, %xmm3
	pshufhw $0x40, %xmm3, %xmm3
	pshuflw $0x40, %xmm3, %xmm3
	psrlw $1, %xmm3

	/* src alpha = 255 - dst alpha */
	movdqa %xmm2, %xmm0
	pand %xmm6, %xmm0
	por %xmm6, %xmm1
	psubusb %xmm0, %xmm1

	/* unpack src and dst */
	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	/* d = d + (2 * a * (s - 127)) */
	psubw %xmm8, %xmm1
	psllw $2, %xmm1
	pmulhw %xmm3, %xmm1
	paddsw %xmm1, %xmm2

	/* pack new pixels */
	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	movzbq 7(%rdi, %rcx, 4), %rdx
	movb 7(%rsi, %rcx, 4), %dh
	movb (%r13, %rdx), %al
	movb %dh, %ah
	shrb $1, %ah
	movzbq 3(%rdi, %rcx, 4), %rdx
	movb 3(%rsi, %rcx, 4), %dh
	shlq $32, %rax
	movb (%r13, %rdx), %al
	movb %dh, %ah
	shrb $1, %ah
	movd %rax, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0x40, %xmm3, %xmm3
	pshuflw $0x40, %xmm3, %xmm3
	psrlw $1, %xmm3

	movdqa %xmm2, %xmm0
	pand %xmm6, %xmm0
	por %xmm6, %xmm1
	psubusb %xmm0, %xmm1

	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	psubw %xmm8, %xmm1
	psllw $2, %xmm1
	pmulhw %xmm3, %xmm1
	paddsw %xmm1, %xmm2

	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	movzbq 7(%rdi, %rcx, 4), %rdx
	movb 7(%rsi, %rcx, 4), %dh
	movb (%r13, %rdx), %al
	movb %dh, %ah
	shrb $1, %ah
	movzbq 3(%rdi, %rcx, 4), %rdx
	movb 3(%rsi, %rcx, 4), %dh
	shlq $32, %rax
	movb (%r13, %rdx), %al
	movb %dh, %ah
	shrb $1, %ah
	movd %rax, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0x40, %xmm3, %xmm3
	pshuflw $0x40, %xmm3, %xmm3
	psrlw $1, %xmm3

	movdqa %xmm2, %xmm0
	pand %xmm6, %xmm0
	por %xmm6, %xmm1
	psubusb %xmm0, %xmm1

	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	psubw %xmm8, %xmm1
	psllw $2, %xmm1
	pmulhw %xmm3, %xmm1
	paddsw %xmm1, %xmm2

	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	movzbq 7(%rdi, %rcx, 4), %rdx
	movb 7(%rsi, %rcx, 4), %dh
	movb (%r13, %rdx), %al
	movb %dh, %ah
	shrb $1, %ah
	movzbq 3(%rdi, %rcx, 4), %rdx
	movb 3(%rsi, %rcx, 4), %dh
	shlq $32, %rax
	movb (%r13, %rdx), %al
	movb %dh, %ah
	shrb $1, %ah
	movd %rax, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0x40, %xmm3, %xmm3
	pshuflw $0x40, %xmm3, %xmm3
	psrlw $1, %xmm3

	movdqa %xmm2, %xmm0
	pand %xmm6, %xmm0
	por %xmm6, %xmm1
	psubusb %xmm0, %xmm1

	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	psubw %xmm8, %xmm1
	psllw $2, %xmm1
	pmulhw %xmm3, %xmm1
	paddsw %xmm1, %xmm2

	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	movzbq 7(%rdi, %rcx, 4), %rdx
	movb 7(%rsi, %rcx, 4), %dh
	movb (%r13, %rdx), %al
	movb %dh, %ah
	shrb $1, %ah
	movzbq 3(%rdi, %rcx, 4), %rdx
	movb 3(%rsi, %rcx, 4), %dh
	shlq $32, %rax
	movb (%r13, %rdx), %al
	movb %dh, %ah
	shrb $1, %ah
	movd %rax, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0x40, %xmm3, %xmm3
	pshuflw $0x40, %xmm3, %xmm3
	psrlw $1, %xmm3

	movdqa %xmm2, %xmm0
	pand %xmm6, %xmm0
	por %xmm6, %xmm1
	psubusb %xmm0, %xmm1

	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	psubw %xmm8, %xmm1
	psllw $2, %xmm1
	pmulhw %xmm3, %xmm1
	paddsw %xmm1, %xmm2

	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	movzbq 7(%rdi, %rcx, 4), %rdx
	movb 7(%rsi, %rcx, 4), %dh
	movb (%r13, %rdx), %al
	movb %dh, %ah
	shrb $1, %ah
	movzbq 3(%rdi, %rcx, 4), %rdx
	movb 3(%rsi, %rcx, 4), %dh
	shlq $32, %rax
	movb (%r13, %rdx), %al
	movb %dh, %ah
	shrb $1, %ah
	movd %rax, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0x40, %xmm3, %xmm3
	pshuflw $0x40, %xmm3, %xmm3
	psrlw $1, %xmm3

	movdqa %xmm2, %xmm0
	pand %xmm6, %xmm0
	por %xmm6, %xmm1
	psubusb %xmm0, %xmm1

	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	psubw %xmm8, %xmm1
	psllw $2, %xmm1
	pmulhw %xmm3, %xmm1
	paddsw %xmm1, %xmm2

	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	movzbq 7(%rdi, %rcx, 4), %rdx
	movb 7(%rsi, %rcx, 4), %dh
	movb (%r13, %rdx), %al
	movb %dh, %ah
	shrb $1, %ah
	movzbq 3(%rdi, %rcx, 4), %rdx
	movb 3(%rsi, %rcx, 4), %dh
	shlq $32, %rax
	movb (%r13, %rdx), %al
	movb %dh, %ah
	shrb $1, %ah
	movd %rax, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0x40, %xmm3, %xmm3
	pshuflw $0x40, %xmm3, %xmm3
	psrlw $1, %xmm3

	movdqa %xmm2, %xmm0
	pand %xmm6, %xmm0
	por %xmm6, %xmm1
	psubusb %xmm0, %xmm1

	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	psubw %xmm8, %xmm1
	psllw $2, %xmm1
	pmulhw %xmm3, %xmm1
	paddsw %xmm1, %xmm2

	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	jz 2f
	jns 3f

	movq (%rsi, %rcx, 4), %xmm1
	movq (%rdi, %rcx, 4), %xmm2
	movzbq 7(%rdi, %rcx, 4), %rdx
	movb 7(%rsi, %rcx, 4), %dh
	movb (%r13, %rdx), %al
	movb %dh, %ah
	shrb $1, %ah
	movzbq 3(%rdi, %rcx, 4), %rdx
	movb 3(%rsi, %rcx, 4), %dh
	shlq $32, %rax
	movb (%r13, %rdx), %al
	movb %dh, %ah
	shrb $1, %ah
	movd %rax, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0x40, %xmm3, %xmm3
	pshuflw $0x40, %xmm3, %xmm3
	psrlw $1, %xmm3

	movdqa %xmm2, %xmm0
	pand %xmm6, %xmm0
	por %xmm6, %xmm1
	psubusb %xmm0, %xmm1

	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	psubw %xmm8, %xmm1
	psllw $2, %xmm1
	pmulhw %xmm3, %xmm1
	paddsw %xmm1, %xmm2

	packuswb %xmm4, %xmm2
	movq %xmm2, (%rdi, %rcx, 4)

	incq %rcx
	incq %rcx
	js 1b
	jnz 3f
2:
	movd (%rsi, %rcx, 4), %xmm1
	movd (%rdi, %rcx, 4), %xmm2
	/* Load one pixel as 00, 00, halved src alpha, combined alpha */
	movzbq 3(%rdi, %rcx, 4), %rdx
	movb 3(%rsi, %rcx, 4), %dh
	movb (%r13, %rdx), %al
	movb %dh, %ah
	shrb $1, %ah
	movd %eax, %xmm3
	punpcklbw %xmm3, %xmm3
	pshufhw $0x40, %xmm3, %xmm3
	pshuflw $0x40, %xmm3, %xmm3
	psrlw $1, %xmm3

	movdqa %xmm2, %xmm0
	pand %xmm6, %xmm0
	por %xmm6, %xmm1
	psubusb %xmm0, %xmm1

	punpcklbw %xmm4, %xmm1
	punpcklbw %xmm4, %xmm2

	psubw %xmm8, %xmm1
	psllw $2, %xmm1
	pmulhw %xmm3, %xmm1
	paddsw %xmm1, %xmm2

	packuswb %xmm4, %xmm2
	movd %xmm2, (%rdi, %rcx, 4)
3:
	leaq (%rsi, %r10, 4), %rsi
	leaq (%rdi, %r11, 4), %rdi
	decq %r9
	jnz 0b

9:
	LEAVE
SIZE(imlib_amd64_reshade_blend_rgba_to_rgba)

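/* Editor's note (not imlib2 text): the reshade copy routines below stay
 * entirely in 8-bit lanes by splitting the signed delta 2 * (s - 127)
 * into an "add" part for bytes above the midpoint and a "subtract" part
 * for bytes below it, each built with unsigned byte saturation.  Scalar
 * sketch of one channel:
 *
 *   static unsigned char reshade_copy_byte(unsigned char s,
 *                                          unsigned char d)
 *   {
 *       unsigned int up   = s > 127 ? 2u * (s - 127) : 0;  // psubusb/paddusb
 *       unsigned int down = s < 128 ? 2u * (128 - s) : 0;  // paddusb/pxor/paddusb
 *       if (up > 255)   up = 255;
 *       if (down > 255) down = 255;
 *       int v = (int)d + (int)up - (int)down;
 *       return v < 0 ? 0 : (v > 255 ? 255 : v);
 *   }
 */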
PR_(imlib_amd64_reshade_copy_rgba_to_rgb):
|
|
ENTER
|
|
|
|
movdqu m0XXX0XXX0XXX0XXX(%rip), %xmm5
|
|
movdqu m0VVV0VVV0VVV0VVV(%rip), %xmm6
|
|
|
|
leaq (%rsi, %r8, 4), %rsi
|
|
leaq (%rdi, %r8, 4), %rdi
|
|
|
|
subq $12, %rsi
|
|
subq $12, %rdi
|
|
|
|
negq %r8
0:
        movq %r8, %rcx

        /* if < 4 pixels left, goto end */
        addq $3, %rcx
        jns 4f
1:
        /* 16 byte align dst ptr */
        leaq (%rdi, %rcx, 4), %rdx
        test $0x0f, %rdx
        jz 1f

        movd (%rsi, %rcx, 4), %xmm1
        movd (%rdi, %rcx, 4), %xmm2
        /* To take advantage of unsigned saturation and process a whole
         * register at a time, reshading is divided into two separate
         * steps: adding values above 128, and subtracting values below
         * 128.  These amounts go into %xmm1 and %xmm3 respectively:
         * - %xmm1 becomes (2 * (s - 127))
         * - %xmm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s))
         */
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* dest alpha should not be changed in this func */
        pand %xmm5, %xmm1
        pand %xmm5, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2
        movd %xmm2, (%rdi, %rcx, 4)
        incq %rcx
        jz 4f
        jmp 1b
1:
        /* prefetch a couple cache lines ahead */
        prefetchnta (%rsi, %rcx, 4)
        prefetcht0 (%rdi, %rcx, 4)
        prefetchnta 64(%rsi, %rcx, 4)
        prefetcht0 64(%rdi, %rcx, 4)

        /* test if 16 byte aligned src ptr */
        leaq (%rsi, %rcx, 4), %rdx
        test $0x0f, %rdx
        jnz 3f
2:
        /* main loop, unrolled to work on 64 byte chunks */
        /* aligned src, aligned dst */
        prefetchnta 128(%rsi, %rcx, 4)
        prefetcht0 128(%rdi, %rcx, 4)

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* %xmm1 = 2 * (s - 127), %xmm3 = 2 * (128 - s), as above */
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* dest alpha should not be changed in this func */
        pand %xmm5, %xmm1
        pand %xmm5, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* %xmm1 = 2 * (s - 127), %xmm3 = 2 * (128 - s), as above */
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* dest alpha should not be changed in this func */
        pand %xmm5, %xmm1
        pand %xmm5, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* %xmm1 = 2 * (s - 127), %xmm3 = 2 * (128 - s), as above */
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* dest alpha should not be changed in this func */
        pand %xmm5, %xmm1
        pand %xmm5, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* %xmm1 = 2 * (s - 127), %xmm3 = 2 * (128 - s), as above */
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* dest alpha should not be changed in this func */
        pand %xmm5, %xmm1
        pand %xmm5, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        js 2b
        jmp 4f
3:
        /* main loop, unrolled to work on 64 byte chunks */
        /* unaligned src, aligned dst */
        prefetchnta 128(%rsi, %rcx, 4)
        prefetcht0 128(%rdi, %rcx, 4)

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* %xmm1 = 2 * (s - 127), %xmm3 = 2 * (128 - s), as above */
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* dest alpha should not be changed in this func */
        pand %xmm5, %xmm1
        pand %xmm5, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* %xmm1 = 2 * (s - 127), %xmm3 = 2 * (128 - s), as above */
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* dest alpha should not be changed in this func */
        pand %xmm5, %xmm1
        pand %xmm5, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* %xmm1 = 2 * (s - 127), %xmm3 = 2 * (128 - s), as above */
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* dest alpha should not be changed in this func */
        pand %xmm5, %xmm1
        pand %xmm5, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* %xmm1 = 2 * (s - 127), %xmm3 = 2 * (128 - s), as above */
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* dest alpha should not be changed in this func */
        pand %xmm5, %xmm1
        pand %xmm5, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        js 3b
4:
        /* finish loop */
        cmp $2, %rcx
        jg 5f

        movd (%rsi, %rcx, 4), %xmm1
        movd (%rdi, %rcx, 4), %xmm2
        /* %xmm1 = 2 * (s - 127), %xmm3 = 2 * (128 - s), as above */
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* dest alpha should not be changed in this func */
        pand %xmm5, %xmm1
        pand %xmm5, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2
        movd %xmm2, (%rdi, %rcx, 4)
        incq %rcx
        jmp 4b
5:
        /* finish line */
        leaq (%rsi, %r10, 4), %rsi
        leaq (%rdi, %r11, 4), %rdi
        decq %r9
        jnz 0b

9:
        LEAVE
SIZE(imlib_amd64_reshade_copy_rgba_to_rgb)
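
/*\
|*| reshade copy, RGBA -> RGBA: the same per-channel reshade as above,
|*| except that the destination alpha is replaced by the source alpha
|*| (kept in %xmm0 and masked in via mX000X000X000X000).
\*/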
PR_(imlib_amd64_reshade_copy_rgba_to_rgba):
        ENTER

        movdqu m0XXX0XXX0XXX0XXX(%rip), %xmm5
        movdqu m0VVV0VVV0VVV0VVV(%rip), %xmm6
        movdqu mX000X000X000X000(%rip), %xmm7

        leaq (%rsi, %r8, 4), %rsi
        leaq (%rdi, %r8, 4), %rdi

        subq $12, %rsi
        subq $12, %rdi

        negq %r8
0:
        movq %r8, %rcx

        /* if < 4 pixels left, goto end */
        addq $3, %rcx
        jns 4f
1:
        /* 16 byte align dst ptr */
        leaq (%rdi, %rcx, 4), %rdx
        test $0x0f, %rdx
        jz 1f

        movd (%rsi, %rcx, 4), %xmm1
        movd (%rdi, %rcx, 4), %xmm2
        /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        movdqa %xmm1, %xmm0
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2

        /* d alpha = s alpha */
        pand %xmm5, %xmm2
        pand %xmm7, %xmm0
        por %xmm0, %xmm2
        movd %xmm2, (%rdi, %rcx, 4)
        incq %rcx
        jz 4f
        jmp 1b
1:
        /* prefetch a couple cache lines ahead */
        prefetchnta (%rsi, %rcx, 4)
        prefetcht0 (%rdi, %rcx, 4)
        prefetchnta 64(%rsi, %rcx, 4)
        prefetcht0 64(%rdi, %rcx, 4)

        /* test if 16 byte aligned src ptr */
        leaq (%rsi, %rcx, 4), %rdx
        test $0x0f, %rdx
        jnz 3f
2:
        /* main loop, unrolled to work on 64 byte chunks */
        /* aligned src, aligned dst */
        prefetchnta 128(%rsi, %rcx, 4)
        prefetcht0 128(%rdi, %rcx, 4)

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        movdqa %xmm1, %xmm0
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2

        /* d alpha = s alpha */
        pand %xmm5, %xmm2
        pand %xmm7, %xmm0
        por %xmm0, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        movdqa %xmm1, %xmm0
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2

        /* d alpha = s alpha */
        pand %xmm5, %xmm2
        pand %xmm7, %xmm0
        por %xmm0, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        movdqa %xmm1, %xmm0
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2

        /* d alpha = s alpha */
        pand %xmm5, %xmm2
        pand %xmm7, %xmm0
        por %xmm0, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        movdqa %xmm1, %xmm0
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2

        /* d alpha = s alpha */
        pand %xmm5, %xmm2
        pand %xmm7, %xmm0
        por %xmm0, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        js 2b
        jmp 4f
3:
        /* main loop, unrolled to work on 64 byte chunks */
        /* unaligned src, aligned dst */
        prefetchnta 128(%rsi, %rcx, 4)
        prefetcht0 128(%rdi, %rcx, 4)

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        movdqa %xmm1, %xmm0
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2

        /* d alpha = s alpha */
        pand %xmm5, %xmm2
        pand %xmm7, %xmm0
        por %xmm0, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        movdqa %xmm1, %xmm0
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2

        /* d alpha = s alpha */
        pand %xmm5, %xmm2
        pand %xmm7, %xmm0
        por %xmm0, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        movdqa %xmm1, %xmm0
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2

        /* d alpha = s alpha */
        pand %xmm5, %xmm2
        pand %xmm7, %xmm0
        por %xmm0, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        movdqa %xmm1, %xmm0
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2

        /* d alpha = s alpha */
        pand %xmm5, %xmm2
        pand %xmm7, %xmm0
        por %xmm0, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        js 3b
4:
        /* finish loop */
        cmp $2, %rcx
        jg 5f

        movd (%rsi, %rcx, 4), %xmm1
        movd (%rdi, %rcx, 4), %xmm2
        /* This time, dest alpha = src alpha, so src alpha is saved in %xmm0 */
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        movdqa %xmm1, %xmm0
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2

        /* d alpha = s alpha */
        pand %xmm5, %xmm2
        pand %xmm7, %xmm0
        por %xmm0, %xmm2
        movd %xmm2, (%rdi, %rcx, 4)
        incq %rcx
        jmp 4b
5:
        /* finish line */
        leaq (%rsi, %r10, 4), %rsi
        leaq (%rdi, %r11, 4), %rdi
        decq %r9
        jnz 0b

9:
        LEAVE
SIZE(imlib_amd64_reshade_copy_rgba_to_rgba)
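
/*\
|*| reshade copy, RGB -> RGBA: the same per-channel reshade, with the
|*| destination alpha forced to 0xff by ORing in mX000X000X000X000.
\*/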
PR_(imlib_amd64_reshade_copy_rgb_to_rgba):
        ENTER

        movdqu m0XXX0XXX0XXX0XXX(%rip), %xmm5
        movdqu m0VVV0VVV0VVV0VVV(%rip), %xmm6
        movdqu mX000X000X000X000(%rip), %xmm7

        leaq (%rsi, %r8, 4), %rsi
        leaq (%rdi, %r8, 4), %rdi

        subq $12, %rsi
        subq $12, %rdi

        negq %r8
0:
        movq %r8, %rcx

        /* if < 4 pixels left, goto end */
        addq $3, %rcx
        jns 4f
1:
        /* 16 byte align dst ptr */
        leaq (%rdi, %rcx, 4), %rdx
        test $0x0f, %rdx
        jz 1f

        movd (%rsi, %rcx, 4), %xmm1
        movd (%rdi, %rcx, 4), %xmm2
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2

        /* d alpha = 0xff */
        por %xmm7, %xmm2
        movd %xmm2, (%rdi, %rcx, 4)
        incq %rcx
        jz 4f
        jmp 1b
1:
        /* prefetch a couple cache lines ahead */
        prefetchnta (%rsi, %rcx, 4)
        prefetcht0 (%rdi, %rcx, 4)
        prefetchnta 64(%rsi, %rcx, 4)
        prefetcht0 64(%rdi, %rcx, 4)

        /* test if 16 byte aligned src ptr */
        leaq (%rsi, %rcx, 4), %rdx
        test $0x0f, %rdx
        jnz 3f
2:
        /* main loop, unrolled to work on 64 byte chunks */
        /* aligned src, aligned dst */
        prefetchnta 128(%rsi, %rcx, 4)
        prefetcht0 128(%rdi, %rcx, 4)

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2

        /* d alpha = 0xff */
        por %xmm7, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2

        /* d alpha = 0xff */
        por %xmm7, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2

        /* d alpha = 0xff */
        por %xmm7, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqa (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2

        /* d alpha = 0xff */
        por %xmm7, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        js 2b
        jmp 4f
3:
        /* main loop, unrolled to work on 64 byte chunks */
        /* unaligned src, aligned dst */
        prefetchnta 128(%rsi, %rcx, 4)
        prefetcht0 128(%rdi, %rcx, 4)

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2

        /* d alpha = 0xff */
        por %xmm7, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2

        /* d alpha = 0xff */
        por %xmm7, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2

        /* d alpha = 0xff */
        por %xmm7, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        jns 4f

        movdqu (%rsi, %rcx, 4), %xmm1
        movdqa (%rdi, %rcx, 4), %xmm2
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2

        /* d alpha = 0xff */
        por %xmm7, %xmm2
        movdqa %xmm2, (%rdi, %rcx, 4)
        addq $4, %rcx
        js 3b
4:
        /* finish loop */
        cmp $2, %rcx
        jg 5f

        movd (%rsi, %rcx, 4), %xmm1
        movd (%rdi, %rcx, 4), %xmm2
        movdqa %xmm1, %xmm3
        psubusb %xmm6, %xmm1
        paddusb %xmm1, %xmm1
        paddusb %xmm6, %xmm3
        pxor %xmm5, %xmm3
        paddusb %xmm3, %xmm3

        /* d = d + s1 - s2, unsigned saturation */
        paddusb %xmm1, %xmm2
        psubusb %xmm3, %xmm2

        /* d alpha = 0xff */
        por %xmm7, %xmm2
        movd %xmm2, (%rdi, %rcx, 4)
        incq %rcx
        jmp 4b
5:
        /* finish line */
        leaq (%rsi, %r10, 4), %rsi
        leaq (%rdi, %r11, 4), %rdi
        decq %r9
        jnz 0b

9:
        LEAVE
SIZE(imlib_amd64_reshade_copy_rgb_to_rgba)

#ifdef __ELF__
.section .note.GNU-stack,"",@progbits
#endif