legacy-imlib2/src/lib/asm_blend_cmod.S

1566 lines
30 KiB
ArmAsm

#include <config.h>
#include "asm.h"
/*\
|*| MMX assembly blending routines, with colour modding, for Imlib2
|*| Written by Willem Monsuwe <willem@stack.nl>
|*|
|*| Special (hairy) constructs are only commented on first use.
\*/
/*\ All functions have the same calling convention:
|*| PR_(imlib_mmx_<op>_rgba_to_rgb[A]_cmod)(void *src, int sw, void *dst, int dw,
|*| int w, int h, ImlibColorModifier *cm)
|*| The names below are offsets from %ebp and are valid once ENTER has
|*| set up the frame; each argument occupies one 4-byte stack slot,
|*| the first one at 8(%ebp).
\*/
#define src 8(%ebp)
#define sw 12(%ebp)
#define dst 16(%ebp)
#define dw 20(%ebp)
#define w 24(%ebp)
#define h 28(%ebp)
#define cm 32(%ebp)
/*\ Cmod tables, from %ebx (ENTER loads %ebx with cm).
|*| Four byte tables spaced 0x100 apart: red at offset 0, green at 0x100,
|*| blue at 0x200 and alpha at 0x300; x is the index register.
|*| amap_ff is the fixed entry 0xff of the alpha table (0x300 + 0xff).
\*/
#define rmap(x) (%ebx, x)
#define gmap(x) 0x100(%ebx, x)
#define bmap(x) 0x200(%ebx, x)
#define amap(x) 0x300(%ebx, x)
#define amap_ff 0x3ff(%ebx)
/*\ Code section. FN_() is invoked once for each of the 28 routines
|*| defined in this file (presumably it declares the symbol; it is not
|*| defined here -- see asm.h).
|*| asm_loadimmq.S is included afterwards; LOAD_IMMQ and
|*| CLEANUP_IMMQ_LOADS used below are assumed to come from it.
\*/
.text
.align 8
FN_(imlib_mmx_blend_rgba_to_rgb_cmod)
FN_(imlib_mmx_blend_rgba_to_rgba_cmod)
FN_(imlib_mmx_blend_rgb_to_rgb_cmod)
FN_(imlib_mmx_blend_rgb_to_rgba_cmod)
FN_(imlib_mmx_copy_rgba_to_rgb_cmod)
FN_(imlib_mmx_copy_rgba_to_rgba_cmod)
FN_(imlib_mmx_copy_rgb_to_rgba_cmod)
FN_(imlib_mmx_add_blend_rgba_to_rgb_cmod)
FN_(imlib_mmx_add_blend_rgba_to_rgba_cmod)
FN_(imlib_mmx_add_blend_rgb_to_rgb_cmod)
FN_(imlib_mmx_add_blend_rgb_to_rgba_cmod)
FN_(imlib_mmx_add_copy_rgba_to_rgb_cmod)
FN_(imlib_mmx_add_copy_rgba_to_rgba_cmod)
FN_(imlib_mmx_add_copy_rgb_to_rgba_cmod)
FN_(imlib_mmx_subtract_blend_rgba_to_rgb_cmod)
FN_(imlib_mmx_subtract_blend_rgba_to_rgba_cmod)
FN_(imlib_mmx_subtract_blend_rgb_to_rgb_cmod)
FN_(imlib_mmx_subtract_blend_rgb_to_rgba_cmod)
FN_(imlib_mmx_subtract_copy_rgba_to_rgb_cmod)
FN_(imlib_mmx_subtract_copy_rgba_to_rgba_cmod)
FN_(imlib_mmx_subtract_copy_rgb_to_rgba_cmod)
FN_(imlib_mmx_reshade_blend_rgba_to_rgb_cmod)
FN_(imlib_mmx_reshade_blend_rgba_to_rgba_cmod)
FN_(imlib_mmx_reshade_blend_rgb_to_rgb_cmod)
FN_(imlib_mmx_reshade_blend_rgb_to_rgba_cmod)
FN_(imlib_mmx_reshade_copy_rgba_to_rgb_cmod)
FN_(imlib_mmx_reshade_copy_rgba_to_rgba_cmod)
FN_(imlib_mmx_reshade_copy_rgb_to_rgba_cmod)
#include "asm_loadimmq.S"
/*\ MMX register use:
|*| %mm0 = Scratch (used by the LOAD*_CMOD macros and some routines)
|*| %mm1 = Source value
|*| %mm2 = Destination value
|*| %mm3 = Alpha value
|*| %mm4 = 0
|*| %mm5-%mm7 = masks and constants
\*/
/*\ Common code \*/
/*\ Save registers, load common parameters.
|*| After this macro:
|*|   %ebx = cm, base of the colour modifier tables
|*|   %esi = src + 4 * w, %edi = dst + 4 * w (end of the first row)
|*|   %ecx = -w
|*|   %edx = h - 1, the row counter that LOOP_END runs down to -1
|*| Branches to label 9 (in LEAVE) when w == 0 or h <= 0.
|*| The height test uses js, not jz: for h == 1 the decrement yields 0
|*| and that single row still has to be processed.
|*| The last line carries no continuation backslash, so the macro ends
|*| here even when no blank line follows it.
\*/
#define ENTER \
	pushl %ebp ;\
	movl %esp, %ebp ;\
	pushl %ebx ;\
	pushl %ecx ;\
	pushl %edx ;\
	pushl %edi ;\
	pushl %esi ;\
	movl cm, %ebx ;\
	movl src, %esi ;\
	movl dst, %edi ;\
	movl w, %ecx ;\
	leal (%esi, %ecx, 4), %esi ;\
	leal (%edi, %ecx, 4), %edi ;\
	negl %ecx ;\
	jz 9f ;\
	movl h, %edx ;\
	decl %edx ;\
	js 9f
/*\ Start of the per-row loop (label 8 is the branch target of LOOP_END):
|*| reload %ecx with -w so the pixel loop can count it up towards 0.
\*/
#define LOOP_START \
8: ;\
movl w, %ecx ;\
negl %ecx
/*\ End of the per-row loop: advance %esi by sw pixels and %edi by dw
|*| pixels (4 bytes each), then count the row counter in %edx down and
|*| branch back to label 8 until it drops below zero. Overwrites %ecx.
\*/
#define LOOP_END \
movl sw, %ecx ;\
leal (%esi, %ecx, 4), %esi ;\
movl dw, %ecx ;\
leal (%edi, %ecx, 4), %edi ;\
decl %edx ;\
jns 8b
/*\ Common epilogue and the target of every "9f" branch:
|*| leave MMX state with emms, restore the registers ENTER pushed
|*| (in reverse push order), tear the %ebp frame down and return.
\*/
#define LEAVE \
9: ;\
	emms ;\
	popl %esi ;\
	popl %edi ;\
	popl %edx ;\
	popl %ecx ;\
	popl %ebx ;\
	leave ;\
	ret
/*\ Load one value, colourmod it, and put it in %mm1
|*| Reads the four bytes of the pixel at (%esi, %ecx, 4): byte 3 is mapped
|*| through amap, byte 2 through rmap, byte 1 through gmap and byte 0
|*| through bmap. The mapped bytes are packed back in the same byte
|*| order into the low 32 bits of %mm1.
|*| Uses %eax and %mm0 as scratch; table lookups and MMX shifts/ors are
|*| interleaved.
\*/
#define LOAD1_CMOD \
movzbl 3(%esi, %ecx, 4), %eax ;\
movzbl amap(%eax), %eax ;\
movd %eax, %mm1 ;\
movzbl 2(%esi, %ecx, 4), %eax ;\
psllq $8, %mm1 ;\
movzbl rmap(%eax), %eax ;\
movd %eax, %mm0 ;\
movzbl 1(%esi, %ecx, 4), %eax ;\
por %mm0, %mm1 ;\
movzbl gmap(%eax), %eax ;\
psllq $8, %mm1 ;\
movd %eax, %mm0 ;\
movzbl (%esi, %ecx, 4), %eax ;\
por %mm0, %mm1 ;\
movzbl bmap(%eax), %eax ;\
psllq $8, %mm1 ;\
movd %eax, %mm0 ;\
por %mm0, %mm1
/*\ Load two values, colourmod them, and put them in %mm1
|*| Same as LOAD1_CMOD for the two adjacent pixels at (%esi, %ecx, 4):
|*| bytes 7..0 are mapped through amap, rmap, gmap, bmap, amap, rmap,
|*| gmap, bmap respectively and packed into the 64 bits of %mm1 in the
|*| same byte order. Uses %eax and %mm0 as scratch.
\*/
#define LOAD2_CMOD \
movzbl 7(%esi, %ecx, 4), %eax ;\
movzbl amap(%eax), %eax ;\
movd %eax, %mm1 ;\
movzbl 6(%esi, %ecx, 4), %eax ;\
psllq $8, %mm1 ;\
movzbl rmap(%eax), %eax ;\
movd %eax, %mm0 ;\
movzbl 5(%esi, %ecx, 4), %eax ;\
por %mm0, %mm1 ;\
movzbl gmap(%eax), %eax ;\
psllq $8, %mm1 ;\
movd %eax, %mm0 ;\
movzbl 4(%esi, %ecx, 4), %eax ;\
por %mm0, %mm1 ;\
movzbl bmap(%eax), %eax ;\
psllq $8, %mm1 ;\
movd %eax, %mm0 ;\
movzbl 3(%esi, %ecx, 4), %eax ;\
por %mm0, %mm1 ;\
movzbl amap(%eax), %eax ;\
psllq $8, %mm1 ;\
movd %eax, %mm0 ;\
movzbl 2(%esi, %ecx, 4), %eax ;\
por %mm0, %mm1 ;\
movzbl rmap(%eax), %eax ;\
psllq $8, %mm1 ;\
movd %eax, %mm0 ;\
movzbl 1(%esi, %ecx, 4), %eax ;\
por %mm0, %mm1 ;\
movzbl gmap(%eax), %eax ;\
psllq $8, %mm1 ;\
movd %eax, %mm0 ;\
movzbl (%esi, %ecx, 4), %eax ;\
por %mm0, %mm1 ;\
movzbl bmap(%eax), %eax ;\
psllq $8, %mm1 ;\
movd %eax, %mm0 ;\
por %mm0, %mm1
/*\ Load one value, alpha 0xff, colourmod it, and put it in %mm1
|*| Like LOAD1_CMOD, except that byte 3 of the source pixel is not read:
|*| the alpha byte of the result is the alpha table entry for 0xff
|*| (amap_ff). Uses %eax and %mm0 as scratch.
\*/
#define LOAD1_CMOD_AFF \
movzbl amap_ff, %eax ;\
movd %eax, %mm1 ;\
movzbl 2(%esi, %ecx, 4), %eax ;\
psllq $8, %mm1 ;\
movzbl rmap(%eax), %eax ;\
movd %eax, %mm0 ;\
movzbl 1(%esi, %ecx, 4), %eax ;\
por %mm0, %mm1 ;\
movzbl gmap(%eax), %eax ;\
psllq $8, %mm1 ;\
movd %eax, %mm0 ;\
movzbl (%esi, %ecx, 4), %eax ;\
por %mm0, %mm1 ;\
movzbl bmap(%eax), %eax ;\
psllq $8, %mm1 ;\
movd %eax, %mm0 ;\
por %mm0, %mm1
/*\ Load two values, alpha 0xff, colourmod them, and put them in %mm1
|*| Two-pixel form of LOAD1_CMOD_AFF: source bytes 3 and 7 are not read,
|*| both alpha bytes of the result are amap_ff; the colour bytes go
|*| through rmap, gmap and bmap. Uses %eax and %mm0 as scratch.
\*/
#define LOAD2_CMOD_AFF \
movzbl amap_ff, %eax ;\
movd %eax, %mm1 ;\
movzbl 6(%esi, %ecx, 4), %eax ;\
psllq $8, %mm1 ;\
movzbl rmap(%eax), %eax ;\
movd %eax, %mm0 ;\
movzbl 5(%esi, %ecx, 4), %eax ;\
por %mm0, %mm1 ;\
movzbl gmap(%eax), %eax ;\
psllq $8, %mm1 ;\
movd %eax, %mm0 ;\
movzbl 4(%esi, %ecx, 4), %eax ;\
por %mm0, %mm1 ;\
movzbl bmap(%eax), %eax ;\
psllq $8, %mm1 ;\
movd %eax, %mm0 ;\
por %mm0, %mm1 ;\
movzbl amap_ff, %eax ;\
psllq $8, %mm1 ;\
movd %eax, %mm0 ;\
movzbl 2(%esi, %ecx, 4), %eax ;\
por %mm0, %mm1 ;\
movzbl rmap(%eax), %eax ;\
psllq $8, %mm1 ;\
movd %eax, %mm0 ;\
movzbl 1(%esi, %ecx, 4), %eax ;\
por %mm0, %mm1 ;\
movzbl gmap(%eax), %eax ;\
psllq $8, %mm1 ;\
movd %eax, %mm0 ;\
movzbl (%esi, %ecx, 4), %eax ;\
por %mm0, %mm1 ;\
movzbl bmap(%eax), %eax ;\
psllq $8, %mm1 ;\
movd %eax, %mm0 ;\
por %mm0, %mm1
/*\ Load one value, colourmod it, alpha 0, and put it in %mm1
|*| Only bytes 2, 1 and 0 of the source pixel are read and mapped through
|*| rmap, gmap and bmap; nothing is ever placed in byte 3 of the result,
|*| so it stays zero. Uses %eax and %mm0 as scratch.
\*/
#define LOAD1_CMOD_A00 \
movzbl 2(%esi, %ecx, 4), %eax ;\
movzbl rmap(%eax), %eax ;\
movd %eax, %mm1 ;\
movzbl 1(%esi, %ecx, 4), %eax ;\
psllq $8, %mm1 ;\
movzbl gmap(%eax), %eax ;\
movd %eax, %mm0 ;\
movzbl (%esi, %ecx, 4), %eax ;\
por %mm0, %mm1 ;\
movzbl bmap(%eax), %eax ;\
psllq $8, %mm1 ;\
movd %eax, %mm0 ;\
por %mm0, %mm1
/*\ Load two values, colourmod them, alpha 0, and put them in %mm1
|*| Two-pixel form of LOAD1_CMOD_A00. After the three colour bytes of the
|*| upper pixel, the accumulator is shifted by 16 instead of 8: that
|*| leaves a zero byte where the alpha of the lower pixel would go.
|*| Bytes 3 and 7 of the result are zero. Uses %eax and %mm0 as scratch.
\*/
#define LOAD2_CMOD_A00 \
movzbl 6(%esi, %ecx, 4), %eax ;\
movzbl rmap(%eax), %eax ;\
movd %eax, %mm1 ;\
movzbl 5(%esi, %ecx, 4), %eax ;\
psllq $8, %mm1 ;\
movzbl gmap(%eax), %eax ;\
movd %eax, %mm0 ;\
movzbl 4(%esi, %ecx, 4), %eax ;\
por %mm0, %mm1 ;\
movzbl bmap(%eax), %eax ;\
psllq $8, %mm1 ;\
movd %eax, %mm0 ;\
movzbl 2(%esi, %ecx, 4), %eax ;\
por %mm0, %mm1 ;\
movzbl rmap(%eax), %eax ;\
psllq $16, %mm1 ;\
movd %eax, %mm0 ;\
movzbl 1(%esi, %ecx, 4), %eax ;\
por %mm0, %mm1 ;\
movzbl gmap(%eax), %eax ;\
psllq $8, %mm1 ;\
movd %eax, %mm0 ;\
movzbl (%esi, %ecx, 4), %eax ;\
por %mm0, %mm1 ;\
movzbl bmap(%eax), %eax ;\
psllq $8, %mm1 ;\
movd %eax, %mm0 ;\
por %mm0, %mm1
/*\ blend_rgba_to_rgb_cmod: blend a colour-modified RGBA source onto the
|*| destination, one pixel per pass of the inner loop (%ecx counts up
|*| from -w to 0). The factor applied to the alpha word is shifted out,
|*| so the destination alpha is written back unchanged.
\*/
PR_(imlib_mmx_blend_rgba_to_rgb_cmod):
ENTER
pxor %mm4, %mm4
LOAD_IMMQ(c1, %mm5)
CLEANUP_IMMQ_LOADS(1)
LOOP_START
1:
/*\ Load source and destination \*/
LOAD1_CMOD
movd (%edi, %ecx, 4), %mm2
/*\ Get alpha from source and unpack/copy to eight bytes
|*| which are treated as four words.
|*| After the shift by one bit the result lies in [0, 0x7fff] and is
|*| used as a fixed point value in [0.0, 1.0) by taking the high word
|*| of the 16->32 multiplications.
|*| (Because we want the unsigned value we shift one bit,
|*| and also shift the other factor to compensate.)
|*| Magic to get the fourth byte: lhh
\*/
movq %mm1, %mm3
punpcklbw %mm3, %mm3
punpckhwd %mm3, %mm3
punpckhdq %mm3, %mm3
psrlw $1, %mm3
/*\ Make the alpha value that gets multiplied to the
|*| alpha channels 0, so the resulting alpha value is
|*| the destination alpha value.
\*/
psrlq $16, %mm3
/*\ Unpack source and destination, bytes to words \*/
punpcklbw %mm4, %mm1
punpcklbw %mm4, %mm2
/*\ d = d + (a * ((s - d) + 0.5)) \*/
psubw %mm2, %mm1
psllw $1, %mm1
paddw %mm5, %mm1 /*\ Roundoff \*/
pmulhw %mm3, %mm1
paddw %mm1, %mm2
/*\ Pack into lower 4 bytes and save \*/
packuswb %mm4, %mm2
movd %mm2, (%edi, %ecx, 4)
incl %ecx
js 1b
LOOP_END
LEAVE
SIZE(imlib_mmx_blend_rgba_to_rgb_cmod)
/*\ blend_rgba_to_rgba_cmod: blend a colour-modified RGBA source onto an
|*| RGBA destination, one pixel per pass of the inner loop.
|*| The blend factor is built from both alphas (see below). The source
|*| alpha is set aside in %mm0 and put back in place of the multiplied
|*| alpha before the final add; packuswb saturates on packing.
\*/
PR_(imlib_mmx_blend_rgba_to_rgba_cmod):
ENTER
pxor %mm4, %mm4
LOAD_IMMQ(m0X000000, %mm5)
LOAD_IMMQ(m00XXXXXX, %mm6)
LOAD_IMMQ(c1, %mm7)
CLEANUP_IMMQ_LOADS(3)
LOOP_START
1:
/*\ Load source and destination \*/
LOAD1_CMOD
movd (%edi, %ecx, 4), %mm2
/*\ Get alpha from source and target, a = src + (255 - dest) \*/
movq %mm2, %mm3
pxor %mm6, %mm3
paddusb %mm1, %mm3
/*\ Unpack/copy to eight bytes \*/
punpcklbw %mm3, %mm3
punpckhwd %mm3, %mm3
punpckhdq %mm3, %mm3
psrlw $1, %mm3
/*\ Unpack source and destination, bytes to words \*/
punpcklbw %mm4, %mm1
punpcklbw %mm4, %mm2
/*\ Separate alpha channel \*/
movq %mm1, %mm0
pand %mm5, %mm0
/*\ d = d + (a * ((s - d) + 0.5)) \*/
psubw %mm2, %mm1
psllw $1, %mm1
paddw %mm7, %mm1 /*\ Roundoff \*/
pmulhw %mm3, %mm1
/*\ Replace alpha channel with separated out version in mm0 and add \*/
pand %mm6, %mm1
por %mm0, %mm1
paddw %mm1, %mm2
/*\ Pack into lower 4 bytes and save \*/
packuswb %mm4, %mm2
movd %mm2, (%edi, %ecx, 4)
incl %ecx
js 1b
LOOP_END
LEAVE
SIZE(imlib_mmx_blend_rgba_to_rgba_cmod)
/*\ blend_rgb_to_rgb_cmod: blend a colour-modified RGB source onto the
|*| destination. The source alpha is not read; the blend factor is the
|*| constant amap(0xff), prepared once before the row loop. Its topmost
|*| word is shifted out, so the factor for the alpha word is 0.
\*/
PR_(imlib_mmx_blend_rgb_to_rgb_cmod):
ENTER
pxor %mm4, %mm4
LOAD_IMMQ(c1, %mm5)
CLEANUP_IMMQ_LOADS(1)
/*\ Load alpha beforehand, as it's always amap(0xff) \*/
movzbl amap_ff, %eax
movd %eax, %mm3
/*\ Replicate the low byte into all eight bytes, then scale as words \*/
punpcklbw %mm3, %mm3
punpcklwd %mm3, %mm3
punpckldq %mm3, %mm3
psrlw $1, %mm3
psrlq $16, %mm3
LOOP_START
1:
/*\ Load source and destination \*/
LOAD1_CMOD_A00
movd (%edi, %ecx, 4), %mm2
/*\ Unpack source and destination, bytes to words \*/
punpcklbw %mm4, %mm1
punpcklbw %mm4, %mm2
/*\ d = d + (a * ((s - d) + 0.5)) \*/
psubw %mm2, %mm1
psllw $1, %mm1
paddw %mm5, %mm1 /*\ Roundoff \*/
pmulhw %mm3, %mm1
paddw %mm1, %mm2
/*\ Pack into lower 4 bytes and save \*/
packuswb %mm4, %mm2
movd %mm2, (%edi, %ecx, 4)
incl %ecx
js 1b
LOOP_END
LEAVE
SIZE(imlib_mmx_blend_rgb_to_rgb_cmod)
/*\ blend_rgb_to_rgba_cmod: same instruction sequence as
|*| blend_rgba_to_rgba_cmod, but the source is loaded with
|*| LOAD1_CMOD_AFF, i.e. its alpha is amap(0xff) instead of being
|*| looked up from the source pixel.
\*/
PR_(imlib_mmx_blend_rgb_to_rgba_cmod):
ENTER
pxor %mm4, %mm4
LOAD_IMMQ(m0X000000, %mm5)
LOAD_IMMQ(m00XXXXXX, %mm6)
LOAD_IMMQ(c1, %mm7)
CLEANUP_IMMQ_LOADS(3)
LOOP_START
1:
/*\ Load source and destination \*/
LOAD1_CMOD_AFF
movd (%edi, %ecx, 4), %mm2
/*\ Get alpha from source and target, a = src + (255 - dest) \*/
movq %mm2, %mm3
pxor %mm6, %mm3
paddusb %mm1, %mm3
/*\ Unpack/copy to eight bytes \*/
punpcklbw %mm3, %mm3
punpckhwd %mm3, %mm3
punpckhdq %mm3, %mm3
psrlw $1, %mm3
/*\ Unpack source and destination, bytes to words \*/
punpcklbw %mm4, %mm1
punpcklbw %mm4, %mm2
/*\ Separate alpha channel \*/
movq %mm1, %mm0
pand %mm5, %mm0
/*\ d = d + (a * ((s - d) + 0.5)) \*/
psubw %mm2, %mm1
psllw $1, %mm1
paddw %mm7, %mm1 /*\ Roundoff \*/
pmulhw %mm3, %mm1
/*\ Replace alpha channel with separated out version in mm0 and add \*/
pand %mm6, %mm1
por %mm0, %mm1
paddw %mm1, %mm2
/*\ Pack into lower 4 bytes and save \*/
packuswb %mm4, %mm2
movd %mm2, (%edi, %ecx, 4)
incl %ecx
js 1b
LOOP_END
LEAVE
SIZE(imlib_mmx_blend_rgb_to_rgba_cmod)
/*\ copy_rgba_to_rgb_cmod: for every pixel, look the three colour bytes
|*| of the source up in their tables and store them into the
|*| destination. Byte 3 of the destination is never written.
|*| No MMX register is touched; %eax is the only scratch register.
\*/
PR_(imlib_mmx_copy_rgba_to_rgb_cmod):
	ENTER
	LOOP_START
1:
	/*\ Red: byte 2 \*/
	movzbl 2(%esi, %ecx, 4), %eax
	movzbl rmap(%eax), %eax
	movb %al, 2(%edi, %ecx, 4)
	/*\ Green: byte 1 \*/
	movzbl 1(%esi, %ecx, 4), %eax
	movzbl gmap(%eax), %eax
	movb %al, 1(%edi, %ecx, 4)
	/*\ Blue: byte 0 \*/
	movzbl (%esi, %ecx, 4), %eax
	movzbl bmap(%eax), %eax
	movb %al, (%edi, %ecx, 4)
	/*\ Next pixel; %ecx reaches 0 at the end of the row \*/
	incl %ecx
	js 1b
	LOOP_END
	LEAVE
SIZE(imlib_mmx_copy_rgba_to_rgb_cmod)
/*\ copy_rgba_to_rgba_cmod: for every pixel, look all four source bytes
|*| up in their tables (alpha, red, green, blue) and store the results
|*| into the matching bytes of the destination.
|*| No MMX register is touched; %eax is the only scratch register.
\*/
PR_(imlib_mmx_copy_rgba_to_rgba_cmod):
	ENTER
	LOOP_START
1:
	/*\ Alpha: byte 3 \*/
	movzbl 3(%esi, %ecx, 4), %eax
	movzbl amap(%eax), %eax
	movb %al, 3(%edi, %ecx, 4)
	/*\ Red: byte 2 \*/
	movzbl 2(%esi, %ecx, 4), %eax
	movzbl rmap(%eax), %eax
	movb %al, 2(%edi, %ecx, 4)
	/*\ Green: byte 1 \*/
	movzbl 1(%esi, %ecx, 4), %eax
	movzbl gmap(%eax), %eax
	movb %al, 1(%edi, %ecx, 4)
	/*\ Blue: byte 0 \*/
	movzbl (%esi, %ecx, 4), %eax
	movzbl bmap(%eax), %eax
	movb %al, (%edi, %ecx, 4)
	/*\ Next pixel; %ecx reaches 0 at the end of the row \*/
	incl %ecx
	js 1b
	LOOP_END
	LEAVE
SIZE(imlib_mmx_copy_rgba_to_rgba_cmod)
/*\ copy_rgb_to_rgba_cmod: for every pixel, look the three colour bytes
|*| of the source up in their tables and store them into the
|*| destination. The source alpha byte is not read; the destination
|*| alpha is set to amap(0xff), the same source alpha that every other
|*| rgb -> rgba routine in this file obtains via LOAD*_CMOD_AFF.
|*| No MMX register is touched; %eax is the only scratch register.
\*/
PR_(imlib_mmx_copy_rgb_to_rgba_cmod):
	ENTER
	LOOP_START
1:
	/*\ Blue: byte 0 \*/
	movzbl (%esi, %ecx, 4), %eax
	movzbl bmap(%eax), %eax
	movb %al, (%edi, %ecx, 4)
	/*\ Green: byte 1 \*/
	movzbl 1(%esi, %ecx, 4), %eax
	movzbl gmap(%eax), %eax
	movb %al, 1(%edi, %ecx, 4)
	/*\ Red: byte 2 \*/
	movzbl 2(%esi, %ecx, 4), %eax
	movzbl rmap(%eax), %eax
	movb %al, 2(%edi, %ecx, 4)
	/*\ Alpha: colour-modified 0xff rather than a literal 0xff \*/
	movzbl amap_ff, %eax
	movb %al, 3(%edi, %ecx, 4)
	/*\ Next pixel; %ecx reaches 0 at the end of the row \*/
	incl %ecx
	js 1b
	LOOP_END
	LEAVE
SIZE(imlib_mmx_copy_rgb_to_rgba_cmod)
/*\ add_blend_rgba_to_rgb_cmod: add the colour-modified source, scaled
|*| by its own (modified) alpha, to the destination; one pixel per pass.
|*| The factor for the alpha word is shifted out, so nothing is added
|*| to the destination alpha.
\*/
PR_(imlib_mmx_add_blend_rgba_to_rgb_cmod):
ENTER
pxor %mm4, %mm4
LOOP_START
1:
/*\ Load source and destination \*/
LOAD1_CMOD
movd (%edi, %ecx, 4), %mm2
/*\ Get alpha from source and unpack/copy to eight bytes \*/
movq %mm1, %mm3
punpcklbw %mm3, %mm3
punpckhwd %mm3, %mm3
punpckhdq %mm3, %mm3
psrlw $1, %mm3
/*\ Zero the factor for the alpha word \*/
psrlq $16, %mm3
/*\ Unpack source and destination, bytes to words \*/
punpcklbw %mm4, %mm1
punpcklbw %mm4, %mm2
/*\ d = d + (a * s) \*/
psllw $1, %mm1
pmulhw %mm3, %mm1
paddw %mm1, %mm2
/*\ Pack into lower 4 bytes and save \*/
packuswb %mm4, %mm2
movd %mm2, (%edi, %ecx, 4)
incl %ecx
js 1b
LOOP_END
LEAVE
SIZE(imlib_mmx_add_blend_rgba_to_rgb_cmod)
/*\ add_blend_rgba_to_rgba_cmod: additive blend of a colour-modified
|*| RGBA source onto an RGBA destination, one pixel per pass.
|*| The colour factor is built from source and destination alpha; the
|*| factor for the alpha word is forced by or-ing in the mVX000000
|*| constant (see the comment below).
\*/
PR_(imlib_mmx_add_blend_rgba_to_rgba_cmod):
ENTER
pxor %mm4, %mm4
LOAD_IMMQ(mVX000000, %mm5)
LOAD_IMMQ(m00XXXXXX, %mm6)
CLEANUP_IMMQ_LOADS(2)
LOOP_START
1:
/*\ Load source and destination \*/
LOAD1_CMOD
movd (%edi, %ecx, 4), %mm2
/*\ Get alpha from source and target and unpack/copy to eight bytes \*/
movq %mm2, %mm3
pxor %mm6, %mm3
paddusb %mm1, %mm3
punpcklbw %mm3, %mm3
punpckhwd %mm3, %mm3
punpckhdq %mm3, %mm3
psrlw $1, %mm3
/*\ Make the alpha value that gets multiplied to the
|*| alpha channels 0x7fff, so the resulting alpha value is
|*| the sum of the source and destination alpha values.
\*/
por %mm5, %mm3
/*\ Unpack source and destination, bytes to words \*/
punpcklbw %mm4, %mm1
punpcklbw %mm4, %mm2
/*\ d = d + (a * s) \*/
psllw $1, %mm1
pmulhw %mm3, %mm1
paddw %mm1, %mm2
/*\ Pack into lower 4 bytes and save \*/
packuswb %mm4, %mm2
movd %mm2, (%edi, %ecx, 4)
incl %ecx
js 1b
LOOP_END
LEAVE
SIZE(imlib_mmx_add_blend_rgba_to_rgba_cmod)
/*\ add_blend_rgb_to_rgb_cmod: add the colour-modified RGB source,
|*| scaled by the constant amap(0xff), to the destination. The factor
|*| is prepared once before the row loop; its topmost word is shifted
|*| out, so nothing is added to the destination alpha.
\*/
PR_(imlib_mmx_add_blend_rgb_to_rgb_cmod):
ENTER
pxor %mm4, %mm4
/*\ Load alpha beforehand, as it's always amap(0xff) \*/
movzbl amap_ff, %eax
movd %eax, %mm3
/*\ Replicate the low byte into all eight bytes, then scale as words \*/
punpcklbw %mm3, %mm3
punpcklwd %mm3, %mm3
punpckldq %mm3, %mm3
psrlw $1, %mm3
psrlq $16, %mm3
LOOP_START
1:
/*\ Load source and destination \*/
LOAD1_CMOD_A00
movd (%edi, %ecx, 4), %mm2
/*\ Unpack source and destination, bytes to words \*/
punpcklbw %mm4, %mm1
punpcklbw %mm4, %mm2
/*\ d = d + (a * s) \*/
psllw $1, %mm1
pmulhw %mm3, %mm1
paddw %mm1, %mm2
/*\ Pack into lower 4 bytes and save \*/
packuswb %mm4, %mm2
movd %mm2, (%edi, %ecx, 4)
incl %ecx
js 1b
LOOP_END
LEAVE
SIZE(imlib_mmx_add_blend_rgb_to_rgb_cmod)
/*\ add_blend_rgb_to_rgba_cmod: same instruction sequence as
|*| add_blend_rgba_to_rgba_cmod, but the source is loaded with
|*| LOAD1_CMOD_AFF, i.e. its alpha is amap(0xff) instead of being
|*| looked up from the source pixel.
\*/
PR_(imlib_mmx_add_blend_rgb_to_rgba_cmod):
ENTER
pxor %mm4, %mm4
LOAD_IMMQ(mVX000000, %mm5)
LOAD_IMMQ(m00XXXXXX, %mm6)
CLEANUP_IMMQ_LOADS(2)
LOOP_START
1:
/*\ Load source and destination \*/
LOAD1_CMOD_AFF
movd (%edi, %ecx, 4), %mm2
/*\ Get alpha from source and target and unpack/copy to eight bytes \*/
movq %mm2, %mm3
pxor %mm6, %mm3
paddusb %mm1, %mm3
punpcklbw %mm3, %mm3
punpckhwd %mm3, %mm3
punpckhdq %mm3, %mm3
psrlw $1, %mm3
/*\ Make the alpha value that gets multiplied to the
|*| alpha channels 0x7fff, so the resulting alpha value is
|*| the sum of the source and destination alpha values.
\*/
por %mm5, %mm3
/*\ Unpack source and destination, bytes to words \*/
punpcklbw %mm4, %mm1
punpcklbw %mm4, %mm2
/*\ d = d + (a * s) \*/
psllw $1, %mm1
pmulhw %mm3, %mm1
paddw %mm1, %mm2
/*\ Pack into lower 4 bytes and save \*/
packuswb %mm4, %mm2
movd %mm2, (%edi, %ecx, 4)
incl %ecx
js 1b
LOOP_END
LEAVE
SIZE(imlib_mmx_add_blend_rgb_to_rgba_cmod)
/*\ add_copy_rgba_to_rgb_cmod: saturating byte-wise add of the
|*| colour-modified source to the destination, two pixels per pass.
|*| The source is and-ed with the m0XXX0XXX constant first (clearing its
|*| alpha, per the comment below).
|*| %esi/%edi are moved back one pixel and %ecx starts each row at
|*| 1 - w: the pair loop ends with %ecx == 0 when one pixel of the row
|*| is left (handled at label 2) and with %ecx == 1 when none is left
|*| (label 3).
\*/
PR_(imlib_mmx_add_copy_rgba_to_rgb_cmod):
ENTER
LOAD_IMMQ(m0XXX0XXX, %mm5)
CLEANUP_IMMQ_LOADS(1)
subl $4, %esi
subl $4, %edi
LOOP_START
incl %ecx
jz 2f
1:
/*\ Load source and destination \*/
LOAD2_CMOD
movq (%edi, %ecx, 4), %mm2
/*\ Clear alpha channel of source \*/
pand %mm5, %mm1
/*\ d = d + s, unsigned saturation, and save \*/
paddusb %mm1, %mm2
movq %mm2, (%edi, %ecx, 4)
addl $2, %ecx
js 1b
jnz 3f
2:
/*\ Odd pixel at the end of the row (%ecx is 0 here) \*/
LOAD1_CMOD
movd (%edi), %mm2
pand %mm5, %mm1
paddusb %mm1, %mm2
movd %mm2, (%edi)
3:
LOOP_END
LEAVE
SIZE(imlib_mmx_add_copy_rgba_to_rgb_cmod)
/*\ add_copy_rgba_to_rgba_cmod: saturating byte-wise add of the
|*| colour-modified source (all four channels) to the destination,
|*| two pixels per pass.
|*| %esi/%edi are moved back one pixel and %ecx starts each row at
|*| 1 - w: the pair loop ends with %ecx == 0 when one pixel of the row
|*| is left (label 2) and with %ecx == 1 when none is left (label 3).
\*/
PR_(imlib_mmx_add_copy_rgba_to_rgba_cmod):
ENTER
subl $4, %esi
subl $4, %edi
LOOP_START
incl %ecx
jz 2f
1:
/*\ Load source and destination \*/
LOAD2_CMOD
movq (%edi, %ecx, 4), %mm2
/*\ d = d + s, unsigned saturation, and save \*/
paddusb %mm1, %mm2
movq %mm2, (%edi, %ecx, 4)
addl $2, %ecx
js 1b
jnz 3f
2:
/*\ Odd pixel at the end of the row (%ecx is 0 here) \*/
LOAD1_CMOD
movd (%edi), %mm2
paddusb %mm1, %mm2
movd %mm2, (%edi)
3:
LOOP_END
LEAVE
SIZE(imlib_mmx_add_copy_rgba_to_rgba_cmod)
/*\ add_copy_rgb_to_rgba_cmod: as add_copy_rgba_to_rgba_cmod, but the
|*| source is loaded with LOAD2_CMOD_AFF / LOAD1_CMOD_AFF, so the alpha
|*| that gets added is amap(0xff) rather than a value looked up from
|*| the source pixel. Two pixels per pass, odd pixel at label 2.
\*/
PR_(imlib_mmx_add_copy_rgb_to_rgba_cmod):
ENTER
subl $4, %esi
subl $4, %edi
LOOP_START
incl %ecx
jz 2f
1:
/*\ Load source and destination \*/
LOAD2_CMOD_AFF
movq (%edi, %ecx, 4), %mm2
/*\ d = d + s, unsigned saturation, and save \*/
paddusb %mm1, %mm2
movq %mm2, (%edi, %ecx, 4)
addl $2, %ecx
js 1b
jnz 3f
2:
/*\ Odd pixel at the end of the row (%ecx is 0 here) \*/
LOAD1_CMOD_AFF
movd (%edi), %mm2
paddusb %mm1, %mm2
movd %mm2, (%edi)
3:
LOOP_END
LEAVE
SIZE(imlib_mmx_add_copy_rgb_to_rgba_cmod)
/*\ subtract_blend_rgba_to_rgb_cmod: subtract the colour-modified
|*| source, scaled by its own (modified) alpha, from the destination;
|*| one pixel per pass. The factor for the alpha word is shifted out,
|*| so nothing is subtracted from the destination alpha.
\*/
PR_(imlib_mmx_subtract_blend_rgba_to_rgb_cmod):
ENTER
pxor %mm4, %mm4
LOOP_START
1:
/*\ Load source and destination \*/
LOAD1_CMOD
movd (%edi, %ecx, 4), %mm2
/*\ Get alpha from source and unpack/copy to eight bytes \*/
movq %mm1, %mm3
punpcklbw %mm3, %mm3
punpckhwd %mm3, %mm3
punpckhdq %mm3, %mm3
psrlw $1, %mm3
/*\ Zero the factor for the alpha word \*/
psrlq $16, %mm3
/*\ Unpack source and destination, bytes to words \*/
punpcklbw %mm4, %mm1
punpcklbw %mm4, %mm2
/*\ d = d - (a * s) \*/
psllw $1, %mm1
pmulhw %mm3, %mm1
psubw %mm1, %mm2
/*\ Pack into lower 4 bytes and save \*/
packuswb %mm4, %mm2
movd %mm2, (%edi, %ecx, 4)
incl %ecx
js 1b
LOOP_END
LEAVE
SIZE(imlib_mmx_subtract_blend_rgba_to_rgb_cmod)
/*\ subtract_blend_rgba_to_rgba_cmod: subtractive blend of a
|*| colour-modified RGBA source onto an RGBA destination, one pixel per
|*| pass. The colour factor is built from source and destination alpha;
|*| the factor for the alpha word is replaced by the mV0000000 constant
|*| so that the subtraction ends up adding the alphas (see below).
\*/
PR_(imlib_mmx_subtract_blend_rgba_to_rgba_cmod):
ENTER
pxor %mm4, %mm4
LOAD_IMMQ(mV0000000, %mm5)
LOAD_IMMQ(m00XXXXXX, %mm6)
CLEANUP_IMMQ_LOADS(2)
LOOP_START
1:
/*\ Load source and destination \*/
LOAD1_CMOD
movd (%edi, %ecx, 4), %mm2
/*\ Get alpha from source and target and unpack/copy to eight bytes \*/
movq %mm2, %mm3
pxor %mm6, %mm3
paddusb %mm1, %mm3
punpcklbw %mm3, %mm3
punpckhwd %mm3, %mm3
punpckhdq %mm3, %mm3
psrlw $1, %mm3
/*\ Make alpha value that gets multiplied with alpha channel
|*| 0x8000, (-1.0), so that the alpha result is s + d
\*/
psrlq $16, %mm3
por %mm5, %mm3
/*\ Unpack source and destination, bytes to words \*/
punpcklbw %mm4, %mm1
punpcklbw %mm4, %mm2
/*\ d = d - (a * s) \*/
psllw $1, %mm1
pmulhw %mm3, %mm1
psubw %mm1, %mm2
/*\ Pack into lower 4 bytes and save \*/
packuswb %mm4, %mm2
movd %mm2, (%edi, %ecx, 4)
incl %ecx
js 1b
LOOP_END
LEAVE
SIZE(imlib_mmx_subtract_blend_rgba_to_rgba_cmod)
/*\ subtract_blend_rgb_to_rgb_cmod: subtract the colour-modified RGB
|*| source, scaled by the constant amap(0xff), from the destination.
|*| The factor is prepared once before the row loop; its topmost word
|*| is shifted out, so the destination alpha is left as it was.
\*/
PR_(imlib_mmx_subtract_blend_rgb_to_rgb_cmod):
ENTER
pxor %mm4, %mm4
/*\ Load alpha beforehand, as it's always amap(0xff) \*/
movzbl amap_ff, %eax
movd %eax, %mm3
/*\ Replicate the low byte into all eight bytes, then scale as words \*/
punpcklbw %mm3, %mm3
punpcklwd %mm3, %mm3
punpckldq %mm3, %mm3
psrlw $1, %mm3
psrlq $16, %mm3
LOOP_START
1:
/*\ Load source and destination \*/
LOAD1_CMOD_A00
movd (%edi, %ecx, 4), %mm2
/*\ Unpack source and destination, bytes to words \*/
punpcklbw %mm4, %mm1
punpcklbw %mm4, %mm2
/*\ d = d - (a * s) \*/
psllw $1, %mm1
pmulhw %mm3, %mm1
psubw %mm1, %mm2
/*\ Pack into lower 4 bytes and save \*/
packuswb %mm4, %mm2
movd %mm2, (%edi, %ecx, 4)
incl %ecx
js 1b
LOOP_END
LEAVE
SIZE(imlib_mmx_subtract_blend_rgb_to_rgb_cmod)
/*\ subtract_blend_rgb_to_rgba_cmod: same instruction sequence as
|*| subtract_blend_rgba_to_rgba_cmod, but the source is loaded with
|*| LOAD1_CMOD_AFF, i.e. its alpha is amap(0xff) instead of being
|*| looked up from the source pixel.
\*/
PR_(imlib_mmx_subtract_blend_rgb_to_rgba_cmod):
ENTER
pxor %mm4, %mm4
LOAD_IMMQ(mV0000000, %mm5)
LOAD_IMMQ(m00XXXXXX, %mm6)
CLEANUP_IMMQ_LOADS(2)
LOOP_START
1:
/*\ Load source and destination \*/
LOAD1_CMOD_AFF
movd (%edi, %ecx, 4), %mm2
/*\ Get alpha from source and target and unpack/copy to eight bytes \*/
movq %mm2, %mm3
pxor %mm6, %mm3
paddusb %mm1, %mm3
punpcklbw %mm3, %mm3
punpckhwd %mm3, %mm3
punpckhdq %mm3, %mm3
psrlw $1, %mm3
/*\ Make alpha value that gets multiplied with alpha channel
|*| 0x8000, (-1.0), so that the alpha result is s + d
\*/
psrlq $16, %mm3
por %mm5, %mm3
/*\ Unpack source and destination, bytes to words \*/
punpcklbw %mm4, %mm1
punpcklbw %mm4, %mm2
/*\ d = d - (a * s) \*/
psllw $1, %mm1
pmulhw %mm3, %mm1
psubw %mm1, %mm2
/*\ Pack into lower 4 bytes and save \*/
packuswb %mm4, %mm2
movd %mm2, (%edi, %ecx, 4)
incl %ecx
js 1b
LOOP_END
LEAVE
SIZE(imlib_mmx_subtract_blend_rgb_to_rgba_cmod)
/*\ subtract_copy_rgba_to_rgb_cmod: saturating byte-wise subtract of the
|*| colour-modified source from the destination, two pixels per pass.
|*| The source is and-ed with the m0XXX0XXX constant first (clearing its
|*| alpha, per the comment below).
|*| %esi/%edi are moved back one pixel and %ecx starts each row at
|*| 1 - w: the pair loop ends with %ecx == 0 when one pixel of the row
|*| is left (label 2) and with %ecx == 1 when none is left (label 3).
\*/
PR_(imlib_mmx_subtract_copy_rgba_to_rgb_cmod):
ENTER
LOAD_IMMQ(m0XXX0XXX, %mm5)
CLEANUP_IMMQ_LOADS(1)
subl $4, %esi
subl $4, %edi
LOOP_START
incl %ecx
jz 2f
1:
/*\ Load source and destination \*/
LOAD2_CMOD
movq (%edi, %ecx, 4), %mm2
/*\ Clear alpha channel of source \*/
pand %mm5, %mm1
/*\ d = d - s, unsigned saturation, and save \*/
psubusb %mm1, %mm2
movq %mm2, (%edi, %ecx, 4)
addl $2, %ecx
js 1b
jnz 3f
2:
/*\ Odd pixel at the end of the row (%ecx is 0 here) \*/
LOAD1_CMOD
movd (%edi), %mm2
pand %mm5, %mm1
psubusb %mm1, %mm2
movd %mm2, (%edi)
3:
LOOP_END
LEAVE
SIZE(imlib_mmx_subtract_copy_rgba_to_rgb_cmod)
/*\ subtract_copy_rgba_to_rgba_cmod: saturating byte-wise subtract of
|*| the colour-modified source from the destination, two pixels per
|*| pass. The destination is xor-ed with the mX000X000 constant before
|*| and after the subtraction (negating its alphas, per the comments
|*| below).
|*| Two pixels per pass with the odd pixel handled at label 2, using the
|*| same %esi/%edi/%ecx offsetting as the other *_copy routines.
\*/
PR_(imlib_mmx_subtract_copy_rgba_to_rgba_cmod):
ENTER
LOAD_IMMQ(mX000X000, %mm5)
CLEANUP_IMMQ_LOADS(1)
subl $4, %esi
subl $4, %edi
LOOP_START
incl %ecx
jz 2f
1:
/*\ Load source and destination \*/
LOAD2_CMOD
movq (%edi, %ecx, 4), %mm2
/*\ Negate destination alphas \*/
pxor %mm5, %mm2
/*\ d = d - s, unsigned saturation, and save \*/
psubusb %mm1, %mm2
/*\ Negate result alphas \*/
pxor %mm5, %mm2
movq %mm2, (%edi, %ecx, 4)
addl $2, %ecx
js 1b
jnz 3f
2:
/*\ Odd pixel at the end of the row (%ecx is 0 here) \*/
LOAD1_CMOD
movd (%edi), %mm2
pxor %mm5, %mm2
psubusb %mm1, %mm2
pxor %mm5, %mm2
movd %mm2, (%edi)
3:
LOOP_END
LEAVE
SIZE(imlib_mmx_subtract_copy_rgba_to_rgba_cmod)
/*\ subtract_copy_rgb_to_rgba_cmod: as subtract_copy_rgba_to_rgba_cmod,
|*| but the source is loaded with LOAD2_CMOD_AFF / LOAD1_CMOD_AFF, so
|*| its alpha is amap(0xff) rather than a value looked up from the
|*| source pixel. Two pixels per pass, odd pixel at label 2.
\*/
PR_(imlib_mmx_subtract_copy_rgb_to_rgba_cmod):
ENTER
LOAD_IMMQ(mX000X000, %mm5)
CLEANUP_IMMQ_LOADS(1)
subl $4, %esi
subl $4, %edi
LOOP_START
incl %ecx
jz 2f
1:
/*\ Load source and destination \*/
LOAD2_CMOD_AFF
movq (%edi, %ecx, 4), %mm2
/*\ Negate destination alphas \*/
pxor %mm5, %mm2
/*\ d = d - s, unsigned saturation, and save \*/
psubusb %mm1, %mm2
/*\ Negate result alphas \*/
pxor %mm5, %mm2
movq %mm2, (%edi, %ecx, 4)
addl $2, %ecx
js 1b
jnz 3f
2:
/*\ Odd pixel at the end of the row (%ecx is 0 here) \*/
LOAD1_CMOD_AFF
movd (%edi), %mm2
pxor %mm5, %mm2
psubusb %mm1, %mm2
pxor %mm5, %mm2
movd %mm2, (%edi)
3:
LOOP_END
LEAVE
SIZE(imlib_mmx_subtract_copy_rgb_to_rgba_cmod)
/*\ reshade_blend_rgba_to_rgb_cmod: reshade the destination with the
|*| colour-modified source, scaled by the source's (modified) alpha;
|*| one pixel per pass. The m000V0V0V constant is subtracted from the
|*| unpacked source before scaling (the "s - 127" of the formula below).
|*| The factor for the alpha word is shifted out.
\*/
PR_(imlib_mmx_reshade_blend_rgba_to_rgb_cmod):
ENTER
pxor %mm4, %mm4
LOAD_IMMQ(m000V0V0V, %mm6)
CLEANUP_IMMQ_LOADS(1)
LOOP_START
1:
/*\ Load source and destination \*/
LOAD1_CMOD
movd (%edi, %ecx, 4), %mm2
/*\ Get alpha from source and unpack/copy to eight bytes \*/
movq %mm1, %mm3
punpcklbw %mm3, %mm3
punpckhwd %mm3, %mm3
punpckhdq %mm3, %mm3
psrlw $1, %mm3
/*\ Zero the factor for the alpha word \*/
psrlq $16, %mm3
/*\ Unpack source and destination, bytes to words \*/
punpcklbw %mm4, %mm1
punpcklbw %mm4, %mm2
/*\ d = d + (2 * a * (s - 127)) \*/
psubw %mm6, %mm1
psllw $2, %mm1
pmulhw %mm3, %mm1
paddw %mm1, %mm2
/*\ Pack into lower 4 bytes and save \*/
packuswb %mm4, %mm2
movd %mm2, (%edi, %ecx, 4)
incl %ecx
js 1b
LOOP_END
LEAVE
SIZE(imlib_mmx_reshade_blend_rgba_to_rgb_cmod)
/*\ reshade_blend_rgba_to_rgba_cmod: reshade an RGBA destination with a
|*| colour-modified RGBA source, one pixel per pass. The colour factor
|*| is built from source and destination alpha; the factor for the
|*| alpha word is replaced by the mI0000000 constant (see below).
\*/
PR_(imlib_mmx_reshade_blend_rgba_to_rgba_cmod):
ENTER
pxor %mm4, %mm4
LOAD_IMMQ(mI0000000, %mm5)
LOAD_IMMQ(m000V0V0V, %mm6)
LOAD_IMMQ(m00XXXXXX, %mm7)
CLEANUP_IMMQ_LOADS(3)
LOOP_START
1:
/*\ Load source and destination \*/
LOAD1_CMOD
movd (%edi, %ecx, 4), %mm2
/*\ Get alpha from source and target and unpack/copy to eight bytes \*/
movq %mm2, %mm3
pxor %mm7, %mm3
paddusb %mm1, %mm3
punpcklbw %mm3, %mm3
punpckhwd %mm3, %mm3
punpckhdq %mm3, %mm3
psrlw $1, %mm3
/*\ Make the alpha value that gets multiplied to the
|*| alpha channels 0x4000 (0.5), so the resulting alpha value is
|*| the sum of the source and destination alpha values.
\*/
psrlq $16, %mm3
por %mm5, %mm3
/*\ Unpack source and destination, bytes to words \*/
punpcklbw %mm4, %mm1
punpcklbw %mm4, %mm2
/*\ d = d + (2 * a * (s - 127)), (alpha channel: d = d + (2 * 0.5 * (s - 0)) ) \*/
psubw %mm6, %mm1
psllw $2, %mm1
pmulhw %mm3, %mm1
paddw %mm1, %mm2
/*\ Pack into lower 4 bytes and save \*/
packuswb %mm4, %mm2
movd %mm2, (%edi, %ecx, 4)
incl %ecx
js 1b
LOOP_END
LEAVE
SIZE(imlib_mmx_reshade_blend_rgba_to_rgba_cmod)
/*\ reshade_blend_rgb_to_rgb_cmod: reshade the destination with the
|*| colour-modified RGB source, scaled by the constant amap(0xff).
|*| The factor is prepared once before the row loop; its topmost word
|*| is shifted out, so the factor for the alpha word is 0.
\*/
PR_(imlib_mmx_reshade_blend_rgb_to_rgb_cmod):
ENTER
pxor %mm4, %mm4
LOAD_IMMQ(m000V0V0V, %mm6)
CLEANUP_IMMQ_LOADS(1)
/*\ Load alpha beforehand, as it's always amap(0xff) \*/
movzbl amap_ff, %eax
movd %eax, %mm3
/*\ Replicate the low byte into all eight bytes, then scale as words \*/
punpcklbw %mm3, %mm3
punpcklwd %mm3, %mm3
punpckldq %mm3, %mm3
psrlw $1, %mm3
psrlq $16, %mm3
LOOP_START
1:
/*\ Load source and destination \*/
LOAD1_CMOD_A00
movd (%edi, %ecx, 4), %mm2
/*\ Unpack source and destination, bytes to words \*/
punpcklbw %mm4, %mm1
punpcklbw %mm4, %mm2
/*\ d = d + (2 * a * (s - 127)) \*/
psubw %mm6, %mm1
psllw $2, %mm1
pmulhw %mm3, %mm1
paddw %mm1, %mm2
/*\ Pack into lower 4 bytes and save \*/
packuswb %mm4, %mm2
movd %mm2, (%edi, %ecx, 4)
incl %ecx
js 1b
LOOP_END
LEAVE
SIZE(imlib_mmx_reshade_blend_rgb_to_rgb_cmod)
/*\ reshade_blend_rgb_to_rgba_cmod: same instruction sequence as
|*| reshade_blend_rgba_to_rgba_cmod, but the source is loaded with
|*| LOAD1_CMOD_AFF, i.e. its alpha is amap(0xff) instead of being
|*| looked up from the source pixel.
\*/
PR_(imlib_mmx_reshade_blend_rgb_to_rgba_cmod):
ENTER
pxor %mm4, %mm4
LOAD_IMMQ(mI0000000, %mm5)
LOAD_IMMQ(m000V0V0V, %mm6)
LOAD_IMMQ(m00XXXXXX, %mm7)
CLEANUP_IMMQ_LOADS(3)
LOOP_START
1:
/*\ Load source and destination \*/
LOAD1_CMOD_AFF
movd (%edi, %ecx, 4), %mm2
/*\ Get alpha from source and target and unpack/copy to eight bytes \*/
movq %mm2, %mm3
pxor %mm7, %mm3
paddusb %mm1, %mm3
punpcklbw %mm3, %mm3
punpckhwd %mm3, %mm3
punpckhdq %mm3, %mm3
psrlw $1, %mm3
/*\ Make the alpha value that gets multiplied to the
|*| alpha channels 0x4000 (0.5), so the resulting alpha value is
|*| the sum of the source and destination alpha values.
\*/
psrlq $16, %mm3
por %mm5, %mm3
/*\ Unpack source and destination, bytes to words \*/
punpcklbw %mm4, %mm1
punpcklbw %mm4, %mm2
/*\ d = d + (2 * a * (s - 127)), (alpha channel: d = d + (2 * 0.5 * (s - 0)) ) \*/
psubw %mm6, %mm1
psllw $2, %mm1
pmulhw %mm3, %mm1
paddw %mm1, %mm2
/*\ Pack into lower 4 bytes and save \*/
packuswb %mm4, %mm2
movd %mm2, (%edi, %ecx, 4)
incl %ecx
js 1b
LOOP_END
LEAVE
SIZE(imlib_mmx_reshade_blend_rgb_to_rgba_cmod)
/*\ reshade_copy_rgba_to_rgb_cmod: reshade the destination with the
|*| colour-modified source using saturating byte arithmetic, two pixels
|*| per pass. Both partial terms are and-ed with the m0XXX0XXX constant
|*| (clearing their alpha, per the comment below) before they are
|*| applied to the destination.
|*| %esi/%edi are moved back one pixel and %ecx starts each row at
|*| 1 - w: the pair loop ends with %ecx == 0 when one pixel of the row
|*| is left (label 2) and with %ecx == 1 when none is left (label 3).
|*| NOTE(review): %mm4 is cleared here but not referenced by any
|*| instruction visible in this routine.
\*/
PR_(imlib_mmx_reshade_copy_rgba_to_rgb_cmod):
ENTER
pxor %mm4, %mm4
LOAD_IMMQ(m0XXX0XXX, %mm5)
LOAD_IMMQ(m0VVV0VVV, %mm6)
CLEANUP_IMMQ_LOADS(2)
subl $4, %esi
subl $4, %edi
LOOP_START
incl %ecx
jz 2f
1:
/*\ Load source and destination \*/
LOAD2_CMOD
movq (%edi, %ecx, 4), %mm2
/*\ To take advantage of saturation and be able to do 8 bytes
|*| at a time, we divide reshading into two separate steps:
|*| adding values above 128, and subtracting values below 128
|*| These values go into %mm1 and %mm3 respectively
|*| - %mm1 becomes (2 * (s - 127))
|*| - %mm3 becomes (2 * (255 - (127 + s))) = (2 * (128 - s))
\*/
movq %mm1, %mm3
psubusb %mm6, %mm1
paddusb %mm1, %mm1
paddusb %mm6, %mm3
pxor %mm5, %mm3
paddusb %mm3, %mm3
/*\ Clear alpha channel of s1 and s2 \*/
pand %mm5, %mm1
pand %mm5, %mm3
/*\ d = d + s1 - s2, unsigned saturation, and save \*/
paddusb %mm1, %mm2
psubusb %mm3, %mm2
movq %mm2, (%edi, %ecx, 4)
addl $2, %ecx
js 1b
jnz 3f
2:
/*\ Odd pixel at the end of the row: same steps on a single pixel \*/
LOAD1_CMOD
movd (%edi), %mm2
movq %mm1, %mm3
psubusb %mm6, %mm1
paddusb %mm1, %mm1
paddusb %mm6, %mm3
pxor %mm5, %mm3
paddusb %mm3, %mm3
pand %mm5, %mm1
pand %mm5, %mm3
paddusb %mm1, %mm2
psubusb %mm3, %mm2
movd %mm2, (%edi)
3:
LOOP_END
LEAVE
SIZE(imlib_mmx_reshade_copy_rgba_to_rgb_cmod)
/*\ reshade_copy_rgba_to_rgba_cmod: reshade the destination with the
|*| colour-modified source using saturating byte arithmetic, two pixels
|*| per pass. Unlike the rgba_to_rgb variant, the alpha of the first
|*| partial term (%mm1) is kept, so the source alpha is added to the
|*| destination alpha (see the comment below).
|*| Two pixels per pass with the odd pixel handled at label 2, using the
|*| same %esi/%edi/%ecx offsetting as the other *_copy routines.
|*| NOTE(review): %mm4 is cleared here but not referenced by any
|*| instruction visible in this routine.
\*/
PR_(imlib_mmx_reshade_copy_rgba_to_rgba_cmod):
ENTER
pxor %mm4, %mm4
LOAD_IMMQ(m0XXX0XXX, %mm5)
LOAD_IMMQ(m0VVV0VVV, %mm6)
CLEANUP_IMMQ_LOADS(2)
subl $4, %esi
subl $4, %edi
LOOP_START
incl %ecx
jz 2f
1:
/*\ Load source and destination \*/
LOAD2_CMOD
movq (%edi, %ecx, 4), %mm2
/*\ This time, the alpha channels have to be added.
|*| For that, the alpha channel of %mm1 should remain
|*| the same. This is done by subtracting 0 from the
|*| alpha channel, and then doing the *2 via a separate
|*| register, clearing its alpha channel first.
\*/
movq %mm1, %mm3
psubusb %mm6, %mm1
movq %mm1, %mm0
pand %mm5, %mm0
paddusb %mm0, %mm1
paddusb %mm6, %mm3
pxor %mm5, %mm3
paddusb %mm3, %mm3
/*\ Clear alpha channel of s2 \*/
pand %mm5, %mm3
/*\ d = d + s1 - s2, unsigned saturation, and save \*/
paddusb %mm1, %mm2
psubusb %mm3, %mm2
movq %mm2, (%edi, %ecx, 4)
addl $2, %ecx
js 1b
jnz 3f
2:
/*\ Odd pixel at the end of the row: same steps on a single pixel \*/
LOAD1_CMOD
movd (%edi), %mm2
movq %mm1, %mm3
psubusb %mm6, %mm1
movq %mm1, %mm0
pand %mm5, %mm0
paddusb %mm0, %mm1
paddusb %mm6, %mm3
pxor %mm5, %mm3
paddusb %mm3, %mm3
pand %mm5, %mm3
paddusb %mm1, %mm2
psubusb %mm3, %mm2
movd %mm2, (%edi)
3:
LOOP_END
LEAVE
SIZE(imlib_mmx_reshade_copy_rgba_to_rgba_cmod)
/*\ reshade_copy_rgb_to_rgba_cmod: same instruction sequence as
|*| reshade_copy_rgba_to_rgba_cmod, but the source is loaded with
|*| LOAD2_CMOD_AFF / LOAD1_CMOD_AFF, so its alpha is amap(0xff) rather
|*| than a value looked up from the source pixel.
|*| Two pixels per pass, odd pixel at label 2.
|*| NOTE(review): %mm4 is cleared here but not referenced by any
|*| instruction visible in this routine.
\*/
PR_(imlib_mmx_reshade_copy_rgb_to_rgba_cmod):
ENTER
pxor %mm4, %mm4
LOAD_IMMQ(m0XXX0XXX, %mm5)
LOAD_IMMQ(m0VVV0VVV, %mm6)
CLEANUP_IMMQ_LOADS(2)
subl $4, %esi
subl $4, %edi
LOOP_START
incl %ecx
jz 2f
1:
/*\ Load source and destination \*/
LOAD2_CMOD_AFF
movq (%edi, %ecx, 4), %mm2
/*\ s1 in %mm1 keeps its alpha byte: only the colour bytes are doubled,
|*| via the alpha-cleared copy in %mm0
\*/
movq %mm1, %mm3
psubusb %mm6, %mm1
movq %mm1, %mm0
pand %mm5, %mm0
paddusb %mm0, %mm1
paddusb %mm6, %mm3
pxor %mm5, %mm3
paddusb %mm3, %mm3
/*\ Clear alpha channel of s2 \*/
pand %mm5, %mm3
/*\ d = d + s1 - s2, unsigned saturation, and save \*/
paddusb %mm1, %mm2
psubusb %mm3, %mm2
movq %mm2, (%edi, %ecx, 4)
addl $2, %ecx
js 1b
jnz 3f
2:
/*\ Odd pixel at the end of the row: same steps on a single pixel \*/
LOAD1_CMOD_AFF
movd (%edi), %mm2
movq %mm1, %mm3
psubusb %mm6, %mm1
movq %mm1, %mm0
pand %mm5, %mm0
paddusb %mm0, %mm1
paddusb %mm6, %mm3
pxor %mm5, %mm3
paddusb %mm3, %mm3
pand %mm5, %mm3
paddusb %mm1, %mm2
psubusb %mm3, %mm2
movd %mm2, (%edi)
3:
LOOP_END
LEAVE
SIZE(imlib_mmx_reshade_copy_rgb_to_rgba_cmod)
/*\ On ELF targets, emit an empty .note.GNU-stack section: the usual
|*| marker that this object does not need an executable stack.
\*/
#ifdef __ELF__
.section .note.GNU-stack,"",@progbits
#endif