legacy-imlib2/src/lib/asm_rotate.S

#include <config.h>
#include "asm.h"
#ifdef DO_MMX_ASM
/*\
|*| MMX assembly rotation routine for Imlib2
|*| Written by Willem Monsuwe <willem@stack.nl>
\*/
.text
.align 8
FN_(imlib_mmx_RotateAA)
/*\ Prototype: __imlib_mmx_RotateAA(DATA32 *src, DATA32 *dest, int sow, int sw,
|*| int sh, int dow, int dw, int dh, int x, int y,
|*| int dxh, int dyh, int dxv, int dyv)
\*/
#define src 8(%ebp)
#define dest 12(%ebp)
#define sow 16(%ebp)
#define sw 20(%ebp)
#define sh 24(%ebp)
#define dow 28(%ebp)
#define dw 32(%ebp)
#define dh 36(%ebp)
#define x 40(%ebp)
#define y 44(%ebp)
#define dxh 48(%ebp)
#define dyh 52(%ebp)
#define dxv 56(%ebp)
#define dyv 60(%ebp)
/*\ Local variables \*/
#define j -4(%ebp)
#define dly -8(%ebp)
#define dlx -12(%ebp)
#define sht -16(%ebp)
#define swt -20(%ebp)
#define m0fffh -24(%ebp)
#define m0fff -28(%ebp)
#define mulsow -32(%ebp)
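/*\ Overview (a sketch for reference, not generated code): x and y are
|*| 16.12 fixed-point source coordinates, advanced by (dxh, dyh) for each
|*| destination pixel and by (dxv, dyv) for each destination row.  Per
|*| output pixel the loops below compute, per channel:
|*|
|*|   p   = src + (x >> 12) + (y >> 12) * sow;
|*|   xap = x & 0xfff;  yap = y & 0xfff;
|*|   top = p[0]   + (((p[1]       - p[0])   * xap) >> 12);
|*|   bot = p[sow] + (((p[sow + 1] - p[sow]) * xap) >> 12);
|*|   out = top + (((bot - top) * yap) >> 12);
\*/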
PR_(imlib_mmx_RotateAA):
pushl %ebp
movl %esp, %ebp
subl $40, %esp
pushl %ebx
pushl %ecx
pushl %edx
pushl %edi
pushl %esi
/*\ Check (dw > 0) && (dh > 0) \*/
cmpl $0, dw
jle .rotate_leave
cmpl $0, dh
jle .rotate_leave
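/*\ Constants: mm7 = 0 (for byte -> word unpacking); mulsow = (sow << 16) | 1
|*| so that one pmaddwd of the packed (x >> 12, y >> 12) words yields the
|*| pixel offset (y >> 12) * sow + (x >> 12); m0fff/m0fffh together form the
|*| 64-bit mask 0x00000fff00000fff for the fractional parts of x and y
\*/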
pxor %mm7, %mm7
movl sow, %eax
sall $16, %eax
orl $1, %eax
movl %eax, mulsow
movl $0x0fff, %eax
movl %eax, m0fff
movl %eax, m0fffh
/*\ mm6 = x, y \*/
movq x, %mm6
/*\ edi = dest + dw \*/
movl dest, %edi
movl dw, %eax
leal (%edi, %eax, 4), %edi
/*\ dlx = dxv - dw * dxh \*/
movl dw, %eax
imull dxh, %eax
negl %eax
addl dxv, %eax
movl %eax, dlx
/*\ dly = dyv - dw * dyh \*/
movl dw, %eax
imull dyh, %eax
negl %eax
addl dyv, %eax
movl %eax, dly
/*\ j = dh \*/
movl dh, %eax
movl %eax, j
/*\ Check if all coordinates will be inside the source \*/
/*\ x < sw \*/
movl sw, %edx
movl x, %ecx
cmpl %edx, %ecx
jae .rotate_outside
/*\ x + dxh * dw < sw \*/
movl dxh, %ebx
imull dw, %ebx
addl %ebx, %ecx
cmpl %edx, %ecx
jae .rotate_outside
/*\ x + dxh * dw + dxv * dh < sw \*/
movl dxv, %eax
imull dh, %eax
subl %eax, %ecx
cmpl %edx, %ecx
jae .rotate_outside
/*\ x + dxv * dh < sw \*/
subl %ebx, %ecx
cmpl %edx, %ecx
jae .rotate_outside
/*\ y < sh \*/
movl sh, %edx
movl y, %ecx
cmpl %edx, %ecx
jae .rotate_outside
/*\ y + dyh * dw < sh \*/
movl dyh, %ebx
imull dw, %ebx
addl %ebx, %ecx
cmpl %edx, %ecx
jae .rotate_outside
/*\ y + dyh * dw + dyv * dh < sh \*/
movl dyv, %eax
imull dh, %eax
addl %eax, %ecx
cmpl %edx, %ecx
jae .rotate_outside
/*\ y + dyv * dh < sh \*/
subl %ebx, %ecx
cmpl %edx, %ecx
jae .rotate_outside
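/*\ Fast path: every destination pixel maps inside the source (the
|*| unsigned compares above also reject negative coordinates), so the
|*| inner loop needs no per-pixel clipping
\*/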
.rotate_inside:
movl sow, %ebx
movl src, %edx
.inside_loop_y:
/*\ i = -dw \*/
movl dw, %ecx
negl %ecx
.inside_loop_x:
/*\ esi = src + (x >> 12) + (y >> 12) * sow \*/
movq %mm6, %mm0
psrad $12, %mm0
packssdw %mm0, %mm0
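/*\ one pmaddwd against mulsow = (1, sow) gives (y >> 12) * sow + (x >> 12) \*/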
pmaddwd mulsow, %mm0
movd %mm0, %eax
leal (%edx, %eax, 4), %esi
/*\ x & 0xfff and y & 0xfff \*/
movq %mm6, %mm0
pand m0fff, %mm0
movq %mm0, %mm1
/*\ mm0 = x & 0xfff in all four words \*/
punpcklwd %mm0, %mm0
punpckldq %mm0, %mm0
/*\ mm1 = y & 0xfff in all four words \*/
punpckhwd %mm1, %mm1
punpckldq %mm1, %mm1
/*\ Load and unpack four pixels in parallel
|*| %mm2 = ptr[0], %mm3 = ptr[1]
|*| %mm4 = ptr[sow], %mm5 = ptr[sow + 1]
\*/
movq (%esi), %mm2
movq (%esi, %ebx, 4), %mm4
movq %mm2, %mm3
movq %mm4, %mm5
punpcklbw %mm7, %mm2
punpcklbw %mm7, %mm4
punpckhbw %mm7, %mm3
punpckhbw %mm7, %mm5
/*\ X interpolation: r = l + (r - l) * xap \*/
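/*\ (psllw $4 followed by pmulhw computes ((r - l) * xap) >> 12, since xap < 0x1000) \*/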
psubw %mm2, %mm3
psubw %mm4, %mm5
psllw $4, %mm3
psllw $4, %mm5
pmulhw %mm0, %mm3
pmulhw %mm0, %mm5
paddw %mm2, %mm3
paddw %mm4, %mm5
/*\ Y interpolation: d = u + (d - u) * yap \*/
psubw %mm3, %mm5
psllw $4, %mm5
pmulhw %mm1, %mm5
paddw %mm3, %mm5
packuswb %mm5, %mm5
movd %mm5, (%edi, %ecx, 4)
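/*\ x += dxh, y += dyh in one packed add; step to the next destination pixel \*/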
paddd dxh, %mm6
incl %ecx
jnz .inside_loop_x
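/*\ End of row: x += dlx, y += dly, and edi advances by one destination row \*/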
paddd dlx, %mm6
movl dow, %ecx
leal (%edi, %ecx, 4), %edi
decl j
jnz .inside_loop_y
jmp .rotate_leave
.rotate_outside:
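/*\ Slow path: part of the destination maps outside the source.  swt and
|*| sht hold (sw - 1) << 12 and (sh - 1) << 12; every pixel is range-checked
|*| against them.  The digit comments below ("1234", "..34", ...) mark which
|*| of the four source pixels used for interpolation are inside:
|*| 1 = top-left, 2 = top-right, 3 = bottom-left, 4 = bottom-right
\*/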
movl sw, %eax
decl %eax
sall $12, %eax
movl %eax, swt
movl sh, %eax
decl %eax
sall $12, %eax
movl %eax, sht
movl sow, %ebx
movl src, %edx
.outside_loop_y:
/*\ i = -dw \*/
movl dw, %ecx
negl %ecx
.outside_loop_x:
/*\ esi = src + (x >> 12) + (y >> 12) * sow \*/
movq %mm6, %mm0
psrad $12, %mm0
packssdw %mm0, %mm0
pmaddwd mulsow, %mm0
movd %mm0, %eax
leal (%edx, %eax, 4), %esi
/*\ x & 0xfff and y & 0xfff \*/
movq %mm6, %mm0
pand m0fff, %mm0
movq %mm0, %mm1
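/*\ Unsigned range checks against swt/sht: passing means both columns
|*| (rows) touched by the 2x2 interpolation are inside; negative
|*| coordinates fail as well
\*/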
/*\ x < swt \*/
movq %mm6, %mm2
psrlq $32, %mm2
movd %mm6, %eax
cmpl swt, %eax
jae 2f
/*\ y < sht \*/
movd %mm2, %eax
cmpl sht, %eax
jae 1f
/*\ 1234 \*/
.interp_argb:
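/*\ All four pixels inside: full ARGB bilinear interpolation, as in the fast path \*/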
/*\ Unpack x and y \*/
punpcklwd %mm0, %mm0
punpckldq %mm0, %mm0
punpckhwd %mm1, %mm1
punpckldq %mm1, %mm1
/*\ Load and unpack four pixels in parallel
|*| %mm2 = ptr[0], %mm3 = ptr[1]
|*| %mm4 = ptr[sow], %mm5 = ptr[sow + 1]
\*/
movq (%esi), %mm2
movq (%esi, %ebx, 4), %mm4
movq %mm2, %mm3
movq %mm4, %mm5
punpcklbw %mm7, %mm2
punpcklbw %mm7, %mm4
punpckhbw %mm7, %mm3
punpckhbw %mm7, %mm5
/*\ X interpolation: r = l + (r - l) * xap \*/
psubw %mm2, %mm3
psubw %mm4, %mm5
psllw $4, %mm3
psllw $4, %mm5
pmulhw %mm0, %mm3
pmulhw %mm0, %mm5
paddw %mm2, %mm3
paddw %mm4, %mm5
/*\ Y interpolation: d = u + (d - u) * yap \*/
psubw %mm3, %mm5
psllw $4, %mm5
pmulhw %mm1, %mm5
paddw %mm3, %mm5
packuswb %mm5, %mm5
movd %mm5, (%edi, %ecx, 4)
jmp .outside_il_end
1:
/*\ (-y-1) < 4096 \*/
notl %eax
cmpl $4095, %eax
ja 1f
/*\ ..34 \*/
pxor m0fff, %mm1
movd (%esi, %ebx, 4), %mm2
movd 4(%esi, %ebx, 4), %mm4
.interp_rgb_a0:
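/*\ Only one row (or column) of the 2x2 block is inside: interpolate the
|*| two pixels in mm2/mm4 along mm0, then scale alpha by the coverage of
|*| the inside half (mm1 holds the weight of the missing side)
\*/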
/*\ Unpack x and y \*/
punpcklwd %mm0, %mm0
punpckldq %mm0, %mm0
punpckhwd %mm1, %mm1
/*\ Unpack two pixels \*/
punpcklbw %mm7, %mm2
punpcklbw %mm7, %mm4
/*\ Interpolate \*/
psubw %mm2, %mm4
psllw $4, %mm4
pmulhw %mm0, %mm4
paddw %mm2, %mm4
/*\ Separate out alpha, multiply with mm1, and subtract \*/
movq %mm4, %mm2
psllq $48, %mm1
psllw $4, %mm4
pmulhw %mm1, %mm4
psubw %mm4, %mm2
packuswb %mm2, %mm2
movd %mm2, (%edi, %ecx, 4)
jmp .outside_il_end
1:
/*\ (y - sht) < 4096 \*/
notl %eax
subl sht, %eax
cmpl $4095, %eax
ja .outside_il_0
/*\ 12.. \*/
movd (%esi), %mm2
movd 4(%esi), %mm4
jmp .interp_rgb_a0
2:
/*\ Switch x and y \*/
psrlq $32, %mm0
psllq $32, %mm1
/*\ -x-1 < 4096 \*/
notl %eax
cmpl $4095, %eax
ja 2f
pxor m0fff, %mm1
/*\ y < sht \*/
movd %mm2, %eax
cmpl sht, %eax
jae 1f
/*\ .2.4 \*/
movd 4(%esi), %mm2
movd 4(%esi, %ebx, 4), %mm4
jmp .interp_rgb_a0
1:
/*\ (-y-1) < 4096 \*/
notl %eax
cmpl $4095, %eax
ja 1f
/*\ ...4 \*/
movd 4(%esi, %ebx, 4), %mm2
.interp_a000:
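/*\ Only a single pixel is inside (in mm2): keep its RGB and scale its
|*| alpha by the product of the x and y coverage fractions derived from
|*| mm0 and mm1
\*/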
/*\ Separate out alpha, multiply with mm0 and mm1 \*/
pxor m0fff, %mm1
punpcklbw %mm7, %mm2
movq %mm2, %mm3
psllq $2, %mm0
psrlq $30, %mm1
pmulhw %mm0, %mm1
pxor m0fff, %mm1
psllq $48, %mm1
psllw $4, %mm3
pmulhw %mm1, %mm3
psubw %mm3, %mm2
packuswb %mm2, %mm2
movd %mm2, (%edi, %ecx, 4)
jmp .outside_il_end
1:
/*\ (y - sht) < 4096 \*/
notl %eax
subl sht, %eax
cmpl $4095, %eax
ja .outside_il_0
/*\ .2.. \*/
pxor m0fff, %mm0
movd 4(%esi), %mm2
jmp .interp_a000
2:
/*\ (x - swt) < 4096 \*/
notl %eax
subl swt, %eax
cmpl $4095, %eax
ja .outside_il_0
/*\ y < sht \*/
movd %mm2, %eax
cmpl sht, %eax
jae 1f
/*\ 1.3. \*/
movd (%esi), %mm2
movd (%esi, %ebx, 4), %mm4
jmp .interp_rgb_a0
1:
/*\ (-y-1) < 4096 \*/
notl %eax
cmpl $4095, %eax
ja 1f
/*\ ..3. \*/
movd (%esi, %ebx, 4), %mm2
jmp .interp_a000
1:
/*\ (y - sht) < 4096 \*/
notl %eax
subl sht, %eax
cmpl $4095, %eax
ja .outside_il_0
/*\ 1... \*/
pxor m0fff, %mm0
movd (%esi), %mm2
jmp .interp_a000
.outside_il_0:
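/*\ Entirely outside the source: store a fully transparent pixel \*/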
movl $0, %eax
movl %eax, (%edi, %ecx, 4)
.outside_il_end:
paddd dxh, %mm6
incl %ecx
jnz .outside_loop_x
paddd dlx, %mm6
movl dow, %ecx
leal (%edi, %ecx, 4), %edi
decl j
jnz .outside_loop_y
.rotate_leave:
emms
popl %esi
popl %edi
popl %edx
popl %ecx
popl %ebx
movl %ebp, %esp
popl %ebp
ret
SIZE(imlib_mmx_RotateAA)
#endif
#ifdef __ELF__
.section .note.GNU-stack,"",@progbits
#endif