diff --git a/src/loaderpath.h b/src/loaderpath.h
new file mode 100644
index 0000000..5757f51
--- /dev/null
+++ b/src/loaderpath.h
@@ -0,0 +1,2 @@
+#define SYS_LOADERS_PATH "/usr/local/lib/loaders" /* absolute path; presumably the system-wide loader directory - confirm at the use site */
+#define USER_LOADERS_PATH ".loaders" /* relative path; presumably looked up under the user's home directory - TODO confirm */
diff --git a/src/rotate.c b/src/rotate.c
new file mode 100644
index 0000000..8c84b3c
--- /dev/null
+++ b/src/rotate.c
@@ -0,0 +1,185 @@
+
+#include "common.h"
+#include "rotate.h"
+
+/*\ RENORM_X_Y_SRC moves the whole-pixel part of the fixed-point offsets x, y
+|*| into the pointer src (sow per unit of y) and leaves the fractions behind.
+|*| The first form doesn't branch (but assumes >> of a negative int is an
+|*| arithmetic shift); the second doesn't multiply. Speed tests: inconclusive \*/
+#if 1
+#define RENORM_X_Y_SRC \
+ src += (x >> _ROTATE_PREC) + ((y >> _ROTATE_PREC) * sow); \
+ x &= _ROTATE_PREC_BITS; y &= _ROTATE_PREC_BITS;
+#else /*\ one whole pixel is _ROTATE_PREC_MAX units, not _ROTATE_PREC_BITS \*/
+#define RENORM_X_Y_SRC \
+ while (x > _ROTATE_PREC_BITS) { \
+  x -= _ROTATE_PREC_MAX; \
+  src++;   \
+ }    \
+ while (x < 0) { \
+  x += _ROTATE_PREC_MAX; \
+  src--;   \
+ }    \
+ while (y > _ROTATE_PREC_BITS) { \
+  y -= _ROTATE_PREC_MAX; \
+  src += sow;  \
+ }    \
+ while (y < 0) { \
+  y += _ROTATE_PREC_MAX; \
+  src -= sow;  \
+ }
+#endif
+
+/*\ Rotate by copying the nearest source pixel, for a target that lies
+|*| inside the source (no bounds are checked here). Writes dw x dh pixels
+|*| to dest in a zig-zag: one row rightwards, the next leftwards.
+|*| x, y    : start position in the source, scaled by _ROTATE_PREC_MAX
+|*| dx, dy  : source step per target pixel; a row down steps by (-dy, dx)
+|*| sow, dow: pointer step per whole unit of y in src / per target row
+\*/
+void
+__imlib_RotateSampleInside(DATA32 *src, DATA32 *dest, int sow, int dow,
+    int dw, int dh, int x, int y, int dx, int dy)
+{
+ int col;
+
+ if ((dw <= 0) || (dh <= 0)) return;
+
+ /*\ Whole pixels of the start go into src; x and y keep the fraction \*/
+ src += (x >> _ROTATE_PREC) + ((y >> _ROTATE_PREC) * sow);
+ x &= _ROTATE_PREC_BITS;
+ y &= _ROTATE_PREC_BITS;
+ for (;;) {
+  /*\ Rightwards: copy, then advance \*/
+  for (col = 0; col < dw; col++) {
+   *dest++ = *src;
+   x += dx;
+   y += dy;
+   RENORM_X_Y_SRC;
+  }
+  if (--dh <= 0) return;
+
+  /*\ One target row down \*/
+  x -= dy;
+  y += dx;
+  RENORM_X_Y_SRC;
+  dest += dow;
+
+  /*\ Leftwards: step back, then copy \*/
+  for (col = dw; col > 0; col--) {
+   x -= dx;
+   y -= dy;
+   RENORM_X_Y_SRC;
+   *--dest = *src;
+  }
+  if (--dh <= 0) return;
+
+  /*\ One target row down \*/
+  x -= dy;
+  y += dx;
+  RENORM_X_Y_SRC;
+  dest += dow;
+ }
+}
+
+/*\ Testing shows this version to be 10% (!!!) faster (overall speed)
+|*| So I think we can live with its ugliness..
+|*| It's still a bottleneck, with its 24 MULs per call..
+\*/
+
+/*\ Byte lvalues into a pixel, placed per byte order so that (assuming DATA8/DATA32 are 8/32-bit types) R_VAL is always bits 0-7, G_VAL 8-15, B_VAL 16-23, A_VAL 24-31 \*/
+#if __BYTE_ORDER == __LITTLE_ENDIAN /*\ NOTE(review): if neither macro is defined this is 0 == 0 and silently picks this branch - confirm common.h provides them \*/
+#define R_VAL(x) (*(((DATA8 *)&(x))+(0)))
+#define G_VAL(x) (*(((DATA8 *)&(x))+(1)))
+#define B_VAL(x) (*(((DATA8 *)&(x))+(2)))
+#define A_VAL(x) (*(((DATA8 *)&(x))+(3)))
+#elif __BYTE_ORDER == __BIG_ENDIAN
+#define A_VAL(x) (*(((DATA8 *)&(x))+(0)))
+#define B_VAL(x) (*(((DATA8 *)&(x))+(1)))
+#define G_VAL(x) (*(((DATA8 *)&(x))+(2)))
+#define R_VAL(x) (*(((DATA8 *)&(x))+(3)))
+#elif __BYTE_ORDER == __PDP_ENDIAN
+#define B_VAL(x) (*(((DATA8 *)&(x))+(0)))
+#define A_VAL(x) (*(((DATA8 *)&(x))+(1)))
+#define R_VAL(x) (*(((DATA8 *)&(x))+(2)))
+#define G_VAL(x) (*(((DATA8 *)&(x))+(3)))
+#else
+#error Unknown byte endianness.
+#endif
+/*\ Blend four pixels with fixed-point weights, one byte channel at a time:
+|*| ul/ur and ll/lr are each mixed with weight x on ur resp. lr, then the
+|*| two results are mixed with weight y on the ll/lr pair.
+|*| x and y are in 1/_ROTATE_PREC_MAX units and are expected to lie within
+|*| 0 .. _ROTATE_PREC_BITS.
+|*| Returns the blended pixel. \*/
+static inline DATA32
+Interp_ARGB(DATA32 ul, DATA32 ur, DATA32 ll, DATA32 lr, int x, int y)
+{
+ /*\ The weights are unsigned on purpose: a channel sum can reach
+ |*| 255 << (2 * _ROTATE_PREC), which overflows a 32-bit signed int
+ |*| (undefined behaviour) but still fits in 32 unsigned bits \*/
+ const unsigned int wr = (unsigned int)x, wl = _ROTATE_PREC_MAX - wr;
+ const unsigned int wb = (unsigned int)y, wt = _ROTATE_PREC_MAX - wb;
+ DATA32 rv;
+
+ R_VAL(rv) = ((R_VAL(ul) * wl + R_VAL(ur) * wr) * wt +
+   (R_VAL(ll) * wl + R_VAL(lr) * wr) * wb) >> (2 * _ROTATE_PREC);
+ G_VAL(rv) = ((G_VAL(ul) * wl + G_VAL(ur) * wr) * wt +
+   (G_VAL(ll) * wl + G_VAL(lr) * wr) * wb) >> (2 * _ROTATE_PREC);
+ B_VAL(rv) = ((B_VAL(ul) * wl + B_VAL(ur) * wr) * wt +
+   (B_VAL(ll) * wl + B_VAL(lr) * wr) * wb) >> (2 * _ROTATE_PREC);
+ A_VAL(rv) = ((A_VAL(ul) * wl + A_VAL(ur) * wr) * wt +
+   (A_VAL(ll) * wl + A_VAL(lr) * wr) * wb) >> (2 * _ROTATE_PREC);
+
+ return rv;
+}
+
+/*\ Antialiased rotation, for a target that lies inside the source (no
+|*| bounds are checked here). Each of the dw x dh target pixels is
+|*| Interp_ARGB() of src[0], src[1], src[sow] and src[sow + 1] at the
+|*| current source position; rows are walked in a zig-zag.
+|*| x, y: start position in the source, scaled by _ROTATE_PREC_MAX;
+|*| dx, dy: source step per target pixel, a row down steps by (-dy, dx);
+|*| sow, dow: pointer step per whole unit of y in src / per target row \*/
+void
+__imlib_RotateAAInside(DATA32 *src, DATA32 *dest, int sow, int dow,
+   int dw, int dh, int x, int y, int dx, int dy)
+{
+ int col;
+
+ if ((dw <= 0) || (dh <= 0)) return;
+
+ /*\ Whole pixels of the start go into src; x and y keep the fraction \*/
+ src += (x >> _ROTATE_PREC) + ((y >> _ROTATE_PREC) * sow);
+ x &= _ROTATE_PREC_BITS; y &= _ROTATE_PREC_BITS;
+ for (;;) {
+  /*\ Rightwards: blend, then advance \*/
+  for (col = 0; col < dw; col++) {
+   *dest++ = Interp_ARGB(src[0], src[1],
+     src[sow], src[sow + 1], x, y);
+   x += dx;
+   y += dy;
+   RENORM_X_Y_SRC;
+  }
+  if (--dh <= 0) return;
+  /*\ One target row down \*/
+  x -= dy;
+  y += dx;
+  RENORM_X_Y_SRC;
+  dest += dow;
+  /*\ Leftwards: step back, then blend \*/
+  for (col = dw; col > 0; col--) {
+   x -= dx;
+   y -= dy;
+   RENORM_X_Y_SRC;
+   *--dest = Interp_ARGB(src[0], src[1],
+     src[sow], src[sow + 1], x, y);
+  }
+  if (--dh <= 0) return;
+  /*\ One target row down \*/
+  x -= dy;
+  y += dx;
+  RENORM_X_Y_SRC;
+  dest += dow;
+ }
+}
diff --git a/src/rotate.h b/src/rotate.h
new file mode 100644
index 0000000..6543ef3
--- /dev/null
+++ b/src/rotate.h
@@ -0,0 +1,14 @@
+#ifndef __ROTATE
+#define __ROTATE 1
+/*\ NOTE(review): DATA32 is not declared in this header; rotate.c includes common.h first - presumably it comes from there, confirm \*/
+/*\ Fixed-point precision: number of fraction bits in the x, y, dx, dy arguments below \*/
+#define _ROTATE_PREC 12
+#define _ROTATE_PREC_MAX (1 << _ROTATE_PREC) /*\ 1.0 in that fixed-point format \*/
+#define _ROTATE_PREC_BITS (_ROTATE_PREC_MAX - 1) /*\ mask for the fraction bits \*/
+/*\ Both fill a dw x dh target from a rotated source position without bounds checks: by plain pixel copy, resp. by four-pixel interpolation \*/
+void __imlib_RotateSampleInside(DATA32 *src, DATA32 *dest, int sow, int dow,
+    int dw, int dh, int x, int y, int dx, int dy);
+void __imlib_RotateAAInside(DATA32 *src, DATA32 *dest, int sow, int dow,
+   int dw, int dh, int x, int y, int dx, int dy);
+
+#endif