efl/legacy/evas/src/lib/engines/common/evas_scale_smooth_scaler.c

void
SCALE_FUNC(RGBA_Image *src, RGBA_Image *dst,
	   RGBA_Draw_Context *dc,
	   int src_region_x, int src_region_y,
	   int src_region_w, int src_region_h,
	   int dst_region_x, int dst_region_y,
	   int dst_region_w, int dst_region_h)
{
   int      divx, divy;
   int      x, y;
   int     *lin_ptr, *lin2_ptr;
   int     *interp_x, *interp_y;
   int     *sample_x, *sample_y;
   char    *iterate_x, *iterate_y;
   DATA32  *buf, *dptr;
   DATA32 **row_ptr, **row2_ptr;
   DATA32  *ptr, *dst_ptr, *dst_data, *ptr2, *ptr3, *ptr4;
   int      dst_jump;
   int      dst_clip_x, dst_clip_y, dst_clip_w, dst_clip_h;
   int      src_w, src_h, dst_w, dst_h;

   if (!(RECTS_INTERSECT(dst_region_x, dst_region_y, dst_region_w, dst_region_h, 0, 0, dst->image->w, dst->image->h)))
     return;
   if (!(RECTS_INTERSECT(src_region_x, src_region_y, src_region_w, src_region_h, 0, 0, src->image->w, src->image->h)))
     return;

   src_w = src->image->w;
   src_h = src->image->h;
   dst_w = dst->image->w;
   dst_h = dst->image->h;

   dst_data = dst->image->data;

   if (dc->clip.use)
     {
	dst_clip_x = dc->clip.x;
	dst_clip_y = dc->clip.y;
	dst_clip_w = dc->clip.w;
	dst_clip_h = dc->clip.h;
	if (dst_clip_x < 0)
	  {
	     dst_clip_w += dst_clip_x;
	     dst_clip_x = 0;
	  }
	if (dst_clip_y < 0)
	  {
	     dst_clip_h += dst_clip_y;
	     dst_clip_y = 0;
	  }
	if ((dst_clip_w <= 0) || (dst_clip_h <= 0)) return;
	if ((dst_clip_x + dst_clip_w) > dst_w) dst_clip_w = dst_w - dst_clip_x;
	if ((dst_clip_y + dst_clip_h) > dst_h) dst_clip_h = dst_h - dst_clip_y;
     }
   else
     {
	dst_clip_x = 0;
	dst_clip_y = 0;
	dst_clip_w = dst_w;
	dst_clip_h = dst_h;
     }

   if (dst_clip_x < dst_region_x)
     {
	dst_clip_w += dst_clip_x - dst_region_x;
	dst_clip_x = dst_region_x;
     }
   if ((dst_clip_x + dst_clip_w) > (dst_region_x + dst_region_w))
     dst_clip_w = dst_region_x + dst_region_w - dst_clip_x;
   if (dst_clip_y < dst_region_y)
     {
	dst_clip_h += dst_clip_y - dst_region_y;
	dst_clip_y = dst_region_y;
     }
   if ((dst_clip_y + dst_clip_h) > (dst_region_y + dst_region_h))
     dst_clip_h = dst_region_y + dst_region_h - dst_clip_y;

   if ((src_region_w <= 0) || (src_region_h <= 0) ||
       (dst_region_w <= 0) || (dst_region_h <= 0) ||
       (dst_clip_w <= 0) || (dst_clip_h <= 0))
     return;

   /* sanitise x */
   if (src_region_x < 0)
     {
	dst_region_x -= (src_region_x * dst_region_w) / src_region_w;
	dst_region_w += (src_region_x * dst_region_w) / src_region_w;
	src_region_w += src_region_x;
	src_region_x = 0;
     }
   if (src_region_x >= src_w) return;
   if ((src_region_x + src_region_w) > src_w)
     {
	dst_region_w = (dst_region_w * (src_w - src_region_x)) / (src_region_w);
	src_region_w = src_w - src_region_x;
     }
   if (dst_region_w <= 0) return;
   if (src_region_w <= 0) return;
   if (dst_clip_x < 0)
     {
	dst_clip_w += dst_clip_x;
	dst_clip_x = 0;
     }
   if (dst_clip_w <= 0) return;
   if (dst_clip_x >= dst_w) return;
   if (dst_clip_x < dst_region_x)
     {
	dst_clip_w += (dst_clip_x - dst_region_x);
	dst_clip_x = dst_region_x;
     }
   if ((dst_clip_x + dst_clip_w) > dst_w)
     {
	dst_clip_w = dst_w - dst_clip_x;
     }
   if (dst_clip_w <= 0) return;

   /* sanitise y */
   if (src_region_y < 0)
     {
	dst_region_y -= (src_region_y * dst_region_h) / src_region_h;
	dst_region_h += (src_region_y * dst_region_h) / src_region_h;
	src_region_h += src_region_y;
	src_region_y = 0;
     }
   if (src_region_y >= src_h) return;
   if ((src_region_y + src_region_h) > src_h)
     {
	dst_region_h = (dst_region_h * (src_h - src_region_y)) / (src_region_h);
	src_region_h = src_h - src_region_y;
     }
   if (dst_region_h <= 0) return;
   if (src_region_h <= 0) return;
   if (dst_clip_y < 0)
     {
	dst_clip_h += dst_clip_y;
	dst_clip_y = 0;
     }
   if (dst_clip_h <= 0) return;
   if (dst_clip_y >= dst_h) return;
   if (dst_clip_y < dst_region_y)
     {
	dst_clip_h += (dst_clip_y - dst_region_y);
	dst_clip_y = dst_region_y;
     }
   if ((dst_clip_y + dst_clip_h) > dst_h)
     {
	dst_clip_h = dst_h - dst_clip_y;
     }
   if (dst_clip_h <= 0) return;

   lin_ptr = malloc(dst_clip_w * sizeof(int));
   if (!lin_ptr) goto no_lin_ptr;
   row_ptr = malloc(dst_clip_h * sizeof(DATA32 *));
   if (!row_ptr) goto no_row_ptr;
   lin2_ptr = malloc(dst_clip_w * sizeof(int));
   if (!lin2_ptr) goto no_lin2_ptr;
   row2_ptr = malloc(dst_clip_h * sizeof(DATA32 *));
   if (!row2_ptr) goto no_row2_ptr;
   interp_x = malloc(dst_clip_w * sizeof(int));
   if (!interp_x) goto no_interp_x;
   interp_y = malloc(dst_clip_h * sizeof(int));
   if (!interp_y) goto no_interp_y;
   sample_x = malloc(dst_clip_w * sizeof(int) * 3);
   if (!sample_x) goto no_sample_x;
   sample_y = malloc(dst_clip_h * sizeof(int) * 3);
   if (!sample_y) goto no_sample_y;
   iterate_x = malloc(dst_clip_w * sizeof(char));
   if (!iterate_x) goto no_iterate_x;
   iterate_y = malloc(dst_clip_h * sizeof(char));
   if (!iterate_y) goto no_iterate_y;

   /* figure out dst jump */
   dst_jump = dst_w - dst_clip_w;

   /* figure out dest start ptr */
   dst_ptr = dst_data + dst_clip_x + (dst_clip_y * dst_w);

/* FIXME:
 *
 * things to do later for speedups:
 *
 * break upscale into 3 cases (as listed below - up:up, 1:up, up:1)
 *
 * break downscale into more cases (as listed below)
 *
 * roll func (blend/copy/multiply/cmod) code into inner loop of scaler.
 * (578 fps vs 550 in mmx upscale in evas demo - this means probably
 *  a good 10-15% speedup over the func call, but means massively larger
 *  code)
 *
 * anything involving downscaling has no mmx equivalent code and maybe the
 * C could do with a little work.
 *
 * ---------------------------------------------------------------------------
 *
 * (1 = no scaling (1:1 ratio), + = scale up, - = scale down)
 * (* == fully optimised mmx, # = fully optimised C)
 *
 * h:v mmx C
 *
 * 1:1 *   #
 *
 * +:+ *   #
 * 1:+ *   #
 * +:1 *   #
 *
 * 1:-
 * -:1
 * +:-
 * -:+
 * -:-
 *
 */

   /* if 1:1 scale */
   if ((dst_region_w == src_region_w) &&
       (dst_region_h == src_region_h))
     {
	DATA32 *src_data;

	src_data = src->image->data;
	ptr = src_data + ((dst_clip_y - dst_region_y + src_region_y) * src_w) + (dst_clip_x - dst_region_x) + src_region_x;
	if (dc->mod.use)
	  {
	     Gfx_Func_Blend_Src_Cmod_Dst func;

	     func = evas_common_draw_func_blend_cmod_get(src, dst, dst_clip_w);
	     for (y = 0; y < dst_clip_h; y++)
	       {
		  func(ptr, dst_ptr, dst_clip_w, dc->mod.r, dc->mod.g, dc->mod.b, dc->mod.a);
		  ptr += src_w;
		  dst_ptr += dst_w;
	       }
	  }
	else if (dc->mul.use)
	  {
	     Gfx_Func_Blend_Src_Mul_Dst func;

	     func = evas_common_draw_func_blend_mul_get(src, dc->mul.col, dst, dst_clip_w);
	     for (y = 0; y < dst_clip_h; y++)
	       {
		  func(ptr, dst_ptr, dst_clip_w, dc->mul.col);
		  ptr += src_w;
		  dst_ptr += dst_w;
	       }
	  }
	else
	  {
	     Gfx_Func_Blend_Src_Dst func;

	     func = evas_common_draw_func_blend_get(src, dst, dst_clip_w);
	     for (y = 0; y < dst_clip_h; y++)
	       {
		  func(ptr, dst_ptr, dst_clip_w);
		  ptr += src_w;
		  dst_ptr += dst_w;
	       }
	  }
     }
   else
     {
	Gfx_Func_Blend_Src_Cmod_Dst func_cmod;
	Gfx_Func_Blend_Src_Mul_Dst  func_mul;
	Gfx_Func_Blend_Src_Dst      func;

	/* a scanline buffer */
	buf = malloc(dst_clip_w * sizeof(DATA32));
	if (!buf) goto no_buf;

	func      = evas_common_draw_func_blend_get      (src, dst, dst_clip_w);
	func_cmod = evas_common_draw_func_blend_cmod_get (src, dst, dst_clip_w);
	func_mul  = evas_common_draw_func_blend_mul_get  (src, dc->mul.col, dst, dst_clip_w);

	/* scaling up only - don't need anything except original */
	if ((dst_region_w >= src_region_w) && (dst_region_h >= src_region_h))
	  {
	     DATA32 *src_data;

	     src_data = src->image->data;

	     for (x = 0; x < dst_clip_w; x++)
	       {
		  if (src_region_w > 1)
		    {
		       lin_ptr[x] = (((x + dst_clip_x - dst_region_x) *
				      (src_region_w)) / dst_region_w);
		       interp_x[x] = ((((x + dst_clip_x - dst_region_x) *
					(src_region_w)) << 8) / dst_region_w) -
			 (lin_ptr[x] << 8);
		       lin_ptr[x] += src_region_x;
		    }
		  else
		    {
		       lin_ptr[x] = (((x + dst_clip_x - dst_region_x) *
				      src_region_w) / dst_region_w);
		       interp_x[x] = 0;
		       lin_ptr[x] += src_region_x;
		    }
	       }
	     for (y = 0; y < dst_clip_h; y++)
	       {
		  int pos;

		  if (src_region_h > 1)
		    {
		       pos = (((y + dst_clip_y - dst_region_y) *
			       (src_region_h)) / dst_region_h);
		       row_ptr[y] = src_data + ((pos + src_region_y) * src_w);
		       interp_y[y] = ((((y + dst_clip_y - dst_region_y) *
					(src_region_h)) << 8) / dst_region_h) -
			 (pos << 8);
		    }
		  else
		    {
		       pos = (((y + dst_clip_y - dst_region_y) *
			       src_region_h) / dst_region_h);
		       row_ptr[y] = src_data + ((pos + src_region_y) * src_w);
		       interp_y[y] = 0;
		    }
	       }

	     dptr = dst_ptr;

#ifndef SCALE_USING_MMX
	     if (src->flags & RGBA_IMAGE_HAS_ALPHA)
	       {
		  for (y = 0; y < dst_clip_h; y++)
		    {
		       int i, j, k, l, px;
		       DATA8 p1r, p1g, p1b, p1a;
		       DATA8 p2r, p2g, p2b, p2a;
		       DATA32 *lptr1, *lptr2;

		       lptr1 = row_ptr[y];
		       if ((lptr1 + src_w) >=
			   (src->image->data +
			    (src->image->w *
			     src->image->h)))
			 lptr2 = lptr1;
		       else
			 lptr2 = lptr1 + src_w;

		       k = interp_y[y];
		       l = 256 - k;

		       dst_ptr = buf;

		       px = -1;

		       for (x = 0; x < dst_clip_w; x++)
			 {
			    i = interp_x[x];
			    j = 256 - i;

			    if (px != lin_ptr[x])
			      {
				 px = lin_ptr[x];

				 ptr = lptr1 + lin_ptr[x];
				 ptr3 = lptr2 + lin_ptr[x];
				 if (lin_ptr[x] >= (src->image->w - 1))
				   {
				      ptr2 = ptr;
				      ptr4 = ptr3;
				   }
				 else
				   {
				      ptr2 = ptr + 1;
				      ptr4 = ptr3 + 1;
				   }

				 if (A_VAL(ptr) |
				     A_VAL(ptr2) |
				     A_VAL(ptr3) |
				     A_VAL(ptr4))
				   {
				      p1r = INTERP_2(R_VAL(ptr), R_VAL(ptr3), k, l);
				      p1g = INTERP_2(G_VAL(ptr), G_VAL(ptr3), k, l);
				      p1b = INTERP_2(B_VAL(ptr), B_VAL(ptr3), k, l);
				      p1a = INTERP_2(A_VAL(ptr), A_VAL(ptr3), k, l);
				      p2r = INTERP_2(R_VAL(ptr2), R_VAL(ptr4), k, l);
				      p2g = INTERP_2(G_VAL(ptr2), G_VAL(ptr4), k, l);
				      p2b = INTERP_2(B_VAL(ptr2), B_VAL(ptr4), k, l);
				      p2a = INTERP_2(A_VAL(ptr2), A_VAL(ptr4), k, l);
				   }
				 else
				   {
				      p1a = 0;
				      p2a = 0;
				   }
			      }
			    if (p1a | p2a)
			      {
				 R_VAL(dst_ptr) = INTERP_2(p1r, p2r, i, j);
				 G_VAL(dst_ptr) = INTERP_2(p1g, p2g, i, j);
				 B_VAL(dst_ptr) = INTERP_2(p1b, p2b, i, j);
				 A_VAL(dst_ptr) = INTERP_2(p1a, p2a, i, j);
			      }
			    else
			      A_VAL(dst_ptr) = 0;

			    dst_ptr++;
			 }
		       /* * blend here [clip_w *] buf -> dptr * */
		       if (dc->mod.use)
			 func_cmod(buf, dptr, dst_clip_w, dc->mod.r, dc->mod.g, dc->mod.b, dc->mod.a);
		       else if (dc->mul.use)
			 func_mul(buf, dptr, dst_clip_w, dc->mul.col);
		       else
			 func(buf, dptr, dst_clip_w);
		       dptr += dst_w;
		    }
	       }
	     else
	       {
		  for (y = 0; y < dst_clip_h; y++)
		    {
		       int i, j, k, l, px;
		       DATA8 p1r, p1g, p1b;
		       DATA8 p2r, p2g, p2b;
		       DATA32 *lptr1, *lptr2;

		       lptr1 = row_ptr[y];
		       if ((lptr1 + src_w) >=
			   (src->image->data +
			    (src->image->w *
			     src->image->h)))
			 lptr2 = lptr1;
		       else
			 lptr2 = lptr1 + src_w;

		       k = interp_y[y];
		       l = 256 - k;

		       dst_ptr = buf;

		       px = -1;

		       for (x = 0; x < dst_clip_w; x++)
			 {
			    i = interp_x[x];
			    j = 256 - i;

			    if (px != lin_ptr[x])
			      {
				 px = lin_ptr[x];

				 ptr = lptr1 + lin_ptr[x];
				 ptr3 = lptr2 + lin_ptr[x];
				 if (lin_ptr[x] >= (src->image->w - 1))
				   {
				      ptr2 = ptr;
				      ptr4 = ptr3;
				   }
				 else
				   {
				      ptr2 = ptr + 1;
				      ptr4 = ptr3 + 1;
				   }

				 p1r = INTERP_2(R_VAL(ptr), R_VAL(ptr3), k, l);
				 p1g = INTERP_2(G_VAL(ptr), G_VAL(ptr3), k, l);
				 p1b = INTERP_2(B_VAL(ptr), B_VAL(ptr3), k, l);
				 p2r = INTERP_2(R_VAL(ptr2), R_VAL(ptr4), k, l);
				 p2g = INTERP_2(G_VAL(ptr2), G_VAL(ptr4), k, l);
				 p2b = INTERP_2(B_VAL(ptr2), B_VAL(ptr4), k, l);
			      }
			    R_VAL(dst_ptr) = INTERP_2(p1r, p2r, i, j);
			    G_VAL(dst_ptr) = INTERP_2(p1g, p2g, i, j);
			    B_VAL(dst_ptr) = INTERP_2(p1b, p2b, i, j);
			    A_VAL(dst_ptr) = 0xff;

			    dst_ptr++;
			 }
		       /* * blend here [clip_w *] buf -> dptr * */
		       if (dc->mod.use)
			 func_cmod(buf, dptr, dst_clip_w, dc->mod.r, dc->mod.g, dc->mod.b, dc->mod.a);
		       else if (dc->mul.use)
			 func_mul(buf, dptr, dst_clip_w, dc->mul.col);
		       else
			 func(buf, dptr, dst_clip_w);
		       dptr += dst_w;
		    }
	       }
#else
	     pxor_r2r(mm7, mm7);
	     for (y = 0; y < dst_clip_h; y++)
	       {
		  DATA32 *lptr1, *lptr2;
		  int i, j, k, l, px;

		  lptr1 = row_ptr[y];
		  if ((lptr1 + src_w) >=
		      (src->image->data +
		       (src->image->w *
			src->image->h)))
		    lptr2 = lptr1;
		  else
		    lptr2 = lptr1 + src_w;

		  k = interp_y[y];
		  l = 256 - k;

		  dst_ptr = buf;

		  /* mm5 = k */
		  /* mm6 = l */
		  movd_m2r(k, mm5);
		  punpcklwd_r2r(mm5, mm5);
		  punpckldq_r2r(mm5, mm5);
		  movd_m2r(l, mm6);
		  punpcklwd_r2r(mm6, mm6);
		  punpckldq_r2r(mm6, mm6);

		  px = -1;

		  for (x = 0; x < dst_clip_w; x++)
		    {
		       i = interp_x[x];
		       j = 256 - i;

		       /* if we have a new pair of horizontal pixels to */
		       /* interpolate between them vertically */
		       if (px != lin_ptr[x])
			 {
			    px = lin_ptr[x];

			    ptr = lptr1 + lin_ptr[x];
			    ptr3 = lptr2 + lin_ptr[x];
			    if (lin_ptr[x] >= (src->image->w - 1))
			      {
				 ptr2 = ptr;
				 ptr4 = ptr3;
			      }
			    else
			      {
				 ptr2 = ptr + 1;
				 ptr4 = ptr3 + 1;
			      }

			    /* left edge */
			    movd_m2r(ptr[0], mm0);
			    punpcklbw_r2r(mm7, mm0);
			    pmullw_r2r(mm6, mm0);

			    movd_m2r(ptr3[0], mm1);
			    punpcklbw_r2r(mm7, mm1);
			    pmullw_r2r(mm5, mm1);

			    paddw_r2r(mm1, mm0);
			    psrlw_i2r(8, mm0);
			    /* mm0 = left edge */

			    /* right edge */
			    movd_m2r(ptr2[0], mm1);
			    punpcklbw_r2r(mm7, mm1);
			    pmullw_r2r(mm6, mm1);

			    movd_m2r(ptr4[0], mm2);
			    punpcklbw_r2r(mm7, mm2);
			    pmullw_r2r(mm5, mm2);

			    paddw_r2r(mm2, mm1);
			    psrlw_i2r(8, mm1);
			    /* mm1 = right edge */
			 }

		       movq_r2r(mm1, mm4);

		       movd_m2r(i, mm2);
		       punpcklwd_r2r(mm2, mm2);
		       punpckldq_r2r(mm2, mm2);
		       pmullw_r2r(mm2, mm4);

		       movq_r2r(mm0, mm3);

		       movd_m2r(j, mm2);
		       punpcklwd_r2r(mm2, mm2);
		       punpckldq_r2r(mm2, mm2);
		       pmullw_r2r(mm2, mm3);

		       paddw_r2r(mm4, mm3);
		       psrlw_i2r(8, mm3);

		       packuswb_r2r(mm3, mm3);
		       movd_r2m(mm3, dst_ptr[0]);

		       dst_ptr++;
		    }
		  /* * blend here [clip_w *] buf -> dptr * */
		  if (dc->mod.use)
		    func_cmod(buf, dptr, dst_clip_w, dc->mod.r, dc->mod.g, dc->mod.b, dc->mod.a);
		  else if (dc->mul.use)
		    func_mul(buf, dptr, dst_clip_w, dc->mul.col);
		  else
		    func(buf, dptr, dst_clip_w);
		  dptr += dst_w;
	       }
#endif
	  }
	else
#ifdef BUILD_SCALE_TRILINEAR
	  /* scaling down... funkiness */
	  {
	     DATA32       *src_data, *src2_data;
	     RGBA_Surface *srf1, *srf2;
	     int           mix;

	     /* no mipmaps at all- need to populate mipmap table */
	     if (src->mipmaps.num == 0)
	       evas_common_scale_rgba_gen_mipmap_table(src);
	     /* calculate mix level between mipmaps */
	     mix = evas_common_scale_rgba_get_mipmap_surfaces(src,
						  src_region_w, src_region_h,
						  dst_region_w, dst_region_h,
						  &srf1, &srf2);
	     /* get 2 mipmap levels */
	     src_data = srf1->data; /* high res surface (sample) */
	     src2_data = srf2->data; /* lower res surface (interp) */
	     /* fill scale + interp tables */
	     /* lin & row ptr for lowest mipmap (hi res) */
	     divx = src->image->w / srf1->w;
	     divy = src->image->h / srf1->h;

	     /* merely sampled pixels in the hi-res mipmap */
	     for (x = 0; x < dst_clip_w; x++)
	       {
		  lin_ptr[x] = (((x + dst_clip_x - dst_region_x) * src_region_w) / (dst_region_w * divx)) + src_region_x;
	       }
	     for (y = 0; y < dst_clip_h; y++)
	       {
		  row_ptr[y] = src_data + (((((y + dst_clip_y - dst_region_y) * src_region_h) / (dst_region_h * divy)) + src_region_y) * srf1->w);
	       }

	     /* lin & row ptr for low-res mipmap */
	     divx = src->image->w / srf2->w;
	     divy = src->image->h / srf2->h;

	     for (x = 0; x < dst_clip_w; x++)
	       {
		  if (src_region_w > 1)
		    {
		       lin2_ptr[x] = (((x + dst_clip_x - dst_region_x) *
				       (src_region_w - 1)) / dst_region_w);
		       lin2_ptr[x] /= divx;
		       interp_x[x] = (((((x + dst_clip_x - dst_region_x) *
					 (src_region_w - 1)) << 8) / dst_region_w) / divx) -
			 (lin2_ptr[x] << 8);
		       lin2_ptr[x] += src_region_x;
		    }
		  else
		    {
		       lin2_ptr[x] = (((x + dst_clip_x - dst_region_x) *
				       src_region_w) / dst_region_w);
		       lin2_ptr[x] /= divx;
		       interp_x[x] = 0;
		       lin2_ptr[x] += src_region_x;
		    }
	       }
	     for (y = 0; y < dst_clip_h; y++)
	       {
		  int pos;

		  if (src_region_h > 1)
		    {
		       pos = (((y + dst_clip_y - dst_region_y) *
			       (src_region_h - 1)) / dst_region_h);
		       pos /= divy;
		       row2_ptr[y] = src2_data + ((pos + src_region_y) * srf2->w);
		       interp_y[y] = (((((y + dst_clip_y - dst_region_y) *
					 (src_region_h - 1)) << 8) / dst_region_h) / divy) -
			 (pos << 8);
		    }
		  else
		    {
		       pos = (((y + dst_clip_y - dst_region_y) *
			       src_region_h) / dst_region_h);
		       pos /= divy;
		       row2_ptr[y] = src2_data + ((pos + src_region_y) * srf2->w);
		       interp_y[y] = 0;
		    }
	       }
	     /* lin2 & row2 ptr for higher mipmap - need interp too */
	     /* ... */
	     /* scale to dst */
	     dptr = dst_ptr;

	     if (src->flags & RGBA_IMAGE_HAS_ALPHA)
	       {
		  for (y = 0; y < dst_clip_h; y++)
		    {
		       dst_ptr = buf;
		       for (x = 0; x < dst_clip_w; x++)
			 {
			    DATA32 hi_col;
			    DATA32 lo_col;

			    /* lowest (highest res) mipmap level - just pick it */
			    ptr = row_ptr[y] + lin_ptr[x];
			    hi_col = *ptr;
			    /* higher mipmap level - need to use interp */
			    /* lowest (highest res) mipmap level - just pick it */
			    ptr = row2_ptr[y] + lin2_ptr[x];
			    ptr2 = row2_ptr[y] + lin2_ptr[x] + 1;
			    ptr3 = row2_ptr[y] + lin2_ptr[x] + srf2->w;
			    ptr4 = row2_ptr[y] + lin2_ptr[x] + srf2->w + 1;

			    /* higher mipmap level - need to use interp */
			    INTERP_VAL(A_VAL(&lo_col),
				       A_VAL(ptr), A_VAL(ptr2),
				       A_VAL(ptr3), A_VAL(ptr4),
				       interp_x[x], interp_y[y]);
			    if (A_VAL(&hi_col) | A_VAL(&lo_col))
			      {
				 INTERP_VAL(R_VAL(&lo_col),
					    R_VAL(ptr), R_VAL(ptr2),
					    R_VAL(ptr3), R_VAL(ptr4),
					    interp_x[x], interp_y[y]);
				 INTERP_VAL(G_VAL(&lo_col),
					    G_VAL(ptr), G_VAL(ptr2),
					    G_VAL(ptr3), G_VAL(ptr4),
					    interp_x[x], interp_y[y]);
				 INTERP_VAL(B_VAL(&lo_col),
					    B_VAL(ptr), B_VAL(ptr2),
					    B_VAL(ptr3), B_VAL(ptr4),
					    interp_x[x], interp_y[y]);

				 /* blend pixel 1 and 2, weighted by mix and (256 - mix) */
				 R_VAL(dst_ptr) = ((R_VAL(&hi_col) * mix) + (R_VAL(&lo_col) * (256 - mix))) >> 8;
				 G_VAL(dst_ptr) = ((G_VAL(&hi_col) * mix) + (G_VAL(&lo_col) * (256 - mix))) >> 8;
				 B_VAL(dst_ptr) = ((B_VAL(&hi_col) * mix) + (B_VAL(&lo_col) * (256 - mix))) >> 8;
				 A_VAL(dst_ptr) = ((A_VAL(&hi_col) * mix) + (A_VAL(&lo_col) * (256 - mix))) >> 8;
			      }
			    else
			      {
				 A_VAL(dst_ptr) = 0;
			      }
			    dst_ptr++;
			 }
		       /* * blend here [clip_w *] buf -> dptr * */
		       if (dc->mod.use)
			 {
			    func_cmod(buf, dptr, dst_clip_w, dc->mod.r, dc->mod.g, dc->mod.b, dc->mod.a);
			 }
		       else if (dc->mul.use)
			 {
			    func_mul(buf, dptr, dst_clip_w, dc->mul.col);
			 }
		       else
			 {
			    func(buf, dptr, dst_clip_w);
			 }
		       dptr += dst_w;
		    }
	       }
	     else
	       {
		  for (y = 0; y < dst_clip_h; y++)
		    {
		       dst_ptr = buf;
		       for (x = 0; x < dst_clip_w; x++)
			 {
			    DATA32 hi_col;
			    DATA32 lo_col;

			    /* lowest (highest res) mipmap level - just pick it */
			    ptr = row_ptr[y] + lin_ptr[x];
			    hi_col = *ptr;
			    /* higher mipmap level - need to use interp */
			    /* lowest (highest res) mipmap level - just pick it */
			    ptr = row2_ptr[y] + lin2_ptr[x];
			    ptr2 = row2_ptr[y] + lin2_ptr[x] + 1;
			    ptr3 = row2_ptr[y] + lin2_ptr[x] + srf2->w;
			    ptr4 = row2_ptr[y] + lin2_ptr[x] + srf2->w + 1;

			    /* higher mipmap level - need to use interp */
			    INTERP_VAL(R_VAL(&lo_col),
				       R_VAL(ptr), R_VAL(ptr2),
				       R_VAL(ptr3), R_VAL(ptr4),
				       interp_x[x], interp_y[y]);
			    INTERP_VAL(G_VAL(&lo_col),
				       G_VAL(ptr), G_VAL(ptr2),
				       G_VAL(ptr3), G_VAL(ptr4),
				       interp_x[x], interp_y[y]);
			    INTERP_VAL(B_VAL(&lo_col),
				       B_VAL(ptr), B_VAL(ptr2),
				       B_VAL(ptr3), B_VAL(ptr4),
				       interp_x[x], interp_y[y]);

			    /* blend pixel 1 and 2, weighted by mix and (256 - mix) */
			    R_VAL(dst_ptr) = ((R_VAL(&hi_col) * mix) + (R_VAL(&lo_col) * (256 - mix))) >> 8;
			    G_VAL(dst_ptr) = ((G_VAL(&hi_col) * mix) + (G_VAL(&lo_col) * (256 - mix))) >> 8;
			    B_VAL(dst_ptr) = ((B_VAL(&hi_col) * mix) + (B_VAL(&lo_col) * (256 - mix))) >> 8;
			    A_VAL(dst_ptr) = 0xff;
			    dst_ptr++;
			 }
		       /* * blend here [clip_w *] buf -> dptr * */
		       if (dc->mod.use)
			 {
			    func_cmod(buf, dptr, dst_clip_w, dc->mod.r, dc->mod.g, dc->mod.b, dc->mod.a);
			 }
		       else if (dc->mul.use)
			 {
			    func_mul(buf, dptr, dst_clip_w, dc->mul.col);
			 }
		       else
			 {
			    func(buf, dptr, dst_clip_w);
			 }
		       dptr += dst_w;
		    }
	       }
	  }
#else
	  /* scaling down... funkiness */
	  {
#if 1
#define RGBA_COMPOSE(r, g, b, a)  ((a) << 24) | ((r) << 16) | ((g) << 8) | (b)
#define INV_XAP                   (256 - xapoints[dxx + x])
#define XAP                       (xapoints[dxx + x])
#define INV_YAP                   (256 - yapoints[dyy + y])
#define YAP                       (yapoints[dyy + y])
	     /* NEW scaling code... */
	     DATA32 *sptr;
	     int x, y, dyy;
	     DATA32 **ypoints;
	     int *xpoints;
	     int *xapoints;
	     int *yapoints;
	     DATA32 *src_data;

	     src_data = src->image->data;

	     xpoints = scale_calc_x_points(src_region_w, dst_region_w);
	     ypoints = scale_calc_y_points(src_data, src->image->w, src_region_h, dst_region_h);
	     xapoints = scale_calc_a_points(src_region_w, dst_region_w);
	     yapoints = scale_calc_a_points(src_region_h, dst_region_h);
	     /* scaling down vertically */
	     if ((dst_region_w >= src_region_w) &&
		 (dst_region_h <  src_region_h))
	       {
		  int Cy, j;
		  DATA32 *pix;
		  int r, g, b, a, rr, gg, bb, aa;
		  int yap;
		  int sow;
		  int dyy, dxx;

		  dptr = dst_ptr;
		  sow = src->image->w;
		  dyy = dst_clip_y - dst_region_y;
		  dxx = dst_clip_x - dst_region_x;
		  if (src->flags & RGBA_IMAGE_HAS_ALPHA)
		    {
		       for (y = 0; y < dst_clip_h; y++)
			 {
			    Cy = YAP >> 16;
			    yap = YAP & 0xffff;
			    for (x = 0; x < dst_clip_w; x++)
			      {
				 pix = ypoints[dyy + y] + xpoints[dxx + x] + (src_region_y * sow) + src_region_x;
				 r = (R_VAL(pix) * yap) >> 10;
				 g = (G_VAL(pix) * yap) >> 10;
				 b = (B_VAL(pix) * yap) >> 10;
				 a = (A_VAL(pix) * yap) >> 10;
				 for (j = (1 << 14) - yap; j > Cy; j -= Cy)
				   {
				      pix += sow;
				      r += (R_VAL(pix) * Cy) >> 10;
				      g += (G_VAL(pix) * Cy) >> 10;
				      b += (B_VAL(pix) * Cy) >> 10;
				      a += (A_VAL(pix) * Cy) >> 10;
				   }
				 if (j > 0)
				   {
				      pix += sow;
				      r += (R_VAL(pix) * j) >> 10;
				      g += (G_VAL(pix) * j) >> 10;
				      b += (B_VAL(pix) * j) >> 10;
				      a += (A_VAL(pix) * j) >> 10;
				   }
				 if (XAP > 0)
				   {
				      pix = ypoints[dyy + y] + xpoints[dxx + x] + 1 + (src_region_y * sow) + src_region_x;
				      rr = (R_VAL(pix) * yap) >> 10;
				      gg = (G_VAL(pix) * yap) >> 10;
				      bb = (B_VAL(pix) * yap) >> 10;
				      aa = (A_VAL(pix) * yap) >> 10;
				      for (j = (1 << 14) - yap; j > Cy; j -= Cy)
					{
					   pix += sow;
					   rr += (R_VAL(pix) * Cy) >> 10;
					   gg += (G_VAL(pix) * Cy) >> 10;
					   bb += (B_VAL(pix) * Cy) >> 10;
					   aa += (A_VAL(pix) * Cy) >> 10;
					}
				      if (j > 0)
					{
					   pix += sow;
					   rr += (R_VAL(pix) * j) >> 10;
					   gg += (G_VAL(pix) * j) >> 10;
					   bb += (B_VAL(pix) * j) >> 10;
					   aa += (A_VAL(pix) * j) >> 10;
					}
				      r = r * INV_XAP;
				      g = g * INV_XAP;
				      b = b * INV_XAP;
				      a = a * INV_XAP;
				      r = (r + ((rr * XAP))) >> 12;
				      g = (g + ((gg * XAP))) >> 12;
				      b = (b + ((bb * XAP))) >> 12;
				      a = (a + ((aa * XAP))) >> 12;
				   }
				 else
				   {
				      r >>= 4;
				      g >>= 4;
				      b >>= 4;
				      a >>= 4;
				   }
				 buf[x] = RGBA_COMPOSE(r, g, b, a);
			      }
			    if (dc->mod.use)
			      func_cmod(buf, dptr, dst_clip_w, dc->mod.r, dc->mod.g, dc->mod.b, dc->mod.a);
			    else if (dc->mul.use)
			      func_mul(buf, dptr, dst_clip_w, dc->mul.col);
			    else
			      func(buf, dptr, dst_clip_w);
			    dptr += dst_w;
			 }
		    }
		  else
		    {
		       for (y = 0; y < dst_clip_h; y++)
			 {
			    Cy = YAP >> 16;
			    yap = YAP & 0xffff;
			    for (x = 0; x < dst_clip_w; x++)
			      {
				 pix = ypoints[dyy + y] + xpoints[dxx + x] + (src_region_y * sow) + src_region_x;
				 r = (R_VAL(pix) * yap) >> 10;
				 g = (G_VAL(pix) * yap) >> 10;
				 b = (B_VAL(pix) * yap) >> 10;
				 for (j = (1 << 14) - yap; j > Cy; j -= Cy)
				   {
				      pix += sow;
				      r += (R_VAL(pix) * Cy) >> 10;
				      g += (G_VAL(pix) * Cy) >> 10;
				      b += (B_VAL(pix) * Cy) >> 10;
				   }
				 if (j > 0)
				   {
				      pix += sow;
				      r += (R_VAL(pix) * j) >> 10;
				      g += (G_VAL(pix) * j) >> 10;
				      b += (B_VAL(pix) * j) >> 10;
				   }
				 if (XAP > 0)
				   {
				      pix = ypoints[dyy + y] + xpoints[dxx + x] + 1 + (src_region_y * sow) + src_region_x;
				      rr = (R_VAL(pix) * yap) >> 10;
				      gg = (G_VAL(pix) * yap) >> 10;
				      bb = (B_VAL(pix) * yap) >> 10;
				      for (j = (1 << 14) - yap; j > Cy; j -= Cy)
					{
					   pix += sow;
					   rr += (R_VAL(pix) * Cy) >> 10;
					   gg += (G_VAL(pix) * Cy) >> 10;
					   bb += (B_VAL(pix) * Cy) >> 10;
					}
				      if (j > 0)
					{
					   pix += sow;
					   rr += (R_VAL(pix) * j) >> 10;
					   gg += (G_VAL(pix) * j) >> 10;
					   bb += (B_VAL(pix) * j) >> 10;
					}
				      r = r * INV_XAP;
				      g = g * INV_XAP;
				      b = b * INV_XAP;
				      r = (r + ((rr * XAP))) >> 12;
				      g = (g + ((gg * XAP))) >> 12;
				      b = (b + ((bb * XAP))) >> 12;
				   }
				 else
				   {
				      r >>= 4;
				      g >>= 4;
				      b >>= 4;
				   }
				 buf[x] = RGBA_COMPOSE(r, g, b, 0xff);
			      }
			    if (dc->mod.use)
			      func_cmod(buf, dptr, dst_clip_w, dc->mod.r, dc->mod.g, dc->mod.b, dc->mod.a);
			    else if (dc->mul.use)
			      func_mul(buf, dptr, dst_clip_w, dc->mul.col);
			    else
			      func(buf, dptr, dst_clip_w);
			    dptr += dst_w;
			 }
		    }
	       }
	     /* scaling down horizontally */
	     else if ((dst_region_w < src_region_w) &&
		      (dst_region_h >=  src_region_h))
	       {
		  int Cx, j;
		  DATA32 *pix;
		  int r, g, b, a, rr, gg, bb, aa;
		  int xap;
		  int sow;
		  int dyy, dxx;

		  dptr = dst_ptr;
		  sow = src->image->w;
		  dyy = dst_clip_y - dst_region_y;
		  dxx = dst_clip_x - dst_region_x;
		  if (src->flags & RGBA_IMAGE_HAS_ALPHA)
		    {
		       for (y = 0; y < dst_clip_h; y++)
			 {
			    for (x = 0; x < dst_clip_w; x++)
			      {
				 Cx = XAP >> 16;
				 xap = XAP & 0xffff;

				 pix = ypoints[dyy + y] + xpoints[dxx + x] + (src_region_y * sow) + src_region_x;
				 r = (R_VAL(pix) * xap) >> 10;
				 g = (G_VAL(pix) * xap) >> 10;
				 b = (B_VAL(pix) * xap) >> 10;
				 a = (A_VAL(pix) * xap) >> 10;
				 for (j = (1 << 14) - xap; j > Cx; j -= Cx)
				   {
				      pix++;
				      r += (R_VAL(pix) * Cx) >> 10;
				      g += (G_VAL(pix) * Cx) >> 10;
				      b += (B_VAL(pix) * Cx) >> 10;
				      a += (A_VAL(pix) * Cx) >> 10;
				   }
				 if (j > 0)
				   {
				      pix++;
				      r += (R_VAL(pix) * j) >> 10;
				      g += (G_VAL(pix) * j) >> 10;
				      b += (B_VAL(pix) * j) >> 10;
				      a += (A_VAL(pix) * j) >> 10;
				   }
				 if (YAP > 0)
				   {
				      pix = ypoints[dyy + y] + xpoints[dxx + x] + sow + (src_region_y * sow) + src_region_x;
				      rr = (R_VAL(pix) * xap) >> 10;
				      gg = (G_VAL(pix) * xap) >> 10;
				      bb = (B_VAL(pix) * xap) >> 10;
				      aa = (A_VAL(pix) * xap) >> 10;
				      for (j = (1 << 14) - xap; j > Cx; j -= Cx)
					{
					   pix++;
					   rr += (R_VAL(pix) * Cx) >> 10;
					   gg += (G_VAL(pix) * Cx) >> 10;
					   bb += (B_VAL(pix) * Cx) >> 10;
					   aa += (A_VAL(pix) * Cx) >> 10;
					}
				      if (j > 0)
					{
					   pix++;
					   rr += (R_VAL(pix) * j) >> 10;
					   gg += (G_VAL(pix) * j) >> 10;
					   bb += (B_VAL(pix) * j) >> 10;
					   aa += (A_VAL(pix) * j) >> 10;
					}
				      r = r * INV_YAP;
				      g = g * INV_YAP;
				      b = b * INV_YAP;
				      a = a * INV_YAP;
				      r = (r + ((rr * YAP))) >> 12;
				      g = (g + ((gg * YAP))) >> 12;
				      b = (b + ((bb * YAP))) >> 12;
				      a = (a + ((aa * YAP))) >> 12;
				   }
				 else
				   {
				      r >>= 4;
				      g >>= 4;
				      b >>= 4;
				      a >>= 4;
				   }
				 buf[x] = RGBA_COMPOSE(r, g, b, a);
			      }
			    if (dc->mod.use)
			      func_cmod(buf, dptr, dst_clip_w, dc->mod.r, dc->mod.g, dc->mod.b, dc->mod.a);
			    else if (dc->mul.use)
			      func_mul(buf, dptr, dst_clip_w, dc->mul.col);
			    else
			      func(buf, dptr, dst_clip_w);
			    dptr += dst_w;
			 }
		    }
		  else
		    {
		       for (y = 0; y < dst_clip_h; y++)
			 {
			    for (x = 0; x < dst_clip_w; x++)
			      {
				 Cx = XAP >> 16;
				 xap = XAP & 0xffff;

				 pix = ypoints[dyy + y] + xpoints[dxx + x] + (src_region_y * sow) + src_region_x;
				 r = (R_VAL(pix) * xap) >> 10;
				 g = (G_VAL(pix) * xap) >> 10;
				 b = (B_VAL(pix) * xap) >> 10;
				 for (j = (1 << 14) - xap; j > Cx; j -= Cx)
				   {
				      pix++;
				      r += (R_VAL(pix) * Cx) >> 10;
				      g += (G_VAL(pix) * Cx) >> 10;
				      b += (B_VAL(pix) * Cx) >> 10;
				   }
				 if (j > 0)
				   {
				      pix++;
				      r += (R_VAL(pix) * j) >> 10;
				      g += (G_VAL(pix) * j) >> 10;
				      b += (B_VAL(pix) * j) >> 10;
				   }
				 if (YAP > 0)
				   {
				      pix = ypoints[dyy + y] + xpoints[dxx + x] + sow + (src_region_y * sow) + src_region_x;
				      rr = (R_VAL(pix) * xap) >> 10;
				      gg = (G_VAL(pix) * xap) >> 10;
				      bb = (B_VAL(pix) * xap) >> 10;
				      for (j = (1 << 14) - xap; j > Cx; j -= Cx)
					{
					   pix++;
					   rr += (R_VAL(pix) * Cx) >> 10;
					   gg += (G_VAL(pix) * Cx) >> 10;
					   bb += (B_VAL(pix) * Cx) >> 10;
					}
				      if (j > 0)
					{
					   pix++;
					   rr += (R_VAL(pix) * j) >> 10;
					   gg += (G_VAL(pix) * j) >> 10;
					   bb += (B_VAL(pix) * j) >> 10;
					}
				      r = r * INV_YAP;
				      g = g * INV_YAP;
				      b = b * INV_YAP;
				      r = (r + ((rr * YAP))) >> 12;
				      g = (g + ((gg * YAP))) >> 12;
				      b = (b + ((bb * YAP))) >> 12;
				   }
				 else
				   {
				      r >>= 4;
				      g >>= 4;
				      b >>= 4;
				   }
				 buf[x] = RGBA_COMPOSE(r, g, b, 0xff);
			      }
			    if (dc->mod.use)
			      func_cmod(buf, dptr, dst_clip_w, dc->mod.r, dc->mod.g, dc->mod.b, dc->mod.a);
			    else if (dc->mul.use)
			      func_mul(buf, dptr, dst_clip_w, dc->mul.col);
			    else
			      func(buf, dptr, dst_clip_w);
			    dptr += dst_w;
			 }
		    }
	       }
	     /* scaling down both vertically & horizontally */
	     else if ((dst_region_w < src_region_w) &&
		      (dst_region_h <  src_region_h))
	       {
		  int Cx, Cy, i, j;
		  DATA32 *pix;
		  int a, r, g, b, rx, gx, bx, ax;
		  int xap, yap;
		  int sow;
		  int dyy, dxx;

		  dptr = dst_ptr;
		  sow = src->image->w;
		  dyy = dst_clip_y - dst_region_y;
		  dxx = dst_clip_x - dst_region_x;
//#ifndef SCALE_USING_MMX
#if 1
		  if (src->flags & RGBA_IMAGE_HAS_ALPHA)
		    {
		       for (y = 0; y < dst_clip_h; y++)
			 {
			    Cy = YAP >> 16;
			    yap = YAP & 0xffff;

			    for (x = 0; x < dst_clip_w; x++)
			      {
				 Cx = XAP >> 16;
				 xap = XAP & 0xffff;

				 sptr = ypoints[dyy + y] + xpoints[dxx + x] + (src_region_y * sow) + src_region_x;
				 pix = sptr;
				 sptr += sow;
				 rx = (R_VAL(pix) * xap) >> 9;
				 gx = (G_VAL(pix) * xap) >> 9;
				 bx = (B_VAL(pix) * xap) >> 9;
				 ax = (A_VAL(pix) * xap) >> 9;
				 pix++;
				 for (i = (1 << 14) - xap; i > Cx; i -= Cx)
				   {
				      rx += (R_VAL(pix) * Cx) >> 9;
				      gx += (G_VAL(pix) * Cx) >> 9;
				      bx += (B_VAL(pix) * Cx) >> 9;
				      ax += (A_VAL(pix) * Cx) >> 9;
				      pix++;
				   }
				 if (i > 0)
				   {
				      rx += (R_VAL(pix) * i) >> 9;
				      gx += (G_VAL(pix) * i) >> 9;
				      bx += (B_VAL(pix) * i) >> 9;
				      ax += (A_VAL(pix) * i) >> 9;
				   }

				 r = (rx * yap) >> 14;
				 g = (gx * yap) >> 14;
				 b = (bx * yap) >> 14;
				 a = (ax * yap) >> 14;

				 for (j = (1 << 14) - yap; j > Cy; j -= Cy)
				   {
				      pix = sptr;
				      sptr += sow;
				      rx = (R_VAL(pix) * xap) >> 9;
				      gx = (G_VAL(pix) * xap) >> 9;
				      bx = (B_VAL(pix) * xap) >> 9;
				      ax = (A_VAL(pix) * xap) >> 9;
				      pix++;
				      for (i = (1 << 14) - xap; i > Cx; i -= Cx)
					{
					   rx += (R_VAL(pix) * Cx) >> 9;
					   gx += (G_VAL(pix) * Cx) >> 9;
					   bx += (B_VAL(pix) * Cx) >> 9;
					   ax += (A_VAL(pix) * Cx) >> 9;
					   pix++;
					}
				      if (i > 0)
					{
					   rx += (R_VAL(pix) * i) >> 9;
					   gx += (G_VAL(pix) * i) >> 9;
					   bx += (B_VAL(pix) * i) >> 9;
					   ax += (A_VAL(pix) * i) >> 9;
					}

				      r += (rx * Cy) >> 14;
				      g += (gx * Cy) >> 14;
				      b += (bx * Cy) >> 14;
				      a += (ax * Cy) >> 14;
				   }
				 if (j > 0)
				   {
				      pix = sptr;
				      sptr += sow;
				      rx = (R_VAL(pix) * xap) >> 9;
				      gx = (G_VAL(pix) * xap) >> 9;
				      bx = (B_VAL(pix) * xap) >> 9;
				      ax = (A_VAL(pix) * xap) >> 9;
				      pix++;
				      for (i = (1 << 14) - xap; i > Cx; i -= Cx)
					{
					   rx += (R_VAL(pix) * Cx) >> 9;
					   gx += (G_VAL(pix) * Cx) >> 9;
					   bx += (B_VAL(pix) * Cx) >> 9;
					   ax += (A_VAL(pix) * Cx) >> 9;
					   pix++;
					}
				      if (i > 0)
					{
					   rx += (R_VAL(pix) * i) >> 9;
					   gx += (G_VAL(pix) * i) >> 9;
					   bx += (B_VAL(pix) * i) >> 9;
					   ax += (A_VAL(pix) * i) >> 9;
					}

				      r += (rx * j) >> 14;
				      g += (gx * j) >> 14;
				      b += (bx * j) >> 14;
				      a += (ax * j) >> 14;
				   }
				 buf[x] = RGBA_COMPOSE(r >> 5, g >> 5, b >> 5, a >> 5);
			      }
			    if (dc->mod.use)
			      func_cmod(buf, dptr, dst_clip_w, dc->mod.r, dc->mod.g, dc->mod.b, dc->mod.a);
			    else if (dc->mul.use)
			      func_mul(buf, dptr, dst_clip_w, dc->mul.col);
			    else
			      func(buf, dptr, dst_clip_w);
			    dptr += dst_w;
			 }
		    }
		  else
		    {
		       for (y = 0; y < dst_clip_h; y++)
			 {
			    Cy = YAP >> 16;
			    yap = YAP & 0xffff;

			    for (x = 0; x < dst_clip_w; x++)
			      {
				 Cx = XAP >> 16;
				 xap = XAP & 0xffff;

				 sptr = ypoints[dyy + y] + xpoints[dxx + x] + (src_region_y * sow) + src_region_x;
				 pix = sptr;
				 sptr += sow;
				 rx = (R_VAL(pix) * xap) >> 9;
				 gx = (G_VAL(pix) * xap) >> 9;
				 bx = (B_VAL(pix) * xap) >> 9;
				 pix++;
				 for (i = (1 << 14) - xap; i > Cx; i -= Cx)
				   {
				      rx += (R_VAL(pix) * Cx) >> 9;
				      gx += (G_VAL(pix) * Cx) >> 9;
				      bx += (B_VAL(pix) * Cx) >> 9;
				      pix++;
				   }
				 if (i > 0)
				   {
				      rx += (R_VAL(pix) * i) >> 9;
				      gx += (G_VAL(pix) * i) >> 9;
				      bx += (B_VAL(pix) * i) >> 9;
				   }

				 r = (rx * yap) >> 14;
				 g = (gx * yap) >> 14;
				 b = (bx * yap) >> 14;

				 for (j = (1 << 14) - yap; j > Cy; j -= Cy)
				   {
				      pix = sptr;
				      sptr += sow;
				      rx = (R_VAL(pix) * xap) >> 9;
				      gx = (G_VAL(pix) * xap) >> 9;
				      bx = (B_VAL(pix) * xap) >> 9;
				      pix++;
				      for (i = (1 << 14) - xap; i > Cx; i -= Cx)
					{
					   rx += (R_VAL(pix) * Cx) >> 9;
					   gx += (G_VAL(pix) * Cx) >> 9;
					   bx += (B_VAL(pix) * Cx) >> 9;
					   pix++;
					}
				      if (i > 0)
					{
					   rx += (R_VAL(pix) * i) >> 9;
					   gx += (G_VAL(pix) * i) >> 9;
					   bx += (B_VAL(pix) * i) >> 9;
					}

				      r += (rx * Cy) >> 14;
				      g += (gx * Cy) >> 14;
				      b += (bx * Cy) >> 14;
				   }
				 if (j > 0)
				   {
				      pix = sptr;
				      sptr += sow;
				      rx = (R_VAL(pix) * xap) >> 9;
				      gx = (G_VAL(pix) * xap) >> 9;
				      bx = (B_VAL(pix) * xap) >> 9;
				      pix++;
				      for (i = (1 << 14) - xap; i > Cx; i -= Cx)
					{
					   rx += (R_VAL(pix) * Cx) >> 9;
					   gx += (G_VAL(pix) * Cx) >> 9;
					   bx += (B_VAL(pix) * Cx) >> 9;
					   pix++;
					}
				      if (i > 0)
					{
					   rx += (R_VAL(pix) * i) >> 9;
					   gx += (G_VAL(pix) * i) >> 9;
					   bx += (B_VAL(pix) * i) >> 9;
					}

				      r += (rx * j) >> 14;
				      g += (gx * j) >> 14;
				      b += (bx * j) >> 14;
				   }
				 buf[x] = RGBA_COMPOSE(r >> 5, g >> 5, b >> 5, 0xff);
			      }
			    if (dc->mod.use)
			      func_cmod(buf, dptr, dst_clip_w, dc->mod.r, dc->mod.g, dc->mod.b, dc->mod.a);
			    else if (dc->mul.use)
			      func_mul(buf, dptr, dst_clip_w, dc->mul.col);
			    else
			      func(buf, dptr, dst_clip_w);
			    dptr += dst_w;
			 }
		    }
#else
#endif
	       }
	     free(xpoints);
	     free(ypoints);
	     free(xapoints);
	     free(yapoints);
#else
	     DATA32       *src_data;
	     RGBA_Surface *srf;

	     /* no mipmaps at all - need to populate the mipmap table */
	     if (src->mipmaps.num == 0)
	       {
		  evas_common_scale_rgba_gen_mipmap_table(src);
	       }
	     /* calculate mix level between mipmaps */
	     srf = evas_common_scale_rgba_get_mipmap_surface(src,
						 src_region_w, src_region_h,
						 dst_region_w, dst_region_h);
	     /* get mipmap level */
	     src_data = srf->data; /* high res surface (sample) */
	     /* lin & row ptr for lowest mipmap (hi res) */
	     divx = src->image->w / srf->w;
	     divy = src->image->h / srf->h;

	     /* merely sampled pixels in the hi-res mipmap  and  interp no's */
	     for (x = 0; x < dst_clip_w; x++)
	       {
		  unsigned int val, val2;

		  val = (((x + dst_clip_x - dst_region_x) * src_region_w * 256) / (dst_region_w * divx));
		  val2 = (((x + 1 + dst_clip_x - dst_region_x) * src_region_w * 256) / (dst_region_w * divx));
		  lin_ptr[x] = val >> 8;
		  if (src->image->w < 2)
		    {
		       sample_x[(x * 3) + 0] = 256;
		       sample_x[(x * 3) + 1] = 0;
		       sample_x[(x * 3) + 2] = 0;
		    }
		  if (!(val & 0xff))
		    /* start of pixel scan on src pixel boundary */
		    {
		       sample_x[(x * 3) + 0] = 256;
		       sample_x[(x * 3) + 1] = val2 - val - 256;
		       sample_x[(x * 3) + 2] = 0;
		    }
		  else
		    /* start pixel scan in middle of src pixel */
		    {
		       sample_x[(x * 3) + 0] = (val & 0xffffff00) + 256 - val;
		       if (val2 >= ((val & 0xffffff00) + 512))
			 {
			    sample_x[(x * 3) + 1] = 256;
			    sample_x[(x * 3) + 2] = val2 - ((val & 0xffffff00) + 512);
			 }
		       else
			 {
			    sample_x[(x * 3) + 1] = val2 - ((val & 0xffffff00) + 256);
			    sample_x[(x * 3) + 2] = 0;
			 }
		    }
		  interp_x[x] =
		    sample_x[(x * 3) + 0] +
		    sample_x[(x * 3) + 1] +
		    sample_x[(x * 3) + 2];
		  sample_x[(x * 3) + 0] = (sample_x[(x * 3) + 0] * 256) / interp_x[x];
		  sample_x[(x * 3) + 1] = (sample_x[(x * 3) + 1] * 256) / interp_x[x];
		  sample_x[(x * 3) + 2] = (sample_x[(x * 3) + 2] * 256) / interp_x[x];
		  if      (sample_x[(x * 3) + 2]) iterate_x[x] = 3;
		  else if (sample_x[(x * 3) + 1]) iterate_x[x] = 2;
		  else                            iterate_x[x] = 1;
	       }
	     for (y = 0; y < dst_clip_h; y++)
	       {
		  unsigned int val, val2;

		  val = (((y + dst_clip_y - dst_region_y) * src_region_h * 256) / (dst_region_h * divy));
		  val2 = (((y + 1 + dst_clip_y - dst_region_y) * src_region_h * 256) / (dst_region_h * divy));
		  row_ptr[y] = src_data + ((val >> 8) * srf->w);
		  if (src->image->h < 2)
		    {
		       sample_y[(y * 3) + 0] = 256;
		       sample_y[(y * 3) + 1] = 0;
		       sample_y[(y * 3) + 2] = 0;
		    }
		  else if (!(val & 0xff))
		    /* start of pixel scan on src pixel boundary */
		    {
		       sample_y[(y * 3) + 0] = 256;
		       sample_y[(y * 3) + 1] = val2 - val - 256;
		       sample_y[(y * 3) + 2] = 0;
		    }
		  else
		    /* start pixel scan in middle of src pixel */
		    {
		       sample_y[(y * 3) + 0] = (val & 0xffffff00) + 256 - val;
		       if (val2 >= ((val & 0xffffff00) + 512))
			 {
			    sample_y[(y * 3) + 1] = 256;
			    sample_y[(y * 3) + 2] = val2 - ((val & 0xffffff00) + 512);
			 }
		       else
			 {
			    sample_y[(y * 3) + 1] = val2 - ((val & 0xffffff00) + 256);
			    sample_y[(y * 3) + 2] = 0;
			 }
		    }
		  interp_y[y] =
		    sample_y[(y * 3) + 0] +
		    sample_y[(y * 3) + 1] +
		    sample_y[(y * 3) + 2];
		  sample_y[(y * 3) + 0] = (sample_y[(y * 3) + 0] * 256) / interp_y[y];
		  sample_y[(y * 3) + 1] = (sample_y[(y * 3) + 1] * 256) / interp_y[y];
		  sample_y[(y * 3) + 2] = (sample_y[(y * 3) + 2] * 256) / interp_y[y];
		  if      (sample_y[(y * 3) + 2]) iterate_y[y] = 3;
		  else if (sample_y[(y * 3) + 1]) iterate_y[y] = 2;
		  else                            iterate_y[y] = 1;
	       }
	     /* scale to dst */
	     dptr = dst_ptr;

	     if (src->flags & RGBA_IMAGE_HAS_ALPHA)
	       {
		  int srf_w_3;

		  srf_w_3 = srf->w - 3;
		  for (y = 0; y < dst_clip_h; y++)
		    {
		       int ity;

		       ity = iterate_y[y];
		       dst_ptr = buf;
		       for (x = 0; x < dst_clip_w; x++)
			 {
			    int x3, y3, i, j;
#ifndef SCALE_USING_MMX
			    DATA32 r, g, b, a;
#endif

			    x3 = x * 3;
			    y3 = y * 3;
			    ptr = row_ptr[y] + lin_ptr[x];
#ifndef SCALE_USING_MMX
			    r = g = b = a = 0;
			    for (j = 0; j < ity; j++)
			      {
				 int mj;
				 int itx;

				 itx = iterate_x[x];
				 mj = sample_y[y3 + j];
				 for (i = 0; i < itx; i++)
				   {
				      int multiplier;

				      multiplier = sample_x[x3 + i] * mj;
				      r += R_VAL(ptr) * multiplier;
				      g += G_VAL(ptr) * multiplier;
				      b += B_VAL(ptr) * multiplier;
				      a += A_VAL(ptr) * multiplier;
				      ptr++;
				   }
				 ptr += 3 - itx + srf_w_3;
			      }
			    if (a)
			      {
				 A_VAL(dst_ptr) = a >> 16;
				 R_VAL(dst_ptr) = r >> 16;
				 G_VAL(dst_ptr) = g >> 16;
				 B_VAL(dst_ptr) = b >> 16;
			      }
			    else
			      {
				 A_VAL(dst_ptr) = 0;
			      }
#else
			    pxor_r2r(mm0, mm0);
			    for (j = 0; j < ity; j++)
			      {
				 int mj;
				 int itx;

				 itx = iterate_x[x];
				 mj = sample_y[y3 + j];
				 for (i = 0; i < itx; i++)
				   {
				      int multiplier;

				      multiplier = (sample_x[x3 + i] * mj) >> 8;

				      punpcklbw_m2r(ptr[0], mm2);
				      psrlw_i2r(8, mm2);

				      movd_m2r(multiplier, mm3);
				      punpcklwd_r2r(mm3, mm3);
				      punpckldq_r2r(mm3, mm3);

				      pmullw_r2r(mm3, mm2);

				      paddw_r2r(mm2, mm0);
				      ptr++;
				   }
				 ptr += 3 - itx + srf_w_3;
			      }
			    psrlw_i2r(8, mm0);
			    packuswb_r2r(mm0, mm0);
			    movd_r2m(mm0, dst_ptr[0]);
#endif
			    dst_ptr++;
			 }
		       /* * blend here [clip_w *] buf -> dptr * */
		       if (dc->mod.use)
			 {
			    func_cmod(buf, dptr, dst_clip_w, dc->mod.r, dc->mod.g, dc->mod.b, dc->mod.a);
			 }
		       else if (dc->mul.use)
			 {
			    func_mul(buf, dptr, dst_clip_w, dc->mul.col);
			 }
		       else
			 {
			    func(buf, dptr, dst_clip_w);
			 }
		       dptr += dst_w;
		    }
	       }
	     else
	       {
                  int srf_w_3;

		  srf_w_3 = srf->w - 3;
		  for (y = 0; y < dst_clip_h; y++)
		    {
		       int ity;

		       ity = iterate_y[y];
		       dst_ptr = buf;
		       for (x = 0; x < dst_clip_w; x++)
			 {
			    int x3, y3, i, j;
#ifndef SCALE_USING_MMX
			    DATA32 r, g, b;
#endif

			    x3 = x * 3;
			    y3 = y * 3;
			    ptr = row_ptr[y] + lin_ptr[x];
#ifndef SCALE_USING_MMX
			    r = g = b = 0;
			    for (j = 0; j < ity; j++)
			      {
				 int mj;
				 int itx;

				 itx = iterate_x[x];
				 mj = sample_y[y3 + j];
				 for (i = 0; i < itx; i++)
				   {
				      int multiplier;

				      multiplier = sample_x[x3 + i] * mj;
				      if (multiplier)
					{
					   r += R_VAL(ptr) * multiplier;
					   g += G_VAL(ptr) * multiplier;
					   b += B_VAL(ptr) * multiplier;
					}
				      ptr++;
				   }
				 ptr += 3 - itx + srf_w_3;
			      }
			    R_VAL(dst_ptr) = r >> 8;
			    G_VAL(dst_ptr) = g >> 8;
			    B_VAL(dst_ptr) = b >> 8;
			    A_VAL(dst_ptr) = 0xff;
#else
			    pxor_r2r(mm0, mm0);
			    for (j = 0; j < ity; j++)
			      {
				 int mj;
				 int itx;

				 itx = iterate_x[x];
				 mj = sample_y[y3 + j];
				 for (i = 0; i < itx; i++)
				   {
				      int multiplier;

				      multiplier = (sample_x[x3 + i] * mj) >> 8;

				      punpcklbw_m2r(ptr[0], mm2);
				      psrlw_i2r(8, mm2);

				      movd_m2r(multiplier, mm3);
				      punpcklwd_r2r(mm3, mm3);
				      punpckldq_r2r(mm3, mm3);

				      pmullw_r2r(mm3, mm2);

				      paddw_r2r(mm2, mm0);
				      ptr++;
				   }
				 ptr += 3 - itx + srf_w_3;
			      }
			    psrlw_i2r(8, mm0);
			    packuswb_r2r(mm0, mm0);
			    movd_r2m(mm0, dst_ptr[0]);
#endif
			    dst_ptr++;
			 }
		       /* * blend here [clip_w *] buf -> dptr * */
		       if (dc->mod.use)
			 {
			    func_cmod(buf, dptr, dst_clip_w, dc->mod.r, dc->mod.g, dc->mod.b, dc->mod.a);
			 }
		       else if (dc->mul.use)
			 {
			    func_mul(buf, dptr, dst_clip_w, dc->mul.col);
			 }
		       else
			 {
			    func(buf, dptr, dst_clip_w);
			 }
		       dptr += dst_w;
		    }
	       }
#endif
	  }
#endif
	free(buf);
     }
   no_buf:
   /* free scale tables */
   free(iterate_y);
   no_iterate_y:
   free(iterate_x);
   no_iterate_x:
   free(sample_y);
   no_sample_y:
   free(sample_x);
   no_sample_x:
   free(interp_y);
   no_interp_y:
   free(interp_x);
   no_interp_x:
   free(row2_ptr);
   no_row2_ptr:
   free(lin2_ptr);
   no_lin2_ptr:
   free(row_ptr);
   no_row_ptr:
   free(lin_ptr);
//_WIN32_WCE
   no_lin_ptr: ;
}