evas - fonts - move to using 4bit and rle 4 bit compressed font glyphs

this changes the internal encoding of font glyphs in evas to use 4bit uncompressed if small, or 4bit rle (run length encoded) if larger. this saves at least 50% of memory on fonts - and more if bigger. with large fonts (40-80pixel size) we can save in the region of 80% of memory used for glyphs. this also happens to allow speedups in rendering too.
2014-01-13 05:13:00 +09:00 · 2014-01-13 05:13:00 +09:00 · 86a97efeea
parent f21b0ee6c3
commit 86a97efeea
12 changed files with 972 additions and 276 deletions
--- a/src/Makefile_Evas.am
+++ b/src/Makefile_Evas.am
@ -146,6 +146,7 @@ lib/evas/common/evas_font_draw.c \
 lib/evas/common/evas_font_load.c \
 lib/evas/common/evas_font_main.c \
 lib/evas/common/evas_font_query.c \
+lib/evas/common/evas_font_compress.c \
 lib/evas/common/evas_image_load.c \
 lib/evas/common/evas_image_save.c \
 lib/evas/common/evas_image_main.c \
@ -250,6 +251,7 @@ static_libs/libunibreak/ChangeLog
 # Engines

 EXTRA_DIST += \
+lib/evas/common/evas_font_compress_draw.c \
 lib/evas/common/evas_map_image_internal.c \
 lib/evas/common/evas_map_image_core.c \
 lib/evas/common/evas_map_image_loop.c \
--- a/src/bin/evas/evas_cserve2.h
+++ b/src/bin/evas/evas_cserve2.h
@ -196,8 +196,6 @@ struct _Slave_Msg_Glyph {
   unsigned int rows;
   unsigned int width;
   unsigned int pitch;
-   unsigned int num_grays;
-   unsigned int pixel_mode;
 };

 typedef struct _Slave_Msg_Glyph Slave_Msg_Glyph;
--- a/src/bin/evas/evas_cserve2_cache.c
+++ b/src/bin/evas/evas_cserve2_cache.c
@ -1933,10 +1933,6 @@ _glyphs_loaded_msg_create(Glyphs_Request *req, int *resp_size)
        buf += sizeof(int);
        memcpy(buf, &gldata->pitch, sizeof(int));
        buf += sizeof(int);
-        memcpy(buf, &gldata->num_grays, sizeof(int));
-        buf += sizeof(int);
-        memcpy(buf, &gldata->pixel_mode, sizeof(int));
-        buf += sizeof(int);
        memcpy(buf, &gldata->hint, sizeof(int));
        buf += sizeof(int);
     }
@ -2177,8 +2173,6 @@ _glyphs_load_request_response(Glyphs_Request *req,
             gldata->rows = msg->glyphs[j].rows;
             gldata->width = msg->glyphs[j].width;
             gldata->pitch = msg->glyphs[j].pitch;
-             gldata->num_grays = msg->glyphs[j].num_grays;
-             gldata->pixel_mode = msg->glyphs[j].pixel_mode;
             gldata->hint = hint;

             fe->nglyphs++;
--- a/src/bin/evas/evas_cserve2_fonts.c
+++ b/src/bin/evas/evas_cserve2_fonts.c
@ -313,6 +313,9 @@ _font_slave_glyph_load(Font_Info *fi, unsigned int idx, unsigned int hint)
   return EINA_TRUE;
 }

+// import the 1 func we need
+EAPI void *evas_common_font_glyph_compress(void *data, int num_grays, int pixel_mode, int pitch_data, int w, int h, int *size_ret);
+
 /* This function will render the glyph currently in the glyph slot into the
 * given Font Cache.
 */
@ -321,18 +324,32 @@ _font_slave_glyph_render(Font_Info *fi, Slave_Msg_Font_Glyphs_Loaded *response,
                         unsigned int idx)
 {
   Font_Source_Info *fsi = fi->fsi;
-   unsigned int glyphsize;
+   int glyphsize = 0;
   FT_Glyph glyph;
   FT_BitmapGlyph bglyph;
   char *data;
   int buffer_id = 0;
+   void *buf;

   FT_Get_Glyph(fsi->face->glyph, &glyph);
   FT_Glyph_To_Bitmap(&glyph, FT_RENDER_MODE_NORMAL, 0, 1);
   bglyph = (FT_BitmapGlyph)glyph;

+   if ((bglyph->bitmap.pitch < 1) || (bglyph->bitmap.rows < 1))
+     {
+        FT_Done_Glyph(glyph);
+        goto on_error;
+     }
   glyphsize = bglyph->bitmap.pitch * bglyph->bitmap.rows;
-   if (!glyphsize)
+
+   buf = evas_common_font_glyph_compress(bglyph->bitmap.buffer,
+                                         bglyph->bitmap.num_grays,
+                                         bglyph->bitmap.pixel_mode,
+                                         bglyph->bitmap.pitch,
+                                         bglyph->bitmap.width,
+                                         bglyph->bitmap.rows,
+                                         &glyphsize);
+   if (!buf)
     {
        FT_Done_Glyph(glyph);
        goto on_error;
@ -342,10 +359,12 @@ _font_slave_glyph_render(Font_Info *fi, Slave_Msg_Font_Glyphs_Loaded *response,
   data = cserve2_shared_mempool_buffer_get(response->mempool, buffer_id);
   if (!data)
     {
+        free(buf);
        FT_Done_Glyph(glyph);
        goto on_error;
     }
-   memcpy(data, bglyph->bitmap.buffer, glyphsize);
+   memcpy(data, buf, glyphsize);
+   free(buf);

   // TODO: Check if we have problems with alignment
   response->glyphs[response->nglyphs].index = idx;
@ -356,8 +375,6 @@ _font_slave_glyph_render(Font_Info *fi, Slave_Msg_Font_Glyphs_Loaded *response,
   response->glyphs[response->nglyphs].rows = bglyph->bitmap.rows;
   response->glyphs[response->nglyphs].width = bglyph->bitmap.width;
   response->glyphs[response->nglyphs].pitch = bglyph->bitmap.pitch;
-   response->glyphs[response->nglyphs].num_grays = bglyph->bitmap.num_grays;
-   response->glyphs[response->nglyphs].pixel_mode = bglyph->bitmap.pixel_mode;
   response->nglyphs++;

   FT_Done_Glyph(glyph);
--- a/src/lib/evas/common/evas_font.h
+++ b/src/lib/evas/common/evas_font.h
@ -82,6 +82,10 @@ EAPI int               evas_common_font_query_last_up_to_pos (RGBA_Font *fn, con
 EAPI int               evas_common_font_query_run_font_end_get(RGBA_Font *fn, RGBA_Font_Int **script_fi, RGBA_Font_Int **cur_fi, Evas_Script_Type script, const Eina_Unicode *text, int run_len);
 EAPI void              evas_common_font_ascent_descent_get(RGBA_Font *fn, const Evas_Text_Props *text_props, int *ascent, int *descent);

+EAPI void             *evas_common_font_glyph_compress(void *data, int num_grays, int pixel_mode, int pitch_data, int w, int h, int *size_ret);
+EAPI void              evas_common_font_glyph_draw(RGBA_Font_Glyph *fg, RGBA_Draw_Context *dc, DATA32 *dst, int dst_pitch, int x, int y, int cx, int cy, int cw, int ch);
+EAPI DATA8            *evas_common_font_glyph_uncompress(RGBA_Font_Glyph *fg, int *wret, int *hret);
+
 void evas_common_font_load_init(void);
 void evas_common_font_load_shutdown(void);

--- a/src/lib/evas/common/evas_font_compress.c
+++ b/src/lib/evas/common/evas_font_compress.c
@ -0,0 +1,523 @@
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <assert.h>
+
+#include "evas_common_private.h"
+#include "evas_private.h"
+
+#include "evas_font_private.h"
+
+#ifdef EVAS_CSERVE2
+# include "../cserve2/evas_cs2_private.h"
+#endif
+
+#include FT_OUTLINE_H
+#include FT_SYNTHESIS_H
+
+// XXX:
+// XXX: adapt cserve2 to this!
+// XXX:
+
+//--------------------------------------------------------------------------
+//- UTILS ------------------------------------------------------------------
+//--------------------------------------------------------------------------
+// Expand a 1bit-per-pixel glyph bitmap into an 8bit-per-pixel one so the
+// compressors only ever have to deal with a single input format.
+//
+// src   - packed 1bpp rows, most significant bit is the leftmost pixel
+// pitch - bytes between the start of consecutive source rows
+// w, h  - glyph size in pixels
+// dst   - receives w * h bytes, tightly packed (row stride is w); every
+//         pixel becomes 0x00 (bit clear) or 0xff (bit set)
+static void
+expand_bitmap(DATA8 *src, int pitch, int w, int h, DATA8 *dst)
+{
+   int row, col, bit, nbits;
+
+   for (row = 0; row < h; row++)
+     {
+        DATA8 *in = src + (row * pitch);
+        DATA8 *out = dst + (row * w);
+
+        // consume one source byte per group of up to 8 pixels
+        for (col = 0; col < w; col += 8, in++)
+          {
+             DATA8 packed = *in;
+
+             // the last byte of a row may carry fewer than 8 valid pixels
+             nbits = w - col;
+             if (nbits > 8) nbits = 8;
+             for (bit = 0; bit < nbits; bit++)
+               {
+                  // walk the byte from its MSB downwards
+                  if ((packed >> (7 - bit)) & 0x1) *out = 0xff;
+                  else *out = 0x00;
+                  out++;
+               }
+          }
+     }
+}
+
+
+
+
+//--------------------------------------------------------------------------
+//- RLE 4BIT ---------------------------------------------------------------
+//--------------------------------------------------------------------------
+
+// what is 4bit rle? it's 4 bit per pixel run-length encoding. this means
+// that every row of pixels is compressed into a separately defined list
+// of "runs" where every run is N pixels at value V. RLE works well for
+// things like fonts which have vast regions that are either empty or solid
+// with some transition (anti-alias) pixels in between. it could be that for
+// a black and white alternating pattern it will come out the worst possible
+// case, but this basically "never happens".
+//
+// data is encoded so it's faster to access and decompress at runtime. we have
+// both a blob of data that is the RLE encoded data for all rows which consist
+// of 1 byte per run, and also a jump table - per row telling us the byte
+// offset inside the RLE data blob where the row data begins. since we know
+// the offset of the next run, we know how many bytes each row is based on
+// this.
+//
+// since rle data may be small (less than 256 bytes) and in almost all cases
+// less than 64k, a jump table of 8 bits per entry is good for many uses, and
+// otherwise 16bits is used. it also supports 32bit jumptables but these are
+// there just in case the data goes beyond 64k - but is unlikely to ever
+// happen in real life. this means jumptables come in 3 formats thus have to
+// have 3 different handling paths. RLE data is the same so it's common code.
+//
+// each byte in the RLE section encodes a run of between 1 and 16 pixels in
+// length. there is no such thing as a run of 0 pixels. the upper 4 bits of
+// the byte encode the length, with 0 being 1 pixel, 1 being 2 pixels,
+// 2 being 3 pixels and so on up to 16 pixels (thus run length is actually
+// (byte >> 4) + 1). the lower 4 bits encode the 4 bit pixel value of the
+// whole run, from 0 to 15. it is accessed via masking (byte & 0xf). thus
+// every run in RLE consumes exactly 1 byte of memory nice and neatly.
+//
+// at the start before the jumptable is a 32bit (int) header. it just has a
+// value at the moment that indicates 0 for it not being RLE data (used by
+// the 4bit packed bitmap), 1 for 8bit jumptable RLE, 2 for 16bit jumptable
+// and 3 for 32bit jumptable. all other values are reserved
+//
+// so data looks like this when packed into a single blob in memory (where
+// xx is the data size of the jump table - 8, 16 or 32bit). there are n
+// lines of data in the jumptable matching to the height of the glyph where
+// n is the height in rows
+//
+// each jumptable row ACTUALLY indicates the byte offset of the NEXT line.
+// the FIRST row of RLE data is assumed to be at offset 0 in the RLE data
+// section, so a special case is used for this. note that jumptable values
+// are OFFSETS starting at 0 which is the first byte in the RLE data section
+//
+// [int] header (0, 1, 2 or  3)
+// [xx] jump table for line 0
+// [xx] jump table for line 1
+// [xx] jump table for line 2
+// ...
+// [xx] jump table for line n - 1
+// [char] first byte of RLE data (beginning of rle data)
+// [char] second byte of RLE data
+// ...
+// [char] last byte of RLE data
+// 
+// compress an 8bit-per-pixel glyph into the 4bit RLE blob layout: an int
+// header (1, 2 or 3 depending on jumptable entry width), one jumptable entry
+// per row, then the run bytes of all rows back to back.
+//
+// src      - 8bpp source pixels (only the upper 4 bits of each are kept)
+// pitch    - bytes between the start of consecutive source rows
+// w, h     - glyph size in pixels
+// size_ret - receives the total blob size in bytes (it is written before
+//            the allocation is attempted, so it is set even on failure)
+//
+// returns a malloc()ed blob, or NULL if the allocation fails
+static DATA8 *
+compress_rle4(DATA8 *src, int pitch, int w, int h, int *size_ret)
+{
+   unsigned char *scratch, *p, *pix, spanval;
+   int *jumptab, x, y, spanlen, spannum, total, size, *iptr, *pos;
+   unsigned short *sptr;
+   DATA8 *dst, *buf, *dptr;
+
+   // these macros make the code more readable and easier to follow, and
+   // avoid replication of dumb blobs of logic. note that SPAN_ADD and
+   // LAST_SPAN_DEL both update the write pointer "p" AND the current row's
+   // jumptable entry "*pos" so the two always stay in step
+#define SPAN_ADD(_len, _val) do { (*pos) += 1; *p = ((_len) << 4) | (_val); p++; } while (0)
+#define LAST_SPAN_VAL() (p[-1] & 0x0f)
+#define LAST_SPAN_LEN() (p[-1] >> 4)
+#define LAST_SPAN_DEL() do { (*pos) -= 1; p -= 1; } while (0)
+
+   // create our scratch buffer for compression on the stack - maximum size
+   scratch = p = alloca(pitch * h * 2);
+   // also place our jumptable on the stack too - all ints here - become
+   // smaller char/shorts after jumptable is generated and size known
+   jumptab = alloca(h * sizeof(int));
+   for (y = 0; y < h; y++)
+     {
+        pix = src + (y * pitch);
+        // pos is the position offset from RLE data start that we have to
+        // track to find out where this row's RLE run *ENDS* so keep a
+        // pointer to it and we will keep ++ing it with each RLE entry we add
+        pos = &(jumptab[y]);
+        // start at the current write offset (== end of the previous row)
+        *pos = (int)((unsigned long)p - (unsigned long)scratch);
+        // no spans now so init all span things to 0
+        spanval = spanlen = spannum = 0;
+        for (x = 0; x < w; x++)
+          {
+             // we only need upper 4 bits of value for span creation
+             DATA8 v = pix[x] >> 4;
+             // if the current pixel value (in 4bit) is not the same as the
+             // span value (in 4 bit) OR... if the span has reached 16 pixels
+             // then add/write out the span to our RLE span blob
+             if ((v != spanval) || (spanlen >= 16))
+               {
+                  // spanlen is 0 only for the very first pixel of a row
+                  // when it is non-zero - nothing to flush in that case
+                  if (spanlen > 0)
+                    {
+                       SPAN_ADD(spanlen - 1, spanval);
+                       spannum++;
+                    }
+                  spanval = v;
+                  spanlen = 1;
+               }
+             // otherwise make span longer if values are the same
+             else spanlen++;
+          }
+        // do we have a span still being built that we haven't added and that
+        // is NOT transparent (0 value) - there is no point storing spans
+        // at the end of a row that have 0 value
+        if ((spanlen > 0) && (spanval > 0))
+          {
+             SPAN_ADD(spanlen - 1, spanval);
+             spannum++;
+          }
+        // clean up any dangling 0 value spans at the end of a row as they
+        // just waste space and processing time
+        while ((spannum > 0) && (LAST_SPAN_VAL() == 0))
+          {
+             LAST_SPAN_DEL();
+             spannum--;
+          }
+     }
+   // get the size of RLE data we have plus int header
+   total = (int)((unsigned long)p - (unsigned long)scratch);
+   size = sizeof(int) + total;
+   // based on total number of bytes in RLE, use 32, 16 or 8 bit jumptable
+   // and add that to our size
+   if (total > 65535) size += h * 4; // 32bit
+   else if (total > 255) size += h * 2; // 16bit
+   else size += h; // 8bit
+
+   *size_ret = size;
+   // allocate a fresh buffer where we will merge header, jumptable and RLE
+   // spans into a single block
+   buf = dst = malloc(size);
+   if (!buf) return NULL;
+   // 32bit int header to indicate encoding type (3, 2 or 1)
+   iptr = (int *)dst;
+   if (total > 65535) *iptr = 3; // 32bit jump table
+   else if (total > 255) *iptr = 2; // 16 bit jump table
+   else *iptr = 1; // 8 bit jump table
+   // skip header and write jump table
+   dst += sizeof(int);
+   if (total > 65535) // 32bit jump table
+     {
+        iptr = (int *)dst;
+        for (y = 0; y < h; y++) iptr[y] = jumptab[y];
+        dst += (h * sizeof(int));
+     }
+   else if (total > 255) // 16bit jump table
+     {
+        sptr = (unsigned short *)dst;
+        for (y = 0; y < h; y++) sptr[y] = jumptab[y];
+        dst += (h * sizeof(unsigned short));
+     }
+   else // 8bit jump table
+     {
+        dptr = dst;
+        for (y = 0; y < h; y++) dptr[y] = jumptab[y];
+        dst += (h * sizeof(DATA8));
+     }
+   // copy rest of RLE data at the end of the jumptable and return it
+   memcpy(dst, scratch, total);
+   return buf;
+}
+
+// expand one row of RLE runs into 8bit pixels. the runs read are the bytes
+// src[start] up to (but not including) src[end]. pixels are written to dst
+// left to right. runs with value 0 only advance the output position, so the
+// destination is ASSUMED to have been zeroed by the caller beforehand
+static void
+decompress_full_row(DATA8 *src, int start, int end, DATA8 *dst)
+{
+   DATA8 *run, *stop = src + end, *out = dst;
+
+   for (run = src + start; run < stop; run++)
+     {
+        // upper 4 bits hold (run length - 1)
+        int count = (*run >> 4) + 1;
+        // lower 4 bits hold the pixel value. widen it to 8bit by repeating
+        // the nibble in both halves (B4B3B2B1 -> B4B3B2B1B4B3B2B1) so that
+        // 0xf maps to 0xff and 0x0 stays 0x00
+        DATA8 nib = *run & 0xf;
+        DATA8 pix = (nib << 4) | nib;
+
+        // a zero run is skipped - dst is already zero there
+        if (nib != 0)
+          {
+             int i;
+
+             for (i = 0; i < count; i++) out[i] = pix;
+          }
+        out += count;
+     }
+}
+
+// to save copy & paste repeating code, this macro acts as a code generator
+// to create a specific decompress function per jumptable size (8, 16 or 32bit)
+//
+// each generated function walks all h rows. jumptab[y] is the offset just
+// past the last run byte of row y, so row y occupies the run bytes
+// [jumptab[y - 1], jumptab[y]) in src, with row 0 starting at offset 0.
+// every row is expanded by decompress_full_row() into dst + (y * pitch)
+#define DECOMPRESS_ROW_FUNC(_name, _type) \
+static void \
+_name(_type *jumptab, DATA8 *src, DATA8 *dst, int pitch, int h) \
+{ \
+   int y, start, end; \
+   for (y = 0; y < h; y++) \
+     { \
+        if (y > 0) start = jumptab[y - 1]; \
+        else start = 0; \
+        end = jumptab[y]; \
+        decompress_full_row(src, start, end, dst + (y * pitch)); \
+     } \
+}
+// 3 versions of the decompress given 3 jumptable types/sizes
+DECOMPRESS_ROW_FUNC(decompress_jumptab8_rle4, DATA8)
+DECOMPRESS_ROW_FUNC(decompress_jumptab16_rle4, unsigned short)
+DECOMPRESS_ROW_FUNC(decompress_jumptab32_rle4, int)
+
+// unpack a complete 4bit RLE blob (int header + jumptable + run bytes) into
+// dst. pitch is the number of bytes between destination rows and h is the
+// number of rows. the header selects the jumptable entry width: 1 = 8bit,
+// 2 = 16bit, 3 = 32bit. any other header value leaves dst untouched
+static void
+decompress_rle4(DATA8 *src, DATA8 *dst, int pitch, int w EINA_UNUSED, int h)
+{
+   // the jumptable sits directly behind the int header and the run bytes
+   // follow straight after the h jumptable entries
+   DATA8 *jumptab = src + sizeof(int);
+
+#define DECOMPRESS_FUNC(_name, _type) _name((_type *)jumptab,  jumptab + (h * sizeof(_type)), dst, pitch, h)
+   switch (*((int *)src))
+     {
+      case 1:
+        DECOMPRESS_FUNC(decompress_jumptab8_rle4, DATA8);
+        break;
+      case 2:
+        DECOMPRESS_FUNC(decompress_jumptab16_rle4, unsigned short);
+        break;
+      case 3:
+        DECOMPRESS_FUNC(decompress_jumptab32_rle4, int);
+        break;
+      default:
+        break;
+     }
+}
+
+
+
+
+//--------------------------------------------------------------------------
+//- RAW 4BIT ---------------------------------------------------------------
+//--------------------------------------------------------------------------
+
+// pack 8bit-per-pixel glyph data down to raw 4bit-per-pixel. two pixels
+// share a byte: the upper nibble is the left pixel, the lower nibble the
+// right one. rows are padded to a whole byte, so on odd widths the final
+// byte of a row only carries an upper nibble (lower nibble is 0).
+//
+// the blob starts with an int header of value 0 which marks it as raw 4bit
+// data (as opposed to the 1/2/3 used by the RLE encoder), so both encodings
+// can live behind the same pointer.
+//
+// src      - 8bpp source pixels, pitch bytes between rows
+// w, h     - glyph size in pixels
+// size_ret - receives the blob size in bytes (only written on success)
+//
+// returns a malloc()ed blob, or NULL if the allocation fails
+static DATA8 *
+compress_bpp4(DATA8 *src, int pitch, int w, int h, int *size_ret)
+{
+   // bytes per packed row, rounded up for odd widths
+   int stride = (w + 1) / 2;
+   size_t total = sizeof(int) + (stride * h);
+   DATA8 *blob, *body;
+   int row, col;
+
+   blob = malloc(total);
+   if (!blob) return NULL;
+   // header: 0 == raw 4bit packed
+   *((int *)blob) = 0;
+   body = blob + sizeof(int);
+   *size_ret = total;
+   for (row = 0; row < h; row++)
+     {
+        DATA8 *in = src + (row * pitch);
+        DATA8 *out = body + (row * stride);
+
+        // full pairs: keep the top nibble of each of the two pixels
+        for (col = 0; (col + 1) < w; col += 2)
+          out[col / 2] = (in[col] & 0xf0) | (in[col + 1] >> 4);
+        // one pixel left over on odd widths - it goes in the upper nibble
+        if (col < w) out[col / 2] = (in[col] & 0xf0);
+     }
+   return blob;
+}
+
+// unpack a raw 4bit blob (int header followed by packed rows) into an 8bit
+// destination. the caller provides dst, which must hold h rows of w pixels
+// spaced pitch bytes apart. every 4bit value is widened to 8bit by
+// repeating the nibble in both halves so 0xf becomes 0xff
+static void
+decompress_bpp4(DATA8 *src, DATA8 *dst, int pitch, int w, int h)
+{
+   // packed source rows are rounded up to whole bytes
+   int stride = (w + 1) / 2;
+   // pixel data begins right after the int header
+   DATA8 *body = src + sizeof(int);
+   int row, col;
+
+   for (row = 0; row < h; row++)
+     {
+        DATA8 *in = body + (row * stride);
+        DATA8 *out = dst + (row * pitch);
+
+        // each source byte yields two pixels: upper nibble, then lower
+        for (col = 0; (col + 1) < w; col += 2)
+          {
+             DATA8 hi = in[col / 2] >> 4;
+             DATA8 lo = in[col / 2] & 0xf;
+
+             out[col] = (hi << 4) | hi;
+             out[col + 1] = (lo << 4) | lo;
+          }
+        // odd width: the last byte only holds one pixel, in its upper nibble
+        if (col < w)
+          {
+             DATA8 hi = in[col / 2] >> 4;
+
+             out[col] = (hi << 4) | hi;
+          }
+     }
+}
+
+
+
+//--------------------------------------------------------------------------
+//- GENERAL ----------------------------------------------------------------
+//--------------------------------------------------------------------------
+// compress a rendered glyph bitmap into one of the two 4bit encodings.
+//
+// data       - source bitmap pixels
+// num_grays  - number of grey levels of the source bitmap
+// pixel_mode - FreeType pixel mode of the source bitmap
+// pitch_data - bytes between the start of consecutive source rows
+// w, h       - glyph size in pixels
+// size_ret   - receives the size in bytes of the returned blob
+//
+// a source that is 256-level FT_PIXEL_MODE_GRAY is compressed directly.
+// anything else is treated as a packed 1bit bitmap and expanded to 8bit
+// first. glyphs smaller than 16x16 pixels in area are stored as raw 4bit,
+// larger ones as per-row 4bit RLE.
+//
+// returns a malloc()ed blob the caller must free(), or NULL if the glyph is
+// empty (h < 1 or pitch_data < 1) or the allocation fails
+EAPI void *
+evas_common_font_glyph_compress(void *data, int num_grays, int pixel_mode,
+                                int pitch_data, int w, int h, int *size_ret)
+{
+   DATA8 *inbuf, *buf;
+   int size = 0, pitch = 0;
+
+   // avoid compressing 0 sized glyph
+   if ((h < 1) || (pitch_data < 1)) return NULL;
+   // if glyph buffer is 8bit grey - then compress straight from the source
+   if ((num_grays == 256) && (pixel_mode == FT_PIXEL_MODE_GRAY))
+     {
+        inbuf = data;
+        pitch = pitch_data;
+     }
+   // if glyph is 1bit bitmap - expand it to 8bit grey first
+   else
+     {
+        pitch = w;
+        // only take stack space for the expanded copy when it is actually
+        // needed - the grey path above never touches it. alloca() memory
+        // stays valid until this function returns, so it outlives this block
+        inbuf = alloca(w * h);
+        expand_bitmap(data, pitch_data, w, h, inbuf);
+     }
+   // in testing for small glyphs - eg 16x16 or smaller it seems raw 4bit
+   // encoding is faster (and smaller) than 4bit RLE.
+   if ((w * h) < (16 * 16))
+     // compress to 4bit per pixel, raw
+     buf = compress_bpp4(inbuf, pitch, w, h, &size);
+   else
+     // compress to 4bit per pixel, run length encoded per row
+     buf = compress_rle4(inbuf, pitch, w, h, &size);
+   *size_ret = size;
+   return buf;
+}
+
+// expand a compressed glyph back into a freshly allocated, zero-initialised
+// 8bit-per-pixel buffer with a row stride equal to the glyph width. handles
+// both encodings: a header value > 0 means 4bit RLE, otherwise raw 4bit.
+//
+// wret, hret - receive the glyph width and height in pixels (only written
+//              when the allocation succeeds)
+//
+// returns the calloc()ed pixel buffer, or NULL if the allocation fails
+EAPI DATA8 *
+evas_common_font_glyph_uncompress(RGBA_Font_Glyph *fg, int *wret, int *hret)
+{
+   RGBA_Font_Glyph_Out *fgo = fg->glyph_out;
+   void (*unpack)(DATA8 *src, DATA8 *dst, int pitch, int w, int h);
+   DATA8 *pixels;
+
+   // zeroed output is required: the RLE decoder skips over 0-valued runs
+   pixels = calloc(1, fgo->bitmap.width * fgo->bitmap.rows);
+   if (!pixels) return NULL;
+   *wret = fgo->bitmap.width;
+   *hret = fgo->bitmap.rows;
+   // the leading int of the blob tells us which decoder to use
+   if (*((int *)fgo->rle) > 0) unpack = decompress_rle4;
+   else unpack = decompress_bpp4;
+   unpack(fgo->rle, pixels, fgo->bitmap.width,
+          fgo->bitmap.width, fgo->bitmap.rows);
+   return pixels;
+}
+
+// this draws a compressed font glyph and decompresses on the fly as it
+// draws, saving memory bandwidth and providing speedups
+//
+// fg             - glyph to draw (its glyph_out holds the compressed blob)
+// dc             - draw context; dc->col.col is the colour to draw with
+// dst, dst_pitch - destination pixels and the row stride in DATA32 pixels
+// x, y           - position of the glyph's top-left pixel in dst
+// cx, cy, cw, ch - clip rectangle in dst coordinates
+//
+// the actual blit code lives in evas_font_compress_draw.c which is textually
+// #included below (once per cpu variant) and uses this function's locals
+EAPI void
+evas_common_font_glyph_draw(RGBA_Font_Glyph *fg, 
+                            RGBA_Draw_Context *dc,
+                            DATA32 *dst, int dst_pitch,
+                            int x, int y, int cx, int cy, int cw, int ch)
+{
+   RGBA_Font_Glyph_Out *fgo = fg->glyph_out;
+   int w, h, x1, x2, y1, y2, i, *iptr;
+   DATA32 coltab[16], col;
+   DATA16 mtab[16], v;
+   DATA8 tmp;
+
+   w = fgo->bitmap.width; h = fgo->bitmap.rows;
+   // skip if totally clipped out
+   if ((y >= (cy + ch)) || ((y + h) <= cy) ||
+       (x >= (cx + cw)) || ((x + w) <= cx)) return;
+   // figure y1/y2 limit range (glyph-relative rows that survive clipping,
+   // y1 inclusive, y2 exclusive)
+   y1 = 0; y2 = h;
+   if ((y + y1) < cy) y1 = cy - y;
+   if ((y + y2) > (cy + ch)) y2 = cy + ch - y;
+   // figure x1/x2 limit range (glyph-relative columns that survive clipping)
+   x1 = 0; x2 = w;
+   if ((x + x1) < cx) x1 = cx - x;
+   if ((x + x2) > (cx + cw)) x2 = cx + cw - x;
+   // build fast multiply + mask color tables to avoid compute. this works
+   // because of our very limited 4bit range of alpha values
+   col = dc->col.col;
+   for (i = 0; i <= 0xf; i++)
+     {
+        // widen the 4bit coverage value to 8bit by repeating the nibble
+        v = (i << 4) | i;
+        // colour pre-multiplied by this coverage level
+        coltab[i] = MUL_SYM(v, col);
+        // multiplier for the destination: 256 minus the alpha of the
+        // pre-multiplied colour, with alpha mapped from 0-255 to 0-256
+        tmp = (coltab[i] >> 24);
+        mtab[i] = 256 - (tmp + (tmp >> 7));
+     }
+   // pick the blit variant: mmx if built and available at runtime, else neon
+   // if built and available, else plain C. the dangling "else" of each
+   // #ifdef section chains onto whichever block follows it
+#ifdef BUILD_MMX
+   if (evas_common_cpu_has_feature(CPU_FEATURE_MMX))
+     {
+#define MMX 1
+#include "evas_font_compress_draw.c"
+#undef MMX
+     }
+   else 
+#endif
+   
+#ifdef BUILD_NEON
+   if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+     {
+#define NEON 1
+#include "evas_font_compress_draw.c"
+#undef NEON
+     }
+   else
+#endif
+   
+     {
+#include "evas_font_compress_draw.c"
+     }
+}
--- a/src/lib/evas/common/evas_font_compress_draw.c
+++ b/src/lib/evas/common/evas_font_compress_draw.c
@ -0,0 +1,342 @@
+// inherited from parent func
+//   RGBA_Font_Glyph_Out *fgo;
+//   int w, h, x1, x2, y1, y2, i, *iptr;
+//   DATA32 coltab[16], col;
+//   DATA16 mtab[16], v;
+//   DATA8 tmp;
+
+// blend a pixel using pre-computed multiplied col and inverse mul value
+#define MMX_BLEND(_dst, _col, _mul) \
+   MOV_P2R(_dst, mm1, mm0) \
+   MOV_A2R(_mul, mm3) \
+   MOV_P2R(_col, mm2, mm0) \
+   MUL4_256_R2R(mm3, mm1) \
+   paddw_r2r(mm2, mm1); \
+   MOV_R2P(mm1, _dst, mm0)
+#define C_BLEND(_dst, _col, _mul) \
+   _dst = _col + MUL_256(_mul, _dst)
+
+// copy 64bits in 1 go (special mmx - no such thing in C here)
+#define MMX_COPY64(_dst, _src) \
+   movq_r2m(_src, _dst)
+
+// a loop of 64bit copies
+#define MMX_COPY64LOOP(_dst, _len) \
+   if (_len >= 2) \
+   { \
+      while (_len > 1) \
+        { \
+           MMX_COPY64(_dst[0], mm7); \
+           _dst += 2; _len -= 2; \
+        } \
+   }
+
+// if we build for mmx optimizations, we need to set up a few things in advance
+// like the mm0 register is always all 0'd to fill in 0 padding when
+// unpacking values to registers. also mm7 is reserved to hold an unpacked
+// and duplicated coltab entry for the final entry (max color). so it's
+// [col][col] in the 64bit register with both 32bit colors duplicated
+#ifdef MMX        
+pxor_r2r(mm0, mm0);
+movd_m2r(coltab[0xf], mm7);
+punpckldq_r2r(mm7, mm7);
+#endif
+
+// check header for type (rle4 or bpp4)
+iptr = (int *)fgo->rle;
+if (*iptr > 0) // rle4
+{
+   DATA8 *p = fgo->rle, *e, *s;
+   DATA32 *d0, *d, t;
+   DATA16 len;
+   int xx, yy, dif;
+   
+   iptr = (int *)p;
+   p += sizeof(int);
+   d0 = dst + x + (y * dst_pitch);
+// this may seem horrible to put a massive blob of logic into a macro like
+// this, but this is for speed reasons, so we can generate slightly different
+// versions of the same blob of code logic that hold different optimizations
+// inside (eg mmx/sse/neon asm etc.)
+#define EXPAND_RLE(_donelabel, _extn, _2copy, _blend) \
+   if ((x1 == 0) && (x2 == w)) /* unclipped  horizontally */ \
+   { \
+      d0 += x1; \
+      for (yy = y1; yy < y2; yy++) \
+        { \
+           /* figure out source ptr and end ptr based on jumptable */ \
+           if (yy > 0) s = p + jumptab[yy - 1]; \
+           else s = p; \
+           e = p + jumptab[yy]; \
+           d = d0 + (yy * dst_pitch); \
+           /* walk until we hit the end of the src data */ \
+           while (s < e) \
+             { \
+                /* read the run length from RLE data and value */ \
+                len = (*s >> 4) + 1; \
+                v = *s & 0xf; \
+                /* if value is 0 we can just skip ahead entire run and do */ \
+                /* nothing as empty space doesn't need any work */ \
+                if (v == 0) d += len; \
+                /* if the value ends up being solid (inverse alpha is 0) */ \
+                else if (mtab[v] == 0) \
+                  { \
+                     /* just COPY the color data direct to destination */ \
+                     t = coltab[0xf]; \
+                     /* this is a special 2 pixel (64bit dest) copy for */ \
+                     /* speed - eg mmx etc. */ \
+                     _2copy; \
+                     /* do cleanup of left-over pixels after the 2 pixel */ \
+                     /* copy above (if there is any such code) */ \
+                     while (len > 0) \
+                       { \
+                          /* just a plain copy of looked up value */ \
+                          *d = t; \
+                          d++; len--; \
+                       } \
+                  } \
+                /* our font mask value is between 0 and 15 (0xf) so we */ \
+                /* have to actually blend it to each dest pixel */ \
+                else \
+                  { \
+                     while (len > 0) \
+                       { \
+                          /* do blend using op provided by params */ \
+                          _blend; \
+                          d++; len--; \
+                       } \
+                  } \
+                s++; \
+             } \
+        } \
+   } \
+   else /* clipped horizontally (needs extra skip/cut logic) */ \
+   { \
+      /* init out pos to 0 here (we reset AFTER each horiz loop later) */ \
+      xx = 0; \
+      for (yy = y1; yy < y2; yy++) \
+        { \
+           /* figure out source ptr and end ptr based on jumptable */ \
+           if (yy > 0) s = p + jumptab[yy - 1]; \
+           else s = p; \
+           e = p + jumptab[yy]; \
+           d = d0 + (yy * dst_pitch); \
+           /* walk until we hit the end of the src data and SKIP runs */ \
+           /* that are entirely before the start (x1) point and any */ \
+           /* run that spans over the start point is truncated at the */ \
+           /* start of the run */ \
+           while (s < e) \
+             { \
+                len = (*s >> 4) + 1; \
+                /* if current pos plus run length goes over the start (x1) */ \
+                /* point of our clip area, then adjust run length and dest */ \
+                /* pointer and position and break out of our RLE skip loop */ \
+                if ((xx + (int)len) > x1) \
+                  { \
+                     dif = x1 - xx; \
+                     len -= dif; d += dif; xx += dif; \
+                     break; \
+                  } \
+                d += len; xx += len; s++; \
+             } \
+           /* walk until we hit the end of the RLE run.. OR the end of */ \
+           /* our clip region - the x2 checks are done inside */ \
+           while (s < e) \
+             { \
+                v = *s & 0xf; \
+                /* if value is 0 we can just skip ahead entire run and do */ \
+                /* nothing as empty space doesn't need any work */ \
+                if (v == 0) \
+                  { \
+                     d += len; xx += len; \
+                     /* clip check to stop run */ \
+                     if (xx >= x2) goto _donelabel##_extn; \
+                  } \
+                /* if the value ends up being solid (inverse alpha is 0) */ \
+                else if (mtab[v] == 0) \
+                  { \
+                     /* just COPY the color data direct to destination */ \
+                     t = coltab[0xf]; \
+                     while (len > 0) \
+                       { \
+                          /* clip check to stop run */ \
+                          if (xx >= x2) goto _donelabel##_extn; \
+                          /* just a plain copy of looked up value */ \
+                          *d = t; \
+                          d++; xx++; len--; \
+                       } \
+                  } \
+                /* our font mask value is between 0 and 15 (0xf) so we */ \
+                /* have to actually blend it to each dest pixel */ \
+                else \
+                  { \
+                     while (len > 0) \
+                       { \
+                          /* clip check to stop run */ \
+                          if (xx >= x2) goto _donelabel##_extn; \
+                          /* do blend using op provided by params */ \
+                          _blend; \
+                          d++; xx++; len--; \
+                       } \
+                  } \
+                s++; \
+                /* extra check here so length fetch after doesn't break */ \
+                if (s >= e) break; \
+                /* get length of NEXT RLE run at the end here */ \
+                len = (*s >> 4) + 1; \
+             } \
+_donelabel##_extn: \
+           /* reset horiz pos to 0 ready for next line */ \
+           xx = 0; \
+        } \
+   }
+
+   // and here actually run the appropriate code in the macro/func defined
+   // above, based on the jumptable type (saves passing params on the stack
+   // to a sub function and we'd have to generate the subfunction by macros
+   // anyway, so just cut down code to assume context vars as opposed to
+   // passing them)
+   if (*iptr == 1) // 8 bit jump table
+     {
+        DATA8 *jumptab = p;
+        p += (h * sizeof(DATA8));
+#ifdef MMX
+        EXPAND_RLE(done_8_clipped, _mmx, MMX_COPY64LOOP(d, len),
+                   MMX_BLEND(d[0], coltab[v], mtab[v]))
+#elif defined(NEON)
+        EXPAND_RLE(done_8_clipped, _neon, ,
+                   C_BLEND(d[0], coltab[v], mtab[v]))
+#else
+        EXPAND_RLE(done_8_clipped, _c, ,
+                   C_BLEND(d[0], coltab[v], mtab[v]))
+#endif
+     }
+   else if (*iptr == 2) // 16 bit jump table
+     {
+        unsigned short *jumptab = (unsigned short *)p;
+        p += (h * sizeof(unsigned short));
+#ifdef MMX
+        EXPAND_RLE(done_16_clipped, _mmx, MMX_COPY64LOOP(d, len),
+                   MMX_BLEND(d[0], coltab[v], mtab[v]))
+#elif defined(NEON)
+        EXPAND_RLE(done_16_clipped, _neon, ,
+                   C_BLEND(d[0], coltab[v], mtab[v]))
+#else
+        EXPAND_RLE(done_16_clipped, _c, ,
+                   C_BLEND(d[0], coltab[v], mtab[v]))
+#endif
+     }
+   else if (*iptr == 3) // 32 bit jump table
+     {
+        int *jumptab = (int *)p;
+        p += (h * sizeof(int));
+#ifdef MMX
+        EXPAND_RLE(done_32_clipped, _mmx, MMX_COPY64LOOP(d, len),
+                   MMX_BLEND(d[0], coltab[v], mtab[v]))
+#elif defined(NEON)
+        EXPAND_RLE(done_32_clipped, _neon, ,
+                   C_BLEND(d[0], coltab[v], mtab[v]))
+#else
+        EXPAND_RLE(done_32_clipped, _c, ,
+                   C_BLEND(d[0], coltab[v], mtab[v]))
+#endif
+     }
+#undef EXPAND_RLE
+}
+else // bpp4
+{
+   int xx, yy, djump;
+   int pitch2;
+   DATA8 *s, *s0, v0;
+   DATA32 *d;
+   
+   d = dst + x + x1 + ((y + y1) * dst_pitch);
+   djump = dst_pitch - (x2 - x1);
+   pitch2 = (w + 1) / 2;
+   s0 = fgo->rle + sizeof(int) + (y1 * pitch2);
+   for (yy = y1; yy < y2; yy++)
+     {
+        s = s0 + (x1 / 2);
+        xx = x1;
+        // do odd pixel at start if there is any
+        if (xx & 0x1)
+          {
+             v = (*s) & 0xf;
+             // fast path - totally solid color can just be written
+             // with no blending done
+             if (mtab[v] == 0) d[0] = coltab[0xf];
+             // blend our color from lookup table
+             else if (v)
+               {
+                  // blend it
+#ifdef MMX
+                  MMX_BLEND(d[0], coltab[v], mtab[v]);
+#else
+                  C_BLEND(d[0], coltab[v], mtab[v]);
+#endif
+               }
+             s++; d++; xx++;
+          }
+        // walk along 2 pixels at a time (1 src pixel is 4 bits packed)
+        for (; xx < (x2 - 1); xx += 2)
+          {
+             v0 = *s;
+             // fast path - totally solid color can just be written
+             // with no blending done - write 2 at once
+             if ((v0 == 0xff) && (mtab[v0 & 0xf] == 0))
+               {
+                  // solid: plain copy of 2 pixels at once, no blend
+#ifdef MMX
+                  MMX_COPY64(d[0], mm7);
+#else
+                  d[0] = d[1] = coltab[0xf];
+#endif
+               }
+             // if our 2 values are not 0 (as 0's we can skip entirely)
+             else if (v0)
+               {
+                  // get first pixel in MSB and blend it
+                  v = (v0) >> 4;
+#ifdef MMX
+                  MMX_BLEND(d[0], coltab[v], mtab[v]);
+#else
+                  C_BLEND(d[0], coltab[v], mtab[v]);
+#endif
+                  // get next pixel in LSB and blend it
+                  v = (v0) & 0xf;
+#ifdef MMX
+                  MMX_BLEND(d[1], coltab[v], mtab[v]);
+#else
+                  C_BLEND(d[1], coltab[v], mtab[v]);
+#endif
+               }
+             s++; d += 2;
+          }
+        // clean up any leftover pixels at the end
+        if (xx < x2)
+          {
+             v = (*s) >> 4;
+             // fast path - totally solid color can just be written
+             // with no blending done
+             if (mtab[v] == 0) d[0] = coltab[0xf];
+             // blend our color from lookup table
+             else if (v)
+               {
+                  // blend it
+#ifdef MMX
+                  MMX_BLEND(d[0], coltab[v], mtab[v]);
+#else
+                  C_BLEND(d[0], coltab[v], mtab[v]);
+#endif
+               }
+             d++;
+          }
+        d += djump;
+        s0 += pitch2;
+     }
+}
+// with mmx (sse etc.) we need to say we are done with the mmx registers so
+// any fpu usage is restored (early pentiums need this, later x86 do not)
+#ifdef MMX
+evas_common_cpu_end_opt();
+#endif
--- a/src/lib/evas/common/evas_font_draw.c
+++ b/src/lib/evas/common/evas_font_draw.c
@ -14,10 +14,8 @@
 struct _Evas_Glyph
 {
   RGBA_Font_Glyph *fg;
-   void *data;
-   Eina_Rectangle coord;
+   int x, y;
   FT_UInt idx;
-   int j;
 };

 EAPI void
@ -33,7 +31,7 @@ evas_common_font_draw_init(void)
 */
 EAPI Eina_Bool
 evas_common_font_rgba_draw(RGBA_Image *dst, RGBA_Draw_Context *dc, int x, int y,
-                           Evas_Glyph_Array *glyphs, RGBA_Gfx_Func func, int ext_x, int ext_y, int ext_w,
+                           Evas_Glyph_Array *glyphs, RGBA_Gfx_Func func EINA_UNUSED, int ext_x, int ext_y, int ext_w,
                           int ext_h, int im_w, int im_h EINA_UNUSED)
 {
   DATA32 *im;
@ -43,183 +41,33 @@ evas_common_font_rgba_draw(RGBA_Image *dst, RGBA_Draw_Context *dc, int x, int y,
   if (!glyphs->array) return EINA_FALSE;

   im = dst->image.data;
-
   EINA_INARRAY_FOREACH(glyphs->array, glyph)
     {
        RGBA_Font_Glyph *fg;
-        int chr_x, chr_y;
+        int chr_x, chr_y, w;

        fg = glyph->fg;
-
-	/* FIXME: Why was that moved out of prepare ? This increase cache miss. */
-        glyph->coord.w = fg->glyph_out->bitmap.width;
-        glyph->coord.h = fg->glyph_out->bitmap.rows;
-        glyph->j = fg->glyph_out->bitmap.pitch;
-        glyph->data = fg->glyph_out->bitmap.buffer;
-
-        if (dc->font_ext.func.gl_new)
+        if ((!fg->ext_dat) && (dc->font_ext.func.gl_new))
          {
             /* extension calls */
             fg->ext_dat = dc->font_ext.func.gl_new(dc->font_ext.data, fg);
             fg->ext_dat_free = dc->font_ext.func.gl_free;
          }
-
-        chr_x = x + glyph->coord.x;
-        chr_y = y + glyph->coord.y;
-
+        w = fg->glyph_out->bitmap.width;
+        chr_x = x + glyph->x;
+        chr_y = y + glyph->y;
        if (chr_x < (ext_x + ext_w))
          {
-             DATA8 *data;
-             int i, j, w, h;
-
-             data = glyph->data;
-             j = glyph->j;
-             w = glyph->coord.w;
-             if (j < w) j = w;
-             h = glyph->coord.h;
-
-#ifdef HAVE_PIXMAN
-# ifdef PIXMAN_FONT             
-             int index;
-             DATA32 *font_alpha_buffer;
-             pixman_image_t *font_mask_image;
-
-             font_alpha_buffer = alloca(w * h * sizeof(DATA32));
-             for (index = 0; index < (w * h); index++)
-               font_alpha_buffer[index] = data[index] << 24;
-             
-             font_mask_image = pixman_image_create_bits(PIXMAN_a8r8g8b8, w, h,
-                                                        font_alpha_buffer, 
-                                                        w * sizeof(DATA32));
-
-             if (!font_mask_image) return EINA_FALSE;
-# endif
-#endif
-
+             if ((w > 0) && ((chr_x + w) > ext_x))
               {
-                  if ((j > 0) && (chr_x + w > ext_x))
-                    {
-                       if ((fg->ext_dat) && (dc->font_ext.func.gl_draw))
-                         {
-                            /* ext glyph draw */
-                            dc->font_ext.func.gl_draw(dc->font_ext.data,
-                                                      (void *)dst,
-                                                      dc, fg, chr_x,
-                                                      y - (chr_y - y));
-                         }
-                       else
-                         {
-                            if ((fg->glyph_out->bitmap.num_grays == 256) &&
-                                (fg->glyph_out->bitmap.pixel_mode == FT_PIXEL_MODE_GRAY))
-                              {
-#ifdef HAVE_PIXMAN
-# ifdef PIXMAN_FONT
-                                 if ((dst->pixman.im) && 
-                                     (dc->col.pixman_color_image))
-                                   pixman_image_composite(PIXMAN_OP_OVER, 
-                                                          dc->col.pixman_color_image, 
-                                                          font_mask_image, 
-                                                          dst->pixman.im,
-                                                          chr_x, 
-                                                          y - (chr_y - y), 
-                                                          0, 0, 
-                                                          chr_x, 
-                                                          y - (chr_y - y), 
-                                                          w, h);
-                                 else
-# endif                                   
-#endif
-                                   {
-                                      for (i = 0; i < h; i++)
-                                        {
-                                           int dx, dy;
-                                           int in_x, in_w;
-                                           
-                                           in_x = 0;
-                                           in_w = 0;
-                                           dx = chr_x;
-                                           dy = y - (chr_y - i - y);
-
-					   if ((dx < (ext_x + ext_w)) &&
-					       (dy >= (ext_y)) &&
-					       (dy < (ext_y + ext_h)))
-					     {
-					       if (dx + w > (ext_x + ext_w))
-						 in_w += (dx + w) - (ext_x + ext_w);
-					       if (dx < ext_x)
-						 {
-						   in_w += ext_x - dx;
-						   in_x = ext_x - dx;
-						   dx = ext_x;
-						 }
-					       if (in_w < w)
-						 {
-						   func(NULL, data + (i * j) + in_x, dc->col.col,
-							im + (dy * im_w) + dx, w - in_w);
-						 }
-					     }
-                                        }
-                                   }
-                              }
-                            else
-                              {
-                                 DATA8 *tmpbuf = NULL, *dp, *tp, bits;
-                                 int bi, bj;
-                                 const DATA8 bitrepl[2] = {0x0, 0xff};
-
-                                 tmpbuf = alloca(w);
-                                 for (i = 0; i < h; i++)
-                                   {
-                                      int dx, dy;
-                                      int in_x, in_w, end;
-                                      
-                                      in_x = 0;
-                                      in_w = 0;
-                                      dx = chr_x;
-                                      dy = y - (chr_y - i - y);
-
-				      tp = tmpbuf;
-				      dp = data + (i * fg->glyph_out->bitmap.pitch);
-				      for (bi = 0; bi < w; bi += 8)
-					{
-					  bits = *dp;
-					  if ((w - bi) < 8) end = w - bi;
-					  else end = 8;
-					  for (bj = 0; bj < end; bj++)
-					    {
-					      *tp = bitrepl[(bits >> (7 - bj)) & 0x1];
-					      tp++;
-					    }
-					  dp++;
-					}
-				      if ((dx < (ext_x + ext_w)) &&
-					  (dy >= (ext_y)) &&
-					  (dy < (ext_y + ext_h)))
-					{
-					  if (dx + w > (ext_x + ext_w))
-					    in_w += (dx + w) - (ext_x + ext_w);
-					  if (dx < ext_x)
-					    {
-					      in_w += ext_x - dx;
-					      in_x = ext_x - dx;
-					      dx = ext_x;
-					    }
-					  if (in_w < w)
-					    {
-					      func(NULL, tmpbuf + in_x, dc->col.col,
-						   im + (dy * im_w) + dx, w - in_w);
-					    }
-                                        }
-                                   }
-                              }
-                         }
-                    }
+                  if ((fg->ext_dat) && (dc->font_ext.func.gl_draw))
+                    dc->font_ext.func.gl_draw(dc->font_ext.data, (void *)dst,
+                                              dc, fg, chr_x, y - (chr_y - y));
+                  else if (fg->glyph_out->rle)
+                    evas_common_font_glyph_draw(fg, dc, im, im_w,
+                                                chr_x, y - (chr_y - y),
+                                                ext_x, ext_y, ext_w, ext_h);
               }
-#ifdef HAVE_PIXMAN
-# ifdef PIXMAN_FONT
-             pixman_image_unref(font_mask_image);
-# endif
-#endif
          }
        else
          break;
@ -362,8 +210,8 @@ evas_common_font_draw_prepare(Evas_Text_Props *text_props)

        glyph->fg = fg;
        glyph->idx = idx;
-        glyph->coord.x = EVAS_FONT_WALK_PEN_X + EVAS_FONT_WALK_X_OFF + EVAS_FONT_WALK_X_BEAR;
-        glyph->coord.y = EVAS_FONT_WALK_PEN_Y + EVAS_FONT_WALK_Y_OFF + EVAS_FONT_WALK_Y_BEAR;
+        glyph->x = EVAS_FONT_WALK_PEN_X + EVAS_FONT_WALK_X_OFF + EVAS_FONT_WALK_X_BEAR;
+        glyph->y = EVAS_FONT_WALK_PEN_Y + EVAS_FONT_WALK_Y_OFF + EVAS_FONT_WALK_Y_BEAR;
     }
   EVAS_FONT_WALK_TEXT_END();

@ -389,11 +237,6 @@ evas_common_font_draw_prepare(Evas_Text_Props *text_props)
   return;

 error:
-   if (fg)
-     {
-        if (fg->glyph_out) free(fg->glyph_out);
-        free(fg);
-     }
   eina_inarray_free(glyphs);
 }

--- a/src/lib/evas/common/evas_font_main.c
+++ b/src/lib/evas/common/evas_font_main.c
@ -15,6 +15,7 @@

 #include FT_OUTLINE_H
 #include FT_SYNTHESIS_H
+#include FT_BITMAP_H

 FT_Library      evas_ft_lib = 0;
 static int      initialised = 0;
@ -352,10 +353,17 @@ _glyph_free(RGBA_Font_Glyph *fg)
 {
   if ((!fg) || (fg == (void *)(-1))) return;

+   if (fg->glyph_out)
+     {
+        if ((fg->glyph_out->rle) && (fg->glyph_out->bitmap.rle_alloc))
+          free(fg->glyph_out->rle);
+        fg->glyph_out->rle = NULL;
+        if (!fg->glyph_out->bitmap.no_free_glout) free(fg->glyph_out);
+        fg->glyph_out = NULL;
+     }
   FT_Done_Glyph(fg->glyph);
   /* extension calls */
   if (fg->ext_dat_free) fg->ext_dat_free(fg->ext_dat);
-   if (fg->glyph_out_free) fg->glyph_out_free(fg->glyph_out);
   free(fg);
 }

@ -578,23 +586,32 @@ evas_common_font_int_cache_glyph_render(RGBA_Font_Glyph *fg)

   fbg = (FT_BitmapGlyph)fg->glyph;

-   fg->glyph_out = malloc(sizeof(RGBA_Font_Glyph_Out));
+   fg->glyph_out = calloc(1, sizeof(RGBA_Font_Glyph_Out));
   fg->glyph_out->bitmap.rows = fbg->bitmap.rows;
   fg->glyph_out->bitmap.width = fbg->bitmap.width;
   fg->glyph_out->bitmap.pitch = fbg->bitmap.pitch;
   fg->glyph_out->bitmap.buffer = fbg->bitmap.buffer;
-   fg->glyph_out->bitmap.num_grays = fbg->bitmap.num_grays;
-   fg->glyph_out->bitmap.pixel_mode = fbg->bitmap.pixel_mode;
-
-   fg->glyph_out_free = free;
-   /* This '+ 200' is just an estimation of how much memory freetype will use
+   fg->glyph_out->bitmap.rle_alloc = EINA_TRUE;
+   
+   /* This '+ 100' is just an estimation of how much memory freetype will use
    * on it's size. This value is not really used anywhere in code - it's
    * only for statistics. */
   size = sizeof(RGBA_Font_Glyph) + sizeof(Eina_List) +
-    (fg->glyph_out->bitmap.width * fg->glyph_out->bitmap.rows) + 200;
+    (fg->glyph_out->bitmap.width * fg->glyph_out->bitmap.rows / 2) + 100;
   fi->usage += size;
   if (fi->inuse) evas_common_font_int_use_increase(size);

+   fg->glyph_out->rle = evas_common_font_glyph_compress
+   (fbg->bitmap.buffer, fbg->bitmap.num_grays, fbg->bitmap.pixel_mode,
+    fbg->bitmap.pitch, fbg->bitmap.width, fbg->bitmap.rows,
+    &(fg->glyph_out->rle_size));
+
+   fg->glyph_out->bitmap.buffer = NULL;
+
+   // this may be technically incorrect as we go and free a bitmap buffer
+   // behind the ftglyph's back...
+   FT_Bitmap_Done(evas_ft_lib, &(fbg->bitmap));
+   
   return EINA_TRUE;
 }

--- a/src/lib/evas/cserve2/evas_cs2_client.c
+++ b/src/lib/evas/cserve2/evas_cs2_client.c
@ -1952,10 +1952,11 @@ _font_entry_glyph_map_rebuild_check(Font_Entry *fe, Font_Hint_Flags hints)
             gl->base.bitmap.rows = gd->rows;
             gl->base.bitmap.width = gd->width;
             gl->base.bitmap.pitch = gd->pitch;
-             gl->base.bitmap.buffer = (unsigned char *)
-                   fe->map->mempool.data + gl->offset;
-             gl->base.bitmap.num_grays = gd->num_grays;
-             gl->base.bitmap.pixel_mode = gd->pixel_mode;
+             gl->base.bitmap.buffer = NULL;
+             gl->base.rle = (unsigned char *)
+               fe->map->mempool.data + gl->offset;
+             gl->base.rle_size = gl->size;
+             gl->base.bitmap.rle_alloc = EINA_FALSE;
             gl->idx = gd->index;
             gl->rid = 0;

@ -2062,7 +2063,7 @@ _glyph_request_cb(void *data, const void *msg, int size)
     {
        string_t shm_id;
        unsigned int idx, offset, glsize, hints;
-        int rows, width, pitch, num_grays, pixel_mode;
+        int rows, width, pitch;
        CS_Glyph_Out *gl;

        pos = buf - (const char*) resp;
@ -2083,10 +2084,6 @@ _glyph_request_cb(void *data, const void *msg, int size)
        buf += sizeof(int);
        memcpy(&pitch, buf, sizeof(int));
        buf += sizeof(int);
-        memcpy(&num_grays, buf, sizeof(int));
-        buf += sizeof(int);
-        memcpy(&pixel_mode, buf, sizeof(int));
-        buf += sizeof(int);
        memcpy(&hints, buf, sizeof(int));
        buf += sizeof(int);
        if (hints != grd->hints)
@ -2112,10 +2109,12 @@ _glyph_request_cb(void *data, const void *msg, int size)
        gl->base.bitmap.rows = rows;
        gl->base.bitmap.width = width;
        gl->base.bitmap.pitch = pitch;
-        gl->base.bitmap.buffer =
+        gl->base.bitmap.buffer = NULL;
+        gl->base.bitmap.rle_alloc = 0;
+        gl->base.bitmap.no_free_glout = 1;
+        gl->base.rle =
              (unsigned char *) gl->map->mempool.data + gl->offset;
-        gl->base.bitmap.num_grays = num_grays;
-        gl->base.bitmap.pixel_mode = pixel_mode;
+        gl->base.rle_size = gl->size;
        gl->rid = 0;

        if (!eina_clist_element_is_linked(&gl->map_entry))
--- a/src/lib/evas/include/evas_common_private.h
+++ b/src/lib/evas/include/evas_common_private.h
@ -968,14 +968,16 @@ struct _RGBA_Font_Source
 */
 struct _RGBA_Font_Glyph_Out
 {
+   unsigned char *rle;
   struct {
-      int rows;
-      int width;
-      int pitch;
      unsigned char *buffer;
-      short num_grays;
-      char pixel_mode;
+      unsigned short rows;
+      unsigned short width;
+      unsigned short pitch;
+      unsigned short rle_alloc : 1;
+      unsigned short no_free_glout : 1;
   } bitmap;
+   int rle_size;
 };

 struct _RGBA_Font_Glyph
@ -986,7 +988,6 @@ struct _RGBA_Font_Glyph
   Evas_Coord      y_bear;
   FT_Glyph        glyph;
   RGBA_Font_Glyph_Out *glyph_out;
-   void            (*glyph_out_free)(void *);
   /* this is a problem - only 1 engine at a time can extend such a font... grrr */
   void           *ext_dat;
   void           (*ext_dat_free) (void *ext_dat);
--- a/src/modules/evas/engines/gl_common/evas_gl_font.c
+++ b/src/modules/evas/engines/gl_common/evas_gl_font.c
@ -5,10 +5,8 @@ evas_gl_font_texture_new(void *context, RGBA_Font_Glyph *fg)
 {
   Evas_Engine_GL_Context *gc = context;
   Evas_GL_Texture *tex;
-   DATA8 *data;
-   int w, h, j, nw;
-   DATA8 *ndata;
-   int fh;
+   int w, h, j, nw, fh, x, y;
+   DATA8 *ndata, *data, *p1, *p2;

   if (fg->ext_dat) return fg->ext_dat; // FIXME: one engine at a time can do this :(

@ -16,80 +14,38 @@ evas_gl_font_texture_new(void *context, RGBA_Font_Glyph *fg)
   h = fg->glyph_out->bitmap.rows;
   if ((w == 0) || (h == 0)) return NULL;

-   data = fg->glyph_out->bitmap.buffer;
-   j = fg->glyph_out->bitmap.pitch;
+   if (!fg->glyph_out->rle) return NULL;
+   data = evas_common_font_glyph_uncompress(fg, &w, &h);
+   if (!data) return NULL;
+   j = w;
   if (j < w) j = w;

+   // expand to 32bit (4 byte) aligned rows for texture upload
   nw = ((w + 3) / 4) * 4;
   ndata = alloca(nw *h);
   if (!ndata) return NULL;
-   if ((fg->glyph_out->bitmap.num_grays == 256) &&
-       (fg->glyph_out->bitmap.pixel_mode == FT_PIXEL_MODE_GRAY))
+   for (y = 0; y < h; y++)
     {
-	int x, y;
-	DATA8 *p1, *p2;
-
-	for (y = 0; y < h; y++)
-	  {
-	     p1 = data + (j * y);
-	     p2 = ndata + (nw * y);
-	     for (x = 0; x < w; x++)
-	       {
-		  *p2 = *p1;
-		  p1++;
-		  p2++;
-	       }
-	  }
+        p1 = data + (j * y);
+        p2 = ndata + (nw * y);
+        for (x = 0; x < w; x++)
+          {
+             *p2 = *p1;
+             p1++;
+             p2++;
+          }
     }
-   else
-     {
-	DATA8 *tmpbuf = NULL, *dp, *tp, bits;
-	int bi, bj, end;
-	const DATA8 bitrepl[2] = {0x0, 0xff};
-
-	tmpbuf = alloca(w);
-	if (tmpbuf)
-	  {
-	     int x, y;
-	     DATA8 *p1, *p2;
-
-	     for (y = 0; y < h; y++)
-	       {
-		  p1 = tmpbuf;
-		  p2 = ndata + (nw * y);
-		  tp = tmpbuf;
-		  dp = data + (y * fg->glyph_out->bitmap.pitch);
-		  for (bi = 0; bi < w; bi += 8)
-		    {
-		       bits = *dp;
-		       if ((w - bi) < 8) end = w - bi;
-		       else end = 8;
-		       for (bj = 0; bj < end; bj++)
-			 {
-			    *tp = bitrepl[(bits >> (7 - bj)) & 0x1];
-			    tp++;
-			 }
-		       dp++;
-		    }
-		  for (x = 0; x < w; x++)
-		    {
-		       *p2 = *p1;
-		       p1++;
-		       p2++;
-		    }
-	       }
-	  }
-     }
-//   fh = h;
   fh = fg->fi->max_h;
   tex = evas_gl_common_texture_alpha_new(gc, ndata, w, h, fh);
-   if (!tex) return NULL;
+   if (!tex) goto done;
   tex->sx1 = ((double)(tex->x)) / (double)tex->pt->w;
   tex->sy1 = ((double)(tex->y)) / (double)tex->pt->h;
   tex->sx2 = ((double)(tex->x + tex->w)) / (double)tex->pt->w;
   tex->sy2 = ((double)(tex->y + tex->h)) / (double)tex->pt->h;
   tex->fglyph = fg;
   gc->font_glyph_textures = eina_list_append(gc->font_glyph_textures, tex);
+done:
+   free(data);
   return tex;
 }