Eina unicode: Added utf8 handling functions and also added Eina_Unicode<->UTF-8 conversions.
These are the functions from Evas_Encoding, renamed. I also added support for 6-byte UTF-8 sequences in the Eina_Unicode->UTF-8 conversion. SVN revision: 57093
This commit is contained in:
parent
129a2eb57b
commit
7214694dba
|
@ -13,3 +13,8 @@
|
|||
2011-02-16 Mike Blumenkrantz
|
||||
|
||||
* Added EINA_INLIST_FOREACH_SAFE
|
||||
|
||||
2011-02-16 Tom Hacohen
|
||||
|
||||
* Added eina_unicode_utf8* functions for utf8 string handling
|
||||
and conversions to and from Eina_Unicode
|
||||
|
|
|
@ -58,6 +58,16 @@ EAPI Eina_Unicode *eina_unicode_strncpy(Eina_Unicode *dest, const Eina_Unicode *
|
|||
|
||||
EAPI Eina_Unicode *eina_unicode_escape(const Eina_Unicode *str) EINA_ARG_NONNULL(1) EINA_MALLOC EINA_WARN_UNUSED_RESULT;
|
||||
|
||||
/* UTF-8 Handling */
|
||||
|
||||
/* Decodes the code point that starts at buf[*iindex] and advances *iindex
 * past it; returns 0 (without advancing) at the NUL terminator. */
EAPI Eina_Unicode eina_unicode_utf8_get_next(const char *buf, int *iindex) EINA_ARG_NONNULL(1, 2);
|
||||
/* Decodes the code point at buf[*iindex] and moves *iindex back to the
 * start of the preceding code point (unchanged when *iindex is <= 0). */
EAPI Eina_Unicode eina_unicode_utf8_get_prev(const char *buf, int *iindex) EINA_ARG_NONNULL(1, 2);
|
||||
/* Returns the number of code points (not bytes) in a NUL-terminated
 * UTF-8 string. */
EAPI int eina_unicode_utf8_get_len(const char *buf) EINA_ARG_NONNULL(1);
|
||||
|
||||
/* Converts UTF-8 to a newly allocated, 0-terminated Eina_Unicode string;
 * if _len is not NULL it receives the number of code points. */
EAPI Eina_Unicode *eina_unicode_utf8_to_unicode(const char *utf, int *_len) EINA_WARN_UNUSED_RESULT EINA_ARG_NONNULL(1) EINA_MALLOC;
|
||||
|
||||
/* Converts an Eina_Unicode string to a newly allocated, NUL-terminated
 * UTF-8 string; if _len is not NULL it receives the byte length. */
EAPI char * eina_unicode_unicode_to_utf8(const Eina_Unicode *uni, int *_len) EINA_WARN_UNUSED_RESULT EINA_ARG_NONNULL(1) EINA_MALLOC;
|
||||
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
|
|
@ -185,3 +185,310 @@ eina_unicode_escape(const Eina_Unicode *str)
|
|||
return s2;
|
||||
}
|
||||
|
||||
/* UTF-8 Handling */
|
||||
|
||||
/* Upper bound on the number of UTF-8 bytes one Eina_Unicode can encode to. */
#define EINA_UNICODE_UTF8_BYTES_PER_CHAR 6
/* The replacement range that will be used for bad utf8 chars: an
 * undecodable byte B is reported as ERROR_REPLACEMENT_BASE | B. */
#define ERROR_REPLACEMENT_BASE  0xDC80
#define ERROR_REPLACEMENT_END   0xDCFF
/* True for byte values 192, 193 and 245..255.
 * NOTE: the argument is evaluated more than once; pass a side-effect-free
 * expression. */
#define IS_INVALID_BYTE(x)      (((x) == 192) || ((x) == 193) || ((x) >= 245))
/* True when the byte has the form 10xxxxxx (a UTF-8 continuation byte). */
#define IS_CONTINUATION_BYTE(x) (((x) & 0xC0) == 0x80)
|
||||
|
||||
/**
|
||||
* Reads UTF8 bytes from @buf, starting at *@index and returns
|
||||
* the decoded code point at iindex offset, and advances iindex
|
||||
* to the next code point after this. iindex is always advanced,
|
||||
* unless if the advancement is after the NULL.
|
||||
* On error: return a codepoint between DC80 to DCFF where the low 8 bits
|
||||
* are the byte's value.
|
||||
*
|
||||
* @param buf the string
|
||||
* @param iindex the index to look at and return by.
|
||||
* @return the codepoint found.
|
||||
* @since 1.1.0
|
||||
*/
|
||||
EAPI Eina_Unicode
|
||||
eina_unicode_utf8_get_next(const char *buf, int *iindex)
|
||||
{
|
||||
/* Note: we don't currently handle overlong forms and some other
|
||||
* error cases. */
|
||||
int ind = *iindex;
|
||||
Eina_Unicode r;
|
||||
unsigned char d;
|
||||
|
||||
/* if this char is the null terminator, exit */
|
||||
if ((d = buf[ind++]) == 0) return 0;
|
||||
|
||||
if ((d & 0x80) == 0)
|
||||
{ // 1 byte (7bit) - 0xxxxxxx
|
||||
*iindex = ind;
|
||||
return d;
|
||||
}
|
||||
if ((d & 0xe0) == 0xc0)
|
||||
{ // 2 byte (11bit) - 110xxxxx 10xxxxxx
|
||||
r = (d & 0x1f) << 6;
|
||||
if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
|
||||
!IS_CONTINUATION_BYTE(d)) goto error;
|
||||
r |= (d & 0x3f);
|
||||
if (!r) goto error;
|
||||
*iindex = ind;
|
||||
return r;
|
||||
}
|
||||
if ((d & 0xf0) == 0xe0)
|
||||
{ // 3 byte (16bit) - 1110xxxx 10xxxxxx 10xxxxxx
|
||||
r = (d & 0x0f) << 12;
|
||||
if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
|
||||
!IS_CONTINUATION_BYTE(d)) goto error;
|
||||
r |= (d & 0x3f) << 6;
|
||||
if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
|
||||
!IS_CONTINUATION_BYTE(d)) goto error;
|
||||
r |= (d & 0x3f);
|
||||
if (!r) goto error;
|
||||
*iindex = ind;
|
||||
return r;
|
||||
}
|
||||
if ((d & 0xf8) == 0xf0)
|
||||
{ // 4 byte (21bit) - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
r = (d & 0x07) << 18;
|
||||
if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
|
||||
!IS_CONTINUATION_BYTE(d)) goto error;
|
||||
r |= (d & 0x3f) << 12;
|
||||
if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
|
||||
!IS_CONTINUATION_BYTE(d)) goto error;
|
||||
r |= (d & 0x3f) << 6;
|
||||
if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
|
||||
!IS_CONTINUATION_BYTE(d)) goto error;
|
||||
r |= (d & 0x3f);
|
||||
if (!r) goto error;
|
||||
*iindex = ind;
|
||||
return r;
|
||||
}
|
||||
if ((d & 0xfc) == 0xf8)
|
||||
{ // 5 byte (26bit) - 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
r = (d & 0x03) << 24;
|
||||
if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
|
||||
!IS_CONTINUATION_BYTE(d)) goto error;
|
||||
r |= (d & 0x3f) << 18;
|
||||
if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
|
||||
!IS_CONTINUATION_BYTE(d)) goto error;
|
||||
r |= (d & 0x3f) << 12;
|
||||
if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
|
||||
!IS_CONTINUATION_BYTE(d)) goto error;
|
||||
r |= (d & 0x3f) << 6;
|
||||
if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
|
||||
!IS_CONTINUATION_BYTE(d)) goto error;
|
||||
r |= (d & 0x3f);
|
||||
if (!r) goto error;
|
||||
*iindex = ind;
|
||||
return r;
|
||||
}
|
||||
if ((d & 0xfe) == 0xfc)
|
||||
{ // 6 byte (31bit) - 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
r = (d & 0x01) << 30;
|
||||
if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
|
||||
!IS_CONTINUATION_BYTE(d)) goto error;
|
||||
r |= (d & 0x3f) << 24;
|
||||
if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
|
||||
!IS_CONTINUATION_BYTE(d)) goto error;
|
||||
r |= (d & 0x3f) << 18;
|
||||
if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
|
||||
!IS_CONTINUATION_BYTE(d)) goto error;
|
||||
r |= (d & 0x3f) << 12;
|
||||
if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
|
||||
!IS_CONTINUATION_BYTE(d)) goto error;
|
||||
r |= (d & 0x3f) << 6;
|
||||
if (((d = buf[ind++]) == 0) || IS_INVALID_BYTE(d) ||
|
||||
!IS_CONTINUATION_BYTE(d)) goto error;
|
||||
r |= (d & 0x3f);
|
||||
if (!r) goto error;
|
||||
*iindex = ind;
|
||||
return r;
|
||||
}
|
||||
|
||||
/* Gets here where there was an error and we want to replace the char
|
||||
* we just use the invalid unicode codepoints 8 lower bits represent
|
||||
* the original char */
|
||||
error:
|
||||
d = buf[*iindex];
|
||||
(*iindex)++;
|
||||
return ERROR_REPLACEMENT_BASE | d;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads UTF8 bytes from @buf, starting at *@iindex and returns
|
||||
* the decoded code point at iindex offset, and moves iindex
|
||||
* to the previous code point. iindex is always moved, as long
|
||||
* as it's not past the start of the string.
|
||||
* On error: return a codepoint between DC80 to DCFF where the low 8 bits
|
||||
* are the byte's value.
|
||||
*
|
||||
* @param buf the string
|
||||
* @param iindex the index to look at and return by.
|
||||
* @return the codepoint found.
|
||||
* @since 1.1.0
|
||||
*/
|
||||
EAPI Eina_Unicode
|
||||
eina_unicode_utf8_get_prev(const char *buf, int *iindex)
|
||||
{
|
||||
int r;
|
||||
int ind = *iindex;
|
||||
/* First obtain the codepoint at iindex */
|
||||
r = eina_unicode_utf8_get_next(buf, &ind);
|
||||
|
||||
/* although when ind == 0 there's no previous char, we still want to get
|
||||
* the current char */
|
||||
if (*iindex <= 0)
|
||||
return r;
|
||||
|
||||
/* Next advance iindex to previous codepoint */
|
||||
ind = *iindex;
|
||||
ind--;
|
||||
while ((ind > 0) && ((buf[ind] & 0xc0) == 0x80))
|
||||
ind--;
|
||||
|
||||
*iindex = ind;
|
||||
return r;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of unicode characters in the string. That is,
|
||||
* the number of Eina_Unicodes it'll take to store this string in
|
||||
* an Eina_Unicode string.
|
||||
*
|
||||
* @param buf the string
|
||||
* @return the number of unicode characters (not bytes) in the string
|
||||
* @since 1.1.0
|
||||
*/
|
||||
EAPI int
|
||||
eina_unicode_utf8_get_len(const char *buf)
|
||||
{
|
||||
/* returns the number of utf8 characters (not bytes) in the string */
|
||||
int i = 0, len = 0;
|
||||
|
||||
while (eina_unicode_utf8_get_next(buf, &i))
|
||||
len++;
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a utf-8 string to a newly allocated Eina_Unicode string.
|
||||
*
|
||||
* @param utf the string in utf-8
|
||||
* @param _len the length of the returned Eina_Unicode string.
|
||||
* @return the newly allocated Eina_Unicode string.
|
||||
* @since 1.1.0
|
||||
*/
|
||||
EAPI Eina_Unicode *
|
||||
eina_unicode_utf8_to_unicode(const char *utf, int *_len)
|
||||
{
|
||||
/* FIXME: Should optimize! */
|
||||
int len, i;
|
||||
int ind;
|
||||
Eina_Unicode *buf, *uind;
|
||||
|
||||
len = eina_unicode_utf8_get_len(utf);
|
||||
if (_len)
|
||||
*_len = len;
|
||||
buf = (Eina_Unicode *) calloc(sizeof(Eina_Unicode), (len + 1));
|
||||
if (!buf) return buf;
|
||||
|
||||
for (i = 0, ind = 0, uind = buf ; i < len ; i++, uind++)
|
||||
{
|
||||
*uind = eina_unicode_utf8_get_next(utf, &ind);
|
||||
}
|
||||
|
||||
return buf;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts an Eina_Unicode string to a newly allocated utf-8 string.
|
||||
*
|
||||
* @param uni the Eina_Unicode string
|
||||
* @param _len the length byte length of the return utf8 string.
|
||||
* @return the newly allocated utf-8 string.
|
||||
* @since 1.1.0
|
||||
*/
|
||||
EAPI char *
|
||||
eina_unicode_unicode_to_utf8(const Eina_Unicode *uni, int *_len)
|
||||
{
|
||||
char *buf;
|
||||
const Eina_Unicode *uind;
|
||||
char *ind;
|
||||
int ulen, len;
|
||||
|
||||
ulen = eina_unicode_strlen(uni);
|
||||
buf = (char *) calloc(ulen + 1, EINA_UNICODE_UTF8_BYTES_PER_CHAR);
|
||||
|
||||
len = 0;
|
||||
for (uind = uni, ind = buf ; *uind ; uind++)
|
||||
{
|
||||
if (*uind <= 0x7F) /* 1 byte char */
|
||||
{
|
||||
*ind++ = *uind;
|
||||
len += 1;
|
||||
}
|
||||
else if (*uind <= 0x7FF) /* 2 byte char */
|
||||
{
|
||||
*ind++ = 0xC0 | (unsigned char) (*uind >> 6);
|
||||
*ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
|
||||
len += 2;
|
||||
}
|
||||
else if (*uind <= 0xFFFF) /* 3 byte char */
|
||||
{
|
||||
/* If it's a special replacement codepoint */
|
||||
if (*uind >= ERROR_REPLACEMENT_BASE &&
|
||||
*uind <= ERROR_REPLACEMENT_END)
|
||||
{
|
||||
*ind++ = *uind & 0xFF;
|
||||
len += 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
*ind++ = 0xE0 | (unsigned char) (*uind >> 12);
|
||||
*ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
|
||||
*ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
|
||||
len += 3;
|
||||
}
|
||||
}
|
||||
else if (*uind <= 0x1FFFFF) /* 4 byte char */
|
||||
{
|
||||
*ind++ = 0xF0 | (unsigned char) ((*uind >> 18) & 0x07);
|
||||
*ind++ = 0x80 | (unsigned char) ((*uind >> 12) & 0x3F);
|
||||
*ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
|
||||
*ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
|
||||
len += 4;
|
||||
}
|
||||
else if (*uind <= 0x3FFFFFF) /* 5 byte char */
|
||||
{
|
||||
*ind++ = 0xF8 | (unsigned char) ((*uind >> 24) & 0x03);
|
||||
*ind++ = 0x80 | (unsigned char) ((*uind >> 18) & 0x3F);
|
||||
*ind++ = 0x80 | (unsigned char) ((*uind >> 12) & 0x3F);
|
||||
*ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
|
||||
*ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
|
||||
len += 5;
|
||||
}
|
||||
else if (*uind <= 0x7FFFFFFF) /* 6 byte char */
|
||||
{
|
||||
*ind++ = 0xFC | (unsigned char) ((*uind >> 30) & 0x01);
|
||||
*ind++ = 0x80 | (unsigned char) ((*uind >> 24) & 0x3F);
|
||||
*ind++ = 0x80 | (unsigned char) ((*uind >> 18) & 0x3F);
|
||||
*ind++ = 0x80 | (unsigned char) ((*uind >> 12) & 0x3F);
|
||||
*ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
|
||||
*ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
|
||||
len += 6;
|
||||
}
|
||||
else /* error */
|
||||
{
|
||||
/* Do something */
|
||||
}
|
||||
}
|
||||
buf = realloc(buf, len + 1);
|
||||
buf[len] = '\0';
|
||||
if (_len)
|
||||
*_len = len;
|
||||
return buf;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue