From 7214694dbaaf2b456e072db1acd644671cb37a9e Mon Sep 17 00:00:00 2001 From: Tom Hacohen Date: Wed, 16 Feb 2011 15:43:25 +0000 Subject: [PATCH] Eina unicode: Added utf8 handling functions and also added Eina_Unicode<->UTF-8 conversions. It's the functions from Evas_Encoding that have been renamed. I also added support for 6byte UTF-8 conversion (Eina_Unicode->UTF-8). SVN revision: 57093 --- legacy/eina/ChangeLog | 5 + legacy/eina/src/include/eina_unicode.h | 10 + legacy/eina/src/lib/eina_unicode.c | 307 +++++++++++++++++++++++++ 3 files changed, 322 insertions(+) diff --git a/legacy/eina/ChangeLog b/legacy/eina/ChangeLog index 7705ad1f78..fdd1d0fe0d 100644 --- a/legacy/eina/ChangeLog +++ b/legacy/eina/ChangeLog @@ -13,3 +13,8 @@ 2011-02-16 Mike Blumenkrantz * Added EINA_INLIST_FOREACH_SAFE + +2011-02-16 Tom Hacohen + + * Added eina_unicode_utf8* functions for utf8 string handling + and conversions to and from Eina_Unicode diff --git a/legacy/eina/src/include/eina_unicode.h b/legacy/eina/src/include/eina_unicode.h index 152177b68a..a20c6452ca 100644 --- a/legacy/eina/src/include/eina_unicode.h +++ b/legacy/eina/src/include/eina_unicode.h @@ -58,6 +58,16 @@ EAPI Eina_Unicode *eina_unicode_strncpy(Eina_Unicode *dest, const Eina_Unicode * EAPI Eina_Unicode *eina_unicode_escape(const Eina_Unicode *str) EINA_ARG_NONNULL(1) EINA_MALLOC EINA_WARN_UNUSED_RESULT; +/* UTF-8 Handling */ + +EAPI Eina_Unicode eina_unicode_utf8_get_next(const char *buf, int *iindex) EINA_ARG_NONNULL(1, 2); +EAPI Eina_Unicode eina_unicode_utf8_get_prev(const char *buf, int *iindex) EINA_ARG_NONNULL(1, 2); +EAPI int eina_unicode_utf8_get_len(const char *buf) EINA_ARG_NONNULL(1); + +EAPI Eina_Unicode *eina_unicode_utf8_to_unicode(const char *utf, int *_len) EINA_WARN_UNUSED_RESULT EINA_ARG_NONNULL(1) EINA_MALLOC; + +EAPI char * eina_unicode_unicode_to_utf8(const Eina_Unicode *uni, int *_len) EINA_WARN_UNUSED_RESULT EINA_ARG_NONNULL(1) EINA_MALLOC; + /** * @} */ diff --git 
a/legacy/eina/src/lib/eina_unicode.c b/legacy/eina/src/lib/eina_unicode.c
index 6c8f7e9650..2b3dd1803f 100644
--- a/legacy/eina/src/lib/eina_unicode.c
+++ b/legacy/eina/src/lib/eina_unicode.c
@@ -185,3 +185,310 @@ eina_unicode_escape(const Eina_Unicode *str)
    return s2;
 }
 
+/* UTF-8 Handling */
+
+#define EINA_UNICODE_UTF8_BYTES_PER_CHAR 6
+/* The replacement range that will be used for bad utf8 chars. */
+#define ERROR_REPLACEMENT_BASE  0xDC80
+#define ERROR_REPLACEMENT_END   0xDCFF
+#define IS_INVALID_BYTE(x)      ((x == 192) || (x == 193) || (x >= 245))
+#define IS_CONTINUATION_BYTE(x) ((x & 0xC0) == 0x80)
+
+/**
+ * Decodes the UTF-8 sequence of buf that begins at byte offset *iindex
+ * and returns its code point. On success *iindex is moved to the first
+ * byte after the sequence. When the byte at *iindex is the NUL
+ * terminator, 0 is returned and *iindex is left untouched.
+ * On error: returns a codepoint between DC80 and DCFF whose low 8 bits
+ * are the value of the byte at *iindex, and *iindex is advanced by one
+ * byte only.
+ *
+ * @param buf the string
+ * @param iindex the index to look at and return by.
+ * @return the codepoint found.
+ * @since 1.1.0
+ */
+EAPI Eina_Unicode
+eina_unicode_utf8_get_next(const char *buf, int *iindex)
+{
+   /* Note: overlong forms and some other malformed sequences are
+    * currently not rejected. */
+   int pos = *iindex;
+   int trail;           /* continuation bytes still expected */
+   Eina_Unicode cp;
+   unsigned char byte;
+
+   /* sitting on the null terminator: nothing to decode */
+   byte = buf[pos++];
+   if (byte == 0) return 0;
+
+   if ((byte & 0x80) == 0)
+     { // 1 byte (7bit) - 0xxxxxxx
+        *iindex = pos;
+        return byte;
+     }
+
+   /* The lead byte tells how many continuation bytes follow and carries
+    * the most significant payload bits. */
+   if ((byte & 0xe0) == 0xc0)
+     { // 2 byte (11bit) - 110xxxxx 10xxxxxx
+        cp = byte & 0x1f;
+        trail = 1;
+     }
+   else if ((byte & 0xf0) == 0xe0)
+     { // 3 byte (16bit) - 1110xxxx 10xxxxxx 10xxxxxx
+        cp = byte & 0x0f;
+        trail = 2;
+     }
+   else if ((byte & 0xf8) == 0xf0)
+     { // 4 byte (21bit) - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        cp = byte & 0x07;
+        trail = 3;
+     }
+   else if ((byte & 0xfc) == 0xf8)
+     { // 5 byte (26bit) - 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+        cp = byte & 0x03;
+        trail = 4;
+     }
+   else if ((byte & 0xfe) == 0xfc)
+     { // 6 byte (31bit) - 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+        cp = byte & 0x01;
+        trail = 5;
+     }
+   else
+     { // stray continuation byte, or 0xfe / 0xff
+        goto error;
+     }
+
+   /* Fold in 6 payload bits per continuation byte; stop at the first
+    * byte that is the terminator or not of the form 10xxxxxx. */
+   while (trail-- > 0)
+     {
+        byte = buf[pos++];
+        if ((byte == 0) || IS_INVALID_BYTE(byte) ||
+            !IS_CONTINUATION_BYTE(byte)) goto error;
+        cp = (cp << 6) | (byte & 0x3f);
+     }
+   /* a multi-byte sequence must not decode to NUL */
+   if (!cp) goto error;
+
+   *iindex = pos;
+   return cp;
+
+/* Gets here when there was an error and we want to replace the char:
+ * the 8 lower bits of the returned invalid codepoint hold the byte that
+ * was at the original index, and only that single byte is consumed. */
+error:
+   byte = buf[*iindex];
+   (*iindex)++;
+   return ERROR_REPLACEMENT_BASE | byte;
+}
+
+/**
+ * Decodes the code point of buf that begins at byte offset *iindex
+ * (exactly as eina_unicode_utf8_get_next() would) and then moves
+ * *iindex back to the start of the previous code point. iindex is
+ * always moved, unless it is already at (or before) the start of the
+ * string, in which case only the current code point is returned.
+ * On error: return a codepoint between DC80 to DCFF where the low 8 bits
+ * are the byte's value.
+ *
+ * @param buf the string
+ * @param iindex the index to look at and return by.
+ * @return the codepoint found.
+ * @since 1.1.0
+ */
+EAPI Eina_Unicode
+eina_unicode_utf8_get_prev(const char *buf, int *iindex)
+{
+   /* Holds the decoded code point, so it uses the code point type
+    * rather than a plain int. */
+   Eina_Unicode r;
+   int ind = *iindex;
+   /* First obtain the codepoint at iindex (works on a copy of the index,
+    * the forward movement is discarded) */
+   r = eina_unicode_utf8_get_next(buf, &ind);
+
+   /* although when ind == 0 there's no previous char, we still want to get
+    * the current char */
+   if (*iindex <= 0)
+     return r;
+
+   /* Next move iindex to the previous codepoint: step back one byte, then
+    * keep stepping while we are on a continuation byte (10xxxxxx) */
+   ind = *iindex;
+   ind--;
+   while ((ind > 0) && ((buf[ind] & 0xc0) == 0x80))
+     ind--;
+
+   *iindex = ind;
+   return r;
+}
+
+/**
+ * Returns the number of unicode characters in the string. That is,
+ * the number of Eina_Unicodes it'll take to store this string in
+ * an Eina_Unicode string. Bytes that cannot be decoded count as one
+ * character each, because the decoder turns each of them into a
+ * replacement codepoint.
+ *
+ * @param buf the string
+ * @return the number of unicode characters (not bytes) in the string
+ * @since 1.1.0
+ */
+EAPI int
+eina_unicode_utf8_get_len(const char *buf)
+{
+   /* returns the number of utf8 characters (not bytes) in the string */
+   int i = 0, len = 0;
+
+   /* the decoder returns 0 only when it reaches the null terminator */
+   while (eina_unicode_utf8_get_next(buf, &i))
+     len++;
+
+   return len;
+}
+
+/**
+ * Converts a utf-8 string to a newly allocated Eina_Unicode string.
+ * The result is zero terminated and has to be released with free().
+ *
+ * @param utf the string in utf-8
+ * @param _len if not NULL, receives the length (in Eina_Unicodes, without
+ * the terminator) of the returned string. It is written before the
+ * allocation, so it is set even when NULL is returned.
+ * @return the newly allocated Eina_Unicode string, or NULL if the
+ * allocation failed.
+ * @since 1.1.0
+ */
+EAPI Eina_Unicode *
+eina_unicode_utf8_to_unicode(const char *utf, int *_len)
+{
+   /* FIXME: Should optimize! (the string is decoded twice) */
+   int len, i;
+   int ind;
+   Eina_Unicode *buf, *uind;
+
+   len = eina_unicode_utf8_get_len(utf);
+   if (_len)
+     *_len = len;
+   /* calloc() takes the element count first and the element size second;
+    * the extra zeroed element is the terminator */
+   buf = calloc(len + 1, sizeof(Eina_Unicode));
+   if (!buf) return buf;
+
+   for (i = 0, ind = 0, uind = buf ; i < len ; i++, uind++)
+     {
+        *uind = eina_unicode_utf8_get_next(utf, &ind);
+     }
+
+   return buf;
+}
+
+/**
+ * Converts an Eina_Unicode string to a newly allocated utf-8 string.
+ * Codepoints in the DC80 to DCFF replacement range are written back as
+ * the single raw byte stored in their low 8 bits.
+ *
+ * @param uni the Eina_Unicode string
+ * @param _len if not NULL, receives the byte length of the returned
+ * utf8 string.
+ * @return the newly allocated utf-8 string.
+ * @since 1.1.0
+ */
+EAPI char *
+eina_unicode_unicode_to_utf8(const Eina_Unicode *uni, int *_len)
+{
+   char *buf, *shrunk;
+   const Eina_Unicode *uind;
+   char *ind;
+   int ulen, len;
+
+   /* Worst case every codepoint needs the full 6 bytes; the extra element
+    * leaves room for the terminator. calloc() zeroes the whole buffer. */
+   ulen = eina_unicode_strlen(uni);
+   buf = calloc(ulen + 1, EINA_UNICODE_UTF8_BYTES_PER_CHAR);
+   if (!buf)
+     {
+        /* Out of memory: report an empty result instead of writing
+         * through a NULL pointer below. */
+        if (_len)
+          *_len = 0;
+        return NULL;
+     }
+
+   len = 0;
+   for (uind = uni, ind = buf ; *uind ; uind++)
+     {
+        if (*uind <= 0x7F) /* 1 byte char */
+          {
+             *ind++ = *uind;
+             len += 1;
+          }
+        else if (*uind <= 0x7FF) /* 2 byte char */
+          {
+             *ind++ = 0xC0 | (unsigned char) (*uind >> 6);
+             *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
+             len += 2;
+          }
+        else if (*uind <= 0xFFFF) /* 3 byte char */
+          {
+             /* If it's a special replacement codepoint, emit the raw
+              * byte kept in its low 8 bits */
+             if (*uind >= ERROR_REPLACEMENT_BASE &&
+                 *uind <= ERROR_REPLACEMENT_END)
+               {
+                  *ind++ = *uind & 0xFF;
+                  len += 1;
+               }
+             else
+               {
+                  *ind++ = 0xE0 | (unsigned char) (*uind >> 12);
+                  *ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
+                  *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
+                  len += 3;
+               }
+          }
+        else if (*uind <= 0x1FFFFF) /* 4 byte char */
+          {
+             *ind++ = 0xF0 | (unsigned char) ((*uind >> 18) & 0x07);
+             *ind++ = 0x80 | (unsigned char) ((*uind >> 12) & 0x3F);
+             *ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
+             *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
+             len += 4;
+          }
+        else if (*uind <= 0x3FFFFFF) /* 5 byte char */
+          {
+             *ind++ = 0xF8 | (unsigned char) ((*uind >> 24) & 0x03);
+             *ind++ = 0x80 | (unsigned char) ((*uind >> 18) & 0x3F);
+             *ind++ = 0x80 | (unsigned char) ((*uind >> 12) & 0x3F);
+             *ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
+             *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
+             len += 5;
+          }
+        else if (*uind <= 0x7FFFFFFF) /* 6 byte char */
+          {
+             *ind++ = 0xFC | (unsigned char) ((*uind >> 30) & 0x01);
+             *ind++ = 0x80 | (unsigned char) ((*uind >> 24) & 0x3F);
+             *ind++ = 0x80 | (unsigned char) ((*uind >> 18) & 0x3F);
+             *ind++ = 0x80 | (unsigned char) ((*uind >> 12) & 0x3F);
+             *ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
+             *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
+             len += 6;
+          }
+        else /* error */
+          {
+             /* Does not fit in 6 bytes: nothing is written for this
+              * codepoint, it is skipped. */
+          }
+     }
+
+   /* Give back the unused tail. If the shrink fails the original buffer
+    * is still valid (and already zero terminated), so keep using it
+    * instead of losing the only pointer to it. */
+   shrunk = realloc(buf, len + 1);
+   if (shrunk)
+     buf = shrunk;
+   buf[len] = '\0';
+   if (_len)
+     *_len = len;
+   return buf;
+}
+
+
+