forked from enlightenment/efl
Eina utf8: Made the utf8 parser tests more complete.
Added standalone continuation bytes tests. Added isolated starting sequences tests. Added incomplete sequences tests. Added overlong representations tests. And I think that's it. Still need to add tests for surrogate pairs. SVN revision: 57123
This commit is contained in:
parent
1c492ce551
commit
49a33976cc
|
@ -229,6 +229,7 @@ END_TEST
|
|||
START_TEST(eina_unicode_utf8)
|
||||
{
|
||||
int ind;
|
||||
unsigned char ch;
|
||||
eina_init();
|
||||
|
||||
/* Valid utf-8 cases */
|
||||
|
@ -289,9 +290,142 @@ START_TEST(eina_unicode_utf8)
|
|||
(ind != 4));
|
||||
|
||||
/* Error cases */
|
||||
/* Standalone continuation bytes */
|
||||
ind = 0;
|
||||
fail_if((eina_unicode_utf8_get_next("\x80", &ind) != 0xDC80) ||
|
||||
(ind != 1));
|
||||
ind = 0;
|
||||
fail_if((eina_unicode_utf8_get_next("\xBF", &ind) != 0xDCBF) ||
|
||||
(ind != 1));
|
||||
ind = 0;
|
||||
fail_if((eina_unicode_utf8_get_next("\x80\xBF", &ind) != 0xDC80) ||
|
||||
(ind != 1));
|
||||
ind = 0;
|
||||
fail_if((eina_unicode_utf8_get_next("\xBF\x80", &ind) != 0xDCBF) ||
|
||||
(ind != 1));
|
||||
/* All possible continuation bytes */
|
||||
for (ch = 0x80 ; ch <= 0xBF ; ch++)
|
||||
{
|
||||
char buf[] = {ch, 0};
|
||||
ind = 0;
|
||||
fail_if((eina_unicode_utf8_get_next(buf, &ind) != (0xDC00 | ch)) ||
|
||||
(ind != 1));
|
||||
}
|
||||
|
||||
/* Isolated starting sequences */
|
||||
#define _FIRST_SEQUENCES(start, end) \
|
||||
do \
|
||||
{ \
|
||||
int i; \
|
||||
char *buf = alloca(((end - start + 1) * 2) + 1); \
|
||||
for (i = 0, ch = start ; ch <= end ; i++, ch++) \
|
||||
{ \
|
||||
buf[i * 2] = ch; \
|
||||
buf[(i * 2) + 1] = ' '; \
|
||||
} \
|
||||
ind = 0; \
|
||||
for (i = 0, ch = start ; ch <= end ; ch++) \
|
||||
{ \
|
||||
fail_if((eina_unicode_utf8_get_next(buf, &ind) != (0xDC00 | ch)) || \
|
||||
(ind != ++i)); \
|
||||
fail_if((eina_unicode_utf8_get_next(buf, &ind) != 0x20) || \
|
||||
(ind != ++i)); \
|
||||
} \
|
||||
} \
|
||||
while (0)
|
||||
/* all first bytes of 2-byte sequences separated by spaces. */
|
||||
_FIRST_SEQUENCES(0xC0, 0xDF);
|
||||
/* all first bytes of 3-byte sequences separated by spaces. */
|
||||
_FIRST_SEQUENCES(0xE0, 0xEF);
|
||||
/* all first bytes of 4-byte sequences separated by spaces. */
|
||||
_FIRST_SEQUENCES(0xF0, 0xF7);
|
||||
/* all first bytes of 5-byte sequences separated by spaces. */
|
||||
_FIRST_SEQUENCES(0xF8, 0xFB);
|
||||
/* all first bytes of 6-byte sequences separated by spaces. */
|
||||
_FIRST_SEQUENCES(0xFC, 0xFD);
|
||||
|
||||
/* Incomplete sequences: first means the first utf8 char, len means
|
||||
* the correct length */
|
||||
#define _INCOMPLETE_SEQUENCES(first, conti, len) \
|
||||
do \
|
||||
{ \
|
||||
int i, j; \
|
||||
char *buf = alloca(len + 1); \
|
||||
i = 0; \
|
||||
buf[i++] = first; \
|
||||
for ( ; i < len ; i++) \
|
||||
{ \
|
||||
Eina_Unicode val; \
|
||||
for (j = 1 ; j < i ; j++) \
|
||||
{ \
|
||||
buf[j] = conti; \
|
||||
} \
|
||||
buf[j] = 0; \
|
||||
ind = 0; \
|
||||
fail_if( \
|
||||
(eina_unicode_utf8_get_next(buf, &ind) != (0xDC00 | first))); \
|
||||
while ((val = eina_unicode_utf8_get_next(buf, &ind))) \
|
||||
{ \
|
||||
fail_if(val != (0xDC00 | conti)); \
|
||||
} \
|
||||
fail_if(ind != i); \
|
||||
} \
|
||||
} \
|
||||
while (0)
|
||||
|
||||
/* Sequences with missing continuation */
|
||||
_INCOMPLETE_SEQUENCES(0xC0, 0x81, 2);
|
||||
_INCOMPLETE_SEQUENCES(0xDF, 0xBF, 2);
|
||||
_INCOMPLETE_SEQUENCES(0xE0, 0x81, 3);
|
||||
_INCOMPLETE_SEQUENCES(0xEF, 0xBF, 3);
|
||||
_INCOMPLETE_SEQUENCES(0xF0, 0x81, 4);
|
||||
_INCOMPLETE_SEQUENCES(0xF7, 0xBF, 4);
|
||||
_INCOMPLETE_SEQUENCES(0xF8, 0x81, 5);
|
||||
_INCOMPLETE_SEQUENCES(0xFB, 0xBF, 5);
|
||||
_INCOMPLETE_SEQUENCES(0xFC, 0x81, 6);
|
||||
_INCOMPLETE_SEQUENCES(0xFD, 0xBF, 6);
|
||||
|
||||
/* Impossible bytes */
|
||||
ind = 0;
|
||||
fail_if((eina_unicode_utf8_get_next("\xFE", &ind) != 0xDCFE) ||
|
||||
(ind != 1));
|
||||
ind = 0;
|
||||
fail_if((eina_unicode_utf8_get_next("\xFF", &ind) != 0xDCFF) ||
|
||||
(ind != 1));
|
||||
|
||||
/* Overlong sequences */
|
||||
ind = 0;
|
||||
fail_if((eina_unicode_utf8_get_next("\xC0\xAF", &ind) != 0xDCC0) ||
|
||||
(ind != 1));
|
||||
ind = 0;
|
||||
fail_if((eina_unicode_utf8_get_next("\xE0\x80\xAF", &ind) != 0xDCE0) ||
|
||||
(ind != 1));
|
||||
ind = 0;
|
||||
fail_if((eina_unicode_utf8_get_next("\xF0\x80\x80\xAF", &ind) != 0xDCF0) ||
|
||||
(ind != 1));
|
||||
ind = 0;
|
||||
fail_if((eina_unicode_utf8_get_next("\xF8\x80\x80\x80\xAF", &ind) != 0xDCF8) ||
|
||||
(ind != 1));
|
||||
ind = 0;
|
||||
fail_if((eina_unicode_utf8_get_next("\xFC\x80\x80\x80\x80\xAF", &ind) != 0xDCFC) ||
|
||||
(ind != 1));
|
||||
|
||||
/* Maximum overlong sequences */
|
||||
ind = 0;
|
||||
fail_if((eina_unicode_utf8_get_next("\xC1\xBF", &ind) != 0xDCC1) ||
|
||||
(ind != 1));
|
||||
ind = 0;
|
||||
fail_if((eina_unicode_utf8_get_next("\xE0\x9F\xBF", &ind) != 0xDCE0) ||
|
||||
(ind != 1));
|
||||
ind = 0;
|
||||
fail_if((eina_unicode_utf8_get_next("\xF0\x8F\xBF\xBF", &ind) != 0xDCF0) ||
|
||||
(ind != 1));
|
||||
ind = 0;
|
||||
fail_if((eina_unicode_utf8_get_next("\xF8\x87\xBF\xBF\xBF", &ind) != 0xDCF8) ||
|
||||
(ind != 1));
|
||||
ind = 0;
|
||||
fail_if((eina_unicode_utf8_get_next("\xFC\x83\xBF\xBF\xBF\xBF", &ind) != 0xDCFC) ||
|
||||
(ind != 1));
|
||||
/* Add some more error cases here */
|
||||
|
||||
/* Just to cover prev/len. General utf-8 parsing was covered above */
|
||||
|
|
Loading…
Reference in New Issue