Eina utf8: Made the utf8 parser tests more complete.

Added standalone continuation bytes tests. Added isolated starting sequences tests. Added incomplete sequences tests. Added overlong representations tests. And I think that's it. Still need to add tests for surrogate pairs. SVN revision: 57123
2011-02-17 11:49:37 +00:00 · 2011-02-17 11:49:37 +00:00 · 49a33976cc
parent 1c492ce551
commit 49a33976cc
1 changed files with 134 additions and 0 deletions
--- a/legacy/eina/src/tests/eina_test_ustr.c
+++ b/legacy/eina/src/tests/eina_test_ustr.c
@ -229,6 +229,7 @@ END_TEST
 START_TEST(eina_unicode_utf8)
 {
   int ind;
+   unsigned char ch;
   eina_init();

   /* Valid utf-8 cases */
@ -289,9 +290,142 @@ START_TEST(eina_unicode_utf8)
           (ind != 4));

   /* Error cases */
+   /* Standalone continuation bytes */
   ind = 0;
   fail_if((eina_unicode_utf8_get_next("\x80", &ind) != 0xDC80) ||
           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xBF", &ind) != 0xDCBF) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\x80\xBF", &ind) != 0xDC80) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xBF\x80", &ind) != 0xDCBF) ||
+           (ind != 1));
+   /* All possible continuation bytes */
+   for (ch = 0x80 ; ch <= 0xBF ; ch++)
+     {
+        char buf[] = {ch, 0};
+        ind = 0;
+        fail_if((eina_unicode_utf8_get_next(buf, &ind) != (0xDC00 | ch)) ||
+                (ind != 1));
+     }
+
+   /* Isolated starting sequences.
+    *
+    * _FIRST_SEQUENCES(start, end) builds a stack buffer (alloca) holding
+    * every byte value from start to end, each one followed by a space, then
+    * walks that buffer with eina_unicode_utf8_get_next(). The test fails
+    * unless each lead byte comes back as (0xDC00 | byte) and each space as
+    * 0x20, with ind advancing by exactly one per call.
+    *
+    * Uses the enclosing test's 'ch' and 'ind' variables. The final byte of
+    * the allocation is never written; the checks stop at the last space.
+    * NOTE(review): assumes the parser never reads past that last space;
+    * confirm against the parser implementation. */
+#define _FIRST_SEQUENCES(start, end) \
+   do \
+     { \
+        int i; \
+        char *buf = alloca(((end - start + 1) * 2) + 1); \
+        for (i = 0, ch = start ; ch <= end ; i++, ch++) \
+          { \
+             buf[i * 2] = ch; \
+             buf[(i * 2) + 1] = ' '; \
+          } \
+        ind = 0; \
+        for (i = 0, ch = start ; ch <= end ; ch++) \
+          { \
+             fail_if((eina_unicode_utf8_get_next(buf, &ind) != (0xDC00 | ch)) || \
+                     (ind != ++i)); \
+             fail_if((eina_unicode_utf8_get_next(buf, &ind) != 0x20) || \
+                     (ind != ++i)); \
+          } \
+     } \
+   while (0)
+   /* all first bytes of 2-byte sequences separated by spaces. */
+   _FIRST_SEQUENCES(0xC0, 0xDF);
+   /* all first bytes of 3-byte sequences separated by spaces. */
+   _FIRST_SEQUENCES(0xE0, 0xEF);
+   /* all first bytes of 4-byte sequences separated by spaces. */
+   _FIRST_SEQUENCES(0xF0, 0xF7);
+   /* all first bytes of 5-byte sequences separated by spaces. */
+   _FIRST_SEQUENCES(0xF8, 0xFB);
+   /* all first bytes of 6-byte sequences separated by spaces. */
+   _FIRST_SEQUENCES(0xFC, 0xFD);
+
+   /* Incomplete sequences.
+    *
+    * _INCOMPLETE_SEQUENCES(first, conti, len):
+    *   first - the lead (first) byte of the utf8 sequence
+    *   conti - the byte value used for every continuation byte
+    *   len   - the correct (complete) length of the sequence
+    *
+    * For every truncated length i in [1, len), builds a NUL-terminated
+    * buffer made of first followed by i - 1 copies of conti. The first call
+    * to eina_unicode_utf8_get_next() must return (0xDC00 | first); every
+    * later call, until one returns 0, must return (0xDC00 | conti); after
+    * that ind must equal i.
+    *
+    * Uses the enclosing test's 'ind' variable. */
+#define _INCOMPLETE_SEQUENCES(first, conti, len) \
+   do \
+     { \
+        int i, j; \
+        char *buf = alloca(len + 1); \
+        i = 0; \
+        buf[i++] = first; \
+        for ( ; i < len ; i++) \
+          { \
+             Eina_Unicode val; \
+             for (j = 1 ; j < i ; j++) \
+               { \
+                  buf[j] = conti; \
+               } \
+             buf[j] = 0; \
+             ind = 0; \
+             fail_if( \
+                (eina_unicode_utf8_get_next(buf, &ind) != (0xDC00 | first))); \
+             while ((val = eina_unicode_utf8_get_next(buf, &ind))) \
+               { \
+                  fail_if(val != (0xDC00 | conti)); \
+               } \
+             fail_if(ind != i); \
+          } \
+     } \
+   while (0)
+
+   /* Sequences with missing continuation */
+   _INCOMPLETE_SEQUENCES(0xC0, 0x81, 2);
+   _INCOMPLETE_SEQUENCES(0xDF, 0xBF, 2);
+   _INCOMPLETE_SEQUENCES(0xE0, 0x81, 3);
+   _INCOMPLETE_SEQUENCES(0xEF, 0xBF, 3);
+   _INCOMPLETE_SEQUENCES(0xF0, 0x81, 4);
+   _INCOMPLETE_SEQUENCES(0xF7, 0xBF, 4);
+   _INCOMPLETE_SEQUENCES(0xF8, 0x81, 5);
+   _INCOMPLETE_SEQUENCES(0xFB, 0xBF, 5);
+   _INCOMPLETE_SEQUENCES(0xFC, 0x81, 6);
+   _INCOMPLETE_SEQUENCES(0xFD, 0xBF, 6);
+
+   /* Impossible bytes */
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xFE", &ind) != 0xDCFE) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xFF", &ind) != 0xDCFF) ||
+           (ind != 1));
+
+   /* Overlong sequences */
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xC0\xAF", &ind) != 0xDCC0) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xE0\x80\xAF", &ind) != 0xDCE0) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xF0\x80\x80\xAF", &ind) != 0xDCF0) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xF8\x80\x80\x80\xAF", &ind) != 0xDCF8) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xFC\x80\x80\x80\x80\xAF", &ind) != 0xDCFC) ||
+           (ind != 1));
+
+   /* Maximum overlong sequences */
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xC1\xBF", &ind) != 0xDCC1) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xE0\x9F\xBF", &ind) != 0xDCE0) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xF0\x8F\xBF\xBF", &ind) != 0xDCF0) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xF8\x87\xBF\xBF\xBF", &ind) != 0xDCF8) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xFC\x83\xBF\xBF\xBF\xBF", &ind) != 0xDCFC) ||
+           (ind != 1));
   /* Add some more error cases here */

   /* Just to cover prev/len. General utf-8 parsing was covered above */