From c97cb203c839c4aa2c046c7420a375a271679563 Mon Sep 17 00:00:00 2001
From: tasn <tasn>
Date: Thu, 17 Feb 2011 11:49:37 +0000
Subject: [PATCH] Eina utf8: Made the utf8 parser tests more complete. Added
 standalone continuation bytes tests. Added isolated starting sequences tests.
 Added incomplete sequences tests. Added overlong representations tests. And I
 think that's it. Still need to add tests for surrogate pairs.

git-svn-id: http://svn.enlightenment.org/svn/e/trunk/eina@57123 7cbeb6ba-43b4-40fd-8cce-4c39aea84d33
---
 src/tests/eina_test_ustr.c | 134 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 134 insertions(+)
diff --git a/src/tests/eina_test_ustr.c b/src/tests/eina_test_ustr.c
index 54c5f10..9b845a9 100644
--- a/src/tests/eina_test_ustr.c
+++ b/src/tests/eina_test_ustr.c
@@ -229,6 +229,7 @@ END_TEST
 START_TEST(eina_unicode_utf8)
 {
    int ind;
+   unsigned char ch;
    eina_init();
 
    /* Valid utf-8 cases */
@@ -289,9 +290,142 @@ START_TEST(eina_unicode_utf8)
            (ind != 4));
 
    /* Error cases */
+   /* Standalone continuation bytes */
    ind = 0;
    fail_if((eina_unicode_utf8_get_next("\x80", &ind) != 0xDC80) ||
            (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xBF", &ind) != 0xDCBF) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\x80\xBF", &ind) != 0xDC80) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xBF\x80", &ind) != 0xDCBF) ||
+           (ind != 1));
+   /* All possible continuation bytes */
+   for (ch = 0x80 ; ch <= 0xBF ; ch++)
+     {
+        char buf[] = {ch, 0};
+        ind = 0;
+        fail_if((eina_unicode_utf8_get_next(buf, &ind) != (0xDC00 | ch)) ||
+                (ind != 1));
+     }
+
+   /* Isolated starting sequences */
+#define _FIRST_SEQUENCES(start, end) \
+   do \
+     { \
+        int i; \
+        char *buf = alloca(((end - start + 1) * 2) + 1); \
+        for (i = 0, ch = start ; ch <= end ; i++, ch++) \
+          { \
+             buf[i * 2] = ch; \
+             buf[(i * 2) + 1] = ' '; \
+          } \
+        ind = 0; \
+        for (i = 0, ch = start ; ch <= end ; ch++) \
+          { \
+             fail_if((eina_unicode_utf8_get_next(buf, &ind) != (0xDC00 | ch)) || \
+                     (ind != ++i)); \
+             fail_if((eina_unicode_utf8_get_next(buf, &ind) != 0x20) || \
+                     (ind != ++i)); \
+          } \
+     } \
+   while (0)
+   /* all first bytes of 2-byte sequences separated by spaces. */
+   _FIRST_SEQUENCES(0xC0, 0xDF);
+   /* all first bytes of 3-byte sequences separated by spaces. */
+   _FIRST_SEQUENCES(0xE0, 0xEF);
+   /* all first bytes of 4-byte sequences separated by spaces. */
+   _FIRST_SEQUENCES(0xF0, 0xF7);
+   /* all first bytes of 5-byte sequences separated by spaces. */
+   _FIRST_SEQUENCES(0xF8, 0xFB);
+   /* all first bytes of 6-byte sequences separated by spaces. */
+   _FIRST_SEQUENCES(0xFC, 0xFD);
+
+   /* Incomplete sequences: first is the first utf8 byte, conti is the
+    * continuation byte that follows it, len is the correct length */
+#define _INCOMPLETE_SEQUENCES(first, conti, len) \
+   do \
+     { \
+        int i, j; \
+        char *buf = alloca(len + 1); \
+        i = 0; \
+        buf[i++] = first; \
+        for ( ; i < len ; i++) \
+          { \
+             Eina_Unicode val; \
+             for (j = 1 ; j < i ; j++) \
+               { \
+                  buf[j] = conti; \
+               } \
+             buf[j] = 0; \
+             ind = 0; \
+             fail_if( \
+                (eina_unicode_utf8_get_next(buf, &ind) != (0xDC00 | first))); \
+             while ((val = eina_unicode_utf8_get_next(buf, &ind))) \
+               { \
+                  fail_if(val != (0xDC00 | conti)); \
+               } \
+             fail_if(ind != i); \
+          } \
+     } \
+   while (0)
+
+   /* Sequences with missing continuation */
+   _INCOMPLETE_SEQUENCES(0xC0, 0x81, 2);
+   _INCOMPLETE_SEQUENCES(0xDF, 0xBF, 2);
+   _INCOMPLETE_SEQUENCES(0xE0, 0x81, 3);
+   _INCOMPLETE_SEQUENCES(0xEF, 0xBF, 3);
+   _INCOMPLETE_SEQUENCES(0xF0, 0x81, 4);
+   _INCOMPLETE_SEQUENCES(0xF7, 0xBF, 4);
+   _INCOMPLETE_SEQUENCES(0xF8, 0x81, 5);
+   _INCOMPLETE_SEQUENCES(0xFB, 0xBF, 5);
+   _INCOMPLETE_SEQUENCES(0xFC, 0x81, 6);
+   _INCOMPLETE_SEQUENCES(0xFD, 0xBF, 6);
+
+   /* Impossible bytes */
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xFE", &ind) != 0xDCFE) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xFF", &ind) != 0xDCFF) ||
+           (ind != 1));
+
+   /* Overlong sequences */
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xC0\xAF", &ind) != 0xDCC0) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xE0\x80\xAF", &ind) != 0xDCE0) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xF0\x80\x80\xAF", &ind) != 0xDCF0) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xF8\x80\x80\x80\xAF", &ind) != 0xDCF8) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xFC\x80\x80\x80\x80\xAF", &ind) != 0xDCFC) ||
+           (ind != 1));
+
+   /* Maximum overlong sequences */
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xC1\xBF", &ind) != 0xDCC1) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xE0\x9F\xBF", &ind) != 0xDCE0) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xF0\x8F\xBF\xBF", &ind) != 0xDCF0) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xF8\x87\xBF\xBF\xBF", &ind) != 0xDCF8) ||
+           (ind != 1));
+   ind = 0;
+   fail_if((eina_unicode_utf8_get_next("\xFC\x83\xBF\xBF\xBF\xBF", &ind) != 0xDCFC) ||
+           (ind != 1));
    /* Add some more error cases here */
 
    /* Just to cover prev/len. General utf-8 parsing was covered above */
-- 
2.7.4