From c97cb203c839c4aa2c046c7420a375a271679563 Mon Sep 17 00:00:00 2001 From: tasn Date: Thu, 17 Feb 2011 11:49:37 +0000 Subject: [PATCH] Eina utf8: Made the utf8 parser tests more complete. Added standalone continuation bytes tests. Added isolated starting sequences tests. Added incomplete sequences tests. Added Overlong representations tests. And I think that's it. Still need to add tests for surrogate pairs. git-svn-id: http://svn.enlightenment.org/svn/e/trunk/eina@57123 7cbeb6ba-43b4-40fd-8cce-4c39aea84d33 --- src/tests/eina_test_ustr.c | 134 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) diff --git a/src/tests/eina_test_ustr.c b/src/tests/eina_test_ustr.c index 54c5f10..9b845a9 100644 --- a/src/tests/eina_test_ustr.c +++ b/src/tests/eina_test_ustr.c @@ -229,6 +229,7 @@ END_TEST START_TEST(eina_unicode_utf8) { int ind; + unsigned char ch; eina_init(); /* Valid utf-8 cases */ @@ -289,9 +290,142 @@ START_TEST(eina_unicode_utf8) (ind != 4)); /* Error cases */ + /* Standalone continuation bytes */ ind = 0; fail_if((eina_unicode_utf8_get_next("\x80", &ind) != 0xDC80) || (ind != 1)); + ind = 0; + fail_if((eina_unicode_utf8_get_next("\xBF", &ind) != 0xDCBF) || + (ind != 1)); + ind = 0; + fail_if((eina_unicode_utf8_get_next("\x80\xBF", &ind) != 0xDC80) || + (ind != 1)); + ind = 0; + fail_if((eina_unicode_utf8_get_next("\xBF\x80", &ind) != 0xDCBF) || + (ind != 1)); + /* All possible continuation bytes */ + for (ch = 0x80 ; ch <= 0xBF ; ch++) + { + char buf[] = {ch, 0}; + ind = 0; + fail_if((eina_unicode_utf8_get_next(buf, &ind) != (0xDC00 | ch)) || + (ind != 1)); + } + + /* Isolated starting sequences */ +#define _FIRST_SEQUENCES(start, end) \ + do \ + { \ + int i; \ + char *buf = alloca(((end - start + 1) * 2) + 1); \ + for (i = 0, ch = start ; ch <= end ; i++, ch++) \ + { \ + buf[i * 2] = ch; \ + buf[(i * 2) + 1] = ' '; \ + } \ + ind = 0; \ + for (i = 0, ch = start ; ch <= end ; ch++) \ + { \ + fail_if((eina_unicode_utf8_get_next(buf, 
&ind) != (0xDC00 | ch)) || \ + (ind != ++i)); \ + fail_if((eina_unicode_utf8_get_next(buf, &ind) != 0x20) || \ + (ind != ++i)); \ + } \ + } \ + while (0) + /* all first bytes of 2-byte sequences separated by spaces. */ + _FIRST_SEQUENCES(0xC0, 0xDF); + /* all first bytes of 3-byte sequences separated by spaces. */ + _FIRST_SEQUENCES(0xE0, 0xEF); + /* all first bytes of 4-byte sequences separated by spaces. */ + _FIRST_SEQUENCES(0xF0, 0xF7); + /* all first bytes of 5-byte sequences separated by spaces. */ + _FIRST_SEQUENCES(0xF8, 0xFB); + /* all first bytes of 6-byte sequences separated by spaces. */ + _FIRST_SEQUENCES(0xFC, 0xFD); + + /* Incomplete sequences: first means the first utf8 char, len means + * the correct length */ +#define _INCOMPLETE_SEQUENCES(first, conti, len) \ + do \ + { \ + int i, j; \ + char *buf = alloca(len + 1); \ + i = 0; \ + buf[i++] = first; \ + for ( ; i < len ; i++) \ + { \ + Eina_Unicode val; \ + for (j = 1 ; j < i ; j++) \ + { \ + buf[j] = conti; \ + } \ + buf[j] = 0; \ + ind = 0; \ + fail_if( \ + (eina_unicode_utf8_get_next(buf, &ind) != (0xDC00 | first))); \ + while ((val = eina_unicode_utf8_get_next(buf, &ind))) \ + { \ + fail_if(val != (0xDC00 | conti)); \ + } \ + fail_if(ind != i); \ + } \ + } \ + while (0) + + /* Sequences with missing continuation */ + _INCOMPLETE_SEQUENCES(0xC0, 0x81, 2); + _INCOMPLETE_SEQUENCES(0xDF, 0xBF, 2); + _INCOMPLETE_SEQUENCES(0xE0, 0x81, 3); + _INCOMPLETE_SEQUENCES(0xEF, 0xBF, 3); + _INCOMPLETE_SEQUENCES(0xF0, 0x81, 4); + _INCOMPLETE_SEQUENCES(0xF7, 0xBF, 4); + _INCOMPLETE_SEQUENCES(0xF8, 0x81, 5); + _INCOMPLETE_SEQUENCES(0xFB, 0xBF, 5); + _INCOMPLETE_SEQUENCES(0xFC, 0x81, 6); + _INCOMPLETE_SEQUENCES(0xFD, 0xBF, 6); + + /* Impossible bytes */ + ind = 0; + fail_if((eina_unicode_utf8_get_next("\xFE", &ind) != 0xDCFE) || + (ind != 1)); + ind = 0; + fail_if((eina_unicode_utf8_get_next("\xFF", &ind) != 0xDCFF) || + (ind != 1)); + + /* Overlong sequences */ + ind = 0; + 
fail_if((eina_unicode_utf8_get_next("\xC0\xAF", &ind) != 0xDCC0) || + (ind != 1)); + ind = 0; + fail_if((eina_unicode_utf8_get_next("\xE0\x80\xAF", &ind) != 0xDCE0) || + (ind != 1)); + ind = 0; + fail_if((eina_unicode_utf8_get_next("\xF0\x80\x80\xAF", &ind) != 0xDCF0) || + (ind != 1)); + ind = 0; + fail_if((eina_unicode_utf8_get_next("\xF8\x80\x80\x80\xAF", &ind) != 0xDCF8) || + (ind != 1)); + ind = 0; + fail_if((eina_unicode_utf8_get_next("\xFC\x80\x80\x80\x80\xAF", &ind) != 0xDCFC) || + (ind != 1)); + + /* Maximum overlong sequences */ + ind = 0; + fail_if((eina_unicode_utf8_get_next("\xC1\xBF", &ind) != 0xDCC1) || + (ind != 1)); + ind = 0; + fail_if((eina_unicode_utf8_get_next("\xE0\x9F\xBF", &ind) != 0xDCE0) || + (ind != 1)); + ind = 0; + fail_if((eina_unicode_utf8_get_next("\xF0\x8F\xBF\xBF", &ind) != 0xDCF0) || + (ind != 1)); + ind = 0; + fail_if((eina_unicode_utf8_get_next("\xF8\x87\xBF\xBF\xBF", &ind) != 0xDCF8) || + (ind != 1)); + ind = 0; + fail_if((eina_unicode_utf8_get_next("\xFC\x83\xBF\xBF\xBF\xBF", &ind) != 0xDCFC) || + (ind != 1)); /* Add some more error cases here */ /* Just to cover prev/len. General utf-8 parsing was covered above */ -- 2.7.4