Eina unicode: Added utf8 handling functions and also added Eina_Unicode<->UTF-8 conve...

author tasn <tasn@7cbeb6ba-43b4-40fd-8cce-4c39aea84d33>

Wed, 16 Feb 2011 15:43:25 +0000 (15:43 +0000)

committer tasn <tasn@7cbeb6ba-43b4-40fd-8cce-4c39aea84d33>

Wed, 16 Feb 2011 15:43:25 +0000 (15:43 +0000)
author tasn <tasn@7cbeb6ba-43b4-40fd-8cce-4c39aea84d33>
Wed, 16 Feb 2011 15:43:25 +0000 (15:43 +0000)
committer tasn <tasn@7cbeb6ba-43b4-40fd-8cce-4c39aea84d33>
Wed, 16 Feb 2011 15:43:25 +0000 (15:43 +0000)
diff --git a/ChangeLog b/ChangeLog

index 7705ad1..fdd1d0f 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -13,3 +13,8 @@
  2011-02-16  Mike Blumenkrantz
  
          * Added EINA_INLIST_FOREACH_SAFE
+
+2011-02-16  Tom Hacohen
+
+       * Added eina_unicode_utf8* functions for utf8 string handling
+       and conversions to and from Eina_Unicode
diff --git a/src/include/eina_unicode.h b/src/include/eina_unicode.h

index 152177b..a20c645 100644 (file)
--- a/src/include/eina_unicode.h
+++ b/src/include/eina_unicode.h
@@ -58,6 +58,16 @@ EAPI Eina_Unicode *eina_unicode_strncpy(Eina_Unicode *dest, const Eina_Unicode *
  
  EAPI Eina_Unicode *eina_unicode_escape(const Eina_Unicode *str) EINA_ARG_NONNULL(1) EINA_MALLOC EINA_WARN_UNUSED_RESULT;
  
+/* UTF-8 Handling */
+
+/* Decode the code point at offset *iindex of buf and move *iindex past it. */
+EAPI Eina_Unicode eina_unicode_utf8_get_next(const char *buf, int *iindex) EINA_ARG_NONNULL(1, 2);
+/* Decode the code point at offset *iindex of buf and move *iindex back to
+ * the start of the previous code point. */
+EAPI Eina_Unicode eina_unicode_utf8_get_prev(const char *buf, int *iindex) EINA_ARG_NONNULL(1, 2);
+/* Count the code points (not bytes) in the NUL-terminated UTF-8 string buf. */
+EAPI int eina_unicode_utf8_get_len(const char *buf) EINA_ARG_NONNULL(1);
+
+/* Convert a UTF-8 string to a newly allocated Eina_Unicode string; when _len
+ * is not NULL it receives the number of code points. */
+EAPI Eina_Unicode *eina_unicode_utf8_to_unicode(const char *utf, int *_len) EINA_WARN_UNUSED_RESULT EINA_ARG_NONNULL(1) EINA_MALLOC;
+
+/* Convert an Eina_Unicode string to a newly allocated UTF-8 string; when _len
+ * is not NULL it receives the byte length of the result. */
+EAPI char * eina_unicode_unicode_to_utf8(const Eina_Unicode *uni, int *_len) EINA_WARN_UNUSED_RESULT EINA_ARG_NONNULL(1) EINA_MALLOC;
+
  /**
   * @}
   */
diff --git a/src/lib/eina_unicode.c b/src/lib/eina_unicode.c

index 6c8f7e9..2b3dd18 100644 (file)
--- a/src/lib/eina_unicode.c
+++ b/src/lib/eina_unicode.c
@@ -185,3 +185,310 @@ eina_unicode_escape(const Eina_Unicode *str)
     return s2;
  }
  
+/* UTF-8 Handling */
+
+/* Number of output bytes reserved per code point when encoding; the encoder
+ * below emits sequences of at most 6 bytes. */
+#define EINA_UNICODE_UTF8_BYTES_PER_CHAR 6
+/* The replacement range that will be used for bad utf8 chars. */
+#define ERROR_REPLACEMENT_BASE  0xDC80
+#define ERROR_REPLACEMENT_END   0xDCFF
+/* Byte values rejected outright: 192 (0xC0), 193 (0xC1) and 245..255.
+ * The argument is parenthesized so that expressions such as (a | b) are
+ * tested as a whole; note that it is still evaluated more than once. */
+#define IS_INVALID_BYTE(x)      (((x) == 192) || ((x) == 193) || ((x) >= 245))
+/* True for bytes of the form 10xxxxxx. */
+#define IS_CONTINUATION_BYTE(x) (((x) & 0xC0) == 0x80)
+
+/**
+ * Decodes the UTF8 sequence that starts at offset *@iindex of @buf and
+ * returns its code point. On success *@iindex is moved to the first byte
+ * after the decoded sequence. When the byte at *@iindex is the NULL
+ * terminator, 0 is returned and *@iindex is left untouched.
+ * On error: *@iindex is advanced by a single byte and a codepoint between
+ *   DC80 to DCFF is returned, where the low 8 bits are the byte's value.
+ *
+ * @param buf the string
+ * @param iindex the index to look at and return by.
+ * @return the codepoint found.
+ * @since 1.1.0
+ */
+EAPI Eina_Unicode
+eina_unicode_utf8_get_next(const char *buf, int *iindex)
+{
+   /* Note: we don't currently handle overlong forms and some other
+    * error cases. */
+   const int start = *iindex;
+   int pos = start;
+   int trailing; /* continuation bytes announced by the lead byte */
+   Eina_Unicode cp;
+   unsigned char byte;
+
+   byte = buf[pos++];
+
+   /* if this char is the null terminator, exit */
+   if (byte == 0) return 0;
+
+   if ((byte & 0x80) == 0)
+     { // 1 byte (7bit) - 0xxxxxxx
+        *iindex = pos;
+        return byte;
+     }
+
+   /* Classify the lead byte: keep its payload bits and remember how many
+    * 10xxxxxx bytes have to follow. */
+   if ((byte & 0xe0) == 0xc0)
+     { // 2 byte (11bit) - 110xxxxx 10xxxxxx
+        cp = byte & 0x1f;
+        trailing = 1;
+     }
+   else if ((byte & 0xf0) == 0xe0)
+     { // 3 byte (16bit) - 1110xxxx 10xxxxxx 10xxxxxx
+        cp = byte & 0x0f;
+        trailing = 2;
+     }
+   else if ((byte & 0xf8) == 0xf0)
+     { // 4 byte (21bit) - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        cp = byte & 0x07;
+        trailing = 3;
+     }
+   else if ((byte & 0xfc) == 0xf8)
+     { // 5 byte (26bit) - 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+        cp = byte & 0x03;
+        trailing = 4;
+     }
+   else if ((byte & 0xfe) == 0xfc)
+     { // 6 byte (31bit) - 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+        cp = byte & 0x01;
+        trailing = 5;
+     }
+   else
+     goto error;
+
+   /* Fold in the continuation bytes, 6 payload bits at a time. Reading
+    * stops at the first offending byte, so a truncated sequence never
+    * reads beyond the NULL terminator. */
+   while (trailing-- > 0)
+     {
+        byte = buf[pos++];
+        if ((byte == 0) || IS_INVALID_BYTE(byte) ||
+            !IS_CONTINUATION_BYTE(byte)) goto error;
+        cp = (cp << 6) | (byte & 0x3f);
+     }
+
+   /* A multi-byte sequence that decodes to 0 is treated as an error. */
+   if (!cp) goto error;
+
+   *iindex = pos;
+   return cp;
+
+/* Gets here where there was an error and we want to replace the char
+ * we just use the invalid unicode codepoints 8 lower bits represent
+ * the original char */
+error:
+   byte = buf[start];
+   (*iindex)++;
+   return ERROR_REPLACEMENT_BASE | byte;
+}
+
+/**
+ * Reads UTF8 bytes from @buf, starting at *@iindex and returns
+ * the decoded code point at iindex offset, and moves iindex
+ * to the previous code point. iindex is always moved, as long
+ * as it's not past the start of the string.
+ * On error: return a codepoint between DC80 to DCFF where the low 8 bits
+ *   are the byte's value.
+ *
+ * Note: the byte at *@iindex is read before the offset is checked, so the
+ * caller has to pass a non-negative offset.
+ *
+ * @param buf the string
+ * @param iindex the index to look at and return by.
+ * @return the codepoint found.
+ * @since 1.1.0
+ */
+EAPI Eina_Unicode
+eina_unicode_utf8_get_prev(const char *buf, int *iindex)
+{
+   /* Kept as Eina_Unicode (the return type) rather than int so that the
+    * decoded value is not passed through a signed conversion. */
+   Eina_Unicode r;
+   int ind = *iindex;
+   /* First obtain the codepoint at iindex */
+   r = eina_unicode_utf8_get_next(buf, &ind);
+
+   /* although when ind == 0 there's no previous char, we still want to get
+    * the current char */
+   if (*iindex <= 0)
+     return r;
+
+   /* Next move iindex to the previous codepoint: step back one byte, then
+    * keep stepping back over 10xxxxxx continuation bytes until a lead byte
+    * or the start of the string is reached. */
+   ind = *iindex;
+   ind--;
+   while ((ind > 0) && IS_CONTINUATION_BYTE(buf[ind]))
+     ind--;
+
+   *iindex = ind;
+   return r;
+}
+
+/**
+ * Counts the unicode characters in a utf-8 string, i.e. how many
+ * Eina_Unicode elements (terminator excluded) are needed to hold
+ * the string once converted.
+ *
+ * @param buf the string
+ * @return the number of unicode characters (not bytes) in the string
+ * @since 1.1.0
+ */
+EAPI int
+eina_unicode_utf8_get_len(const char *buf)
+{
+   int pos = 0;
+   int count;
+
+   /* Every non-zero result of get_next is one character; it only yields 0
+    * once the null terminator is reached. */
+   for (count = 0; eina_unicode_utf8_get_next(buf, &pos) != 0; count++)
+     ;
+
+   return count;
+}
+
+/**
+ * Builds a newly allocated, zero-terminated Eina_Unicode string out of a
+ * utf-8 string. The caller owns the returned buffer.
+ *
+ * @param utf the string in utf-8
+ * @param _len if not NULL, receives the length (in characters) of the
+ *             converted string; it is written even when allocation fails.
+ * @return the newly allocated Eina_Unicode string, or NULL if the
+ *         allocation failed.
+ * @since 1.1.0
+ */
+EAPI Eina_Unicode *
+eina_unicode_utf8_to_unicode(const char *utf, int *_len)
+{
+   /* FIXME: Should optimize! */
+   Eina_Unicode *out, *dst, *end;
+   int count;
+   int pos = 0;
+
+   count = eina_unicode_utf8_get_len(utf);
+   if (_len)
+      *_len = count;
+
+   /* One extra, zeroed element acts as the terminator. */
+   out = (Eina_Unicode *) calloc(sizeof(Eina_Unicode), (count + 1));
+   if (!out) return NULL;
+
+   end = out + count;
+   for (dst = out ; dst < end ; dst++)
+     *dst = eina_unicode_utf8_get_next(utf, &pos);
+
+   return out;
+}
+
+/**
+ * Converts an Eina_Unicode string to a newly allocated utf-8 string.
+ * The caller owns the returned buffer.
+ *
+ * Codepoints in the DC80..DCFF replacement range are written back as the
+ * single raw byte stored in their low 8 bits. Values above 0x7FFFFFFF
+ * cannot be encoded and are left out of the output.
+ *
+ * @param uni the Eina_Unicode string
+ * @param _len if not NULL, receives the byte length of the returned utf8
+ *             string (0 when the allocation fails).
+ * @return the newly allocated utf-8 string, or NULL if the allocation
+ *         failed.
+ * @since 1.1.0
+ */
+EAPI char *
+eina_unicode_unicode_to_utf8(const Eina_Unicode *uni, int *_len)
+{
+   char *buf, *shrunk;
+   const Eina_Unicode *uind;
+   char *ind;
+   int ulen, len;
+
+   ulen = eina_unicode_strlen(uni);
+   /* Worst case: every character needs the longest encoding. */
+   buf = (char *) calloc(ulen + 1, EINA_UNICODE_UTF8_BYTES_PER_CHAR);
+   if (!buf)
+     {
+        if (_len)
+           *_len = 0;
+        return NULL;
+     }
+
+   len = 0;
+   for (uind = uni, ind = buf ; *uind ; uind++)
+     {
+        if (*uind <= 0x7F) /* 1 byte char */
+          {
+             *ind++ = *uind;
+             len += 1;
+          }
+        else if (*uind <= 0x7FF) /* 2 byte char */
+          {
+             *ind++ = 0xC0 | (unsigned char) (*uind >> 6);
+             *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
+             len += 2;
+          }
+        else if (*uind <= 0xFFFF) /* 3 byte char */
+          {
+             /* If it's a special replacement codepoint */
+             if (*uind >= ERROR_REPLACEMENT_BASE &&
+                 *uind <= ERROR_REPLACEMENT_END)
+               {
+                  *ind++ = *uind & 0xFF;
+                  len += 1;
+               }
+             else
+               {
+                  *ind++ = 0xE0 | (unsigned char) (*uind >> 12);
+                  *ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
+                  *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
+                  len += 3;
+               }
+          }
+        else if (*uind <= 0x1FFFFF) /* 4 byte char */
+          {
+             *ind++ = 0xF0 | (unsigned char) ((*uind >> 18) & 0x07);
+             *ind++ = 0x80 | (unsigned char) ((*uind >> 12) & 0x3F);
+             *ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
+             *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
+             len += 4;
+          }
+        else if (*uind <= 0x3FFFFFF) /* 5 byte char */
+          {
+             *ind++ = 0xF8 | (unsigned char) ((*uind >> 24) & 0x03);
+             *ind++ = 0x80 | (unsigned char) ((*uind >> 18) & 0x3F);
+             *ind++ = 0x80 | (unsigned char) ((*uind >> 12) & 0x3F);
+             *ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
+             *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
+             len += 5;
+          }
+        else if (*uind <= 0x7FFFFFFF) /* 6 byte char */
+          {
+             *ind++ = 0xFC | (unsigned char) ((*uind >> 30) & 0x01);
+             *ind++ = 0x80 | (unsigned char) ((*uind >> 24) & 0x3F);
+             *ind++ = 0x80 | (unsigned char) ((*uind >> 18) & 0x3F);
+             *ind++ = 0x80 | (unsigned char) ((*uind >> 12) & 0x3F);
+             *ind++ = 0x80 | (unsigned char) ((*uind >> 6) & 0x3F);
+             *ind++ = 0x80 | (unsigned char) (*uind & 0x3F);
+             len += 6;
+          }
+        else /* error */
+          {
+             /* Not representable in at most 6 bytes: nothing is written
+              * for this character. */
+          }
+     }
+
+   /* Trim the worst-case buffer down to what was actually used. If the
+    * shrink fails the original block is still valid and large enough
+    * (len < (ulen + 1) * EINA_UNICODE_UTF8_BYTES_PER_CHAR), so keep it
+    * instead of losing the pointer. */
+   shrunk = realloc(buf, len + 1);
+   if (shrunk)
+      buf = shrunk;
+   buf[len] = '\0';
+   if (_len)
+      *_len = len;
+   return buf;
+}
+
+
+
author	tasn <tasn@7cbeb6ba-43b4-40fd-8cce-4c39aea84d33>
	Wed, 16 Feb 2011 15:43:25 +0000 (15:43 +0000)
committer	tasn <tasn@7cbeb6ba-43b4-40fd-8cce-4c39aea84d33>
	Wed, 16 Feb 2011 15:43:25 +0000 (15:43 +0000)
ChangeLog		patch \| blob \| history
src/include/eina_unicode.h		patch \| blob \| history
src/lib/eina_unicode.c		patch \| blob \| history