Imported Upstream version 1.2.7

[platform/upstream/harfbuzz.git] / src / hb-utf-private.hh
diff --git a/src/hb-utf-private.hh b/src/hb-utf-private.hh

index b9a6519..74cf5d6 100644 (file)
--- a/src/hb-utf-private.hh
+++ b/src/hb-utf-private.hh
@@ -1,5 +1,5 @@
  /*
- * Copyright © 2011,2012  Google, Inc.
+ * Copyright © 2011,2012,2014  Google, Inc.
   *
   *  This is part of HarfBuzz, a text shaping library.
   *
@@ -30,175 +30,253 @@
  #include "hb-private.hh"
  
  
-/* UTF-8 */
+struct hb_utf8_t
+{
+  typedef uint8_t codepoint_t;
+
+  static inline const uint8_t *
+  next (const uint8_t *text,
+       const uint8_t *end,
+       hb_codepoint_t *unicode,
+       hb_codepoint_t replacement)
+  {
+    /* Written to only accept well-formed sequences.
+     * Based on ideas from ICU's U8_NEXT.
+     * Generates one "replacement" for each ill-formed byte. */
  
-#define HB_UTF8_COMPUTE(Char, Mask, Len) \
-  if (Char < 128) { Len = 1; Mask = 0x7f; } \
-  else if ((Char & 0xe0) == 0xc0) { Len = 2; Mask = 0x1f; } \
-  else if ((Char & 0xf0) == 0xe0) { Len = 3; Mask = 0x0f; } \
-  else if ((Char & 0xf8) == 0xf0) { Len = 4; Mask = 0x07; } \
-  else Len = 0;
+    hb_codepoint_t c = *text++;
  
-static inline const uint8_t *
-hb_utf_next (const uint8_t *text,
-            const uint8_t *end,
-            hb_codepoint_t *unicode)
-{
-  hb_codepoint_t c = *text, mask;
-  unsigned int len;
-
-  /* TODO check for overlong sequences? */
-
-  HB_UTF8_COMPUTE (c, mask, len);
-  if (unlikely (!len || (unsigned int) (end - text) < len)) {
-    *unicode = -1;
-    return text + 1;
-  } else {
-    hb_codepoint_t result;
-    unsigned int i;
-    result = c & mask;
-    for (i = 1; i < len; i++)
+    if (c > 0x7Fu)
+    {
+      if (hb_in_range (c, 0xC2u, 0xDFu)) /* Two-byte */
+      {
+       unsigned int t1;
+       if (likely (text < end &&
+                   (t1 = text[0] - 0x80u) <= 0x3Fu))
+       {
+         c = ((c&0x1Fu)<<6) | t1;
+         text++;
+       }
+       else
+         goto error;
+      }
+      else if (hb_in_range (c, 0xE0u, 0xEFu)) /* Three-byte */
+      {
+       unsigned int t1, t2;
+       if (likely (1 < end - text &&
+                   (t1 = text[0] - 0x80u) <= 0x3Fu &&
+                   (t2 = text[1] - 0x80u) <= 0x3Fu))
+       {
+         c = ((c&0xFu)<<12) | (t1<<6) | t2;
+         if (unlikely (c < 0x0800u || hb_in_range (c, 0xD800u, 0xDFFFu)))
+           goto error;
+         text += 2;
+       }
+       else
+         goto error;
+      }
+      else if (hb_in_range (c, 0xF0u, 0xF4u)) /* Four-byte */
        {
-       if (unlikely ((text[i] & 0xc0) != 0x80))
-         {
-           *unicode = -1;
-           return text + 1;
-         }
-       result <<= 6;
-       result |= (text[i] & 0x3f);
+       unsigned int t1, t2, t3;
+       if (likely (2 < end - text &&
+                   (t1 = text[0] - 0x80u) <= 0x3Fu &&
+                   (t2 = text[1] - 0x80u) <= 0x3Fu &&
+                   (t3 = text[2] - 0x80u) <= 0x3Fu))
+       {
+         c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3;
+         if (unlikely (!hb_in_range (c, 0x10000u, 0x10FFFFu)))
+           goto error;
+         text += 3;
+       }
+       else
+         goto error;
        }
-    *unicode = result;
-    return text + len;
+      else
+       goto error;
+    }
+
+    *unicode = c;
+    return text;
+
+  error:
+    *unicode = replacement;
+    return text;
+  }
+
+  static inline const uint8_t *
+  prev (const uint8_t *text,
+       const uint8_t *start,
+       hb_codepoint_t *unicode,
+       hb_codepoint_t replacement)
+  {
+    const uint8_t *end = text--;
+    while (start < text && (*text & 0xc0) == 0x80 && end - text < 4)
+      text--;
+
+    if (likely (next (text, end, unicode, replacement) == end))
+      return text;
+
+    *unicode = replacement;
+    return end - 1;
    }
-}
  
-static inline const uint8_t *
-hb_utf_prev (const uint8_t *text,
-            const uint8_t *start,
-            hb_codepoint_t *unicode)
+  static inline unsigned int
+  strlen (const uint8_t *text)
+  {
+    return ::strlen ((const char *) text);
+  }
+};
+
+
+struct hb_utf16_t
  {
-  const uint8_t *end = text--;
-  while (start < text && (*text & 0xc0) == 0x80 && end - text < 4)
-    text--;
+  typedef uint16_t codepoint_t;
  
-  hb_codepoint_t c = *text, mask;
-  unsigned int len;
+  static inline const uint16_t *
+  next (const uint16_t *text,
+       const uint16_t *end,
+       hb_codepoint_t *unicode,
+       hb_codepoint_t replacement)
+  {
+    hb_codepoint_t c = *text++;
  
-  /* TODO check for overlong sequences? */
+    if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu)))
+    {
+      *unicode = c;
+      return text;
+    }
  
-  HB_UTF8_COMPUTE (c, mask, len);
-  if (unlikely (!len || (unsigned int) (end - text) != len)) {
-    *unicode = -1;
-    return end - 1;
-  } else {
-    hb_codepoint_t result;
-    unsigned int i;
-    result = c & mask;
-    for (i = 1; i < len; i++)
+    if (likely (c <= 0xDBFFu && text < end))
+    {
+      /* High-surrogate in c */
+      hb_codepoint_t l = *text;
+      if (likely (hb_in_range (l, 0xDC00u, 0xDFFFu)))
        {
-       result <<= 6;
-       result |= (text[i] & 0x3f);
+       /* Low-surrogate in l */
+       *unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u);
+        text++;
+        return text;
        }
-    *unicode = result;
+    }
+
+    /* Lonely / out-of-order surrogate. */
+    *unicode = replacement;
      return text;
    }
-}
  
+  static inline const uint16_t *
+  prev (const uint16_t *text,
+       const uint16_t *start,
+       hb_codepoint_t *unicode,
+       hb_codepoint_t replacement)
+  {
+    hb_codepoint_t c = *--text;
  
-static inline unsigned int
-hb_utf_strlen (const uint8_t *text)
-{
-  return strlen ((const char *) text);
-}
+    if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu)))
+    {
+      *unicode = c;
+      return text;
+    }
  
+    if (likely (c >= 0xDC00u && start < text))
+    {
+      /* Low-surrogate in c */
+      hb_codepoint_t h = text[-1];
+      if (likely (hb_in_range (h, 0xD800u, 0xDBFFu)))
+      {
+        /* High-surrogate in h */
+        *unicode = (h << 10) + c - ((0xD800u << 10) - 0x10000u + 0xDC00u);
+        text--;
+        return text;
+      }
+    }
  
-/* UTF-16 */
+    /* Lonely / out-of-order surrogate. */
+    *unicode = replacement;
+    return text;
+  }
  
-static inline const uint16_t *
-hb_utf_next (const uint16_t *text,
-            const uint16_t *end,
-            hb_codepoint_t *unicode)
-{
-  hb_codepoint_t c = *text++;
  
-  if (unlikely (hb_in_range<hb_codepoint_t> (c, 0xd800, 0xdbff)))
+  static inline unsigned int
+  strlen (const uint16_t *text)
    {
-    /* high surrogate */
-    hb_codepoint_t l;
-    if (text < end && ((l = *text), likely (hb_in_range<hb_codepoint_t> (l, 0xdc00, 0xdfff))))
-    {
-      /* low surrogate */
-      *unicode = (c << 10) + l - ((0xd800 << 10) - 0x10000 + 0xdc00);
-       text++;
-    } else
-      *unicode = -1;
-  } else
-    *unicode = c;
+    unsigned int l = 0;
+    while (*text++) l++;
+    return l;
+  }
+};
  
-  return text;
-}
  
-static inline const uint16_t *
-hb_utf_prev (const uint16_t *text,
-            const uint16_t *start,
-            hb_codepoint_t *unicode)
+template <bool validate=true>
+struct hb_utf32_t
  {
-  hb_codepoint_t c = *--text;
+  typedef uint32_t codepoint_t;
  
-  if (unlikely (hb_in_range<hb_codepoint_t> (c, 0xdc00, 0xdfff)))
+  static inline const uint32_t *
+  next (const uint32_t *text,
+       const uint32_t *end HB_UNUSED,
+       hb_codepoint_t *unicode,
+       hb_codepoint_t replacement)
    {
-    /* low surrogate */
-    hb_codepoint_t h;
-    if (start < text && ((h = *(text - 1)), likely (hb_in_range<hb_codepoint_t> (h, 0xd800, 0xdbff))))
-    {
-      /* high surrogate */
-      *unicode = (h << 10) + c - ((0xd800 << 10) - 0x10000 + 0xdc00);
-       text--;
-    } else
-      *unicode = -1;
-  } else
-    *unicode = c;
-
-  return text;
-}
-
+    hb_codepoint_t c = *unicode = *text++;
+    if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu)))
+      *unicode = replacement;
+    return text;
+  }
  
-static inline unsigned int
-hb_utf_strlen (const uint16_t *text)
-{
-  unsigned int l = 0;
-  while (*text++) l++;
-  return l;
-}
+  static inline const uint32_t *
+  prev (const uint32_t *text,
+       const uint32_t *start HB_UNUSED,
+       hb_codepoint_t *unicode,
+       hb_codepoint_t replacement)
+  {
+    hb_codepoint_t c = *unicode = *--text;
+    if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu)))
+      *unicode = replacement;
+    return text;
+  }
  
+  static inline unsigned int
+  strlen (const uint32_t *text)
+  {
+    unsigned int l = 0;
+    while (*text++) l++;
+    return l;
+  }
+};
  
-/* UTF-32 */
  
-static inline const uint32_t *
-hb_utf_next (const uint32_t *text,
-            const uint32_t *end HB_UNUSED,
-            hb_codepoint_t *unicode)
-{
-  *unicode = *text++;
-  return text;
-}
-
-static inline const uint32_t *
-hb_utf_prev (const uint32_t *text,
-            const uint32_t *start HB_UNUSED,
-            hb_codepoint_t *unicode)
+struct hb_latin1_t
  {
-  *unicode = *--text;
-  return text;
-}
+  typedef uint8_t codepoint_t;
  
-static inline unsigned int
-hb_utf_strlen (const uint32_t *text)
-{
-  unsigned int l = 0;
-  while (*text++) l++;
-  return l;
-}
+  static inline const uint8_t *
+  next (const uint8_t *text,
+       const uint8_t *end HB_UNUSED,
+       hb_codepoint_t *unicode,
+       hb_codepoint_t replacement HB_UNUSED)
+  {
+    *unicode = *text++;
+    return text;
+  }
+
+  static inline const uint8_t *
+  prev (const uint8_t *text,
+       const uint8_t *start HB_UNUSED,
+       hb_codepoint_t *unicode,
+       hb_codepoint_t replacement)
+  {
+    *unicode = *--text;
+    return text;
+  }
  
+  static inline unsigned int
+  strlen (const uint8_t *text)
+  {
+    unsigned int l = 0;
+    while (*text++) l++;
+    return l;
+  }
+};
  
  #endif /* HB_UTF_PRIVATE_HH */