- * Gets the next Unicode character in a UTF-8 sequence. The index will
- * be advanced to the next complete character, unless the end of string
- * is reached in the middle of a UTF-8 sequence.
- *
- * @param[in] s input UTF-8 string
- * @param[in] len length of the string in bytes
- * @param[in,out] ip pointer to the index
- * @return the Unicode character beginning at the index; or
- * #EOS if end of input is encountered
- */
-utf32_t lb_get_next_char_utf8(
- const utf8_t *s,
- size_t len,
- size_t *ip)
-{
- utf8_t ch;
- utf32_t res;
-
- assert(*ip <= len);
- if (*ip == len)
- return EOS;
- ch = s[*ip];
-
- if (ch < 0xC2 || ch > 0xF4)
- { /* One-byte sequence, tail (should not occur), or invalid */
- *ip += 1;
- return ch;
- }
- else if (ch < 0xE0)
- { /* Two-byte sequence */
- if (*ip + 2 > len)
- return EOS;
- res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
- *ip += 2;
- return res;
- }
- else if (ch < 0xF0)
- { /* Three-byte sequence */
- if (*ip + 3 > len)
- return EOS;
- res = ((ch & 0x0F) << 12) +
- ((s[*ip + 1] & 0x3F) << 6) +
- ((s[*ip + 2] & 0x3F));
- *ip += 3;
- return res;
- }
- else
- { /* Four-byte sequence */
- if (*ip + 4 > len)
- return EOS;
- res = ((ch & 0x07) << 18) +
- ((s[*ip + 1] & 0x3F) << 12) +
- ((s[*ip + 2] & 0x3F) << 6) +
- ((s[*ip + 3] & 0x3F));
- *ip += 4;
- return res;
- }
-}
-
-/**
- * Gets the next Unicode character in a UTF-16 sequence. The index will
- * be advanced to the next complete character, unless the end of string
- * is reached in the middle of a UTF-16 surrogate pair.
- *
- * @param[in] s input UTF-16 string
- * @param[in] len length of the string in words
- * @param[in,out] ip pointer to the index
- * @return the Unicode character beginning at the index; or
- * #EOS if end of input is encountered
- */
-utf32_t lb_get_next_char_utf16(
- const utf16_t *s,
- size_t len,
- size_t *ip)
-{
- utf16_t ch;
-
- assert(*ip <= len);
- if (*ip == len)
- return EOS;
- ch = s[(*ip)++];
-
- if (ch < 0xD800 || ch > 0xDBFF)
- { /* If the character is not a high surrogate */
- return ch;
- }
- if (*ip == len)
- { /* If the input ends here (an error) */
- --(*ip);
- return EOS;
- }
- if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
- { /* If the next character is not the low surrogate (an error) */
- return ch;
- }
- /* Return the constructed character and advance the index again */
- return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
-}
-
-/**
- * Gets the next Unicode character in a UTF-32 sequence. The index will
- * be advanced to the next character.
- *
- * @param[in] s input UTF-32 string
- * @param[in] len length of the string in dwords
- * @param[in,out] ip pointer to the index
- * @return the Unicode character beginning at the index; or
- * #EOS if end of input is encountered
- */
-utf32_t lb_get_next_char_utf32(
- const utf32_t *s,
- size_t len,
- size_t *ip)
-{
- assert(*ip <= len);
- if (*ip == len)
- return EOS;
- return s[(*ip)++];
-}
-
-/**