1 /* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
4 * Break processing in a Unicode sequence. Designed to be used in a
5 * generic text renderer.
7 * Copyright (C) 2015 Wu Yongwei <wuyongwei at gmail dot com>
9 * This software is provided 'as-is', without any express or implied
10 * warranty. In no event will the author be held liable for any damages
11 * arising from the use of this software.
13 * Permission is granted to anyone to use this software for any purpose,
14 * including commercial applications, and to alter it and redistribute
15 * it freely, subject to the following restrictions:
17 * 1. The origin of this software must not be misrepresented; you must
18 * not claim that you wrote the original software. If you use this
19 * software in a product, an acknowledgement in the product
20 * documentation would be appreciated but is not required.
21 * 2. Altered source versions must be plainly marked as such, and must
22 * not be misrepresented as being the original software.
23 * 3. This notice may not be removed or altered from any source
30 * Definition of utility functions used by the libunibreak library.
32 * @version 1.0, 2015/04/18
38 #include "unibreakdef.h"
41 * Gets the next Unicode character in a UTF-8 sequence. The index will
42 * be advanced to the next complete character, unless the end of string
43 * is reached in the middle of a UTF-8 sequence.
45 * @param[in] s input UTF-8 string
46 * @param[in] len length of the string in bytes
47 * @param[in,out] ip pointer to the index
48 * @return the Unicode character beginning at the index; or
49 * #EOS if end of input is encountered
51 utf32_t ub_get_next_char_utf8(
/* Dispatch on ch, which the expressions below use as the lead byte of the
 * sequence.  In UTF-8 (RFC 3629) the values 0x00-0x7F are single-byte
 * characters, 0x80-0xBF are continuation ("tail") bytes, 0xC0/0xC1 could
 * only begin overlong forms, and 0xF5-0xFF would encode beyond U+10FFFF,
 * so every value outside 0xC2..0xF4 is routed to the first branch. */
64 if (ch < 0xC2 || ch > 0xF4)
65 { /* One-byte sequence, tail (should not occur), or invalid */
70 { /* Two-byte sequence */
/* 5 payload bits from the lead byte, then 6 from the byte that follows.
 * The 0x3F mask drops the top two bits of s[*ip + 1]; this expression
 * does not test that they carry the 10xxxxxx continuation marker. */
73 res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
78 { /* Three-byte sequence */
/* 4 payload bits from the lead byte + 6 + 6 from the two bytes after it,
 * most significant group first. */
81 res = ((ch & 0x0F) << 12) +
82 ((s[*ip + 1] & 0x3F) << 6) +
83 ((s[*ip + 2] & 0x3F));
88 { /* Four-byte sequence */
/* 3 payload bits from the lead byte + 6 + 6 + 6 from the three bytes
 * after it, giving a value of up to 21 bits. */
91 res = ((ch & 0x07) << 18) +
92 ((s[*ip + 1] & 0x3F) << 12) +
93 ((s[*ip + 2] & 0x3F) << 6) +
94 ((s[*ip + 3] & 0x3F));
101 * Gets the next Unicode character in a UTF-16 sequence. The index will
102 * be advanced to the next complete character, unless the end of string
103 * is reached in the middle of a UTF-16 surrogate pair.
105 * @param[in] s input UTF-16 string
106 * @param[in] len length of the string in words
107 * @param[in,out] ip pointer to the index
108 * @return the Unicode character beginning at the index; or
109 * #EOS if end of input is encountered
111 utf32_t ub_get_next_char_utf16(
/* 0xD800..0xDBFF is the high (lead) surrogate range; any other value of
 * ch takes the first branch below. */
123 if (ch < 0xD800 || ch > 0xDBFF)
124 { /* If the character is not a high surrogate */
128 { /* If the input ends here (an error) */
/* A high surrogate is only valid when the next unit lies in the low
 * (trail) surrogate range 0xDC00..0xDFFF. */
132 if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
133 { /* If the next character is not the low surrogate (an error) */
136 /* Return the constructed character and advance the index again */
/* Code point = 0x10000 + (low 10 bits of ch << 10) + (low 10 bits of the
 * low surrogate).  ch is cast to utf32_t before the shift, presumably so
 * the shift is carried out in the wider type -- confirm the typedef
 * widths in unibreakdef.h.  The post-increment on *ip steps past the low
 * surrogate that was just read. */
137 return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
141 * Gets the next Unicode character in a UTF-32 sequence. The index will
142 * be advanced to the next character.
144 * @param[in] s input UTF-32 string
145 * @param[in] len length of the string in dwords
146 * @param[in,out] ip pointer to the index
147 * @return the Unicode character beginning at the index; or
148 * #EOS if end of input is encountered
150 utf32_t ub_get_next_char_utf32(