1 // Copyright 2018 Google LLC.
2 // Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
4 #include "src/utils/SkUTF.h"
6 #include "include/private/SkTFitsIn.h"
// Left-shifts `value` by `shift` bits. The shift is performed on the uint32_t
// conversion of `value` and the result is cast back to int32_t, so the shift
// operator itself is never applied to a negative signed operand.
8 static constexpr inline int32_t left_shift(int32_t value, int32_t shift) {
9 return (int32_t) ((uint32_t) value << shift);
/** Returns true iff the integer x is a multiple of 2, i.e. its lowest bit is clear. */
template <typename T> static constexpr bool is_align2(T x) { return 0 == (x & 1); }
/** Returns true iff the integer x is a multiple of 4, i.e. its two lowest bits are clear. */
template <typename T> static constexpr bool is_align4(T x) { return 0 == (x & 3); }
/** Returns true iff c is a UTF-16 high (leading) surrogate: the top six bits of c
    equal those of 0xD800, i.e. c is in [0xD800, 0xDBFF]. */
static constexpr inline bool utf16_is_high_surrogate(uint16_t c) { return (c & 0xFC00) == 0xD800; }
/** Returns true iff c is a UTF-16 low (trailing) surrogate: the top six bits of c
    equal those of 0xDC00, i.e. c is in [0xDC00, 0xDFFF]. */
static constexpr inline bool utf16_is_low_surrogate(uint16_t c) { return (c & 0xFC00) == 0xDC00; }
/** @returns -1 iff invalid UTF8 byte,
              0 iff UTF8 continuation byte,
              2 iff leading byte of 2-byte sequence,
              3 iff leading byte of 3-byte sequence, and
              4 iff leading byte of 4-byte sequence.
    I.e.: if return value > 0, then gives length of sequence.
*/
// Classifies one byte of UTF-8 input by value range; the meaning of each
// return value is listed in the @returns comment directly above.
28 static int utf8_byte_type(uint8_t c) {
31 } else if (c < 0xC0) {
33 } else if (c >= 0xF5 || (c & 0xFE) == 0xC0) { // "octet values c0, c1, f5 to ff never appear"
// 0xe5 is 0b11'10'01'01, used as a packed table of four 2-bit entries. Twice the
// high nibble of c is the right-shift amount, so for a high nibble of 0xC, 0xD,
// 0xE or 0xF the shift is 24, 26, 28 or 30 and "& 3" extracts 1, 1, 2 or 3;
// adding 1 gives 2, 2, 3 or 4. Only the two lowest bits are kept, so whether
// the right shift sign-extends does not affect the result.
36 int value = (((0xe5 << 24) >> ((unsigned)c >> 4 << 1)) & 3) + 1;
37 // assert(value >= 2 && value <=4);
/** Takes a classification produced by utf8_byte_type() and returns true iff it is
    positive, i.e. the classified byte starts a sequence and `type` is that
    sequence's length in bytes. */
static constexpr bool utf8_type_is_valid_leading_byte(int type) { return type > 0; }
43 static bool utf8_byte_is_continuation(uint8_t c) { return utf8_byte_type(c) == 0; }
45 ////////////////////////////////////////////////////////////////////////////////
// Walks the UTF-8 bytes in [utf8, utf8 + byteLength), validating each sequence.
// Returns -1 when the byte at the current position is not a valid leading byte,
// or when the sequence length implied by that leading byte would run past
// `stop`. Bytes following a leading byte are checked with
// utf8_byte_is_continuation().
// NOTE(review): the loop structure, the result of a failed continuation check
// and the success return are not visible in this copy; presumably the success
// result is the number of code points -- confirm.
47 int SkUTF::CountUTF8(const char* utf8, size_t byteLength) {
52 const char* stop = utf8 + byteLength;
54 int type = utf8_byte_type(*(const uint8_t*)utf8);
55 if (!utf8_type_is_valid_leading_byte(type) || utf8 + type > stop) {
56 return -1; // Sequence extends beyond end.
60 if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) {
// Walks the UTF-16 code units in `utf16`. `byteLength` is a size in bytes: it
// is halved to obtain the number of 16-bit units that bound the walk.
// The opening guard fires when `utf16` is null, when the pointer is not
// 2-byte aligned, or when `byteLength` is odd.
// Each unit is tested for being a low surrogate first, then for being a high
// surrogate, in which case a following unit is tested for being a low surrogate.
// NOTE(review): what each of these checks returns, and the success return, are
// not visible in this copy -- confirm against the full function.
70 int SkUTF::CountUTF16(const uint16_t* utf16, size_t byteLength) {
71 if (!utf16 || !is_align2(intptr_t(utf16)) || !is_align2(byteLength)) {
74 const uint16_t* src = (const uint16_t*)utf16;
75 const uint16_t* stop = src + (byteLength >> 1);
79 if (utf16_is_low_surrogate(c)) {
82 if (utf16_is_high_surrogate(c)) {
87 if (!utf16_is_low_surrogate(c)) {
// Validates the UTF-32 values in `utf32` and returns their count, which is
// byteLength / 4 (`byteLength` is a size in bytes).
// The opening guard fires when the pointer or `byteLength` is not a multiple of
// 4, or when byteLength / 4 does not fit in an int. Unlike CountUTF16, this
// guard contains no explicit null test.
// Every value is tested against kInvalidUnicharMask, i.e. for any of its top
// eight bits being set.
// NOTE(review): the result of the guard and of a failed mask test is not
// visible in this copy -- confirm.
96 int SkUTF::CountUTF32(const int32_t* utf32, size_t byteLength) {
97 if (!is_align4(intptr_t(utf32)) || !is_align4(byteLength) || !SkTFitsIn<int>(byteLength >> 2)) {
100 const uint32_t kInvalidUnicharMask = 0xFF000000; // unichar fits in 24 bits
101 const uint32_t* ptr = (const uint32_t*)utf32;
102 const uint32_t* stop = ptr + (byteLength >> 2);
104 if (*ptr & kInvalidUnicharMask) {
109 return (int)(byteLength >> 2);
// Failure helper shared by NextUTF8, NextUTF16 and NextUTF32: each of them
// returns next_fail(ptr, end) whenever it rejects its input.
// NOTE(review): the body is not part of this copy of the file, so the value it
// returns and what it does with *ptr cannot be stated here -- confirm.
112 template <typename T>
113 static SkUnichar next_fail(const T** ptr, const T* end) {
// Decodes one code point from the UTF-8 bytes starting at *ptr and bounded by
// `end`. The function takes the next_fail() path when:
//   - *ptr is null or already at/after `end`;
//   - the first byte is not a valid leading byte per utf8_byte_type();
//   - `end` is reached before an expected following byte can be read;
//   - a following byte is not a continuation byte.
// Each accepted continuation byte contributes its low six bits: the accumulated
// value is shifted left by 6 and OR-ed with (nextByte & 0x3F). The do/while
// loop repeats while `hic`, shifted left by one more bit, is still negative.
// NOTE(review): the declarations of `c` and `hic`, the use of `mask`, the
// update of *ptr and the success return are not visible in this copy -- confirm.
118 SkUnichar SkUTF::NextUTF8(const char** ptr, const char* end) {
122 const uint8_t* p = (const uint8_t*)*ptr;
123 if (!p || p >= (const uint8_t*)end) {
124 return next_fail(ptr, end);
129 if (!utf8_type_is_valid_leading_byte(utf8_byte_type(c))) {
130 return next_fail(ptr, end);
133 uint32_t mask = (uint32_t)~0x3F;
134 hic = left_shift(hic, 1);
137 if (p >= (const uint8_t*)end) {
138 return next_fail(ptr, end);
140 // check before reading off end of array.
141 uint8_t nextByte = *p;
142 if (!utf8_byte_is_continuation(nextByte)) {
143 return next_fail(ptr, end);
145 c = (c << 6) | (nextByte & 0x3F);
147 } while ((hic = left_shift(hic, 1)) < 0);
// Decodes one code point from the UTF-16 code units starting at *ptr and
// bounded by `end`. The function takes the next_fail() path when:
//   - *ptr is null, fewer than one unit remains before `end`, or the pointer
//     is not 2-byte aligned;
//   - the first unit is a low surrogate;
//   - the first unit is a high surrogate and the input is truncated, or the
//     unit that follows is not a low surrogate.
// For a surrogate pair the result is computed as
//   (high << 10) + low - ((0xD800 << 10) + 0xDC00 - 0x10000),
// the last form of the derivation written out in the body below.
// NOTE(review): the read of `c`, the truncation test, the update of *ptr and
// the final return are not visible in this copy -- confirm.
154 SkUnichar SkUTF::NextUTF16(const uint16_t** ptr, const uint16_t* end) {
158 const uint16_t* src = *ptr;
159 if (!src || src + 1 > end || !is_align2(intptr_t(src))) {
160 return next_fail(ptr, end);
163 SkUnichar result = c;
164 if (utf16_is_low_surrogate(c)) {
165 return next_fail(ptr, end); // srcPtr should never point at low surrogate.
167 if (utf16_is_high_surrogate(c)) {
169 return next_fail(ptr, end); // Truncated string.
171 uint16_t low = *src++;
172 if (!utf16_is_low_surrogate(low)) {
173 return next_fail(ptr, end);
176 [paraphrased from wikipedia]
177 Take the high surrogate and subtract 0xD800, then multiply by 0x400.
178 Take the low surrogate and subtract 0xDC00. Add these two results
179 together, and finally add 0x10000 to get the final decoded codepoint.
181 unicode = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000
182 unicode = (high * 0x400) - (0xD800 * 0x400) + low - 0xDC00 + 0x10000
183 unicode = (high << 10) - (0xD800 << 10) + low - 0xDC00 + 0x10000
184 unicode = (high << 10) + low - ((0xD800 << 10) + 0xDC00 - 0x10000)
186 result = (result << 10) + (SkUnichar)low - ((0xD800 << 10) + 0xDC00 - 0x10000);
// Reads one UTF-32 value from *ptr, bounded by `end`. The function takes the
// next_fail() path when *ptr is null, when fewer than one element remains
// before `end`, when the pointer is not 4-byte aligned, or when the value has
// any of its top eight bits set (kInvalidUnicharMask).
// NOTE(review): the read of `value`, the update of *ptr and the success return
// are not visible in this copy -- confirm.
192 SkUnichar SkUTF::NextUTF32(const int32_t** ptr, const int32_t* end) {
196 const int32_t* s = *ptr;
197 if (!s || s + 1 > end || !is_align4(intptr_t(s))) {
198 return next_fail(ptr, end);
201 const uint32_t kInvalidUnicharMask = 0xFF000000; // unichar fits in 24 bits
202 if (value & kInvalidUnicharMask) {
203 return next_fail(ptr, end);
// Encodes the code point `uni` as UTF-8.
// The opening test compares `uni` as an unsigned value against 0x10FFFF, so it
// catches negative inputs as well as values above the Unicode range.
// Trailing bytes are produced as 0x80 | (low six bits of uni). The first byte
// is formed by OR-ing the remaining bits of `uni` with ~(0xFF >> count), a
// pattern whose top `count` bits are set.
// NOTE(review): the result for an out-of-range `uni`, the definitions of
// `count`, `p` and `tmp`, and the return value are not visible in this copy
// -- confirm.
209 size_t SkUTF::ToUTF8(SkUnichar uni, char utf8[SkUTF::kMaxBytesInUTF8Sequence]) {
210 if ((uint32_t)uni > 0x10FFFF) {
222 while (uni > 0x7F >> count) {
223 *p++ = (char)(0x80 | (uni & 0x3F));
230 while (p < tmp + count - 1) {
233 *--utf8 = (char)(~(0xFF >> count) | uni);
// Encodes the code point `uni` as one or two UTF-16 code units in `utf16`.
// The opening test compares `uni` as an unsigned value against 0x10FFFF, so it
// catches negative inputs as well as values above the Unicode range.
// `extra` is 1 when `uni` is above 0xFFFF and 0 otherwise.
// Surrogate pair: utf16[0] = (0xD800 - 64) + (uni >> 10), where the "- 64"
// folds in the subtraction of 0x10000 (0x10000 >> 10 == 64), and
// utf16[1] = 0xDC00 | (low ten bits of uni).
// Single unit: utf16[0] = uni truncated to 16 bits.
// NOTE(review): the conditions selecting between the two forms, the result for
// an out-of-range `uni` and the return value are not visible in this copy
// -- confirm.
238 size_t SkUTF::ToUTF16(SkUnichar uni, uint16_t utf16[2]) {
239 if ((uint32_t)uni > 0x10FFFF) {
242 int extra = (uni > 0xFFFF);
245 utf16[0] = (uint16_t)((0xD800 - 64) + (uni >> 10));
246 utf16[1] = (uint16_t)(0xDC00 | (uni & 0x3FF));
248 utf16[0] = (uint16_t)uni;
// Transcodes the UTF-8 bytes in [src, src + srcByteLength) to UTF-16.
// Each code point is decoded with NextUTF8() and re-encoded with ToUTF16()
// into a local `utf16` buffer. The resulting units are copied to `dst` only
// while dst < endDst (dst + dstCapacity), so the inner copy loop stops at
// `endDst` (`dstCapacity` is a count of uint16_t elements).
// NOTE(review): the handling of a decode failure, any early checks on the
// arguments and the return value are not visible in this copy -- confirm.
254 int SkUTF::UTF8ToUTF16(uint16_t dst[], int dstCapacity, const char src[], size_t srcByteLength) {
260 uint16_t* endDst = dst + dstCapacity;
261 const char* endSrc = src + srcByteLength;
262 while (src < endSrc) {
263 SkUnichar uni = NextUTF8(&src, endSrc);
269 size_t count = ToUTF16(uni, utf16);
276 uint16_t* elems = utf16;
277 while (dst < endDst && count > 0) {
286 int SkUTF::UTF16ToUTF8(char dst[], int dstCapacity, const uint16_t src[], size_t srcLength) {
292 const char* endDst = dst + dstCapacity;
293 const uint16_t* endSrc = src + srcLength;
294 while (src < endSrc) {
295 SkUnichar uni = NextUTF16(&src, endSrc);
300 char utf8[SkUTF::kMaxBytesInUTF8Sequence];
301 size_t count = ToUTF8(uni, utf8);
308 const char* elems = utf8;
309 while (dst < endDst && count > 0) {