submodule/skia/src/utils/SkUTF.cpp

   1 // Copyright 2018 Google LLC.
   2 // Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
   3
   4 #include "src/utils/SkUTF.h"
   5
   6 #include "include/private/SkTFitsIn.h"
   7
   8 static constexpr inline int32_t left_shift(int32_t value, int32_t shift) {
   9     return (int32_t) ((uint32_t) value << shift);
  10 }
  11
  12 template <typename T> static constexpr bool is_align2(T x) { return 0 == (x & 1); }
  13
  14 template <typename T> static constexpr bool is_align4(T x) { return 0 == (x & 3); }
  15
  16 static constexpr inline bool utf16_is_high_surrogate(uint16_t c) { return (c & 0xFC00) == 0xD800; }
  17
  18 static constexpr inline bool utf16_is_low_surrogate(uint16_t c) { return (c & 0xFC00) == 0xDC00; }
  19
  20 /** @returns   -1  iff invalid UTF8 byte,
  21                 0  iff UTF8 continuation byte,
  22                 1  iff ASCII byte,
  23                 2  iff leading byte of 2-byte sequence,
  24                 3  iff leading byte of 3-byte sequence, and
  25                 4  iff leading byte of 4-byte sequence.
  26       I.e.: if return value > 0, then gives length of sequence.
  27 */
  28 static int utf8_byte_type(uint8_t c) {
  29     if (c < 0x80) {
  30         return 1;
  31     } else if (c < 0xC0) {
  32         return 0;
  33     } else if (c >= 0xF5 || (c & 0xFE) == 0xC0) { // "octet values c0, c1, f5 to ff never appear"
  34         return -1;
  35     } else {
  36         int value = (((0xe5 << 24) >> ((unsigned)c >> 4 << 1)) & 3) + 1;
  37         // assert(value >= 2 && value <=4);
  38         return value;
  39     }
  40 }
  41 static bool utf8_type_is_valid_leading_byte(int type) { return type > 0; }
  42
  43 static bool utf8_byte_is_continuation(uint8_t c) { return utf8_byte_type(c) == 0; }
  44
  45 ////////////////////////////////////////////////////////////////////////////////
  46
  47 int SkUTF::CountUTF8(const char* utf8, size_t byteLength) {
  48     if (!utf8) {
  49         return -1;
  50     }
  51     int count = 0;
  52     const char* stop = utf8 + byteLength;
  53     while (utf8 < stop) {
  54         int type = utf8_byte_type(*(const uint8_t*)utf8);
  55         if (!utf8_type_is_valid_leading_byte(type) || utf8 + type > stop) {
  56             return -1;  // Sequence extends beyond end.
  57         }
  58         while(type-- > 1) {
  59             ++utf8;
  60             if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) {
  61                 return -1;
  62             }
  63         }
  64         ++utf8;
  65         ++count;
  66     }
  67     return count;
  68 }
  69
  70 int SkUTF::CountUTF16(const uint16_t* utf16, size_t byteLength) {
  71     if (!utf16 || !is_align2(intptr_t(utf16)) || !is_align2(byteLength)) {
  72         return -1;
  73     }
  74     const uint16_t* src = (const uint16_t*)utf16;
  75     const uint16_t* stop = src + (byteLength >> 1);
  76     int count = 0;
  77     while (src < stop) {
  78         unsigned c = *src++;
  79         if (utf16_is_low_surrogate(c)) {
  80             return -1;
  81         }
  82         if (utf16_is_high_surrogate(c)) {
  83             if (src >= stop) {
  84                 return -1;
  85             }
  86             c = *src++;
  87             if (!utf16_is_low_surrogate(c)) {
  88                 return -1;
  89             }
  90         }
  91         count += 1;
  92     }
  93     return count;
  94 }
  95
  96 int SkUTF::CountUTF32(const int32_t* utf32, size_t byteLength) {
  97     if (!is_align4(intptr_t(utf32)) || !is_align4(byteLength) || !SkTFitsIn<int>(byteLength >> 2)) {
  98         return -1;
  99     }
 100     const uint32_t kInvalidUnicharMask = 0xFF000000;    // unichar fits in 24 bits
 101     const uint32_t* ptr = (const uint32_t*)utf32;
 102     const uint32_t* stop = ptr + (byteLength >> 2);
 103     while (ptr < stop) {
 104         if (*ptr & kInvalidUnicharMask) {
 105             return -1;
 106         }
 107         ptr += 1;
 108     }
 109     return (int)(byteLength >> 2);
 110 }
 111
 112 template <typename T>
 113 static SkUnichar next_fail(const T** ptr, const T* end) {
 114     *ptr = end;
 115     return -1;
 116 }
 117
 118 SkUnichar SkUTF::NextUTF8(const char** ptr, const char* end) {
 119     if (!ptr || !end ) {
 120         return -1;
 121     }
 122     const uint8_t*  p = (const uint8_t*)*ptr;
 123     if (!p || p >= (const uint8_t*)end) {
 124         return next_fail(ptr, end);
 125     }
 126     int             c = *p;
 127     int             hic = c << 24;
 128
 129     if (!utf8_type_is_valid_leading_byte(utf8_byte_type(c))) {
 130         return next_fail(ptr, end);
 131     }
 132     if (hic < 0) {
 133         uint32_t mask = (uint32_t)~0x3F;
 134         hic = left_shift(hic, 1);
 135         do {
 136             ++p;
 137             if (p >= (const uint8_t*)end) {
 138                 return next_fail(ptr, end);
 139             }
 140             // check before reading off end of array.
 141             uint8_t nextByte = *p;
 142             if (!utf8_byte_is_continuation(nextByte)) {
 143                 return next_fail(ptr, end);
 144             }
 145             c = (c << 6) | (nextByte & 0x3F);
 146             mask <<= 5;
 147         } while ((hic = left_shift(hic, 1)) < 0);
 148         c &= ~mask;
 149     }
 150     *ptr = (char*)p + 1;
 151     return c;
 152 }
 153
 154 SkUnichar SkUTF::NextUTF16(const uint16_t** ptr, const uint16_t* end) {
 155     if (!ptr || !end ) {
 156         return -1;
 157     }
 158     const uint16_t* src = *ptr;
 159     if (!src || src + 1 > end || !is_align2(intptr_t(src))) {
 160         return next_fail(ptr, end);
 161     }
 162     uint16_t c = *src++;
 163     SkUnichar result = c;
 164     if (utf16_is_low_surrogate(c)) {
 165         return next_fail(ptr, end);  // srcPtr should never point at low surrogate.
 166     }
 167     if (utf16_is_high_surrogate(c)) {
 168         if (src + 1 > end) {
 169             return next_fail(ptr, end);  // Truncated string.
 170         }
 171         uint16_t low = *src++;
 172         if (!utf16_is_low_surrogate(low)) {
 173             return next_fail(ptr, end);
 174         }
 175         /*
 176         [paraphrased from wikipedia]
 177         Take the high surrogate and subtract 0xD800, then multiply by 0x400.
 178         Take the low surrogate and subtract 0xDC00.  Add these two results
 179         together, and finally add 0x10000 to get the final decoded codepoint.
 180
 181         unicode = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000
 182         unicode = (high * 0x400) - (0xD800 * 0x400) + low - 0xDC00 + 0x10000
 183         unicode = (high << 10) - (0xD800 << 10) + low - 0xDC00 + 0x10000
 184         unicode = (high << 10) + low - ((0xD800 << 10) + 0xDC00 - 0x10000)
 185         */
 186         result = (result << 10) + (SkUnichar)low - ((0xD800 << 10) + 0xDC00 - 0x10000);
 187     }
 188     *ptr = src;
 189     return result;
 190 }
 191
 192 SkUnichar SkUTF::NextUTF32(const int32_t** ptr, const int32_t* end) {
 193     if (!ptr || !end ) {
 194         return -1;
 195     }
 196     const int32_t* s = *ptr;
 197     if (!s || s + 1 > end || !is_align4(intptr_t(s))) {
 198         return next_fail(ptr, end);
 199     }
 200     int32_t value = *s;
 201     const uint32_t kInvalidUnicharMask = 0xFF000000;    // unichar fits in 24 bits
 202     if (value & kInvalidUnicharMask) {
 203         return next_fail(ptr, end);
 204     }
 205     *ptr = s + 1;
 206     return value;
 207 }
 208
 209 size_t SkUTF::ToUTF8(SkUnichar uni, char utf8[SkUTF::kMaxBytesInUTF8Sequence]) {
 210     if ((uint32_t)uni > 0x10FFFF) {
 211         return 0;
 212     }
 213     if (uni <= 127) {
 214         if (utf8) {
 215             *utf8 = (char)uni;
 216         }
 217         return 1;
 218     }
 219     char    tmp[4];
 220     char*   p = tmp;
 221     size_t  count = 1;
 222     while (uni > 0x7F >> count) {
 223         *p++ = (char)(0x80 | (uni & 0x3F));
 224         uni >>= 6;
 225         count += 1;
 226     }
 227     if (utf8) {
 228         p = tmp;
 229         utf8 += count;
 230         while (p < tmp + count - 1) {
 231             *--utf8 = *p++;
 232         }
 233         *--utf8 = (char)(~(0xFF >> count) | uni);
 234     }
 235     return count;
 236 }
 237
 238 size_t SkUTF::ToUTF16(SkUnichar uni, uint16_t utf16[2]) {
 239     if ((uint32_t)uni > 0x10FFFF) {
 240         return 0;
 241     }
 242     int extra = (uni > 0xFFFF);
 243     if (utf16) {
 244         if (extra) {
 245             utf16[0] = (uint16_t)((0xD800 - 64) + (uni >> 10));
 246             utf16[1] = (uint16_t)(0xDC00 | (uni & 0x3FF));
 247         } else {
 248             utf16[0] = (uint16_t)uni;
 249         }
 250     }
 251     return 1 + extra;
 252 }
 253
 254 int SkUTF::UTF8ToUTF16(uint16_t dst[], int dstCapacity, const char src[], size_t srcByteLength) {
 255     if (!dst) {
 256         dstCapacity = 0;
 257     }
 258
 259     int dstLength = 0;
 260     uint16_t* endDst = dst + dstCapacity;
 261     const char* endSrc = src + srcByteLength;
 262     while (src < endSrc) {
 263         SkUnichar uni = NextUTF8(&src, endSrc);
 264         if (uni < 0) {
 265             return -1;
 266         }
 267
 268         uint16_t utf16[2];
 269         size_t count = ToUTF16(uni, utf16);
 270         if (count == 0) {
 271             return -1;
 272         }
 273         dstLength += count;
 274
 275         if (dst) {
 276             uint16_t* elems = utf16;
 277             while (dst < endDst && count > 0) {
 278                 *dst++ = *elems++;
 279                 count -= 1;
 280             }
 281         }
 282     }
 283     return dstLength;
 284 }
 285
 286 int SkUTF::UTF16ToUTF8(char dst[], int dstCapacity, const uint16_t src[], size_t srcLength) {
 287     if (!dst) {
 288         dstCapacity = 0;
 289     }
 290
 291     int dstLength = 0;
 292     const char* endDst = dst + dstCapacity;
 293     const uint16_t* endSrc = src + srcLength;
 294     while (src < endSrc) {
 295         SkUnichar uni = NextUTF16(&src, endSrc);
 296         if (uni < 0) {
 297             return -1;
 298         }
 299
 300         char utf8[SkUTF::kMaxBytesInUTF8Sequence];
 301         size_t count = ToUTF8(uni, utf8);
 302         if (count == 0) {
 303             return -1;
 304         }
 305         dstLength += count;
 306
 307         if (dst) {
 308             const char* elems = utf8;
 309             while (dst < endDst && count > 0) {
 310                 *dst++ = *elems++;
 311                 count -= 1;
 312             }
 313         }
 314     }
 315     return dstLength;
 316 }