2 * Copyright (c) 2015 Samsung Electronics Co., Ltd.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include <dali-toolkit/internal/text/character-set-conversion.h>
// Length in bytes of a UTF-8 sequence, used as the entries of UTF8_LENGTH.
constexpr uint8_t U1 = 1u;
constexpr uint8_t U2 = 2u;
constexpr uint8_t U3 = 3u;
constexpr uint8_t U4 = 4u;
constexpr uint8_t U5 = 5u;
constexpr uint8_t U6 = 6u;
constexpr uint8_t U0 = 0u; // Marks a byte that can not start a sequence.

/**
 * Number of bytes of the UTF-8 sequence introduced by each possible lead byte.
 *
 * The table is indexed by the value of the lead byte (one row per 16 values).
 * Bytes 0x80 - 0xBF (10xx xxxx) are mapped to a length of one byte.
 * Bytes 0xFE and 0xFF are not valid lead bytes and are mapped to U0.
 */
constexpr uint8_t UTF8_LENGTH[256] = {
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x00 - 0x0F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x10 - 0x1F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x20 - 0x2F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x30 - 0x3F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x40 - 0x4F lead byte = 0xxx xxxx (U+0000 - U+007F)
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x50 - 0x5F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x60 - 0x6F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x70 - 0x7F

  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x80 - 0x8F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x90 - 0x9F byte = 10xx xxxx (some extended ascii characters)
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0xA0 - 0xAF
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0xB0 - 0xBF

  U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, // 0xC0 - 0xCF lead byte = 110x xxxx (U+0080 - U+07FF)
  U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, // 0xD0 - 0xDF

  U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, // 0xE0 - 0xEF lead byte = 1110 xxxx (U+0800 - U+FFFF)

  U4, U4, U4, U4, U4, U4, U4, U4, // 0xF0 - 0xF7 lead byte = 1111 0xxx (U+10000 - U+1FFFFF)
  U5, U5, U5, U5,                 // 0xF8 - 0xFB lead byte = 1111 10xx (U+200000 - U+3FFFFFF)
  U6, U6,                         // 0xFC - 0xFD lead byte = 1111 110x (U+4000000 - U+7FFFFFFF)
  U0, U0,                         // 0xFE - 0xFF Non valid.
};
// Line terminator characters, used to normalise CR and CR+LF to LF.
constexpr uint8_t CR = 0xd; // Carriage return.
constexpr uint8_t LF = 0xa; // Line feed.
81 uint8_t GetUtf8Length(uint8_t utf8LeadByte)
83 return UTF8_LENGTH[utf8LeadByte];
86 uint32_t GetNumberOfUtf8Characters(const uint8_t* const utf8, uint32_t length)
88 uint32_t numberOfCharacters = 0u;
90 const uint8_t* begin = utf8;
91 const uint8_t* end = utf8 + length;
93 for(; begin < end; begin += UTF8_LENGTH[*begin])
98 return numberOfCharacters;
/**
 * @brief Calculates the number of bytes needed to encode UTF-32 codes as UTF-8.
 *
 * Codes equal or greater than 0x80000000 can not be encoded and add no bytes.
 *
 * @param[in] utf32 The UTF-32 codes.
 * @param[in] numberOfCharacters The number of codes in @p utf32.
 *
 * @return The number of bytes of the UTF-8 encoding.
 */
uint32_t GetNumberOfUtf8Bytes(const uint32_t* const utf32, uint32_t numberOfCharacters)
{
  uint32_t numberOfBytes = 0u;

  const uint32_t*       begin = utf32;
  const uint32_t* const end   = utf32 + numberOfCharacters;

  for(; begin < end; ++begin)
  {
    const uint32_t code = *begin;

    if(code < 0x80u)
    {
      numberOfBytes += 1u;
    }
    else if(code < 0x800u)
    {
      numberOfBytes += 2u;
    }
    else if(code < 0x10000u)
    {
      numberOfBytes += 3u;
    }
    else if(code < 0x200000u)
    {
      numberOfBytes += 4u;
    }
    else if(code < 0x4000000u)
    {
      numberOfBytes += 5u;
    }
    else if(code < 0x80000000u)
    {
      numberOfBytes += 6u;
    }
  }

  return numberOfBytes;
}
141 uint32_t Utf8ToUtf32(const uint8_t* const utf8, uint32_t length, uint32_t* utf32)
143 uint32_t numberOfCharacters = 0u;
145 const uint8_t* begin = utf8;
146 const uint8_t* end = utf8 + length;
148 for(; begin < end; ++numberOfCharacters)
150 const uint8_t leadByte = *begin;
152 switch(UTF8_LENGTH[leadByte])
158 // Replace CR+LF or CR by LF
161 // Look ahead if the next one is a LF.
181 uint32_t& code = *utf32++;
182 code = leadByte & 0x1fu;
185 code |= *begin++ & 0x3fu;
191 uint32_t& code = *utf32++;
192 code = leadByte & 0x0fu;
195 code |= *begin++ & 0x3fu;
197 code |= *begin++ & 0x3fu;
203 uint32_t& code = *utf32++;
204 code = leadByte & 0x07u;
207 code |= *begin++ & 0x3fu;
209 code |= *begin++ & 0x3fu;
211 code |= *begin++ & 0x3fu;
217 uint32_t& code = *utf32++;
218 code = leadByte & 0x03u;
221 code |= *begin++ & 0x3fu;
223 code |= *begin++ & 0x3fu;
225 code |= *begin++ & 0x3fu;
227 code |= *begin++ & 0x3fu;
233 uint32_t& code = *utf32++;
234 code = leadByte & 0x01u;
237 code |= *begin++ & 0x3fu;
239 code |= *begin++ & 0x3fu;
241 code |= *begin++ & 0x3fu;
243 code |= *begin++ & 0x3fu;
245 code |= *begin++ & 0x3fu;
249 case U0: // Invalid case
252 *utf32++ = 0x20; // Use white space
258 return numberOfCharacters;
/**
 * @brief Converts UTF-32 codes into a UTF-8 encoded buffer.
 *
 * Codes below 0x80000000 are encoded with one to six bytes.
 * Codes equal or greater than 0x80000000 can not be encoded and are skipped.
 *
 * @param[in] utf32 The UTF-32 codes.
 * @param[in] numberOfCharacters The number of codes in @p utf32.
 * @param[out] utf8 The buffer where the UTF-8 bytes are written. It must be big
 *                  enough to store all the bytes (see GetNumberOfUtf8Bytes()).
 *
 * @return The number of bytes written.
 */
uint32_t Utf32ToUtf8(const uint32_t* const utf32, uint32_t numberOfCharacters, uint8_t* utf8)
{
  const uint32_t*       begin = utf32;
  const uint32_t* const end   = utf32 + numberOfCharacters;

  const uint8_t* const utf8Begin = utf8;

  for(; begin < end; ++begin)
  {
    const uint32_t code = *begin;

    if(code < 0x80u)
    {
      *utf8++ = static_cast<uint8_t>(code);
    }
    else if(code < 0x800u)
    {
      *utf8++ = static_cast<uint8_t>((code >> 6u) | 0xc0u);           // lead byte for 2 byte sequence
      *utf8++ = static_cast<uint8_t>((code & 0x3fu) | 0x80u);         // continuation byte
    }
    else if(code < 0x10000u)
    {
      *utf8++ = static_cast<uint8_t>((code >> 12u) | 0xe0u);          // lead byte for 3 byte sequence
      *utf8++ = static_cast<uint8_t>(((code >> 6u) & 0x3fu) | 0x80u); // continuation byte
      *utf8++ = static_cast<uint8_t>((code & 0x3fu) | 0x80u);         // continuation byte
    }
    else if(code < 0x200000u)
    {
      *utf8++ = static_cast<uint8_t>((code >> 18u) | 0xf0u);           // lead byte for 4 byte sequence
      *utf8++ = static_cast<uint8_t>(((code >> 12u) & 0x3fu) | 0x80u); // continuation byte
      *utf8++ = static_cast<uint8_t>(((code >> 6u) & 0x3fu) | 0x80u);  // continuation byte
      *utf8++ = static_cast<uint8_t>((code & 0x3fu) | 0x80u);          // continuation byte
    }
    else if(code < 0x4000000u)
    {
      *utf8++ = static_cast<uint8_t>((code >> 24u) | 0xf8u);           // lead byte for 5 byte sequence
      *utf8++ = static_cast<uint8_t>(((code >> 18u) & 0x3fu) | 0x80u); // continuation byte
      *utf8++ = static_cast<uint8_t>(((code >> 12u) & 0x3fu) | 0x80u); // continuation byte
      *utf8++ = static_cast<uint8_t>(((code >> 6u) & 0x3fu) | 0x80u);  // continuation byte
      *utf8++ = static_cast<uint8_t>((code & 0x3fu) | 0x80u);          // continuation byte
    }
    else if(code < 0x80000000u)
    {
      *utf8++ = static_cast<uint8_t>((code >> 30u) | 0xfcu);           // lead byte for 6 byte sequence
      *utf8++ = static_cast<uint8_t>(((code >> 24u) & 0x3fu) | 0x80u); // continuation byte
      *utf8++ = static_cast<uint8_t>(((code >> 18u) & 0x3fu) | 0x80u); // continuation byte
      *utf8++ = static_cast<uint8_t>(((code >> 12u) & 0x3fu) | 0x80u); // continuation byte
      *utf8++ = static_cast<uint8_t>(((code >> 6u) & 0x3fu) | 0x80u);  // continuation byte
      *utf8++ = static_cast<uint8_t>((code & 0x3fu) | 0x80u);          // continuation byte
    }
  }

  return static_cast<uint32_t>(utf8 - utf8Begin);
}
318 void Utf32ToUtf8(const uint32_t* const utf32, uint32_t numberOfCharacters, std::string& utf8)
322 uint32_t numberOfBytes = GetNumberOfUtf8Bytes(&utf32[0], numberOfCharacters);
323 utf8.resize(numberOfBytes);
325 // This is a bit horrible but std::string returns a (signed) char*
326 Utf32ToUtf8(utf32, numberOfCharacters, reinterpret_cast<uint8_t*>(&utf8[0]));
331 } // namespace Toolkit