// Derived from dali-toolkit/internal/text/character-set-conversion.cpp
// (platform/core/uifw/dali-toolkit.git, blob 9b84c3b, which supersedes blob d79fbc3).
// Compared with blob d79fbc3, lead bytes 1111 10xx and 1111 110x are decoded and encoded
// as 5- and 6-byte sequences instead of being treated as invalid.

namespace Dali
{
namespace Toolkit
{
namespace Text
{
namespace
{
// clang-format off
constexpr uint8_t U1 = 1u;
constexpr uint8_t U2 = 2u;
constexpr uint8_t U3 = 3u;
constexpr uint8_t U4 = 4u;
constexpr uint8_t U5 = 5u;
constexpr uint8_t U6 = 6u;
constexpr uint8_t U0 = 0u;

// Number of bytes of the utf8 sequence introduced by each possible lead byte.
// One row per high nibble of the lead byte.
constexpr uint8_t UTF8_LENGTH[256] = {
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x00 - 0x0F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x10 - 0x1F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x20 - 0x2F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x30 - 0x3F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x40 - 0x4F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x50 - 0x5F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x60 - 0x6F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x70 - 0x7F  lead byte = 0xxx xxxx (U+0000 - U+007F)
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x80 - 0x8F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x90 - 0x9F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0xA0 - 0xAF
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0xB0 - 0xBF  10xx xxxx (continuation byte range) is mapped to length 1

  U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, // 0xC0 - 0xCF
  U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, // 0xD0 - 0xDF  lead byte = 110x xxxx (U+0080 - U+07FF)

  U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, // 0xE0 - 0xEF  lead byte = 1110 xxxx (U+0800 - U+FFFF)

  U4, U4, U4, U4, U4, U4, U4, U4, // 0xF0 - 0xF7  lead byte = 1111 0xxx (U+10000 - U+1FFFFF)
  U5, U5, U5, U5,                 // 0xF8 - 0xFB  lead byte = 1111 10xx (U+200000 - U+3FFFFFF)
  U6, U6,                         // 0xFC - 0xFD  lead byte = 1111 110x (U+4000000 - U+7FFFFFFF)
  U0, U0,                         // 0xFE - 0xFF  Not valid.
};

constexpr uint8_t CR = 0xd;
constexpr uint8_t LF = 0xa;
// clang-format on

/**
 * @brief Retrieves the number of utf8 bytes needed to encode a utf32 code.
 *
 * Shared by GetNumberOfUtf8Bytes() and Utf32ToUtf8() so both always agree on the size.
 *
 * @param[in] code The utf32 code.
 * @return A value between 1 and 6, or zero for codes equal to or greater than 0x80000000 (which are not encoded).
 */
constexpr uint8_t Utf8BytesForCodePoint(uint32_t code)
{
  // clang-format off
  return (code < 0x80u)       ? U1 :
         (code < 0x800u)      ? U2 :
         (code < 0x10000u)    ? U3 :
         (code < 0x200000u)   ? U4 :
         (code < 0x4000000u)  ? U5 :
         (code < 0x80000000u) ? U6 : U0;
  // clang-format on
}

} // namespace

/**
 * @brief Retrieves the length in bytes of the utf8 sequence introduced by the given lead byte.
 *
 * @param[in] utf8LeadByte The first byte of a utf8 sequence.
 * @return The sequence's length in bytes (1 to 6), or zero if the lead byte is not valid (0xFE and 0xFF).
 */
uint8_t GetUtf8Length(uint8_t utf8LeadByte)
{
  return UTF8_LENGTH[utf8LeadByte];
}

/**
 * @brief Counts the characters stored in a utf8 buffer.
 *
 * A non valid lead byte counts as one character and consumes one byte, which is how Utf8ToUtf32()
 * treats it. A CR immediately followed by a LF is counted as two characters although Utf8ToUtf32()
 * merges them, so the returned value is never smaller than the number of characters Utf8ToUtf32() writes.
 *
 * @param[in] utf8 Pointer to the utf8 buffer.
 * @param[in] length The size of the buffer in bytes.
 * @return The number of characters.
 */
uint32_t GetNumberOfUtf8Characters(const uint8_t* const utf8, uint32_t length)
{
  uint32_t numberOfCharacters = 0u;

  for(uint32_t index = 0u; index < length; ++numberOfCharacters)
  {
    const uint32_t sequenceLength = UTF8_LENGTH[utf8[index]];

    // A zero length would never advance the index: skip a single byte instead.
    const uint32_t step = (U0 == sequenceLength) ? 1u : sequenceLength;

    // Clamp the step so a truncated trailing sequence doesn't move the index past the buffer.
    const uint32_t remaining = length - index;
    index += (step < remaining) ? step : remaining;
  }

  return numberOfCharacters;
}

/**
 * @brief Calculates the number of bytes needed to encode a utf32 buffer as utf8.
 *
 * Codes equal to or greater than 0x80000000 are not counted; Utf32ToUtf8() doesn't write them.
 *
 * @param[in] utf32 Pointer to the utf32 buffer.
 * @param[in] numberOfCharacters The number of characters in the utf32 buffer.
 * @return The number of utf8 bytes.
 */
uint32_t GetNumberOfUtf8Bytes(const uint32_t* const utf32, uint32_t numberOfCharacters)
{
  uint32_t numberOfBytes = 0u;

  const uint32_t* const end = utf32 + numberOfCharacters;
  for(const uint32_t* it = utf32; it < end; ++it)
  {
    numberOfBytes += Utf8BytesForCodePoint(*it);
  }

  return numberOfBytes;
}

/**
 * @brief Converts a utf8 buffer to utf32.
 *
 * - CR+LF and a lone CR are both replaced by a single LF.
 * - A non valid lead byte is replaced by a white space (0x20).
 * - A multi-byte sequence cut short by the end of the buffer is replaced by a single white space
 *   and the conversion stops; no byte beyond @p length is read.
 *
 * Continuation bytes are not validated: only their six low bits are used.
 *
 * @param[in] utf8 Pointer to the utf8 buffer.
 * @param[in] length The size of the utf8 buffer in bytes.
 * @param[out] utf32 Pointer to the destination buffer. It must have room for
 *                   GetNumberOfUtf8Characters( utf8, length ) characters.
 * @return The number of utf32 characters written.
 */
uint32_t Utf8ToUtf32(const uint8_t* const utf8, uint32_t length, uint32_t* utf32)
{
  uint32_t numberOfCharacters = 0u;

  const uint8_t*       begin = utf8;
  const uint8_t* const end   = utf8 + length;

  for(; begin < end; ++numberOfCharacters)
  {
    const uint8_t  leadByte       = *begin;
    const uint32_t sequenceLength = UTF8_LENGTH[leadByte];

    if(U0 == sequenceLength) // Invalid case
    {
      ++begin;
      *utf32++ = 0x20; // Use white space
    }
    else if(U1 == sequenceLength)
    {
      ++begin;
      if(CR == leadByte)
      {
        // Replace CR+LF or CR by LF
        *utf32++ = LF;

        // Look ahead if the next one is a LF.
        if((begin < end) && (LF == *begin))
        {
          ++begin;
        }
      }
      else
      {
        *utf32++ = leadByte;
      }
    }
    else if(static_cast<uint32_t>(end - begin) < sequenceLength)
    {
      // Truncated sequence: consume what is left and emit one white space, so the number of
      // characters written matches what GetNumberOfUtf8Characters() counts for these bytes.
      begin    = end;
      *utf32++ = 0x20;
    }
    else
    {
      // The lead byte of an N byte sequence stores its payload in the 7 - N low bits.
      uint32_t code = leadByte & (0xffu >> (sequenceLength + 1u));

      // Every continuation byte contributes six more bits.
      for(uint32_t index = 1u; index < sequenceLength; ++index)
      {
        code = (code << 6u) | (begin[index] & 0x3fu);
      }

      begin += sequenceLength;
      *utf32++ = code;
    }
  }

  return numberOfCharacters;
}

/**
 * @brief Converts a utf32 buffer to utf8.
 *
 * Codes equal to or greater than 0x80000000 can't be encoded and are skipped.
 *
 * @param[in] utf32 Pointer to the utf32 buffer.
 * @param[in] numberOfCharacters The number of characters in the utf32 buffer.
 * @param[out] utf8 Pointer to the destination buffer. It must have room for
 *                  GetNumberOfUtf8Bytes( utf32, numberOfCharacters ) bytes.
 * @return The number of utf8 bytes written.
 */
uint32_t Utf32ToUtf8(const uint32_t* const utf32, uint32_t numberOfCharacters, uint8_t* utf8)
{
  uint8_t* const        utf8Begin = utf8;
  const uint32_t* const end       = utf32 + numberOfCharacters;

  for(const uint32_t* it = utf32; it < end; ++it)
  {
    const uint32_t code           = *it;
    const uint32_t sequenceLength = Utf8BytesForCodePoint(code);

    if(U1 == sequenceLength)
    {
      *utf8++ = static_cast<uint8_t>(code);
    }
    else if(U0 != sequenceLength)
    {
      // The lead byte starts with as many 1 bits as bytes has the sequence, followed by a 0 bit:
      // 0xc0, 0xe0, 0xf0, 0xf8 and 0xfc for 2, 3, 4, 5 and 6 bytes.
      const uint32_t leadMarker = (0xff00u >> sequenceLength) & 0xffu;

      uint32_t shift = 6u * (sequenceLength - 1u);
      *utf8++        = static_cast<uint8_t>((code >> shift) | leadMarker); // lead byte

      while(shift > 0u)
      {
        shift -= 6u;
        *utf8++ = static_cast<uint8_t>(((code >> shift) & 0x3fu) | 0x80u); // continuation byte
      }
    }
  }

  return static_cast<uint32_t>(utf8 - utf8Begin);
}

/**
 * @brief Converts a utf32 buffer to a utf8 string.
 *
 * @param[in] utf32 Pointer to the utf32 buffer.
 * @param[in] numberOfCharacters The number of characters in the utf32 buffer.
 * @param[out] utf8 The string where the utf8 bytes are stored. Any previous content is discarded.
 */
void Utf32ToUtf8(const uint32_t* const utf32, uint32_t numberOfCharacters, std::string& utf8)
{
  utf8.clear();

  const uint32_t numberOfBytes = GetNumberOfUtf8Bytes(utf32, numberOfCharacters);
  utf8.resize(numberOfBytes);

  if(numberOfBytes > 0u)
  {
    // std::string stores (signed) char, the converter writes uint8_t.
    Utf32ToUtf8(utf32, numberOfCharacters, reinterpret_cast<uint8_t*>(&utf8[0]));
  }
}

} // namespace Text

} // namespace Toolkit

} // namespace Dali