X-Git-Url: http://review.tizen.org/git/?p=platform%2Fcore%2Fuifw%2Fdali-toolkit.git;a=blobdiff_plain;f=dali-toolkit%2Finternal%2Ftext%2Fcharacter-set-conversion.cpp;h=74022925f6a7a49b3e437fa17cae9ab727107f26;hp=abda1021041dc3e0bab5d23507b7f519c2544d45;hb=528aa3699cd51dab5115bca1aaebb65d4bc67c15;hpb=830f03638ec6ecd3b12ba3d9eb6419fdb3a3db09 diff --git a/dali-toolkit/internal/text/character-set-conversion.cpp b/dali-toolkit/internal/text/character-set-conversion.cpp index abda102..7402292 100644 --- a/dali-toolkit/internal/text/character-set-conversion.cpp +++ b/dali-toolkit/internal/text/character-set-conversion.cpp @@ -24,12 +24,17 @@ namespace Dali namespace Toolkit { +namespace Text +{ + namespace { const static uint8_t U1 = 1u; const static uint8_t U2 = 2u; const static uint8_t U3 = 3u; const static uint8_t U4 = 4u; + const static uint8_t U5 = 5u; + const static uint8_t U6 = 6u; const static uint8_t U0 = 0u; const static uint8_t UTF8_LENGTH[256] = { U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // @@ -63,11 +68,22 @@ namespace U4, U4, U4, U4, U4, U4, U4, U4, // lead byte = 1111 0xxx (U+10000 - U+1FFFFF) - U0, U0, U0, U0, // Non valid. - U0, U0, U0, U0, // Non valid. + U5, U5, U5, U5, // lead byte = 1111 10xx (U+200000 - U+3FFFFFF) + + U6, U6, // lead byte = 1111 110x (U+4000000 - U+7FFFFFFF) + + U0, U0, // Non valid. }; + + const uint8_t CR = 0xd; + const uint8_t LF = 0xa; } // namespace +uint8_t GetUtf8Length( uint8_t utf8LeadByte ) +{ + return UTF8_LENGTH[utf8LeadByte]; +} + uint32_t GetNumberOfUtf8Characters( const uint8_t* const utf8, uint32_t length ) { uint32_t numberOfCharacters = 0u; @@ -107,6 +123,14 @@ uint32_t GetNumberOfUtf8Bytes( const uint32_t* const utf32, uint32_t numberOfCha { numberOfBytes += U4; } + else if( code < 0x4000000u ) + { + numberOfBytes += U5; + } + else if( code < 0x80000000u ) + { + numberOfBytes += U6; + } } return numberOfBytes; @@ -127,8 +151,26 @@ uint32_t Utf8ToUtf32( const uint8_t* const utf8, uint32_t length, uint32_t* utf3 { case U1: { - *utf32++ = leadByte; - begin++; + if( CR == leadByte ) + { + // Replace CR+LF or CR by LF + *utf32++ = LF; + + // Look ahead if the next one is a LF. + ++begin; + if( begin < end ) + { + if( LF == *begin ) + { + ++begin; + } + } + } + else + { + *utf32++ = leadByte; + begin++; + } break; } @@ -145,7 +187,7 @@ uint32_t Utf8ToUtf32( const uint8_t* const utf8, uint32_t length, uint32_t* utf3 case U3: { uint32_t& code = *utf32++; - code = leadByte & 0x1fu; + code = leadByte & 0x0fu; begin++; code <<= 6u; code |= *begin++ & 0x3fu; @@ -157,7 +199,7 @@ uint32_t Utf8ToUtf32( const uint8_t* const utf8, uint32_t length, uint32_t* utf3 case U4: { uint32_t& code = *utf32++; - code = leadByte & 0x1fu; + code = leadByte & 0x07u; begin++; code <<= 6u; code |= *begin++ & 0x3fu; @@ -167,17 +209,60 @@ uint32_t Utf8ToUtf32( const uint8_t* const utf8, uint32_t length, uint32_t* utf3 code |= *begin++ & 0x3fu; break; } + + case U5: + { + uint32_t& code = *utf32++; + code = leadByte & 0x03u; + begin++; + code <<= 6u; + code |= *begin++ & 0x3fu; + code <<= 6u; + code |= *begin++ & 0x3fu; + code <<= 6u; + code |= *begin++ & 0x3fu; + code <<= 6u; + code |= *begin++ & 0x3fu; + break; + } + + case U6: + { + uint32_t& code = *utf32++; + code = leadByte & 0x01u; + begin++; + code <<= 6u; + code |= *begin++ & 0x3fu; + code <<= 6u; + code |= *begin++ & 0x3fu; + code <<= 6u; + code |= *begin++ & 0x3fu; + code <<= 6u; + code |= *begin++ & 0x3fu; + code <<= 6u; + code |= *begin++ & 0x3fu; + break; + } + + case U0: // Invalid case + { + begin++; + *utf32++ = 0x20; // Use white space + break; + } } } return numberOfCharacters; } -void Utf32ToUtf8( const uint32_t* const utf32, uint32_t numberOfCharacters, uint8_t* utf8 ) +uint32_t Utf32ToUtf8( const uint32_t* const utf32, uint32_t numberOfCharacters, uint8_t* utf8 ) { const uint32_t* begin = utf32; const uint32_t* end = utf32 + numberOfCharacters; + uint8_t* utf8Begin = utf8; + for( ; begin < end; ++begin ) { const uint32_t code = *begin; @@ -193,20 +278,52 @@ void Utf32ToUtf8( const uint32_t* const utf32, uint32_t numberOfCharacters, uint } else if( code < 0x10000u ) { - *utf8++ = static_cast( code >> 12u ) | 0xe0u; // lead byte for 2 byte sequence + *utf8++ = static_cast( code >> 12u ) | 0xe0u; // lead byte for 3 byte sequence *utf8++ = static_cast( ( code >> 6u ) & 0x3f ) | 0x80u; // continuation byte *utf8++ = static_cast( code & 0x3f ) | 0x80u; // continuation byte } else if( code < 0x200000u ) { - *utf8++ = static_cast( code >> 18u ) | 0xf0u; // lead byte for 2 byte sequence + *utf8++ = static_cast( code >> 18u ) | 0xf0u; // lead byte for 4 byte sequence + *utf8++ = static_cast( ( code >> 12u ) & 0x3f ) | 0x80u; // continuation byte + *utf8++ = static_cast( ( code >> 6u ) & 0x3f ) | 0x80u; // continuation byte + *utf8++ = static_cast( code & 0x3f ) | 0x80u; // continuation byte + } + else if( code < 0x4000000u ) + { + *utf8++ = static_cast( code >> 24u ) | 0xf8u; // lead byte for 5 byte sequence + *utf8++ = static_cast( ( code >> 18u ) & 0x3f ) | 0x80u; // continuation byte + *utf8++ = static_cast( ( code >> 12u ) & 0x3f ) | 0x80u; // continuation byte + *utf8++ = static_cast( ( code >> 6u ) & 0x3f ) | 0x80u; // continuation byte + *utf8++ = static_cast( code & 0x3f ) | 0x80u; // continuation byte + } + else if( code < 0x80000000u ) + { + *utf8++ = static_cast( code >> 30u ) | 0xfcu; // lead byte for 6 byte sequence + *utf8++ = static_cast( ( code >> 24u ) & 0x3f ) | 0x80u; // continuation byte + *utf8++ = static_cast( ( code >> 18u ) & 0x3f ) | 0x80u; // continuation byte *utf8++ = static_cast( ( code >> 12u ) & 0x3f ) | 0x80u; // continuation byte *utf8++ = static_cast( ( code >> 6u ) & 0x3f ) | 0x80u; // continuation byte *utf8++ = static_cast( code & 0x3f ) | 0x80u; // continuation byte } } + + return utf8 - utf8Begin; +} + +void Utf32ToUtf8( const uint32_t* const utf32, uint32_t numberOfCharacters, std::string& utf8 ) +{ + utf8.clear(); + + uint32_t numberOfBytes = GetNumberOfUtf8Bytes( &utf32[0], numberOfCharacters ); + utf8.resize( numberOfBytes ); + + // This is a bit horrible but std::string returns a (signed) char* + Utf32ToUtf8( utf32, numberOfCharacters, reinterpret_cast(&utf8[0]) ); } +} // namespace Text + } // namespace Toolkit } // namespace Dali