namespace Dali
{
-
namespace Toolkit
{
-
+namespace Text
+{
namespace
{
// clang-format off

// Sequence lengths, in bytes, stored in the lead byte look-up table below.
constexpr uint8_t U1 = 1u;
constexpr uint8_t U2 = 2u;
constexpr uint8_t U3 = 3u;
constexpr uint8_t U4 = 4u;
constexpr uint8_t U5 = 5u;
constexpr uint8_t U6 = 6u;
constexpr uint8_t U0 = 0u; // The byte can not start a sequence.

/**
 * Length in bytes of the UTF-8 sequence introduced by every possible lead byte.
 *
 * The table follows the historical UTF-8 definition that allows sequences of up to six bytes.
 * Each row holds 16 entries, so a row covers the bytes 0xN0 - 0xNF.
 *
 * Continuation bytes (10xx xxxx, 0x80 - 0xBF) are not valid lead bytes; they are given a length
 * of one so that a stray continuation byte is consumed as a single byte.
 */
constexpr uint8_t UTF8_LENGTH[256] = {
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x00 - 0x0F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x10 - 0x1F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x20 - 0x2F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x30 - 0x3F  lead byte = 0xxx xxxx (U+0000 - U+007F)
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x40 - 0x4F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x50 - 0x5F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x60 - 0x6F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x70 - 0x7F

  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x80 - 0x8F
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0x90 - 0x9F  continuation byte = 10xx xxxx
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0xA0 - 0xAF
  U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // 0xB0 - 0xBF

  U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, // 0xC0 - 0xCF  lead byte = 110x xxxx (U+0080 - U+07FF)
  U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, // 0xD0 - 0xDF

  U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, // 0xE0 - 0xEF  lead byte = 1110 xxxx (U+0800 - U+FFFF)

  U4, U4, U4, U4, U4, U4, U4, U4,                                 // 0xF0 - 0xF7  lead byte = 1111 0xxx (U+10000 - U+1FFFFF)
  U5, U5, U5, U5,                                                 // 0xF8 - 0xFB  lead byte = 1111 10xx (U+200000 - U+3FFFFFF)
  U6, U6,                                                         // 0xFC - 0xFD  lead byte = 1111 110x (U+4000000 - U+7FFFFFFF)
  U0, U0,                                                         // 0xFE - 0xFF  Non valid.
};

// Guard the boundaries between the ranges so a miscounted row is caught at compile time.
static_assert(UTF8_LENGTH[0xBF] == U1 && UTF8_LENGTH[0xC0] == U2, "Wrong 1 / 2 byte boundary in UTF8_LENGTH");
static_assert(UTF8_LENGTH[0xDF] == U2 && UTF8_LENGTH[0xE0] == U3, "Wrong 2 / 3 byte boundary in UTF8_LENGTH");
static_assert(UTF8_LENGTH[0xEF] == U3 && UTF8_LENGTH[0xF0] == U4, "Wrong 3 / 4 byte boundary in UTF8_LENGTH");
static_assert(UTF8_LENGTH[0xF7] == U4 && UTF8_LENGTH[0xF8] == U5, "Wrong 4 / 5 byte boundary in UTF8_LENGTH");
static_assert(UTF8_LENGTH[0xFB] == U5 && UTF8_LENGTH[0xFC] == U6, "Wrong 5 / 6 byte boundary in UTF8_LENGTH");
static_assert(UTF8_LENGTH[0xFD] == U6 && UTF8_LENGTH[0xFE] == U0 && UTF8_LENGTH[0xFF] == U0, "Wrong invalid range in UTF8_LENGTH");

constexpr uint8_t CR = 0xd; // Carriage return.
constexpr uint8_t LF = 0xa; // Line feed.
// clang-format on
} // namespace
-uint32_t GetNumberOfUtf8Characters( const uint8_t* const utf8, uint32_t length )
+uint8_t GetUtf8Length(uint8_t utf8LeadByte)
+{
+ return UTF8_LENGTH[utf8LeadByte];
+}
+
+uint32_t GetNumberOfUtf8Characters(const uint8_t* const utf8, uint32_t length)
{
uint32_t numberOfCharacters = 0u;
const uint8_t* begin = utf8;
- const uint8_t* end = utf8 + length;
+ const uint8_t* end = utf8 + length;
- for( ; begin < end ; begin += UTF8_LENGTH[*begin], ++numberOfCharacters );
+ for(; begin < end; begin += UTF8_LENGTH[*begin])
+ {
+ ++numberOfCharacters;
+ }
return numberOfCharacters;
}
/**
 * @brief Calculates the number of bytes needed to encode the given UTF-32 characters as UTF-8.
 *
 * Sequences of up to six bytes are taken into account. A code equal to or greater than
 * 0x80000000 can not be encoded and contributes no bytes.
 *
 * @param[in] utf32 The UTF-32 characters.
 * @param[in] numberOfCharacters The number of characters in @p utf32.
 *
 * @return The number of UTF-8 bytes.
 */
uint32_t GetNumberOfUtf8Bytes(const uint32_t* const utf32, uint32_t numberOfCharacters)
{
  // SEQUENCE_LIMITS[i] is the first code which does not fit in a sequence of (i + 1) bytes.
  constexpr uint32_t SEQUENCE_LIMITS[]       = {0x80u, 0x800u, 0x10000u, 0x200000u, 0x4000000u, 0x80000000u};
  constexpr uint32_t NUMBER_OF_SEQUENCE_LIMITS = sizeof(SEQUENCE_LIMITS) / sizeof(SEQUENCE_LIMITS[0]);

  uint32_t numberOfBytes = 0u;

  const uint32_t* const end = utf32 + numberOfCharacters;
  for(const uint32_t* it = utf32; it < end; ++it)
  {
    const uint32_t code = *it;

    // The first limit above the code gives the length of its sequence.
    for(uint32_t index = 0u; index < NUMBER_OF_SEQUENCE_LIMITS; ++index)
    {
      if(code < SEQUENCE_LIMITS[index])
      {
        numberOfBytes += index + 1u;
        break;
      }
    }
  }

  return numberOfBytes;
}
-uint32_t Utf8ToUtf32( const uint8_t* const utf8, uint32_t length, uint32_t* utf32 )
+uint32_t Utf8ToUtf32(const uint8_t* const utf8, uint32_t length, uint32_t* utf32)
{
uint32_t numberOfCharacters = 0u;
const uint8_t* begin = utf8;
- const uint8_t* end = utf8 + length;
+ const uint8_t* end = utf8 + length;
- for( ; begin < end ; ++numberOfCharacters )
+ for(; begin < end; ++numberOfCharacters)
{
const uint8_t leadByte = *begin;
- switch( UTF8_LENGTH[leadByte] )
+ switch(UTF8_LENGTH[leadByte])
{
case U1:
{
- *utf32++ = leadByte;
- begin++;
+ if(CR == leadByte)
+ {
+ // Replace CR+LF or CR by LF
+ *utf32++ = LF;
+
+ // Look ahead if the next one is a LF.
+ ++begin;
+ if(begin < end)
+ {
+ if(LF == *begin)
+ {
+ ++begin;
+ }
+ }
+ }
+ else
+ {
+ *utf32++ = leadByte;
+ begin++;
+ }
break;
}
case U2:
{
uint32_t& code = *utf32++;
- code = leadByte & 0x1fu;
+ code = leadByte & 0x1fu;
begin++;
code <<= 6u;
code |= *begin++ & 0x3fu;
case U3:
{
uint32_t& code = *utf32++;
- code = leadByte & 0x0fu;
+ code = leadByte & 0x0fu;
begin++;
code <<= 6u;
code |= *begin++ & 0x3fu;
case U4:
{
uint32_t& code = *utf32++;
- code = leadByte & 0x07u;
+ code = leadByte & 0x07u;
+ begin++;
+ code <<= 6u;
+ code |= *begin++ & 0x3fu;
+ code <<= 6u;
+ code |= *begin++ & 0x3fu;
+ code <<= 6u;
+ code |= *begin++ & 0x3fu;
+ break;
+ }
+
+ case U5:
+ {
+ uint32_t& code = *utf32++;
+ code = leadByte & 0x03u;
+ begin++;
+ code <<= 6u;
+ code |= *begin++ & 0x3fu;
+ code <<= 6u;
+ code |= *begin++ & 0x3fu;
+ code <<= 6u;
+ code |= *begin++ & 0x3fu;
+ code <<= 6u;
+ code |= *begin++ & 0x3fu;
+ break;
+ }
+
+ case U6:
+ {
+ uint32_t& code = *utf32++;
+ code = leadByte & 0x01u;
begin++;
code <<= 6u;
code |= *begin++ & 0x3fu;
code |= *begin++ & 0x3fu;
code <<= 6u;
code |= *begin++ & 0x3fu;
+ code <<= 6u;
+ code |= *begin++ & 0x3fu;
+ code <<= 6u;
+ code |= *begin++ & 0x3fu;
+ break;
+ }
+
+ case U0: // Invalid case
+ {
+ begin++;
+ *utf32++ = 0x20; // Use white space
break;
}
}
return numberOfCharacters;
}
/**
 * @brief Converts UTF-32 characters into a UTF-8 encoded buffer.
 *
 * Sequences of up to six bytes are written. A code equal to or greater than 0x80000000 can not
 * be encoded and nothing is written for it.
 *
 * @param[in] utf32 The UTF-32 characters.
 * @param[in] numberOfCharacters The number of characters in @p utf32.
 * @param[out] utf8 The buffer where the bytes are written. It must be big enough to store all
 *                  the sequences; up to six bytes are written per character.
 *
 * @return The number of bytes written.
 */
uint32_t Utf32ToUtf8(const uint32_t* const utf32, uint32_t numberOfCharacters, uint8_t* utf8)
{
  const uint32_t*       begin = utf32;
  const uint32_t* const end   = utf32 + numberOfCharacters;

  uint8_t* const utf8Begin = utf8;

  for(; begin < end; ++begin)
  {
    const uint32_t code = *begin;

    // Select the prefix of the lead byte and the number of continuation bytes which follow it.
    uint8_t  leadPrefix                = 0x00u;
    uint32_t numberOfContinuationBytes = 0u;

    if(code < 0x80u)
    {
      // Single byte: the code itself.
    }
    else if(code < 0x800u)
    {
      leadPrefix                = 0xc0u; // lead byte for 2 byte sequence
      numberOfContinuationBytes = 1u;
    }
    else if(code < 0x10000u)
    {
      leadPrefix                = 0xe0u; // lead byte for 3 byte sequence
      numberOfContinuationBytes = 2u;
    }
    else if(code < 0x200000u)
    {
      leadPrefix                = 0xf0u; // lead byte for 4 byte sequence
      numberOfContinuationBytes = 3u;
    }
    else if(code < 0x4000000u)
    {
      leadPrefix                = 0xf8u; // lead byte for 5 byte sequence
      numberOfContinuationBytes = 4u;
    }
    else if(code < 0x80000000u)
    {
      leadPrefix                = 0xfcu; // lead byte for 6 byte sequence
      numberOfContinuationBytes = 5u;
    }
    else
    {
      // The code does not fit in a 6 byte sequence. Nothing is written.
      continue;
    }

    // The lead byte stores the most significant bits of the code.
    uint32_t shift = 6u * numberOfContinuationBytes;
    *utf8++        = static_cast<uint8_t>((code >> shift) | leadPrefix);

    // Every continuation byte stores the next six bits.
    while(shift > 0u)
    {
      shift -= 6u;
      *utf8++ = static_cast<uint8_t>(((code >> shift) & 0x3fu) | 0x80u);
    }
  }

  return static_cast<uint32_t>(utf8 - utf8Begin);
}
+void Utf32ToUtf8(const uint32_t* const utf32, uint32_t numberOfCharacters, std::string& utf8)
+{
+ utf8.clear();
+
+ uint32_t numberOfBytes = GetNumberOfUtf8Bytes(&utf32[0], numberOfCharacters);
+ utf8.resize(numberOfBytes);
+
+ // This is a bit horrible but std::string returns a (signed) char*
+ Utf32ToUtf8(utf32, numberOfCharacters, reinterpret_cast<uint8_t*>(&utf8[0]));
+}
+
+} // namespace Text
+
} // namespace Toolkit
} // namespace Dali