dali-toolkit/internal/text/character-set-conversion.cpp

   1 /*
   2  * Copyright (c) 2015 Samsung Electronics Co., Ltd.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  * http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  *
  16  */
  17
  18 // FILE HEADER
  19 #include <dali-toolkit/internal/text/character-set-conversion.h>
  20
  21 namespace Dali
  22 {
  23 namespace Toolkit
  24 {
  25 namespace Text
  26 {
  27 namespace
  28 {
  29 // clang-format off
  30 constexpr uint8_t U1 = 1u;
  31 constexpr uint8_t U2 = 2u;
  32 constexpr uint8_t U3 = 3u;
  33 constexpr uint8_t U4 = 4u;
  34 constexpr uint8_t U5 = 5u;
  35 constexpr uint8_t U6 = 6u;
  36 constexpr uint8_t U0 = 0u;
  37 constexpr uint8_t UTF8_LENGTH[256] = {
  38   U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, //
  39   U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, //
  40   U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, //
  41   U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, //
  42   U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, //
  43   U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, //
  44   U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, //
  45   U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, //
  46   U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // lead byte = 0xxx xxxx (U+0000 - U+007F + some extended ascii characters)
  47   U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, //
  48   U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, //
  49   U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, //
  50   U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, //
  51   U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, //
  52   U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, //
  53   U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, //
  54   U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, //
  55   U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, //
  56   U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, //
  57   U1, U1,                                 //
  58
  59   U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, //
  60   U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, // lead byte = 110x xxxx (U+0080 - U+07FF)
  61   U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, //
  62   U2, U2,                                 //
  63
  64   U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, // lead byte = 1110 xxxx (U+0800 - U+FFFF)
  65   U3, U3, U3, U3, U3, U3,                 //
  66
  67   U4, U4, U4, U4, U4, U4, U4, U4,         // lead byte = 1111 0xxx (U+10000 - U+1FFFFF)
  68
  69   U5, U5, U5, U5,                         // lead byte = 1111 10xx (U+200000 - U+3FFFFFF)
  70
  71   U6, U6,                                 // lead byte = 1111 110x (U+4000000 - U+7FFFFFFF)
  72
  73   U0, U0,                                 // Non valid.
  74 };
  75
  76 constexpr uint8_t CR = 0xd;
  77 constexpr uint8_t LF = 0xa;
  78 // clang-format on
  79 } // namespace
  80
  81 uint8_t GetUtf8Length(uint8_t utf8LeadByte)
  82 {
  83   return UTF8_LENGTH[utf8LeadByte];
  84 }
  85
  86 uint32_t GetNumberOfUtf8Characters(const uint8_t* const utf8, uint32_t length)
  87 {
  88   uint32_t numberOfCharacters = 0u;
  89
  90   const uint8_t* begin = utf8;
  91   const uint8_t* end   = utf8 + length;
  92
  93   for(; begin < end; begin += UTF8_LENGTH[*begin])
  94   {
  95     ++numberOfCharacters;
  96   }
  97
  98   return numberOfCharacters;
  99 }
 100
 101 uint32_t GetNumberOfUtf8Bytes(const uint32_t* const utf32, uint32_t numberOfCharacters)
 102 {
 103   uint32_t numberOfBytes = 0u;
 104
 105   const uint32_t* begin = utf32;
 106   const uint32_t* end   = utf32 + numberOfCharacters;
 107
 108   for(; begin < end; ++begin)
 109   {
 110     const uint32_t code = *begin;
 111
 112     if(code < 0x80u)
 113     {
 114       ++numberOfBytes;
 115     }
 116     else if(code < 0x800u)
 117     {
 118       numberOfBytes += U2;
 119     }
 120     else if(code < 0x10000u)
 121     {
 122       numberOfBytes += U3;
 123     }
 124     else if(code < 0x200000u)
 125     {
 126       numberOfBytes += U4;
 127     }
 128     else if(code < 0x4000000u)
 129     {
 130       numberOfBytes += U5;
 131     }
 132     else if(code < 0x80000000u)
 133     {
 134       numberOfBytes += U6;
 135     }
 136   }
 137
 138   return numberOfBytes;
 139 }
 140
 141 uint32_t Utf8ToUtf32(const uint8_t* const utf8, uint32_t length, uint32_t* utf32)
 142 {
 143   uint32_t numberOfCharacters = 0u;
 144
 145   const uint8_t* begin = utf8;
 146   const uint8_t* end   = utf8 + length;
 147
 148   for(; begin < end; ++numberOfCharacters)
 149   {
 150     const uint8_t leadByte = *begin;
 151
 152     switch(UTF8_LENGTH[leadByte])
 153     {
 154       case U1:
 155       {
 156         if(CR == leadByte)
 157         {
 158           // Replace CR+LF or CR by LF
 159           *utf32++ = LF;
 160
 161           // Look ahead if the next one is a LF.
 162           ++begin;
 163           if(begin < end)
 164           {
 165             if(LF == *begin)
 166             {
 167               ++begin;
 168             }
 169           }
 170         }
 171         else
 172         {
 173           *utf32++ = leadByte;
 174           begin++;
 175         }
 176         break;
 177       }
 178
 179       case U2:
 180       {
 181         uint32_t& code = *utf32++;
 182         code           = leadByte & 0x1fu;
 183         begin++;
 184         code <<= 6u;
 185         code |= *begin++ & 0x3fu;
 186         break;
 187       }
 188
 189       case U3:
 190       {
 191         uint32_t& code = *utf32++;
 192         code           = leadByte & 0x0fu;
 193         begin++;
 194         code <<= 6u;
 195         code |= *begin++ & 0x3fu;
 196         code <<= 6u;
 197         code |= *begin++ & 0x3fu;
 198         break;
 199       }
 200
 201       case U4:
 202       {
 203         uint32_t& code = *utf32++;
 204         code           = leadByte & 0x07u;
 205         begin++;
 206         code <<= 6u;
 207         code |= *begin++ & 0x3fu;
 208         code <<= 6u;
 209         code |= *begin++ & 0x3fu;
 210         code <<= 6u;
 211         code |= *begin++ & 0x3fu;
 212         break;
 213       }
 214
 215       case U5:
 216       {
 217         uint32_t& code = *utf32++;
 218         code           = leadByte & 0x03u;
 219         begin++;
 220         code <<= 6u;
 221         code |= *begin++ & 0x3fu;
 222         code <<= 6u;
 223         code |= *begin++ & 0x3fu;
 224         code <<= 6u;
 225         code |= *begin++ & 0x3fu;
 226         code <<= 6u;
 227         code |= *begin++ & 0x3fu;
 228         break;
 229       }
 230
 231       case U6:
 232       {
 233         uint32_t& code = *utf32++;
 234         code           = leadByte & 0x01u;
 235         begin++;
 236         code <<= 6u;
 237         code |= *begin++ & 0x3fu;
 238         code <<= 6u;
 239         code |= *begin++ & 0x3fu;
 240         code <<= 6u;
 241         code |= *begin++ & 0x3fu;
 242         code <<= 6u;
 243         code |= *begin++ & 0x3fu;
 244         code <<= 6u;
 245         code |= *begin++ & 0x3fu;
 246         break;
 247       }
 248
 249       case U0: // Invalid case
 250       {
 251         begin++;
 252         *utf32++ = 0x20; // Use white space
 253         break;
 254       }
 255     }
 256   }
 257
 258   return numberOfCharacters;
 259 }
 260
 261 uint32_t Utf32ToUtf8(const uint32_t* const utf32, uint32_t numberOfCharacters, uint8_t* utf8)
 262 {
 263   const uint32_t* begin = utf32;
 264   const uint32_t* end   = utf32 + numberOfCharacters;
 265
 266   uint8_t* utf8Begin = utf8;
 267
 268   for(; begin < end; ++begin)
 269   {
 270     const uint32_t code = *begin;
 271
 272     // clang-format off
 273     if( code < 0x80u )
 274     {
 275       *utf8++ = code;
 276     }
 277     else if( code < 0x800u )
 278     {
 279       *utf8++ = static_cast<uint8_t>( code >> 6u)          | 0xc0u; // lead byte for 2 byte sequence
 280       *utf8++ = static_cast<uint8_t>( code         & 0x3f) | 0x80u; // continuation byte
 281     }
 282     else if( code < 0x10000u )
 283     {
 284       *utf8++ = static_cast<uint8_t>( code >> 12u)         | 0xe0u; // lead byte for 3 byte sequence
 285       *utf8++ = static_cast<uint8_t>((code >> 6u ) & 0x3f) | 0x80u; // continuation byte
 286       *utf8++ = static_cast<uint8_t>( code         & 0x3f) | 0x80u; // continuation byte
 287     }
 288     else if( code < 0x200000u )
 289     {
 290       *utf8++ = static_cast<uint8_t>( code >> 18u)         | 0xf0u; // lead byte for 4 byte sequence
 291       *utf8++ = static_cast<uint8_t>((code >> 12u) & 0x3f) | 0x80u; // continuation byte
 292       *utf8++ = static_cast<uint8_t>((code >> 6u ) & 0x3f) | 0x80u; // continuation byte
 293       *utf8++ = static_cast<uint8_t>( code         & 0x3f) | 0x80u; // continuation byte
 294     }
 295     else if( code < 0x4000000u )
 296     {
 297       *utf8++ = static_cast<uint8_t>( code >> 24u)         | 0xf8u; // lead byte for 5 byte sequence
 298       *utf8++ = static_cast<uint8_t>((code >> 18u) & 0x3f) | 0x80u; // continuation byte
 299       *utf8++ = static_cast<uint8_t>((code >> 12u) & 0x3f) | 0x80u; // continuation byte
 300       *utf8++ = static_cast<uint8_t>((code >> 6u ) & 0x3f) | 0x80u; // continuation byte
 301       *utf8++ = static_cast<uint8_t>( code         & 0x3f) | 0x80u; // continuation byte
 302     }
 303     else if( code < 0x80000000u )
 304     {
 305       *utf8++ = static_cast<uint8_t>( code >> 30u)         | 0xfcu; // lead byte for 6 byte sequence
 306       *utf8++ = static_cast<uint8_t>((code >> 24u) & 0x3f) | 0x80u; // continuation byte
 307       *utf8++ = static_cast<uint8_t>((code >> 18u) & 0x3f) | 0x80u; // continuation byte
 308       *utf8++ = static_cast<uint8_t>((code >> 12u) & 0x3f) | 0x80u; // continuation byte
 309       *utf8++ = static_cast<uint8_t>((code >> 6u ) & 0x3f) | 0x80u; // continuation byte
 310       *utf8++ = static_cast<uint8_t>( code         & 0x3f) | 0x80u; // continuation byte
 311     }
 312     // clang-format on
 313   }
 314
 315   return utf8 - utf8Begin;
 316 }
 317
 318 void Utf32ToUtf8(const uint32_t* const utf32, uint32_t numberOfCharacters, std::string& utf8)
 319 {
 320   utf8.clear();
 321
 322   uint32_t numberOfBytes = GetNumberOfUtf8Bytes(&utf32[0], numberOfCharacters);
 323   utf8.resize(numberOfBytes);
 324
 325   // This is a bit horrible but std::string returns a (signed) char*
 326   Utf32ToUtf8(utf32, numberOfCharacters, reinterpret_cast<uint8_t*>(&utf8[0]));
 327 }
 328
 329 } // namespace Text
 330
 331 } // namespace Toolkit
 332
 333 } // namespace Dali