Source/cm_codecvt.cxx

   1 /* Distributed under the OSI-approved BSD 3-Clause License.  See accompanying
   2    file Copyright.txt or https://cmake.org/licensing for details.  */
   3 #include "cm_codecvt.hxx"
   4
   5 #if defined(_WIN32)
   6 #  include <cassert>
   7 #  include <cstring>
   8
   9 #  include <windows.h>
  10 #  undef max
  11 #  include "cmsys/Encoding.hxx"
  12
  13 #  include "cm_utf8.h"
  14 #endif
  15
  16 codecvt::codecvt(Encoding e)
  17 #if defined(_WIN32)
  18   : m_codepage(0)
  19 #endif
  20 {
  21   switch (e) {
  22     case codecvt::ConsoleOutput:
  23 #if defined(_WIN32)
  24       m_noconv = false;
  25       m_codepage = GetConsoleOutputCP();
  26       break;
  27 #endif
  28     case codecvt::ANSI:
  29 #if defined(_WIN32)
  30       m_noconv = false;
  31       m_codepage = CP_ACP;
  32       break;
  33 #endif
  34     // We don't know which ANSI encoding to use for other platforms than
  35     // Windows so we don't do any conversion there
  36     case codecvt::UTF8:
  37     case codecvt::UTF8_WITH_BOM:
  38     // Assume internal encoding is UTF-8
  39     case codecvt::None:
  40     // No encoding
  41     default:
  42       this->m_noconv = true;
  43   }
  44 }
  45
// The destructor is defaulted here, out of line, rather than in the header.
codecvt::~codecvt() = default;
  47
  48 bool codecvt::do_always_noconv() const noexcept
  49 {
  50   return this->m_noconv;
  51 }
  52
  53 std::codecvt_base::result codecvt::do_out(mbstate_t& state, const char* from,
  54                                           const char* from_end,
  55                                           const char*& from_next, char* to,
  56                                           char* to_end, char*& to_next) const
  57 {
  58   from_next = from;
  59   to_next = to;
  60   if (this->m_noconv) {
  61     return std::codecvt_base::noconv;
  62   }
  63 #if defined(_WIN32)
  64   // Use a const view of the state because we should not modify it until we
  65   // have fully processed and consume a byte (with sufficient space in the
  66   // output buffer).  We call helpers to re-cast and modify the state
  67   State const& lstate = reinterpret_cast<State&>(state);
  68
  69   while (from_next != from_end) {
  70     // Count leading ones in the bits of the next byte.
  71     unsigned char const ones =
  72       cm_utf8_ones[static_cast<unsigned char>(*from_next)];
  73
  74     if (ones != 1 && lstate.buffered != 0) {
  75       // We have a buffered partial codepoint that we never completed.
  76       return std::codecvt_base::error;
  77     } else if (ones == 1 && lstate.buffered == 0) {
  78       // This is a continuation of a codepoint that never started.
  79       return std::codecvt_base::error;
  80     }
  81
  82     // Compute the number of bytes in the current codepoint.
  83     int need = 0;
  84     switch (ones) {
  85       case 0: // 0xxx xxxx: new codepoint of size 1
  86         need = 1;
  87         break;
  88       case 1: // 10xx xxxx: continues a codepoint
  89         assert(lstate.size != 0);
  90         need = lstate.size;
  91         break;
  92       case 2: // 110x xxxx: new codepoint of size 2
  93         need = 2;
  94         break;
  95       case 3: // 1110 xxxx: new codepoint of size 3
  96         need = 3;
  97         break;
  98       case 4: // 1111 0xxx: new codepoint of size 4
  99         need = 4;
 100         break;
 101       default: // invalid byte
 102         return std::codecvt_base::error;
 103     }
 104     assert(need > 0);
 105
 106     if (lstate.buffered + 1 == need) {
 107       // This byte completes a codepoint.
 108       std::codecvt_base::result decode_result =
 109         this->Decode(state, need, from_next, to_next, to_end);
 110       if (decode_result != std::codecvt_base::ok) {
 111         return decode_result;
 112       }
 113     } else {
 114       // This byte does not complete a codepoint.
 115       this->BufferPartial(state, need, from_next);
 116     }
 117   }
 118
 119   return std::codecvt_base::ok;
 120 #else
 121   static_cast<void>(state);
 122   static_cast<void>(from);
 123   static_cast<void>(from_end);
 124   static_cast<void>(from_next);
 125   static_cast<void>(to);
 126   static_cast<void>(to_end);
 127   static_cast<void>(to_next);
 128   return std::codecvt_base::noconv;
 129 #endif
 130 }
 131
 132 std::codecvt_base::result codecvt::do_unshift(mbstate_t& state, char* to,
 133                                               char* to_end,
 134                                               char*& to_next) const
 135 {
 136   to_next = to;
 137   if (this->m_noconv) {
 138     return std::codecvt_base::noconv;
 139   }
 140 #if defined(_WIN32)
 141   State& lstate = reinterpret_cast<State&>(state);
 142   if (lstate.buffered != 0) {
 143     return this->DecodePartial(state, to_next, to_end);
 144   }
 145   return std::codecvt_base::ok;
 146 #else
 147   static_cast<void>(state);
 148   static_cast<void>(to_end);
 149   return std::codecvt_base::ok;
 150 #endif
 151 }
 152
 153 #if defined(_WIN32)
 154 std::codecvt_base::result codecvt::Decode(mbstate_t& state, int size,
 155                                           const char*& from_next,
 156                                           char*& to_next, char* to_end) const
 157 {
 158   State& lstate = reinterpret_cast<State&>(state);
 159
 160   // Collect all the bytes for this codepoint.
 161   char buf[4];
 162   memcpy(buf, lstate.partial, lstate.buffered);
 163   buf[lstate.buffered] = *from_next;
 164
 165   // Convert the encoding.
 166   wchar_t wbuf[2];
 167   int wlen =
 168     MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, buf, size, wbuf, 2);
 169   if (wlen <= 0) {
 170     return std::codecvt_base::error;
 171   }
 172
 173   int tlen = WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next,
 174                                  to_end - to_next, NULL, NULL);
 175   if (tlen <= 0) {
 176     if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
 177       return std::codecvt_base::partial;
 178     }
 179     return std::codecvt_base::error;
 180   }
 181
 182   // Move past the now-consumed byte in the input buffer.
 183   ++from_next;
 184
 185   // Move past the converted codepoint in the output buffer.
 186   to_next += tlen;
 187
 188   // Re-initialize the state for the next codepoint to start.
 189   lstate = State();
 190
 191   return std::codecvt_base::ok;
 192 }
 193
 194 std::codecvt_base::result codecvt::DecodePartial(mbstate_t& state,
 195                                                  char*& to_next,
 196                                                  char* to_end) const
 197 {
 198   State& lstate = reinterpret_cast<State&>(state);
 199
 200   // Try converting the partial codepoint.
 201   wchar_t wbuf[2];
 202   int wlen = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, lstate.partial,
 203                                  lstate.buffered, wbuf, 2);
 204   if (wlen <= 0) {
 205     return std::codecvt_base::error;
 206   }
 207
 208   int tlen = WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next,
 209                                  to_end - to_next, NULL, NULL);
 210   if (tlen <= 0) {
 211     if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
 212       return std::codecvt_base::partial;
 213     }
 214     return std::codecvt_base::error;
 215   }
 216
 217   // Move past the converted codepoint in the output buffer.
 218   to_next += tlen;
 219
 220   // Re-initialize the state for the next codepoint to start.
 221   lstate = State();
 222
 223   return std::codecvt_base::ok;
 224 }
 225
 226 void codecvt::BufferPartial(mbstate_t& state, int size,
 227                             const char*& from_next) const
 228 {
 229   State& lstate = reinterpret_cast<State&>(state);
 230
 231   // Save the byte in our buffer for later.
 232   lstate.partial[lstate.buffered++] = *from_next;
 233   lstate.size = size;
 234
 235   // Move past the now-consumed byte in the input buffer.
 236   ++from_next;
 237 }
 238 #endif
 239
 240 int codecvt::do_max_length() const noexcept
 241 {
 242   return 4;
 243 }
 244
 245 int codecvt::do_encoding() const noexcept
 246 {
 247   return 0;
 248 }