src/corelib/codecs/qtextcodec.cpp

   1 /****************************************************************************
   2 **
   3 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
   4 ** Contact: http://www.qt-project.org/
   5 **
   6 ** This file is part of the QtCore module of the Qt Toolkit.
   7 **
   8 ** $QT_BEGIN_LICENSE:LGPL$
   9 ** GNU Lesser General Public License Usage
  10 ** This file may be used under the terms of the GNU Lesser General Public
  11 ** License version 2.1 as published by the Free Software Foundation and
  12 ** appearing in the file LICENSE.LGPL included in the packaging of this
  13 ** file. Please review the following information to ensure the GNU Lesser
  14 ** General Public License version 2.1 requirements will be met:
  15 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
  16 **
  17 ** In addition, as a special exception, Nokia gives you certain additional
  18 ** rights. These rights are described in the Nokia Qt LGPL Exception
  19 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
  20 **
  21 ** GNU General Public License Usage
  22 ** Alternatively, this file may be used under the terms of the GNU General
  23 ** Public License version 3.0 as published by the Free Software Foundation
  24 ** and appearing in the file LICENSE.GPL included in the packaging of this
  25 ** file. Please review the following information to ensure the GNU General
  26 ** Public License version 3.0 requirements will be met:
  27 ** http://www.gnu.org/copyleft/gpl.html.
  28 **
  29 ** Other Usage
  30 ** Alternatively, this file may be used in accordance with the terms and
  31 ** conditions contained in a signed written agreement between you and Nokia.
  32 **
  33 **
  34 **
  35 **
  36 **
  37 **
  38 ** $QT_END_LICENSE$
  39 **
  40 ****************************************************************************/
  41
  42 #include "qplatformdefs.h"
  43 #include "qtextcodec.h"
  44 #include "qtextcodec_p.h"
  45
  46 #ifndef QT_NO_TEXTCODEC
  47
  48 #include "qlist.h"
  49 #include "qfile.h"
  50 #include "qstringlist.h"
  51
  52 #ifdef Q_OS_UNIX
  53 #  include "qiconvcodec_p.h"
  54 #endif
  55
  56 #include "qutfcodec_p.h"
  57 #include "qsimplecodec_p.h"
  58 #include "qlatincodec_p.h"
  59 #ifndef QT_NO_CODECS
  60 #  include "qtsciicodec_p.h"
  61 #  include "qisciicodec_p.h"
  62 #if !defined(Q_OS_INTEGRITY)
  63 #  if !defined(QT_BOOTSTRAPPED)
  64 #    include "qgb18030codec_p.h"
  65 #    include "qeucjpcodec_p.h"
  66 #    include "qjiscodec_p.h"
  67 #    include "qsjiscodec_p.h"
  68 #    include "qeuckrcodec_p.h"
  69 #    include "qbig5codec_p.h"
  70 #  endif // !QT_BOOTSTRAPPED
  71 #  if defined(Q_OS_UNIX) && !defined(QT_BOOTSTRAPPED)
  72 #    include "qfontlaocodec_p.h"
  73 #    include "qfontjpcodec_p.h"
  74 #  endif
  75 #endif // !Q_OS_INTEGRITY
  76 #endif // QT_NO_CODECS
  77 #include "qlocale.h"
  78 #include "qmutex.h"
  79 #include "qhash.h"
  80
  81 #include <stdlib.h>
  82 #include <ctype.h>
  83 #include <locale.h>
  84 #if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF)
  85 #include <langinfo.h>
  86 #endif
  87
  88 #ifdef Q_OS_WIN
  89 #  include <qt_windows.h>
  90 #  if defined(Q_OS_WINCE)
  91 #    define QT_NO_SETLOCALE
  92 #  endif
  93 #endif
  94
  95
  96 // enabling this is not exception safe!
  97 // #define Q_DEBUG_TEXTCODEC
  98
  99 QT_BEGIN_NAMESPACE
 100
// Cache for QTextCodec::codecForName and codecForMib.
// Maps a QByteArray lookup key to a codec pointer. The whole cache is
// cleared by ~QTextCodec() whenever a codec is destroyed, so it never
// hands out a dangling pointer.
typedef QHash<QByteArray, QTextCodec *> QTextCodecCache;
Q_GLOBAL_STATIC(QTextCodecCache, qTextCodecCache)
 104
 105
 106 static char qtolower(register char c)
 107 { if (c >= 'A' && c <= 'Z') return c + 0x20; return c; }
 108 static bool qisalnum(register char c)
 109 { return (c >= '0' && c <= '9') || ((c | 0x20) >= 'a' && (c | 0x20) <= 'z'); }
 110
 111 static bool nameMatch(const QByteArray &name, const QByteArray &test)
 112 {
 113     // if they're the same, return a perfect score
 114     if (qstricmp(name, test) == 0)
 115         return true;
 116
 117     const char *n = name.constData();
 118     const char *h = test.constData();
 119
 120     // if the letters and numbers are the same, we have a match
 121     while (*n != '\0') {
 122         if (qisalnum(*n)) {
 123             for (;;) {
 124                 if (*h == '\0')
 125                     return false;
 126                 if (qisalnum(*h))
 127                     break;
 128                 ++h;
 129             }
 130             if (qtolower(*n) != qtolower(*h))
 131                 return false;
 132             ++h;
 133         }
 134         ++n;
 135     }
 136     while (*h && !qisalnum(*h))
 137            ++h;
 138     return (*h == '\0');
 139 }
 140
 141
// Registry of all codec objects. Allocated by setup(), prepended to by the
// QTextCodec constructor, and torn down by QTextCodecCleanup.
static QList<QTextCodec*> *all = 0;
#ifdef Q_DEBUG_TEXTCODEC
// Set only while QTextCodecCleanup deletes the codecs, so that
// ~QTextCodec() can warn about deletions coming from anywhere else.
static bool destroying_is_ok = false;
#endif

// Codec for the system locale; chosen by setupLocaleMapper().
static QTextCodec *localeMapper = 0;
// NOTE(review): presumably the codec used for tr(); its users are outside
// this part of the file -- confirm.
QTextCodec *QTextCodec::cftr = 0;
 149
 150
// Helper whose only job is its destructor: it is instantiated as a global
// static from setup(), and destroying it deletes all registered codecs.
class QTextCodecCleanup
{
public:
    ~QTextCodecCleanup();
};
 156
 157 /*
 158     Deletes all the created codecs. This destructor is called just
 159     before exiting to delete any QTextCodec objects that may be lying
 160     around.
 161 */
 162 QTextCodecCleanup::~QTextCodecCleanup()
 163 {
 164     if (!all)
 165         return;
 166
 167 #ifdef Q_DEBUG_TEXTCODEC
 168     destroying_is_ok = true;
 169 #endif
 170
 171     QList<QTextCodec *> *myAll = all;
 172     all = 0; // Otherwise the d'tor destroys the iterator
 173     for (QList<QTextCodec *>::const_iterator it = myAll->constBegin()
 174             ; it != myAll->constEnd(); ++it) {
 175         delete *it;
 176     }
 177     delete myAll;
 178     localeMapper = 0;
 179
 180 #ifdef Q_DEBUG_TEXTCODEC
 181     destroying_is_ok = false;
 182 #endif
 183 }
 184
// Accessor for the single QTextCodecCleanup instance; setup() calls it so
// that the cleanup object exists once codecs have been created.
Q_GLOBAL_STATIC(QTextCodecCleanup, createQTextCodecCleanup)
 186
// Unconditionally reports that the codecs are usable.
// NOTE(review): presumably kept for callers that still query it -- confirm.
bool QTextCodec::validCodecs()
{
    return true;
}
 191
 192
 193 #if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
// Codec for the Windows ANSI code page (CP_ACP), implemented on top of
// MultiByteToWideChar / WideCharToMultiByte. It registers under the name
// "System" and is what setupLocaleMapper() picks as the locale codec on
// Windows.
class QWindowsLocalCodec: public QTextCodec
{
public:
    QWindowsLocalCodec();
    ~QWindowsLocalCodec();

    QString convertToUnicode(const char *, int, ConverterState *) const;
    QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const;
    // Slow path of convertToUnicode(): converts one character at a time.
    QString convertToUnicodeCharByChar(const char *chars, int length, ConverterState *state) const;

    QByteArray name() const;
    int mibEnum() const;

};
 208
// Nothing to set up here; registration happens in the QTextCodec base
// class constructor.
QWindowsLocalCodec::QWindowsLocalCodec()
{
}
 212
// The codec owns no resources of its own.
QWindowsLocalCodec::~QWindowsLocalCodec()
{
}
 216
 217 QString QWindowsLocalCodec::convertToUnicode(const char *chars, int length, ConverterState *state) const
 218 {
 219     const char *mb = chars;
 220     int mblen = length;
 221
 222     if (!mb || !mblen)
 223         return QString();
 224
 225     const int wclen_auto = 4096;
 226     wchar_t wc_auto[wclen_auto];
 227     int wclen = wclen_auto;
 228     wchar_t *wc = wc_auto;
 229     int len;
 230     QString sp;
 231     bool prepend = false;
 232     char state_data = 0;
 233     int remainingChars = 0;
 234
 235     //save the current state information
 236     if (state) {
 237         state_data = (char)state->state_data[0];
 238         remainingChars = state->remainingChars;
 239     }
 240
 241     //convert the pending charcter (if available)
 242     if (state && remainingChars) {
 243         char prev[3] = {0};
 244         prev[0] = state_data;
 245         prev[1] = mb[0];
 246         remainingChars = 0;
 247         len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
 248                                     prev, 2, wc, wclen);
 249         if (len) {
 250             prepend = true;
 251             sp.append(QChar(wc[0]));
 252             mb++;
 253             mblen--;
 254             wc[0] = 0;
 255         }
 256     }
 257
 258     while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
 259                 mb, mblen, wc, wclen))) {
 260         int r = GetLastError();
 261         if (r == ERROR_INSUFFICIENT_BUFFER) {
 262             if (wc != wc_auto) {
 263                 qWarning("MultiByteToWideChar: Size changed");
 264                 break;
 265             } else {
 266                 wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
 267                                     mb, mblen, 0, 0);
 268                 wc = new wchar_t[wclen];
 269                 // and try again...
 270             }
 271         } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
 272             //find the last non NULL character
 273             while (mblen > 1  && !(mb[mblen-1]))
 274                 mblen--;
 275             //check whether,  we hit an invalid character in the middle
 276             if ((mblen <= 1) || (remainingChars && state_data))
 277                 return convertToUnicodeCharByChar(chars, length, state);
 278             //Remove the last character and try again...
 279             state_data = mb[mblen-1];
 280             remainingChars = 1;
 281             mblen--;
 282         } else {
 283             // Fail.
 284             qWarning("MultiByteToWideChar: Cannot convert multibyte text");
 285             break;
 286         }
 287     }
 288     if (len <= 0)
 289         return QString();
 290     if (wc[len-1] == 0) // len - 1: we don't want terminator
 291         --len;
 292
 293     //save the new state information
 294     if (state) {
 295         state->state_data[0] = (char)state_data;
 296         state->remainingChars = remainingChars;
 297     }
 298     QString s((QChar*)wc, len);
 299     if (wc != wc_auto)
 300         delete [] wc;
 301     if (prepend) {
 302         return sp+s;
 303     }
 304     return s;
 305 }
 306
 307 QString QWindowsLocalCodec::convertToUnicodeCharByChar(const char *chars, int length, ConverterState *state) const
 308 {
 309     if (!chars || !length)
 310         return QString();
 311
 312     int copyLocation = 0;
 313     int extra = 2;
 314     if (state && state->remainingChars) {
 315         copyLocation = state->remainingChars;
 316         extra += copyLocation;
 317     }
 318     int newLength = length + extra;
 319     char *mbcs = new char[newLength];
 320     //ensure that we have a NULL terminated string
 321     mbcs[newLength-1] = 0;
 322     mbcs[newLength-2] = 0;
 323     memcpy(&(mbcs[copyLocation]), chars, length);
 324     if (copyLocation) {
 325         //copy the last character from the state
 326         mbcs[0] = (char)state->state_data[0];
 327         state->remainingChars = 0;
 328     }
 329     const char *mb = mbcs;
 330 #ifndef Q_OS_WINCE
 331     const char *next = 0;
 332     QString s;
 333     while((next = CharNextExA(CP_ACP, mb, 0)) != mb) {
 334         wchar_t wc[2] ={0};
 335         int charlength = next - mb;
 336         int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
 337         if (len>0) {
 338             s.append(QChar(wc[0]));
 339         } else {
 340             int r = GetLastError();
 341             //check if the character being dropped is the last character
 342             if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) {
 343                 state->remainingChars = 1;
 344                 state->state_data[0] = (char)*mb;
 345             }
 346         }
 347         mb = next;
 348     }
 349 #else
 350     QString s;
 351     int size = mbstowcs(NULL, mb, length);
 352     if (size < 0) {
 353         Q_ASSERT("Error in CE TextCodec");
 354         return QString();
 355     }
 356     wchar_t* ws = new wchar_t[size + 2];
 357     ws[size +1] = 0;
 358     ws[size] = 0;
 359     size = mbstowcs(ws, mb, length);
 360     for (int i=0; i< size; i++)
 361         s.append(QChar(ws[i]));
 362     delete [] ws;
 363 #endif
 364     delete mbcs;
 365     return s;
 366 }
 367
 368 QByteArray QWindowsLocalCodec::convertFromUnicode(const QChar *ch, int uclen, ConverterState *) const
 369 {
 370     if (!ch)
 371         return QByteArray();
 372     if (uclen == 0)
 373         return QByteArray("");
 374     BOOL used_def;
 375     QByteArray mb(4096, 0);
 376     int len;
 377     while (!(len=WideCharToMultiByte(CP_ACP, 0, (const wchar_t*)ch, uclen,
 378                 mb.data(), mb.size()-1, 0, &used_def)))
 379     {
 380         int r = GetLastError();
 381         if (r == ERROR_INSUFFICIENT_BUFFER) {
 382             mb.resize(1+WideCharToMultiByte(CP_ACP, 0,
 383                                 (const wchar_t*)ch, uclen,
 384                                 0, 0, 0, &used_def));
 385                 // and try again...
 386         } else {
 387 #ifndef QT_NO_DEBUG
 388             // Fail.
 389             qWarning("WideCharToMultiByte: Cannot convert multibyte text (error %d): %s (UTF-8)",
 390                 r, QString(ch, uclen).toLocal8Bit().data());
 391 #endif
 392             break;
 393         }
 394     }
 395     mb.resize(len);
 396     return mb;
 397 }
 398
 399
// The codec registers as "System"; setupLocaleMapper() looks it up under
// that name.
QByteArray QWindowsLocalCodec::name() const
{
    return "System";
}
 404
// Always reports 0.
// NOTE(review): presumably because the system code page has no fixed IANA
// MIB number -- confirm.
int QWindowsLocalCodec::mibEnum() const
{
    return 0;
}
 409
 410 #else
 411
/* locale names mostly copied from XFree86 */
// Each table below is a 0-terminated list of locale names for which
// setupLocaleMapper() falls back to one particular codec when the locale
// does not name its codeset explicitly. The tables are searched with
// try_locale_list().
static const char * const iso8859_2locales[] = {
    "croatian", "cs", "cs_CS", "cs_CZ","cz", "cz_CZ", "czech", "hr",
    "hr_HR", "hu", "hu_HU", "hungarian", "pl", "pl_PL", "polish", "ro",
    "ro_RO", "rumanian", "serbocroatian", "sh", "sh_SP", "sh_YU", "sk",
    "sk_SK", "sl", "sl_CS", "sl_SI", "slovak", "slovene", "sr_SP", 0 };

static const char * const iso8859_3locales[] = {
    "eo", 0 };

static const char * const iso8859_4locales[] = {
    "ee", "ee_EE", 0 };

static const char * const iso8859_5locales[] = {
    "mk", "mk_MK", "sp", "sp_YU", 0 };

static const char * const cp_1251locales[] = {
    "be", "be_BY", "bg", "bg_BG", "bulgarian", 0 };

static const char * const pt_154locales[] = {
    "ba_RU", "ky", "ky_KG", "kk", "kk_KZ", 0 };

static const char * const iso8859_6locales[] = {
    "ar_AA", "ar_SA", "arabic", 0 };

static const char * const iso8859_7locales[] = {
    "el", "el_GR", "greek", 0 };

static const char * const iso8859_8locales[] = {
    "hebrew", "he", "he_IL", "iw", "iw_IL", 0 };

static const char * const iso8859_9locales[] = {
    "tr", "tr_TR", "turkish", 0 };

static const char * const iso8859_13locales[] = {
    "lt", "lt_LT", "lv", "lv_LV", 0 };

static const char * const iso8859_15locales[] = {
    "et", "et_EE",
    // Euro countries
    "br_FR", "ca_ES", "de", "de_AT", "de_BE", "de_DE", "de_LU", "en_IE",
    "es", "es_ES", "eu_ES", "fi", "fi_FI", "finnish", "fr", "fr_FR",
    "fr_BE", "fr_LU", "french", "ga_IE", "gl_ES", "it", "it_IT", "oc_FR",
    "nl", "nl_BE", "nl_NL", "pt", "pt_PT", "sv_FI", "wa_BE",
    0 };

static const char * const koi8_ulocales[] = {
    "uk", "uk_UA", "ru_UA", "ukrainian", 0 };

// Locales in this table are mapped to the "ISO 8859-11" codec name by
// setupLocaleMapper().
static const char * const tis_620locales[] = {
    "th", "th_TH", "thai", 0 };
 463
 464 // static const char * const tcvnlocales[] = {
 465 //     "vi", "vi_VN", 0 };
 466
 467 static bool try_locale_list(const char * const locale[], const QByteArray &lang)
 468 {
 469     int i;
 470     for(i=0; locale[i] && lang != locale[i]; i++)
 471         ;
 472     return locale[i] != 0;
 473 }
 474
// For the probably_koi8_rlocales we have to look closer. The standard
// says these locales use ISO 8859-5, but almost all Russian users use
// KOI8-R and incorrectly set $LANG to ru_RU. ru_RU_hack() therefore
// checks what tolower() does in that locale to find out which encoding
// ru_RU really stands for.

// If you read the history, it seems that many Russians blame ISO and
// Perestroika for the confusion.
//
// The real bug is that some programs break if the user specifies
// ru_RU.KOI8-R.

static const char * const probably_koi8_rlocales[] = {
    "ru", "ru_SU", "ru_RU", "russian", 0 };
 488
 489 static QTextCodec * ru_RU_hack(const char * i) {
 490     QTextCodec * ru_RU_codec = 0;
 491
 492 #if !defined(QT_NO_SETLOCALE)
 493     QByteArray origlocale(setlocale(LC_CTYPE, i));
 494 #else
 495     QByteArray origlocale(i);
 496 #endif
 497     // unicode   koi8r   latin5   name
 498     // 0x044E    0xC0    0xEE     CYRILLIC SMALL LETTER YU
 499     // 0x042E    0xE0    0xCE     CYRILLIC CAPITAL LETTER YU
 500     int latin5 = tolower(0xCE);
 501     int koi8r = tolower(0xE0);
 502     if (koi8r == 0xC0 && latin5 != 0xEE) {
 503         ru_RU_codec = QTextCodec::codecForName("KOI8-R");
 504     } else if (koi8r != 0xC0 && latin5 == 0xEE) {
 505         ru_RU_codec = QTextCodec::codecForName("ISO 8859-5");
 506     } else {
 507         // something else again... let's assume... *throws dice*
 508         ru_RU_codec = QTextCodec::codecForName("KOI8-R");
 509         qWarning("QTextCodec: Using KOI8-R, probe failed (%02x %02x %s)",
 510                   koi8r, latin5, i);
 511     }
 512 #if !defined(QT_NO_SETLOCALE)
 513     setlocale(LC_CTYPE, origlocale);
 514 #endif
 515
 516     return ru_RU_codec;
 517 }
 518
 519 #endif
 520
 521 #if !defined(Q_OS_WIN32) && !defined(Q_OS_WINCE)
 522 static QTextCodec *checkForCodec(const QByteArray &name) {
 523     QTextCodec *c = QTextCodec::codecForName(name);
 524     if (!c) {
 525         const int index = name.indexOf('@');
 526         if (index != -1) {
 527             c = QTextCodec::codecForName(name.left(index));
 528         }
 529     }
 530     return c;
 531 }
 532 #endif
 533
 534 /* the next two functions are implicitely thread safe,
 535    as they are only called by setup() which uses a mutex.
 536 */
 537 static void setupLocaleMapper()
 538 {
 539 #if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
 540     localeMapper = QTextCodec::codecForName("System");
 541 #else
 542
 543 #ifndef QT_NO_ICONV
 544     localeMapper = QTextCodec::codecForName("System");
 545 #endif
 546
 547 #if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF)
 548     if (!localeMapper) {
 549         char *charset = nl_langinfo (CODESET);
 550         if (charset)
 551             localeMapper = QTextCodec::codecForName(charset);
 552     }
 553 #endif
 554
 555     if (!localeMapper) {
 556         // Very poorly defined and followed standards causes lots of
 557         // code to try to get all the cases... This logic is
 558         // duplicated in QIconvCodec, so if you change it here, change
 559         // it there too.
 560
 561         // Try to determine locale codeset from locale name assigned to
 562         // LC_CTYPE category.
 563
 564         // First part is getting that locale name.  First try setlocale() which
 565         // definitely knows it, but since we cannot fully trust it, get ready
 566         // to fall back to environment variables.
 567 #if !defined(QT_NO_SETLOCALE)
 568         const QByteArray ctype = setlocale(LC_CTYPE, 0);
 569 #else
 570         const QByteArray ctype;
 571 #endif
 572
 573         // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
 574         // environment variables.
 575         QByteArray lang = qgetenv("LC_ALL");
 576         if (lang.isEmpty() || lang == "C") {
 577             lang = qgetenv("LC_CTYPE");
 578         }
 579         if (lang.isEmpty() || lang == "C") {
 580             lang = qgetenv("LANG");
 581         }
 582
 583         // Now try these in order:
 584         // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
 585         // 2. CODESET from lang if it contains a .CODESET part
 586         // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
 587         // 4. locale (ditto)
 588         // 5. check for "@euro"
 589         // 6. guess locale from ctype unless ctype is "C"
 590         // 7. guess locale from lang
 591
 592         // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
 593         int indexOfDot = ctype.indexOf('.');
 594         if (indexOfDot != -1)
 595             localeMapper = checkForCodec( ctype.mid(indexOfDot + 1) );
 596
 597         // 2. CODESET from lang if it contains a .CODESET part
 598         if (!localeMapper) {
 599             indexOfDot = lang.indexOf('.');
 600             if (indexOfDot != -1)
 601                 localeMapper = checkForCodec( lang.mid(indexOfDot + 1) );
 602         }
 603
 604         // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
 605         if (!localeMapper && !ctype.isEmpty() && ctype != "C")
 606             localeMapper = checkForCodec(ctype);
 607
 608         // 4. locale (ditto)
 609         if (!localeMapper && !lang.isEmpty())
 610             localeMapper = checkForCodec(lang);
 611
 612         // 5. "@euro"
 613         if ((!localeMapper && ctype.contains("@euro")) || lang.contains("@euro"))
 614             localeMapper = checkForCodec("ISO 8859-15");
 615
 616         // 6. guess locale from ctype unless ctype is "C"
 617         // 7. guess locale from lang
 618         const QByteArray &try_by_name = (!ctype.isEmpty() && ctype != "C") ? lang : ctype;
 619
 620         // Now do the guessing.
 621         if (!lang.isEmpty() && !localeMapper && !try_by_name.isEmpty()) {
 622             if (try_locale_list(iso8859_15locales, lang))
 623                 localeMapper = QTextCodec::codecForName("ISO 8859-15");
 624             else if (try_locale_list(iso8859_2locales, lang))
 625                 localeMapper = QTextCodec::codecForName("ISO 8859-2");
 626             else if (try_locale_list(iso8859_3locales, lang))
 627                 localeMapper = QTextCodec::codecForName("ISO 8859-3");
 628             else if (try_locale_list(iso8859_4locales, lang))
 629                 localeMapper = QTextCodec::codecForName("ISO 8859-4");
 630             else if (try_locale_list(iso8859_5locales, lang))
 631                 localeMapper = QTextCodec::codecForName("ISO 8859-5");
 632             else if (try_locale_list(iso8859_6locales, lang))
 633                 localeMapper = QTextCodec::codecForName("ISO 8859-6");
 634             else if (try_locale_list(iso8859_7locales, lang))
 635                 localeMapper = QTextCodec::codecForName("ISO 8859-7");
 636             else if (try_locale_list(iso8859_8locales, lang))
 637                 localeMapper = QTextCodec::codecForName("ISO 8859-8-I");
 638             else if (try_locale_list(iso8859_9locales, lang))
 639                 localeMapper = QTextCodec::codecForName("ISO 8859-9");
 640             else if (try_locale_list(iso8859_13locales, lang))
 641                 localeMapper = QTextCodec::codecForName("ISO 8859-13");
 642             else if (try_locale_list(tis_620locales, lang))
 643                 localeMapper = QTextCodec::codecForName("ISO 8859-11");
 644             else if (try_locale_list(koi8_ulocales, lang))
 645                 localeMapper = QTextCodec::codecForName("KOI8-U");
 646             else if (try_locale_list(cp_1251locales, lang))
 647                 localeMapper = QTextCodec::codecForName("CP 1251");
 648             else if (try_locale_list(pt_154locales, lang))
 649                 localeMapper = QTextCodec::codecForName("PT 154");
 650             else if (try_locale_list(probably_koi8_rlocales, lang))
 651                 localeMapper = ru_RU_hack(lang);
 652         }
 653
 654     }
 655
 656     // If everything failed, we default to 8859-1
 657     // We could perhaps default to 8859-15.
 658     if (!localeMapper)
 659         localeMapper = QTextCodec::codecForName("ISO 8859-1");
 660 #endif
 661 }
 662
#ifndef QT_NO_THREAD
// Guards the codec registry. It is recursive because the QTextCodec
// constructor holds it while calling setup(), and setup() constructs
// further codecs whose constructors lock it again.
Q_GLOBAL_STATIC_WITH_ARGS(QMutex, textCodecsMutex, (QMutex::Recursive));
#endif
 666
// Creates the codec registry and all built-in codecs the first time it is
// called; later calls return immediately.
//
// textCodecsMutex needs to be locked to enter this function.
//
// Every codec constructed here registers itself through the QTextCodec
// constructor, which prepends it to 'all'. The order of the statements
// below therefore determines the order of the registry -- do not reorder
// them casually.
static void setup()
{
    if (all)
        return;

#ifdef Q_DEBUG_TEXTCODEC
    if (destroying_is_ok)
        qWarning("QTextCodec: Creating new codec during codec cleanup");
#endif
    all = new QList<QTextCodec*>;
    // create the cleanup object to cleanup all codecs on exit
    (void) createQTextCodecCleanup();

#ifndef QT_NO_CODECS
    (void)new QTsciiCodec;
    // one QIsciiCodec instance per index 0..8
    for (int i = 0; i < 9; ++i)
        (void)new QIsciiCodec(i);

    for (int i = 0; i < QSimpleTextCodec::numSimpleCodecs; ++i)
        (void)new QSimpleTextCodec(i);

#  if defined(Q_OS_UNIX) && !defined(QT_BOOTSTRAPPED)
    // no font codecs when bootstrapping
    (void)new QFontLaoCodec;
    (void)new QFontGb2312Codec;
    (void)new QFontGbkCodec;
    (void)new QFontGb18030_0Codec;
    (void)new QFontJis0208Codec;
    (void)new QFontJis0201Codec;
    (void)new QFontKsc5601Codec;
    (void)new QFontBig5hkscsCodec;
    (void)new QFontBig5Codec;
#  endif // Q_OS_UNIX && !QT_BOOTSTRAPPED


#if !defined(Q_OS_INTEGRITY)
#  if !defined(QT_BOOTSTRAPPED)
    // no asian codecs when bootstrapping, sorry
    (void)new QGb18030Codec;
    (void)new QGbkCodec;
    (void)new QGb2312Codec;
    (void)new QEucJpCodec;
    (void)new QJisCodec;
    (void)new QSjisCodec;
    (void)new QEucKrCodec;
    (void)new QCP949Codec;
    (void)new QBig5Codec;
    (void)new QBig5hkscsCodec;
#  endif // !QT_BOOTSTRAPPED
#endif // !Q_OS_INTEGRITY
#endif // QT_NO_CODECS

#if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
    (void) new QWindowsLocalCodec;
#endif // Q_OS_WIN32

    (void)new QUtf16Codec;
    (void)new QUtf16BECodec;
    (void)new QUtf16LECodec;
    (void)new QUtf32Codec;
    (void)new QUtf32BECodec;
    (void)new QUtf32LECodec;
    (void)new QLatin15Codec;
    (void)new QLatin1Codec;
    (void)new QUtf8Codec;

#if !defined(Q_OS_INTEGRITY)
#if defined(Q_OS_UNIX) && !defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
    // QIconvCodec depends on the UTF-16 codec, so it needs to be created last
    (void) new QIconvCodec();
#endif
#endif

    // pick the locale codec now that all candidates are registered
    if (!localeMapper)
        setupLocaleMapper();
}
 744
 745 /*!
 746     \enum QTextCodec::ConversionFlag
 747
 748     \value DefaultConversion  No flag is set.
 749     \value ConvertInvalidToNull  If this flag is set, each invalid input
 750                                  character is output as a null character.
 751     \value IgnoreHeader  Ignore any Unicode byte-order mark and don't generate any.
 752
 753     \omitvalue FreeFunction
 754 */
 755
 756 /*!
 757     \fn QTextCodec::ConverterState::ConverterState(ConversionFlags flags)
 758
 759     Constructs a ConverterState object initialized with the given \a flags.
 760 */
 761
 762 /*!
 763     Destroys the ConverterState object.
 764 */
 765 QTextCodec::ConverterState::~ConverterState()
 766 {
 767     if (flags & FreeFunction)
 768         (QTextCodecUnalignedPointer::decode(state_data))(this);
 769     else if (d)
 770         free(d);
 771 }
 772
 773 /*!
 774     \class QTextCodec
 775     \brief The QTextCodec class provides conversions between text encodings.
 776     \reentrant
 777     \ingroup i18n
 778
 779     Qt uses Unicode to store, draw and manipulate strings. In many
 780     situations you may wish to deal with data that uses a different
 781     encoding. For example, most Japanese documents are still stored
 782     in Shift-JIS or ISO 2022-JP, while Russian users often have their
 783     documents in KOI8-R or Windows-1251.
 784
 785     Qt provides a set of QTextCodec classes to help with converting
 786     non-Unicode formats to and from Unicode. You can also create your
 787     own codec classes.
 788
 789     The supported encodings are:
 790
 791     \list
 792     \o Apple Roman
 793     \o \l{Big5 Text Codec}{Big5}
 794     \o \l{Big5-HKSCS Text Codec}{Big5-HKSCS}
 795     \o CP949
 796     \o \l{EUC-JP Text Codec}{EUC-JP}
 797     \o \l{EUC-KR Text Codec}{EUC-KR}
 798     \o \l{GBK Text Codec}{GB18030-0}
 799     \o IBM 850
 800     \o IBM 866
 801     \o IBM 874
 802     \o \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP}
 803     \o ISO 8859-1 to 10
 804     \o ISO 8859-13 to 16
 805     \o Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml
 806     \o JIS X 0201
 807     \o JIS X 0208
 808     \o KOI8-R
 809     \o KOI8-U
 810     \o MuleLao-1
 811     \o ROMAN8
 812     \o \l{Shift-JIS Text Codec}{Shift-JIS}
 813     \o TIS-620
 814     \o \l{TSCII Text Codec}{TSCII}
 815     \o UTF-8
 816     \o UTF-16
 817     \o UTF-16BE
 818     \o UTF-16LE
 819     \o UTF-32
 820     \o UTF-32BE
 821     \o UTF-32LE
 822     \o Windows-1250 to 1258
 823     \o WINSAMI2
 824     \endlist
 825
 826     QTextCodecs can be used as follows to convert some locally encoded
 827     string to Unicode. Suppose you have some string encoded in Russian
 828     KOI8-R encoding, and want to convert it to Unicode. The simple way
 829     to do it is like this:
 830
 831     \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 0
 832
 833     After this, \c string holds the text converted to Unicode.
 834     Converting a string from Unicode to the local encoding is just as
 835     easy:
 836
 837     \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 1
 838
 839     To read or write files in various encodings, use QTextStream and
 840     its \l{QTextStream::setCodec()}{setCodec()} function. See the
 841     \l{tools/codecs}{Codecs} example for an application of QTextCodec
 842     to file I/O.
 843
 844     Some care must be taken when trying to convert the data in chunks,
 845     for example, when receiving it over a network. In such cases it is
 846     possible that a multi-byte character will be split over two
 847     chunks. At best this might result in the loss of a character and
 848     at worst cause the entire conversion to fail.
 849
 850     The approach to use in these situations is to create a QTextDecoder
 851     object for the codec and use this QTextDecoder for the whole
 852     decoding process, as shown below:
 853
 854     \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 2
 855
 856     The QTextDecoder object maintains state between chunks and therefore
 857     works correctly even if a multi-byte character is split between
 858     chunks.
 859
 860     \section1 Creating Your Own Codec Class
 861
 862     Support for new text encodings can be added to Qt by creating
 863     QTextCodec subclasses.
 864
 865     The pure virtual functions describe the encoder to the system and
 866     the coder is used as required in the different text file formats
 867     supported by QTextStream, and under X11, for the locale-specific
 868     character input and output.
 869
 870     To add support for another encoding to Qt, make a subclass of
 871     QTextCodec and implement the functions listed in the table below.
 872
 873     \table
 874     \header \o Function \o Description
 875
 876     \row \o name()
 877          \o Returns the official name for the encoding. If the
 878             encoding is listed in the
 879             \l{IANA character-sets encoding file}, the name
 880             should be the preferred MIME name for the encoding.
 881
 882     \row \o aliases()
 883          \o Returns a list of alternative names for the encoding.
 884             QTextCodec provides a default implementation that returns
 885             an empty list. For example, "ISO-8859-1" has "latin1",
 886             "CP819", "IBM819", and "iso-ir-100" as aliases.
 887
 888     \row \o mibEnum()
         \o Returns the MIB enum for the encoding if it is listed in
            the \l{IANA character-sets encoding file}.
 891
 892     \row \o convertToUnicode()
 893          \o Converts an 8-bit character string to Unicode.
 894
 895     \row \o convertFromUnicode()
 896          \o Converts a Unicode string to an 8-bit character string.
 897     \endtable
 898
 899     \sa QTextStream, QTextDecoder, QTextEncoder, {Codecs Example}
 900 */
 901
 902 /*!
 903     Constructs a QTextCodec, and gives it the highest precedence. The
 904     QTextCodec should always be constructed on the heap (i.e. with \c
 905     new). Qt takes ownership and will delete it when the application
 906     terminates.
 907 */
 908 QTextCodec::QTextCodec()
 909 {
 910 #ifndef QT_NO_THREAD
 911     QMutexLocker locker(textCodecsMutex());
 912 #endif
 913     setup();
 914     all->prepend(this);
 915 }
 916
 917
 918 /*!
 919     \nonreentrant
 920
 921     Destroys the QTextCodec. Note that you should not delete codecs
 922     yourself: once created they become Qt's responsibility.
 923 */
 924 QTextCodec::~QTextCodec()
 925 {
 926 #ifdef Q_DEBUG_TEXTCODEC
 927     if (!destroying_is_ok)
 928         qWarning("QTextCodec::~QTextCodec: Called by application");
 929 #endif
 930     if (all) {
 931 #ifndef QT_NO_THREAD
 932         QMutexLocker locker(textCodecsMutex());
 933 #endif
 934         all->removeAll(this);
 935         QTextCodecCache *cache = qTextCodecCache();
 936         if (cache)
 937             cache->clear();
 938     }
 939 }
 940
 941 /*!
 942     \fn QTextCodec *QTextCodec::codecForName(const char *name)
 943
 944     Searches all installed QTextCodec objects and returns the one
 945     which best matches \a name; the match is case-insensitive. Returns
 946     0 if no codec matching the name \a name could be found.
 947 */
 948
 949 /*!
 950     Searches all installed QTextCodec objects and returns the one
 951     which best matches \a name; the match is case-insensitive. Returns
 952     0 if no codec matching the name \a name could be found.
 953 */
 954 QTextCodec *QTextCodec::codecForName(const QByteArray &name)
 955 {
 956     if (name.isEmpty())
 957         return 0;
 958
 959 #ifndef QT_NO_THREAD
 960     QMutexLocker locker(textCodecsMutex());
 961 #endif
 962     setup();
 963
 964     if (!validCodecs())
 965         return 0;
 966
 967     QTextCodecCache *cache = qTextCodecCache();
 968     QTextCodec *codec;
 969     if (cache) {
 970         codec = cache->value(name);
 971         if (codec)
 972             return codec;
 973     }
 974
 975     for (int i = 0; i < all->size(); ++i) {
 976         QTextCodec *cursor = all->at(i);
 977         if (nameMatch(cursor->name(), name)) {
 978             if (cache)
 979                 cache->insert(name, cursor);
 980             return cursor;
 981         }
 982         QList<QByteArray> aliases = cursor->aliases();
 983         for (int y = 0; y < aliases.size(); ++y)
 984             if (nameMatch(aliases.at(y), name)) {
 985                 if (cache)
 986                     cache->insert(name, cursor);
 987                 return cursor;
 988             }
 989     }
 990
 991     return 0;
 992 }
 993
 994
 995 /*!
 996     Returns the QTextCodec which matches the \link
 997     QTextCodec::mibEnum() MIBenum\endlink \a mib.
 998 */
 999 QTextCodec* QTextCodec::codecForMib(int mib)
1000 {
1001 #ifndef QT_NO_THREAD
1002     QMutexLocker locker(textCodecsMutex());
1003 #endif
1004     setup();
1005
1006     if (!validCodecs())
1007         return 0;
1008
1009     QByteArray key = "MIB: " + QByteArray::number(mib);
1010     QTextCodecCache *cache = qTextCodecCache();
1011     QTextCodec *codec;
1012     if (cache) {
1013         codec = cache->value(key);
1014         if (codec)
1015             return codec;
1016     }
1017
1018     QList<QTextCodec*>::ConstIterator i;
1019     for (int i = 0; i < all->size(); ++i) {
1020         QTextCodec *cursor = all->at(i);
1021         if (cursor->mibEnum() == mib) {
1022             if (cache)
1023                 cache->insert(key, cursor);
1024             return cursor;
1025         }
1026     }
1027
1028     return 0;
1029 }
1030
1031 /*!
1032     Returns the list of all available codecs, by name. Call
1033     QTextCodec::codecForName() to obtain the QTextCodec for the name.
1034
1035     The list may contain many mentions of the same codec
1036     if the codec has aliases.
1037
1038     \sa availableMibs(), name(), aliases()
1039 */
1040 QList<QByteArray> QTextCodec::availableCodecs()
1041 {
1042 #ifndef QT_NO_THREAD
1043     QMutexLocker locker(textCodecsMutex());
1044 #endif
1045     setup();
1046
1047     QList<QByteArray> codecs;
1048
1049     if (!validCodecs())
1050         return codecs;
1051
1052     for (int i = 0; i < all->size(); ++i) {
1053         codecs += all->at(i)->name();
1054         codecs += all->at(i)->aliases();
1055     }
1056
1057 #ifndef QT_NO_THREAD
1058     locker.unlock();
1059 #endif
1060
1061     return codecs;
1062 }
1063
1064 /*!
1065     Returns the list of MIBs for all available codecs. Call
1066     QTextCodec::codecForMib() to obtain the QTextCodec for the MIB.
1067
1068     \sa availableCodecs(), mibEnum()
1069 */
1070 QList<int> QTextCodec::availableMibs()
1071 {
1072 #ifndef QT_NO_THREAD
1073     QMutexLocker locker(textCodecsMutex());
1074 #endif
1075     setup();
1076
1077     QList<int> codecs;
1078
1079     if (!validCodecs())
1080         return codecs;
1081
1082     for (int i = 0; i < all->size(); ++i)
1083         codecs += all->at(i)->mibEnum();
1084
1085 #ifndef QT_NO_THREAD
1086     locker.unlock();
1087 #endif
1088
1089     return codecs;
1090 }
1091
1092 /*!
1093     Set the codec to \a c; this will be returned by
1094     codecForLocale(). If \a c is a null pointer, the codec is reset to
1095     the default.
1096
1097     This might be needed for some applications that want to use their
1098     own mechanism for setting the locale.
1099
1100     \sa codecForLocale()
1101 */
1102 void QTextCodec::setCodecForLocale(QTextCodec *c)
1103 {
1104 #ifndef QT_NO_THREAD
1105     QMutexLocker locker(textCodecsMutex());
1106 #endif
1107     localeMapper = c;
1108     if (!localeMapper)
1109         setupLocaleMapper();
1110 }
1111
1112 /*!
1113     Returns a pointer to the codec most suitable for this locale.
1114
1115     On Windows, the codec will be based on a system locale. On Unix
1116     systems, starting with Qt 4.2, the codec will be using the \e
1117     iconv library. Note that in both cases the codec's name will be
1118     "System".
1119 */
1120
1121 QTextCodec* QTextCodec::codecForLocale()
1122 {
1123     if (!validCodecs())
1124         return 0;
1125
1126     if (localeMapper)
1127         return localeMapper;
1128
1129 #ifndef QT_NO_THREAD
1130     QMutexLocker locker(textCodecsMutex());
1131 #endif
1132     setup();
1133
1134     return localeMapper;
1135 }
1136
1137
1138 /*!
1139     \fn QByteArray QTextCodec::name() const
1140
1141     QTextCodec subclasses must reimplement this function. It returns
1142     the name of the encoding supported by the subclass.
1143
1144     If the codec is registered as a character set in the
1145     \l{IANA character-sets encoding file} this method should
1146     return the preferred mime name for the codec if defined,
1147     otherwise its name.
1148 */
1149
1150 /*!
1151     \fn int QTextCodec::mibEnum() const
1152
1153     Subclasses of QTextCodec must reimplement this function. It
1154     returns the MIBenum (see \l{IANA character-sets encoding file}
1155     for more information). It is important that each QTextCodec
1156     subclass returns the correct unique value for this function.
1157 */
1158
1159 /*!
1160   Subclasses can return a number of aliases for the codec in question.
1161
1162   Standard aliases for codecs can be found in the
1163   \l{IANA character-sets encoding file}.
1164 */
1165 QList<QByteArray> QTextCodec::aliases() const
1166 {
1167     return QList<QByteArray>();
1168 }
1169
1170 /*!
1171     \fn QString QTextCodec::convertToUnicode(const char *chars, int len,
1172                                              ConverterState *state) const
1173
1174     QTextCodec subclasses must reimplement this function.
1175
1176     Converts the first \a len characters of \a chars from the
1177     encoding of the subclass to Unicode, and returns the result in a
1178     QString.
1179
1180     \a state can be 0, in which case the conversion is stateless and
1181     default conversion rules should be used. If state is not 0, the
1182     codec should save the state after the conversion in \a state, and
1183     adjust the remainingChars and invalidChars members of the struct.
1184 */
1185
1186 /*!
1187     \fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number,
1188                                                   ConverterState *state) const
1189
1190     QTextCodec subclasses must reimplement this function.
1191
1192     Converts the first \a number of characters from the \a input array
1193     from Unicode to the encoding of the subclass, and returns the result
1194     in a QByteArray.
1195
1196     \a state can be 0 in which case the conversion is stateless and
1197     default conversion rules should be used. If state is not 0, the
1198     codec should save the state after the conversion in \a state, and
1199     adjust the remainingChars and invalidChars members of the struct.
1200 */
1201
1202 /*!
1203     Creates a QTextDecoder with a specified \a flags to decode chunks
1204     of \c{char *} data to create chunks of Unicode data.
1205
1206     The caller is responsible for deleting the returned object.
1207
1208     \since 4.7
1209 */
1210 QTextDecoder* QTextCodec::makeDecoder(QTextCodec::ConversionFlags flags) const
1211 {
1212     return new QTextDecoder(this, flags);
1213 }
1214
1215 /*!
1216     Creates a QTextEncoder with a specified \a flags to encode chunks
1217     of Unicode data as \c{char *} data.
1218
1219     The caller is responsible for deleting the returned object.
1220
1221     \since 4.7
1222 */
1223 QTextEncoder* QTextCodec::makeEncoder(QTextCodec::ConversionFlags flags) const
1224 {
1225     return new QTextEncoder(this, flags);
1226 }
1227
1228 /*!
1229     \fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number,
1230                                            ConverterState *state) const
1231
1232     Converts the first \a number of characters from the \a input array
1233     from Unicode to the encoding of this codec, and returns the result
1234     in a QByteArray.
1235
    The \a state of the converter used is updated.
1237 */
1238
1239 /*!
1240     Converts \a str from Unicode to the encoding of this codec, and
1241     returns the result in a QByteArray.
1242 */
1243 QByteArray QTextCodec::fromUnicode(const QString& str) const
1244 {
1245     return convertFromUnicode(str.constData(), str.length(), 0);
1246 }
1247
1248 /*!
1249     \fn QString QTextCodec::toUnicode(const char *input, int size,
1250                                       ConverterState *state) const
1251
1252     Converts the first \a size characters from the \a input from the
1253     encoding of this codec to Unicode, and returns the result in a
1254     QString.
1255
    The \a state of the converter used is updated.
1257 */
1258
1259 /*!
1260     Converts \a a from the encoding of this codec to Unicode, and
1261     returns the result in a QString.
1262 */
1263 QString QTextCodec::toUnicode(const QByteArray& a) const
1264 {
1265     return convertToUnicode(a.constData(), a.length(), 0);
1266 }
1267
1268 /*!
1269     Returns true if the Unicode character \a ch can be fully encoded
1270     with this codec; otherwise returns false.
1271 */
1272 bool QTextCodec::canEncode(QChar ch) const
1273 {
1274     ConverterState state;
1275     state.flags = ConvertInvalidToNull;
1276     convertFromUnicode(&ch, 1, &state);
1277     return (state.invalidChars == 0);
1278 }
1279
1280 /*!
1281     \overload
1282
1283     \a s contains the string being tested for encode-ability.
1284 */
1285 bool QTextCodec::canEncode(const QString& s) const
1286 {
1287     ConverterState state;
1288     state.flags = ConvertInvalidToNull;
1289     convertFromUnicode(s.constData(), s.length(), &state);
1290     return (state.invalidChars == 0);
1291 }
1292
1293 /*!
1294     \overload
1295
1296     \a chars contains the source characters.
1297 */
1298 QString QTextCodec::toUnicode(const char *chars) const
1299 {
1300     int len = qstrlen(chars);
1301     return convertToUnicode(chars, len, 0);
1302 }
1303
1304
1305 /*!
1306     \class QTextEncoder
1307     \brief The QTextEncoder class provides a state-based encoder.
1308     \reentrant
1309     \ingroup i18n
1310
1311     A text encoder converts text from Unicode into an encoded text format
1312     using a specific codec.
1313
1314     The encoder converts Unicode into another format, remembering any
1315     state that is required between calls.
1316
1317     \sa QTextCodec::makeEncoder(), QTextDecoder
1318 */
1319
1320 /*!
1321     \fn QTextEncoder::QTextEncoder(const QTextCodec *codec)
1322
1323     Constructs a text encoder for the given \a codec.
1324 */
1325
/*!
    Constructs a text encoder for the given \a codec and conversion \a flags.

    The encoder keeps the \a codec pointer and starts from a
    value-initialized conversion state whose flags are set to \a flags.

    \since 4.7
*/
QTextEncoder::QTextEncoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
    : c(codec), state()
{
    // Only the flags differ from the freshly initialized state.
    state.flags = flags;
}
1336
/*!
    Destroys the encoder. The destructor body is empty, so the codec
    the encoder was constructed with is not deleted here.
*/
QTextEncoder::~QTextEncoder()
{
}
1343
1344 /*! \internal
1345     \since 4.5
1346     Determines whether the eecoder encountered a failure while decoding the input. If
1347     an error was encountered, the produced result is undefined, and gets converted as according
1348     to the conversion flags.
1349  */
1350 bool QTextEncoder::hasFailure() const
1351 {
1352     return state.invalidChars != 0;
1353 }
1354
1355 /*!
1356     Converts the Unicode string \a str into an encoded QByteArray.
1357 */
1358 QByteArray QTextEncoder::fromUnicode(const QString& str)
1359 {
1360     QByteArray result = c->fromUnicode(str.constData(), str.length(), &state);
1361     return result;
1362 }
1363
1364 /*!
1365     \overload
1366
1367     Converts \a len characters (not bytes) from \a uc, and returns the
1368     result in a QByteArray.
1369 */
1370 QByteArray QTextEncoder::fromUnicode(const QChar *uc, int len)
1371 {
1372     QByteArray result = c->fromUnicode(uc, len, &state);
1373     return result;
1374 }
1375
1376 /*!
1377     \class QTextDecoder
1378     \brief The QTextDecoder class provides a state-based decoder.
1379     \reentrant
1380     \ingroup i18n
1381
1382     A text decoder converts text from an encoded text format into Unicode
1383     using a specific codec.
1384
1385     The decoder converts text in this format into Unicode, remembering any
1386     state that is required between calls.
1387
1388     \sa QTextCodec::makeDecoder(), QTextEncoder
1389 */
1390
1391 /*!
1392     \fn QTextDecoder::QTextDecoder(const QTextCodec *codec)
1393
1394     Constructs a text decoder for the given \a codec.
1395 */
1396
/*!
    Constructs a text decoder for the given \a codec and conversion \a flags.

    The decoder keeps the \a codec pointer and starts from a
    value-initialized conversion state whose flags are set to \a flags.

    \since 4.7
*/

QTextDecoder::QTextDecoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
    : c(codec), state()
{
    // Only the flags differ from the freshly initialized state.
    state.flags = flags;
}
1408
/*!
    Destroys the decoder. The destructor body is empty, so the codec
    the decoder was constructed with is not deleted here.
*/
QTextDecoder::~QTextDecoder()
{
}
1415
1416 /*!
1417     \fn QString QTextDecoder::toUnicode(const char *chars, int len)
1418
1419     Converts the first \a len bytes in \a chars to Unicode, returning
1420     the result.
1421
1422     If not all characters are used (e.g. if only part of a multi-byte
1423     encoding is at the end of the characters), the decoder remembers
1424     enough state to continue with the next call to this function.
1425 */
1426 QString QTextDecoder::toUnicode(const char *chars, int len)
1427 {
1428     return c->toUnicode(chars, len, &state);
1429 }
1430
1431
1432 /*! \overload
1433
1434     The converted string is returned in \a target.
1435  */
1436 void QTextDecoder::toUnicode(QString *target, const char *chars, int len)
1437 {
1438     Q_ASSERT(target);
1439     switch (c->mibEnum()) {
1440     case 106: // utf8
1441         static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state);
1442         break;
1443     case 4: { // latin1
1444         target->resize(len);
1445         ushort *data = (ushort*)target->data();
1446         for (int i = len; i >=0; --i)
1447             data[i] = (uchar) chars[i];
1448     } break;
1449     default:
1450         *target = c->toUnicode(chars, len, &state);
1451     }
1452 }
1453
1454
1455 /*!
1456     \overload
1457
1458     Converts the bytes in the byte array specified by \a ba to Unicode
1459     and returns the result.
1460 */
1461 QString QTextDecoder::toUnicode(const QByteArray &ba)
1462 {
1463     return c->toUnicode(ba.constData(), ba.length(), &state);
1464 }
1465
1466
1467 /*!
1468     \fn QTextCodec* QTextCodec::codecForTr()
1469
1470     Returns the codec used by QObject::tr() on its argument. If this
1471     function returns 0 (the default), tr() assumes Latin-1.
1472
1473     \sa setCodecForTr()
1474 */
1475
1476 /*!
1477     \fn void QTextCodec::setCodecForTr(QTextCodec *c)
1478     \nonreentrant
1479
1480     Sets the codec used by QObject::tr() on its argument to \a c. If
1481     \a c is 0 (the default), tr() assumes Latin-1.
1482
1483     If the literal quoted text in the program is not in the Latin-1
1484     encoding, this function can be used to set the appropriate
1485     encoding. For example, software developed by Korean programmers
1486     might use eucKR for all the text in the program, in which case the
1487     main() function might look like this:
1488
1489     \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 3
1490
1491     Note that this is not the way to select the encoding that the \e
1492     user has chosen. For example, to convert an application containing
1493     literal English strings to Korean, all that is needed is for the
1494     English strings to be passed through tr() and for translation
1495     files to be loaded. For details of internationalization, see
1496     \l{Internationalization with Qt}.
1497
1498     \sa codecForTr(), setCodecForCStrings()
1499 */
1500
1501
1502 /*!
1503     \fn QTextCodec* QTextCodec::codecForCStrings()
1504
1505     Returns the codec used by QString to convert to and from \c{const
1506     char *} and QByteArrays. If this function returns 0 (the default),
1507     QString assumes Latin-1.
1508
1509     \sa setCodecForCStrings()
1510 */
1511
1512 /*!
1513     \fn void QTextCodec::setCodecForCStrings(QTextCodec *codec)
1514     \nonreentrant
1515
1516     Sets the codec used by QString to convert to and from \c{const
1517     char *} and QByteArrays. If the \a codec is 0 (the default),
1518     QString assumes Latin-1.
1519
    \warning Some codecs do not preserve the characters in the ASCII
    range (0x00 to 0x7F). For example, the Japanese Shift-JIS
    encoding maps the backslash character (0x5C) to the Yen
    character. To avoid undesirable side-effects, we recommend
    avoiding such codecs with setCodecForCStrings().
1525
1526     \sa codecForCStrings(), setCodecForTr()
1527 */
1528
1529 /*!
1530     \since 4.4
1531
1532     Tries to detect the encoding of the provided snippet of HTML in
1533     the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1534     and the content-type meta header and returns a QTextCodec instance
1535     that is capable of decoding the html to unicode.  If the codec
1536     cannot be detected from the content provided, \a defaultCodec is
1537     returned.
1538
1539     \sa codecForUtfText()
1540 */
1541 QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec)
1542 {
1543     // determine charset
1544     int pos;
1545     QTextCodec *c = 0;
1546
1547     c = QTextCodec::codecForUtfText(ba, c);
1548     if (!c) {
1549         QByteArray header = ba.left(512).toLower();
1550         if ((pos = header.indexOf("http-equiv=")) != -1) {
1551             if ((pos = header.lastIndexOf("meta ", pos)) != -1) {
1552                 pos = header.indexOf("charset=", pos) + int(strlen("charset="));
1553                 if (pos != -1) {
1554                     int pos2 = header.indexOf('\"', pos+1);
1555                     QByteArray cs = header.mid(pos, pos2-pos);
1556                     //            qDebug("found charset: %s", cs.data());
1557                     c = QTextCodec::codecForName(cs);
1558                 }
1559             }
1560         }
1561     }
1562     if (!c)
1563         c = defaultCodec;
1564
1565     return c;
1566 }
1567
1568 /*!
1569     \overload
1570
1571     Tries to detect the encoding of the provided snippet of HTML in
1572     the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1573     and the content-type meta header and returns a QTextCodec instance
1574     that is capable of decoding the html to unicode. If the codec cannot
1575     be detected, this overload returns a Latin-1 QTextCodec.
1576 */
1577 QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba)
1578 {
1579     return codecForHtml(ba, QTextCodec::codecForMib(/*Latin 1*/ 4));
1580 }
1581
1582 /*!
1583     \since 4.6
1584
1585     Tries to detect the encoding of the provided snippet \a ba by
1586     using the BOM (Byte Order Mark) and returns a QTextCodec instance
1587     that is capable of decoding the text to unicode. If the codec
1588     cannot be detected from the content provided, \a defaultCodec is
1589     returned.
1590
1591     \sa codecForHtml()
1592 */
1593 QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec)
1594 {
1595     const int arraySize = ba.size();
1596
1597     if (arraySize > 3) {
1598         if ((uchar)ba[0] == 0x00
1599             && (uchar)ba[1] == 0x00
1600             && (uchar)ba[2] == 0xFE
1601             && (uchar)ba[3] == 0xFF)
1602             return QTextCodec::codecForMib(1018); // utf-32 be
1603         else if ((uchar)ba[0] == 0xFF
1604                  && (uchar)ba[1] == 0xFE
1605                  && (uchar)ba[2] == 0x00
1606                  && (uchar)ba[3] == 0x00)
1607             return QTextCodec::codecForMib(1019); // utf-32 le
1608     }
1609
1610     if (arraySize < 2)
1611         return defaultCodec;
1612     if ((uchar)ba[0] == 0xfe && (uchar)ba[1] == 0xff)
1613         return QTextCodec::codecForMib(1013); // utf16 be
1614     else if ((uchar)ba[0] == 0xff && (uchar)ba[1] == 0xfe)
1615         return QTextCodec::codecForMib(1014); // utf16 le
1616
1617     if (arraySize < 3)
1618         return defaultCodec;
1619     if ((uchar)ba[0] == 0xef
1620         && (uchar)ba[1] == 0xbb
1621         && (uchar)ba[2] == 0xbf)
1622         return QTextCodec::codecForMib(106); // utf-8
1623
1624     return defaultCodec;
1625 }
1626
1627 /*!
1628     \overload
1629
1630     Tries to detect the encoding of the provided snippet \a ba by
1631     using the BOM (Byte Order Mark) and returns a QTextCodec instance
1632     that is capable of decoding the text to unicode. If the codec
1633     cannot be detected, this overload returns a Latin-1 QTextCodec.
1634
1635     \sa codecForHtml()
1636 */
1637 QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba)
1638 {
1639     return codecForUtfText(ba, QTextCodec::codecForMib(/*Latin 1*/ 4));
1640 }
1641
1642
1643 /*! \internal
1644     \since 4.3
1645     Determines whether the decoder encountered a failure while decoding the input. If
1646     an error was encountered, the produced result is undefined, and gets converted as according
1647     to the conversion flags.
1648  */
1649 bool QTextDecoder::hasFailure() const
1650 {
1651     return state.invalidChars != 0;
1652 }
1653
1654 QT_END_NAMESPACE
1655
1656 #endif // QT_NO_TEXTCODEC