1 /****************************************************************************
3 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
4 ** Contact: http://www.qt-project.org/
6 ** This file is part of the QtCore module of the Qt Toolkit.
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** GNU Lesser General Public License Usage
10 ** This file may be used under the terms of the GNU Lesser General Public
11 ** License version 2.1 as published by the Free Software Foundation and
12 ** appearing in the file LICENSE.LGPL included in the packaging of this
13 ** file. Please review the following information to ensure the GNU Lesser
14 ** General Public License version 2.1 requirements will be met:
15 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
17 ** In addition, as a special exception, Nokia gives you certain additional
18 ** rights. These rights are described in the Nokia Qt LGPL Exception
19 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
21 ** GNU General Public License Usage
22 ** Alternatively, this file may be used under the terms of the GNU General
23 ** Public License version 3.0 as published by the Free Software Foundation
24 ** and appearing in the file LICENSE.GPL included in the packaging of this
25 ** file. Please review the following information to ensure the GNU General
26 ** Public License version 3.0 requirements will be met:
27 ** http://www.gnu.org/copyleft/gpl.html.
30 ** Alternatively, this file may be used in accordance with the terms and
31 ** conditions contained in a signed written agreement between you and Nokia.
40 ****************************************************************************/
42 #include "qplatformdefs.h"
43 #include "qtextcodec.h"
44 #include "qtextcodec_p.h"
46 #ifndef QT_NO_TEXTCODEC
50 #include "qstringlist.h"
53 # include "qiconvcodec_p.h"
56 #include "qutfcodec_p.h"
57 #include "qsimplecodec_p.h"
58 #include "qlatincodec_p.h"
60 # include "qtsciicodec_p.h"
61 # include "qisciicodec_p.h"
62 #if !defined(Q_OS_INTEGRITY)
63 # if !defined(QT_BOOTSTRAPPED)
64 # include "qgb18030codec_p.h"
65 # include "qeucjpcodec_p.h"
66 # include "qjiscodec_p.h"
67 # include "qsjiscodec_p.h"
68 # include "qeuckrcodec_p.h"
69 # include "qbig5codec_p.h"
70 # endif // !QT_BOOTSTRAPPED
71 # if defined(Q_OS_UNIX) && !defined(QT_BOOTSTRAPPED)
72 # include "qfontlaocodec_p.h"
73 # include "qfontjpcodec_p.h"
75 #endif // !Q_OS_INTEGRITY
76 #endif // QT_NO_CODECS
84 #if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF)
89 # include <qt_windows.h>
90 # if defined(Q_OS_WINCE)
91 # define QT_NO_SETLOCALE
96 // enabling this is not exception safe!
97 // #define Q_DEBUG_TEXTCODEC
101 //Cache for QTextCodec::codecForName and codecForMib.
102 typedef QHash<QByteArray, QTextCodec *> QTextCodecCache;
103 Q_GLOBAL_STATIC(QTextCodecCache, qTextCodecCache)
/*
    Locale-independent ASCII lower-casing: maps 'A'..'Z' to 'a'..'z' and
    returns every other character unchanged.

    The obsolete 'register' storage-class specifier has been dropped from the
    parameter: it is deprecated since C++11 and ill-formed in C++17, and it
    never affected the function's signature or behavior.
*/
static char qtolower(char c)
{
    // 0x20 is the distance between the upper- and lower-case ASCII letters.
    if (c >= 'A' && c <= 'Z')
        return static_cast<char>(c + 0x20);
    return c;
}
/*
    Locale-independent test for an ASCII letter or digit.
    Returns true for '0'..'9', 'A'..'Z' and 'a'..'z', false for anything else.

    The obsolete 'register' storage-class specifier has been dropped from the
    parameter: it is deprecated since C++11 and ill-formed in C++17, and it
    never affected the function's signature or behavior.
*/
static bool qisalnum(char c)
{
    if (c >= '0' && c <= '9')
        return true;

    // Setting bit 0x20 folds an upper-case ASCII letter onto its lower-case
    // counterpart, so a single range check covers both cases.
    const int folded = c | 0x20;
    return folded >= 'a' && folded <= 'z';
}
111 static bool nameMatch(const QByteArray &name, const QByteArray &test)
113 // if they're the same, return a perfect score
114 if (qstricmp(name, test) == 0)
117 const char *n = name.constData();
118 const char *h = test.constData();
120 // if the letters and numbers are the same, we have a match
130 if (qtolower(*n) != qtolower(*h))
136 while (*h && !qisalnum(*h))
142 static QList<QTextCodec*> *all = 0;
143 #ifdef Q_DEBUG_TEXTCODEC
144 static bool destroying_is_ok = false;
147 static QTextCodec *localeMapper = 0;
148 QTextCodec *QTextCodec::cftr = 0;
151 class QTextCodecCleanup
154 ~QTextCodecCleanup();
158 Deletes all the created codecs. This destructor is called just
159 before exiting to delete any QTextCodec objects that may be lying
162 QTextCodecCleanup::~QTextCodecCleanup()
167 #ifdef Q_DEBUG_TEXTCODEC
168 destroying_is_ok = true;
171 QList<QTextCodec *> *myAll = all;
172 all = 0; // Otherwise the d'tor destroys the iterator
173 for (QList<QTextCodec *>::const_iterator it = myAll->constBegin()
174 ; it != myAll->constEnd(); ++it) {
180 #ifdef Q_DEBUG_TEXTCODEC
181 destroying_is_ok = false;
185 Q_GLOBAL_STATIC(QTextCodecCleanup, createQTextCodecCleanup)
187 bool QTextCodec::validCodecs()
193 #if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
194 class QWindowsLocalCodec: public QTextCodec
197 QWindowsLocalCodec();
198 ~QWindowsLocalCodec();
200 QString convertToUnicode(const char *, int, ConverterState *) const;
201 QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const;
202 QString convertToUnicodeCharByChar(const char *chars, int length, ConverterState *state) const;
204 QByteArray name() const;
209 QWindowsLocalCodec::QWindowsLocalCodec()
213 QWindowsLocalCodec::~QWindowsLocalCodec()
217 QString QWindowsLocalCodec::convertToUnicode(const char *chars, int length, ConverterState *state) const
219 const char *mb = chars;
225 const int wclen_auto = 4096;
226 wchar_t wc_auto[wclen_auto];
227 int wclen = wclen_auto;
228 wchar_t *wc = wc_auto;
231 bool prepend = false;
233 int remainingChars = 0;
235 //save the current state information
237 state_data = (char)state->state_data[0];
238 remainingChars = state->remainingChars;
241 //convert the pending character (if available)
242 if (state && remainingChars) {
244 prev[0] = state_data;
247 len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
251 sp.append(QChar(wc[0]));
258 while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
259 mb, mblen, wc, wclen))) {
260 int r = GetLastError();
261 if (r == ERROR_INSUFFICIENT_BUFFER) {
263 qWarning("MultiByteToWideChar: Size changed");
266 wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
268 wc = new wchar_t[wclen];
271 } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
272 //find the last non NULL character
273 while (mblen > 1 && !(mb[mblen-1]))
275 //check whether, we hit an invalid character in the middle
276 if ((mblen <= 1) || (remainingChars && state_data))
277 return convertToUnicodeCharByChar(chars, length, state);
278 //Remove the last character and try again...
279 state_data = mb[mblen-1];
284 qWarning("MultiByteToWideChar: Cannot convert multibyte text");
290 if (wc[len-1] == 0) // len - 1: we don't want terminator
293 //save the new state information
295 state->state_data[0] = (char)state_data;
296 state->remainingChars = remainingChars;
298 QString s((QChar*)wc, len);
// Character-by-character decoder; convertToUnicode() calls it when the bulk
// MultiByteToWideChar() conversion reports ERROR_NO_UNICODE_TRANSLATION.
// The input is copied into a freshly allocated, zero-terminated buffer (with
// the byte saved in \a state, if any, placed in front) and then converted one
// multibyte character at a time.
307 QString QWindowsLocalCodec::convertToUnicodeCharByChar(const char *chars, int length, ConverterState *state) const
309 if (!chars || !length)
312 int copyLocation = 0;
// A pending byte count in the state shifts the copy destination and enlarges
// the buffer by the same amount.
314 if (state && state->remainingChars) {
315 copyLocation = state->remainingChars;
316 extra += copyLocation;
318 int newLength = length + extra;
319 char *mbcs = new char[newLength];
320 //ensure that we have a NULL terminated string
321 mbcs[newLength-1] = 0;
322 mbcs[newLength-2] = 0;
323 memcpy(&(mbcs[copyLocation]), chars, length);
325 //copy the last character from the state
326 mbcs[0] = (char)state->state_data[0];
327 state->remainingChars = 0;
329 const char *mb = mbcs;
331 const char *next = 0;
// CharNextExA() returns its argument unchanged once the terminator is reached,
// which ends the loop.
333 while((next = CharNextExA(CP_ACP, mb, 0)) != mb) {
335 int charlength = next - mb;
336 int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
338 s.append(QChar(wc[0]));
340 int r = GetLastError();
341 //check if the character being dropped is the last character
// If the unconvertible byte is the last input byte it is stored in the state
// so that a following call can prepend it.
342 if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) {
343 state->remainingChars = 1;
344 state->state_data[0] = (char)*mb;
// presumably the Windows CE variant (the assert text mentions CE), using the
// C runtime's mbstowcs() instead — TODO confirm against the full file
351 int size = mbstowcs(NULL, mb, length);
// NOTE(review): Q_ASSERT() is handed a string literal here, i.e. a non-null
// pointer, so the condition is always true and this assertion can never fire.
// Q_ASSERT_X(false, ...) or Q_ASSERT(!"...") looks like what was intended.
353 Q_ASSERT("Error in CE TextCodec");
356 wchar_t* ws = new wchar_t[size + 2];
359 size = mbstowcs(ws, mb, length);
360 for (int i=0; i< size; i++)
361 s.append(QChar(ws[i]));
368 QByteArray QWindowsLocalCodec::convertFromUnicode(const QChar *ch, int uclen, ConverterState *) const
373 return QByteArray("");
375 QByteArray mb(4096, 0);
377 while (!(len=WideCharToMultiByte(CP_ACP, 0, (const wchar_t*)ch, uclen,
378 mb.data(), mb.size()-1, 0, &used_def)))
380 int r = GetLastError();
381 if (r == ERROR_INSUFFICIENT_BUFFER) {
382 mb.resize(1+WideCharToMultiByte(CP_ACP, 0,
383 (const wchar_t*)ch, uclen,
384 0, 0, 0, &used_def));
389 qWarning("WideCharToMultiByte: Cannot convert multibyte text (error %d): %s (UTF-8)",
390 r, QString(ch, uclen).toLocal8Bit().data());
400 QByteArray QWindowsLocalCodec::name() const
405 int QWindowsLocalCodec::mibEnum() const
412 /* locale names mostly copied from XFree86 */
413 static const char * const iso8859_2locales[] = {
414 "croatian", "cs", "cs_CS", "cs_CZ","cz", "cz_CZ", "czech", "hr",
415 "hr_HR", "hu", "hu_HU", "hungarian", "pl", "pl_PL", "polish", "ro",
416 "ro_RO", "rumanian", "serbocroatian", "sh", "sh_SP", "sh_YU", "sk",
417 "sk_SK", "sl", "sl_CS", "sl_SI", "slovak", "slovene", "sr_SP", 0 };
419 static const char * const iso8859_3locales[] = {
422 static const char * const iso8859_4locales[] = {
425 static const char * const iso8859_5locales[] = {
426 "mk", "mk_MK", "sp", "sp_YU", 0 };
428 static const char * const cp_1251locales[] = {
429 "be", "be_BY", "bg", "bg_BG", "bulgarian", 0 };
431 static const char * const pt_154locales[] = {
432 "ba_RU", "ky", "ky_KG", "kk", "kk_KZ", 0 };
434 static const char * const iso8859_6locales[] = {
435 "ar_AA", "ar_SA", "arabic", 0 };
437 static const char * const iso8859_7locales[] = {
438 "el", "el_GR", "greek", 0 };
440 static const char * const iso8859_8locales[] = {
441 "hebrew", "he", "he_IL", "iw", "iw_IL", 0 };
443 static const char * const iso8859_9locales[] = {
444 "tr", "tr_TR", "turkish", 0 };
446 static const char * const iso8859_13locales[] = {
447 "lt", "lt_LT", "lv", "lv_LV", 0 };
449 static const char * const iso8859_15locales[] = {
452 "br_FR", "ca_ES", "de", "de_AT", "de_BE", "de_DE", "de_LU", "en_IE",
453 "es", "es_ES", "eu_ES", "fi", "fi_FI", "finnish", "fr", "fr_FR",
454 "fr_BE", "fr_LU", "french", "ga_IE", "gl_ES", "it", "it_IT", "oc_FR",
455 "nl", "nl_BE", "nl_NL", "pt", "pt_PT", "sv_FI", "wa_BE",
458 static const char * const koi8_ulocales[] = {
459 "uk", "uk_UA", "ru_UA", "ukrainian", 0 };
461 static const char * const tis_620locales[] = {
462 "th", "th_TH", "thai", 0 };
464 // static const char * const tcvnlocales[] = {
465 // "vi", "vi_VN", 0 };
467 static bool try_locale_list(const char * const locale[], const QByteArray &lang)
470 for(i=0; locale[i] && lang != locale[i]; i++)
472 return locale[i] != 0;
475 // For the probably_koi8_locales we have to look. The standard says
476 // these are 8859-5, but almost all Russian users use KOI8-R and
477 // incorrectly set $LANG to ru_RU. We'll check tolower() to see what
478 // it thinks ru_RU means.
480 // If you read the history, it seems that many Russians blame ISO and
481 // Perestroika for the confusion.
483 // The real bug is that some programs break if the user specifies
486 static const char * const probably_koi8_rlocales[] = {
487 "ru", "ru_SU", "ru_RU", "russian", 0 };
// Picks the codec for a Russian locale name \a i by probing the C library:
// the locale is switched to \a i and tolower() is applied to two bytes whose
// lower-case forms differ between KOI8-R and ISO 8859-5 (see the table below).
// When the probe is inconclusive, KOI8-R is used and a warning is printed.
489 static QTextCodec * ru_RU_hack(const char * i) {
490 QTextCodec * ru_RU_codec = 0;
492 #if !defined(QT_NO_SETLOCALE)
// NOTE(review): setlocale() called with a non-null locale argument returns the
// name of the locale it has just installed, not the previous one. "origlocale"
// therefore likely holds the new locale and the setlocale() call at the end of
// this function would not restore the caller's LC_CTYPE — confirm; querying
// with setlocale(LC_CTYPE, 0) before switching would capture the old value.
493 QByteArray origlocale(setlocale(LC_CTYPE, i));
495 QByteArray origlocale(i);
497 // unicode koi8r latin5 name
498 // 0x044E 0xC0 0xEE CYRILLIC SMALL LETTER YU
499 // 0x042E 0xE0 0xCE CYRILLIC CAPITAL LETTER YU
// 0xCE is capital YU in ISO 8859-5 and 0xE0 is capital YU in KOI8-R; only the
// encoding the C locale really uses lower-cases its byte to the expected value.
500 int latin5 = tolower(0xCE);
501 int koi8r = tolower(0xE0);
502 if (koi8r == 0xC0 && latin5 != 0xEE) {
503 ru_RU_codec = QTextCodec::codecForName("KOI8-R");
504 } else if (koi8r != 0xC0 && latin5 == 0xEE) {
505 ru_RU_codec = QTextCodec::codecForName("ISO 8859-5");
507 // something else again... let's assume... *throws dice*
508 ru_RU_codec = QTextCodec::codecForName("KOI8-R");
509 qWarning("QTextCodec: Using KOI8-R, probe failed (%02x %02x %s)",
512 #if !defined(QT_NO_SETLOCALE)
513 setlocale(LC_CTYPE, origlocale);
521 #if !defined(Q_OS_WIN32) && !defined(Q_OS_WINCE)
522 static QTextCodec *checkForCodec(const QByteArray &name) {
523 QTextCodec *c = QTextCodec::codecForName(name);
525 const int index = name.indexOf('@');
527 c = QTextCodec::codecForName(name.left(index));
534 /* the next two functions are implicitly thread safe,
535 as they are only called by setup() which uses a mutex.
// Chooses the codec for the current system locale and stores it in the
// file-static localeMapper. The visible code asks for the "System" codec,
// then the nl_langinfo(CODESET) name, then derives a codec from the locale
// name (setlocale() / $LC_ALL / $LC_CTYPE / $LANG) and finally falls back to
// ISO 8859-1. Which of these steps are compiled in depends on preprocessor
// conditions that are only partly visible here.
537 static void setupLocaleMapper()
539 #if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
540 localeMapper = QTextCodec::codecForName("System");
544 localeMapper = QTextCodec::codecForName("System");
547 #if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF)
549 char *charset = nl_langinfo (CODESET);
551 localeMapper = QTextCodec::codecForName(charset);
556 // Very poorly defined and followed standards causes lots of
557 // code to try to get all the cases... This logic is
558 // duplicated in QIconvCodec, so if you change it here, change
561 // Try to determine locale codeset from locale name assigned to
562 // LC_CTYPE category.
564 // First part is getting that locale name. First try setlocale() which
565 // definitely knows it, but since we cannot fully trust it, get ready
566 // to fall back to environment variables.
567 #if !defined(QT_NO_SETLOCALE)
568 const QByteArray ctype = setlocale(LC_CTYPE, 0);
570 const QByteArray ctype;
573 // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
574 // environment variables.
575 QByteArray lang = qgetenv("LC_ALL");
576 if (lang.isEmpty() || lang == "C") {
577 lang = qgetenv("LC_CTYPE");
579 if (lang.isEmpty() || lang == "C") {
580 lang = qgetenv("LANG");
583 // Now try these in order:
584 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
585 // 2. CODESET from lang if it contains a .CODESET part
586 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
588 // 5. check for "@euro"
589 // 6. guess locale from ctype unless ctype is "C"
590 // 7. guess locale from lang
592 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
593 int indexOfDot = ctype.indexOf('.');
594 if (indexOfDot != -1)
595 localeMapper = checkForCodec( ctype.mid(indexOfDot + 1) );
597 // 2. CODESET from lang if it contains a .CODESET part
599 indexOfDot = lang.indexOf('.');
600 if (indexOfDot != -1)
601 localeMapper = checkForCodec( lang.mid(indexOfDot + 1) );
604 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
605 if (!localeMapper && !ctype.isEmpty() && ctype != "C")
606 localeMapper = checkForCodec(ctype);
609 if (!localeMapper && !lang.isEmpty())
610 localeMapper = checkForCodec(lang);
// NOTE(review): as parenthesised, the "!localeMapper" guard only covers the
// ctype test. If lang contains "@euro", a codec already found by the earlier
// steps is overwritten with ISO 8859-15. The step list above suggests
// "!localeMapper && (ctype.contains(...) || lang.contains(...))" was meant —
// confirm before changing.
613 if ((!localeMapper && ctype.contains("@euro")) || lang.contains("@euro"))
614 localeMapper = checkForCodec("ISO 8859-15");
616 // 6. guess locale from ctype unless ctype is "C"
617 // 7. guess locale from lang
// NOTE(review): this ternary yields lang when ctype is non-empty and not "C",
// and ctype otherwise — the reverse of what comment 6 describes. In addition,
// try_by_name is only checked for emptiness below; every try_locale_list()
// call is made with lang. Confirm the intended behavior.
618 const QByteArray &try_by_name = (!ctype.isEmpty() && ctype != "C") ? lang : ctype;
620 // Now do the guessing.
621 if (!lang.isEmpty() && !localeMapper && !try_by_name.isEmpty()) {
622 if (try_locale_list(iso8859_15locales, lang))
623 localeMapper = QTextCodec::codecForName("ISO 8859-15");
624 else if (try_locale_list(iso8859_2locales, lang))
625 localeMapper = QTextCodec::codecForName("ISO 8859-2");
626 else if (try_locale_list(iso8859_3locales, lang))
627 localeMapper = QTextCodec::codecForName("ISO 8859-3");
628 else if (try_locale_list(iso8859_4locales, lang))
629 localeMapper = QTextCodec::codecForName("ISO 8859-4");
630 else if (try_locale_list(iso8859_5locales, lang))
631 localeMapper = QTextCodec::codecForName("ISO 8859-5");
632 else if (try_locale_list(iso8859_6locales, lang))
633 localeMapper = QTextCodec::codecForName("ISO 8859-6");
634 else if (try_locale_list(iso8859_7locales, lang))
635 localeMapper = QTextCodec::codecForName("ISO 8859-7");
636 else if (try_locale_list(iso8859_8locales, lang))
637 localeMapper = QTextCodec::codecForName("ISO 8859-8-I");
638 else if (try_locale_list(iso8859_9locales, lang))
639 localeMapper = QTextCodec::codecForName("ISO 8859-9");
640 else if (try_locale_list(iso8859_13locales, lang))
641 localeMapper = QTextCodec::codecForName("ISO 8859-13");
642 else if (try_locale_list(tis_620locales, lang))
643 localeMapper = QTextCodec::codecForName("ISO 8859-11");
644 else if (try_locale_list(koi8_ulocales, lang))
645 localeMapper = QTextCodec::codecForName("KOI8-U");
646 else if (try_locale_list(cp_1251locales, lang))
647 localeMapper = QTextCodec::codecForName("CP 1251");
648 else if (try_locale_list(pt_154locales, lang))
649 localeMapper = QTextCodec::codecForName("PT 154");
650 else if (try_locale_list(probably_koi8_rlocales, lang))
651 localeMapper = ru_RU_hack(lang);
656 // If everything failed, we default to 8859-1
657 // We could perhaps default to 8859-15.
659 localeMapper = QTextCodec::codecForName("ISO 8859-1");
664 Q_GLOBAL_STATIC_WITH_ARGS(QMutex, textCodecsMutex, (QMutex::Recursive));
667 // textCodecsMutex needs to be locked to enter this function
673 #ifdef Q_DEBUG_TEXTCODEC
674 if (destroying_is_ok)
675 qWarning("QTextCodec: Creating new codec during codec cleanup");
677 all = new QList<QTextCodec*>;
678 // create the cleanup object to cleanup all codecs on exit
679 (void) createQTextCodecCleanup();
682 (void)new QTsciiCodec;
683 for (int i = 0; i < 9; ++i)
684 (void)new QIsciiCodec(i);
686 for (int i = 0; i < QSimpleTextCodec::numSimpleCodecs; ++i)
687 (void)new QSimpleTextCodec(i);
689 # if defined(Q_OS_UNIX) && !defined(QT_BOOTSTRAPPED)
690 // no font codecs when bootstrapping
691 (void)new QFontLaoCodec;
692 (void)new QFontGb2312Codec;
693 (void)new QFontGbkCodec;
694 (void)new QFontGb18030_0Codec;
695 (void)new QFontJis0208Codec;
696 (void)new QFontJis0201Codec;
697 (void)new QFontKsc5601Codec;
698 (void)new QFontBig5hkscsCodec;
699 (void)new QFontBig5Codec;
700 # endif // Q_OS_UNIX && !QT_BOOTSTRAPPED
703 #if !defined(Q_OS_INTEGRITY)
704 # if !defined(QT_BOOTSTRAPPED)
705 // no asian codecs when bootstrapping, sorry
706 (void)new QGb18030Codec;
708 (void)new QGb2312Codec;
709 (void)new QEucJpCodec;
711 (void)new QSjisCodec;
712 (void)new QEucKrCodec;
713 (void)new QCP949Codec;
714 (void)new QBig5Codec;
715 (void)new QBig5hkscsCodec;
716 # endif // !QT_BOOTSTRAPPED
717 #endif // !Q_OS_INTEGRITY
718 #endif // QT_NO_CODECS
720 #if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
721 (void) new QWindowsLocalCodec;
724 (void)new QUtf16Codec;
725 (void)new QUtf16BECodec;
726 (void)new QUtf16LECodec;
727 (void)new QUtf32Codec;
728 (void)new QUtf32BECodec;
729 (void)new QUtf32LECodec;
730 (void)new QLatin15Codec;
731 (void)new QLatin1Codec;
732 (void)new QUtf8Codec;
734 #if !defined(Q_OS_INTEGRITY)
735 #if defined(Q_OS_UNIX) && !defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
736 // QIconvCodec depends on the UTF-16 codec, so it needs to be created last
737 (void) new QIconvCodec();
746 \enum QTextCodec::ConversionFlag
748 \value DefaultConversion No flag is set.
749 \value ConvertInvalidToNull If this flag is set, each invalid input
750 character is output as a null character.
751 \value IgnoreHeader Ignore any Unicode byte-order mark and don't generate any.
753 \omitvalue FreeFunction
757 \fn QTextCodec::ConverterState::ConverterState(ConversionFlags flags)
759 Constructs a ConverterState object initialized with the given \a flags.
763 Destroys the ConverterState object.
765 QTextCodec::ConverterState::~ConverterState()
767 if (flags & FreeFunction)
768 (QTextCodecUnalignedPointer::decode(state_data))(this);
775 \brief The QTextCodec class provides conversions between text encodings.
779 Qt uses Unicode to store, draw and manipulate strings. In many
780 situations you may wish to deal with data that uses a different
781 encoding. For example, most Japanese documents are still stored
782 in Shift-JIS or ISO 2022-JP, while Russian users often have their
783 documents in KOI8-R or Windows-1251.
785 Qt provides a set of QTextCodec classes to help with converting
786 non-Unicode formats to and from Unicode. You can also create your
789 The supported encodings are:
793 \o \l{Big5 Text Codec}{Big5}
794 \o \l{Big5-HKSCS Text Codec}{Big5-HKSCS}
796 \o \l{EUC-JP Text Codec}{EUC-JP}
797 \o \l{EUC-KR Text Codec}{EUC-KR}
798 \o \l{GBK Text Codec}{GB18030-0}
802 \o \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP}
805 \o Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml
812 \o \l{Shift-JIS Text Codec}{Shift-JIS}
814 \o \l{TSCII Text Codec}{TSCII}
822 \o Windows-1250 to 1258
826 QTextCodecs can be used as follows to convert some locally encoded
827 string to Unicode. Suppose you have some string encoded in Russian
828 KOI8-R encoding, and want to convert it to Unicode. The simple way
829 to do it is like this:
831 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 0
833 After this, \c string holds the text converted to Unicode.
834 Converting a string from Unicode to the local encoding is just as
837 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 1
839 To read or write files in various encodings, use QTextStream and
840 its \l{QTextStream::setCodec()}{setCodec()} function. See the
841 \l{tools/codecs}{Codecs} example for an application of QTextCodec
844 Some care must be taken when trying to convert the data in chunks,
845 for example, when receiving it over a network. In such cases it is
846 possible that a multi-byte character will be split over two
847 chunks. At best this might result in the loss of a character and
848 at worst cause the entire conversion to fail.
850 The approach to use in these situations is to create a QTextDecoder
851 object for the codec and use this QTextDecoder for the whole
852 decoding process, as shown below:
854 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 2
856 The QTextDecoder object maintains state between chunks and therefore
857 works correctly even if a multi-byte character is split between
860 \section1 Creating Your Own Codec Class
862 Support for new text encodings can be added to Qt by creating
863 QTextCodec subclasses.
865 The pure virtual functions describe the encoder to the system and
866 the coder is used as required in the different text file formats
867 supported by QTextStream, and under X11, for the locale-specific
868 character input and output.
870 To add support for another encoding to Qt, make a subclass of
871 QTextCodec and implement the functions listed in the table below.
874 \header \o Function \o Description
877 \o Returns the official name for the encoding. If the
878 encoding is listed in the
879 \l{IANA character-sets encoding file}, the name
880 should be the preferred MIME name for the encoding.
883 \o Returns a list of alternative names for the encoding.
884 QTextCodec provides a default implementation that returns
885 an empty list. For example, "ISO-8859-1" has "latin1",
886 "CP819", "IBM819", and "iso-ir-100" as aliases.
889 \o Returns the MIB enum for the encoding if it is listed in
890 the \l{IANA character-sets encoding file}.
892 \row \o convertToUnicode()
893 \o Converts an 8-bit character string to Unicode.
895 \row \o convertFromUnicode()
896 \o Converts a Unicode string to an 8-bit character string.
899 \sa QTextStream, QTextDecoder, QTextEncoder, {Codecs Example}
903 Constructs a QTextCodec, and gives it the highest precedence. The
904 QTextCodec should always be constructed on the heap (i.e. with \c
905 new). Qt takes ownership and will delete it when the application
908 QTextCodec::QTextCodec()
911 QMutexLocker locker(textCodecsMutex());
921 Destroys the QTextCodec. Note that you should not delete codecs
922 yourself: once created they become Qt's responsibility.
924 QTextCodec::~QTextCodec()
926 #ifdef Q_DEBUG_TEXTCODEC
927 if (!destroying_is_ok)
928 qWarning("QTextCodec::~QTextCodec: Called by application");
932 QMutexLocker locker(textCodecsMutex());
934 all->removeAll(this);
935 QTextCodecCache *cache = qTextCodecCache();
942 \fn QTextCodec *QTextCodec::codecForName(const char *name)
944 Searches all installed QTextCodec objects and returns the one
945 which best matches \a name; the match is case-insensitive. Returns
946 0 if no codec matching the name \a name could be found.
950 Searches all installed QTextCodec objects and returns the one
951 which best matches \a name; the match is case-insensitive. Returns
952 0 if no codec matching the name \a name could be found.
954 QTextCodec *QTextCodec::codecForName(const QByteArray &name)
960 QMutexLocker locker(textCodecsMutex());
967 QTextCodecCache *cache = qTextCodecCache();
970 codec = cache->value(name);
975 for (int i = 0; i < all->size(); ++i) {
976 QTextCodec *cursor = all->at(i);
977 if (nameMatch(cursor->name(), name)) {
979 cache->insert(name, cursor);
982 QList<QByteArray> aliases = cursor->aliases();
983 for (int y = 0; y < aliases.size(); ++y)
984 if (nameMatch(aliases.at(y), name)) {
986 cache->insert(name, cursor);
996 Returns the QTextCodec which matches the \link
997 QTextCodec::mibEnum() MIBenum\endlink \a mib.
999 QTextCodec* QTextCodec::codecForMib(int mib)
1001 #ifndef QT_NO_THREAD
// Held while the shared codec list and cache are accessed below.
1002 QMutexLocker locker(textCodecsMutex());
// MIB lookups use the same cache object as codecForName(); the "MIB: " prefix
// presumably keeps these keys from colliding with codec names.
1009 QByteArray key = "MIB: " + QByteArray::number(mib);
1010 QTextCodecCache *cache = qTextCodecCache();
1013 codec = cache->value(key);
// NOTE(review): this iterator is shadowed by the loop's own "int i" and is not
// used in the lines visible here — looks like a leftover that can be removed.
1018 QList<QTextCodec*>::ConstIterator i;
// Linear scan over all registered codecs; a match is remembered in the cache.
1019 for (int i = 0; i < all->size(); ++i) {
1020 QTextCodec *cursor = all->at(i);
1021 if (cursor->mibEnum() == mib) {
1023 cache->insert(key, cursor);
1032 Returns the list of all available codecs, by name. Call
1033 QTextCodec::codecForName() to obtain the QTextCodec for the name.
1035 The list may contain many mentions of the same codec
1036 if the codec has aliases.
1038 \sa availableMibs(), name(), aliases()
1040 QList<QByteArray> QTextCodec::availableCodecs()
1042 #ifndef QT_NO_THREAD
1043 QMutexLocker locker(textCodecsMutex());
1047 QList<QByteArray> codecs;
1052 for (int i = 0; i < all->size(); ++i) {
1053 codecs += all->at(i)->name();
1054 codecs += all->at(i)->aliases();
1057 #ifndef QT_NO_THREAD
1065 Returns the list of MIBs for all available codecs. Call
1066 QTextCodec::codecForMib() to obtain the QTextCodec for the MIB.
1068 \sa availableCodecs(), mibEnum()
1070 QList<int> QTextCodec::availableMibs()
1072 #ifndef QT_NO_THREAD
1073 QMutexLocker locker(textCodecsMutex());
1082 for (int i = 0; i < all->size(); ++i)
1083 codecs += all->at(i)->mibEnum();
1085 #ifndef QT_NO_THREAD
1093 Set the codec to \a c; this will be returned by
1094 codecForLocale(). If \a c is a null pointer, the codec is reset to
1097 This might be needed for some applications that want to use their
1098 own mechanism for setting the locale.
1100 \sa codecForLocale()
1102 void QTextCodec::setCodecForLocale(QTextCodec *c)
1104 #ifndef QT_NO_THREAD
1105 QMutexLocker locker(textCodecsMutex());
1109 setupLocaleMapper();
1113 Returns a pointer to the codec most suitable for this locale.
1115 On Windows, the codec will be based on a system locale. On Unix
1116 systems, starting with Qt 4.2, the codec will be using the \e
1117 iconv library. Note that in both cases the codec's name will be
1121 QTextCodec* QTextCodec::codecForLocale()
1127 return localeMapper;
1129 #ifndef QT_NO_THREAD
1130 QMutexLocker locker(textCodecsMutex());
1134 return localeMapper;
1139 \fn QByteArray QTextCodec::name() const
1141 QTextCodec subclasses must reimplement this function. It returns
1142 the name of the encoding supported by the subclass.
1144 If the codec is registered as a character set in the
1145 \l{IANA character-sets encoding file} this method should
1146 return the preferred mime name for the codec if defined,
1151 \fn int QTextCodec::mibEnum() const
1153 Subclasses of QTextCodec must reimplement this function. It
1154 returns the MIBenum (see \l{IANA character-sets encoding file}
1155 for more information). It is important that each QTextCodec
1156 subclass returns the correct unique value for this function.
1160 Subclasses can return a number of aliases for the codec in question.
1162 Standard aliases for codecs can be found in the
1163 \l{IANA character-sets encoding file}.
1165 QList<QByteArray> QTextCodec::aliases() const
1167 return QList<QByteArray>();
1171 \fn QString QTextCodec::convertToUnicode(const char *chars, int len,
1172 ConverterState *state) const
1174 QTextCodec subclasses must reimplement this function.
1176 Converts the first \a len characters of \a chars from the
1177 encoding of the subclass to Unicode, and returns the result in a
1180 \a state can be 0, in which case the conversion is stateless and
1181 default conversion rules should be used. If state is not 0, the
1182 codec should save the state after the conversion in \a state, and
1183 adjust the remainingChars and invalidChars members of the struct.
1187 \fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number,
1188 ConverterState *state) const
1190 QTextCodec subclasses must reimplement this function.
1192 Converts the first \a number of characters from the \a input array
1193 from Unicode to the encoding of the subclass, and returns the result
1196 \a state can be 0 in which case the conversion is stateless and
1197 default conversion rules should be used. If state is not 0, the
1198 codec should save the state after the conversion in \a state, and
1199 adjust the remainingChars and invalidChars members of the struct.
1203 Creates a QTextDecoder with the specified \a flags to decode chunks
1204 of \c{char *} data to create chunks of Unicode data.
1206 The caller is responsible for deleting the returned object.
1210 QTextDecoder* QTextCodec::makeDecoder(QTextCodec::ConversionFlags flags) const
1212 return new QTextDecoder(this, flags);
1216 Creates a QTextEncoder with the specified \a flags to encode chunks
1217 of Unicode data as \c{char *} data.
1219 The caller is responsible for deleting the returned object.
1223 QTextEncoder* QTextCodec::makeEncoder(QTextCodec::ConversionFlags flags) const
1225 return new QTextEncoder(this, flags);
1229 \fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number,
1230 ConverterState *state) const
1232 Converts the first \a number of characters from the \a input array
1233 from Unicode to the encoding of this codec, and returns the result
1236 The \a state of the converter used is updated.
1240 Converts \a str from Unicode to the encoding of this codec, and
1241 returns the result in a QByteArray.
1243 QByteArray QTextCodec::fromUnicode(const QString& str) const
1245 return convertFromUnicode(str.constData(), str.length(), 0);
1249 \fn QString QTextCodec::toUnicode(const char *input, int size,
1250 ConverterState *state) const
1252 Converts the first \a size characters from the \a input from the
1253 encoding of this codec to Unicode, and returns the result in a
1256 The \a state of the converter used is updated.
1260 Converts \a a from the encoding of this codec to Unicode, and
1261 returns the result in a QString.
1263 QString QTextCodec::toUnicode(const QByteArray& a) const
1265 return convertToUnicode(a.constData(), a.length(), 0);
1269 Returns true if the Unicode character \a ch can be fully encoded
1270 with this codec; otherwise returns false.
1272 bool QTextCodec::canEncode(QChar ch) const
1274 ConverterState state;
1275 state.flags = ConvertInvalidToNull;
1276 convertFromUnicode(&ch, 1, &state);
1277 return (state.invalidChars == 0);
1283 \a s contains the string being tested for encode-ability.
1285 bool QTextCodec::canEncode(const QString& s) const
1287 ConverterState state;
1288 state.flags = ConvertInvalidToNull;
1289 convertFromUnicode(s.constData(), s.length(), &state);
1290 return (state.invalidChars == 0);
1296 \a chars contains the source characters.
1298 QString QTextCodec::toUnicode(const char *chars) const
1300 int len = qstrlen(chars);
1301 return convertToUnicode(chars, len, 0);
1307 \brief The QTextEncoder class provides a state-based encoder.
1311 A text encoder converts text from Unicode into an encoded text format
1312 using a specific codec.
1314 The encoder converts Unicode into another format, remembering any
1315 state that is required between calls.
1317 \sa QTextCodec::makeEncoder(), QTextDecoder
1321 \fn QTextEncoder::QTextEncoder(const QTextCodec *codec)
1323 Constructs a text encoder for the given \a codec.
1327 Constructs a text encoder for the given \a codec and conversion \a flags.
1331 QTextEncoder::QTextEncoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
1334 state.flags = flags;
1338 Destroys the encoder.
1340 QTextEncoder::~QTextEncoder()
1346 Determines whether the encoder encountered a failure while encoding the input. If
1347 an error was encountered, the produced result is undefined, and is converted according
1348 to the conversion flags.
1350 bool QTextEncoder::hasFailure() const
1352 return state.invalidChars != 0;
1356 Converts the Unicode string \a str into an encoded QByteArray.
1358 QByteArray QTextEncoder::fromUnicode(const QString& str)
1360 QByteArray result = c->fromUnicode(str.constData(), str.length(), &state);
1367 Converts \a len characters (not bytes) from \a uc, and returns the
1368 result in a QByteArray.
1370 QByteArray QTextEncoder::fromUnicode(const QChar *uc, int len)
1372 QByteArray result = c->fromUnicode(uc, len, &state);
1378 \brief The QTextDecoder class provides a state-based decoder.
1382 A text decoder converts text from an encoded text format into Unicode
1383 using a specific codec.
1385 The decoder converts text in this format into Unicode, remembering any
1386 state that is required between calls.
1388 \sa QTextCodec::makeDecoder(), QTextEncoder
1392 \fn QTextDecoder::QTextDecoder(const QTextCodec *codec)
1394 Constructs a text decoder for the given \a codec.
1398 Constructs a text decoder for the given \a codec and conversion \a flags.
1403 QTextDecoder::QTextDecoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
1406 state.flags = flags;
1410 Destroys the decoder.
1412 QTextDecoder::~QTextDecoder()
1417 \fn QString QTextDecoder::toUnicode(const char *chars, int len)
1419 Converts the first \a len bytes in \a chars to Unicode, returning
1422 If not all characters are used (e.g. if only part of a multi-byte
1423 encoding is at the end of the characters), the decoder remembers
1424 enough state to continue with the next call to this function.
1426 QString QTextDecoder::toUnicode(const char *chars, int len)
1428 return c->toUnicode(chars, len, &state);
1434 The converted string is returned in \a target.
1436 void QTextDecoder::toUnicode(QString *target, const char *chars, int len)
1439 switch (c->mibEnum()) {
1441 static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state);
1444 target->resize(len);
1445 ushort *data = (ushort*)target->data();
1446 for (int i = len; i >=0; --i)
1447 data[i] = (uchar) chars[i];
1450 *target = c->toUnicode(chars, len, &state);
1458 Converts the bytes in the byte array specified by \a ba to Unicode
1459 and returns the result.
1461 QString QTextDecoder::toUnicode(const QByteArray &ba)
1463 return c->toUnicode(ba.constData(), ba.length(), &state);
1468 \fn QTextCodec* QTextCodec::codecForTr()
1470 Returns the codec used by QObject::tr() on its argument. If this
1471 function returns 0 (the default), tr() assumes Latin-1.
1477 \fn void QTextCodec::setCodecForTr(QTextCodec *c)
1480 Sets the codec used by QObject::tr() on its argument to \a c. If
1481 \a c is 0 (the default), tr() assumes Latin-1.
1483 If the literal quoted text in the program is not in the Latin-1
1484 encoding, this function can be used to set the appropriate
1485 encoding. For example, software developed by Korean programmers
1486 might use eucKR for all the text in the program, in which case the
1487 main() function might look like this:
1489 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 3
1491 Note that this is not the way to select the encoding that the \e
1492 user has chosen. For example, to convert an application containing
1493 literal English strings to Korean, all that is needed is for the
1494 English strings to be passed through tr() and for translation
1495 files to be loaded. For details of internationalization, see
1496 \l{Internationalization with Qt}.
1498 \sa codecForTr(), setCodecForCStrings()
1503 \fn QTextCodec* QTextCodec::codecForCStrings()
1505 Returns the codec used by QString to convert to and from \c{const
1506 char *} and QByteArrays. If this function returns 0 (the default),
1507 QString assumes Latin-1.
1509 \sa setCodecForCStrings()
1513 \fn void QTextCodec::setCodecForCStrings(QTextCodec *codec)
1516 Sets the codec used by QString to convert to and from \c{const
1517 char *} and QByteArrays. If the \a codec is 0 (the default),
1518 QString assumes Latin-1.
1520 \warning Some codecs do not preserve the characters in the ASCII
1521 range (0x00 to 0x7F). For example, the Japanese Shift-JIS
1522 encoding maps the backslash character (0x5C) to the Yen
1523 character. To avoid undesirable side-effects, we recommend
1524 avoiding such codecs with setCodecForCStrings().
1526 \sa codecForCStrings(), setCodecForTr()
1532 Tries to detect the encoding of the provided snippet of HTML in
1533 the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1534 and the content-type meta header and returns a QTextCodec instance
1535 that is capable of decoding the HTML to Unicode. If the codec
1536 cannot be detected from the content provided, \a defaultCodec is
1539 \sa codecForUtfText()
1541 QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec)
1543 // determine charset
1547 c = QTextCodec::codecForUtfText(ba, c);
1549 QByteArray header = ba.left(512).toLower();
1550 if ((pos = header.indexOf("http-equiv=")) != -1) {
1551 if ((pos = header.lastIndexOf("meta ", pos)) != -1) {
1552 pos = header.indexOf("charset=", pos) + int(strlen("charset="));
1554 int pos2 = header.indexOf('\"', pos+1);
1555 QByteArray cs = header.mid(pos, pos2-pos);
1556 // qDebug("found charset: %s", cs.data());
1557 c = QTextCodec::codecForName(cs);
1571 Tries to detect the encoding of the provided snippet of HTML in
1572 the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1573 and the content-type meta header and returns a QTextCodec instance
1574 that is capable of decoding the HTML to Unicode. If the codec cannot
1575 be detected, this overload returns a Latin-1 QTextCodec.
1577 QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba)
1579 return codecForHtml(ba, QTextCodec::codecForMib(/*Latin 1*/ 4));
1585 Tries to detect the encoding of the provided snippet \a ba by
1586 using the BOM (Byte Order Mark) and returns a QTextCodec instance
1587 that is capable of decoding the text to Unicode. If the codec
1588 cannot be detected from the content provided, \a defaultCodec is
1593 QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec)
1595 const int arraySize = ba.size();
1597 if (arraySize > 3) {
1598 if ((uchar)ba[0] == 0x00
1599 && (uchar)ba[1] == 0x00
1600 && (uchar)ba[2] == 0xFE
1601 && (uchar)ba[3] == 0xFF)
1602 return QTextCodec::codecForMib(1018); // utf-32 be
1603 else if ((uchar)ba[0] == 0xFF
1604 && (uchar)ba[1] == 0xFE
1605 && (uchar)ba[2] == 0x00
1606 && (uchar)ba[3] == 0x00)
1607 return QTextCodec::codecForMib(1019); // utf-32 le
1611 return defaultCodec;
1612 if ((uchar)ba[0] == 0xfe && (uchar)ba[1] == 0xff)
1613 return QTextCodec::codecForMib(1013); // utf16 be
1614 else if ((uchar)ba[0] == 0xff && (uchar)ba[1] == 0xfe)
1615 return QTextCodec::codecForMib(1014); // utf16 le
1618 return defaultCodec;
1619 if ((uchar)ba[0] == 0xef
1620 && (uchar)ba[1] == 0xbb
1621 && (uchar)ba[2] == 0xbf)
1622 return QTextCodec::codecForMib(106); // utf-8
1624 return defaultCodec;
1630 Tries to detect the encoding of the provided snippet \a ba by
1631 using the BOM (Byte Order Mark) and returns a QTextCodec instance
1632 that is capable of decoding the text to Unicode. If the codec
1633 cannot be detected, this overload returns a Latin-1 QTextCodec.
1637 QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba)
1639 return codecForUtfText(ba, QTextCodec::codecForMib(/*Latin 1*/ 4));
1645 Determines whether the decoder encountered a failure while decoding the input. If
1646 an error was encountered, the produced result is undefined, and is converted according
1647 to the conversion flags.
1649 bool QTextDecoder::hasFailure() const
1651 return state.invalidChars != 0;
1656 #endif // QT_NO_TEXTCODEC