From 908d3ef0d456ef9e3ade05984532ecab66ccbbd1 Mon Sep 17 00:00:00 2001 From: Tor Lillqvist Date: Thu, 20 Mar 2008 02:40:23 +0000 Subject: [PATCH] Bug 523298 - win_iconv can't convert from UTF-8 to GB18030 (or vice versa) 2008-03-19 Tor Lillqvist Bug 523298 - win_iconv can't convert from UTF-8 to GB18030 (or vice versa) * glib/win_iconv.c: Fixes for code page 54936 (GB18030) (mbtowc_flags): New function. Check if a code page is one of those for which the dwFlags parameter to MultiByteToWideChar() must be zero. Return 0 or MB_ERR_INVALID_CHARS. (mbcs_mblen): New function for multi-byte (more than two bytes for some characters) code pages. Only handles 54936 for now. (make_csconv): Use it for 54936. (kernel_mbtowc): Use mbtowc_flags(). svn path=/trunk/; revision=6741 --- ChangeLog | 13 ++++++++++++ glib/win_iconv.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 71 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 28b5af0..6de2a0c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +2008-03-19 Tor Lillqvist + + Bug 523298 - win_iconv can't convert from UTF-8 to GB18030 (or vice versa) + + * glib/win_iconv.c: Fixes for code page 54936 (GB18030) + (mbtowc_flags): New function. Check if a code page is one of those + for which the dwFlags parameter to MultiByteToWideChar() must be + zero. Return 0 or MB_ERR_INVALID_CHARS. + (mbcs_mblen): New function for multi-byte (more than two bytes for + some characters) code pages. Only handles 54936 for now. + (make_csconv): Use it for 54936. + (kernel_mbtowc): Use mbtowc_flags(). + 2008-03-18 Sebastian Dröge Bug 522292 - Gives warnings in glib/gutils.h with GCC in C99 mode diff --git a/glib/win_iconv.c b/glib/win_iconv.c index acc62d5..7ed272b 100644 --- a/glib/win_iconv.c +++ b/glib/win_iconv.c @@ -137,6 +137,7 @@ static int name_to_codepage(const char *name); static uint utf16_to_ucs4(const ushort *wbuf); static void ucs4_to_utf16(uint wc, ushort *wbuf, int *wbufsize); static int is_unicode(int codepage); +static int mbtowc_flags(int codepage); static int must_use_null_useddefaultchar(int codepage); static void check_utf_bom(rec_iconv_t *cd, ushort *wbuf, int *wbufsize); static char *strrstr(const char *str, const char *token); @@ -152,6 +153,7 @@ static HMODULE hlastdll; /* keep dll loaded for efficiency (unnecessary?) */ static int sbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize); static int dbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize); +static int mbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize); static int utf8_mblen(csconv_t *cv, const uchar *buf, int bufsize); static int eucjp_mblen(csconv_t *cv, const uchar *buf, int bufsize); @@ -925,15 +927,16 @@ make_csconv(const char *_name) cv.mblen = eucjp_mblen; } else if (IsValidCodePage(cv.codepage) - && GetCPInfoEx(cv.codepage, 0, &cpinfoex) != 0 - && (cpinfoex.MaxCharSize == 1 || cpinfoex.MaxCharSize == 2)) + && GetCPInfoEx(cv.codepage, 0, &cpinfoex) != 0) { cv.mbtowc = kernel_mbtowc; cv.wctomb = kernel_wctomb; if (cpinfoex.MaxCharSize == 1) cv.mblen = sbcs_mblen; - else + else if (cpinfoex.MaxCharSize == 2) cv.mblen = dbcs_mblen; + else + cv.mblen = mbcs_mblen; } else { @@ -1013,6 +1016,35 @@ is_unicode(int codepage) codepage == 65000 || codepage == 65001); } +/* + * Check if codepage is one of those for which the dwFlags parameter + * to MultiByteToWideChar() must be zero. Return zero or + * MB_ERR_INVALID_CHARS. The docs in Platform SDK for for Windows + * Server 2003 R2 claims that also codepage 65001 is one of these, but + * that doesn't seem to be the case. The MSDN docs for MSVS2008 leave + * out 65001 (UTF-8), and that indeed seems to be the case on XP, it + * works fine to pass MB_ERR_INVALID_CHARS in dwFlags when converting + * from UTF-8. + */ +static int +mbtowc_flags(int codepage) +{ + return (codepage == 50220 || codepage == 50221 || + codepage == 50222 || codepage == 50225 || + codepage == 50227 || codepage == 50229 || + codepage == 52936 || codepage == 54936 || + (codepage >= 57002 && codepage <= 57011) || + codepage == 65000 || codepage == 42) ? 0 : MB_ERR_INVALID_CHARS; +} + +/* + * Check if codepage is one those for which the lpUsedDefaultChar + * parameter to WideCharToMultiByte() must be NULL. The docs in + * Platform SDK for for Windows Server 2003 R2 claims that this is the + * list below, while the MSDN docs for MSVS2008 claim that it is only + * for 65000 (UTF-7) and 65001 (UTF-8). This time the earlier Platform + * SDK seems to be correct, at least for XP. + */ static int must_use_null_useddefaultchar(int codepage) { @@ -1221,6 +1253,28 @@ dbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize) } static int +mbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize) +{ + int len = 0; + + if (cv->codepage == 54936) { + if (buf[0] <= 0x7F) len = 1; + else if (buf[0] >= 0x81 && buf[0] <= 0xFE && + bufsize >= 2 && + ((buf[1] >= 0x40 && buf[1] <= 0x7E) || + (buf[1] >= 0x80 && buf[1] <= 0xFE))) len = 2; + else if (buf[0] >= 0x81 && buf[0] <= 0xFE && + bufsize >= 4 && + buf[1] >= 0x30 && buf[1] <= 0x39) len = 4; + else + return_error(EINVAL); + return len; + } + else + return_error(EINVAL); +} + +static int utf8_mblen(csconv_t *cv, const uchar *buf, int bufsize) { int len = 0; @@ -1280,7 +1334,7 @@ kernel_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wb len = cv->mblen(cv, buf, bufsize); if (len == -1) return -1; - *wbufsize = MultiByteToWideChar(cv->codepage, MB_ERR_INVALID_CHARS, + *wbufsize = MultiByteToWideChar(cv->codepage, mbtowc_flags (cv->codepage), (const char *)buf, len, (wchar_t *)wbuf, *wbufsize); if (*wbufsize == 0) return_error(EILSEQ); -- 2.7.4