From 15eb66a7077fd06bb510222d35a672b42ddcb50b Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marc-Andr=C3=A9=20Moreau?= Date: Sun, 16 Dec 2012 20:44:40 -0500 Subject: [PATCH] libwinpr-crt: document usage of unicode functions --- winpr/libwinpr/crt/unicode.c | 139 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 134 insertions(+), 5 deletions(-) diff --git a/winpr/libwinpr/crt/unicode.c b/winpr/libwinpr/crt/unicode.c index be2d87c..5c96aa4 100644 --- a/winpr/libwinpr/crt/unicode.c +++ b/winpr/libwinpr/crt/unicode.c @@ -31,14 +31,113 @@ #include "utf.h" -/* - * Advanced String Techniques in C++ - Part I: Unicode - * http://www.flipcode.com/archives/Advanced_String_Techniques_in_C-Part_I_Unicode.shtml +/** + * Notes on cross-platform Unicode portability: + * + * Unicode has many possible Unicode Transformation Format (UTF) encodings, + * where some of the most commonly used are UTF-8, UTF-16 and sometimes UTF-32. + * + * The number in the UTF encoding name (8, 16, 32) refers to the number of bits + * per code unit. A code unit is the minimal bit combination that can represent + * a unit of encoded text in the given encoding. For instance, UTF-8 encodes + * the English alphabet using 8 bits (or one byte) each, just like in ASCII. + * + * However, the total number of code points (values in the Unicode codespace) + * only fits completely within 32 bits. This means that for UTF-8 and UTF-16, + * more than one code unit may be required to fully encode a specific value. + * UTF-8 and UTF-16 are variable-width encodings, while UTF-32 is fixed-width. + * + * UTF-8 has the advantage of being backwards compatible with ASCII, and is + * one of the most commonly used Unicode encodings. + * + * UTF-16 is used everywhere in the Windows API. The strategy employed by + * Microsoft to provide backwards compatibility in their API was to create + * an ANSI and a Unicode version of the same function, ending with A (ANSI) + * and W (Wide character, or UTF-16 Unicode). 
In headers, the original + * function name is replaced by a macro that expands to either the ANSI + * or Unicode version based on the definition of the _UNICODE macro. + * + * UTF-32 has the advantage of being fixed width, but wastes a lot of space + * for English text (4x more than UTF-8, 2x more than UTF-16). + * + * In C, wide character strings are often defined with the wchar_t type. + * Many functions are provided to deal with those wide character strings, + * such as wcslen (strlen equivalent) or wprintf (printf equivalent). + * + * This may lead to some confusion, since many of these functions exist + * on both Windows and Linux, but they are *not* the same! + * + * This sample hello world is a good example: + * + * #include <wchar.h> + * + * wchar_t hello[] = L"Hello, World!\n"; + * + * int main(int argc, char** argv) + * { + * wprintf(hello); + * wprintf(L"sizeof(wchar_t): %d\n", (int) sizeof(wchar_t)); + * return 0; + * } + * + * There is a reason why the sample prints the size of the wchar_t type: + * On Windows, wchar_t is two bytes (UTF-16), while on most other systems + * it is 4 bytes (UTF-32). This means that if you write code on Windows and + * use L"" to define a string which is meant to be UTF-16 and not UTF-32, + * you will have a little surprise when trying to port your code to Linux. + * + * Since the Windows API uses UTF-16, not UTF-32, WinPR defines the WCHAR + * type to always be 2 bytes long and uses it instead of wchar_t. Do not + * ever use wchar_t with WinPR unless you know what you are doing. + * + * As for L"", it is unfortunately unusable in a portable way, unless a + * special option is passed to GCC to define wchar_t as being two bytes. + * For string constants that must be UTF-16, it is a pain, but they can + * be defined in a portable way like this: + * + * WCHAR hello[] = { 'H','e','l','l','o','\0' }; + * + * Such strings cannot be passed to native functions like wcslen(), which + * may expect a different wchar_t size. 
For this reason, WinPR provides + * _wcslen, which expects UTF-16 WCHAR strings on all platforms. + * */ /* - * Conversion *to* Unicode + * Conversion to Unicode (UTF-16) * MultiByteToWideChar: http://msdn.microsoft.com/en-us/library/windows/desktop/dd319072/ + * + * cbMultiByte is an input size in bytes (BYTE) + * cchWideChar is an output size in wide characters (WCHAR) + * + * Null-terminated UTF-8 strings: + * + * cchWideChar *cannot* be assumed to be cbMultiByte since UTF-8 is variable-width! + * + * Instead, obtain the required cchWideChar output size like this: + * cchWideChar = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR) lpMultiByteStr, -1, NULL, 0); + * + * A value of -1 for cbMultiByte indicates that the input string is null-terminated, + * and the null terminator *will* be processed. The size returned by MultiByteToWideChar + * will therefore include the null terminator. Equivalent behavior can be obtained by + * computing the length in bytes of the input buffer, including the null terminator: + * + * cbMultiByte = strlen((char*) lpMultiByteStr) + 1; + * + * An output buffer of the proper size can then be allocated: + * + * lpWideCharStr = (LPWSTR) malloc(cchWideChar * sizeof(WCHAR)); + * + * Since cchWideChar is an output size in wide characters, the actual buffer size is: + * (cchWideChar * sizeof(WCHAR)) or (cchWideChar * 2) + * + * Finally, perform the conversion: + * + * cchWideChar = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR) lpMultiByteStr, -1, lpWideCharStr, cchWideChar); + * + * The value returned by MultiByteToWideChar corresponds to the number of wide characters written + * to the output buffer, and should match the value obtained on the first call to MultiByteToWideChar. 
+ * */ int MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, @@ -91,8 +190,38 @@ int MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, } /* - * Conversion *from* Unicode + * Conversion from Unicode (UTF-16) * WideCharToMultiByte: http://msdn.microsoft.com/en-us/library/windows/desktop/dd374130/ + * + * cchWideChar is an input size in wide characters (WCHAR) + * cbMultiByte is an output size in bytes (BYTE) + * + * Null-terminated UTF-16 strings: + * + * cbMultiByte *cannot* be assumed to be cchWideChar since UTF-8 is variable-width! + * + * Instead, obtain the required cbMultiByte output size like this: + * cbMultiByte = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) lpWideCharStr, -1, NULL, 0, NULL, NULL); + * + * A value of -1 for cchWideChar indicates that the input string is null-terminated, + * and the null terminator *will* be processed. The size returned by WideCharToMultiByte + * will therefore include the null terminator. Equivalent behavior can be obtained by + * computing the length in wide characters of the input string, including the null terminator: + * + * cchWideChar = _wcslen((WCHAR*) lpWideCharStr) + 1; + * + * An output buffer of the proper size can then be allocated: + * lpMultiByteStr = (LPSTR) malloc(cbMultiByte); + * + * Since cbMultiByte is an output size in bytes, it is the same as the buffer size. + * + * Finally, perform the conversion: + * + * cbMultiByte = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) lpWideCharStr, -1, lpMultiByteStr, cbMultiByte, NULL, NULL); + * + * The value returned by WideCharToMultiByte corresponds to the number of bytes written + * to the output buffer, and should match the value obtained on the first call to WideCharToMultiByte. + * */ int WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar, -- 2.7.4