// given an ANSI String, copy it into a wide buffer.
// be careful about scoping when using this macro!
//
-// how to use the below two macros:
-//
-// ...
-// LPSTR pszA;
-// pszA = MyGetAnsiStringRoutine();
-// MAKE_WIDEPTR_FROMANSI(pwsz, pszA);
-// MyUseWideStringRoutine(pwsz);
-// ...
-//
// similarly for MAKE_ANSIPTR_FROMWIDE. note that the first param does not
// have to be declared, and no clean up is required.
//
#define MAKE_TRANSLATIONFAILED ThrowWin32(ERROR_NO_UNICODE_TRANSLATION)
#endif
-// This version throws on conversion errors (ie, no best fit character
-// mapping to characters that look similar, and no use of the default char
-// ('?') when printing out unrepresentable characters. Use this method for
-// most development in the EE, especially anything like metadata or class
-// names. See the BESTFIT version if you're printing out info to the console.
-#define MAKE_MULTIBYTE_FROMWIDE(ptrname, widestr, codepage) \
- int __l##ptrname = (int)u16_strlen(widestr); \
- if (__l##ptrname > MAKE_MAX_LENGTH) \
- MAKE_TOOLONGACTION; \
- __l##ptrname = (int)((__l##ptrname + 1) * 2 * sizeof(char)); \
- CQuickBytes __CQuickBytes##ptrname; \
- __CQuickBytes##ptrname.AllocThrows(__l##ptrname); \
- BOOL __b##ptrname; \
- DWORD __cBytes##ptrname = WszWideCharToMultiByte(codepage, WC_NO_BEST_FIT_CHARS, widestr, -1, (LPSTR)__CQuickBytes##ptrname.Ptr(), __l##ptrname, NULL, &__b##ptrname); \
- if (__b##ptrname || (__cBytes##ptrname == 0 && (widestr[0] != W('\0')))) { \
- MAKE_TRANSLATIONFAILED; \
- } \
- LPSTR ptrname = (LPSTR)__CQuickBytes##ptrname.Ptr()
-
// This version does best fit character mapping and also allows the use
// of the default char ('?') for any Unicode character that isn't
// representable. This is reasonable for writing to the console, but
} \
LPSTR ptrname = (LPSTR)__CQuickBytes##ptrname.Ptr()
-// Use for anything critical other than output to console, where weird
-// character mappings are unacceptable.
-#define MAKE_ANSIPTR_FROMWIDE(ptrname, widestr) MAKE_MULTIBYTE_FROMWIDE(ptrname, widestr, CP_ACP)
-
-// Use for output to the console.
-#define MAKE_ANSIPTR_FROMWIDE_BESTFIT(ptrname, widestr) MAKE_MULTIBYTE_FROMWIDE_BESTFIT(ptrname, widestr, CP_ACP)
-
-#define MAKE_WIDEPTR_FROMANSI(ptrname, ansistr) \
- CQuickBytes __qb##ptrname; \
- int __l##ptrname; \
- __l##ptrname = WszMultiByteToWideChar(CP_ACP, 0, ansistr, -1, 0, 0); \
- if (__l##ptrname > MAKE_MAX_LENGTH) \
- MAKE_TOOLONGACTION; \
- LPWSTR ptrname = (LPWSTR) __qb##ptrname.AllocThrows((__l##ptrname+1)*sizeof(WCHAR)); \
- if (WszMultiByteToWideChar(CP_ACP, MB_ERR_INVALID_CHARS, ansistr, -1, ptrname, __l##ptrname) == 0) { \
- MAKE_TRANSLATIONFAILED; \
- }
-
-#define MAKE_WIDEPTR_FROMANSI_NOTHROW(ptrname, ansistr) \
- CQuickBytes __qb##ptrname; \
- LPWSTR ptrname = 0; \
- int __l##ptrname; \
- __l##ptrname = WszMultiByteToWideChar(CP_ACP, 0, ansistr, -1, 0, 0); \
- if (__l##ptrname <= MAKE_MAX_LENGTH) { \
- ptrname = (LPWSTR) __qb##ptrname.AllocNoThrow((__l##ptrname+1)*sizeof(WCHAR)); \
- if (ptrname) { \
- if (WszMultiByteToWideChar(CP_ACP, MB_ERR_INVALID_CHARS, ansistr, -1, ptrname, __l##ptrname) != 0) { \
- ptrname[__l##ptrname] = 0; \
- } else { \
- ptrname = 0; \
- } \
- } \
- }
-
#define MAKE_UTF8PTR_FROMWIDE(ptrname, widestr) CQuickBytes _##ptrname; _##ptrname.ConvertUnicode_Utf8(widestr); LPSTR ptrname = (LPSTR) _##ptrname.Ptr();
#define MAKE_UTF8PTR_FROMWIDE_NOTHROW(ptrname, widestr) \
} \
} \
-#define MAKE_WIDEPTR_FROMUTF8N(ptrname, utf8str, n8chrs) \
- CQuickBytes __qb##ptrname; \
- int __l##ptrname; \
- __l##ptrname = WszMultiByteToWideChar(CP_UTF8, 0, utf8str, n8chrs, 0, 0); \
- if (__l##ptrname > MAKE_MAX_LENGTH) \
- MAKE_TOOLONGACTION; \
- LPWSTR ptrname = (LPWSTR) __qb##ptrname .AllocThrows((__l##ptrname+1)*sizeof(WCHAR)); \
- if (0==WszMultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, utf8str, n8chrs, ptrname, __l##ptrname)) { \
- MAKE_TRANSLATIONFAILED; \
- } \
- ptrname[__l##ptrname] = 0;
-
-
#define MAKE_WIDEPTR_FROMUTF8(ptrname, utf8str) CQuickBytes _##ptrname; _##ptrname.ConvertUtf8_Unicode(utf8str); LPCWSTR ptrname = (LPCWSTR) _##ptrname.Ptr();
-
#define MAKE_WIDEPTR_FROMUTF8N_NOTHROW(ptrname, utf8str, n8chrs) \
CQuickBytes __qb##ptrname; \
int __l##ptrname; \
#define MAKE_WIDEPTR_FROMUTF8_NOTHROW(ptrname, utf8str) MAKE_WIDEPTR_FROMUTF8N_NOTHROW(ptrname, utf8str, -1)
-// This method takes the number of characters
-#define MAKE_MULTIBYTE_FROMWIDEN(ptrname, widestr, _nCharacters, _pCnt, codepage) \
- CQuickBytes __qb##ptrname; \
- int __l##ptrname; \
- __l##ptrname = WszWideCharToMultiByte(codepage, WC_NO_BEST_FIT_CHARS, widestr, _nCharacters, NULL, 0, NULL, NULL); \
- if (__l##ptrname > MAKE_MAX_LENGTH) \
- MAKE_TOOLONGACTION; \
- ptrname = (LPUTF8) __qb##ptrname .AllocThrows(__l##ptrname+1); \
- BOOL __b##ptrname; \
- DWORD _pCnt = WszWideCharToMultiByte(codepage, WC_NO_BEST_FIT_CHARS, widestr, _nCharacters, ptrname, __l##ptrname, NULL, &__b##ptrname); \
- if (__b##ptrname || (_pCnt == 0 && _nCharacters > 0)) { \
- MAKE_TRANSLATIONFAILED; \
- } \
- ptrname[__l##ptrname] = 0;
-
-#define MAKE_MULTIBYTE_FROMWIDEN_BESTFIT(ptrname, widestr, _nCharacters, _pCnt, codepage) \
- CQuickBytes __qb##ptrname; \
- int __l##ptrname; \
- __l##ptrname = WszWideCharToMultiByte(codepage, 0, widestr, _nCharacters, NULL, 0, NULL, NULL); \
- if (__l##ptrname > MAKE_MAX_LENGTH) \
- MAKE_TOOLONGACTION; \
- ptrname = (LPUTF8) __qb##ptrname .AllocThrows(__l##ptrname+1); \
- DWORD _pCnt = WszWideCharToMultiByte(codepage, 0, widestr, _nCharacters, ptrname, __l##ptrname, NULL, NULL); \
- if (_pCnt == 0 && _nCharacters > 0) { \
- MAKE_TRANSLATIONFAILED; \
- } \
- ptrname[__l##ptrname] = 0;
-
-#define MAKE_ANSIPTR_FROMWIDEN(ptrname, widestr, _nCharacters, _pCnt) \
- MAKE_MULTIBYTE_FROMWIDEN(ptrname, widestr, _nCharacters, _pCnt, CP_ACP)
-
const SIZE_T MaxSigned32BitDecString = ARRAY_SIZE("-2147483648") - 1;
const SIZE_T MaxUnsigned32BitDecString = ARRAY_SIZE("4294967295") - 1;
const SIZE_T MaxIntegerDecHexString = ARRAY_SIZE("-9223372036854775808") - 1;
-const SIZE_T Max16BitHexString = ARRAY_SIZE("1234") - 1;
const SIZE_T Max32BitHexString = ARRAY_SIZE("12345678") - 1;
const SIZE_T Max64BitHexString = ARRAY_SIZE("1234567812345678") - 1;
return str;
}
-inline
-LPWSTR DuplicateString(
- LPCWSTR wszString,
- size_t cchString)
-{
- STATIC_CONTRACT_NOTHROW;
-
- LPWSTR wszDup = NULL;
- if (wszString != NULL)
- {
- wszDup = new (nothrow) WCHAR[cchString + 1];
- if (wszDup != NULL)
- {
- wcscpy_s(wszDup, cchString + 1, wszString);
- }
- }
- return wszDup;
-}
-
-inline
-LPWSTR DuplicateString(
- LPCWSTR wszString)
-{
- STATIC_CONTRACT_NOTHROW;
-
- if (wszString != NULL)
- {
- return DuplicateString(wszString, u16_strlen(wszString));
- }
- else
- {
- return NULL;
- }
-}
-
-void DECLSPEC_NORETURN ThrowOutOfMemory();
-
-inline
-LPWSTR DuplicateStringThrowing(
- LPCWSTR wszString,
- size_t cchString)
-{
- STATIC_CONTRACT_THROWS;
-
- if (wszString == NULL)
- return NULL;
-
- LPWSTR wszDup = DuplicateString(wszString, cchString);
- if (wszDup == NULL)
- ThrowOutOfMemory();
-
- return wszDup;
-}
-
-inline
-LPWSTR DuplicateStringThrowing(
- LPCWSTR wszString)
-{
- STATIC_CONTRACT_THROWS;
-
- if (wszString == NULL)
- return NULL;
-
- LPWSTR wszDup = DuplicateString(wszString);
- if (wszDup == NULL)
- ThrowOutOfMemory();
-
- return wszDup;
-}
-
-
//*****************************************************************************
// Placement new is used to new an object at an exact location.  The pointer
// is simply returned to the caller without actually using the heap. The
loader/module.cpp
locale/unicode.cpp
locale/unicodedata.cpp
- locale/utf8.cpp
+ ${CLR_SRC_NATIVE_DIR}/minipal/utf8.c
map/common.cpp
map/map.cpp
map/virtual.cpp
+++ /dev/null
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-
-/*++
-
-
-
-Module Name:
-
- include/pal/utf8.h
-
-Abstract:
- Header file for UTF-8 conversion functions.
-
-Revision History:
-
-
-
---*/
-
-#ifndef _PAL_UTF8_H_
-#define _PAL_UTF8_H_
-
-#include <pal/palinternal.h> /* for WCHAR */
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif // __cplusplus
-
-/*++
-Function :
- UTF8ToUnicode
-
- Convert a string from UTF-8 to UTF-16 (UCS-2)
---*/
-int UTF8ToUnicode(LPCSTR lpSrcStr, int cchSrc, LPWSTR lpDestStr, int cchDest, DWORD dwFlags);
-
-
-/*++
-Function :
- UnicodeToUTF8
-
- Convert a string from UTF-16 (UCS-2) to UTF-8
---*/
-int UnicodeToUTF8(LPCWSTR lpSrcStr, int cchSrc, LPSTR lpDestStr, int cchDest);
-
-#ifdef __cplusplus
-}
-#endif // __cplusplus
-
-#endif /* _PAL_UTF8_H_ */
#include "pal/palinternal.h"
#include "pal/dbgmsg.h"
#include "pal/file.h"
-#include "pal/utf8.h"
+#include <minipal/utf8.h>
#include "pal/cruntime.h"
#include "pal/stackstring.hpp"
#include "pal/unicodedata.h"
goto EXIT;
}
- // Use UTF8ToUnicode on all systems, since it replaces
- // invalid characters and Core Foundation doesn't do that.
if (CodePage == CP_UTF8 || CodePage == CP_ACP)
{
- if (cbMultiByte <= -1)
+ if (cbMultiByte < 0)
+ cbMultiByte = strlen(lpMultiByteStr) + 1;
+
+ if (!lpWideCharStr || cchWideChar == 0)
+ retval = minipal_get_length_utf8_to_utf16(lpMultiByteStr, cbMultiByte, dwFlags);
+
+ if (lpWideCharStr)
{
- cbMultiByte = strlen(lpMultiByteStr) + 1;
+ if (cchWideChar == 0) cchWideChar = retval;
+ retval = minipal_convert_utf8_to_utf16(lpMultiByteStr, cbMultiByte, (CHAR16_T*)lpWideCharStr, cchWideChar, dwFlags);
}
- retval = UTF8ToUnicode(lpMultiByteStr, cbMultiByte, lpWideCharStr, cchWideChar, dwFlags);
goto EXIT;
}
defaultChar = *lpDefaultChar;
}
- // Use UnicodeToUTF8 on all systems because we use
- // UTF8ToUnicode in MultiByteToWideChar() on all systems.
if (CodePage == CP_UTF8 || CodePage == CP_ACP)
{
- if (cchWideChar == -1)
- {
+ if (cchWideChar < 0)
cchWideChar = PAL_wcslen(lpWideCharStr) + 1;
+
+ if (!lpMultiByteStr || cbMultiByte == 0)
+ retval = minipal_get_length_utf16_to_utf8((CHAR16_T*)lpWideCharStr, cchWideChar, dwFlags);
+
+ if (lpMultiByteStr)
+ {
+ if (cbMultiByte == 0) cbMultiByte = retval;
+ retval = minipal_convert_utf16_to_utf8((CHAR16_T*)lpWideCharStr, cchWideChar, lpMultiByteStr, cbMultiByte, dwFlags);
}
- retval = UnicodeToUTF8(lpWideCharStr, cchWideChar, lpMultiByteStr, cbMultiByte);
+
goto EXIT;
}
+++ /dev/null
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-
-/*++
-
-Module Name:
-
- unicode/utf8.c
-
-Abstract:
- Functions to encode and decode UTF-8 strings. This is a port of the C# version from Utf8Encoding.cs.
-
-Revision History:
-
---*/
-
-#include "pal/utf8.h"
-#include "pal/malloc.hpp"
-
-using namespace CorUnix;
-
-#define FASTLOOP
-
-struct CharUnicodeInfo
-{
- static const WCHAR HIGH_SURROGATE_START = 0xd800;
- static const WCHAR HIGH_SURROGATE_END = 0xdbff;
- static const WCHAR LOW_SURROGATE_START = 0xdc00;
- static const WCHAR LOW_SURROGATE_END = 0xdfff;
-};
-
-struct Char
-{
- // Test if the wide character is a high surrogate
- static bool IsHighSurrogate(const WCHAR c)
- {
- return (c & 0xFC00) == CharUnicodeInfo::HIGH_SURROGATE_START;
- }
-
- // Test if the wide character is a low surrogate
- static bool IsLowSurrogate(const WCHAR c)
- {
- return (c & 0xFC00) == CharUnicodeInfo::LOW_SURROGATE_START;
- }
-
- // Test if the wide character is a surrogate half
- static bool IsSurrogate(const WCHAR c)
- {
- return (c & 0xF800) == CharUnicodeInfo::HIGH_SURROGATE_START;
- }
-
- // Test if the wide character is a high surrogate
- static bool IsHighSurrogate(const WCHAR* s, int index)
- {
- return IsHighSurrogate(s[index]);
- }
-
- // Test if the wide character is a low surrogate
- static bool IsLowSurrogate(const WCHAR* s, int index)
- {
- return IsLowSurrogate(s[index]);
- }
-
- // Test if the wide character is a surrogate half
- static bool IsSurrogate(const WCHAR* s, int index)
- {
- return IsSurrogate(s[index]);
- }
-};
-
-class ArgumentException
-{
-
-public:
- ArgumentException(LPCSTR message)
- {
- }
-
- ArgumentException(LPCSTR message, LPCSTR argName)
- {
- }
-};
-
-class ArgumentNullException : public ArgumentException
-{
-public:
- ArgumentNullException(LPCSTR argName)
- : ArgumentException("Argument is NULL", argName)
- {
-
- }
-};
-
-class ArgumentOutOfRangeException : public ArgumentException
-{
-public:
- ArgumentOutOfRangeException(LPCSTR argName, LPCSTR message)
- : ArgumentException(message, argName)
- {
-
- }
-};
-
-class InsufficientBufferException : public ArgumentException
-{
-public:
- InsufficientBufferException(LPCSTR message, LPCSTR argName)
- : ArgumentException(message, argName)
- {
-
- }
-};
-
-class Contract
-{
-public:
- static void Assert(bool cond, LPCSTR str)
- {
- if (!cond)
- {
- throw ArgumentException(str);
- }
- }
-
- static void EndContractBlock()
- {
- }
-};
-
-class DecoderFallbackException : public ArgumentException
-{
- BYTE *bytesUnknown;
- int index;
-
-public:
- DecoderFallbackException(
- LPCSTR message, BYTE bytesUnknown[], int index) : ArgumentException(message)
- {
- this->bytesUnknown = bytesUnknown;
- this->index = index;
- }
-
- BYTE *BytesUnknown()
- {
- return (bytesUnknown);
- }
-
- int GetIndex()
- {
- return index;
- }
-};
-
-class DecoderFallbackBuffer;
-
-class DecoderFallback
-{
-public:
-
- // Fallback
- //
- // Return the appropriate unicode string alternative to the character that need to fall back.
-
- virtual DecoderFallbackBuffer* CreateFallbackBuffer() = 0;
-
- // Maximum number of characters that this instance of this fallback could return
-
- virtual int GetMaxCharCount() = 0;
-};
-
-class DecoderReplacementFallback : public DecoderFallback
-{
- // Our variables
- WCHAR strDefault[2];
- int strDefaultLength;
-
-public:
- // Construction. Default replacement fallback uses no best fit and ? replacement string
- DecoderReplacementFallback() : DecoderReplacementFallback(W("?"))
- {
- }
-
- DecoderReplacementFallback(const WCHAR* replacement)
- {
- // Must not be null
- if (replacement == nullptr)
- throw ArgumentNullException("replacement");
- Contract::EndContractBlock();
-
- // Make sure it doesn't have bad surrogate pairs
- bool bFoundHigh = false;
- int replacementLength = PAL_wcslen((const WCHAR *)replacement);
- for (int i = 0; i < replacementLength; i++)
- {
- // Found a surrogate?
- if (Char::IsSurrogate(replacement, i))
- {
- // High or Low?
- if (Char::IsHighSurrogate(replacement, i))
- {
- // if already had a high one, stop
- if (bFoundHigh)
- break; // break & throw at the bFoundHIgh below
- bFoundHigh = true;
- }
- else
- {
- // Low, did we have a high?
- if (!bFoundHigh)
- {
- // Didn't have one, make if fail when we stop
- bFoundHigh = true;
- break;
- }
-
- // Clear flag
- bFoundHigh = false;
- }
- }
- // If last was high we're in trouble (not surrogate so not low surrogate, so break)
- else if (bFoundHigh)
- break;
- }
- if (bFoundHigh)
- throw ArgumentException("String 'replacement' contains invalid Unicode code points.", "replacement");
-
- wcscpy_s(strDefault, ARRAY_SIZE(strDefault), replacement);
- strDefaultLength = replacementLength;
- }
-
- WCHAR* GetDefaultString()
- {
- return strDefault;
- }
-
- virtual DecoderFallbackBuffer* CreateFallbackBuffer();
-
- // Maximum number of characters that this instance of this fallback could return
- virtual int GetMaxCharCount()
- {
- return strDefaultLength;
- }
-};
-
-class DecoderFallbackBuffer
-{
- friend class UTF8Encoding;
- // Most implementations will probably need an implementation-specific constructor
-
- // internal methods that cannot be overridden that let us do our fallback thing
- // These wrap the internal methods so that we can check for people doing stuff that's incorrect
-
-public:
- virtual ~DecoderFallbackBuffer() = default;
-
- virtual bool Fallback(BYTE bytesUnknown[], int index, int size) = 0;
-
- // Get next character
- virtual WCHAR GetNextChar() = 0;
-
- //Back up a character
- virtual bool MovePrevious() = 0;
-
- // How many chars left in this fallback?
- virtual int GetRemaining() = 0;
-
- // Clear the buffer
- virtual void Reset()
- {
- while (GetNextChar() != (WCHAR)0);
- }
-
- // Internal items to help us figure out what we're doing as far as error messages, etc.
- // These help us with our performance and messages internally
-protected:
- BYTE* byteStart;
- WCHAR* charEnd;
-
- // Internal reset
- void InternalReset()
- {
- byteStart = nullptr;
- Reset();
- }
-
- // Set the above values
- // This can't be part of the constructor because EncoderFallbacks would have to know how to implement these.
- void InternalInitialize(BYTE* byteStart, WCHAR* charEnd)
- {
- this->byteStart = byteStart;
- this->charEnd = charEnd;
- }
-
- // Fallback the current byte by sticking it into the remaining char buffer.
- // This can only be called by our encodings (other have to use the public fallback methods), so
- // we can use our DecoderNLS here too (except we don't).
- // Returns true if we are successful, false if we can't fallback the character (no buffer space)
- // So caller needs to throw buffer space if return false.
- // Right now this has both bytes and bytes[], since we might have extra bytes, hence the
- // array, and we might need the index, hence the byte*
- // Don't touch ref chars unless we succeed
- virtual bool InternalFallback(BYTE bytes[], BYTE* pBytes, WCHAR** chars, int size)
- {
-
- Contract::Assert(byteStart != nullptr, "[DecoderFallback.InternalFallback]Used InternalFallback without calling InternalInitialize");
-
- // See if there's a fallback character and we have an output buffer then copy our string.
- if (this->Fallback(bytes, (int)(pBytes - byteStart - size), size))
- {
- // Copy the chars to our output
- WCHAR ch;
- WCHAR* charTemp = *chars;
- bool bHighSurrogate = false;
- while ((ch = GetNextChar()) != 0)
- {
- // Make sure no mixed up surrogates
- if (Char::IsSurrogate(ch))
- {
- if (Char::IsHighSurrogate(ch))
- {
- // High Surrogate
- if (bHighSurrogate)
- throw ArgumentException("String 'chars' contains invalid Unicode code points.");
- bHighSurrogate = true;
- }
- else
- {
- // Low surrogate
- if (!bHighSurrogate)
- throw ArgumentException("String 'chars' contains invalid Unicode code points.");
- bHighSurrogate = false;
- }
- }
-
- if (charTemp >= charEnd)
- {
- // No buffer space
- return false;
- }
-
- *(charTemp++) = ch;
- }
-
- // Need to make sure that bHighSurrogate isn't true
- if (bHighSurrogate)
- throw ArgumentException("String 'chars' contains invalid Unicode code points.");
-
- // Now we aren't going to be false, so its OK to update chars
- *chars = charTemp;
- }
-
- return true;
- }
-
- // This version just counts the fallback and doesn't actually copy anything.
- virtual int InternalFallback(BYTE bytes[], BYTE* pBytes, int size)
- // Right now this has both bytes[] and BYTE* bytes, since we might have extra bytes, hence the
- // array, and we might need the index, hence the byte*
- {
-
- Contract::Assert(byteStart != nullptr, "[DecoderFallback.InternalFallback]Used InternalFallback without calling InternalInitialize");
-
- // See if there's a fallback character and we have an output buffer then copy our string.
- if (this->Fallback(bytes, (int)(pBytes - byteStart - size), size))
- {
- int count = 0;
-
- WCHAR ch;
- bool bHighSurrogate = false;
- while ((ch = GetNextChar()) != 0)
- {
- // Make sure no mixed up surrogates
- if (Char::IsSurrogate(ch))
- {
- if (Char::IsHighSurrogate(ch))
- {
- // High Surrogate
- if (bHighSurrogate)
- throw ArgumentException("String 'chars' contains invalid Unicode code points.");
- bHighSurrogate = true;
- }
- else
- {
- // Low surrogate
- if (!bHighSurrogate)
- throw ArgumentException("String 'chars' contains invalid Unicode code points.");
- bHighSurrogate = false;
- }
- }
-
- count++;
- }
-
- // Need to make sure that bHighSurrogate isn't true
- if (bHighSurrogate)
- throw ArgumentException("String 'chars' contains invalid Unicode code points.");
-
- return count;
- }
-
- // If no fallback return 0
- return 0;
- }
-
- // private helper methods
- void ThrowLastBytesRecursive(BYTE bytesUnknown[])
- {
- throw ArgumentException("Recursive fallback not allowed");
- }
-};
-
-class DecoderReplacementFallbackBuffer : public DecoderFallbackBuffer
-{
- // Store our default string
- WCHAR strDefault[2];
- int strDefaultLength;
- int fallbackCount = -1;
- int fallbackIndex = -1;
-
-public:
- // Construction
- DecoderReplacementFallbackBuffer(DecoderReplacementFallback* fallback)
- {
- wcscpy_s(strDefault, ARRAY_SIZE(strDefault), fallback->GetDefaultString());
- strDefaultLength = PAL_wcslen((const WCHAR *)fallback->GetDefaultString());
- }
-
- // Fallback Methods
- virtual bool Fallback(BYTE bytesUnknown[], int index, int size)
- {
- // We expect no previous fallback in our buffer
- // We can't call recursively but others might (note, we don't test on last char!!!)
- if (fallbackCount >= 1)
- {
- ThrowLastBytesRecursive(bytesUnknown);
- }
-
- // Go ahead and get our fallback
- if (strDefaultLength == 0)
- return false;
-
- fallbackCount = strDefaultLength;
- fallbackIndex = -1;
-
- return true;
- }
-
- virtual WCHAR GetNextChar()
- {
- // We want it to get < 0 because == 0 means that the current/last character is a fallback
- // and we need to detect recursion. We could have a flag but we already have this counter.
- fallbackCount--;
- fallbackIndex++;
-
- // Do we have anything left? 0 is now last fallback char, negative is nothing left
- if (fallbackCount < 0)
- return '\0';
-
- // Need to get it out of the buffer.
- // Make sure it didn't wrap from the fast count-- path
- if (fallbackCount == INT_MAX)
- {
- fallbackCount = -1;
- return '\0';
- }
-
- // Now make sure its in the expected range
- Contract::Assert(fallbackIndex < strDefaultLength && fallbackIndex >= 0,
- "Index exceeds buffer range");
-
- return strDefault[fallbackIndex];
- }
-
- virtual bool MovePrevious()
- {
- // Back up one, only if we just processed the last character (or earlier)
- if (fallbackCount >= -1 && fallbackIndex >= 0)
- {
- fallbackIndex--;
- fallbackCount++;
- return true;
- }
-
- // Return false 'cause we couldn't do it.
- return false;
- }
-
- // How many characters left to output?
- virtual int GetRemaining()
- {
- // Our count is 0 for 1 character left.
- return (fallbackCount < 0) ? 0 : fallbackCount;
- }
-
- // Clear the buffer
- virtual void Reset()
- {
- fallbackCount = -1;
- fallbackIndex = -1;
- byteStart = nullptr;
- }
-
- // This version just counts the fallback and doesn't actually copy anything.
- virtual int InternalFallback(BYTE bytes[], BYTE* pBytes, int size)
- // Right now this has both bytes and bytes[], since we might have extra bytes, hence the
- // array, and we might need the index, hence the byte*
- {
- // return our replacement string Length
- return strDefaultLength;
- }
-};
-
-class DecoderExceptionFallbackBuffer : public DecoderFallbackBuffer
-{
-public:
- DecoderExceptionFallbackBuffer()
- {
- }
-
- virtual bool Fallback(BYTE bytesUnknown[], int index, int size)
- {
- throw DecoderFallbackException(
- "Unable to translate UTF-8 character to Unicode", bytesUnknown, index);
- }
-
- virtual WCHAR GetNextChar()
- {
- return 0;
- }
-
- virtual bool MovePrevious()
- {
- // Exception fallback doesn't have anywhere to back up to.
- return false;
- }
-
- // Exceptions are always empty
- virtual int GetRemaining()
- {
- return 0;
- }
-
-};
-
-class DecoderExceptionFallback : public DecoderFallback
-{
- // Construction
-public:
- DecoderExceptionFallback()
- {
- }
-
- virtual DecoderFallbackBuffer* CreateFallbackBuffer()
- {
- return InternalNew<DecoderExceptionFallbackBuffer>();
- }
-
- // Maximum number of characters that this instance of this fallback could return
- virtual int GetMaxCharCount()
- {
- return 0;
- }
-};
-
-DecoderFallbackBuffer* DecoderReplacementFallback::CreateFallbackBuffer()
-{
- return InternalNew<DecoderReplacementFallbackBuffer>(this);
-}
-
-class EncoderFallbackException : public ArgumentException
-{
- WCHAR charUnknown;
- WCHAR charUnknownHigh;
- WCHAR charUnknownLow;
- int index;
-
-public:
- EncoderFallbackException(
- LPCSTR message, WCHAR charUnknown, int index) : ArgumentException(message)
- {
- this->charUnknown = charUnknown;
- this->index = index;
- }
-
- EncoderFallbackException(
- LPCSTR message, WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) : ArgumentException(message)
- {
- if (!Char::IsHighSurrogate(charUnknownHigh))
- {
- throw ArgumentOutOfRangeException("charUnknownHigh",
- "Argument out of range 0xD800..0xDBFF");
- }
- if (!Char::IsLowSurrogate(charUnknownLow))
- {
- throw ArgumentOutOfRangeException("charUnknownLow",
- "Argument out of range 0xDC00..0xDFFF");
- }
- Contract::EndContractBlock();
-
- this->charUnknownHigh = charUnknownHigh;
- this->charUnknownLow = charUnknownLow;
- this->index = index;
- }
-
- WCHAR GetCharUnknown()
- {
- return (charUnknown);
- }
-
- WCHAR GetCharUnknownHigh()
- {
- return (charUnknownHigh);
- }
-
- WCHAR GetCharUnknownLow()
- {
- return (charUnknownLow);
- }
-
- int GetIndex()
- {
- return index;
- }
-
- // Return true if the unknown character is a surrogate pair.
- bool IsUnknownSurrogate()
- {
- return (charUnknownHigh != '\0');
- }
-};
-
-class EncoderFallbackBuffer;
-
-class EncoderFallback
-{
-public:
-
- // Fallback
- //
- // Return the appropriate unicode string alternative to the character that need to fall back.
-
- virtual EncoderFallbackBuffer* CreateFallbackBuffer() = 0;
-
- // Maximum number of characters that this instance of this fallback could return
- virtual int GetMaxCharCount() = 0;
-};
-
-class EncoderReplacementFallback : public EncoderFallback
-{
- // Our variables
- WCHAR strDefault[2];
- int strDefaultLength;
-
-public:
- // Construction. Default replacement fallback uses no best fit and ? replacement string
- EncoderReplacementFallback() : EncoderReplacementFallback(W("?"))
- {
- }
-
- EncoderReplacementFallback(const WCHAR* replacement)
- {
- // Must not be null
- if (replacement == nullptr)
- throw ArgumentNullException("replacement");
- Contract::EndContractBlock();
-
- // Make sure it doesn't have bad surrogate pairs
- bool bFoundHigh = false;
- int replacementLength = PAL_wcslen((const WCHAR *)replacement);
- for (int i = 0; i < replacementLength; i++)
- {
- // Found a surrogate?
- if (Char::IsSurrogate(replacement, i))
- {
- // High or Low?
- if (Char::IsHighSurrogate(replacement, i))
- {
- // if already had a high one, stop
- if (bFoundHigh)
- break; // break & throw at the bFoundHIgh below
- bFoundHigh = true;
- }
- else
- {
- // Low, did we have a high?
- if (!bFoundHigh)
- {
- // Didn't have one, make if fail when we stop
- bFoundHigh = true;
- break;
- }
-
- // Clear flag
- bFoundHigh = false;
- }
- }
- // If last was high we're in trouble (not surrogate so not low surrogate, so break)
- else if (bFoundHigh)
- break;
- }
- if (bFoundHigh)
- throw ArgumentException("String 'replacement' contains invalid Unicode code points.", "replacement");
-
- wcscpy_s(strDefault, ARRAY_SIZE(strDefault), replacement);
- strDefaultLength = replacementLength;
- }
-
- WCHAR* GetDefaultString()
- {
- return strDefault;
- }
-
- virtual EncoderFallbackBuffer* CreateFallbackBuffer();
-
- // Maximum number of characters that this instance of this fallback could return
- virtual int GetMaxCharCount()
- {
- return strDefaultLength;
- }
-};
-
-class EncoderFallbackBuffer
-{
- friend class UTF8Encoding;
- // Most implementations will probably need an implementation-specific constructor
-
- // Public methods that cannot be overridden that let us do our fallback thing
- // These wrap the internal methods so that we can check for people doing stuff that is incorrect
-
-public:
- virtual ~EncoderFallbackBuffer() = default;
-
- virtual bool Fallback(WCHAR charUnknown, int index) = 0;
-
- virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) = 0;
-
- // Get next character
- virtual WCHAR GetNextChar() = 0;
-
- // Back up a character
- virtual bool MovePrevious() = 0;
-
- // How many chars left in this fallback?
- virtual int GetRemaining() = 0;
-
- // Not sure if this should be public or not.
- // Clear the buffer
- virtual void Reset()
- {
- while (GetNextChar() != (WCHAR)0);
- }
-
- // Internal items to help us figure out what we're doing as far as error messages, etc.
- // These help us with our performance and messages internally
-protected:
- WCHAR* charStart;
- WCHAR* charEnd;
- bool setEncoder;
- bool bUsedEncoder;
- bool bFallingBack = false;
- int iRecursionCount = 0;
- static const int iMaxRecursion = 250;
-
- // Internal Reset
- // For example, what if someone fails a conversion and wants to reset one of our fallback buffers?
- void InternalReset()
- {
- charStart = nullptr;
- bFallingBack = false;
- iRecursionCount = 0;
- Reset();
- }
-
- // Set the above values
- // This can't be part of the constructor because EncoderFallbacks would have to know how to implement these.
- void InternalInitialize(WCHAR* charStart, WCHAR* charEnd, bool setEncoder)
- {
- this->charStart = charStart;
- this->charEnd = charEnd;
- this->setEncoder = setEncoder;
- this->bUsedEncoder = false;
- this->bFallingBack = false;
- this->iRecursionCount = 0;
- }
-
- WCHAR InternalGetNextChar()
- {
- WCHAR ch = GetNextChar();
- bFallingBack = (ch != 0);
- if (ch == 0) iRecursionCount = 0;
- return ch;
- }
-
- // Fallback the current character using the remaining buffer and encoder if necessary
- // This can only be called by our encodings (other have to use the public fallback methods), so
- // we can use our EncoderNLS here too.
- // setEncoder is true if we're calling from a GetBytes method, false if we're calling from a GetByteCount
- //
- // Note that this could also change the contents of this->encoder, which is the same
- // object that the caller is using, so the caller could mess up the encoder for us
- // if they aren't careful.
- virtual bool InternalFallback(WCHAR ch, WCHAR** chars)
- {
- // Shouldn't have null charStart
- Contract::Assert(charStart != nullptr,
- "[EncoderFallback.InternalFallbackBuffer]Fallback buffer is not initialized");
-
- // Get our index, remember chars was preincremented to point at next char, so have to -1
- int index = (int)(*chars - charStart) - 1;
-
- // See if it was a high surrogate
- if (Char::IsHighSurrogate(ch))
- {
- // See if there's a low surrogate to go with it
- if (*chars >= this->charEnd)
- {
- // Nothing left in input buffer
- // No input, return 0
- }
- else
- {
- // Might have a low surrogate
- WCHAR cNext = **chars;
- if (Char::IsLowSurrogate(cNext))
- {
- // If already falling back then fail
- if (bFallingBack && iRecursionCount++ > iMaxRecursion)
- ThrowLastCharRecursive(ch, cNext);
-
- // Next is a surrogate, add it as surrogate pair, and increment chars
- (*chars)++;
- bFallingBack = Fallback(ch, cNext, index);
- return bFallingBack;
- }
-
- // Next isn't a low surrogate, just fallback the high surrogate
- }
- }
-
- // If already falling back then fail
- if (bFallingBack && iRecursionCount++ > iMaxRecursion)
- ThrowLastCharRecursive((int)ch);
-
- // Fall back our char
- bFallingBack = Fallback(ch, index);
-
- return bFallingBack;
- }
-
- // private helper methods
- void ThrowLastCharRecursive(WCHAR highSurrogate, WCHAR lowSurrogate)
- {
- // Throw it, using our complete character
- throw ArgumentException("Recursive fallback not allowed", "chars");
- }
-
- void ThrowLastCharRecursive(int utf32Char)
- {
- throw ArgumentException("Recursive fallback not allowed", "chars");
- }
-
-};
-
-// Fallback buffer that answers every unencodable character (or surrogate pair) with a
-// fixed replacement string taken from the owning EncoderReplacementFallback.
-class EncoderReplacementFallbackBuffer : public EncoderFallbackBuffer
-{
-    // Two back-to-back copies of the replacement string: a single char consumes the
-    // first half, a surrogate pair consumes both halves.
-    WCHAR strDefault[4];
-    // Length of the doubled string held in strDefault.
-    int strDefaultLength;
-    // Replacement chars still to be handed out; negative when nothing is pending.
-    int fallbackCount = -1;
-    // Position in strDefault of the char most recently returned by GetNextChar.
-    int fallbackIndex = -1;
-
-public:
-    // Copies the fallback's default string twice into the fixed-size buffer.
-    EncoderReplacementFallbackBuffer(EncoderReplacementFallback* fallback)
-    {
-        wcscpy_s(strDefault, ARRAY_SIZE(strDefault), fallback->GetDefaultString());
-        wcscat_s(strDefault, ARRAY_SIZE(strDefault), fallback->GetDefaultString());
-        strDefaultLength = 2 * PAL_wcslen((const WCHAR *)fallback->GetDefaultString());
-    }
-
-    // Starts a fallback for one unencodable char.
-    // Returns true when there is at least one replacement char to emit.
-    virtual bool Fallback(WCHAR charUnknown, int index)
-    {
-        // Replacement chars still pending means the replacement itself failed to encode.
-        if (fallbackCount >= 1)
-        {
-            // The pending replacement char may complete a surrogate pair with charUnknown.
-            if (Char::IsHighSurrogate(charUnknown) && fallbackCount >= 0)
-            {
-                if (Char::IsLowSurrogate(strDefault[fallbackIndex + 1]))
-                {
-                    ThrowLastCharRecursive(charUnknown, strDefault[fallbackIndex + 1]);
-                }
-            }
-
-            ThrowLastCharRecursive((int)charUnknown);
-        }
-
-        // Only one copy of the replacement string is needed for a single char.
-        fallbackIndex = -1;
-        fallbackCount = strDefaultLength / 2;
-
-        return fallbackCount != 0;
-    }
-
-    // Starts a fallback for an unencodable surrogate pair.
-    // Throws ArgumentOutOfRangeException when either half is not the expected surrogate kind.
-    virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index)
-    {
-        if (!Char::IsHighSurrogate(charUnknownHigh))
-        {
-            throw ArgumentOutOfRangeException("charUnknownHigh",
-                "Argument out of range 0xD800..0xDBFF");
-        }
-
-        if (!Char::IsLowSurrogate(charUnknownLow))
-        {
-            throw ArgumentOutOfRangeException("charUnknownLow",
-                "Argument out of range 0xDC00..0xDFFF");
-        }
-        Contract::EndContractBlock();
-
-        // Replacement chars still pending means we are being called recursively.
-        if (fallbackCount >= 1)
-        {
-            ThrowLastCharRecursive(charUnknownHigh, charUnknownLow);
-        }
-
-        // A pair gets both copies of the replacement string.
-        fallbackIndex = -1;
-        fallbackCount = strDefaultLength;
-
-        return fallbackCount != 0;
-    }
-
-    // Returns the next replacement char, or '\0' once the buffer is exhausted.
-    virtual WCHAR GetNextChar()
-    {
-        // The count is allowed to drop below zero: zero means "the char just returned was
-        // the last fallback char", which recursion detection relies on.
-        --fallbackCount;
-        ++fallbackIndex;
-
-        if (fallbackCount < 0)
-        {
-            return '\0';
-        }
-
-        // Guard against the counter having wrapped around.
-        if (fallbackCount == INT_MAX)
-        {
-            fallbackCount = -1;
-            return '\0';
-        }
-
-        Contract::Assert(fallbackIndex < strDefaultLength && fallbackIndex >= 0,
-            "Index exceeds buffer range");
-
-        return strDefault[fallbackIndex];
-    }
-
-    // Steps back one replacement char; returns false when that is not possible.
-    virtual bool MovePrevious()
-    {
-        if (fallbackCount < -1 || fallbackIndex < 0)
-        {
-            return false;
-        }
-
-        fallbackIndex--;
-        fallbackCount++;
-        return true;
-    }
-
-    // Number of replacement chars that have not been handed out yet.
-    virtual int GetRemaining()
-    {
-        if (fallbackCount < 0)
-        {
-            return 0;
-        }
-        return fallbackCount;
-    }
-
-    // Discards any pending replacement chars and clears the falling-back state.
-    virtual void Reset()
-    {
-        bFallingBack = false;
-        charStart = nullptr;
-        fallbackIndex = 0;
-        fallbackCount = -1;
-    }
-};
-
-// Fallback buffer that never produces replacement chars: every fallback request throws.
-class EncoderExceptionFallbackBuffer : public EncoderFallbackBuffer
-{
-public:
-    EncoderExceptionFallbackBuffer()
-    {
-    }
-
-    // Always throws EncoderFallbackException for the single unencodable char.
-    virtual bool Fallback(WCHAR charUnknown, int index)
-    {
-        throw EncoderFallbackException("Unable to translate Unicode character to UTF-8", charUnknown, index);
-    }
-
-    // Validates the surrogate pair, then always throws EncoderFallbackException for it.
-    virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index)
-    {
-        if (!Char::IsHighSurrogate(charUnknownHigh))
-            throw ArgumentOutOfRangeException("charUnknownHigh",
-                "Argument out of range 0xD800..0xDBFF");
-
-        if (!Char::IsLowSurrogate(charUnknownLow))
-            throw ArgumentOutOfRangeException("charUnknownLow",
-                "Argument out of range 0xDC00..0xDFFF");
-
-        Contract::EndContractBlock();
-
-        throw EncoderFallbackException(
-            "Unable to translate Unicode character to UTF-8", charUnknownHigh, charUnknownLow, index);
-    }
-
-    // There is never a replacement char to return.
-    virtual WCHAR GetNextChar()
-    {
-        return 0;
-    }
-
-    // Nothing was ever emitted, so there is nothing to step back over.
-    virtual bool MovePrevious()
-    {
-        return false;
-    }
-
-    // The buffer is always empty.
-    virtual int GetRemaining()
-    {
-        return 0;
-    }
-};
-
-// Encoder fallback whose buffers throw instead of substituting replacement chars.
-class EncoderExceptionFallback : public EncoderFallback
-{
-public:
-    EncoderExceptionFallback()
-    {
-    }
-
-    // Creates a new exception-throwing fallback buffer via InternalNew.
-    virtual EncoderFallbackBuffer* CreateFallbackBuffer()
-    {
-        EncoderFallbackBuffer* buffer = InternalNew<EncoderExceptionFallbackBuffer>();
-        return buffer;
-    }
-
-    // This fallback never yields replacement chars, so the maximum is zero.
-    virtual int GetMaxCharCount()
-    {
-        return 0;
-    }
-};
-
-// Creates a replacement fallback buffer bound to this fallback's default string.
-EncoderFallbackBuffer* EncoderReplacementFallback::CreateFallbackBuffer()
-{
-    EncoderFallbackBuffer* buffer = InternalNew<EncoderReplacementFallbackBuffer>(this);
-    return buffer;
-}
-
-class UTF8Encoding
-{
- EncoderFallback* encoderFallback; // active encoder fallback; the constructor points it at one of the two members below
- // Instances of the two possible encoder fallbacks. The constructor parameter
- // determines which one encoderFallback refers to.
- EncoderReplacementFallback encoderReplacementFallback;
- EncoderExceptionFallback encoderExceptionFallback;
-
- DecoderFallback* decoderFallback; // active decoder fallback; the constructor points it at one of the two members below
- // Instances of the two possible decoder fallbacks. The constructor parameter
- // determines which one decoderFallback refers to.
- DecoderReplacementFallback decoderReplacementFallback;
- DecoderExceptionFallback decoderExceptionFallback;
-
- bool InRange(int c, int begin, int end)
- {
-     // True when c lies in the closed interval [begin, end].
-     return !(c < begin || end < c);
- }
-
- size_t PtrDiff(WCHAR* ptr1, WCHAR* ptr2)
- {
-     // Distance from ptr2 to ptr1 measured in WCHAR elements.
-     return static_cast<size_t>(ptr1 - ptr2);
- }
-
- size_t PtrDiff(BYTE* ptr1, BYTE* ptr2)
- {
-     // Distance from ptr2 to ptr1 measured in bytes.
-     return static_cast<size_t>(ptr1 - ptr2);
- }
-
- void ThrowBytesOverflow()
- {
-     // Unconditionally reports that the destination byte buffer cannot hold the encoded output.
-     throw InsufficientBufferException("The output byte buffer is too small to contain the encoded data", "bytes");
- }
-
- void ThrowBytesOverflow(bool nothingEncoded)
- {
-     // Only an overflow that left the output completely empty is an error; when something
-     // was already produced this returns normally.
-     if (!nothingEncoded)
-     {
-         return;
-     }
-     ThrowBytesOverflow();
- }
-
- void ThrowCharsOverflow()
- {
-     // Unconditionally reports that the destination char buffer cannot hold the decoded output.
-     throw InsufficientBufferException("The output char buffer is too small to contain the encoded data", "chars");
- }
-
- void ThrowCharsOverflow(bool nothingEncoded)
- {
-     // Only an overflow that left the output completely empty is an error; when something
-     // was already produced this returns normally.
-     if (!nothingEncoded)
-     {
-         return;
-     }
-     ThrowCharsOverflow();
- }
-
- // During GetChars we had an invalid byte sequence
- // pSrc is backed up to the start of the bad sequence if we didn't have room to
- // fall it back. Otherwise pSrc remains where it is.
- bool FallbackInvalidByteSequence(BYTE** pSrc, int ch, DecoderFallbackBuffer* fallback, WCHAR** pTarget)
- {
-     // Runs the decoder fallback for the invalid sequence described by ch and lets it
-     // write its output through *pTarget.
-     //   pSrc     - in/out: current read position; moved back to the start of the bad
-     //              sequence when the fallback reports failure, otherwise left untouched
-     //   ch       - decoder state (partial code point plus flag bits) of the bad sequence
-     //   fallback - fallback buffer that performs the substitution
-     //   pTarget  - in/out: current write position, handed to the fallback
-     // Returns true when the fallback succeeded, false when it did not.
-
-     // Rebuild the raw bytes of the invalid sequence from the decoder state.
-     BYTE bytesUnknown[3];
-     int size = GetBytesUnknown(*pSrc, ch, bytesUnknown);
-
-     // Do the actual fallback
-     if (!fallback->InternalFallback(bytesUnknown, *pSrc, pTarget, size))
-     {
-         // GetBytesUnknown takes the pointer by value, so its own back-up never reaches us.
-         // Every branch there steps back exactly as many bytes as it reconstructs, so
-         // rewinding by size lands on the first byte of the bad sequence.
-         *pSrc -= size;
-         return false;
-     }
-
-     // It worked
-     return true;
- }
-
- int FallbackInvalidByteSequence(BYTE* pSrc, int ch, DecoderFallbackBuffer *fallback)
- {
-     // Counting flavour: rebuilds the bytes of the invalid sequence from the decoder state
-     // and returns whatever count the fallback reports for them. Nothing is written to an
-     // output buffer by this overload.
-     BYTE bytesUnknown[3];
-     const int size = GetBytesUnknown(pSrc, ch, bytesUnknown);
-
-     // Callers only get here for "long" sequences and have already unreserved the
-     // chars they pre-reserved for the input bytes.
-     return fallback->InternalFallback(bytesUnknown, pSrc, size);
- }
-
- int GetBytesUnknown(BYTE* pSrc, int ch, BYTE* bytesUnknown)
- {
-     // Reconstructs into bytesUnknown the bytes of the sequence that was being decoded when
-     // it turned out to be invalid, using the decoder state in ch, and returns how many
-     // bytes were reconstructed (1 to 3).
-     // pSrc is received by value: the adjustments below are local to this function.
-
-     // A plain byte value (>= 0 is checked explicitly because of the flag bits kept in ch).
-     if (ch >= 0 && ch < 0x100)
-     {
-         pSrc -= 1;
-         bytesUnknown[0] = (BYTE)ch;
-         return 1;
-     }
-
-     // Neither the 3-byte nor the 4-byte flag is set: unfinished 2-byte sequence.
-     if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0)
-     {
-         pSrc -= 1;
-         bytesUnknown[0] = (BYTE)((ch & 0x1F) | 0xc0);
-         return 1;
-     }
-
-     if ((ch & SupplimentarySeq) != 0)
-     {
-         // 4-byte sequence: how far the FinalByte flag has been shifted tells how many
-         // bytes had been folded in.
-         if ((ch & (FinalByte >> 6)) != 0)
-         {
-             // Three bytes were read.
-             pSrc -= 3;
-             bytesUnknown[0] = (BYTE)(((ch >> 12) & 0x07) | 0xF0);
-             bytesUnknown[1] = (BYTE)(((ch >> 6) & 0x3F) | 0x80);
-             bytesUnknown[2] = (BYTE)(((ch)& 0x3F) | 0x80);
-             return 3;
-         }
-
-         if ((ch & (FinalByte >> 12)) != 0)
-         {
-             // Two bytes were read.
-             pSrc -= 2;
-             bytesUnknown[0] = (BYTE)(((ch >> 6) & 0x07) | 0xF0);
-             bytesUnknown[1] = (BYTE)(((ch)& 0x3F) | 0x80);
-             return 2;
-         }
-
-         // Only the lead byte was read.
-         pSrc -= 1;
-         bytesUnknown[0] = (BYTE)(((ch)& 0x07) | 0xF0);
-         return 1;
-     }
-
-     // 3-byte sequence.
-     if ((ch & (FinalByte >> 6)) != 0)
-     {
-         // Two bytes were read.
-         pSrc -= 2;
-         bytesUnknown[0] = (BYTE)(((ch >> 6) & 0x0F) | 0xE0);
-         bytesUnknown[1] = (BYTE)(((ch)& 0x3F) | 0x80);
-         return 2;
-     }
-
-     // Only the lead byte was read.
-     pSrc -= 1;
-     bytesUnknown[0] = (BYTE)(((ch)& 0x0F) | 0xE0);
-     return 1;
- }
-
-public:
-
- UTF8Encoding(bool isThrowException)
-     : encoderReplacementFallback(W("\xFFFD")), decoderReplacementFallback(W("\xFFFD"))
- {
-     // Both replacement fallbacks are always constructed with U+FFFD; the flag only
-     // decides which pair of fallbacks the encoding actually uses.
-     if (!isThrowException)
-     {
-         decoderFallback = &decoderReplacementFallback;
-         encoderFallback = &encoderReplacementFallback;
-     }
-     else
-     {
-         decoderFallback = &decoderExceptionFallback;
-         encoderFallback = &encoderExceptionFallback;
-     }
- }
-
- // These are bitmasks used to maintain the state in the decoder. They occupy the higher bits
- // while the actual character is being built in the lower bits. They are shifted together
- // with the actual bits of the character.
-
- // bits 30 & 31 are used for pending bits fixup
- const int FinalByte = 1 << 29; // seeded lower by the lead byte and shifted up 6 bits per trail byte; reaching bit 29 means the last byte has been folded in
- const int SupplimentarySeq = 1 << 28; // set while a 4-byte (supplementary character) sequence is in progress
- const int ThreeByteSeq = 1 << 27; // set while a 3-byte sequence is in progress
-
- int GetCharCount(BYTE* bytes, int count) // Returns how many WCHARs decoding count UTF-8 bytes starting at bytes would produce, including the counts the decoder fallback reports for invalid sequences.
- {
- Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetCharCount]bytes!=nullptr");
- Contract::Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0");
-
- // Initialize stuff
- BYTE *pSrc = bytes;
- BYTE *pEnd = pSrc + count;
-
- // Start by assuming we have as many as count, charCount always includes the adjustment
- // for the character being decoded
- int charCount = count;
- int ch = 0; // decoder state: partial code point in the low bits plus flag bits; 0 means no sequence is pending
- DecoderFallbackBuffer *fallback = nullptr; // created lazily on the first invalid sequence
-
- while (true)
- {
- // SLOWLOOP: does all range checks, handles all special cases, but it is slow
- if (pSrc >= pEnd) {
- break;
- }
-
- // read next byte. The JIT optimization seems to be getting confused when
- // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
- int cha = *pSrc; // peeked only; pSrc is advanced below when a sequence is pending
-
- if (ch == 0) {
- // no pending bits
- goto ReadChar;
- }
-
- pSrc++;
-
- // we are expecting to see trailing bytes like 10vvvvvv
- if ((cha & 0xC0) != 0x80) {
- // This can be a valid starting byte for another UTF8 byte sequence, so let's put
- // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
- pSrc--;
- charCount += (ch >> 30); // undo the count adjustment carried in bits 30-31 for the unfinished sequence
- goto InvalidByteSequence;
- }
-
- // fold in the new byte
- ch = (ch << 6) | (cha & 0x3F);
-
- if ((ch & FinalByte) == 0) {
- Contract::Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
- "[UTF8Encoding.GetChars]Invariant volation");
-
- if ((ch & SupplimentarySeq) != 0) {
- if ((ch & (FinalByte >> 6)) != 0) {
- // this is 3rd byte (of 4 byte supplimentary) - nothing to do
- continue;
- }
-
- // 2nd byte, check for non-shortest form of supplimentary char and the valid
- // supplimentary characters in range 0x010000 - 0x10FFFF at the same time
- if (!InRange(ch & 0x1F0, 0x10, 0x100)) {
- goto InvalidByteSequence;
- }
- }
- else {
- // Must be 2nd byte of a 3-byte sequence
- // check for non-shortest form of 3 byte seq
- if ((ch & (0x1F << 5)) == 0 || // non-shortest form
- (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
- {
- goto InvalidByteSequence;
- }
- }
- continue;
- }
-
- // ready to punch
-
- // adjust for surrogates in non-shortest form
- if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq) {
- charCount--;
- }
- goto EncodeChar;
-
- InvalidByteSequence:
- // this code fragment should be close to the gotos referencing it
- // Have to do fallback for invalid bytes
- if (fallback == nullptr)
- {
- fallback = decoderFallback->CreateFallbackBuffer();
- fallback->InternalInitialize(bytes, nullptr);
- }
- charCount += FallbackInvalidByteSequence(pSrc, ch, fallback); // counting overload: adds the fallback's char count, writes nothing
-
- ch = 0;
- continue;
-
- ReadChar:
- ch = *pSrc;
- pSrc++;
-
- ProcessChar:
- if (ch > 0x7F) {
- // If its > 0x7F, its start of a new multi-byte sequence
-
- // Long sequence, so unreserve our char.
- charCount--;
-
- // bit 6 has to be non-zero for start of multibyte chars.
- if ((ch & 0x40) == 0) {
- // Unexpected trail byte
- goto InvalidByteSequence;
- }
-
- // start a new long code
- if ((ch & 0x20) != 0) {
- if ((ch & 0x10) != 0) {
- // 4 byte encoding - supplimentary character (2 surrogates)
-
- ch &= 0x0F;
-
- // check that bit 4 is zero and the valid supplimentary character
- // range 0x000000 - 0x10FFFF at the same time
- if (ch > 0x04) {
- ch |= 0xf0; // restore the lead-byte value so the fallback sees the original byte
- goto InvalidByteSequence;
- }
-
- // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
- // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag.
- ch |= (FinalByte >> 3 * 6) | // Final byte is 3 more bytes from now
- (1 << 30) | // If it dies on next byte we'll need an extra char
- (3 << (30 - 2 * 6)) | // If it dies on last byte we'll need to subtract a char
- (SupplimentarySeq) | (SupplimentarySeq >> 6) |
- (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
-
- // Our character count will be 2 characters for these 4 bytes, so subtract another char
- charCount--;
- }
- else {
- // 3 byte encoding
- // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
- ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
- (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
-
- // We'll expect 1 character for these 3 bytes, so subtract another char.
- charCount--;
- }
- }
- else {
- // 2 byte encoding
-
- ch &= 0x1F;
-
- // check for non-shortest form
- if (ch <= 1) {
- ch |= 0xc0; // restore the lead-byte value so the fallback sees the original byte
- goto InvalidByteSequence;
- }
-
- // Add bit flags so we'll be flagged correctly
- ch |= (FinalByte >> 6);
- }
- continue;
- }
-
- EncodeChar:
-
-#ifdef FASTLOOP
- int availableBytes = PtrDiff(pEnd, pSrc);
-
- // don't fall into the fast decoding loop if we don't have enough bytes
- if (availableBytes <= 13) {
- // try to get over the remainder of the ascii characters fast though
- BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
- while (pSrc < pLocalEnd) {
- ch = *pSrc;
- pSrc++;
-
- if (ch > 0x7F)
- goto ProcessChar;
- }
- // we are done
- ch = 0;
- break;
- }
-
- // To compute the upper bound, assume that all characters are ASCII characters at this point,
- // the boundary will be decreased for every non-ASCII character we encounter
- // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
- BYTE *pStop = pSrc + availableBytes - 7;
-
- while (pSrc < pStop) {
- ch = *pSrc;
- pSrc++;
-
- if (ch > 0x7F) {
- goto LongCode;
- }
-
- // get pSrc 2-byte aligned
- if (((size_t)pSrc & 0x1) != 0) {
- ch = *pSrc;
- pSrc++;
- if (ch > 0x7F) {
- goto LongCode;
- }
- }
-
- // get pSrc 4-byte aligned
- if (((size_t)pSrc & 0x2) != 0) {
- ch = *(USHORT*)pSrc;
- if ((ch & 0x8080) != 0) {
- goto LongCodeWithMask16;
- }
- pSrc += 2;
- }
-
-
- // Run 8 + 8 characters at a time!
- while (pSrc < pStop) {
- ch = *(int*)pSrc;
- int chb = *(int*)(pSrc + 4);
- if (((ch | chb) & (int)0x80808080) != 0) {
- goto LongCodeWithMask32;
- }
- pSrc += 8;
-
- // This is a really small loop - unroll it
- if (pSrc >= pStop)
- break;
-
- ch = *(int*)pSrc;
- chb = *(int*)(pSrc + 4);
- if (((ch | chb) & (int)0x80808080) != 0) {
- goto LongCodeWithMask32;
- }
- pSrc += 8;
- }
- break;
-
-#if BIGENDIAN
- LongCodeWithMask32 :
- // be careful about the sign extension
- ch = (int)(((uint)ch) >> 16);
- LongCodeWithMask16:
- ch = (int)(((uint)ch) >> 8);
-#else // BIGENDIAN
- LongCodeWithMask32:
- LongCodeWithMask16:
- ch &= 0xFF;
-#endif // BIGENDIAN
- pSrc++;
- if (ch <= 0x7F) {
- continue;
- }
-
- LongCode:
- int chc = *pSrc;
- pSrc++;
-
- if (
- // bit 6 has to be zero
- (ch & 0x40) == 0 ||
- // we are expecting to see trailing bytes like 10vvvvvv
- (chc & 0xC0) != 0x80)
- {
- goto BadLongCode;
- }
-
- chc &= 0x3F;
-
- // start a new long code
- if ((ch & 0x20) != 0) {
-
- // fold the first two bytes together
- chc |= (ch & 0x0F) << 6;
-
- if ((ch & 0x10) != 0) {
- // 4 byte encoding - surrogate
- ch = *pSrc;
- if (
- // check that bit 4 is zero, the non-shortest form of surrogate
- // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
- !InRange(chc >> 4, 0x01, 0x10) ||
- // we are expecting to see trailing bytes like 10vvvvvv
- (ch & 0xC0) != 0x80)
- {
- goto BadLongCode;
- }
-
- chc = (chc << 6) | (ch & 0x3F);
-
- ch = *(pSrc + 1);
- // we are expecting to see trailing bytes like 10vvvvvv
- if ((ch & 0xC0) != 0x80) {
- goto BadLongCode;
- }
- pSrc += 2;
-
- // extra byte
- charCount--;
- }
- else {
- // 3 byte encoding
- ch = *pSrc;
- if (
- // check for non-shortest form of 3 byte seq
- (chc & (0x1F << 5)) == 0 ||
- // Can't have surrogates here.
- (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
- // we are expecting to see trailing bytes like 10vvvvvv
- (ch & 0xC0) != 0x80)
- {
- goto BadLongCode;
- }
- pSrc++;
-
- // extra byte
- charCount--;
- }
- }
- else {
- // 2 byte encoding
-
- // check for non-shortest form
- if ((ch & 0x1E) == 0) {
- goto BadLongCode;
- }
- }
-
- // extra byte
- charCount--;
- }
-#endif // FASTLOOP
-
- // no pending bits at this point
- ch = 0;
- continue;
-
- BadLongCode:
- pSrc -= 2; // step back to the lead byte and clear the state so the next iteration re-reads it through the checked slow path
- ch = 0;
- continue;
- }
-
- // May have a problem if we have to flush
- if (ch != 0)
- {
- // We were already adjusting for these, so need to unadjust
- charCount += (ch >> 30);
- // Have to do fallback for invalid bytes
- if (fallback == nullptr)
- {
- fallback = decoderFallback->CreateFallbackBuffer();
- fallback->InternalInitialize(bytes, nullptr);
- }
- charCount += FallbackInvalidByteSequence(pSrc, ch, fallback); // the input ended inside a multi-byte sequence; count its fallback output
- }
-
- // Shouldn't have anything in fallback buffer for GetCharCount
- // (don't have to check m_throwOnOverflow for count)
- Contract::Assert(fallback == nullptr || fallback->GetRemaining() == 0,
- "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end");
-
- InternalDelete(fallback);
-
- return charCount;
-
- }
-
- // Decodes byteCount UTF-8 bytes starting at bytes into the WCHAR buffer chars, which has
- // room for charCount elements, and returns the number of WCHARs written.
- //
- // Invalid sequences are handed to the decoder fallback selected in the constructor, which
- // writes its output directly into the target buffer. When the target buffer fills up,
- // ThrowCharsOverflow is invoked; it throws only if nothing at all has been written,
- // otherwise decoding stops and the number of chars produced so far is returned.
- //
- // ch holds the decoder state: the partial code point in the low bits plus the flag bits
- // FinalByte / SupplimentarySeq / ThreeByteSeq; 0 means no sequence is pending.
- int GetChars(BYTE* bytes, int byteCount, WCHAR* chars, int charCount)
- {
- Contract::Assert(chars != nullptr, "[UTF8Encoding.GetChars]chars!=nullptr");
- Contract::Assert(byteCount >= 0, "[UTF8Encoding.GetChars]byteCount >=0");
- Contract::Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0");
- Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetChars]bytes!=nullptr");
-
- BYTE *pSrc = bytes;
- WCHAR *pTarget = chars;
-
- BYTE *pEnd = pSrc + byteCount;
- WCHAR *pAllocatedBufferEnd = pTarget + charCount;
-
- int ch = 0;
-
- // Created lazily on the first invalid sequence
- DecoderFallbackBuffer *fallback = nullptr;
-
- while (true)
- {
- // SLOWLOOP: does all range checks, handles all special cases, but it is slow
-
- if (pSrc >= pEnd) {
- break;
- }
-
- // read next byte. The JIT optimization seems to be getting confused when
- // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
- int cha = *pSrc;
-
- if (ch == 0) {
- // no pending bits
- goto ReadChar;
- }
-
- pSrc++;
-
- // we are expecting to see trailing bytes like 10vvvvvv
- if ((cha & 0xC0) != 0x80) {
- // This can be a valid starting byte for another UTF8 byte sequence, so let's put
- // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
- pSrc--;
- goto InvalidByteSequence;
- }
-
- // fold in the new byte
- ch = (ch << 6) | (cha & 0x3F);
-
- if ((ch & FinalByte) == 0) {
- // Not at last byte yet
- Contract::Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
- "[UTF8Encoding.GetChars]Invariant volation");
-
- if ((ch & SupplimentarySeq) != 0) {
- // Its a 4-byte supplimentary sequence
- if ((ch & (FinalByte >> 6)) != 0) {
- // this is 3rd byte of 4 byte sequence - nothing to do
- continue;
- }
-
- // 2nd byte of 4 bytes
- // check for non-shortest form of surrogate and the valid surrogate
- // range 0x000000 - 0x10FFFF at the same time
- if (!InRange(ch & 0x1F0, 0x10, 0x100)) {
- goto InvalidByteSequence;
- }
- }
- else {
- // Must be 2nd byte of a 3-byte sequence
- // check for non-shortest form of 3 byte seq
- if ((ch & (0x1F << 5)) == 0 || // non-shortest form
- (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
- {
- goto InvalidByteSequence;
- }
- }
- continue;
- }
-
- // ready to punch
-
- // surrogate in shortest form?
- // Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte?
- if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq) {
- // let the range check for the second char throw the exception
- if (pTarget < pAllocatedBufferEnd) {
- *pTarget = (WCHAR)(((ch >> 10) & 0x7FF) +
- (SHORT)((CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10))));
- pTarget++;
-
- ch = (ch & 0x3FF) +
- (int)(CharUnicodeInfo::LOW_SURROGATE_START);
- }
- }
-
- goto EncodeChar;
-
- InvalidByteSequence:
- // this code fragment should be close to the gotos referencing it
- // Have to do fallback for invalid bytes
- if (fallback == nullptr)
- {
- fallback = decoderFallback->CreateFallbackBuffer();
- fallback->InternalInitialize(bytes, pAllocatedBufferEnd);
- }
-
- // That'll back us up the appropriate # of bytes if we didn't get anywhere
- if (!FallbackInvalidByteSequence(&pSrc, ch, fallback, &pTarget))
- {
- // Ran out of buffer space
- // Need to throw an exception?
- Contract::Assert(pSrc >= bytes || pTarget == chars,
- "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback");
- fallback->InternalReset();
- ThrowCharsOverflow(pTarget == chars);
- ch = 0;
- break;
- }
- Contract::Assert(pSrc >= bytes,
- "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array");
- ch = 0;
- continue;
-
- ReadChar:
- ch = *pSrc;
- pSrc++;
-
- ProcessChar:
- if (ch > 0x7F) {
- // If its > 0x7F, its start of a new multi-byte sequence
-
- // bit 6 has to be non-zero
- if ((ch & 0x40) == 0) {
- goto InvalidByteSequence;
- }
-
- // start a new long code
- if ((ch & 0x20) != 0) {
- if ((ch & 0x10) != 0) {
- // 4 byte encoding - supplimentary character (2 surrogates)
-
- ch &= 0x0F;
-
- // check that bit 4 is zero and the valid supplimentary character
- // range 0x000000 - 0x10FFFF at the same time
- if (ch > 0x04) {
- ch |= 0xf0;
- goto InvalidByteSequence;
- }
-
- ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) |
- (SupplimentarySeq) | (SupplimentarySeq >> 6) |
- (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
- }
- else {
- // 3 byte encoding
- ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
- (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
- }
- }
- else {
- // 2 byte encoding
-
- ch &= 0x1F;
-
- // check for non-shortest form
- if (ch <= 1) {
- ch |= 0xc0;
- goto InvalidByteSequence;
- }
-
- ch |= (FinalByte >> 6);
- }
- continue;
- }
-
- EncodeChar:
- // write the pending character
- if (pTarget >= pAllocatedBufferEnd)
- {
- // Fix chars so we make sure to throw if we didn't output anything
- ch &= 0x1fffff;
- if (ch > 0x7f)
- {
- if (ch > 0x7ff)
- {
- if (ch >= CharUnicodeInfo::LOW_SURROGATE_START &&
- ch <= CharUnicodeInfo::LOW_SURROGATE_END)
- {
- pSrc--; // It was 4 bytes
- pTarget--; // 1 was stored already, but we can't remember 1/2, so back up
- }
- else if (ch > 0xffff)
- {
- pSrc--; // It was 4 bytes, nothing was stored
- }
- pSrc--; // It was at least 3 bytes
- }
- pSrc--; // It was at least 2 bytes
- }
- pSrc--;
-
- // Throw that we don't have enough room (pSrc could be < chars if we had started to process
- // a 4 byte sequence already)
- Contract::Assert(pSrc >= bytes || pTarget == chars,
- "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]");
- ThrowCharsOverflow(pTarget == chars);
-
- // Don't store ch in decoder, we already backed up to its start
- ch = 0;
-
- // Didn't throw, just use this buffer size.
- break;
- }
- *pTarget = (WCHAR)ch;
- pTarget++;
-
-#ifdef FASTLOOP
- int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget);
- int availableBytes = PtrDiff(pEnd, pSrc);
-
- // don't fall into the fast decoding loop if we don't have enough bytes
- // Test for availableChars is done because pStop would be <= pTarget.
- if (availableBytes <= 13) {
- // we may need as many as 1 character per byte
- if (availableChars < availableBytes) {
- // not enough output room. no pending bits at this point
- ch = 0;
- continue;
- }
-
- // try to get over the remainder of the ascii characters fast though
- BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
- while (pSrc < pLocalEnd) {
- ch = *pSrc;
- pSrc++;
-
- if (ch > 0x7F)
- goto ProcessChar;
-
- *pTarget = (WCHAR)ch;
- pTarget++;
- }
- // we are done
- ch = 0;
- break;
- }
-
- // we may need as many as 1 character per byte, so reduce the byte count if necessary.
- // If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
- if (availableChars < availableBytes) {
- availableBytes = availableChars;
- }
-
- // To compute the upper bound, assume that all characters are ASCII characters at this point,
- // the boundary will be decreased for every non-ASCII character we encounter
- // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
- WCHAR *pStop = pTarget + availableBytes - 7;
-
- while (pTarget < pStop) {
- ch = *pSrc;
- pSrc++;
-
- if (ch > 0x7F) {
- goto LongCode;
- }
- *pTarget = (WCHAR)ch;
- pTarget++;
-
- // get pSrc to be 2-byte aligned
- if ((((size_t)pSrc) & 0x1) != 0) {
- ch = *pSrc;
- pSrc++;
- if (ch > 0x7F) {
- goto LongCode;
- }
- *pTarget = (WCHAR)ch;
- pTarget++;
- }
-
- // get pSrc to be 4-byte aligned
- if ((((size_t)pSrc) & 0x2) != 0) {
- ch = *(USHORT*)pSrc;
- if ((ch & 0x8080) != 0) {
- goto LongCodeWithMask16;
- }
-
- // Unfortunately, this is endianness sensitive
-#if BIGENDIAN
- *pTarget = (WCHAR)((ch >> 8) & 0x7F);
- pSrc += 2;
- *(pTarget + 1) = (WCHAR)(ch & 0x7F);
- pTarget += 2;
-#else // BIGENDIAN
- *pTarget = (WCHAR)(ch & 0x7F);
- pSrc += 2;
- *(pTarget + 1) = (WCHAR)((ch >> 8) & 0x7F);
- pTarget += 2;
-#endif // BIGENDIAN
- }
-
- // Run 8 characters at a time!
- while (pTarget < pStop) {
- ch = *(int*)pSrc;
- int chb = *(int*)(pSrc + 4);
- if (((ch | chb) & (int)0x80808080) != 0) {
- goto LongCodeWithMask32;
- }
-
- // Unfortunately, this is endianness sensitive
-#if BIGENDIAN
- *pTarget = (WCHAR)((ch >> 24) & 0x7F);
- *(pTarget + 1) = (WCHAR)((ch >> 16) & 0x7F);
- *(pTarget + 2) = (WCHAR)((ch >> 8) & 0x7F);
- *(pTarget + 3) = (WCHAR)(ch & 0x7F);
- pSrc += 8;
- *(pTarget + 4) = (WCHAR)((chb >> 24) & 0x7F);
- *(pTarget + 5) = (WCHAR)((chb >> 16) & 0x7F);
- *(pTarget + 6) = (WCHAR)((chb >> 8) & 0x7F);
- *(pTarget + 7) = (WCHAR)(chb & 0x7F);
- pTarget += 8;
-#else // BIGENDIAN
- *pTarget = (WCHAR)(ch & 0x7F);
- *(pTarget + 1) = (WCHAR)((ch >> 8) & 0x7F);
- *(pTarget + 2) = (WCHAR)((ch >> 16) & 0x7F);
- *(pTarget + 3) = (WCHAR)((ch >> 24) & 0x7F);
- pSrc += 8;
- *(pTarget + 4) = (WCHAR)(chb & 0x7F);
- *(pTarget + 5) = (WCHAR)((chb >> 8) & 0x7F);
- *(pTarget + 6) = (WCHAR)((chb >> 16) & 0x7F);
- *(pTarget + 7) = (WCHAR)((chb >> 24) & 0x7F);
- pTarget += 8;
-#endif // BIGENDIAN
- }
- break;
-
-#if BIGENDIAN
- LongCodeWithMask32 :
- // be careful about the sign extension
- ch = (int)(((uint)ch) >> 16);
- LongCodeWithMask16:
- ch = (int)(((uint)ch) >> 8);
-#else // BIGENDIAN
- LongCodeWithMask32:
- LongCodeWithMask16:
- ch &= 0xFF;
-#endif // BIGENDIAN
- pSrc++;
- if (ch <= 0x7F) {
- *pTarget = (WCHAR)ch;
- pTarget++;
- continue;
- }
-
- LongCode:
- int chc = *pSrc;
- pSrc++;
-
- if (
- // bit 6 has to be zero
- (ch & 0x40) == 0 ||
- // we are expecting to see trailing bytes like 10vvvvvv
- (chc & 0xC0) != 0x80)
- {
- goto BadLongCode;
- }
-
- chc &= 0x3F;
-
- // start a new long code
- if ((ch & 0x20) != 0) {
-
- // fold the first two bytes together
- chc |= (ch & 0x0F) << 6;
-
- if ((ch & 0x10) != 0) {
- // 4 byte encoding - surrogate
- ch = *pSrc;
- if (
- // check that bit 4 is zero, the non-shortest form of surrogate
- // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
- !InRange(chc >> 4, 0x01, 0x10) ||
- // we are expecting to see trailing bytes like 10vvvvvv
- (ch & 0xC0) != 0x80)
- {
- goto BadLongCode;
- }
-
- chc = (chc << 6) | (ch & 0x3F);
-
- ch = *(pSrc + 1);
- // we are expecting to see trailing bytes like 10vvvvvv
- if ((ch & 0xC0) != 0x80) {
- goto BadLongCode;
- }
- pSrc += 2;
-
- ch = (chc << 6) | (ch & 0x3F);
-
- *pTarget = (WCHAR)(((ch >> 10) & 0x7FF) +
- (SHORT)(CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10)));
- pTarget++;
-
- ch = (ch & 0x3FF) +
- (SHORT)(CharUnicodeInfo::LOW_SURROGATE_START);
-
- // extra byte, we're already planning 2 chars for 2 of these bytes,
- // but the big loop is testing the target against pStop, so we need
- // to subtract 2 more or we risk overrunning the input. Subtract
- // one here and one below.
- pStop--;
- }
- else {
- // 3 byte encoding
- ch = *pSrc;
- if (
- // check for non-shortest form of 3 byte seq
- (chc & (0x1F << 5)) == 0 ||
- // Can't have surrogates here.
- (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
- // we are expecting to see trailing bytes like 10vvvvvv
- (ch & 0xC0) != 0x80)
- {
- goto BadLongCode;
- }
- pSrc++;
-
- ch = (chc << 6) | (ch & 0x3F);
-
- // extra byte, we're only expecting 1 char for each of these 3 bytes,
- // but the loop is testing the target (not source) against pStop, so
- // we need to subtract 2 more or we risk overrunning the input.
- // Subtract 1 here and one more below
- pStop--;
- }
- }
- else {
- // 2 byte encoding
-
- ch &= 0x1F;
-
- // check for non-shortest form
- if (ch <= 1) {
- goto BadLongCode;
- }
- ch = (ch << 6) | chc;
- }
-
- *pTarget = (WCHAR)ch;
- pTarget++;
-
- // extra byte, we're only expecting 1 char for each of these 2 bytes,
- // but the loop is testing the target (not source) against pStop.
- // subtract an extra count from pStop so that we don't overrun the input.
- pStop--;
- }
-#endif // FASTLOOP
-
- Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd");
-
- // no pending bits at this point
- ch = 0;
- continue;
-
- BadLongCode:
- pSrc -= 2;
- ch = 0;
- continue;
- }
-
- // The input ended in the middle of a multi-byte sequence: flush it through the fallback
- if (ch != 0)
- {
- // Have to do fallback for invalid bytes
- if (fallback == nullptr)
- {
- fallback = decoderFallback->CreateFallbackBuffer();
- fallback->InternalInitialize(bytes, pAllocatedBufferEnd);
- }
-
- // This'll back us up the appropriate # of bytes if we didn't get anywhere.
- // The writing overload must be used here: the counting overload only returns the
- // number of fallback chars and never stores them, so the replacement for a truncated
- // trailing sequence would be missing from the output even though GetCharCount counts it.
- if (!FallbackInvalidByteSequence(&pSrc, ch, fallback, &pTarget))
- {
- Contract::Assert(pSrc >= bytes || pTarget == chars,
- "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing");
-
- // Ran out of buffer space
- // Need to throw an exception?
- fallback->InternalReset();
- ThrowCharsOverflow(pTarget == chars);
- }
- Contract::Assert(pSrc >= bytes,
- "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array");
- ch = 0;
- }
-
- // Shouldn't have anything in fallback buffer for GetChars
- // (don't have to check m_throwOnOverflow for chars)
- Contract::Assert(fallback == nullptr || fallback->GetRemaining() == 0,
- "[UTF8Encoding.GetChars]Expected empty fallback buffer at end");
-
- InternalDelete(fallback);
-
- return PtrDiff(pTarget, chars);
- }
-
- int GetBytes(WCHAR* chars, int charCount, BYTE* bytes, int byteCount)
- {
- Contract::Assert(chars != nullptr, "[UTF8Encoding.GetBytes]chars!=nullptr");
- Contract::Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0");
- Contract::Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0");
- Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetBytes]bytes!=nullptr");
-
- // For fallback we may need a fallback buffer.
- // We wait to initialize it though in case we don't have any broken input unicode
- EncoderFallbackBuffer* fallbackBuffer = nullptr;
- WCHAR *pSrc = chars;
- BYTE *pTarget = bytes;
-
- WCHAR *pEnd = pSrc + charCount;
- BYTE *pAllocatedBufferEnd = pTarget + byteCount;
-
- int ch = 0;
-
- // assume that JIT will enregister pSrc, pTarget and ch
-
- while (true) {
- // SLOWLOOP: does all range checks, handles all special cases, but it is slow
-
- if (pSrc >= pEnd) {
-
- if (ch == 0) {
- // Check if there's anything left to get out of the fallback buffer
- ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : 0;
- if (ch > 0) {
- goto ProcessChar;
- }
- }
- else {
- // Case of leftover surrogates in the fallback buffer
- if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) {
- Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF,
- "[UTF8Encoding.GetBytes]expected high surrogate"); //, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
-
- int cha = ch;
-
- ch = fallbackBuffer->InternalGetNextChar();
-
- if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
- ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo::LOW_SURROGATE_START - (CharUnicodeInfo::HIGH_SURROGATE_START << 10));
- goto EncodeChar;
- }
- else if (ch > 0){
- goto ProcessChar;
- }
- else {
- break;
- }
- }
- }
-
- // attempt to encode the partial surrogate (will fail or ignore)
- if (ch > 0)
- goto EncodeChar;
-
- // We're done
- break;
- }
-
- if (ch > 0) {
- // We have a high surrogate left over from a previous loop.
- Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF,
- "[UTF8Encoding.GetBytes]expected high surrogate");//, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
-
- // use separate helper variables for local contexts so that the jit optimizations
- // won't get confused about the variable lifetimes
- int cha = *pSrc;
-
- // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
- // if (IsLowSurrogate(cha)) {
- if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
- ch = cha + (ch << 10) +
- (0x10000
- - CharUnicodeInfo::LOW_SURROGATE_START
- - (CharUnicodeInfo::HIGH_SURROGATE_START << 10));
-
- pSrc++;
- }
- // else ch is still high surrogate and encoding will fail
-
- // attempt to encode the surrogate or partial surrogate
- goto EncodeChar;
- }
-
- // If we've used a fallback, then we have to check for it
- if (fallbackBuffer != nullptr)
- {
- ch = fallbackBuffer->InternalGetNextChar();
- if (ch > 0) goto ProcessChar;
- }
-
- // read next char. The JIT optimization seems to be getting confused when
- // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
- ch = *pSrc;
- pSrc++;
-
- ProcessChar:
- if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) {
- continue;
- }
- // either good char or partial surrogate
-
- EncodeChar:
- // throw exception on partial surrogate if necessary
- if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END))
- {
- // Lone surrogates aren't allowed, we have to do fallback for them
- // Have to make a fallback buffer if we don't have one
- if (fallbackBuffer == nullptr)
- {
- // wait on fallbacks if we can
- // For fallback we may need a fallback buffer
- fallbackBuffer = encoderFallback->CreateFallbackBuffer();
-
- // Set our internal fallback interesting things.
- fallbackBuffer->InternalInitialize(chars, pEnd, true);
- }
-
- // Do our fallback. Actually we already know its a mixed up surrogate,
- // so the ref pSrc isn't gonna do anything.
- fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc);
-
- // Ignore it if we don't throw
- ch = 0;
- continue;
- }
-
- // Count bytes needed
- int bytesNeeded = 1;
- if (ch > 0x7F) {
- if (ch > 0x7FF) {
- if (ch > 0xFFFF) {
- bytesNeeded++; // 4 bytes (surrogate pair)
- }
- bytesNeeded++; // 3 bytes (800-FFFF)
- }
- bytesNeeded++; // 2 bytes (80-7FF)
- }
-
- if (pTarget > pAllocatedBufferEnd - bytesNeeded) {
- // Left over surrogate from last time will cause pSrc == chars, so we'll throw
- if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack)
- {
- fallbackBuffer->MovePrevious(); // Didn't use this fallback char
- if (ch > 0xFFFF)
- fallbackBuffer->MovePrevious(); // Was surrogate, didn't use 2nd part either
- }
- else
- {
- pSrc--; // Didn't use this char
- if (ch > 0xFFFF)
- pSrc--; // Was surrogate, didn't use 2nd part either
- }
- Contract::Assert(pSrc >= chars || pTarget == bytes,
- "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room.");
- ThrowBytesOverflow(pTarget == bytes); // Throw if we must
- ch = 0; // Nothing left over (we backed up to start of pair if supplimentary)
- break;
- }
-
- if (ch <= 0x7F) {
- *pTarget = (BYTE)ch;
- }
- else {
- // use separate helper variables for local contexts so that the jit optimizations
- // won't get confused about the variable lifetimes
- int chb;
- if (ch <= 0x7FF) {
- // 2 BYTE encoding
- chb = (BYTE)(0xC0 | (ch >> 6));
- }
- else
- {
- if (ch <= 0xFFFF) {
- chb = (BYTE)(0xE0 | (ch >> 12));
- }
- else
- {
- *pTarget = (BYTE)(0xF0 | (ch >> 18));
- pTarget++;
-
- chb = 0x80 | ((ch >> 12) & 0x3F);
- }
- *pTarget = (BYTE)chb;
- pTarget++;
-
- chb = 0x80 | ((ch >> 6) & 0x3F);
- }
- *pTarget = (BYTE)chb;
- pTarget++;
-
- *pTarget = (BYTE)0x80 | (ch & 0x3F);
- }
- pTarget++;
-
-
-#ifdef FASTLOOP
- // If still have fallback don't do fast loop
- if (fallbackBuffer != nullptr && (ch = fallbackBuffer->InternalGetNextChar()) != 0)
- goto ProcessChar;
-
- int availableChars = PtrDiff(pEnd, pSrc);
- int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget);
-
- // don't fall into the fast decoding loop if we don't have enough characters
- // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
- if (availableChars <= 13) {
- // we are hoping for 1 BYTE per char
- if (availableBytes < availableChars) {
- // not enough output room. no pending bits at this point
- ch = 0;
- continue;
- }
-
- // try to get over the remainder of the ascii characters fast though
- WCHAR* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
- while (pSrc < pLocalEnd) {
- ch = *pSrc;
- pSrc++;
-
- // Not ASCII, need more than 1 BYTE per char
- if (ch > 0x7F)
- goto ProcessChar;
-
- *pTarget = (BYTE)ch;
- pTarget++;
- }
- // we are done, let ch be 0 to clear encoder
- ch = 0;
- break;
- }
-
- // we need at least 1 BYTE per character, but Convert might allow us to convert
- // only part of the input, so try as much as we can. Reduce charCount if necessary
- if (availableBytes < availableChars)
- {
- availableChars = availableBytes;
- }
-
- // FASTLOOP:
- // - optimistic range checks
- // - fallbacks to the slow loop for all special cases, exception throwing, etc.
-
- // To compute the upper bound, assume that all characters are ASCII characters at this point,
- // the boundary will be decreased for every non-ASCII character we encounter
- // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
- // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
- WCHAR *pStop = pSrc + availableChars - 5;
-
- while (pSrc < pStop) {
- ch = *pSrc;
- pSrc++;
-
- if (ch > 0x7F) {
- goto LongCode;
- }
- *pTarget = (BYTE)ch;
- pTarget++;
-
- // get pSrc aligned
- if (((size_t)pSrc & 0x2) != 0) {
- ch = *pSrc;
- pSrc++;
- if (ch > 0x7F) {
- goto LongCode;
- }
- *pTarget = (BYTE)ch;
- pTarget++;
- }
-
- // Run 4 characters at a time!
- while (pSrc < pStop) {
- ch = *(int*)pSrc;
- int chc = *(int*)(pSrc + 2);
- if (((ch | chc) & (int)0xFF80FF80) != 0) {
- goto LongCodeWithMask;
- }
-
- // Unfortunately, this is endianness sensitive
-#if BIGENDIAN
- *pTarget = (BYTE)(ch >> 16);
- *(pTarget + 1) = (BYTE)ch;
- pSrc += 4;
- *(pTarget + 2) = (BYTE)(chc >> 16);
- *(pTarget + 3) = (BYTE)chc;
- pTarget += 4;
-#else // BIGENDIAN
- *pTarget = (BYTE)ch;
- *(pTarget + 1) = (BYTE)(ch >> 16);
- pSrc += 4;
- *(pTarget + 2) = (BYTE)chc;
- *(pTarget + 3) = (BYTE)(chc >> 16);
- pTarget += 4;
-#endif // BIGENDIAN
- }
- continue;
-
- LongCodeWithMask:
-#if BIGENDIAN
- // be careful about the sign extension
- ch = (int)(((uint)ch) >> 16);
-#else // BIGENDIAN
- ch = (WCHAR)ch;
-#endif // BIGENDIAN
- pSrc++;
-
- if (ch > 0x7F) {
- goto LongCode;
- }
- *pTarget = (BYTE)ch;
- pTarget++;
- continue;
-
- LongCode:
- // use separate helper variables for slow and fast loop so that the jit optimizations
- // won't get confused about the variable lifetimes
- int chd;
- if (ch <= 0x7FF) {
- // 2 BYTE encoding
- chd = 0xC0 | (ch >> 6);
- }
- else {
- if (!InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
- // 3 BYTE encoding
- chd = 0xE0 | (ch >> 12);
- }
- else
- {
- // 4 BYTE encoding - high surrogate + low surrogate
- if (ch > CharUnicodeInfo::HIGH_SURROGATE_END) {
- // low without high -> bad, try again in slow loop
- pSrc -= 1;
- break;
- }
-
- chd = *pSrc;
- pSrc++;
-
- // if (!IsLowSurrogate(chd)) {
- if (!InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
- // high not followed by low -> bad, try again in slow loop
- pSrc -= 2;
- break;
- }
-
- ch = chd + (ch << 10) +
- (0x10000
- - CharUnicodeInfo::LOW_SURROGATE_START
- - (CharUnicodeInfo::HIGH_SURROGATE_START << 10));
-
- *pTarget = (BYTE)(0xF0 | (ch >> 18));
- // pStop - this BYTE is compensated by the second surrogate character
- // 2 input chars require 4 output bytes. 2 have been anticipated already
- // and 2 more will be accounted for by the 2 pStop-- calls below.
- pTarget++;
-
- chd = 0x80 | ((ch >> 12) & 0x3F);
- }
- *pTarget = (BYTE)chd;
- pStop--; // 3 BYTE sequence for 1 char, so need pStop-- and the one below too.
- pTarget++;
-
- chd = 0x80 | ((ch >> 6) & 0x3F);
- }
- *pTarget = (BYTE)chd;
- pStop--; // 2 BYTE sequence for 1 char so need pStop--.
- pTarget++;
-
- *pTarget = (BYTE)(0x80 | (ch & 0x3F));
- // pStop - this BYTE is already included
- pTarget++;
- }
-
- Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd");
-
-#endif // FASTLOOP
-
- // no pending char at this point
- ch = 0;
- }
-
- InternalDelete(fallbackBuffer);
-
- return (int)(pTarget - bytes);
- }
-
- int GetByteCount(WCHAR *chars, int count)
- {
- // For fallback we may need a fallback buffer.
- // We wait to initialize it though in case we don't have any broken input unicode
- EncoderFallbackBuffer* fallbackBuffer = nullptr;
- WCHAR *pSrc = chars;
- WCHAR *pEnd = pSrc + count;
-
- // Start by assuming we have as many as count
- int byteCount = count;
-
- int ch = 0;
-
- while (true) {
- // SLOWLOOP: does all range checks, handles all special cases, but it is slow
- if (pSrc >= pEnd) {
-
- if (ch == 0) {
- // Unroll any fallback that happens at the end
- ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : 0;
- if (ch > 0) {
- byteCount++;
- goto ProcessChar;
- }
- }
- else {
- // Case of surrogates in the fallback.
- if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) {
- Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF,
- "[UTF8Encoding.GetBytes]expected high surrogate");// , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
-
- ch = fallbackBuffer->InternalGetNextChar();
- byteCount++;
-
- if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
- ch = 0xfffd;
- byteCount++;
- goto EncodeChar;
- }
- else if (ch > 0){
- goto ProcessChar;
- }
- else {
- byteCount--; // ignore last one.
- break;
- }
- }
- }
-
- if (ch <= 0) {
- break;
- }
-
- // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
- byteCount++;
- goto EncodeChar;
- }
-
- if (ch > 0) {
- Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF,
- "[UTF8Encoding.GetBytes]expected high surrogate"); // , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
-
- // use separate helper variables for local contexts so that the jit optimizations
- // won't get confused about the variable lifetimes
- int cha = *pSrc;
-
- // count the pending surrogate
- byteCount++;
-
- // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
- // if (IsLowSurrogate(cha)) {
- if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
- // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
- ch = 0xfffd;
- // ch = cha + (ch << 10) +
- // (0x10000
- // - CharUnicodeInfo::LOW_SURROGATE_START
- // - (CharUnicodeInfo::HIGH_SURROGATE_START << 10) );
-
- // Use this next char
- pSrc++;
- }
- // else ch is still high surrogate and encoding will fail (so don't add count)
-
- // attempt to encode the surrogate or partial surrogate
- goto EncodeChar;
- }
-
- // If we've used a fallback, then we have to check for it
- if (fallbackBuffer != nullptr)
- {
- ch = fallbackBuffer->InternalGetNextChar();
- if (ch > 0)
- {
- // We have an extra byte we weren't expecting.
- byteCount++;
- goto ProcessChar;
- }
- }
-
- // read next char. The JIT optimization seems to be getting confused when
- // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
- ch = *pSrc;
- pSrc++;
-
- ProcessChar:
- if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) {
- // we will count this surrogate next time around
- byteCount--;
- continue;
- }
- // either good char or partial surrogate
-
- EncodeChar:
- // throw exception on partial surrogate if necessary
- if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END))
- {
- // Lone surrogates aren't allowed
- // Have to make a fallback buffer if we don't have one
- if (fallbackBuffer == nullptr)
- {
- // wait on fallbacks if we can
- // For fallback we may need a fallback buffer
- fallbackBuffer = encoderFallback->CreateFallbackBuffer();
-
- // Set our internal fallback interesting things.
- fallbackBuffer->InternalInitialize(chars, chars + count, false);
- }
-
- // Do our fallback. Actually we already know its a mixed up surrogate,
- // so the ref pSrc isn't gonna do anything.
- fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc);
-
- // Ignore it if we don't throw (we had preallocated this ch)
- byteCount--;
- ch = 0;
- continue;
- }
-
- // Count them
- if (ch > 0x7F) {
- if (ch > 0x7FF) {
- // the extra surrogate byte was compensated by the second surrogate character
- // (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char)
- byteCount++;
- }
- byteCount++;
- }
-
-#if WIN64
- // check for overflow
- if (byteCount < 0) {
- break;
- }
-#endif
-
-#ifdef FASTLOOP
- // If still have fallback don't do fast loop
- if (fallbackBuffer != nullptr && (ch = fallbackBuffer->InternalGetNextChar()) != 0)
- {
- // We're reserving 1 byte for each char by default
- byteCount++;
- goto ProcessChar;
- }
-
- int availableChars = PtrDiff(pEnd, pSrc);
-
- // don't fall into the fast decoding loop if we don't have enough characters
- if (availableChars <= 13) {
- // try to get over the remainder of the ascii characters fast though
- WCHAR* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
- while (pSrc < pLocalEnd) {
- ch = *pSrc;
- pSrc++;
- if (ch > 0x7F)
- goto ProcessChar;
- }
-
- // we are done
- break;
- }
-
-#if WIN64
- // make sure that we won't get a silent overflow inside the fast loop
- // (Fall out to slow loop if we have this many characters)
- availableChars &= 0x0FFFFFFF;
-#endif
-
- // To compute the upper bound, assume that all characters are ASCII characters at this point,
- // the boundary will be decreased for every non-ASCII character we encounter
- // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
- WCHAR *pStop = pSrc + availableChars - (3 + 4);
-
- while (pSrc < pStop) {
- ch = *pSrc;
- pSrc++;
-
- if (ch > 0x7F) // Not ASCII
- {
- if (ch > 0x7FF) // Not 2 Byte
- {
- if ((ch & 0xF800) == 0xD800) // See if its a Surrogate
- goto LongCode;
- byteCount++;
- }
- byteCount++;
- }
-
- // get pSrc aligned
- if (((size_t)pSrc & 0x2) != 0) {
- ch = *pSrc;
- pSrc++;
- if (ch > 0x7F) // Not ASCII
- {
- if (ch > 0x7FF) // Not 2 Byte
- {
- if ((ch & 0xF800) == 0xD800) // See if its a Surrogate
- goto LongCode;
- byteCount++;
- }
- byteCount++;
- }
- }
-
- // Run 2 * 4 characters at a time!
- while (pSrc < pStop) {
- ch = *(int*)pSrc;
- int chc = *(int*)(pSrc + 2);
- if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII
- {
- if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte
- {
- goto LongCodeWithMask;
- }
-
-
- if ((ch & (int)0xFF800000) != 0) // Actually 0x07800780 is all we care about (4 bits)
- byteCount++;
- if ((ch & (int)0xFF80) != 0)
- byteCount++;
- if ((chc & (int)0xFF800000) != 0)
- byteCount++;
- if ((chc & (int)0xFF80) != 0)
- byteCount++;
- }
- pSrc += 4;
-
- ch = *(int*)pSrc;
- chc = *(int*)(pSrc + 2);
- if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII
- {
- if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte
- {
- goto LongCodeWithMask;
- }
-
- if ((ch & (int)0xFF800000) != 0)
- byteCount++;
- if ((ch & (int)0xFF80) != 0)
- byteCount++;
- if ((chc & (int)0xFF800000) != 0)
- byteCount++;
- if ((chc & (int)0xFF80) != 0)
- byteCount++;
- }
- pSrc += 4;
- }
- break;
-
- LongCodeWithMask:
-#if BIGENDIAN
- // be careful about the sign extension
- ch = (int)(((uint)ch) >> 16);
-#else // BIGENDIAN
- ch = (WCHAR)ch;
-#endif // BIGENDIAN
- pSrc++;
-
- if (ch <= 0x7F) {
- continue;
- }
-
- LongCode:
- // use separate helper variables for slow and fast loop so that the jit optimizations
- // won't get confused about the variable lifetimes
- if (ch > 0x7FF) {
- if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
- // 4 byte encoding - high surrogate + low surrogate
-
- int chd = *pSrc;
- if (
- ch > CharUnicodeInfo::HIGH_SURROGATE_END ||
- !InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END))
- {
- // Back up and drop out to slow loop to figure out error
- pSrc--;
- break;
- }
- pSrc++;
-
- // byteCount - this byte is compensated by the second surrogate character
- }
- byteCount++;
- }
- byteCount++;
-
- // byteCount - the last byte is already included
- }
-#endif // FASTLOOP
-
- // no pending char at this point
- ch = 0;
- }
-
-#if WIN64
- // check for overflow
- if (byteCount < 0) {
- throw ArgumentException("Conversion buffer overflow.");
- }
-#endif
-
- Contract::Assert(fallbackBuffer == nullptr || fallbackBuffer->GetRemaining() == 0,
- "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer");
-
- InternalDelete(fallbackBuffer);
-
- return byteCount;
- }
-
-};
-
-
-////////////////////////////////////////////////////////////////////////////
-//
-// UTF8ToUnicode
-//
-// Maps a UTF-8 character string to its wide character string counterpart.
-//
-////////////////////////////////////////////////////////////////////////////
-
-int UTF8ToUnicode(
- LPCSTR lpSrcStr,
- int cchSrc,
- LPWSTR lpDestStr,
- int cchDest,
- DWORD dwFlags
- )
-{
- int ret;
- UTF8Encoding enc(dwFlags & MB_ERR_INVALID_CHARS);
- try {
- ret = enc.GetCharCount((BYTE*)lpSrcStr, cchSrc);
- if (cchDest){
- if (ret > cchDest){
- SetLastError(ERROR_INSUFFICIENT_BUFFER);
- ret = 0;
- }
- enc.GetChars((BYTE*)lpSrcStr, cchSrc, (WCHAR*)lpDestStr, ret);
- }
- }
- catch (const InsufficientBufferException& e){
- SetLastError(ERROR_INSUFFICIENT_BUFFER);
- return 0;
- }
- catch (const DecoderFallbackException& e){
- SetLastError(ERROR_NO_UNICODE_TRANSLATION);
- return 0;
- }
- catch (const ArgumentException& e){
- SetLastError(ERROR_INVALID_PARAMETER);
- return 0;
- }
- return ret;
-}
-
-////////////////////////////////////////////////////////////////////////////
-//
-// UnicodeToUTF8
-//
-// Maps a Unicode character string to its UTF-8 string counterpart.
-//
-////////////////////////////////////////////////////////////////////////////
-
-int UnicodeToUTF8(
- LPCWSTR lpSrcStr,
- int cchSrc,
- LPSTR lpDestStr,
- int cchDest)
-{
- int ret;
- UTF8Encoding enc(false);
- try{
- ret = enc.GetByteCount((WCHAR*)lpSrcStr, cchSrc);
- if (cchDest){
- if (ret > cchDest){
- SetLastError(ERROR_INSUFFICIENT_BUFFER);
- ret = 0;
- }
- enc.GetBytes((WCHAR*)lpSrcStr, cchSrc, (BYTE*)lpDestStr, ret);
- }
- }
- catch (const InsufficientBufferException& e){
- SetLastError(ERROR_INSUFFICIENT_BUFFER);
- return 0;
- }
- catch (const EncoderFallbackException& e){
- SetLastError(ERROR_NO_UNICODE_TRANSLATION);
- return 0;
- }
- catch (const ArgumentException& e){
- SetLastError(ERROR_INVALID_PARAMETER);
- return 0;
- }
- return ret;
-}
}
else
{
- NewArrayHolder<WCHAR> wzTempName(DuplicateStringThrowing(ssTempName.GetUnicode()));
+ NewArrayHolder<WCHAR> wzTempName(ssTempName.GetCopyOfUnicodeString());
// publish result
if (InterlockedCompareExchangeT(&wszModuleName, (LPWSTR)wzTempName, nullptr) == nullptr)
gspawn.c
gfile.c
gfile-posix.c
- gutf8.c)
+ gutf8.c
+ ${CLR_SRC_NATIVE_DIR}/minipal/utf8.c)
+
+if(IS_BIG_ENDIAN)
+ set_source_files_properties("${CLR_SRC_NATIVE_DIR}/minipal/utf8.c" PROPERTIES COMPILE_FLAGS "-DBIGENDIAN=1")
+endif()
set(eglib_headers
glib.h
gmodule.h)
if(HAVE_CLOCK_NANOSLEEP)
-list(APPEND eglib_common_sources gclock-nanosleep.c)
+ list(APPEND eglib_common_sources gclock-nanosleep.c)
endif()
set(eglib_sources "${eglib_platform_sources};${eglib_common_sources}")
#include <errno.h>
#include "../utils/mono-errno.h"
+#include <minipal/utf8.h>
+
#ifdef _MSC_VER
#define FORCE_INLINE(RET_TYPE) __forceinline RET_TYPE
#else
#define FORCE_INLINE(RET_TYPE) inline RET_TYPE __attribute__((always_inline))
#endif
-
-#define UNROLL_DECODE_UTF8 0
-#define UNROLL_ENCODE_UTF8 0
-
-static int decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar);
-static int encode_utf32be (gunichar c, char *outbuf, size_t outleft);
-
-static int decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar);
-static int encode_utf32le (gunichar c, char *outbuf, size_t outleft);
-
-static int decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar);
-static int encode_utf16be (gunichar c, char *outbuf, size_t outleft);
-
-static int decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar);
-static int encode_utf16le (gunichar c, char *outbuf, size_t outleft);
-
-static FORCE_INLINE (int) decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar);
-static int encode_utf8 (gunichar c, char *outbuf, size_t outleft);
-
-static int decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar);
-static int encode_latin1 (gunichar c, char *outbuf, size_t outleft);
-
#if G_BYTE_ORDER == G_LITTLE_ENDIAN
-#define decode_utf32 decode_utf32le
-#define encode_utf32 encode_utf32le
#define decode_utf16 decode_utf16le
-#define encode_utf16 encode_utf16le
#else
-#define decode_utf32 decode_utf32be
-#define encode_utf32 encode_utf32be
#define decode_utf16 decode_utf16be
-#define encode_utf16 encode_utf16be
#endif
-/*
- * Unicode encoders and decoders
- */
-
-static FORCE_INLINE (uint32_t)
-read_uint32_endian (unsigned char *inptr, unsigned endian)
-{
- if (endian == G_LITTLE_ENDIAN)
- return (inptr[3] << 24) | (inptr[2] << 16) | (inptr[1] << 8) | inptr[0];
- return (inptr[0] << 24) | (inptr[1] << 16) | (inptr[2] << 8) | inptr[3];
-}
-
-static int
-decode_utf32_endian (char *inbuf, size_t inleft, gunichar *outchar, unsigned endian)
-{
- unsigned char *inptr = (unsigned char *) inbuf;
- gunichar c;
-
- if (inleft < 4) {
- mono_set_errno (EINVAL);
- return -1;
- }
-
- c = read_uint32_endian (inptr, endian);
-
- if (c >= 0xd800 && c < 0xe000) {
- mono_set_errno (EILSEQ);
- return -1;
- } else if (c >= 0x110000) {
- mono_set_errno (EILSEQ);
- return -1;
- }
-
- *outchar = c;
-
- return 4;
-}
-
-static int
-decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar)
-{
- return decode_utf32_endian (inbuf, inleft, outchar, G_BIG_ENDIAN);
-}
-
-static int
-decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar)
-{
- return decode_utf32_endian (inbuf, inleft, outchar, G_LITTLE_ENDIAN);
-}
-
-static int
-encode_utf32be (gunichar c, char *outbuf, size_t outleft)
-{
- unsigned char *outptr = (unsigned char *) outbuf;
-
- if (outleft < 4) {
- mono_set_errno (E2BIG);
- return -1;
- }
-
- outptr[0] = (c >> 24) & 0xff;
- outptr[1] = (c >> 16) & 0xff;
- outptr[2] = (c >> 8) & 0xff;
- outptr[3] = c & 0xff;
-
- return 4;
-}
-
-static int
-encode_utf32le (gunichar c, char *outbuf, size_t outleft)
-{
- unsigned char *outptr = (unsigned char *) outbuf;
-
- if (outleft < 4) {
- mono_set_errno (E2BIG);
- return -1;
- }
-
- outptr[0] = c & 0xff;
- outptr[1] = (c >> 8) & 0xff;
- outptr[2] = (c >> 16) & 0xff;
- outptr[3] = (c >> 24) & 0xff;
-
- return 4;
-}
-
static FORCE_INLINE (uint16_t)
read_uint16_endian (unsigned char *inptr, unsigned endian)
{
}
static FORCE_INLINE (int)
-encode_utf16_endian (gunichar c, char *outbuf, size_t outleft, unsigned endian)
-{
- unsigned char *outptr = (unsigned char *) outbuf;
- gunichar2 ch;
- gunichar c2;
-
- if (c < 0x10000) {
- if (outleft < 2) {
- mono_set_errno (E2BIG);
- return -1;
- }
-
- write_uint16_endian (outptr, GUNICHAR_TO_UINT16 (c), endian);
- return 2;
- } else {
- if (outleft < 4) {
- mono_set_errno (E2BIG);
- return -1;
- }
-
- c2 = c - 0x10000;
-
- ch = (gunichar2) ((c2 >> 10) + 0xd800);
- write_uint16_endian (outptr, ch, endian);
-
- ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
- write_uint16_endian (outptr + 2, ch, endian);
- return 4;
- }
-}
-
-static int
-encode_utf16be (gunichar c, char *outbuf, size_t outleft)
-{
- return encode_utf16_endian (c, outbuf, outleft, G_BIG_ENDIAN);
-}
-
-static int
-encode_utf16le (gunichar c, char *outbuf, size_t outleft)
-{
- return encode_utf16_endian (c, outbuf, outleft, G_LITTLE_ENDIAN);
-}
-
-static FORCE_INLINE (int)
decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar)
{
unsigned char *inptr = (unsigned char *) inbuf;
return GSIZE_TO_INT(n);
}
-static int
-encode_utf8 (gunichar c, char *outbuf, size_t outleft)
-{
- unsigned char *outptr = (unsigned char *) outbuf;
- int base;
- size_t n;
-
- if (c < 0x80) {
- outptr[0] = GUNICHAR_TO_UINT8 (c);
- return 1;
- } else if (c < 0x800) {
- base = 192;
- n = 2;
- } else if (c < 0x10000) {
- base = 224;
- n = 3;
- } else if (c < 0x200000) {
- base = 240;
- n = 4;
- } else if (c < 0x4000000) {
- base = 248;
- n = 5;
- } else {
- base = 252;
- n = 6;
- }
-
- if (outleft < n) {
- mono_set_errno (E2BIG);
- return -1;
- }
-
-#if UNROLL_ENCODE_UTF8
- switch (n) {
- case 6: outptr[5] = (c & 0x3f) | 0x80; c >>= 6;
- case 5: outptr[4] = (c & 0x3f) | 0x80; c >>= 6;
- case 4: outptr[3] = (c & 0x3f) | 0x80; c >>= 6;
- case 3: outptr[2] = (c & 0x3f) | 0x80; c >>= 6;
- case 2: outptr[1] = (c & 0x3f) | 0x80; c >>= 6;
- case 1: outptr[0] = c | base;
- }
-#else
- for (size_t i = n - 1; i > 0; i--) {
- outptr[i] = (c & 0x3f) | 0x80;
- c >>= 6;
- }
-
- outptr[0] = GUNICHAR_TO_UINT8 (c | base);
-#endif
-
- return GSIZE_TO_INT(n);
-}
-
-static int
-decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar)
-{
- *outchar = (unsigned char) *inbuf;
- return 1;
-}
-
-static int
-encode_latin1 (gunichar c, char *outbuf, size_t outleft)
-{
- if (outleft < 1) {
- mono_set_errno (E2BIG);
- return -1;
- }
-
- if (c > 0xff) {
- mono_set_errno (EILSEQ);
- return -1;
- }
-
- *outbuf = (char) c;
-
- return 1;
-}
-
-
-/*
- * Simple conversion API
- */
-
static gpointer error_quark = (gpointer)"ConvertError";
gpointer
{
return error_quark;
}
-/*
- * Unicode conversion
- */
/**
* An explanation of the conversion can be found at:
return outbuf;
}
-static gunichar2 *
-eg_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, gboolean replace_invalid_codepoints, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, unsigned endian)
+static FORCE_INLINE (void)
+map_error(GError **err)
{
- gunichar2 *outbuf, *outptr;
- size_t outlen = 0;
- size_t inleft;
- char *inptr;
- gunichar c;
- int u, n;
-
- g_return_val_if_fail (str != NULL, NULL);
-
- if (len < 0) {
- if (include_nuls) {
- g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "Conversions with embedded nulls must pass the string length");
- return NULL;
- }
-
- len = (glong)strlen (str);
+ if (errno == MINIPAL_ERROR_INSUFFICIENT_BUFFER) {
+ g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY, "Allocation failed.");
+ } else if (errno == MINIPAL_ERROR_NO_UNICODE_TRANSLATION) {
+ g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "Illegal byte sequence encountered in the input.");
}
+}
- inptr = (char *) str;
- inleft = len;
-
- while (inleft > 0) {
- if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
- goto error;
-
- if (c == 0 && !include_nuls)
- break;
+/*
+ * Converts UTF-8 input into a newly allocated, NUL-terminated UTF-16 buffer
+ * by calling the minipal length query followed by the minipal converter.
+ *
+ * str:           input bytes; NULL is rejected and NULL is returned.
+ * len:           number of input bytes, or < 0 to measure str with strlen.
+ * items_read:    kept for signature compatibility; never written here.
+ * items_written: receives the count reported by minipal, or 0 when errno
+ *                signals a failure.
+ * err:           filled in by map_error for the errno values it recognises.
+ * flags:         MINIPAL_* flags handed to both minipal calls.
+ * treatAsLE:     on big-endian builds adds MINIPAL_TREAT_AS_LITTLE_ENDIAN.
+ *
+ * Returns NULL when the length query yields <= 0, otherwise the buffer,
+ * which comes from g_malloc like the UTF-16 -> UTF-8 sibling.
+ */
+static gunichar2 *
+g_utf8_to_utf16_impl (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err, int flags, bool treatAsLE)
+{
+ /* strlen below must never see NULL; the previous implementation had the same guard. */
+ g_return_val_if_fail (str != NULL, NULL);
+
+ errno = 0;
+ gunichar2* lpDestStr = NULL;
+#if G_BYTE_ORDER == G_BIG_ENDIAN
+ if (treatAsLE)
+ flags |= MINIPAL_TREAT_AS_LITTLE_ENDIAN;
+#endif
- if ((u = g_unichar_to_utf16_endian (c, NULL, endian)) < 0) {
- if (replace_invalid_codepoints) {
- u = 2;
- } else {
- mono_set_errno (EILSEQ);
- goto error;
- }
- }
+ /* NOTE(review): the + 1 puts the terminator inside the converted range, so the reported count presumably includes it - confirm callers expect that. */
+ if (len < 0)
+ len = (glong)strlen(str) + 1;
- outlen += u;
- inleft -= n;
- inptr += n;
- }
+ glong ret = (glong)minipal_get_length_utf8_to_utf16 (str, len, flags);
- if (items_read)
- *items_read = GPTRDIFF_TO_LONG (inptr - str);
+ map_error(err);
 if (items_written)
- *items_written = (glong)outlen;
+ *items_written = errno == 0 ? ret : 0;
- if (G_LIKELY (!custom_alloc_func))
- outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2));
- else
- outptr = outbuf = (gunichar2 *)custom_alloc_func ((outlen + 1) * sizeof (gunichar2), custom_alloc_data);
+ if (ret <= 0)
+ return NULL;
- if (G_UNLIKELY (custom_alloc_func && !outbuf)) {
- mono_set_errno (ENOMEM);
- goto error;
- }
+ /* g_malloc instead of raw malloc: the result is dereferenced right away, so an unchecked NULL would crash here. */
+ lpDestStr = (gunichar2 *)g_malloc((ret + 1) * sizeof(gunichar2));
+ ret = (glong)minipal_convert_utf8_to_utf16 (str, len, lpDestStr, ret, flags);
+ lpDestStr[ret] = '\0';
- inptr = (char *) str;
- inleft = len;
+ if (items_written)
+ *items_written = errno == 0 ? ret : 0;
- while (inleft > 0) {
- if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
- break;
+ map_error(err);
+ return lpDestStr;
+}
- if (c == 0 && !include_nuls)
- break;
+static gunichar2 *
+g_utf8_to_utf16le_custom_alloc_impl (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, bool treatAsLE)
+{
+ guint flags = 0;
+ errno = 0;
+#if G_BYTE_ORDER == G_BIG_ENDIAN
+ if (treatAsLE)
+ flags = MINIPAL_TREAT_AS_LITTLE_ENDIAN;
+#endif
+ if (len < 0)
+ len = (glong)strlen(str) + 1;
- u = g_unichar_to_utf16_endian (c, outptr, endian);
- if ((u < 0) && replace_invalid_codepoints) {
- outptr[0] = 0xFFFD;
- outptr[1] = 0xFFFD;
- u = 2;
- }
+ glong ret = (glong)minipal_get_length_utf8_to_utf16 (str, len, flags);
- outptr += u;
- inleft -= n;
- inptr += n;
- }
+ map_error(err);
- *outptr = '\0';
+ if (items_written)
+ *items_written = errno == 0 ? ret : 0;
- return outbuf;
+ if (ret <= 0)
+ return NULL;
-error:
- if (errno == ENOMEM) {
- g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY,
- "Allocation failed.");
- } else if (errno == EILSEQ) {
- g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
- "Illegal byte sequence encountered in the input.");
- } else if (items_read) {
- /* partial input is ok if we can let our caller know... */
- } else {
- g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
- "Partial byte sequence encountered in the input.");
+ gunichar2 *lpDestStr = custom_alloc_func((ret + 1) * sizeof(gunichar2), custom_alloc_data);
+ if (G_UNLIKELY (!lpDestStr)) {
+ g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY, "Allocation failed.");
+ return NULL;
}
- if (items_read)
- *items_read = GPTRDIFF_TO_LONG (inptr - str);
-
- if (items_written)
- *items_written = 0;
+ flags |= MINIPAL_MB_NO_REPLACE_INVALID_CHARS;
+ ret = (glong)minipal_convert_utf8_to_utf16 (str, len, lpDestStr, ret, flags);
+ lpDestStr[ret] = '\0';
- return NULL;
+ map_error(err);
+ return lpDestStr;
}
gunichar2 *
g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
{
- return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, NULL, NULL, err, G_BYTE_ORDER);
-}
-
-gunichar2 *
-g_utf8_to_utf16be (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
-{
- return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, NULL, NULL, err, G_BIG_ENDIAN);
+ return g_utf8_to_utf16_impl (str, len, items_read, items_written, err, MINIPAL_MB_NO_REPLACE_INVALID_CHARS, false);
}
gunichar2 *
g_utf8_to_utf16le (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
{
- return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, NULL, NULL, err, G_LITTLE_ENDIAN);
+ return g_utf8_to_utf16_impl (str, len, items_read, items_written, err, MINIPAL_MB_NO_REPLACE_INVALID_CHARS, true);
}
gunichar2 *
-g_utf8_to_utf16_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err)
+eg_wtf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
{
- return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, custom_alloc_func, custom_alloc_data, err, G_BYTE_ORDER);
+ return g_utf8_to_utf16_impl (str, len, items_read, items_written, err, 0, false);
}
gunichar2 *
-g_utf8_to_utf16be_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err)
+g_utf8_to_utf16_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err)
{
- return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, custom_alloc_func, custom_alloc_data, err, G_BIG_ENDIAN);
+ return g_utf8_to_utf16le_custom_alloc_impl (str, len, items_read, items_written, custom_alloc_func, custom_alloc_data, err, false);
}
gunichar2 *
g_utf8_to_utf16le_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err)
{
- return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, custom_alloc_func, custom_alloc_data, err, G_LITTLE_ENDIAN);
-}
-
-gunichar2 *
-eg_utf8_to_utf16_with_nuls (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
-{
- return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, FALSE, NULL, NULL, err, G_BYTE_ORDER);
-}
-
-gunichar2 *
-eg_wtf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
-{
- return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, TRUE, NULL, NULL, err, G_BYTE_ORDER);
+ return g_utf8_to_utf16le_custom_alloc_impl (str, len, items_read, items_written, custom_alloc_func, custom_alloc_data, err, true);
}
gunichar *
return outbuf;
}
-static
-gchar *
-eg_utf16_to_utf8_general (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, unsigned endian)
+static gchar *
+g_utf16_to_utf8_impl (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err, bool treatAsLE)
{
- char *inptr, *outbuf, *outptr;
- size_t outlen = 0;
- size_t inleft;
- gunichar c;
- int n;
-
- g_return_val_if_fail (str != NULL, NULL);
-
+ guint flags = 0;
+ errno = 0;
+ gchar* lpDestStr = NULL;
+#if G_BYTE_ORDER == G_BIG_ENDIAN
+ if (treatAsLE)
+ flags |= MINIPAL_TREAT_AS_LITTLE_ENDIAN;
+#endif
if (len < 0) {
len = 0;
while (str[len])
len++;
- }
-
- inptr = (char *) str;
- inleft = len * 2;
-
- while (inleft > 0) {
- if ((n = decode_utf16_endian (inptr, inleft, &c, endian)) < 0) {
- if (n == -2 && inleft > 2) {
- /* This means that the first UTF-16 char was read, but second failed */
- inleft -= 2;
- inptr += 2;
- }
- if (errno == EILSEQ) {
- g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
- "Illegal byte sequence encountered in the input.");
- } else if (items_read) {
- /* partial input is ok if we can let our caller know... */
- break;
- } else {
- g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
- "Partial byte sequence encountered in the input.");
- }
-
- if (items_read)
- *items_read = GPTRDIFF_TO_LONG ((inptr - (char *) str) / 2);
-
- if (items_written)
- *items_written = 0;
-
- return NULL;
- } else if (c == 0)
- break;
-
- outlen += g_unichar_to_utf8 (c, NULL);
- inleft -= n;
- inptr += n;
+ len++;
}
- if (items_read)
- *items_read = GPTRDIFF_TO_LONG ((inptr - (char *) str) / 2);
+ glong ret = (glong)minipal_get_length_utf16_to_utf8 (str, len, flags);
+ map_error(err);
if (items_written)
- *items_written = (glong)outlen;
-
- if (G_LIKELY (!custom_alloc_func))
- outptr = outbuf = g_malloc (outlen + 1);
- else
- outptr = outbuf = (char *)custom_alloc_func (outlen + 1, custom_alloc_data);
+ *items_written = errno == 0 ? ret : 0;
- if (G_UNLIKELY (custom_alloc_func && !outbuf)) {
- g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY, "Allocation failed.");
- if (items_written)
- *items_written = 0;
+ if (ret <= 0)
return NULL;
- }
-
- inptr = (char *) str;
- inleft = len * 2;
-
- while (inleft > 0) {
- if ((n = decode_utf16_endian (inptr, inleft, &c, endian)) < 0)
- break;
- else if (c == 0)
- break;
- outptr += g_unichar_to_utf8 (c, outptr);
- inleft -= n;
- inptr += n;
- }
+ lpDestStr = (gchar *)g_malloc((ret + 1) * sizeof(gchar));
+ ret = (glong)minipal_convert_utf16_to_utf8 (str, len, lpDestStr, ret, flags);
+ lpDestStr[ret] = '\0';
- *outptr = '\0';
+ if (items_written)
+ *items_written = errno == 0 ? ret : 0;
- return outbuf;
+ map_error(err);
+ return lpDestStr;
}
gchar *
g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
{
- return eg_utf16_to_utf8_general (str, len, items_read, items_written, NULL, NULL, err, G_BYTE_ORDER);
+ return g_utf16_to_utf8_impl (str, len, items_read, items_written, err, /* treatAsLE */ false);
}
gchar *
g_utf16le_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
{
- return eg_utf16_to_utf8_general (str, len, items_read, items_written, NULL, NULL, err, G_LITTLE_ENDIAN);
-}
-
-gchar *
-g_utf16be_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
-{
- return eg_utf16_to_utf8_general (str, len, items_read, items_written, NULL, NULL, err, G_BIG_ENDIAN);
+ return g_utf16_to_utf8_impl (str, len, items_read, items_written, err, /* treatAsLE */ true);
}
gchar *
g_utf16_to_utf8_custom_alloc (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err)
{
- return eg_utf16_to_utf8_general (str, len, items_read, items_written, custom_alloc_func, custom_alloc_data, err, G_BYTE_ORDER);
+ errno = 0;
+
+ if (len < 0) {
+ len = 0;
+ while (str[len])
+ len++;
+
+ len++;
+ }
+
+ glong ret = (glong)minipal_get_length_utf16_to_utf8 (str, len, 0);
+ map_error(err);
+
+ if (items_written)
+ *items_written = errno == 0 ? ret : 0;
+
+ if (ret <= 0)
+ return NULL;
+
+ gchar *lpDestStr = custom_alloc_func((ret + 1) * sizeof (gunichar2), custom_alloc_data);
+ if (G_UNLIKELY (!lpDestStr)) {
+ g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY, "Allocation failed.");
+ return NULL;
+ }
+
+ ret = (glong)minipal_convert_utf16_to_utf8 (str, len, lpDestStr, ret, 0);
+ lpDestStr[ret] = '\0';
+
+ map_error(err);
+ return lpDestStr;
}
gunichar *
gunichar *g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err);
G_EXTERN_C // Used by libtest, at least.
gunichar2 *g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err);
-gunichar2 *g_utf8_to_utf16be (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err);
gunichar2 *g_utf8_to_utf16le (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err);
-gunichar2 *eg_utf8_to_utf16_with_nuls (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err);
gunichar2 *eg_wtf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err);
G_EXTERN_C // Used by libtest, at least.
gchar *g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err);
gchar *g_utf16le_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err);
-gchar *g_utf16be_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err);
gunichar *g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err);
gchar *g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err);
gunichar2 *g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err);
g_fixed_buffer_custom_allocator (gsize req_size, gpointer custom_alloc_data);
gunichar2 *g_utf8_to_utf16_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err);
-gunichar2 *g_utf8_to_utf16be_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err);
gunichar2 *g_utf8_to_utf16le_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err);
gchar *g_utf16_to_utf8_custom_alloc (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err);
gerror = NULL;
if (include_nuls)
- ret = eg_utf8_to_utf16_with_nuls (utf8, size_spec, &in_read, &out_read, &gerror);
+ ret = g_utf8_to_utf16 (utf8, size_spec, &in_read, &out_read, &gerror);
else
ret = g_utf8_to_utf16 (utf8, size_spec, &in_read, &out_read, &gerror);
#endif
/* implicit length is forbidden */
- if (eg_utf8_to_utf16_with_nuls (src1, -1, NULL, NULL, NULL) != NULL)
+ if (g_utf8_to_utf16 (src1, -1, NULL, NULL, NULL) != NULL)
return FAILED ("explicit nulls must fail with -1 length\n");
/* empty string */
static Test utf8_tests [] = {
{"g_utf16_to_utf8", test_utf16_to_utf8},
{"g_utf8_to_utf16", test_utf8_to_utf16},
- {"g_utf8_to_utf16_with_nuls", test_utf8_to_utf16_with_nuls},
+ {"g_utf8_to_utf16_nuls", test_utf8_to_utf16_with_nuls},
{"g_utf8_seq", test_utf8_seq},
{"g_ucs4_to_utf16", test_ucs4_to_utf16 },
{"g_utf16_to_ucs4", test_utf16_to_ucs4 },
mono_mem_manager_init_reflection_hashes (mem_manager);
- /*
+ /*
* If the initializing thread was rudely aborted, the exception is not stored
* in the hash.
*/
gunichar2 *ut = NULL;
glong items_written;
- ut = eg_utf8_to_utf16_with_nuls (text, length, NULL, &items_written, &eg_error);
+ ut = g_utf8_to_utf16 (text, length, NULL, &items_written, &eg_error);
if (eg_error) {
o = NULL_HANDLE_STRING;
target_link_libraries(mono-sgen PRIVATE monoapi eglib_api monosgen-static)
if(HAVE_ICU_SHIM)
target_link_libraries(mono-sgen PRIVATE icu_shim_objects)
- endif()
+ endif()
target_link_libraries(mono-sgen PRIVATE ${OS_LIBS} ${LLVM_LIBS} ${ICU_LIBS} ${Z_LIBS})
# Alpine Linux implements ucontext in a different library
if(CLR_CMAKE_HOST_ALPINE_LINUX AND TARGET_S390X)
--- /dev/null
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+#include <minipal/utf8.h>
+
+#include <errno.h>
+#include <limits.h>
+#include <string.h>
+#include <assert.h>
+
+#define HIGH_SURROGATE_START 0xd800
+#define HIGH_SURROGATE_END 0xdbff
+#define LOW_SURROGATE_START 0xdc00
+#define LOW_SURROGATE_END 0xdfff
+
+// Returns true when c is a UTF-16 high (leading) surrogate: its top six bits equal those of 0xD800, i.e. c is in 0xD800-0xDBFF.
+static bool IsHighSurrogate(const CHAR16_T c)
+{
+ return (c & 0xFC00) == HIGH_SURROGATE_START;
+}
+
+// Returns true when c is a UTF-16 low (trailing) surrogate: its top six bits equal those of 0xDC00, i.e. c is in 0xDC00-0xDFFF.
+static bool IsLowSurrogate(const CHAR16_T c)
+{
+ return (c & 0xFC00) == LOW_SURROGATE_START;
+}
+
+// Returns true when c is either surrogate half: its top five bits equal those of 0xD800, i.e. c is in 0xD800-0xDFFF.
+static bool IsSurrogate(const CHAR16_T c)
+{
+ return (c & 0xF800) == HIGH_SURROGATE_START;
+}
+
+typedef struct // Replacement-fallback state used while decoding bytes into CHAR16_T output.
+{
+ // Input start and output limit first, then the default (replacement) string and the cursor over it
+ unsigned char* byteStart; // start of the input bytes; NULL after Reset, asserted non-NULL before a fallback copy
+ CHAR16_T* charEnd; // output limit: the fallback copy refuses to write at or beyond this pointer
+ const CHAR16_T strDefault[2]; // replacement characters handed out one at a time by GetNextChar
+ int strDefaultLength; // number of strDefault entries in use; 0 means there is no replacement
+ int fallbackCount; // replacement chars still pending; negative once nothing is left
+ int fallbackIndex; // index into strDefault of the char most recently returned; -1 before the first
+} DecoderBuffer;
+
+static CHAR16_T DecoderReplacementFallbackBuffer_GetNextChar(DecoderBuffer* self)
+{
+ // Advances the fallback cursor and returns the next replacement char, or '\0' once none is left.
+ // fallbackCount is deliberately allowed to go negative: 0 means the char returned now is the last one, < 0 means nothing is left.
+ self->fallbackCount--;
+ self->fallbackIndex++;
+
+ // Do we have anything left? 0 is now last fallback char, negative is nothing left
+ if (self->fallbackCount < 0)
+ return '\0';
+
+ // Guard against the counter having wrapped around after many decrements.
+ // NOTE(review): reaching INT_MAX here would need INT_MIN to be decremented, which is undefined for signed int in C - confirm this check is still wanted.
+ if (self->fallbackCount == INT_MAX)
+ {
+ self->fallbackCount = -1;
+ return '\0';
+ }
+
+ // Now make sure it's in the expected range
+ assert(self->fallbackIndex < self->strDefaultLength && self->fallbackIndex >= 0);
+
+ return self->strDefault[self->fallbackIndex];
+}
+
+// Arms the buffer so that GetNextChar replays strDefault from its first char; returns false and changes nothing when strDefaultLength is 0.
+static bool DecoderReplacementFallbackBuffer_Fallback(DecoderBuffer* self)
+{
+ // We expect no previous fallback in our buffer
+ // We can't call recursively but others might (note, we don't test on last char!!!)
+ assert(self->fallbackCount < 1);
+
+ // Nothing to substitute when no replacement string is configured
+ if (self->strDefaultLength == 0)
+ return false;
+
+ self->fallbackCount = self->strDefaultLength; // GetNextChar decrements before use, so this yields strDefaultLength chars
+ self->fallbackIndex = -1; // GetNextChar increments before use, so the first char read is strDefault[0]
+
+ return true;
+}
+
+// Copies the replacement string for the current invalid byte sequence into the output at *chars.
+// self->byteStart must already be set (asserted).
+// Returns true on success, and also when the buffer has no replacement string (nothing is written then).
+// Returns false when the output is full: either the write position reached self->charEnd, or it moved
+// past pAllocatedBufferEnd, in which case errno is set to MINIPAL_ERROR_INSUFFICIENT_BUFFER.
+// NOTE(review): the pAllocatedBufferEnd test runs after the store, so it relies on the charEnd test
+// before it to keep the store in bounds - confirm charEnd never exceeds pAllocatedBufferEnd.
+// *chars is only advanced once the whole replacement has been copied.
+static bool DecoderReplacementFallbackBuffer_InternalFallback_Copy(DecoderBuffer* self, CHAR16_T** chars, CHAR16_T* pAllocatedBufferEnd)
+{
+ assert(self->byteStart != NULL);
+
+ bool fallbackResult = DecoderReplacementFallbackBuffer_Fallback(self);
+
+ // Only copy when the fallback buffer actually has a replacement string to offer.
+ if (fallbackResult)
+ {
+ // Copy the chars to our output, working on a local cursor so *chars stays untouched on failure
+ CHAR16_T ch;
+ CHAR16_T* charTemp = *chars;
+ bool bHighSurrogate = false;
+ (void)bHighSurrogate; // only read by the asserts below, which vanish in release builds
+ while ((ch = DecoderReplacementFallbackBuffer_GetNextChar(self)) != 0)
+ {
+ // Debug-check that surrogates in the replacement string come in high/low order
+ if (IsSurrogate(ch))
+ {
+ if (IsHighSurrogate(ch))
+ {
+ // High surrogate: must not directly follow another high surrogate
+ assert(!bHighSurrogate);
+ bHighSurrogate = true;
+ }
+ else
+ {
+ // Low surrogate: must directly follow a high surrogate
+ assert(bHighSurrogate);
+ bHighSurrogate = false;
+ }
+ }
+
+ if (charTemp >= self->charEnd)
+ {
+ // No buffer space
+ return false;
+ }
+
+ *(charTemp++) = ch;
+ if (charTemp > pAllocatedBufferEnd)
+ {
+ errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
+ return false;
+ }
+ }
+
+ // The replacement string must not end on an unpaired high surrogate
+ assert(!bHighSurrogate);
+
+ // Every char was copied, so it's OK to publish the new write position
+ *chars = charTemp;
+ }
+
+ return true;
+}
+
+// Returns the buffer to its idle state: no pending replacement chars (count and index both -1) and no input start recorded.
+static void DecoderReplacementFallbackBuffer_Reset(DecoderBuffer* self)
+{
+ self->fallbackCount = -1;
+ self->fallbackIndex = -1;
+ self->byteStart = NULL;
+}
+
+typedef struct // Replacement-fallback state used while encoding CHAR16_T input.
+{
+ const CHAR16_T strDefault[3]; // replacement characters handed out one at a time by InternalGetNextChar
+ int strDefaultLength; // length used to arm a fallback: halved for a lone char, whole for a surrogate pair
+ CHAR16_T* charStart; // input start as recorded by InternalInitialize; asserted non-NULL by InternalFallback
+ CHAR16_T* charEnd; // input end: InternalFallback does not peek at the next char once the cursor reaches it
+ bool setEncoder; // stored by InternalInitialize; presumably tells a converting call from a counting call - TODO confirm against callers
+ bool bUsedEncoder; // cleared by InternalInitialize
+ bool bFallingBack; // true while replacement chars are being served
+ int iRecursionCount; // nested-fallback depth, checked against MAX_RECURSION
+ int fallbackCount; // replacement chars still pending; negative once nothing is left
+ int fallbackIndex; // index into strDefault of the char most recently returned; -1 before the first
+} EncoderBuffer;
+
+#define MAX_RECURSION 250 // upper bound on iRecursionCount asserted by InternalFallback
+
+// Records the input bounds (charStart, charEnd) and the setEncoder flag, and clears bUsedEncoder, bFallingBack and iRecursionCount.
+// The replacement string and the fallbackCount/fallbackIndex cursor are not touched here.
+static void EncoderReplacementFallbackBuffer_InternalInitialize(EncoderBuffer* self, CHAR16_T* charStart, CHAR16_T* charEnd, bool setEncoder)
+{
+ self->charStart = charStart;
+ self->charEnd = charEnd;
+ self->setEncoder = setEncoder;
+ self->bUsedEncoder = false;
+ self->bFallingBack = false;
+ self->iRecursionCount = 0;
+}
+
+static CHAR16_T EncoderReplacementFallbackBuffer_InternalGetNextChar(EncoderBuffer* self)
+{
+ // Advances the fallback cursor and returns the next replacement char, or '\0' once none is left.
+ // fallbackCount is deliberately allowed to go negative: 0 means the char returned now is the last one, < 0 means nothing is left.
+ self->fallbackCount--;
+ self->fallbackIndex++;
+
+ // Do we have anything left? 0 is now last fallback char, negative is nothing left
+ if (self->fallbackCount < 0)
+ return '\0';
+
+ // Guard against the counter having wrapped around after many decrements.
+ // NOTE(review): reaching INT_MAX here would need INT_MIN to be decremented, which is undefined for signed int in C - confirm this check is still wanted.
+ if (self->fallbackCount == INT_MAX)
+ {
+ self->fallbackCount = -1;
+ return '\0';
+ }
+
+ // Now make sure it's in the expected range
+ assert(self->fallbackIndex < self->strDefaultLength && self->fallbackIndex >= 0);
+
+ CHAR16_T ch = self->strDefault[self->fallbackIndex];
+ self->bFallingBack = (ch != 0); // a zero entry in strDefault ends the fallback
+ if (ch == 0) self->iRecursionCount = 0; // and clears the nesting counter with it
+ return ch;
+}
+
+// Arms the buffer for a single (non-pair) char using half of strDefaultLength; returns whether any replacement char is pending.
+static bool EncoderReplacementFallbackBuffer_Fallback(EncoderBuffer* self)
+{
+ // A fallback must not already be in progress when a new one starts (checked in debug builds only);
+ // more than the last pending char would mean a recursive fallback.
+ assert(self->fallbackCount < 1);
+
+ // Go ahead and get our fallback
+ // Divide by 2 because we aren't a surrogate pair
+ self->fallbackCount = self->strDefaultLength / 2;
+ self->fallbackIndex = -1; // InternalGetNextChar increments before use, so the first char read is strDefault[0]
+
+ return self->fallbackCount != 0;
+}
+
+static bool EncoderReplacementFallbackBuffer_Fallback_Unknown(EncoderBuffer* self) // pair variant of Fallback: arms the full strDefaultLength
+{
+ // A fallback must not already be in progress when a new one starts (checked in debug builds only);
+ // more than the last pending char would mean a recursive fallback.
+ assert(self->fallbackCount < 1);
+
+ // Unlike the single-char variant the whole replacement string is used
+ self->fallbackCount = self->strDefaultLength;
+ self->fallbackIndex = -1; // InternalGetNextChar increments before use, so the first char read is strDefault[0]
+
+ return self->fallbackCount != 0;
+}
+
+// Counts one more nested fallback while a fallback is already being served, after
+// debug-checking that the depth seen so far does not exceed MAX_RECURSION.
+// Kept outside assert() so the counter is maintained the same way in debug and release builds.
+static void EncoderReplacementFallbackBuffer_CheckRecursion(EncoderBuffer* self)
+{
+ if (self->bFallingBack)
+ {
+ // Same comparison the assert used to make on the pre-increment value
+ assert(self->iRecursionCount <= MAX_RECURSION);
+ self->iRecursionCount++;
+ }
+}
+
+// Starts a replacement fallback for the char ch that could not be encoded.
+// self:  encoder fallback state; charStart must already be set (asserted).
+// ch:    the offending char, already consumed by the caller.
+// chars: caller's input cursor; advanced by one when ch is a high surrogate and the
+//        char under the cursor is its low surrogate, so the pair is replaced as a unit.
+// Returns true when replacement chars are now pending, false when there are none;
+// the same value is stored in self->bFallingBack.
+static bool EncoderReplacementFallbackBuffer_InternalFallback(EncoderBuffer* self, CHAR16_T ch, CHAR16_T** chars)
+{
+ // Shouldn't have null charStart
+ assert(self->charStart != NULL);
+
+ // See if it was a high surrogate
+ if (IsHighSurrogate(ch))
+ {
+ // See if there's a low surrogate to go with it
+ if (*chars >= self->charEnd)
+ {
+ // Nothing left in the input, so the high surrogate is handled alone below
+ }
+ else
+ {
+ // Might have a low surrogate
+ CHAR16_T cNext = **chars;
+ if (IsLowSurrogate(cNext))
+ {
+ // Track nesting if a fallback is already in progress
+ EncoderReplacementFallbackBuffer_CheckRecursion(self);
+
+ // Next is a low surrogate: consume it and fall back the pair as one unit
+ (*chars)++;
+ self->bFallingBack = EncoderReplacementFallbackBuffer_Fallback_Unknown(self);
+ return self->bFallingBack;
+ }
+
+ // Next isn't a low surrogate, just fallback the high surrogate
+ }
+ }
+
+ // Track nesting if a fallback is already in progress
+ EncoderReplacementFallbackBuffer_CheckRecursion(self);
+
+ // Fall back our char
+ self->bFallingBack = EncoderReplacementFallbackBuffer_Fallback(self);
+
+ return self->bFallingBack;
+}
+
+static bool EncoderReplacementFallbackBuffer_MovePrevious(EncoderBuffer* self) // steps the fallback cursor back by one char; returns false if it cannot
+{
+ // Only possible when at least one char has been handed out (index >= 0) and the count has not dropped below -1
+ if (self->fallbackCount >= -1 && self->fallbackIndex >= 0)
+ {
+ self->fallbackIndex--;
+ self->fallbackCount++;
+ return true;
+ }
+
+ // Nothing to step back over, so both counters are left unchanged
+ return false;
+}
+
+typedef struct // Per-conversion state shared by the UTF-8 routines in this file.
+{
+ union // the decoder and encoder fallback buffers share this storage
+ {
+ DecoderBuffer decoder;
+ EncoderBuffer encoder;
+ } buffer;
+
+ bool useFallback; // false: GetCharCount reports an invalid sequence via errno and returns 0; true: it counts the replacement string instead
+
+#if BIGENDIAN
+ bool treatAsLE; // big-endian builds only: selects masking instead of shifting when a byte is picked out of a wider load
+#endif
+} UTF8Encoding;
+
+// These are bitmasks used to maintain the state in the decoder. They occupy the higher bits
+// while the actual character is being built in the lower bits. They are shifted together
+// with the actual bits of the character (six bits for every trailing byte folded in).
+
+// bits 30 & 31 are used for pending bits fixup
+#define FinalByte (1 << 29) // set in the state once the last trailing byte of a sequence has been folded in
+#define SupplimentarySeq (1 << 28) // marks a 4-byte (supplementary, two-surrogate) sequence in progress
+#define ThreeByteSeq (1 << 27) // marks a 3-byte sequence in progress
+
+static bool InRange(int c, int begin, int end) // true when begin <= c <= end (both bounds inclusive)
+{
+ return begin <= c && c <= end;
+}
+
+// Writes the replacement for an invalid byte sequence to *pTarget through the decoder fallback buffer.
+// Requires self->useFallback (asserted). Returns false when the copy failed, true otherwise.
+// On failure *pSrc is set back to the value it had on entry; nothing in this function moves it in between.
+static bool FallbackInvalidByteSequence_Copy(UTF8Encoding* self, unsigned char** pSrc, CHAR16_T** pTarget, CHAR16_T* pAllocatedBufferEnd)
+{
+ assert(self->useFallback);
+
+ // Remember where the input cursor was when we were called
+ unsigned char* pStart = *pSrc;
+ bool fallbackResult = DecoderReplacementFallbackBuffer_InternalFallback_Copy(&self->buffer.decoder, pTarget, pAllocatedBufferEnd);
+
+ // The copy above did the actual fallback; handle the case where it failed
+ if (!fallbackResult)
+ {
+ // Oops, it failed, back up to pStart
+ *pSrc = pStart;
+ return false;
+ }
+
+ // It worked
+ return true;
+}
+
+static size_t GetCharCount(UTF8Encoding* self, unsigned char* bytes, size_t count)
+{
+ assert(bytes != NULL);
+ assert(count >= 0);
+
+ // Initialize stuff
+ unsigned char *pSrc = bytes;
+ unsigned char *pEnd = pSrc + count;
+ int availableBytes, chc;
+
+ // Start by assuming we have as many as count, charCount always includes the adjustment
+ // for the character being decoded
+ size_t charCount = count;
+ int ch = 0;
+ bool fallbackUsed = false;
+
+ while (true)
+ {
+ // SLOWLOOP: does all range checks, handles all special cases, but it is slow
+ if (pSrc >= pEnd) break;
+
+ // read next byte. The JIT optimization seems to be getting confused when
+ // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
+ int cha = *pSrc;
+
+ // no pending bits
+ if (ch == 0) goto ReadChar;
+
+ pSrc++;
+
+ // we are expecting to see trailing bytes like 10vvvvvv
+ if ((cha & 0xC0) != 0x80)
+ {
+ // This can be a valid starting byte for another UTF8 byte sequence, so let's put
+ // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
+ pSrc--;
+ charCount += (ch >> 30);
+ goto InvalidByteSequence;
+ }
+
+ // fold in the new byte
+ ch = (ch << 6) | (cha & 0x3F);
+
+ if ((ch & FinalByte) == 0)
+ {
+ assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0);
+
+ if ((ch & SupplimentarySeq) != 0)
+ {
+ if ((ch & (FinalByte >> 6)) != 0)
+ {
+ // this is 3rd byte (of 4 byte supplimentary) - nothing to do
+ continue;
+ }
+
+ // 2nd byte, check for non-shortest form of supplimentary char and the valid
+ // supplimentary characters in range 0x010000 - 0x10FFFF at the same time
+ if (!InRange(ch & 0x1F0, 0x10, 0x100))
+ {
+ goto InvalidByteSequence;
+ }
+ }
+ else
+ {
+ // Must be 2nd byte of a 3-byte sequence
+ // check for non-shortest form of 3 byte seq
+ if ((ch & (0x1F << 5)) == 0 || // non-shortest form
+ (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
+ {
+ goto InvalidByteSequence;
+ }
+ }
+ continue;
+ }
+
+ // ready to punch
+
+ // adjust for surrogates in non-shortest form
+ if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq) charCount--;
+
+ goto EncodeChar;
+
+ InvalidByteSequence:
+ if (!self->useFallback)
+ {
+ errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION;
+ return 0;
+ }
+
+ if (!fallbackUsed)
+ {
+ fallbackUsed = true;
+ self->buffer.decoder.byteStart = bytes;
+ self->buffer.decoder.charEnd = NULL;
+ }
+ charCount += self->buffer.decoder.strDefaultLength;
+
+ ch = 0;
+ continue;
+
+ ReadChar:
+ ch = *pSrc;
+ pSrc++;
+
+ ProcessChar:
+ if (ch > 0x7F)
+ {
+ // If its > 0x7F, its start of a new multi-byte sequence
+
+ // Long sequence, so unreserve our char.
+ charCount--;
+
+ // bit 6 has to be non-zero for start of multibyte chars.
+ if ((ch & 0x40) == 0) goto InvalidByteSequence;
+
+ // start a new long code
+ if ((ch & 0x20) != 0)
+ {
+ if ((ch & 0x10) != 0)
+ {
+ // 4 byte encoding - supplimentary character (2 surrogates)
+
+ ch &= 0x0F;
+
+ // check that bit 4 is zero and the valid supplimentary character
+ // range 0x000000 - 0x10FFFF at the same time
+ if (ch > 0x04)
+ {
+ ch |= 0xf0;
+ goto InvalidByteSequence;
+ }
+
+ // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
+ // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag.
+ ch |= (FinalByte >> 3 * 6) | // Final byte is 3 more bytes from now
+ (1 << 30) | // If it dies on next byte we'll need an extra char
+ (3 << (30 - 2 * 6)) | // If it dies on last byte we'll need to subtract a char
+ (SupplimentarySeq) | (SupplimentarySeq >> 6) |
+ (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
+
+ // Our character count will be 2 characters for these 4 bytes, so subtract another char
+ charCount--;
+ }
+ else
+ {
+ // 3 byte encoding
+ // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
+ ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
+ (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
+
+ // We'll expect 1 character for these 3 bytes, so subtract another char.
+ charCount--;
+ }
+ }
+ else
+ {
+ // 2 byte encoding
+
+ ch &= 0x1F;
+
+ // check for non-shortest form
+ if (ch <= 1)
+ {
+ ch |= 0xc0;
+ goto InvalidByteSequence;
+ }
+
+ // Add bit flags so we'll be flagged correctly
+ ch |= (FinalByte >> 6);
+ }
+ continue;
+ }
+
+ EncodeChar:
+
+ availableBytes = pEnd - pSrc;
+
+ // don't fall into the fast decoding loop if we don't have enough bytes
+ if (availableBytes <= 13)
+ {
+ // try to get over the remainder of the ascii characters fast though
+ unsigned char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
+ while (pSrc < pLocalEnd)
+ {
+ ch = *pSrc;
+ pSrc++;
+
+ if (ch > 0x7F)
+ goto ProcessChar;
+ }
+ // we are done
+ ch = 0;
+ break;
+ }
+
+ // To compute the upper bound, assume that all characters are ASCII characters at this point,
+ // the boundary will be decreased for every non-ASCII character we encounter
+ // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
+ unsigned char *pStop = pSrc + availableBytes - 7;
+
+ while (pSrc < pStop)
+ {
+ ch = *pSrc;
+ pSrc++;
+
+ if (ch > 0x7F)
+ {
+ goto LongCode;
+ }
+
+ // get pSrc 2-byte aligned
+ if (((size_t)pSrc & 0x1) != 0)
+ {
+ ch = *pSrc;
+ pSrc++;
+ if (ch > 0x7F)
+ {
+ goto LongCode;
+ }
+ }
+
+ // get pSrc 4-byte aligned
+ if (((size_t)pSrc & 0x2) != 0)
+ {
+ ch = *(unsigned short*)pSrc;
+ if ((ch & 0x8080) != 0)
+ {
+ goto LongCodeWithMask16;
+ }
+ pSrc += 2;
+ }
+
+
+ // Run 8 + 8 characters at a time!
+ while (pSrc < pStop)
+ {
+ ch = *(int*)pSrc;
+ int chb = *(int*)(pSrc + 4);
+ if (((ch | chb) & (int)0x80808080) != 0)
+ {
+ goto LongCodeWithMask32;
+ }
+ pSrc += 8;
+
+ // This is a really small loop - unroll it
+ if (pSrc >= pStop)
+ break;
+
+ ch = *(int*)pSrc;
+ chb = *(int*)(pSrc + 4);
+ if (((ch | chb) & (int)0x80808080) != 0)
+ {
+ goto LongCodeWithMask32;
+ }
+ pSrc += 8;
+ }
+ break;
+
+ LongCodeWithMask32 :
+#if BIGENDIAN
+ // be careful about the sign extension
+ if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16);
+ else
+#endif
+ ch &= 0xFF;
+
+ LongCodeWithMask16:
+#if BIGENDIAN
+ if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 8);
+ else
+#endif
+ ch &= 0xFF;
+
+ pSrc++;
+ if (ch <= 0x7F)
+ {
+ continue;
+ }
+
+ LongCode:
+ chc = *pSrc;
+ pSrc++;
+
+ if (
+ // bit 6 has to be non-zero (a lead byte looks like 11xxxxxx); a clear bit 6 is rejected below
+ (ch & 0x40) == 0 ||
+ // we are expecting to see trailing bytes like 10vvvvvv
+ (chc & 0xC0) != 0x80)
+ {
+ goto BadLongCode;
+ }
+
+ chc &= 0x3F;
+
+ // start a new long code
+ if ((ch & 0x20) != 0)
+ {
+ // fold the first two bytes together
+ chc |= (ch & 0x0F) << 6;
+
+ if ((ch & 0x10) != 0)
+ {
+ // 4 byte encoding - surrogate
+ ch = *pSrc;
+ if (
+ // check that bit 4 is zero, the non-shortest form of surrogate
+ // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
+ !InRange(chc >> 4, 0x01, 0x10) ||
+ // we are expecting to see trailing bytes like 10vvvvvv
+ (ch & 0xC0) != 0x80)
+ {
+ goto BadLongCode;
+ }
+
+ chc = (chc << 6) | (ch & 0x3F);
+
+ ch = *(pSrc + 1);
+ // we are expecting to see trailing bytes like 10vvvvvv
+ if ((ch & 0xC0) != 0x80)
+ {
+ goto BadLongCode;
+ }
+ pSrc += 2;
+
+ // extra byte
+ charCount--;
+ }
+ else
+ {
+ // 3 byte encoding
+ ch = *pSrc;
+ if (
+ // check for non-shortest form of 3 byte seq
+ (chc & (0x1F << 5)) == 0 ||
+ // Can't have surrogates here.
+ (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
+ // we are expecting to see trailing bytes like 10vvvvvv
+ (ch & 0xC0) != 0x80)
+ {
+ goto BadLongCode;
+ }
+ pSrc++;
+
+ // extra byte
+ charCount--;
+ }
+ }
+ else
+ {
+ // 2 byte encoding
+
+ // check for non-shortest form
+ if ((ch & 0x1E) == 0) goto BadLongCode;
+ }
+
+ // extra byte
+ charCount--;
+ }
+
+ // no pending bits at this point
+ ch = 0;
+ continue;
+
+ BadLongCode:
+ pSrc -= 2;
+ ch = 0;
+ continue;
+ }
+
+ // May have a problem if we have to flush
+ if (ch != 0)
+ {
+ // We were already adjusting for these, so need to unadjust
+ charCount += (ch >> 30);
+ charCount += self->buffer.decoder.strDefaultLength;
+ }
+
+ // Shouldn't have anything in fallback buffer for GetCharCount
+ // (don't have to check m_throwOnOverflow for count)
+ assert(!fallbackUsed || !self->useFallback || self->buffer.decoder.fallbackCount < 0);
+
+ return charCount;
+}
+
+#define ENSURE_BUFFER_INC /* advance the local pTarget by one element; leave the enclosing function if it moved past pAllocatedBufferEnd */ \
+ pTarget++; \
+ if (pTarget > pAllocatedBufferEnd) /* strictly greater: landing exactly on the end pointer is still accepted */ \
+ { \
+ errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; \
+ return 0; /* returns from the function that expanded the macro, with errno carrying the reason */ \
+ } /* expands to bare statements (no do/while wrapper); the call sites in this file omit the trailing ';' */
+
+static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, CHAR16_T* chars, size_t charCount)
+{ // Decodes the UTF-8 input bytes[0..byteCount) into UTF-16 units in chars[0..charCount). Returns the number of units written; on failure returns 0 with errno set (MINIPAL_ERROR_INSUFFICIENT_BUFFER or MINIPAL_ERROR_NO_UNICODE_TRANSLATION).
+ assert(chars != NULL);
+ assert(byteCount >= 0); // NOTE(review): byteCount is size_t, so this can never fail
+ assert(charCount >= 0); // NOTE(review): charCount is size_t, so this can never fail
+ assert(bytes != NULL);
+
+ unsigned char *pSrc = bytes; // read cursor
+ CHAR16_T *pTarget = chars; // write cursor
+
+ unsigned char *pEnd = pSrc + byteCount; // one past the last input byte
+ CHAR16_T *pAllocatedBufferEnd = pTarget + charCount; // one past the last output slot
+
+ int ch = 0; // partially decoded sequence plus state flags; 0 means nothing is pending
+ int chc; // scratch accumulator for the fast loop's LongCode path
+
+ bool fallbackUsed = false; // becomes true once the decoder fallback state has been set up for this call
+
+ while (true)
+ {
+ // SLOWLOOP: does all range checks, handles all special cases, but it is slow
+
+ if (pSrc >= pEnd) break;
+
+ // Peek at the next byte without consuming it. It is only consumed below when a
+ // sequence is already pending; otherwise ReadChar re-reads it and advances pSrc.
+ int cha = *pSrc;
+
+ if (ch == 0)
+ {
+ // no pending bits
+ goto ReadChar;
+ }
+
+ pSrc++;
+
+ // we are expecting to see trailing bytes like 10vvvvvv
+ if ((cha & 0xC0) != 0x80)
+ {
+ // This can be a valid starting byte for another UTF8 byte sequence, so let's put
+ // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
+ pSrc--;
+ goto InvalidByteSequence;
+ }
+
+ // fold in the new byte
+ ch = (ch << 6) | (cha & 0x3F);
+
+ if ((ch & FinalByte) == 0)
+ {
+ // Not at last byte yet
+ assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0);
+
+ if ((ch & SupplimentarySeq) != 0)
+ {
+ // It's a 4-byte supplementary sequence
+ if ((ch & (FinalByte >> 6)) != 0)
+ {
+ // this is 3rd byte of 4 byte sequence - nothing to do
+ continue;
+ }
+
+ // 2nd byte of 4 bytes
+ // check for non-shortest form of surrogate and the valid surrogate
+ // range 0x000000 - 0x10FFFF at the same time
+ if (!InRange(ch & 0x1F0, 0x10, 0x100))
+ {
+ goto InvalidByteSequence;
+ }
+ }
+ else
+ {
+ // Must be 2nd byte of a 3-byte sequence
+ // check for non-shortest form of 3 byte seq
+ if ((ch & (0x1F << 5)) == 0 || // non-shortest form
+ (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
+ {
+ goto InvalidByteSequence;
+ }
+ }
+ continue;
+ }
+
+ // every byte of the sequence has been folded in; emit the code point
+
+ // surrogate in shortest form?
+ // Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte?
+ if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq)
+ {
+ // if there is no room for the high surrogate, skip this and let EncodeChar's bounds check deal with it
+ if (pTarget < pAllocatedBufferEnd)
+ {
+ *pTarget = (CHAR16_T)(((ch >> 10) & 0x7FF) +
+ (HIGH_SURROGATE_START - (0x10000 >> 10)));
+
+ ENSURE_BUFFER_INC
+
+ ch = (ch & 0x3FF) +
+ (int)(LOW_SURROGATE_START); // ch now holds the low surrogate, written by EncodeChar
+ }
+ }
+
+ goto EncodeChar;
+
+ InvalidByteSequence:
+ if (!self->useFallback) // without a fallback, malformed input is a hard error
+ {
+ errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION;
+ return 0;
+ }
+
+ // this code fragment should be close to the gotos referencing it
+ // Have to do fallback for invalid bytes
+ if (!fallbackUsed)
+ {
+ fallbackUsed = true;
+ self->buffer.decoder.byteStart = bytes;
+ self->buffer.decoder.charEnd = pAllocatedBufferEnd;
+ }
+
+ // That'll back us up the appropriate # of bytes if we didn't get anywhere
+ if (!FallbackInvalidByteSequence_Copy(self, &pSrc, &pTarget, pAllocatedBufferEnd))
+ {
+ if (errno == MINIPAL_ERROR_INSUFFICIENT_BUFFER) return 0;
+
+ // Check if we ran out of buffer space
+ assert(pSrc >= bytes);
+
+ DecoderReplacementFallbackBuffer_Reset(&self->buffer.decoder);
+ ch = 0;
+ break;
+ }
+
+ assert(pSrc >= bytes);
+
+ ch = 0;
+ continue;
+
+ ReadChar:
+ ch = *pSrc;
+ pSrc++;
+
+ ProcessChar:
+ if (ch > 0x7F)
+ {
+ // If it's > 0x7F, it's the start of a new multi-byte sequence
+
+ // bit 6 has to be non-zero
+ if ((ch & 0x40) == 0) goto InvalidByteSequence;
+
+ // start a new long code
+ if ((ch & 0x20) != 0)
+ {
+ if ((ch & 0x10) != 0)
+ {
+ // 4 byte encoding - supplementary character (2 surrogates)
+
+ ch &= 0x0F;
+
+ // check that bit 4 is zero and the valid supplementary character
+ // range 0x000000 - 0x10FFFF at the same time
+ if (ch > 0x04)
+ {
+ ch |= 0xf0;
+ goto InvalidByteSequence;
+ }
+
+ ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) |
+ (SupplimentarySeq) | (SupplimentarySeq >> 6) |
+ (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6); // flags are pre-shifted so the three later "ch << 6" folds move them onto the masks tested above
+ }
+ else
+ {
+ // 3 byte encoding
+ ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
+ (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
+ }
+ }
+ else
+ {
+ // 2 byte encoding
+
+ ch &= 0x1F;
+
+ // check for non-shortest form
+ if (ch <= 1)
+ {
+ ch |= 0xc0;
+ goto InvalidByteSequence;
+ }
+
+ ch |= (FinalByte >> 6); // one continuation byte to go
+ }
+ continue;
+ }
+
+ EncodeChar:
+ // write the pending character
+ if (pTarget >= pAllocatedBufferEnd)
+ {
+ // Output is full: strip the state flags, then rewind pSrc to the first byte of the sequence that did not fit
+ ch &= 0x1fffff;
+ if (ch > 0x7f)
+ {
+ if (ch > 0x7ff)
+ {
+ if (ch >= LOW_SURROGATE_START &&
+ ch <= LOW_SURROGATE_END)
+ {
+ pSrc--; // It was 4 bytes
+ pTarget--; // 1 was stored already, but we can't remember 1/2, so back up
+ }
+ else if (ch > 0xffff)
+ {
+ pSrc--; // It was 4 bytes, nothing was stored
+ }
+ pSrc--; // It was at least 3 bytes
+ }
+ pSrc--; // It was at least 2 bytes
+ }
+ pSrc--;
+
+ assert(pSrc >= bytes);
+
+ // Don't store ch in decoder, we already backed up to its start
+ ch = 0;
+
+ // Stop decoding; the unconsumed input is reported after the loop (pSrc < pEnd).
+ break;
+ }
+ *pTarget = (CHAR16_T)ch;
+ ENSURE_BUFFER_INC
+
+ int availableChars = pAllocatedBufferEnd - pTarget; // NOTE(review): pointer difference narrowed to int
+ int availableBytes = pEnd - pSrc; // NOTE(review): pointer difference narrowed to int
+
+ // don't fall into the fast decoding loop if we don't have enough bytes
+ // Test for availableChars is done because pStop would be <= pTarget.
+ if (availableBytes <= 13)
+ {
+ // we may need as many as 1 character per byte
+ if (availableChars < availableBytes)
+ {
+ // not enough output room. no pending bits at this point
+ ch = 0;
+ continue;
+ }
+
+ // try to get over the remainder of the ascii characters fast though
+ unsigned char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
+ while (pSrc < pLocalEnd)
+ {
+ ch = *pSrc;
+ pSrc++;
+
+ if (ch > 0x7F) goto ProcessChar;
+
+ *pTarget = (CHAR16_T)ch;
+ ENSURE_BUFFER_INC
+ }
+ // we are done
+ ch = 0;
+ break;
+ }
+
+ // we may need as many as 1 character per byte, so reduce the byte count if necessary.
+ // If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
+ if (availableChars < availableBytes) availableBytes = availableChars;
+
+ // To compute the upper bound, assume that all characters are ASCII characters at this point,
+ // the boundary will be decreased for every non-ASCII character we encounter
+ // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
+ CHAR16_T *pStop = pTarget + availableBytes - 7;
+
+ while (pTarget < pStop)
+ {
+ ch = *pSrc;
+ pSrc++;
+
+ if (ch > 0x7F) goto LongCode;
+
+ *pTarget = (CHAR16_T)ch;
+ ENSURE_BUFFER_INC
+
+ // get pSrc to be 2-byte aligned
+ if ((((size_t)pSrc) & 0x1) != 0)
+ {
+ ch = *pSrc;
+ pSrc++;
+ if (ch > 0x7F) goto LongCode;
+
+ *pTarget = (CHAR16_T)ch;
+ ENSURE_BUFFER_INC
+ }
+
+ // get pSrc to be 4-byte aligned
+ if ((((size_t)pSrc) & 0x2) != 0)
+ {
+ ch = *(unsigned short*)pSrc; // two input bytes loaded at once
+ if ((ch & 0x8080) != 0) goto LongCodeWithMask16;
+
+
+ if (pTarget + 2 > pAllocatedBufferEnd)
+ {
+ errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
+ return 0;
+ }
+
+ // Unfortunately, this is endianness sensitive
+#if BIGENDIAN
+ if (!self->treatAsLE)
+ {
+ *pTarget = (CHAR16_T)((ch >> 8) & 0x7F);
+ pSrc += 2;
+ *(pTarget + 1) = (CHAR16_T)(ch & 0x7F);
+ pTarget += 2;
+ }
+ else
+#endif
+ {
+ *pTarget = (CHAR16_T)(ch & 0x7F);
+ pSrc += 2;
+ *(pTarget + 1) = (CHAR16_T)((ch >> 8) & 0x7F);
+ pTarget += 2;
+ }
+ }
+
+ // Run 8 characters at a time!
+ while (pTarget < pStop)
+ {
+ ch = *(int*)pSrc;
+ int chb = *(int*)(pSrc + 4);
+ if (((ch | chb) & (int)0x80808080) != 0) goto LongCodeWithMask32;
+
+ if (pTarget + 8 > pAllocatedBufferEnd)
+ {
+ errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
+ return 0;
+ }
+
+ // Unfortunately, this is endianness sensitive
+#if BIGENDIAN
+ if (!self->treatAsLE)
+ {
+ *pTarget = (CHAR16_T)((ch >> 24) & 0x7F);
+ *(pTarget + 1) = (CHAR16_T)((ch >> 16) & 0x7F);
+ *(pTarget + 2) = (CHAR16_T)((ch >> 8) & 0x7F);
+ *(pTarget + 3) = (CHAR16_T)(ch & 0x7F);
+ pSrc += 8;
+ *(pTarget + 4) = (CHAR16_T)((chb >> 24) & 0x7F);
+ *(pTarget + 5) = (CHAR16_T)((chb >> 16) & 0x7F);
+ *(pTarget + 6) = (CHAR16_T)((chb >> 8) & 0x7F);
+ *(pTarget + 7) = (CHAR16_T)(chb & 0x7F);
+ pTarget += 8;
+ }
+ else
+#endif
+ {
+ *pTarget = (CHAR16_T)(ch & 0x7F);
+ *(pTarget + 1) = (CHAR16_T)((ch >> 8) & 0x7F);
+ *(pTarget + 2) = (CHAR16_T)((ch >> 16) & 0x7F);
+ *(pTarget + 3) = (CHAR16_T)((ch >> 24) & 0x7F);
+ pSrc += 8;
+ *(pTarget + 4) = (CHAR16_T)(chb & 0x7F);
+ *(pTarget + 5) = (CHAR16_T)((chb >> 8) & 0x7F);
+ *(pTarget + 6) = (CHAR16_T)((chb >> 16) & 0x7F);
+ *(pTarget + 7) = (CHAR16_T)((chb >> 24) & 0x7F);
+ pTarget += 8;
+ }
+ }
+ break; // leaves the enclosing fast loop as well; control resumes in the slow loop
+
+ LongCodeWithMask32 :
+#if BIGENDIAN
+ // be careful about the sign extension
+ if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16);
+ else
+#endif
+ ch &= 0xFF;
+
+ LongCodeWithMask16:
+#if BIGENDIAN
+ if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 8);
+ else
+#endif
+ ch &= 0xFF;
+
+ pSrc++; // consume only the single byte that was just isolated into ch
+ if (ch <= 0x7F)
+ {
+ *pTarget = (CHAR16_T)ch;
+ ENSURE_BUFFER_INC
+ continue;
+ }
+
+ LongCode:
+ chc = *pSrc;
+ pSrc++;
+
+ if (
+ // bit 6 has to be non-zero (lead byte 11xxxxxx); a clear bit 6 is rejected
+ (ch & 0x40) == 0 ||
+ // we are expecting to see trailing bytes like 10vvvvvv
+ (chc & 0xC0) != 0x80)
+ {
+ goto BadLongCode;
+ }
+
+ chc &= 0x3F;
+
+ // start a new long code
+ if ((ch & 0x20) != 0)
+ {
+
+ // fold the first two bytes together
+ chc |= (ch & 0x0F) << 6;
+
+ if ((ch & 0x10) != 0)
+ {
+ // 4 byte encoding - surrogate
+ ch = *pSrc;
+ if (
+ // check that bit 4 is zero, the non-shortest form of surrogate
+ // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
+ !InRange(chc >> 4, 0x01, 0x10) ||
+ // we are expecting to see trailing bytes like 10vvvvvv
+ (ch & 0xC0) != 0x80)
+ {
+ goto BadLongCode;
+ }
+
+ chc = (chc << 6) | (ch & 0x3F);
+
+ ch = *(pSrc + 1);
+ // we are expecting to see trailing bytes like 10vvvvvv
+ if ((ch & 0xC0) != 0x80) goto BadLongCode;
+
+ pSrc += 2;
+
+ ch = (chc << 6) | (ch & 0x3F);
+
+ *pTarget = (CHAR16_T)(((ch >> 10) & 0x7FF) +
+ (HIGH_SURROGATE_START - (0x10000 >> 10)));
+ ENSURE_BUFFER_INC
+
+ ch = (ch & 0x3FF) + (LOW_SURROGATE_START);
+
+ // extra byte, we're already planning 2 chars for 2 of these bytes,
+ // but the big loop is testing the target against pStop, so we need
+ // to subtract 2 more or we risk overrunning the input. Subtract
+ // one here and one below.
+ pStop--;
+ }
+ else
+ {
+ // 3 byte encoding
+ ch = *pSrc;
+ if (
+ // check for non-shortest form of 3 byte seq
+ (chc & (0x1F << 5)) == 0 ||
+ // Can't have surrogates here.
+ (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
+ // we are expecting to see trailing bytes like 10vvvvvv
+ (ch & 0xC0) != 0x80)
+ {
+ goto BadLongCode;
+ }
+ pSrc++;
+
+ ch = (chc << 6) | (ch & 0x3F);
+
+ // extra byte, we're only expecting 1 char for each of these 3 bytes,
+ // but the loop is testing the target (not source) against pStop, so
+ // we need to subtract 2 more or we risk overrunning the input.
+ // Subtract 1 here and one more below
+ pStop--;
+ }
+ }
+ else
+ {
+ // 2 byte encoding
+
+ ch &= 0x1F;
+
+ // check for non-shortest form
+ if (ch <= 1) goto BadLongCode;
+
+ ch = (ch << 6) | chc;
+ }
+
+ *pTarget = (CHAR16_T)ch;
+ ENSURE_BUFFER_INC
+
+ // extra byte, we're only expecting 1 char for each of these 2 bytes,
+ // but the loop is testing the target (not source) against pStop.
+ // subtract an extra count from pStop so that we don't overrun the input.
+ pStop--;
+ }
+
+ assert(pTarget <= pAllocatedBufferEnd);
+
+ // no pending bits at this point
+ ch = 0;
+ continue;
+
+ BadLongCode:
+ pSrc -= 2; // un-read the lead byte and the byte after it so the slow loop re-examines the sequence from its start
+ ch = 0;
+ continue;
+ }
+
+ if (ch != 0)
+ {
+ // A sequence was still pending when the input ran out (every other exit from the loop clears ch).
+ if (!self->useFallback)
+ {
+ assert(pSrc >= bytes || pTarget == chars);
+
+ // Nothing at all was produced: report it as an insufficient-buffer failure.
+ // (Inherited open question from the original comment: "Need to throw an exception?")
+ if (pTarget == chars)
+ {
+ errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
+ return 0;
+ }
+ }
+ assert(pSrc >= bytes);
+ ch = 0; // NOTE(review): the truncated trailing sequence is discarded without output, while the counting code earlier in this file adds strDefaultLength for the same situation - confirm this is intended
+ }
+
+ // Shouldn't have anything in fallback buffer for GetChars
+ // (don't have to check m_throwOnOverflow for chars)
+ assert(!fallbackUsed || self->buffer.decoder.fallbackCount < 0);
+
+ if (pSrc < pEnd) // input left unconsumed: an earlier break stopped because the output was full
+ {
+ errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
+ return 0;
+ }
+
+ return pTarget - chars; // NOTE(review): pointer difference narrowed to the int return type; 0 is also the success result for empty output
+}
+
+static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, unsigned char* bytes, size_t byteCount)
+{ // Encodes the UTF-16 input chars[0..charCount) as UTF-8 into bytes[0..byteCount). Returns the number of bytes written; when the output does not fit, returns 0 with errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER.
+ assert(chars != NULL);
+ assert(byteCount >= 0); // NOTE(review): byteCount is size_t, so this can never fail
+ assert(charCount >= 0); // NOTE(review): charCount is size_t, so this can never fail
+ assert(bytes != NULL);
+
+ // For fallback we may need a fallback buffer.
+ // We wait to initialize it though in case we don't have any broken input unicode
+ bool fallbackUsed = false;
+ CHAR16_T *pSrc = chars; // read cursor
+ unsigned char *pTarget = bytes; // write cursor
+
+ CHAR16_T *pEnd = pSrc + charCount; // one past the last input char
+ unsigned char *pAllocatedBufferEnd = pTarget + byteCount; // one past the last output byte
+
+ int ch = 0; // char being processed; a high surrogate is carried here across iterations, 0 when nothing is pending
+ int chd; // scratch byte / second-surrogate holder for the fast loop's LongCode path
+
+ // assume that JIT will enregister pSrc, pTarget and ch
+
+ while (true)
+ {
+ // SLOWLOOP: does all range checks, handles all special cases, but it is slow
+
+ if (pSrc >= pEnd)
+ {
+ if (ch == 0)
+ {
+ // Check if there's anything left to get out of the fallback buffer
+ ch = fallbackUsed ? EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder) : 0;
+ if (ch > 0) goto ProcessChar;
+ }
+ else
+ {
+ // Case of leftover surrogates in the fallback buffer
+ if (fallbackUsed && self->buffer.encoder.bFallingBack)
+ {
+ assert(ch >= 0xD800 && ch <= 0xDBFF);
+
+ int cha = ch; // remember the pending high surrogate
+
+ ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder);
+
+ if (InRange(ch, LOW_SURROGATE_START, LOW_SURROGATE_END))
+ {
+ ch = ch + (cha << 10) + (0x10000 - LOW_SURROGATE_START - (HIGH_SURROGATE_START << 10)); // combine the pair into one code point
+ goto EncodeChar;
+ }
+ else if (ch > 0)
+ {
+ goto ProcessChar;
+ }
+
+ break;
+ }
+ }
+
+ // attempt to encode the partial surrogate (will fail or ignore)
+ if (ch > 0) goto EncodeChar;
+
+ // We're done
+ break;
+ }
+
+ if (ch > 0)
+ {
+ // We have a high surrogate left over from a previous loop.
+ assert(ch >= 0xD800 && ch <= 0xDBFF);
+
+ // use separate helper variables for local contexts so that the jit optimizations
+ // won't get confused about the variable lifetimes
+ int cha = *pSrc; // peek; only consumed below if it completes the pair
+
+ // The previous char was a high surrogate, so we are expecting a low surrogate here.
+ if (InRange(cha, LOW_SURROGATE_START, LOW_SURROGATE_END))
+ {
+ ch = cha + (ch << 10) +
+ (0x10000
+ - LOW_SURROGATE_START
+ - (HIGH_SURROGATE_START << 10));
+
+ pSrc++;
+ }
+ // else ch is still high surrogate and encoding will fail
+
+ // attempt to encode the surrogate or partial surrogate
+ goto EncodeChar;
+ }
+
+ // If we've used a fallback, then we have to check for it
+ if (fallbackUsed)
+ {
+ ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder);
+ if (ch > 0) goto ProcessChar;
+ }
+
+ // read next char. The JIT optimization seems to be getting confused when
+ // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
+ ch = *pSrc;
+ pSrc++;
+
+ ProcessChar:
+ if (InRange(ch, HIGH_SURROGATE_START, HIGH_SURROGATE_END)) continue; // keep the high surrogate in ch and pair it up on the next iteration
+
+ // either good char or partial surrogate
+
+ EncodeChar:
+ // anything still in the surrogate range here is unpaired and goes to the fallback
+ if (InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END))
+ {
+ // Lone surrogates aren't allowed, we have to do fallback for them
+ // Have to make a fallback buffer if we don't have one
+ if (!fallbackUsed)
+ {
+ // wait on fallbacks if we can
+ // For fallback we may need a fallback buffer
+ fallbackUsed = true;
+
+ // Set our internal fallback interesting things.
+ EncoderReplacementFallbackBuffer_InternalInitialize(&self->buffer.encoder, chars, pEnd, true);
+ }
+
+ // Do our fallback. Actually we already know its a mixed up surrogate,
+ // so the ref pSrc isn't gonna do anything.
+ EncoderReplacementFallbackBuffer_InternalFallback(&self->buffer.encoder, (CHAR16_T)ch, &pSrc);
+
+ // The lone surrogate itself produces no output; later iterations pull chars from the fallback buffer
+ ch = 0;
+ continue;
+ }
+
+ // Count bytes needed
+ int bytesNeeded = 1;
+ if (ch > 0x7F)
+ {
+ if (ch > 0x7FF)
+ {
+ if (ch > 0xFFFF)
+ {
+ bytesNeeded++; // 4 bytes (surrogate pair)
+ }
+ bytesNeeded++; // 3 bytes (800-FFFF)
+ }
+ bytesNeeded++; // 2 bytes (80-7FF)
+ }
+
+ if (pTarget > pAllocatedBufferEnd - bytesNeeded)
+ {
+ // Not enough room for this char: undo its consumption, from the fallback buffer or from pSrc
+ if (fallbackUsed && self->buffer.encoder.bFallingBack)
+ {
+ EncoderReplacementFallbackBuffer_MovePrevious(&self->buffer.encoder); // Didn't use this fallback char
+ if (ch > 0xFFFF)
+ EncoderReplacementFallbackBuffer_MovePrevious(&self->buffer.encoder); // Was surrogate, didn't use 2nd part either
+ }
+ else
+ {
+ pSrc--; // Didn't use this char
+ if (ch > 0xFFFF)
+ pSrc--; // Was surrogate, didn't use 2nd part either
+ }
+
+ assert(pSrc >= chars || pTarget == bytes);
+
+ if (pTarget == bytes) // nothing was written at all: fail immediately
+ {
+ errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
+ return 0;
+ }
+ ch = 0; // Nothing left over (we backed up to start of pair if supplementary)
+ break;
+ }
+
+ if (ch <= 0x7F)
+ {
+ *pTarget = (unsigned char)ch;
+ }
+ else
+ {
+ // use separate helper variables for local contexts so that the jit optimizations
+ // won't get confused about the variable lifetimes
+ int chb;
+ if (ch <= 0x7FF)
+ {
+ // 2 unsigned char encoding
+ chb = (unsigned char)(0xC0 | (ch >> 6));
+ }
+ else
+ {
+ if (ch <= 0xFFFF)
+ {
+ chb = (unsigned char)(0xE0 | (ch >> 12));
+ }
+ else
+ {
+ *pTarget = (unsigned char)(0xF0 | (ch >> 18));
+ ENSURE_BUFFER_INC
+
+ chb = 0x80 | ((ch >> 12) & 0x3F);
+ }
+ *pTarget = (unsigned char)chb;
+ ENSURE_BUFFER_INC
+
+ chb = 0x80 | ((ch >> 6) & 0x3F);
+ }
+ *pTarget = (unsigned char)chb;
+ ENSURE_BUFFER_INC
+
+ *pTarget = (unsigned char)0x80 | (ch & 0x3F); // the cast binds to 0x80 only; the OR result is still within one byte when stored
+ }
+
+ ENSURE_BUFFER_INC
+
+ // If still have fallback don't do fast loop
+ if (fallbackUsed && (ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder)) != 0)
+ goto ProcessChar;
+
+ int availableChars = pEnd - pSrc; // NOTE(review): pointer difference narrowed to int
+ int availableBytes = pAllocatedBufferEnd - pTarget; // NOTE(review): pointer difference narrowed to int
+
+ // don't fall into the fast decoding loop if we don't have enough characters
+ // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
+ if (availableChars <= 13)
+ {
+ // we are hoping for 1 unsigned char per char
+ if (availableBytes < availableChars)
+ {
+ // not enough output room. no pending bits at this point
+ ch = 0;
+ continue;
+ }
+
+ // try to get over the remainder of the ascii characters fast though
+ CHAR16_T* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
+ while (pSrc < pLocalEnd)
+ {
+ ch = *pSrc;
+ pSrc++;
+
+ // Not ASCII, need more than 1 unsigned char per char
+ if (ch > 0x7F) goto ProcessChar;
+
+ *pTarget = (unsigned char)ch;
+ ENSURE_BUFFER_INC
+ }
+ // we are done, let ch be 0 to clear encoder
+ ch = 0;
+ break;
+ }
+
+ // we need at least 1 unsigned char per character, but Convert might allow us to convert
+ // only part of the input, so try as much as we can. Reduce charCount if necessary
+ if (availableBytes < availableChars)
+ {
+ availableChars = availableBytes;
+ }
+
+ // FASTLOOP:
+ // - optimistic range checks
+ // - fallbacks to the slow loop for all special cases, exception throwing, etc.
+
+ // To compute the upper bound, assume that all characters are ASCII characters at this point,
+ // the boundary will be decreased for every non-ASCII character we encounter
+ // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
+ // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
+ CHAR16_T *pStop = pSrc + availableChars - 5;
+
+ while (pSrc < pStop)
+ {
+ ch = *pSrc;
+ pSrc++;
+
+ if (ch > 0x7F) goto LongCode;
+
+ *pTarget = (unsigned char)ch;
+ ENSURE_BUFFER_INC
+
+ // get pSrc aligned
+ if (((size_t)pSrc & 0x2) != 0)
+ {
+ ch = *pSrc;
+ pSrc++;
+ if (ch > 0x7F) goto LongCode;
+
+ *pTarget = (unsigned char)ch;
+ ENSURE_BUFFER_INC
+ }
+
+ // Run 4 characters at a time!
+ while (pSrc < pStop)
+ {
+ ch = *(int*)pSrc; // two chars per 32-bit load
+ int chc = *(int*)(pSrc + 2);
+
+ if (((ch | chc) & (int)0xFF80FF80) != 0) goto LongCodeWithMask;
+
+ if (pTarget + 4 > pAllocatedBufferEnd)
+ {
+ errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
+ return 0;
+ }
+
+ // Unfortunately, this is endianness sensitive
+#if BIGENDIAN
+ if (!self->treatAsLE)
+ {
+ *pTarget = (unsigned char)(ch >> 16);
+ *(pTarget + 1) = (unsigned char)ch;
+ pSrc += 4;
+ *(pTarget + 2) = (unsigned char)(chc >> 16);
+ *(pTarget + 3) = (unsigned char)chc;
+ pTarget += 4;
+ }
+ else
+#endif
+ {
+ *pTarget = (unsigned char)ch;
+ *(pTarget + 1) = (unsigned char)(ch >> 16);
+ pSrc += 4;
+ *(pTarget + 2) = (unsigned char)chc;
+ *(pTarget + 3) = (unsigned char)(chc >> 16);
+ pTarget += 4;
+ }
+ }
+ continue; // back to the top of the enclosing fast loop, which re-tests pSrc < pStop
+
+ LongCodeWithMask:
+#if BIGENDIAN
+ // be careful about the sign extension
+ if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16);
+ else
+#endif
+ ch = (CHAR16_T)ch; // keep only the low 16 bits of the 32-bit load
+ pSrc++;
+
+ if (ch > 0x7F) goto LongCode;
+
+ *pTarget = (unsigned char)ch;
+ ENSURE_BUFFER_INC
+ continue;
+
+ LongCode:
+ // use separate helper variables for slow and fast loop so that the jit optimizations
+ // won't get confused about the variable lifetimes
+ if (ch <= 0x7FF)
+ {
+ // 2 unsigned char encoding
+ chd = 0xC0 | (ch >> 6);
+ }
+ else
+ {
+ if (!InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END))
+ {
+ // 3 unsigned char encoding
+ chd = 0xE0 | (ch >> 12);
+ }
+ else
+ {
+ // 4 unsigned char encoding - high surrogate + low surrogate
+ if (ch > HIGH_SURROGATE_END)
+ {
+ // low without high -> bad, try again in slow loop
+ pSrc -= 1;
+ break;
+ }
+
+ chd = *pSrc;
+ pSrc++;
+
+ if (!InRange(chd, LOW_SURROGATE_START, LOW_SURROGATE_END))
+ {
+ // high not followed by low -> bad, try again in slow loop
+ pSrc -= 2;
+ break;
+ }
+
+ ch = chd + (ch << 10) +
+ (0x10000
+ - LOW_SURROGATE_START
+ - (HIGH_SURROGATE_START << 10));
+
+ *pTarget = (unsigned char)(0xF0 | (ch >> 18));
+ // pStop - this unsigned char is compensated by the second surrogate character
+ // 2 input chars require 4 output bytes. 2 have been anticipated already
+ // and 2 more will be accounted for by the 2 pStop-- calls below.
+ ENSURE_BUFFER_INC
+
+ chd = 0x80 | ((ch >> 12) & 0x3F);
+ }
+ *pTarget = (unsigned char)chd;
+ pStop--; // 3 unsigned char sequence for 1 char, so need pStop-- and the one below too.
+ ENSURE_BUFFER_INC
+
+ chd = 0x80 | ((ch >> 6) & 0x3F);
+ }
+ *pTarget = (unsigned char)chd;
+ pStop--; // 2 unsigned char sequence for 1 char so need pStop--.
+ ENSURE_BUFFER_INC
+
+ *pTarget = (unsigned char)(0x80 | (ch & 0x3F));
+ // pStop - this unsigned char is already included
+ ENSURE_BUFFER_INC
+ }
+
+ assert(pTarget <= pAllocatedBufferEnd);
+
+ // no pending char at this point
+ ch = 0;
+ }
+
+ if (pSrc < pEnd) // input left unconsumed: the loop stopped early because the output was full
+ {
+ errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
+ return 0;
+ }
+
+ return (int)(pTarget - bytes); // NOTE(review): narrowed to int before widening back to the size_t return type
+}
+
+static size_t GetByteCount(UTF8Encoding* self, CHAR16_T *chars, size_t count)
+{
+ // For fallback we may need a fallback buffer.
+ // We wait to initialize it though in case we don't have any broken input unicode
+ bool fallbackUsed = false;
+ CHAR16_T *pSrc = chars;
+ CHAR16_T *pEnd = pSrc + count;
+
+ // Start by assuming we have as many as count
+ size_t byteCount = count;
+
+ int ch = 0;
+
+ while (true)
+ {
+ // SLOWLOOP: does all range checks, handles all special cases, but it is slow
+ if (pSrc >= pEnd)
+ {
+
+ if (ch == 0)
+ {
+ // Unroll any fallback that happens at the end
+ ch = fallbackUsed ? EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder) : 0;
+ if (ch > 0)
+ {
+ byteCount++;
+ goto ProcessChar;
+ }
+ }
+ else
+ {
+ // Case of surrogates in the fallback.
+ if (fallbackUsed && self->buffer.encoder.bFallingBack)
+ {
+ assert(ch >= 0xD800 && ch <= 0xDBFF);
+
+ ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder);
+ byteCount++;
+
+ if (InRange(ch, LOW_SURROGATE_START, LOW_SURROGATE_END))
+ {
+ ch = 0xfffd;
+ byteCount++;
+ goto EncodeChar;
+ }
+ else if (ch > 0)
+ {
+ goto ProcessChar;
+ }
+ else
+ {
+ byteCount--; // ignore last one.
+ break;
+ }
+ }
+ }
+
+ if (ch <= 0)
+ {
+ break;
+ }
+
+ // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
+ byteCount++;
+ goto EncodeChar;
+ }
+
+ if (ch > 0)
+ {
+ assert(ch >= 0xD800 && ch <= 0xDBFF);
+
+ // use separate helper variables for local contexts so that the jit optimizations
+ // won't get confused about the variable lifetimes
+ int cha = *pSrc;
+
+ // count the pending surrogate
+ byteCount++;
+
+ // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
+ if (InRange(cha, LOW_SURROGATE_START, LOW_SURROGATE_END))
+ {
+ // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
+ ch = 0xfffd;
+ // ch = cha + (ch << 10) +
+ // (0x10000
+ // - LOW_SURROGATE_START
+ // - (HIGH_SURROGATE_START << 10) );
+
+ // Use this next char
+ pSrc++;
+ }
+ // else ch is still high surrogate and encoding will fail (so don't add count)
+
+ // attempt to encode the surrogate or partial surrogate
+ goto EncodeChar;
+ }
+
+ // If we've used a fallback, then we have to check for it
+ if (fallbackUsed)
+ {
+ ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder);
+ if (ch > 0)
+ {
+ // We have an extra byte we weren't expecting.
+ byteCount++;
+ goto ProcessChar;
+ }
+ }
+
+ // read next char. The JIT optimization seems to be getting confused when
+ // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
+ ch = *pSrc;
+ pSrc++;
+
+ ProcessChar:
+ if (InRange(ch, HIGH_SURROGATE_START, HIGH_SURROGATE_END))
+ {
+ // we will count this surrogate next time around
+ byteCount--;
+ continue;
+ }
+ // either good char or partial surrogate
+
+ EncodeChar:
+ // throw exception on partial surrogate if necessary
+ if (InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END))
+ {
+ // Lone surrogates aren't allowed
+ // Have to make a fallback buffer if we don't have one
+ if (!fallbackUsed)
+ {
+ // wait on fallbacks if we can
+ // For fallback we may need a fallback buffer
+ fallbackUsed = true;
+
+ // Set our internal fallback interesting things.
+ EncoderReplacementFallbackBuffer_InternalInitialize(&self->buffer.encoder, chars, chars + count, false);
+ }
+
+ // Do our fallback. Actually we already know its a mixed up surrogate,
+ // so the ref pSrc isn't gonna do anything.
+ EncoderReplacementFallbackBuffer_InternalFallback(&self->buffer.encoder, (CHAR16_T)ch, &pSrc);
+
+ // Ignore it if we don't throw (we had preallocated this ch)
+ byteCount--;
+ ch = 0;
+ continue;
+ }
+
+ // Count them
+ if (ch > 0x7F)
+ {
+ if (ch > 0x7FF)
+ {
+ // the extra surrogate byte was compensated by the second surrogate character
+ // (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char)
+ byteCount++;
+ }
+ byteCount++;
+ }
+
+#if WIN64
+ // check for overflow
+ if (byteCount < 0)
+ {
+ break;
+ }
+#endif
+
+ // If still have fallback don't do fast loop
+ if (fallbackUsed && (ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder)) != 0)
+ {
+ // We're reserving 1 byte for each char by default
+ byteCount++;
+ goto ProcessChar;
+ }
+
+ int availableChars = pEnd - pSrc;
+
+ // don't fall into the fast decoding loop if we don't have enough characters
+ if (availableChars <= 13)
+ {
+ // try to get over the remainder of the ascii characters fast though
+ CHAR16_T* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
+ while (pSrc < pLocalEnd)
+ {
+ ch = *pSrc;
+ pSrc++;
+ if (ch > 0x7F) goto ProcessChar;
+ }
+
+ // we are done
+ break;
+ }
+
+#if WIN64
+ // make sure that we won't get a silent overflow inside the fast loop
+ // (Fall out to slow loop if we have this many characters)
+ availableChars &= 0x0FFFFFFF;
+#endif
+
+ // To compute the upper bound, assume that all characters are ASCII characters at this point,
+ // the boundary will be decreased for every non-ASCII character we encounter
+ // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
+ CHAR16_T *pStop = pSrc + availableChars - (3 + 4);
+
+ while (pSrc < pStop)
+ {
+ ch = *pSrc;
+ pSrc++;
+
+ if (ch > 0x7F) // Not ASCII
+ {
+ if (ch > 0x7FF) // Not 2 Byte
+ {
+ if ((ch & 0xF800) == 0xD800) // See if its a Surrogate
+ goto LongCode;
+ byteCount++;
+ }
+ byteCount++;
+ }
+
+ // get pSrc aligned
+ if (((size_t)pSrc & 0x2) != 0)
+ {
+ ch = *pSrc;
+ pSrc++;
+ if (ch > 0x7F) // Not ASCII
+ {
+ if (ch > 0x7FF) // Not 2 Byte
+ {
+ if ((ch & 0xF800) == 0xD800) // See if its a Surrogate
+ goto LongCode;
+ byteCount++;
+ }
+ byteCount++;
+ }
+ }
+
+ // Run 2 * 4 characters at a time!
+ while (pSrc < pStop)
+ {
+ ch = *(int*)pSrc;
+ int chc = *(int*)(pSrc + 2);
+ if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII
+ {
+ if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte
+ {
+ goto LongCodeWithMask;
+ }
+
+
+ if ((ch & (int)0xFF800000) != 0) // Actually 0x07800780 is all we care about (4 bits)
+ byteCount++;
+ if ((ch & (int)0xFF80) != 0)
+ byteCount++;
+ if ((chc & (int)0xFF800000) != 0)
+ byteCount++;
+ if ((chc & (int)0xFF80) != 0)
+ byteCount++;
+ }
+ pSrc += 4;
+
+ ch = *(int*)pSrc;
+ chc = *(int*)(pSrc + 2);
+ if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII
+ {
+ if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte
+ {
+ goto LongCodeWithMask;
+ }
+
+ if ((ch & (int)0xFF800000) != 0)
+ byteCount++;
+ if ((ch & (int)0xFF80) != 0)
+ byteCount++;
+ if ((chc & (int)0xFF800000) != 0)
+ byteCount++;
+ if ((chc & (int)0xFF80) != 0)
+ byteCount++;
+ }
+ pSrc += 4;
+ }
+ break;
+
+ LongCodeWithMask:
+#if BIGENDIAN
+ // be careful about the sign extension
+ if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16);
+ else
+#endif
+ ch = (CHAR16_T)ch;
+
+ pSrc++;
+
+ if (ch <= 0x7F)
+ {
+ continue;
+ }
+
+ LongCode:
+ // use separate helper variables for slow and fast loop so that the jit optimizations
+ // won't get confused about the variable lifetimes
+ if (ch > 0x7FF)
+ {
+ if (InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END))
+ {
+ // 4 byte encoding - high surrogate + low surrogate
+
+ int chd = *pSrc;
+ if (
+ ch > HIGH_SURROGATE_END ||
+ !InRange(chd, LOW_SURROGATE_START, LOW_SURROGATE_END))
+ {
+ // Back up and drop out to slow loop to figure out error
+ pSrc--;
+ break;
+ }
+ pSrc++;
+
+ // byteCount - this byte is compensated by the second surrogate character
+ }
+ byteCount++;
+ }
+ byteCount++;
+
+ // byteCount - the last byte is already included
+ }
+
+ // no pending char at this point
+ ch = 0;
+ }
+
+#if WIN64
+ // check for overflow
+ assert(byteCount >= 0);
+#endif
+ assert(!fallbackUsed || self->buffer.encoder.fallbackCount < 0);
+
+ return byteCount;
+}
+
+// Reports the UTF-16 length needed to decode a UTF-8 input, as computed by GetCharCount.
+// errno is cleared first; an empty input short-circuits to 0.
+size_t minipal_get_length_utf8_to_utf16(const char* source, size_t sourceLength, unsigned int flags)
+{
+    errno = 0;
+
+    if (sourceLength == 0)
+    {
+        return 0;
+    }
+
+    UTF8Encoding encoding =
+    {
+        .buffer =
+        {
+            .decoder =
+            {
+                .fallbackCount = -1,
+                .fallbackIndex = -1,
+                .strDefault = { 0xFFFD, 0 },
+                .strDefaultLength = 1
+            }
+        },
+        // the fallback stays enabled unless the caller passed MINIPAL_MB_NO_REPLACE_INVALID_CHARS
+        .useFallback = !(flags & MINIPAL_MB_NO_REPLACE_INVALID_CHARS),
+#if BIGENDIAN
+        .treatAsLE = (flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN)
+#endif
+    };
+
+    unsigned char* bytes = (unsigned char*)source;
+    return GetCharCount(&encoding, bytes, sourceLength);
+}
+
+// Reports the UTF-8 length needed to encode a UTF-16 input, as computed by GetByteCount.
+// errno is cleared first; an empty input short-circuits to 0.
+size_t minipal_get_length_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength, unsigned int flags)
+{
+    errno = 0;
+
+    if (sourceLength == 0)
+    {
+        return 0;
+    }
+
+#if !BIGENDIAN
+    (void)flags; // unused
+#endif
+
+    UTF8Encoding encoding =
+    {
+        .buffer =
+        {
+            .encoder =
+            {
+                .fallbackCount = -1,
+                .fallbackIndex = -1,
+                // repeat replacement char (0xFFFD) twice for a surrogate pair
+                .strDefault = { 0xFFFD, 0xFFFD, 0 },
+                .strDefaultLength = 2
+            }
+        },
+        .useFallback = true,
+#if BIGENDIAN
+        .treatAsLE = (flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN)
+#endif
+    };
+
+    CHAR16_T* chars = (CHAR16_T*)source;
+    return GetByteCount(&encoding, chars, sourceLength);
+}
+
+// Decodes a UTF-8 input into the caller's UTF-16 buffer through GetChars.
+// errno is cleared first; an empty input returns 0, and so does any run that leaves errno set.
+size_t minipal_convert_utf8_to_utf16(const char* source, size_t sourceLength, CHAR16_T* destination, size_t destinationLength, unsigned int flags)
+{
+    errno = 0;
+
+    if (sourceLength == 0)
+    {
+        return 0;
+    }
+
+    UTF8Encoding encoding =
+    {
+        .buffer =
+        {
+            .decoder =
+            {
+                .fallbackCount = -1,
+                .fallbackIndex = -1,
+                .strDefault = { 0xFFFD, 0 },
+                .strDefaultLength = 1
+            }
+        },
+        // the fallback stays enabled unless the caller passed MINIPAL_MB_NO_REPLACE_INVALID_CHARS
+        .useFallback = !(flags & MINIPAL_MB_NO_REPLACE_INVALID_CHARS),
+#if BIGENDIAN
+        .treatAsLE = (flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN)
+#endif
+    };
+
+    const size_t produced = GetChars(&encoding, (unsigned char*)source, sourceLength, destination, destinationLength);
+
+    // a non-zero errno after the conversion is reported to the caller as a zero count
+    return errno ? 0 : produced;
+}
+
+// Encodes a UTF-16 input into the caller's UTF-8 buffer through GetBytes.
+// errno is cleared first; an empty input returns 0, and so does any run that leaves errno set.
+size_t minipal_convert_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength, char* destination, size_t destinationLength, unsigned int flags)
+{
+    errno = 0;
+
+    if (sourceLength == 0)
+    {
+        return 0;
+    }
+
+#if !BIGENDIAN
+    (void)flags; // unused
+#endif
+
+    UTF8Encoding encoding =
+    {
+        .buffer =
+        {
+            .encoder =
+            {
+                .fallbackCount = -1,
+                .fallbackIndex = -1,
+                // repeat replacement char (0xFFFD) twice for a surrogate pair
+                .strDefault = { 0xFFFD, 0xFFFD, 0 },
+                .strDefaultLength = 2
+            }
+        },
+        .useFallback = true,
+#if BIGENDIAN
+        .treatAsLE = (flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN)
+#endif
+    };
+
+    const size_t produced = GetBytes(&encoding, (CHAR16_T*)source, sourceLength, (unsigned char*)destination, destinationLength);
+
+    // a non-zero errno after the conversion is reported to the caller as a zero count
+    return errno ? 0 : produced;
+}
--- /dev/null
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+#ifndef HAVE_MINIPAL_UTF8_H
+#define HAVE_MINIPAL_UTF8_H
+
+#include <minipal/utils.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+// Flag bits for the `flags` parameter of the conversion functions declared in this header.
+// Each flag occupies one distinct bit so that flags can be OR-ed together and tested with `&`.
+#define MINIPAL_MB_NO_REPLACE_INVALID_CHARS 0x00000008
+// Was 0x00000016 (bits 0x02 | 0x04 | 0x10), which made the `&` test fire on unrelated bits.
+// 0x00000010 is the single bit shared with that old value, so callers built against it still match.
+#define MINIPAL_TREAT_AS_LITTLE_ENDIAN 0x00000010
+// Error codes; the values equal Win32 ERROR_INSUFFICIENT_BUFFER and ERROR_NO_UNICODE_TRANSLATION.
+// NOTE(review): presumably these are what the converters store in errno on failure - confirm in utf8.c.
+#define MINIPAL_ERROR_INSUFFICIENT_BUFFER 122L
+#define MINIPAL_ERROR_NO_UNICODE_TRANSLATION 1113L
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif // __cplusplus
+
+// Code unit type used for UTF-16 strings in this API:
+// wchar_t when TARGET_WINDOWS is defined, unsigned short otherwise.
+#ifdef TARGET_WINDOWS
+typedef wchar_t CHAR16_T;
+#else
+typedef unsigned short CHAR16_T;
+#endif
+
+/**
+ * Get length of destination needed for UTF-8 to UTF-16 (UCS-2) conversion
+ *
+ * errno is reset to 0 on entry.
+ *
+ * @param source The source string in UTF-8 format.
+ * @param sourceLength Length of the source string, counted in char elements. A length of 0 yields a return value of 0.
+ * @param flags Flags to alter the behavior of converter. MINIPAL_MB_NO_REPLACE_INVALID_CHARS turns off the replacement fallback
+ *              (which otherwise substitutes 0xFFFD). MINIPAL_TREAT_AS_LITTLE_ENDIAN is only consulted in builds where BIGENDIAN is set.
+ * @return Length of UTF-16 buffer required by the conversion.
+ */
+size_t minipal_get_length_utf8_to_utf16(const char* source, size_t sourceLength, unsigned int flags);
+
+/**
+ * Get length of destination needed for UTF-16 (UCS-2) to UTF-8 conversion
+ *
+ * errno is reset to 0 on entry. The replacement fallback (0xFFFD) is always enabled for this direction.
+ *
+ * @param source The source string in UTF-16 format.
+ * @param sourceLength Length of the source string, counted in CHAR16_T elements. A length of 0 yields a return value of 0.
+ * @param flags Flags to alter the behavior of converter. Only MINIPAL_TREAT_AS_LITTLE_ENDIAN is consulted, and only in builds
+ *              where BIGENDIAN is set; otherwise the parameter is unused. MINIPAL_MB_NO_REPLACE_INVALID_CHARS has no effect here.
+ * @return Length of UTF-8 buffer required by the conversion.
+ */
+size_t minipal_get_length_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength, unsigned int flags);
+
+/**
+ * Convert a string from UTF-8 to UTF-16 (UCS-2) with preallocated memory
+ *
+ * errno is reset to 0 on entry. If errno is non-zero after the conversion, the function returns 0.
+ *
+ * @param source The source string in UTF-8 format.
+ * @param sourceLength Length of the source string, counted in char elements. A length of 0 yields a return value of 0.
+ * @param destination Pointer to the destination UTF-16 string. It can be NULL to query number of items required by the conversion.
+ * @param destinationLength Length of the destination string.
+ * @param flags Flags to alter the behavior of converter. MINIPAL_MB_NO_REPLACE_INVALID_CHARS turns off the replacement fallback
+ *              (which otherwise substitutes 0xFFFD). MINIPAL_TREAT_AS_LITTLE_ENDIAN is only consulted in builds where BIGENDIAN is set.
+ * @return Number of items written by the conversion, or 0 when the source is empty or errno was set during the conversion.
+ */
+size_t minipal_convert_utf8_to_utf16(const char* source, size_t sourceLength, CHAR16_T* destination, size_t destinationLength, unsigned int flags);
+
+/**
+ * Convert a string from UTF-16 (UCS-2) to UTF-8 with preallocated memory
+ *
+ * errno is reset to 0 on entry. If errno is non-zero after the conversion, the function returns 0.
+ * The replacement fallback (0xFFFD) is always enabled for this direction.
+ *
+ * @param source The source string in UTF-16 format.
+ * @param sourceLength Length of the source string, counted in CHAR16_T elements. A length of 0 yields a return value of 0.
+ * @param destination Pointer to the destination UTF-8 string. It can be NULL to query number of items required by the conversion.
+ * @param destinationLength Length of the destination string.
+ * @param flags Flags to alter the behavior of converter. Only MINIPAL_TREAT_AS_LITTLE_ENDIAN is consulted, and only in builds
+ *              where BIGENDIAN is set; otherwise the parameter is unused. MINIPAL_MB_NO_REPLACE_INVALID_CHARS has no effect here.
+ * @return Number of items written by the conversion, or 0 when the source is empty or errno was set during the conversion.
+ */
+size_t minipal_convert_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength, char* destination, size_t destinationLength, unsigned int flags);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif /* HAVE_MINIPAL_UTF8_H */