Share UTF8 converters between coreclr and mono (#85558)

author Adeel Mujahid <3840695+am11@users.noreply.github.com>

Thu, 22 Jun 2023 13:30:16 +0000 (16:30 +0300)

committer GitHub <noreply@github.com>

Thu, 22 Jun 2023 13:30:16 +0000 (06:30 -0700)
author Adeel Mujahid <3840695+am11@users.noreply.github.com>
Thu, 22 Jun 2023 13:30:16 +0000 (16:30 +0300)
committer GitHub <noreply@github.com>
Thu, 22 Jun 2023 13:30:16 +0000 (06:30 -0700)
diff --git a/src/coreclr/inc/utilcode.h b/src/coreclr/inc/utilcode.h

index a332a6c..bc84e71 100644 (file)
--- a/src/coreclr/inc/utilcode.h
+++ b/src/coreclr/inc/utilcode.h
@@ -185,15 +185,6 @@ typedef LPSTR   LPUTF8;
  // given and ANSI String, copy it into a wide buffer.
  // be careful about scoping when using this macro!
  //
-// how to use the below two macros:
-//
-//  ...
-//  LPSTR pszA;
-//  pszA = MyGetAnsiStringRoutine();
-//  MAKE_WIDEPTR_FROMANSI(pwsz, pszA);
-//  MyUseWideStringRoutine(pwsz);
-//  ...
-//
  // similarily for MAKE_ANSIPTR_FROMWIDE.  note that the first param does not
  // have to be declared, and no clean up must be done.
  //
@@ -211,25 +202,6 @@ typedef LPSTR   LPUTF8;
  #define MAKE_TRANSLATIONFAILED ThrowWin32(ERROR_NO_UNICODE_TRANSLATION)
  #endif
  
-// This version throws on conversion errors (ie, no best fit character
-// mapping to characters that look similar, and no use of the default char
-// ('?') when printing out unrepresentable characters.  Use this method for
-// most development in the EE, especially anything like metadata or class
-// names.  See the BESTFIT version if you're printing out info to the console.
-#define MAKE_MULTIBYTE_FROMWIDE(ptrname, widestr, codepage) \
-    int __l##ptrname = (int)u16_strlen(widestr);        \
-    if (__l##ptrname > MAKE_MAX_LENGTH)         \
-        MAKE_TOOLONGACTION;                     \
-    __l##ptrname = (int)((__l##ptrname + 1) * 2 * sizeof(char)); \
-    CQuickBytes __CQuickBytes##ptrname; \
-    __CQuickBytes##ptrname.AllocThrows(__l##ptrname); \
-    BOOL __b##ptrname; \
-    DWORD __cBytes##ptrname = WszWideCharToMultiByte(codepage, WC_NO_BEST_FIT_CHARS, widestr, -1, (LPSTR)__CQuickBytes##ptrname.Ptr(), __l##ptrname, NULL, &__b##ptrname); \
-    if (__b##ptrname || (__cBytes##ptrname == 0 && (widestr[0] != W('\0')))) { \
-        MAKE_TRANSLATIONFAILED; \
-    } \
-    LPSTR ptrname = (LPSTR)__CQuickBytes##ptrname.Ptr()
-
  // This version does best fit character mapping and also allows the use
  // of the default char ('?') for any Unicode character that isn't
  // representable.  This is reasonable for writing to the console, but
@@ -247,40 +219,6 @@ typedef LPSTR   LPUTF8;
      } \
      LPSTR ptrname = (LPSTR)__CQuickBytes##ptrname.Ptr()
  
-// Use for anything critical other than output to console, where weird
-// character mappings are unacceptable.
-#define MAKE_ANSIPTR_FROMWIDE(ptrname, widestr) MAKE_MULTIBYTE_FROMWIDE(ptrname, widestr, CP_ACP)
-
-// Use for output to the console.
-#define MAKE_ANSIPTR_FROMWIDE_BESTFIT(ptrname, widestr) MAKE_MULTIBYTE_FROMWIDE_BESTFIT(ptrname, widestr, CP_ACP)
-
-#define MAKE_WIDEPTR_FROMANSI(ptrname, ansistr) \
-    CQuickBytes __qb##ptrname; \
-    int __l##ptrname; \
-    __l##ptrname = WszMultiByteToWideChar(CP_ACP, 0, ansistr, -1, 0, 0); \
-    if (__l##ptrname > MAKE_MAX_LENGTH) \
-        MAKE_TOOLONGACTION; \
-    LPWSTR ptrname = (LPWSTR) __qb##ptrname.AllocThrows((__l##ptrname+1)*sizeof(WCHAR));  \
-    if (WszMultiByteToWideChar(CP_ACP, MB_ERR_INVALID_CHARS, ansistr, -1, ptrname, __l##ptrname) == 0) { \
-        MAKE_TRANSLATIONFAILED; \
-    }
-
-#define MAKE_WIDEPTR_FROMANSI_NOTHROW(ptrname, ansistr) \
-    CQuickBytes __qb##ptrname; \
-    LPWSTR ptrname = 0; \
-    int __l##ptrname; \
-    __l##ptrname = WszMultiByteToWideChar(CP_ACP, 0, ansistr, -1, 0, 0); \
-    if (__l##ptrname <= MAKE_MAX_LENGTH) { \
-        ptrname = (LPWSTR) __qb##ptrname.AllocNoThrow((__l##ptrname+1)*sizeof(WCHAR));  \
-        if (ptrname) { \
-            if (WszMultiByteToWideChar(CP_ACP, MB_ERR_INVALID_CHARS, ansistr, -1, ptrname, __l##ptrname) != 0) { \
-                ptrname[__l##ptrname] = 0; \
-            } else { \
-                ptrname = 0; \
-            } \
-        } \
-    }
-
  #define MAKE_UTF8PTR_FROMWIDE(ptrname, widestr) CQuickBytes _##ptrname; _##ptrname.ConvertUnicode_Utf8(widestr); LPSTR ptrname = (LPSTR) _##ptrname.Ptr();
  
  #define MAKE_UTF8PTR_FROMWIDE_NOTHROW(ptrname, widestr) \
@@ -312,22 +250,8 @@ typedef LPSTR   LPUTF8;
          } \
      } \
  
-#define MAKE_WIDEPTR_FROMUTF8N(ptrname, utf8str, n8chrs) \
-    CQuickBytes __qb##ptrname; \
-    int __l##ptrname; \
-    __l##ptrname = WszMultiByteToWideChar(CP_UTF8, 0, utf8str, n8chrs, 0, 0); \
-    if (__l##ptrname > MAKE_MAX_LENGTH) \
-        MAKE_TOOLONGACTION; \
-    LPWSTR ptrname = (LPWSTR) __qb##ptrname .AllocThrows((__l##ptrname+1)*sizeof(WCHAR)); \
-    if (0==WszMultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, utf8str, n8chrs, ptrname, __l##ptrname)) { \
-        MAKE_TRANSLATIONFAILED; \
-    } \
-    ptrname[__l##ptrname] = 0;
-
-
  #define MAKE_WIDEPTR_FROMUTF8(ptrname, utf8str) CQuickBytes _##ptrname;  _##ptrname.ConvertUtf8_Unicode(utf8str); LPCWSTR ptrname = (LPCWSTR) _##ptrname.Ptr();
  
-
  #define MAKE_WIDEPTR_FROMUTF8N_NOTHROW(ptrname, utf8str, n8chrs) \
      CQuickBytes __qb##ptrname; \
      int __l##ptrname; \
@@ -346,42 +270,10 @@ typedef LPSTR   LPUTF8;
  
  #define MAKE_WIDEPTR_FROMUTF8_NOTHROW(ptrname, utf8str)   MAKE_WIDEPTR_FROMUTF8N_NOTHROW(ptrname, utf8str, -1)
  
-// This method takes the number of characters
-#define MAKE_MULTIBYTE_FROMWIDEN(ptrname, widestr, _nCharacters, _pCnt, codepage)        \
-    CQuickBytes __qb##ptrname; \
-    int __l##ptrname; \
-    __l##ptrname = WszWideCharToMultiByte(codepage, WC_NO_BEST_FIT_CHARS, widestr, _nCharacters, NULL, 0, NULL, NULL);           \
-    if (__l##ptrname > MAKE_MAX_LENGTH) \
-        MAKE_TOOLONGACTION; \
-    ptrname = (LPUTF8) __qb##ptrname .AllocThrows(__l##ptrname+1); \
-    BOOL __b##ptrname; \
-    DWORD _pCnt = WszWideCharToMultiByte(codepage, WC_NO_BEST_FIT_CHARS, widestr, _nCharacters, ptrname, __l##ptrname, NULL, &__b##ptrname);  \
-    if (__b##ptrname || (_pCnt == 0 && _nCharacters > 0)) { \
-        MAKE_TRANSLATIONFAILED; \
-    } \
-    ptrname[__l##ptrname] = 0;
-
-#define MAKE_MULTIBYTE_FROMWIDEN_BESTFIT(ptrname, widestr, _nCharacters, _pCnt, codepage)        \
-    CQuickBytes __qb##ptrname; \
-    int __l##ptrname; \
-    __l##ptrname = WszWideCharToMultiByte(codepage, 0, widestr, _nCharacters, NULL, 0, NULL, NULL);           \
-    if (__l##ptrname > MAKE_MAX_LENGTH) \
-        MAKE_TOOLONGACTION; \
-    ptrname = (LPUTF8) __qb##ptrname .AllocThrows(__l##ptrname+1); \
-    DWORD _pCnt = WszWideCharToMultiByte(codepage, 0, widestr, _nCharacters, ptrname, __l##ptrname, NULL, NULL);  \
-    if (_pCnt == 0 && _nCharacters > 0) { \
-        MAKE_TRANSLATIONFAILED; \
-    } \
-    ptrname[__l##ptrname] = 0;
-
-#define MAKE_ANSIPTR_FROMWIDEN(ptrname, widestr, _nCharacters, _pCnt)        \
-       MAKE_MULTIBYTE_FROMWIDEN(ptrname, widestr, _nCharacters, _pCnt, CP_ACP)
-
  const SIZE_T MaxSigned32BitDecString = ARRAY_SIZE("-2147483648") - 1;
  const SIZE_T MaxUnsigned32BitDecString = ARRAY_SIZE("4294967295") - 1;
  const SIZE_T MaxIntegerDecHexString = ARRAY_SIZE("-9223372036854775808") - 1;
  
-const SIZE_T Max16BitHexString = ARRAY_SIZE("1234") - 1;
  const SIZE_T Max32BitHexString = ARRAY_SIZE("12345678") - 1;
  const SIZE_T Max64BitHexString = ARRAY_SIZE("1234567812345678") - 1;
  
@@ -410,77 +302,6 @@ inline WCHAR* FormatInteger(WCHAR* str, size_t strCount, const char* fmt, I v)
      return str;
  }
  
-inline
-LPWSTR DuplicateString(
-    LPCWSTR wszString,
-    size_t  cchString)
-{
-    STATIC_CONTRACT_NOTHROW;
-
-    LPWSTR wszDup = NULL;
-    if (wszString != NULL)
-    {
-        wszDup = new (nothrow) WCHAR[cchString + 1];
-        if (wszDup != NULL)
-        {
-            wcscpy_s(wszDup, cchString + 1, wszString);
-        }
-    }
-    return wszDup;
-}
-
-inline
-LPWSTR DuplicateString(
-    LPCWSTR wszString)
-{
-    STATIC_CONTRACT_NOTHROW;
-
-    if (wszString != NULL)
-    {
-        return DuplicateString(wszString, u16_strlen(wszString));
-    }
-    else
-    {
-        return NULL;
-    }
-}
-
-void DECLSPEC_NORETURN ThrowOutOfMemory();
-
-inline
-LPWSTR DuplicateStringThrowing(
-    LPCWSTR wszString,
-    size_t cchString)
-{
-    STATIC_CONTRACT_THROWS;
-
-    if (wszString == NULL)
-        return NULL;
-
-    LPWSTR wszDup = DuplicateString(wszString, cchString);
-    if (wszDup == NULL)
-        ThrowOutOfMemory();
-
-    return wszDup;
-}
-
-inline
-LPWSTR DuplicateStringThrowing(
-    LPCWSTR wszString)
-{
-    STATIC_CONTRACT_THROWS;
-
-    if (wszString == NULL)
-        return NULL;
-
-    LPWSTR wszDup = DuplicateString(wszString);
-    if (wszDup == NULL)
-        ThrowOutOfMemory();
-
-    return wszDup;
-}
-
-
  //*****************************************************************************
  // Placement new is used to new and object at an exact location.  The pointer
  // is simply returned to the caller without actually using the heap.  The
diff --git a/src/coreclr/pal/src/CMakeLists.txt b/src/coreclr/pal/src/CMakeLists.txt

index 4ec2c2b..804a712 100644 (file)
--- a/src/coreclr/pal/src/CMakeLists.txt
+++ b/src/coreclr/pal/src/CMakeLists.txt
@@ -152,7 +152,7 @@ set(SOURCES
    loader/module.cpp
    locale/unicode.cpp
    locale/unicodedata.cpp
-  locale/utf8.cpp
+  ${CLR_SRC_NATIVE_DIR}/minipal/utf8.c
    map/common.cpp
    map/map.cpp
    map/virtual.cpp
diff --git a/src/coreclr/pal/src/include/pal/utf8.h b/src/coreclr/pal/src/include/pal/utf8.h

deleted file mode 100644 (file)

index fa417c0..0000000
--- a/src/coreclr/pal/src/include/pal/utf8.h
+++ /dev/null
@@ -1,52 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-
-/*++
-
-
-
-Module Name:
-
-    include/pal/utf8.h
-
-Abstract:
-    Header file for UTF-8 conversion functions.
-
-Revision History:
-
-
-
---*/
-
-#ifndef _PAL_UTF8_H_
-#define _PAL_UTF8_H_
-
-#include <pal/palinternal.h> /* for WCHAR */
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif // __cplusplus
-
-/*++
-Function :
-    UTF8ToUnicode
-
-    Convert a string from UTF-8 to UTF-16 (UCS-2)
---*/
-int UTF8ToUnicode(LPCSTR lpSrcStr, int cchSrc, LPWSTR lpDestStr, int cchDest, DWORD dwFlags);
-
-
-/*++
-Function :
-    UnicodeToUTF8
-
-    Convert a string from UTF-16 (UCS-2) to UTF-8
---*/
-int UnicodeToUTF8(LPCWSTR lpSrcStr, int cchSrc, LPSTR lpDestStr, int cchDest);
-
-#ifdef __cplusplus
-}
-#endif // __cplusplus
-
-#endif /* _PAL_UTF8_H_ */
diff --git a/src/coreclr/pal/src/locale/unicode.cpp b/src/coreclr/pal/src/locale/unicode.cpp

index f29eabc..8bfa586 100644 (file)
--- a/src/coreclr/pal/src/locale/unicode.cpp
+++ b/src/coreclr/pal/src/locale/unicode.cpp
@@ -24,7 +24,7 @@ Revision History:
  #include "pal/palinternal.h"
  #include "pal/dbgmsg.h"
  #include "pal/file.h"
-#include "pal/utf8.h"
+#include <minipal/utf8.h>
  #include "pal/cruntime.h"
  #include "pal/stackstring.hpp"
  #include "pal/unicodedata.h"
@@ -253,16 +253,20 @@ MultiByteToWideChar(
          goto EXIT;
      }
  
-    // Use UTF8ToUnicode on all systems, since it replaces
-    // invalid characters and Core Foundation doesn't do that.
      if (CodePage == CP_UTF8 || CodePage == CP_ACP)
      {
-        if (cbMultiByte <= -1)
+        if (cbMultiByte < 0)
+            cbMultiByte = strlen(lpMultiByteStr) + 1;
+
+        if (!lpWideCharStr || cchWideChar == 0)
+            retval = minipal_get_length_utf8_to_utf16(lpMultiByteStr, cbMultiByte, dwFlags);
+
+        if (lpWideCharStr)
          {
-        cbMultiByte = strlen(lpMultiByteStr) + 1;
+            if (cchWideChar == 0) cchWideChar = retval;
+            retval = minipal_convert_utf8_to_utf16(lpMultiByteStr, cbMultiByte, (CHAR16_T*)lpWideCharStr, cchWideChar, dwFlags);
          }
  
-        retval = UTF8ToUnicode(lpMultiByteStr, cbMultiByte, lpWideCharStr, cchWideChar, dwFlags);
          goto EXIT;
      }
  
@@ -338,15 +342,20 @@ WideCharToMultiByte(
          defaultChar = *lpDefaultChar;
      }
  
-    // Use UnicodeToUTF8 on all systems because we use
-    // UTF8ToUnicode in MultiByteToWideChar() on all systems.
      if (CodePage == CP_UTF8 || CodePage == CP_ACP)
      {
-        if (cchWideChar == -1)
-        {
+        if (cchWideChar < 0)
              cchWideChar = PAL_wcslen(lpWideCharStr) + 1;
+
+        if (!lpMultiByteStr || cbMultiByte == 0)
+            retval = minipal_get_length_utf16_to_utf8((CHAR16_T*)lpWideCharStr, cchWideChar, dwFlags);
+
+        if (lpMultiByteStr)
+        {
+            if (cbMultiByte == 0) cbMultiByte = retval;
+            retval = minipal_convert_utf16_to_utf8((CHAR16_T*)lpWideCharStr, cchWideChar, lpMultiByteStr, cbMultiByte, dwFlags);
          }
-        retval = UnicodeToUTF8(lpWideCharStr, cchWideChar, lpMultiByteStr, cbMultiByte);
+
          goto EXIT;
      }
  
diff --git a/src/coreclr/pal/src/locale/utf8.cpp b/src/coreclr/pal/src/locale/utf8.cpp

deleted file mode 100644 (file)

index f07c69f..0000000
--- a/src/coreclr/pal/src/locale/utf8.cpp
+++ /dev/null
@@ -1,2937 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-
-/*++
-
-Module Name:
-
-    unicode/utf8.c
-
-Abstract:
-    Functions to encode and decode UTF-8 strings. This is a port of the C# version from Utf8Encoding.cs.
-
-Revision History:
-
---*/
-
-#include "pal/utf8.h"
-#include "pal/malloc.hpp"
-
-using namespace CorUnix;
-
-#define FASTLOOP
-
-struct CharUnicodeInfo
-{
-    static const WCHAR HIGH_SURROGATE_START = 0xd800;
-    static const WCHAR HIGH_SURROGATE_END = 0xdbff;
-    static const WCHAR LOW_SURROGATE_START = 0xdc00;
-    static const WCHAR LOW_SURROGATE_END = 0xdfff;
-};
-
-struct Char
-{
-    // Test if the wide character is a high surrogate
-    static bool IsHighSurrogate(const WCHAR c)
-    {
-        return (c & 0xFC00) == CharUnicodeInfo::HIGH_SURROGATE_START;
-    }
-
-    // Test if the wide character is a low surrogate
-    static bool IsLowSurrogate(const WCHAR c)
-    {
-        return (c & 0xFC00) == CharUnicodeInfo::LOW_SURROGATE_START;
-    }
-
-    // Test if the wide character is a surrogate half
-    static bool IsSurrogate(const WCHAR c)
-    {
-        return (c & 0xF800) == CharUnicodeInfo::HIGH_SURROGATE_START;
-    }
-
-    // Test if the wide character is a high surrogate
-    static bool IsHighSurrogate(const WCHAR* s, int index)
-    {
-        return IsHighSurrogate(s[index]);
-    }
-
-    // Test if the wide character is a low surrogate
-    static bool IsLowSurrogate(const WCHAR* s, int index)
-    {
-        return IsLowSurrogate(s[index]);
-    }
-
-    // Test if the wide character is a surrogate half
-    static bool IsSurrogate(const WCHAR* s, int index)
-    {
-        return IsSurrogate(s[index]);
-    }
-};
-
-class ArgumentException
-{
-
-public:
-    ArgumentException(LPCSTR message)
-    {
-    }
-
-    ArgumentException(LPCSTR message, LPCSTR argName)
-    {
-    }
-};
-
-class ArgumentNullException : public ArgumentException
-{
-public:
-    ArgumentNullException(LPCSTR argName)
-        : ArgumentException("Argument is NULL", argName)
-    {
-
-    }
-};
-
-class ArgumentOutOfRangeException : public ArgumentException
-{
-public:
-    ArgumentOutOfRangeException(LPCSTR argName, LPCSTR message)
-        : ArgumentException(message, argName)
-    {
-
-    }
-};
-
-class InsufficientBufferException : public ArgumentException
-{
-public:
-    InsufficientBufferException(LPCSTR message, LPCSTR argName)
-        : ArgumentException(message, argName)
-    {
-
-    }
-};
-
-class Contract
-{
-public:
-    static void Assert(bool cond, LPCSTR str)
-    {
-        if (!cond)
-        {
-            throw ArgumentException(str);
-        }
-    }
-
-    static void EndContractBlock()
-    {
-    }
-};
-
-class DecoderFallbackException : public ArgumentException
-{
-    BYTE *bytesUnknown;
-    int index;
-
-public:
-    DecoderFallbackException(
-        LPCSTR message, BYTE bytesUnknown[], int index) : ArgumentException(message)
-    {
-        this->bytesUnknown = bytesUnknown;
-        this->index = index;
-    }
-
-    BYTE *BytesUnknown()
-    {
-        return (bytesUnknown);
-    }
-
-    int GetIndex()
-    {
-        return index;
-    }
-};
-
-class DecoderFallbackBuffer;
-
-class DecoderFallback
-{
-public:
-
-    // Fallback
-    //
-    // Return the appropriate unicode string alternative to the character that need to fall back.
-
-    virtual DecoderFallbackBuffer* CreateFallbackBuffer() = 0;
-
-    // Maximum number of characters that this instance of this fallback could return
-
-    virtual int GetMaxCharCount() = 0;
-};
-
-class DecoderReplacementFallback : public DecoderFallback
-{
-    // Our variables
-    WCHAR strDefault[2];
-    int strDefaultLength;
-
-public:
-    // Construction.  Default replacement fallback uses no best fit and ? replacement string
-    DecoderReplacementFallback() : DecoderReplacementFallback(W("?"))
-    {
-    }
-
-    DecoderReplacementFallback(const WCHAR* replacement)
-    {
-        // Must not be null
-        if (replacement == nullptr)
-            throw ArgumentNullException("replacement");
-        Contract::EndContractBlock();
-
-        // Make sure it doesn't have bad surrogate pairs
-        bool bFoundHigh = false;
-        int replacementLength = PAL_wcslen((const WCHAR *)replacement);
-        for (int i = 0; i < replacementLength; i++)
-        {
-            // Found a surrogate?
-            if (Char::IsSurrogate(replacement, i))
-            {
-                // High or Low?
-                if (Char::IsHighSurrogate(replacement, i))
-                {
-                    // if already had a high one, stop
-                    if (bFoundHigh)
-                        break;  // break & throw at the bFoundHIgh below
-                    bFoundHigh = true;
-                }
-                else
-                {
-                    // Low, did we have a high?
-                    if (!bFoundHigh)
-                    {
-                        // Didn't have one, make if fail when we stop
-                        bFoundHigh = true;
-                        break;
-                    }
-
-                    // Clear flag
-                    bFoundHigh = false;
-                }
-            }
-            // If last was high we're in trouble (not surrogate so not low surrogate, so break)
-            else if (bFoundHigh)
-                break;
-        }
-        if (bFoundHigh)
-            throw ArgumentException("String 'replacement' contains invalid Unicode code points.", "replacement");
-
-        wcscpy_s(strDefault, ARRAY_SIZE(strDefault), replacement);
-        strDefaultLength = replacementLength;
-    }
-
-    WCHAR* GetDefaultString()
-    {
-        return strDefault;
-    }
-
-    virtual DecoderFallbackBuffer* CreateFallbackBuffer();
-
-    // Maximum number of characters that this instance of this fallback could return
-    virtual int GetMaxCharCount()
-    {
-        return strDefaultLength;
-    }
-};
-
-class DecoderFallbackBuffer
-{
-    friend class UTF8Encoding;
-    // Most implementations will probably need an implementation-specific constructor
-
-    // internal methods that cannot be overridden that let us do our fallback thing
-    // These wrap the internal methods so that we can check for people doing stuff that's incorrect
-
-public:
-    virtual ~DecoderFallbackBuffer() = default;
-
-    virtual bool Fallback(BYTE bytesUnknown[], int index, int size) = 0;
-
-    // Get next character
-    virtual WCHAR GetNextChar() = 0;
-
-    //Back up a character
-    virtual bool MovePrevious() = 0;
-
-    // How many chars left in this fallback?
-    virtual int GetRemaining() = 0;
-
-    // Clear the buffer
-    virtual void Reset()
-    {
-        while (GetNextChar() != (WCHAR)0);
-    }
-
-    // Internal items to help us figure out what we're doing as far as error messages, etc.
-    // These help us with our performance and messages internally
-protected:
-    BYTE*           byteStart;
-    WCHAR*          charEnd;
-
-    // Internal reset
-    void InternalReset()
-    {
-        byteStart = nullptr;
-        Reset();
-    }
-
-    // Set the above values
-    // This can't be part of the constructor because EncoderFallbacks would have to know how to implement these.
-    void InternalInitialize(BYTE* byteStart, WCHAR* charEnd)
-    {
-        this->byteStart = byteStart;
-        this->charEnd = charEnd;
-    }
-
-    // Fallback the current byte by sticking it into the remaining char buffer.
-    // This can only be called by our encodings (other have to use the public fallback methods), so
-    // we can use our DecoderNLS here too (except we don't).
-    // Returns true if we are successful, false if we can't fallback the character (no buffer space)
-    // So caller needs to throw buffer space if return false.
-    // Right now this has both bytes and bytes[], since we might have extra bytes, hence the
-    // array, and we might need the index, hence the byte*
-    // Don't touch ref chars unless we succeed
-    virtual bool InternalFallback(BYTE bytes[], BYTE* pBytes, WCHAR** chars, int size)
-    {
-
-        Contract::Assert(byteStart != nullptr, "[DecoderFallback.InternalFallback]Used InternalFallback without calling InternalInitialize");
-
-        // See if there's a fallback character and we have an output buffer then copy our string.
-        if (this->Fallback(bytes, (int)(pBytes - byteStart - size), size))
-        {
-            // Copy the chars to our output
-            WCHAR ch;
-            WCHAR* charTemp = *chars;
-            bool bHighSurrogate = false;
-            while ((ch = GetNextChar()) != 0)
-            {
-                // Make sure no mixed up surrogates
-                if (Char::IsSurrogate(ch))
-                {
-                    if (Char::IsHighSurrogate(ch))
-                    {
-                        // High Surrogate
-                        if (bHighSurrogate)
-                            throw ArgumentException("String 'chars' contains invalid Unicode code points.");
-                        bHighSurrogate = true;
-                    }
-                    else
-                    {
-                        // Low surrogate
-                        if (!bHighSurrogate)
-                            throw ArgumentException("String 'chars' contains invalid Unicode code points.");
-                        bHighSurrogate = false;
-                    }
-                }
-
-                if (charTemp >= charEnd)
-                {
-                    // No buffer space
-                    return false;
-                }
-
-                *(charTemp++) = ch;
-            }
-
-            // Need to make sure that bHighSurrogate isn't true
-            if (bHighSurrogate)
-                throw ArgumentException("String 'chars' contains invalid Unicode code points.");
-
-            // Now we aren't going to be false, so its OK to update chars
-            *chars = charTemp;
-        }
-
-        return true;
-    }
-
-    // This version just counts the fallback and doesn't actually copy anything.
-    virtual int InternalFallback(BYTE bytes[], BYTE* pBytes, int size)
-        // Right now this has both bytes[] and BYTE* bytes, since we might have extra bytes, hence the
-        // array, and we might need the index, hence the byte*
-    {
-
-        Contract::Assert(byteStart != nullptr, "[DecoderFallback.InternalFallback]Used InternalFallback without calling InternalInitialize");
-
-        // See if there's a fallback character and we have an output buffer then copy our string.
-        if (this->Fallback(bytes, (int)(pBytes - byteStart - size), size))
-        {
-            int count = 0;
-
-            WCHAR ch;
-            bool bHighSurrogate = false;
-            while ((ch = GetNextChar()) != 0)
-            {
-                // Make sure no mixed up surrogates
-                if (Char::IsSurrogate(ch))
-                {
-                    if (Char::IsHighSurrogate(ch))
-                    {
-                        // High Surrogate
-                        if (bHighSurrogate)
-                            throw ArgumentException("String 'chars' contains invalid Unicode code points.");
-                        bHighSurrogate = true;
-                    }
-                    else
-                    {
-                        // Low surrogate
-                        if (!bHighSurrogate)
-                            throw ArgumentException("String 'chars' contains invalid Unicode code points.");
-                        bHighSurrogate = false;
-                    }
-                }
-
-                count++;
-            }
-
-            // Need to make sure that bHighSurrogate isn't true
-            if (bHighSurrogate)
-                throw ArgumentException("String 'chars' contains invalid Unicode code points.");
-
-            return count;
-        }
-
-        // If no fallback return 0
-        return 0;
-    }
-
-    // private helper methods
-    void ThrowLastBytesRecursive(BYTE bytesUnknown[])
-    {
-        throw ArgumentException("Recursive fallback not allowed");
-    }
-};
-
-class DecoderReplacementFallbackBuffer : public DecoderFallbackBuffer
-{
-    // Store our default string
-    WCHAR strDefault[2];
-    int strDefaultLength;
-    int fallbackCount = -1;
-    int fallbackIndex = -1;
-
-public:
-    // Construction
-    DecoderReplacementFallbackBuffer(DecoderReplacementFallback* fallback)
-    {
-        wcscpy_s(strDefault, ARRAY_SIZE(strDefault), fallback->GetDefaultString());
-        strDefaultLength = PAL_wcslen((const WCHAR *)fallback->GetDefaultString());
-    }
-
-    // Fallback Methods
-    virtual bool Fallback(BYTE bytesUnknown[], int index, int size)
-    {
-        // We expect no previous fallback in our buffer
-        // We can't call recursively but others might (note, we don't test on last char!!!)
-        if (fallbackCount >= 1)
-        {
-            ThrowLastBytesRecursive(bytesUnknown);
-        }
-
-        // Go ahead and get our fallback
-        if (strDefaultLength == 0)
-            return false;
-
-        fallbackCount = strDefaultLength;
-        fallbackIndex = -1;
-
-        return true;
-    }
-
-    virtual WCHAR GetNextChar()
-    {
-        // We want it to get < 0 because == 0 means that the current/last character is a fallback
-        // and we need to detect recursion.  We could have a flag but we already have this counter.
-        fallbackCount--;
-        fallbackIndex++;
-
-        // Do we have anything left? 0 is now last fallback char, negative is nothing left
-        if (fallbackCount < 0)
-            return '\0';
-
-        // Need to get it out of the buffer.
-        // Make sure it didn't wrap from the fast count-- path
-        if (fallbackCount == INT_MAX)
-        {
-            fallbackCount = -1;
-            return '\0';
-        }
-
-        // Now make sure its in the expected range
-        Contract::Assert(fallbackIndex < strDefaultLength && fallbackIndex >= 0,
-            "Index exceeds buffer range");
-
-        return strDefault[fallbackIndex];
-    }
-
-    virtual bool MovePrevious()
-    {
-        // Back up one, only if we just processed the last character (or earlier)
-        if (fallbackCount >= -1 && fallbackIndex >= 0)
-        {
-            fallbackIndex--;
-            fallbackCount++;
-            return true;
-        }
-
-        // Return false 'cause we couldn't do it.
-        return false;
-    }
-
-    // How many characters left to output?
-    virtual int GetRemaining()
-    {
-        // Our count is 0 for 1 character left.
-        return (fallbackCount < 0) ? 0 : fallbackCount;
-    }
-
-    // Clear the buffer
-    virtual void Reset()
-    {
-        fallbackCount = -1;
-        fallbackIndex = -1;
-        byteStart = nullptr;
-    }
-
-    // This version just counts the fallback and doesn't actually copy anything.
-    virtual int InternalFallback(BYTE bytes[], BYTE* pBytes, int size)
-        // Right now this has both bytes and bytes[], since we might have extra bytes, hence the
-        // array, and we might need the index, hence the byte*
-    {
-        // return our replacement string Length
-        return strDefaultLength;
-    }
-};
-
-class DecoderExceptionFallbackBuffer : public DecoderFallbackBuffer
-{
-public:
-    DecoderExceptionFallbackBuffer()
-    {
-    }
-
-    virtual bool Fallback(BYTE bytesUnknown[], int index, int size)
-    {
-        throw DecoderFallbackException(
-            "Unable to translate UTF-8 character to Unicode", bytesUnknown, index);
-    }
-
-    virtual WCHAR GetNextChar()
-    {
-        return 0;
-    }
-
-    virtual bool MovePrevious()
-    {
-        // Exception fallback doesn't have anywhere to back up to.
-        return false;
-    }
-
-    // Exceptions are always empty
-    virtual int GetRemaining()
-    {
-        return 0;
-    }
-
-};
-
-class DecoderExceptionFallback : public DecoderFallback
-{
-    // Construction
-public:
-    DecoderExceptionFallback()
-    {
-    }
-
-    virtual DecoderFallbackBuffer* CreateFallbackBuffer()
-    {
-        return InternalNew<DecoderExceptionFallbackBuffer>();
-    }
-
-    // Maximum number of characters that this instance of this fallback could return
-    virtual int GetMaxCharCount()
-    {
-        return 0;
-    }
-};
-
-DecoderFallbackBuffer* DecoderReplacementFallback::CreateFallbackBuffer()
-{
-    return InternalNew<DecoderReplacementFallbackBuffer>(this);
-}
-
-class EncoderFallbackException : public ArgumentException
-{
-    WCHAR   charUnknown;
-    WCHAR   charUnknownHigh;
-    WCHAR   charUnknownLow;
-    int     index;
-
-public:
-    EncoderFallbackException(
-        LPCSTR message, WCHAR charUnknown, int index) : ArgumentException(message)
-    {
-        this->charUnknown = charUnknown;
-        this->index = index;
-    }
-
-    EncoderFallbackException(
-        LPCSTR message, WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) : ArgumentException(message)
-    {
-        if (!Char::IsHighSurrogate(charUnknownHigh))
-        {
-            throw ArgumentOutOfRangeException("charUnknownHigh",
-                "Argument out of range 0xD800..0xDBFF");
-        }
-        if (!Char::IsLowSurrogate(charUnknownLow))
-        {
-            throw ArgumentOutOfRangeException("charUnknownLow",
-                "Argument out of range 0xDC00..0xDFFF");
-        }
-        Contract::EndContractBlock();
-
-        this->charUnknownHigh = charUnknownHigh;
-        this->charUnknownLow = charUnknownLow;
-        this->index = index;
-    }
-
-    WCHAR GetCharUnknown()
-    {
-        return (charUnknown);
-    }
-
-    WCHAR GetCharUnknownHigh()
-    {
-        return (charUnknownHigh);
-    }
-
-    WCHAR GetCharUnknownLow()
-    {
-        return (charUnknownLow);
-    }
-
-    int GetIndex()
-    {
-        return index;
-    }
-
-    // Return true if the unknown character is a surrogate pair.
-    bool IsUnknownSurrogate()
-    {
-        return (charUnknownHigh != '\0');
-    }
-};
-
-class EncoderFallbackBuffer;
-
-class EncoderFallback
-{
-public:
-
-    // Fallback
-    //
-    // Return the appropriate unicode string alternative to the character that need to fall back.
-
-    virtual EncoderFallbackBuffer* CreateFallbackBuffer() = 0;
-
-    // Maximum number of characters that this instance of this fallback could return
-    virtual int GetMaxCharCount() = 0;
-};
-
-class EncoderReplacementFallback : public EncoderFallback
-{
-    // Our variables
-    WCHAR strDefault[2];
-    int strDefaultLength;
-
-public:
-    // Construction.  Default replacement fallback uses no best fit and ? replacement string
-    EncoderReplacementFallback() : EncoderReplacementFallback(W("?"))
-    {
-    }
-
-    EncoderReplacementFallback(const WCHAR* replacement)
-    {
-        // Must not be null
-        if (replacement == nullptr)
-            throw ArgumentNullException("replacement");
-        Contract::EndContractBlock();
-
-        // Make sure it doesn't have bad surrogate pairs
-        bool bFoundHigh = false;
-        int replacementLength = PAL_wcslen((const WCHAR *)replacement);
-        for (int i = 0; i < replacementLength; i++)
-        {
-            // Found a surrogate?
-            if (Char::IsSurrogate(replacement, i))
-            {
-                // High or Low?
-                if (Char::IsHighSurrogate(replacement, i))
-                {
-                    // if already had a high one, stop
-                    if (bFoundHigh)
-                        break;  // break & throw at the bFoundHIgh below
-                    bFoundHigh = true;
-                }
-                else
-                {
-                    // Low, did we have a high?
-                    if (!bFoundHigh)
-                    {
-                        // Didn't have one, make if fail when we stop
-                        bFoundHigh = true;
-                        break;
-                    }
-
-                    // Clear flag
-                    bFoundHigh = false;
-                }
-            }
-            // If last was high we're in trouble (not surrogate so not low surrogate, so break)
-            else if (bFoundHigh)
-                break;
-        }
-        if (bFoundHigh)
-            throw ArgumentException("String 'replacement' contains invalid Unicode code points.", "replacement");
-
-        wcscpy_s(strDefault, ARRAY_SIZE(strDefault), replacement);
-        strDefaultLength = replacementLength;
-    }
-
-    WCHAR* GetDefaultString()
-    {
-        return strDefault;
-    }
-
-    virtual EncoderFallbackBuffer* CreateFallbackBuffer();
-
-    // Maximum number of characters that this instance of this fallback could return
-    virtual int GetMaxCharCount()
-    {
-        return strDefaultLength;
-    }
-};
-
-class EncoderFallbackBuffer
-{
-    friend class UTF8Encoding;
-    // Most implementations will probably need an implementation-specific constructor
-
-    // Public methods that cannot be overridden that let us do our fallback thing
-    // These wrap the internal methods so that we can check for people doing stuff that is incorrect
-
-public:
-    virtual ~EncoderFallbackBuffer() = default;
-
-    virtual bool Fallback(WCHAR charUnknown, int index) = 0;
-
-    virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) = 0;
-
-    // Get next character
-    virtual WCHAR GetNextChar() = 0;
-
-    // Back up a character
-    virtual bool MovePrevious() = 0;
-
-    // How many chars left in this fallback?
-    virtual int GetRemaining() = 0;
-
-    // Not sure if this should be public or not.
-    // Clear the buffer
-    virtual void Reset()
-    {
-        while (GetNextChar() != (WCHAR)0);
-    }
-
-    // Internal items to help us figure out what we're doing as far as error messages, etc.
-    // These help us with our performance and messages internally
-protected:
-    WCHAR*          charStart;
-    WCHAR*          charEnd;
-    bool            setEncoder;
-    bool            bUsedEncoder;
-    bool            bFallingBack = false;
-    int             iRecursionCount = 0;
-    static const int iMaxRecursion = 250;
-
-    // Internal Reset
-    // For example, what if someone fails a conversion and wants to reset one of our fallback buffers?
-    void InternalReset()
-    {
-        charStart = nullptr;
-        bFallingBack = false;
-        iRecursionCount = 0;
-        Reset();
-    }
-
-    // Set the above values
-    // This can't be part of the constructor because EncoderFallbacks would have to know how to implement these.
-    void InternalInitialize(WCHAR* charStart, WCHAR* charEnd, bool setEncoder)
-    {
-        this->charStart = charStart;
-        this->charEnd = charEnd;
-        this->setEncoder = setEncoder;
-        this->bUsedEncoder = false;
-        this->bFallingBack = false;
-        this->iRecursionCount = 0;
-    }
-
-    WCHAR InternalGetNextChar()
-    {
-        WCHAR ch = GetNextChar();
-        bFallingBack = (ch != 0);
-        if (ch == 0) iRecursionCount = 0;
-        return ch;
-    }
-
-    // Fallback the current character using the remaining buffer and encoder if necessary
-    // This can only be called by our encodings (other have to use the public fallback methods), so
-    // we can use our EncoderNLS here too.
-    // setEncoder is true if we're calling from a GetBytes method, false if we're calling from a GetByteCount
-    //
-    // Note that this could also change the contents of this->encoder, which is the same
-    // object that the caller is using, so the caller could mess up the encoder for us
-    // if they aren't careful.
-    virtual bool InternalFallback(WCHAR ch, WCHAR** chars)
-    {
-        // Shouldn't have null charStart
-        Contract::Assert(charStart != nullptr,
-            "[EncoderFallback.InternalFallbackBuffer]Fallback buffer is not initialized");
-
-        // Get our index, remember chars was preincremented to point at next char, so have to -1
-        int index = (int)(*chars - charStart) - 1;
-
-        // See if it was a high surrogate
-        if (Char::IsHighSurrogate(ch))
-        {
-            // See if there's a low surrogate to go with it
-            if (*chars >= this->charEnd)
-            {
-                // Nothing left in input buffer
-                // No input, return 0
-            }
-            else
-            {
-                // Might have a low surrogate
-                WCHAR cNext = **chars;
-                if (Char::IsLowSurrogate(cNext))
-                {
-                    // If already falling back then fail
-                    if (bFallingBack && iRecursionCount++ > iMaxRecursion)
-                        ThrowLastCharRecursive(ch, cNext);
-
-                    // Next is a surrogate, add it as surrogate pair, and increment chars
-                    (*chars)++;
-                    bFallingBack = Fallback(ch, cNext, index);
-                    return bFallingBack;
-                }
-
-                // Next isn't a low surrogate, just fallback the high surrogate
-            }
-        }
-
-        // If already falling back then fail
-        if (bFallingBack && iRecursionCount++ > iMaxRecursion)
-            ThrowLastCharRecursive((int)ch);
-
-        // Fall back our char
-        bFallingBack = Fallback(ch, index);
-
-        return bFallingBack;
-    }
-
-    // private helper methods
-    void ThrowLastCharRecursive(WCHAR highSurrogate, WCHAR lowSurrogate)
-    {
-        // Throw it, using our complete character
-        throw ArgumentException("Recursive fallback not allowed", "chars");
-    }
-
-    void ThrowLastCharRecursive(int utf32Char)
-    {
-        throw ArgumentException("Recursive fallback not allowed", "chars");
-    }
-
-};
-
-class EncoderReplacementFallbackBuffer : public EncoderFallbackBuffer
-{
-    // Store our default string
-    WCHAR strDefault[4];
-    int strDefaultLength;
-    int fallbackCount = -1;
-    int fallbackIndex = -1;
-public:
-    // Construction
-    EncoderReplacementFallbackBuffer(EncoderReplacementFallback* fallback)
-    {
-        // 2X in case we're a surrogate pair
-        wcscpy_s(strDefault, ARRAY_SIZE(strDefault), fallback->GetDefaultString());
-        wcscat_s(strDefault, ARRAY_SIZE(strDefault), fallback->GetDefaultString());
-        strDefaultLength = 2 * PAL_wcslen((const WCHAR *)fallback->GetDefaultString());
-
-    }
-
-    // Fallback Methods
-    virtual bool Fallback(WCHAR charUnknown, int index)
-    {
-        // If we had a buffer already we're being recursive, throw, it's probably at the suspect
-        // character in our array.
-        if (fallbackCount >= 1)
-        {
-            // If we're recursive we may still have something in our buffer that makes this a surrogate
-            if (Char::IsHighSurrogate(charUnknown) && fallbackCount >= 0 &&
-                Char::IsLowSurrogate(strDefault[fallbackIndex + 1]))
-                ThrowLastCharRecursive(charUnknown, strDefault[fallbackIndex + 1]);
-
-            // Nope, just one character
-            ThrowLastCharRecursive((int)charUnknown);
-        }
-
-        // Go ahead and get our fallback
-        // Divide by 2 because we aren't a surrogate pair
-        fallbackCount = strDefaultLength / 2;
-        fallbackIndex = -1;
-
-        return fallbackCount != 0;
-    }
-
-    virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index)
-    {
-        // Double check input surrogate pair
-        if (!Char::IsHighSurrogate(charUnknownHigh))
-            throw ArgumentOutOfRangeException("charUnknownHigh",
-            "Argument out of range 0xD800..0xDBFF");
-
-        if (!Char::IsLowSurrogate(charUnknownLow))
-            throw ArgumentOutOfRangeException("charUnknownLow",
-            "Argument out of range 0xDC00..0xDFFF");
-        Contract::EndContractBlock();
-
-        // If we had a buffer already we're being recursive, throw, it's probably at the suspect
-        // character in our array.
-        if (fallbackCount >= 1)
-            ThrowLastCharRecursive(charUnknownHigh, charUnknownLow);
-
-        // Go ahead and get our fallback
-        fallbackCount = strDefaultLength;
-        fallbackIndex = -1;
-
-        return fallbackCount != 0;
-    }
-
-    virtual WCHAR GetNextChar()
-    {
-        // We want it to get < 0 because == 0 means that the current/last character is a fallback
-        // and we need to detect recursion.  We could have a flag but we already have this counter.
-        fallbackCount--;
-        fallbackIndex++;
-
-        // Do we have anything left? 0 is now last fallback char, negative is nothing left
-        if (fallbackCount < 0)
-            return '\0';
-
-        // Need to get it out of the buffer.
-        // Make sure it didn't wrap from the fast count-- path
-        if (fallbackCount == INT_MAX)
-        {
-            fallbackCount = -1;
-            return '\0';
-        }
-
-        // Now make sure its in the expected range
-        Contract::Assert(fallbackIndex < strDefaultLength && fallbackIndex >= 0,
-            "Index exceeds buffer range");
-
-        return strDefault[fallbackIndex];
-    }
-
-    virtual bool MovePrevious()
-    {
-        // Back up one, only if we just processed the last character (or earlier)
-        if (fallbackCount >= -1 && fallbackIndex >= 0)
-        {
-            fallbackIndex--;
-            fallbackCount++;
-            return true;
-        }
-
-        // Return false 'cause we couldn't do it.
-        return false;
-    }
-
-    // How many characters left to output?
-    virtual int GetRemaining()
-    {
-        // Our count is 0 for 1 character left.
-        return (fallbackCount < 0) ? 0 : fallbackCount;
-    }
-
-    // Clear the buffer
-    virtual void Reset()
-    {
-        fallbackCount = -1;
-        fallbackIndex = 0;
-        charStart = nullptr;
-        bFallingBack = false;
-    }
-};
-
-class EncoderExceptionFallbackBuffer : public EncoderFallbackBuffer
-{
-public:
-    EncoderExceptionFallbackBuffer()
-    {
-    }
-
-    virtual bool Fallback(WCHAR charUnknown, int index)
-    {
-        // Fall back our char
-        throw EncoderFallbackException("Unable to translate Unicode character to UTF-8", charUnknown, index);
-    }
-
-    virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index)
-    {
-        if (!Char::IsHighSurrogate(charUnknownHigh))
-        {
-            throw ArgumentOutOfRangeException("charUnknownHigh",
-                "Argument out of range 0xD800..0xDBFF");
-        }
-        if (!Char::IsLowSurrogate(charUnknownLow))
-        {
-            throw ArgumentOutOfRangeException("charUnknownLow",
-                "Argument out of range 0xDC00..0xDFFF");
-        }
-        Contract::EndContractBlock();
-
-        //int iTemp = Char::ConvertToUtf32(charUnknownHigh, charUnknownLow);
-
-        // Fall back our char
-        throw EncoderFallbackException(
-            "Unable to translate Unicode character to UTF-8", charUnknownHigh, charUnknownLow, index);
-    }
-
-    virtual WCHAR GetNextChar()
-    {
-        return 0;
-    }
-
-    virtual bool MovePrevious()
-    {
-        // Exception fallback doesn't have anywhere to back up to.
-        return false;
-    }
-
-    // Exceptions are always empty
-    virtual int GetRemaining()
-    {
-        return 0;
-    }
-};
-
-class EncoderExceptionFallback : public EncoderFallback
-{
-    // Construction
-public:
-    EncoderExceptionFallback()
-    {
-    }
-
-    virtual EncoderFallbackBuffer* CreateFallbackBuffer()
-    {
-        return InternalNew<EncoderExceptionFallbackBuffer>();
-    }
-
-    // Maximum number of characters that this instance of this fallback could return
-    virtual int GetMaxCharCount()
-    {
-        return 0;
-    }
-};
-
-EncoderFallbackBuffer* EncoderReplacementFallback::CreateFallbackBuffer()
-{
-    return InternalNew<EncoderReplacementFallbackBuffer>(this);
-}
-
-class UTF8Encoding
-{
-    EncoderFallback* encoderFallback;
-    // Instances of the two possible fallbacks. The constructor parameter
-    // determines which one to use.
-    EncoderReplacementFallback encoderReplacementFallback;
-    EncoderExceptionFallback encoderExceptionFallback;
-
-    DecoderFallback* decoderFallback;
-    // Instances of the two possible fallbacks. The constructor parameter
-    // determines which one to use.
-    DecoderReplacementFallback decoderReplacementFallback;
-    DecoderExceptionFallback decoderExceptionFallback;
-
-    bool InRange(int c, int begin, int end)
-    {
-        return begin <= c && c <= end;
-    }
-
-    size_t PtrDiff(WCHAR* ptr1, WCHAR* ptr2)
-    {
-        return ptr1 - ptr2;
-    }
-
-    size_t PtrDiff(BYTE* ptr1, BYTE* ptr2)
-    {
-        return ptr1 - ptr2;
-    }
-
-    void ThrowBytesOverflow()
-    {
-        // Special message to include fallback type in case fallback's GetMaxCharCount is broken
-        // This happens if user has implemented an encoder fallback with a broken GetMaxCharCount
-        throw InsufficientBufferException("The output byte buffer is too small to contain the encoded data", "bytes");
-    }
-
-    void ThrowBytesOverflow(bool nothingEncoded)
-    {
-        // Special message to include fallback type in case fallback's GetMaxCharCount is broken
-        // This happens if user has implemented an encoder fallback with a broken GetMaxCharCount
-        if (nothingEncoded){
-            ThrowBytesOverflow();
-        }
-    }
-
-    void ThrowCharsOverflow()
-    {
-        // Special message to include fallback type in case fallback's GetMaxCharCount is broken
-        // This happens if user has implemented a decoder fallback with a broken GetMaxCharCount
-        throw InsufficientBufferException("The output char buffer is too small to contain the encoded data", "chars");
-    }
-
-    void ThrowCharsOverflow(bool nothingEncoded)
-    {
-        // Special message to include fallback type in case fallback's GetMaxCharCount is broken
-        // This happens if user has implemented an decoder fallback with a broken GetMaxCharCount
-        if (nothingEncoded){
-            ThrowCharsOverflow();
-        }
-    }
-
-    // During GetChars we had an invalid byte sequence
-    // pSrc is backed up to the start of the bad sequence if we didn't have room to
-    // fall it back.  Otherwise pSrc remains where it is.
-    bool FallbackInvalidByteSequence(BYTE** pSrc, int ch, DecoderFallbackBuffer* fallback, WCHAR** pTarget)
-    {
-        // Get our byte[]
-        BYTE* pStart = *pSrc;
-        BYTE bytesUnknown[3];
-        int size = GetBytesUnknown(pStart, ch, bytesUnknown);
-
-        // Do the actual fallback
-        if (!fallback->InternalFallback(bytesUnknown, *pSrc, pTarget, size))
-        {
-            // Oops, it failed, back up to pStart
-            *pSrc = pStart;
-            return false;
-        }
-
-        // It worked
-        return true;
-    }
-
-    int FallbackInvalidByteSequence(BYTE* pSrc, int ch, DecoderFallbackBuffer *fallback)
-    {
-        // Get our byte[]
-        BYTE bytesUnknown[3];
-        int size = GetBytesUnknown(pSrc, ch, bytesUnknown);
-
-        // Do the actual fallback
-        int count = fallback->InternalFallback(bytesUnknown, pSrc, size);
-
-        // # of fallback chars expected.
-        // Note that we only get here for "long" sequences, and have already unreserved
-        // the count that we prereserved for the input bytes
-        return count;
-    }
-
-    int GetBytesUnknown(BYTE* pSrc, int ch, BYTE* bytesUnknown)
-    {
-        int size;
-
-        // See if it was a plain char
-        // (have to check >= 0 because we have all sorts of weird bit flags)
-        if (ch < 0x100 && ch >= 0)
-        {
-            pSrc--;
-            bytesUnknown[0] = (BYTE)ch;
-            size =  1;
-        }
-        // See if its an unfinished 2 byte sequence
-        else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0)
-        {
-            pSrc--;
-            bytesUnknown[0] = (BYTE)((ch & 0x1F) | 0xc0);
-            size = 1;
-        }
-        // So now we're either 2nd byte of 3 or 4 byte sequence or
-        // we hit a non-trail byte or we ran out of space for 3rd byte of 4 byte sequence
-        // 1st check if its a 4 byte sequence
-        else if ((ch & SupplimentarySeq) != 0)
-        {
-            //  3rd byte of 4 byte sequence?
-            if ((ch & (FinalByte >> 6)) != 0)
-            {
-                // 3rd byte of 4 byte sequence
-                pSrc -= 3;
-                bytesUnknown[0] = (BYTE)(((ch >> 12) & 0x07) | 0xF0);
-                bytesUnknown[1] = (BYTE)(((ch >> 6) & 0x3F) | 0x80);
-                bytesUnknown[2] = (BYTE)(((ch)& 0x3F) | 0x80);
-                size = 3;
-            }
-            else if ((ch & (FinalByte >> 12)) != 0)
-            {
-                // 2nd byte of a 4 byte sequence
-                pSrc -= 2;
-                bytesUnknown[0] = (BYTE)(((ch >> 6) & 0x07) | 0xF0);
-                bytesUnknown[1] = (BYTE)(((ch)& 0x3F) | 0x80);
-                size = 2;
-            }
-            else
-            {
-                // 4th byte of a 4 byte sequence
-                pSrc--;
-                bytesUnknown[0] = (BYTE)(((ch)& 0x07) | 0xF0);
-                size = 1;
-            }
-        }
-        else
-        {
-            // 2nd byte of 3 byte sequence?
-            if ((ch & (FinalByte >> 6)) != 0)
-            {
-                // So its 2nd byte of a 3 byte sequence
-                pSrc -= 2;
-                bytesUnknown[0] = (BYTE)(((ch >> 6) & 0x0F) | 0xE0);
-                bytesUnknown[1] = (BYTE)(((ch)& 0x3F) | 0x80);
-                size = 2;
-            }
-            else
-            {
-                // 1st byte of a 3 byte sequence
-                pSrc--;
-                bytesUnknown[0] = (BYTE)(((ch)& 0x0F) | 0xE0);
-                size = 1;
-            }
-        }
-
-        return size;
-    }
-
-public:
-
-    UTF8Encoding(bool isThrowException)
-        : encoderReplacementFallback(W("\xFFFD")), decoderReplacementFallback(W("\xFFFD"))
-    {
-        if (isThrowException)
-        {
-            encoderFallback = &encoderExceptionFallback;
-            decoderFallback = &decoderExceptionFallback;
-        }
-        else
-        {
-            encoderFallback = &encoderReplacementFallback;
-            decoderFallback = &decoderReplacementFallback;
-        }
-    }
-
-    // These are bitmasks used to maintain the state in the decoder. They occupy the higher bits
-    // while the actual character is being built in the lower bits. They are shifted together
-    // with the actual bits of the character.
-
-    // bits 30 & 31 are used for pending bits fixup
-    const int FinalByte = 1 << 29;
-    const int SupplimentarySeq = 1 << 28;
-    const int ThreeByteSeq = 1 << 27;
-
-    int GetCharCount(BYTE* bytes, int count)
-    {
-        Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetCharCount]bytes!=nullptr");
-        Contract::Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0");
-
-        // Initialize stuff
-        BYTE *pSrc = bytes;
-        BYTE *pEnd = pSrc + count;
-
-        // Start by assuming we have as many as count, charCount always includes the adjustment
-        // for the character being decoded
-        int charCount = count;
-        int ch = 0;
-        DecoderFallbackBuffer *fallback = nullptr;
-
-        while (true)
-        {
-            // SLOWLOOP: does all range checks, handles all special cases, but it is slow
-            if (pSrc >= pEnd) {
-                break;
-            }
-
-            // read next byte. The JIT optimization seems to be getting confused when
-            // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
-            int cha = *pSrc;
-
-            if (ch == 0) {
-                // no pending bits
-                goto ReadChar;
-            }
-
-            pSrc++;
-
-            // we are expecting to see trailing bytes like 10vvvvvv
-            if ((cha & 0xC0) != 0x80) {
-                // This can be a valid starting byte for another UTF8 byte sequence, so let's put
-                // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
-                pSrc--;
-                charCount += (ch >> 30);
-                goto InvalidByteSequence;
-            }
-
-            // fold in the new byte
-            ch = (ch << 6) | (cha & 0x3F);
-
-            if ((ch & FinalByte) == 0) {
-                Contract::Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
-                    "[UTF8Encoding.GetChars]Invariant volation");
-
-                if ((ch & SupplimentarySeq) != 0) {
-                    if ((ch & (FinalByte >> 6)) != 0) {
-                        // this is 3rd byte (of 4 byte supplimentary) - nothing to do
-                        continue;
-                    }
-
-                    // 2nd byte, check for non-shortest form of supplimentary char and the valid
-                    // supplimentary characters in range 0x010000 - 0x10FFFF at the same time
-                    if (!InRange(ch & 0x1F0, 0x10, 0x100)) {
-                        goto InvalidByteSequence;
-                    }
-                }
-                else {
-                    // Must be 2nd byte of a 3-byte sequence
-                    // check for non-shortest form of 3 byte seq
-                    if ((ch & (0x1F << 5)) == 0 ||                  // non-shortest form
-                        (ch & (0xF800 >> 6)) == (0xD800 >> 6))     // illegal individually encoded surrogate
-                    {
-                        goto InvalidByteSequence;
-                    }
-                }
-                continue;
-            }
-
-            // ready to punch
-
-            // adjust for surrogates in non-shortest form
-            if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq) {
-                charCount--;
-            }
-            goto EncodeChar;
-
-        InvalidByteSequence:
-            // this code fragment should be close to the gotos referencing it
-            // Have to do fallback for invalid bytes
-            if (fallback == nullptr)
-            {
-                fallback = decoderFallback->CreateFallbackBuffer();
-                fallback->InternalInitialize(bytes, nullptr);
-            }
-            charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
-
-            ch = 0;
-            continue;
-
-        ReadChar:
-            ch = *pSrc;
-            pSrc++;
-
-        ProcessChar:
-            if (ch > 0x7F) {
-                // If its > 0x7F, its start of a new multi-byte sequence
-
-                // Long sequence, so unreserve our char.
-                charCount--;
-
-                // bit 6 has to be non-zero for start of multibyte chars.
-                if ((ch & 0x40) == 0) {
-                    // Unexpected trail byte
-                    goto InvalidByteSequence;
-                }
-
-                // start a new long code
-                if ((ch & 0x20) != 0) {
-                    if ((ch & 0x10) != 0) {
-                        // 4 byte encoding - supplimentary character (2 surrogates)
-
-                        ch &= 0x0F;
-
-                        // check that bit 4 is zero and the valid supplimentary character
-                        // range 0x000000 - 0x10FFFF at the same time
-                        if (ch > 0x04) {
-                            ch |= 0xf0;
-                            goto InvalidByteSequence;
-                        }
-
-                        // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
-                        // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag.
-                        ch |= (FinalByte >> 3 * 6) |  // Final byte is 3 more bytes from now
-                            (1 << 30) |           // If it dies on next byte we'll need an extra char
-                            (3 << (30 - 2 * 6)) |     // If it dies on last byte we'll need to subtract a char
-                            (SupplimentarySeq) | (SupplimentarySeq >> 6) |
-                            (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
-
-                        // Our character count will be 2 characters for these 4 bytes, so subtract another char
-                        charCount--;
-                    }
-                    else {
-                        // 3 byte encoding
-                        // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
-                        ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
-                            (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
-
-                        // We'll expect 1 character for these 3 bytes, so subtract another char.
-                        charCount--;
-                    }
-                }
-                else {
-                    // 2 byte encoding
-
-                    ch &= 0x1F;
-
-                    // check for non-shortest form
-                    if (ch <= 1) {
-                        ch |= 0xc0;
-                        goto InvalidByteSequence;
-                    }
-
-                    // Add bit flags so we'll be flagged correctly
-                    ch |= (FinalByte >> 6);
-                }
-                continue;
-            }
-
-        EncodeChar:
-
-#ifdef FASTLOOP
-            int availableBytes = PtrDiff(pEnd, pSrc);
-
-            // don't fall into the fast decoding loop if we don't have enough bytes
-            if (availableBytes <= 13) {
-                // try to get over the remainder of the ascii characters fast though
-            BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
-                while (pSrc < pLocalEnd) {
-                    ch = *pSrc;
-                    pSrc++;
-
-                    if (ch > 0x7F)
-                        goto ProcessChar;
-                }
-                // we are done
-                ch = 0;
-                break;
-            }
-
-            // To compute the upper bound, assume that all characters are ASCII characters at this point,
-            //  the boundary will be decreased for every non-ASCII character we encounter
-            // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
-            BYTE *pStop = pSrc + availableBytes - 7;
-
-            while (pSrc < pStop) {
-                ch = *pSrc;
-                pSrc++;
-
-                if (ch > 0x7F) {
-                    goto LongCode;
-                }
-
-                // get pSrc 2-byte aligned
-                if (((size_t)pSrc & 0x1) != 0) {
-                    ch = *pSrc;
-                    pSrc++;
-                    if (ch > 0x7F) {
-                        goto LongCode;
-                    }
-                }
-
-                // get pSrc 4-byte aligned
-                if (((size_t)pSrc & 0x2) != 0) {
-                    ch = *(USHORT*)pSrc;
-                    if ((ch & 0x8080) != 0) {
-                        goto LongCodeWithMask16;
-                    }
-                    pSrc += 2;
-                }
-
-
-                // Run 8 + 8 characters at a time!
-                while (pSrc < pStop) {
-                    ch = *(int*)pSrc;
-                    int chb = *(int*)(pSrc + 4);
-                    if (((ch | chb) & (int)0x80808080) != 0) {
-                        goto LongCodeWithMask32;
-                    }
-                    pSrc += 8;
-
-                    // This is a really small loop - unroll it
-                    if (pSrc >= pStop)
-                        break;
-
-                    ch = *(int*)pSrc;
-                    chb = *(int*)(pSrc + 4);
-                    if (((ch | chb) & (int)0x80808080) != 0) {
-                        goto LongCodeWithMask32;
-                    }
-                    pSrc += 8;
-                }
-                break;
-
-#if BIGENDIAN
-            LongCodeWithMask32 :
-                // be careful about the sign extension
-                ch = (int)(((uint)ch) >> 16);
-            LongCodeWithMask16:
-                ch = (int)(((uint)ch) >> 8);
-#else // BIGENDIAN
-            LongCodeWithMask32:
-            LongCodeWithMask16:
-                ch &= 0xFF;
-#endif // BIGENDIAN
-                pSrc++;
-                if (ch <= 0x7F) {
-                    continue;
-                }
-
-            LongCode:
-                int chc = *pSrc;
-                pSrc++;
-
-                if (
-                    // bit 6 has to be zero
-                    (ch & 0x40) == 0 ||
-                    // we are expecting to see trailing bytes like 10vvvvvv
-                    (chc & 0xC0) != 0x80)
-                {
-                    goto BadLongCode;
-                }
-
-                chc &= 0x3F;
-
-                // start a new long code
-                if ((ch & 0x20) != 0) {
-
-                    // fold the first two bytes together
-                    chc |= (ch & 0x0F) << 6;
-
-                    if ((ch & 0x10) != 0) {
-                        // 4 byte encoding - surrogate
-                        ch = *pSrc;
-                        if (
-                            // check that bit 4 is zero, the non-shortest form of surrogate
-                            // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
-                            !InRange(chc >> 4, 0x01, 0x10) ||
-                            // we are expecting to see trailing bytes like 10vvvvvv
-                            (ch & 0xC0) != 0x80)
-                        {
-                            goto BadLongCode;
-                        }
-
-                        chc = (chc << 6) | (ch & 0x3F);
-
-                        ch = *(pSrc + 1);
-                        // we are expecting to see trailing bytes like 10vvvvvv
-                        if ((ch & 0xC0) != 0x80) {
-                            goto BadLongCode;
-                        }
-                        pSrc += 2;
-
-                        // extra byte
-                        charCount--;
-                    }
-                    else {
-                        // 3 byte encoding
-                        ch = *pSrc;
-                        if (
-                            // check for non-shortest form of 3 byte seq
-                            (chc & (0x1F << 5)) == 0 ||
-                            // Can't have surrogates here.
-                            (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
-                            // we are expecting to see trailing bytes like 10vvvvvv
-                            (ch & 0xC0) != 0x80)
-                        {
-                            goto BadLongCode;
-                        }
-                        pSrc++;
-
-                        // extra byte
-                        charCount--;
-                    }
-                }
-                else {
-                    // 2 byte encoding
-
-                    // check for non-shortest form
-                    if ((ch & 0x1E) == 0) {
-                        goto BadLongCode;
-                    }
-                }
-
-                // extra byte
-                charCount--;
-            }
-#endif // FASTLOOP
-
-            // no pending bits at this point
-            ch = 0;
-            continue;
-
-        BadLongCode:
-            pSrc -= 2;
-            ch = 0;
-            continue;
-        }
-
-        // May have a problem if we have to flush
-        if (ch != 0)
-        {
-            // We were already adjusting for these, so need to unadjust
-            charCount += (ch >> 30);
-            // Have to do fallback for invalid bytes
-            if (fallback == nullptr)
-            {
-                fallback = decoderFallback->CreateFallbackBuffer();
-                fallback->InternalInitialize(bytes, nullptr);
-            }
-            charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
-        }
-
-        // Shouldn't have anything in fallback buffer for GetCharCount
-        // (don't have to check m_throwOnOverflow for count)
-        Contract::Assert(fallback == nullptr || fallback->GetRemaining() == 0,
-            "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end");
-
-        InternalDelete(fallback);
-
-        return charCount;
-
-    }
-
-    int GetChars(BYTE* bytes, int byteCount, WCHAR* chars, int charCount) // Decodes byteCount UTF-8 bytes from 'bytes' into 'chars' (room for charCount WCHARs); returns PtrDiff(pTarget, chars), i.e. how far the output cursor advanced.
-    {
-        Contract::Assert(chars != nullptr, "[UTF8Encoding.GetChars]chars!=nullptr");
-        Contract::Assert(byteCount >= 0, "[UTF8Encoding.GetChars]byteCount >=0");
-        Contract::Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0");
-        Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetChars]bytes!=nullptr");
-        // Invalid input goes through decoderFallback's buffer; running out of output space ends in ThrowCharsOverflow.
-        BYTE *pSrc = bytes;
-        WCHAR *pTarget = chars;
-
-        BYTE *pEnd = pSrc + byteCount;
-        WCHAR *pAllocatedBufferEnd = pTarget + charCount;
-        // 'ch' carries decoder state across iterations: 0 means no partially read sequence is pending.
-        int ch = 0;
-
-        DecoderFallbackBuffer *fallback = nullptr;
-
-        while (true)
-        {
-            // SLOWLOOP: does all range checks, handles all special cases, but it is slow
-
-            if (pSrc >= pEnd) {
-                break;
-            }
-
-            // read next byte. The JIT optimization seems to be getting confused when
-            // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
-            int cha = *pSrc; // peek only: pSrc is advanced below when a sequence is pending, or at ReadChar otherwise
-
-            if (ch == 0) {
-                // no pending bits
-                goto ReadChar;
-            }
-
-            pSrc++;
-
-            // we are expecting to see trailing bytes like 10vvvvvv
-            if ((cha & 0xC0) != 0x80) {
-                // This can be a valid starting byte for another UTF8 byte sequence, so let's put
-                // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
-                pSrc--;
-                goto InvalidByteSequence;
-            }
-
-            // fold in the new byte
-            ch = (ch << 6) | (cha & 0x3F);
-
-            if ((ch & FinalByte) == 0) {
-                // Not at last byte yet
-                Contract::Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
-                    "[UTF8Encoding.GetChars]Invariant volation");
-
-                if ((ch & SupplimentarySeq) != 0) {
-                    // It's a 4-byte supplementary sequence
-                    if ((ch & (FinalByte >> 6)) != 0) {
-                        // this is 3rd byte of 4 byte sequence - nothing to do
-                        continue;
-                    }
-
-                    // 2nd byte of 4 bytes
-                    // check for non-shortest form of surrogate and the valid surrogate
-                    // range 0x000000 - 0x10FFFF at the same time
-                    if (!InRange(ch & 0x1F0, 0x10, 0x100)) {
-                        goto InvalidByteSequence;
-                    }
-                }
-                else {
-                    // Must be 2nd byte of a 3-byte sequence
-                    // check for non-shortest form of 3 byte seq
-                    if ((ch & (0x1F << 5)) == 0 ||                  // non-shortest form
-                        (ch & (0xF800 >> 6)) == (0xD800 >> 6))     // illegal individually encoded surrogate
-                    {
-                        goto InvalidByteSequence;
-                    }
-                }
-                continue;
-            }
-
-            // sequence is complete - ready to emit
-
-            // surrogate in shortest form?
-            // Might be possible to get rid of this?  Already did non-shortest check for 4-byte sequence when reading 2nd byte?
-            if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq) {
-                // let the range check for the second char throw the exception
-                if (pTarget < pAllocatedBufferEnd) {
-                    *pTarget = (WCHAR)(((ch >> 10) & 0x7FF) +
-                        (SHORT)((CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10))));
-                    pTarget++;
-
-                    ch = (ch & 0x3FF) +
-                        (int)(CharUnicodeInfo::LOW_SURROGATE_START);
-                }
-            }
-
-            goto EncodeChar;
-
-        InvalidByteSequence:
-            // this code fragment should be close to the gotos referencing it
-            // Have to do fallback for invalid bytes
-            if (fallback == nullptr)
-            {
-                fallback = decoderFallback->CreateFallbackBuffer();
-                fallback->InternalInitialize(bytes, pAllocatedBufferEnd);
-            }
-
-            // That'll back us up the appropriate # of bytes if we didn't get anywhere
-            if (!FallbackInvalidByteSequence(&pSrc, ch, fallback, &pTarget))
-            {
-                // Ran out of buffer space
-                // Need to throw an exception?
-                Contract::Assert(pSrc >= bytes || pTarget == chars,
-                    "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback");
-                fallback->InternalReset();
-                ThrowCharsOverflow(pTarget == chars);
-                ch = 0;
-                break;
-            }
-            Contract::Assert(pSrc >= bytes,
-                "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array");
-            ch = 0;
-            continue;
-
-        ReadChar:
-            ch = *pSrc;
-            pSrc++;
-
-        ProcessChar:
-            if (ch > 0x7F) {
-                // If it's > 0x7F, it's the start of a new multi-byte sequence
-
-                // bit 6 has to be non-zero
-                if ((ch & 0x40) == 0) {
-                    goto InvalidByteSequence;
-                }
-
-                // start a new long code
-                if ((ch & 0x20) != 0) {
-                    if ((ch & 0x10) != 0) {
-                        // 4 byte encoding - supplementary character (2 surrogates)
-
-                        ch &= 0x0F;
-
-                        // check that bit 4 is zero and the valid supplementary character
-                        // range 0x000000 - 0x10FFFF at the same time
-                        if (ch > 0x04) {
-                            ch |= 0xf0; // rebuild the original lead byte (0xF0 | low nibble) before passing it to the fallback
-                            goto InvalidByteSequence;
-                        }
-
-                        ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) |
-                            (SupplimentarySeq) | (SupplimentarySeq >> 6) |
-                            (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
-                    }
-                    else {
-                        // 3 byte encoding
-                        ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
-                            (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
-                    }
-                }
-                else {
-                    // 2 byte encoding
-
-                    ch &= 0x1F;
-
-                    // check for non-shortest form
-                    if (ch <= 1) {
-                        ch |= 0xc0; // rebuild the original lead byte (0xC0 or 0xC1) before passing it to the fallback
-                        goto InvalidByteSequence;
-                    }
-
-                    ch |= (FinalByte >> 6);
-                }
-                continue;
-            }
-
-        EncodeChar:
-            // write the pending character
-            if (pTarget >= pAllocatedBufferEnd)
-            {
-                // Fix chars so we make sure to throw if we didn't output anything
-                ch &= 0x1fffff;
-                if (ch > 0x7f)
-                {
-                    if (ch > 0x7ff)
-                    {
-                        if (ch >= CharUnicodeInfo::LOW_SURROGATE_START &&
-                            ch <= CharUnicodeInfo::LOW_SURROGATE_END)
-                        {
-                            pSrc--;     // It was 4 bytes
-                            pTarget--;  // 1 was stored already, but we can't remember 1/2, so back up
-                        }
-                        else if (ch > 0xffff)
-                        {
-                            pSrc--;     // It was 4 bytes, nothing was stored
-                        }
-                        pSrc--;         // It was at least 3 bytes
-                    }
-                    pSrc--;             // It was at least 2 bytes
-                }
-                pSrc--;
-
-                // Throw that we don't have enough room (pSrc could be < bytes if we had started to process
-                // a 4 byte sequence already)
-                Contract::Assert(pSrc >= bytes || pTarget == chars,
-                    "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]");
-                ThrowCharsOverflow(pTarget == chars);
-
-                // Don't store ch in decoder, we already backed up to its start
-                ch = 0;
-
-                // Didn't throw, just use this buffer size.
-                break;
-            }
-            *pTarget = (WCHAR)ch;
-            pTarget++;
-
-#ifdef FASTLOOP
-            int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget);
-            int availableBytes = PtrDiff(pEnd, pSrc);
-
-            // don't fall into the fast decoding loop if we don't have enough bytes
-            // Test for availableChars is done because pStop would be <= pTarget.
-            if (availableBytes <= 13) {
-                // we may need as many as 1 character per byte
-                if (availableChars < availableBytes) {
-                    // not enough output room.  no pending bits at this point
-                    ch = 0;
-                    continue;
-                }
-
-                // try to get over the remainder of the ascii characters fast though
-                BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
-                while (pSrc < pLocalEnd) {
-                    ch = *pSrc;
-                    pSrc++;
-
-                    if (ch > 0x7F)
-                        goto ProcessChar;
-
-                    *pTarget = (WCHAR)ch;
-                    pTarget++;
-                }
-                // we are done
-                ch = 0;
-                break;
-            }
-
-            // we may need as many as 1 character per byte, so reduce the byte count if necessary.
-            // If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
-            if (availableChars < availableBytes) {
-                availableBytes = availableChars;
-            }
-
-            // To compute the upper bound, assume that all characters are ASCII characters at this point,
-            //  the boundary will be decreased for every non-ASCII character we encounter
-            // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
-            WCHAR *pStop = pTarget + availableBytes - 7;
-
-            while (pTarget < pStop) {
-                ch = *pSrc;
-                pSrc++;
-
-                if (ch > 0x7F) {
-                    goto LongCode;
-                }
-                *pTarget = (WCHAR)ch;
-                pTarget++;
-
-                // get pSrc to be 2-byte aligned
-                if ((((size_t)pSrc) & 0x1) != 0) {
-                    ch = *pSrc;
-                    pSrc++;
-                    if (ch > 0x7F) {
-                        goto LongCode;
-                    }
-                    *pTarget = (WCHAR)ch;
-                    pTarget++;
-                }
-
-                // get pSrc to be 4-byte aligned
-                if ((((size_t)pSrc) & 0x2) != 0) {
-                    ch = *(USHORT*)pSrc; // two input bytes are loaded at once
-                    if ((ch & 0x8080) != 0) {
-                        goto LongCodeWithMask16;
-                    }
-
-                    // Unfortunately, this is endianness sensitive
-#if BIGENDIAN
-                    *pTarget = (WCHAR)((ch >> 8) & 0x7F);
-                    pSrc += 2;
-                    *(pTarget + 1) = (WCHAR)(ch & 0x7F);
-                    pTarget += 2;
-#else // BIGENDIAN
-                    *pTarget = (WCHAR)(ch & 0x7F);
-                    pSrc += 2;
-                    *(pTarget + 1) = (WCHAR)((ch >> 8) & 0x7F);
-                    pTarget += 2;
-#endif // BIGENDIAN
-                }
-
-                // Run 8 characters at a time!
-                while (pTarget < pStop) {
-                    ch = *(int*)pSrc;
-                    int chb = *(int*)(pSrc + 4);
-                    if (((ch | chb) & (int)0x80808080) != 0) {
-                        goto LongCodeWithMask32;
-                    }
-
-                    // Unfortunately, this is endianness sensitive
-#if BIGENDIAN
-                    *pTarget = (WCHAR)((ch >> 24) & 0x7F);
-                    *(pTarget + 1) = (WCHAR)((ch >> 16) & 0x7F);
-                    *(pTarget + 2) = (WCHAR)((ch >> 8) & 0x7F);
-                    *(pTarget + 3) = (WCHAR)(ch & 0x7F);
-                    pSrc += 8;
-                    *(pTarget + 4) = (WCHAR)((chb >> 24) & 0x7F);
-                    *(pTarget + 5) = (WCHAR)((chb >> 16) & 0x7F);
-                    *(pTarget + 6) = (WCHAR)((chb >> 8) & 0x7F);
-                    *(pTarget + 7) = (WCHAR)(chb & 0x7F);
-                    pTarget += 8;
-#else // BIGENDIAN
-                    *pTarget = (WCHAR)(ch & 0x7F);
-                    *(pTarget + 1) = (WCHAR)((ch >> 8) & 0x7F);
-                    *(pTarget + 2) = (WCHAR)((ch >> 16) & 0x7F);
-                    *(pTarget + 3) = (WCHAR)((ch >> 24) & 0x7F);
-                    pSrc += 8;
-                    *(pTarget + 4) = (WCHAR)(chb & 0x7F);
-                    *(pTarget + 5) = (WCHAR)((chb >> 8) & 0x7F);
-                    *(pTarget + 6) = (WCHAR)((chb >> 16) & 0x7F);
-                    *(pTarget + 7) = (WCHAR)((chb >> 24) & 0x7F);
-                    pTarget += 8;
-#endif // BIGENDIAN
-                }
-                break; // leaves the outer fast loop; the labels below are only reached through goto
-
-#if BIGENDIAN
-                LongCodeWithMask32 :
-                    // be careful about the sign extension
-                    ch = (int)(((uint)ch) >> 16);
-                LongCodeWithMask16:
-                    ch = (int)(((uint)ch) >> 8);
-#else // BIGENDIAN
-            LongCodeWithMask32:
-            LongCodeWithMask16:
-                ch &= 0xFF; // keep only the low byte of the multi-byte load, which is the byte at pSrc on little-endian
-#endif // BIGENDIAN
-                pSrc++;
-                if (ch <= 0x7F) {
-                    *pTarget = (WCHAR)ch;
-                    pTarget++;
-                    continue;
-                }
-
-            LongCode:
-                int chc = *pSrc;
-                pSrc++;
-
-                if (
-                    // bit 6 has to be non-zero
-                    (ch & 0x40) == 0 ||
-                    // we are expecting to see trailing bytes like 10vvvvvv
-                    (chc & 0xC0) != 0x80)
-                {
-                    goto BadLongCode;
-                }
-
-                chc &= 0x3F;
-
-                // start a new long code
-                if ((ch & 0x20) != 0) {
-
-                    // fold the first two bytes together
-                    chc |= (ch & 0x0F) << 6;
-
-                    if ((ch & 0x10) != 0) {
-                        // 4 byte encoding - surrogate
-                        ch = *pSrc;
-                        if (
-                            // check that bit 4 is zero, the non-shortest form of surrogate
-                            // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
-                            !InRange(chc >> 4, 0x01, 0x10) ||
-                            // we are expecting to see trailing bytes like 10vvvvvv
-                            (ch & 0xC0) != 0x80)
-                        {
-                            goto BadLongCode;
-                        }
-
-                        chc = (chc << 6) | (ch & 0x3F);
-
-                        ch = *(pSrc + 1);
-                        // we are expecting to see trailing bytes like 10vvvvvv
-                        if ((ch & 0xC0) != 0x80) {
-                            goto BadLongCode;
-                        }
-                        pSrc += 2;
-
-                        ch = (chc << 6) | (ch & 0x3F);
-
-                        *pTarget = (WCHAR)(((ch >> 10) & 0x7FF) +
-                            (SHORT)(CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10)));
-                        pTarget++;
-
-                        ch = (ch & 0x3FF) +
-                            (SHORT)(CharUnicodeInfo::LOW_SURROGATE_START);
-
-                        // extra byte, we're already planning 2 chars for 2 of these bytes,
-                        // but the big loop is testing the target against pStop, so we need
-                        // to subtract 2 more or we risk overrunning the input.  Subtract
-                        // one here and one below.
-                        pStop--;
-                    }
-                    else {
-                        // 3 byte encoding
-                        ch = *pSrc;
-                        if (
-                            // check for non-shortest form of 3 byte seq
-                            (chc & (0x1F << 5)) == 0 ||
-                            // Can't have surrogates here.
-                            (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
-                            // we are expecting to see trailing bytes like 10vvvvvv
-                            (ch & 0xC0) != 0x80)
-                        {
-                            goto BadLongCode;
-                        }
-                        pSrc++;
-
-                        ch = (chc << 6) | (ch & 0x3F);
-
-                        // extra byte, we're only expecting 1 char for each of these 3 bytes,
-                        // but the loop is testing the target (not source) against pStop, so
-                        // we need to subtract 2 more or we risk overrunning the input.
-                        // Subtract 1 here and one more below
-                        pStop--;
-                    }
-                }
-                else {
-                    // 2 byte encoding
-
-                    ch &= 0x1F;
-
-                    // check for non-shortest form
-                    if (ch <= 1) {
-                        goto BadLongCode;
-                    }
-                    ch = (ch << 6) | chc;
-                }
-
-                *pTarget = (WCHAR)ch;
-                pTarget++;
-
-                // extra byte, we're only expecting 1 char for each of these 2 bytes,
-                // but the loop is testing the target (not source) against pStop.
-                // subtract an extra count from pStop so that we don't overrun the input.
-                pStop--;
-            }
-#endif // FASTLOOP
-
-            Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd");
-
-            // no pending bits at this point
-            ch = 0;
-            continue;
-
-        BadLongCode:
-            pSrc -= 2; // step back over the lead byte and the byte consumed at LongCode so the slow loop re-reads them
-            ch = 0;
-            continue;
-        }
-        // Reached with ch != 0 only when input ran out mid-sequence (every other loop exit resets ch to 0 first).
-        if (ch != 0)
-        {
-            // Have to do fallback for invalid bytes
-            if (fallback == nullptr)
-            {
-                fallback = decoderFallback->CreateFallbackBuffer();
-                fallback->InternalInitialize(bytes, pAllocatedBufferEnd);
-            }
-
-            // This'll back us up the appropriate # of bytes if we didn't get anywhere. NOTE(review): pSrc is passed by value and pTarget is omitted here, unlike the call under InvalidByteSequence (&pSrc, &pTarget) - confirm this is intended.
-            if (!FallbackInvalidByteSequence(pSrc, ch, fallback))
-            {
-                Contract::Assert(pSrc >= bytes || pTarget == chars,
-                    "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing");
-
-                // Ran out of buffer space
-                // Need to throw an exception?
-                fallback->InternalReset();
-                ThrowCharsOverflow(pTarget == chars);
-            }
-            Contract::Assert(pSrc >= bytes,
-                "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array");
-            ch = 0;
-        }
-
-        // Shouldn't have anything in fallback buffer for GetChars
-        // (don't have to check m_throwOnOverflow for chars)
-        Contract::Assert(fallback == nullptr || fallback->GetRemaining() == 0,
-            "[UTF8Encoding.GetChars]Expected empty fallback buffer at end");
-
-        InternalDelete(fallback);
-
-        return PtrDiff(pTarget, chars);
-    }
-
-    int GetBytes(WCHAR* chars, int charCount, BYTE* bytes, int byteCount)
-    {
-        Contract::Assert(chars != nullptr, "[UTF8Encoding.GetBytes]chars!=nullptr");
-        Contract::Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0");
-        Contract::Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0");
-        Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetBytes]bytes!=nullptr");
-
-        // For fallback we may need a fallback buffer.
-        // We wait to initialize it though in case we don't have any broken input unicode
-        EncoderFallbackBuffer* fallbackBuffer = nullptr;
-        WCHAR *pSrc = chars;
-        BYTE *pTarget = bytes;
-
-        WCHAR *pEnd = pSrc + charCount;
-        BYTE *pAllocatedBufferEnd = pTarget + byteCount;
-
-        int ch = 0;
-
-        // assume that JIT will enregister pSrc, pTarget and ch
-
-        while (true) {
-            // SLOWLOOP: does all range checks, handles all special cases, but it is slow
-
-            if (pSrc >= pEnd) {
-
-                if (ch == 0) {
-                    // Check if there's anything left to get out of the fallback buffer
-                    ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : 0;
-                    if (ch > 0) {
-                        goto ProcessChar;
-                    }
-                }
-                else {
-                    // Case of leftover surrogates in the fallback buffer
-                    if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) {
-                        Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF,
-                            "[UTF8Encoding.GetBytes]expected high surrogate"); //, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
-
-                        int cha = ch;
-
-                        ch = fallbackBuffer->InternalGetNextChar();
-
-                        if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
-                            ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo::LOW_SURROGATE_START - (CharUnicodeInfo::HIGH_SURROGATE_START << 10));
-                            goto EncodeChar;
-                        }
-                        else if (ch > 0){
-                            goto ProcessChar;
-                        }
-                        else {
-                            break;
-                        }
-                    }
-                }
-
-                // attempt to encode the partial surrogate (will fail or ignore)
-                if (ch > 0)
-                    goto EncodeChar;
-
-                // We're done
-                break;
-            }
-
-            if (ch > 0) {
-                // We have a high surrogate left over from a previous loop.
-                Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF,
-                    "[UTF8Encoding.GetBytes]expected high surrogate");//, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
-
-                // use separate helper variables for local contexts so that the jit optimizations
-                // won't get confused about the variable lifetimes
-                int cha = *pSrc;
-
-                // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
-                // if (IsLowSurrogate(cha)) {
-                if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
-                    ch = cha + (ch << 10) +
-                        (0x10000
-                        - CharUnicodeInfo::LOW_SURROGATE_START
-                        - (CharUnicodeInfo::HIGH_SURROGATE_START << 10));
-
-                    pSrc++;
-                }
-                // else ch is still high surrogate and encoding will fail
-
-                // attempt to encode the surrogate or partial surrogate
-                goto EncodeChar;
-            }
-
-            // If we've used a fallback, then we have to check for it
-            if (fallbackBuffer != nullptr)
-            {
-                ch = fallbackBuffer->InternalGetNextChar();
-                if (ch > 0) goto ProcessChar;
-            }
-
-            // read next char. The JIT optimization seems to be getting confused when
-            // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
-            ch = *pSrc;
-            pSrc++;
-
-        ProcessChar:
-            if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) {
-                continue;
-            }
-            // either good char or partial surrogate
-
-        EncodeChar:
-            // throw exception on partial surrogate if necessary
-            if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END))
-            {
-                // Lone surrogates aren't allowed, we have to do fallback for them
-                // Have to make a fallback buffer if we don't have one
-                if (fallbackBuffer == nullptr)
-                {
-                    // wait on fallbacks if we can
-                    // For fallback we may need a fallback buffer
-                    fallbackBuffer = encoderFallback->CreateFallbackBuffer();
-
-                    // Set our internal fallback interesting things.
-                    fallbackBuffer->InternalInitialize(chars, pEnd, true);
-                }
-
-                // Do our fallback.  Actually we already know its a mixed up surrogate,
-                // so the ref pSrc isn't gonna do anything.
-                fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc);
-
-                // Ignore it if we don't throw
-                ch = 0;
-                continue;
-            }
-
-            // Count bytes needed
-            int bytesNeeded = 1;
-            if (ch > 0x7F) {
-                if (ch > 0x7FF) {
-                    if (ch > 0xFFFF) {
-                        bytesNeeded++;  // 4 bytes (surrogate pair)
-                    }
-                    bytesNeeded++;      // 3 bytes (800-FFFF)
-                }
-                bytesNeeded++;          // 2 bytes (80-7FF)
-            }
-
-            if (pTarget > pAllocatedBufferEnd - bytesNeeded) {
-                // Left over surrogate from last time will cause pSrc == chars, so we'll throw
-                if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack)
-                {
-                    fallbackBuffer->MovePrevious();              // Didn't use this fallback char
-                    if (ch > 0xFFFF)
-                        fallbackBuffer->MovePrevious();          // Was surrogate, didn't use 2nd part either
-                }
-                else
-                {
-                    pSrc--;                                     // Didn't use this char
-                    if (ch > 0xFFFF)
-                        pSrc--;                                 // Was surrogate, didn't use 2nd part either
-                }
-                Contract::Assert(pSrc >= chars || pTarget == bytes,
-                    "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room.");
-                ThrowBytesOverflow(pTarget == bytes);  // Throw if we must
-                ch = 0;                                         // Nothing left over (we backed up to start of pair if supplimentary)
-                break;
-            }
-
-            if (ch <= 0x7F) {
-                *pTarget = (BYTE)ch;
-            }
-            else {
-                // use separate helper variables for local contexts so that the jit optimizations
-                // won't get confused about the variable lifetimes
-                int chb;
-                if (ch <= 0x7FF) {
-                    // 2 BYTE encoding
-                    chb = (BYTE)(0xC0 | (ch >> 6));
-                }
-                else
-                {
-                    if (ch <= 0xFFFF) {
-                        chb = (BYTE)(0xE0 | (ch >> 12));
-                    }
-                    else
-                    {
-                        *pTarget = (BYTE)(0xF0 | (ch >> 18));
-                        pTarget++;
-
-                        chb = 0x80 | ((ch >> 12) & 0x3F);
-                    }
-                    *pTarget = (BYTE)chb;
-                    pTarget++;
-
-                    chb = 0x80 | ((ch >> 6) & 0x3F);
-                }
-                *pTarget = (BYTE)chb;
-                pTarget++;
-
-                *pTarget = (BYTE)0x80 | (ch & 0x3F);
-            }
-            pTarget++;
-
-
-#ifdef FASTLOOP
-            // If still have fallback don't do fast loop
-            if (fallbackBuffer != nullptr && (ch = fallbackBuffer->InternalGetNextChar()) != 0)
-                goto ProcessChar;
-
-            int availableChars = PtrDiff(pEnd, pSrc);
-            int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget);
-
-            // don't fall into the fast decoding loop if we don't have enough characters
-            // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
-            if (availableChars <= 13) {
-                // we are hoping for 1 BYTE per char
-                if (availableBytes < availableChars) {
-                    // not enough output room.  no pending bits at this point
-                    ch = 0;
-                    continue;
-                }
-
-                // try to get over the remainder of the ascii characters fast though
-                WCHAR* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
-                while (pSrc < pLocalEnd) {
-                    ch = *pSrc;
-                    pSrc++;
-
-                    // Not ASCII, need more than 1 BYTE per char
-                    if (ch > 0x7F)
-                        goto ProcessChar;
-
-                    *pTarget = (BYTE)ch;
-                    pTarget++;
-                }
-                // we are done, let ch be 0 to clear encoder
-                ch = 0;
-                break;
-            }
-
-            // we need at least 1 BYTE per character, but Convert might allow us to convert
-            // only part of the input, so try as much as we can.  Reduce charCount if necessary
-            if (availableBytes < availableChars)
-            {
-                availableChars = availableBytes;
-            }
-
-            // FASTLOOP:
-            // - optimistic range checks
-            // - fallbacks to the slow loop for all special cases, exception throwing, etc.
-
-            // To compute the upper bound, assume that all characters are ASCII characters at this point,
-            //  the boundary will be decreased for every non-ASCII character we encounter
-            // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
-            // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
-            WCHAR *pStop = pSrc + availableChars - 5;
-
-            while (pSrc < pStop) {
-                ch = *pSrc;
-                pSrc++;
-
-                if (ch > 0x7F) {
-                    goto LongCode;
-                }
-                *pTarget = (BYTE)ch;
-                pTarget++;
-
-                // get pSrc aligned
-                if (((size_t)pSrc & 0x2) != 0) {
-                    ch = *pSrc;
-                    pSrc++;
-                    if (ch > 0x7F) {
-                        goto LongCode;
-                    }
-                    *pTarget = (BYTE)ch;
-                    pTarget++;
-                }
-
-                // Run 4 characters at a time!
-                while (pSrc < pStop) {
-                    ch = *(int*)pSrc;
-                    int chc = *(int*)(pSrc + 2);
-                    if (((ch | chc) & (int)0xFF80FF80) != 0) {
-                        goto LongCodeWithMask;
-                    }
-
-                    // Unfortunately, this is endianness sensitive
-#if BIGENDIAN
-                    *pTarget = (BYTE)(ch >> 16);
-                    *(pTarget + 1) = (BYTE)ch;
-                    pSrc += 4;
-                    *(pTarget + 2) = (BYTE)(chc >> 16);
-                    *(pTarget + 3) = (BYTE)chc;
-                    pTarget += 4;
-#else // BIGENDIAN
-                    *pTarget = (BYTE)ch;
-                    *(pTarget + 1) = (BYTE)(ch >> 16);
-                    pSrc += 4;
-                    *(pTarget + 2) = (BYTE)chc;
-                    *(pTarget + 3) = (BYTE)(chc >> 16);
-                    pTarget += 4;
-#endif // BIGENDIAN
-                }
-                continue;
-
-            LongCodeWithMask:
-#if BIGENDIAN
-                // be careful about the sign extension
-                ch = (int)(((uint)ch) >> 16);
-#else // BIGENDIAN
-                ch = (WCHAR)ch;
-#endif // BIGENDIAN
-                pSrc++;
-
-                if (ch > 0x7F) {
-                    goto LongCode;
-                }
-                *pTarget = (BYTE)ch;
-                pTarget++;
-                continue;
-
-            LongCode:
-                // use separate helper variables for slow and fast loop so that the jit optimizations
-                // won't get confused about the variable lifetimes
-                int chd;
-                if (ch <= 0x7FF) {
-                    // 2 BYTE encoding
-                    chd = 0xC0 | (ch >> 6);
-                }
-                else {
-                    if (!InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
-                        // 3 BYTE encoding
-                        chd = 0xE0 | (ch >> 12);
-                    }
-                    else
-                    {
-                        // 4 BYTE encoding - high surrogate + low surrogate
-                        if (ch > CharUnicodeInfo::HIGH_SURROGATE_END) {
-                            // low without high -> bad, try again in slow loop
-                            pSrc -= 1;
-                            break;
-                        }
-
-                        chd = *pSrc;
-                        pSrc++;
-
-                        // if (!IsLowSurrogate(chd)) {
-                        if (!InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
-                            // high not followed by low -> bad, try again in slow loop
-                            pSrc -= 2;
-                            break;
-                        }
-
-                        ch = chd + (ch << 10) +
-                            (0x10000
-                            - CharUnicodeInfo::LOW_SURROGATE_START
-                            - (CharUnicodeInfo::HIGH_SURROGATE_START << 10));
-
-                        *pTarget = (BYTE)(0xF0 | (ch >> 18));
-                        // pStop - this BYTE is compensated by the second surrogate character
-                        // 2 input chars require 4 output bytes.  2 have been anticipated already
-                        // and 2 more will be accounted for by the 2 pStop-- calls below.
-                        pTarget++;
-
-                        chd = 0x80 | ((ch >> 12) & 0x3F);
-                    }
-                    *pTarget = (BYTE)chd;
-                    pStop--;                    // 3 BYTE sequence for 1 char, so need pStop-- and the one below too.
-                    pTarget++;
-
-                    chd = 0x80 | ((ch >> 6) & 0x3F);
-                }
-                *pTarget = (BYTE)chd;
-                pStop--;                        // 2 BYTE sequence for 1 char so need pStop--.
-                pTarget++;
-
-                *pTarget = (BYTE)(0x80 | (ch & 0x3F));
-                // pStop - this BYTE is already included
-                pTarget++;
-            }
-
-            Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd");
-
-#endif // FASTLOOP
-
-            // no pending char at this point
-            ch = 0;
-        }
-
-        InternalDelete(fallbackBuffer);
-
-        return (int)(pTarget - bytes);
-    }
-
-    int GetByteCount(WCHAR *chars, int count)
-    {
-        // For fallback we may need a fallback buffer.
-        // We wait to initialize it though in case we don't have any broken input unicode
-        EncoderFallbackBuffer* fallbackBuffer = nullptr;
-        WCHAR *pSrc = chars;
-        WCHAR *pEnd = pSrc + count;
-
-        // Start by assuming we have as many as count
-        int byteCount = count;
-
-        int ch = 0;
-
-        while (true) {
-            // SLOWLOOP: does all range checks, handles all special cases, but it is slow
-            if (pSrc >= pEnd) {
-
-                if (ch == 0) {
-                    // Unroll any fallback that happens at the end
-                    ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : 0;
-                    if (ch > 0) {
-                        byteCount++;
-                        goto ProcessChar;
-                    }
-                }
-                else {
-                    // Case of surrogates in the fallback.
-                    if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) {
-                        Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF,
-                            "[UTF8Encoding.GetBytes]expected high surrogate");// , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
-
-                        ch = fallbackBuffer->InternalGetNextChar();
-                        byteCount++;
-
-                        if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
-                            ch = 0xfffd;
-                            byteCount++;
-                            goto EncodeChar;
-                        }
-                        else if (ch > 0){
-                            goto ProcessChar;
-                        }
-                        else {
-                            byteCount--; // ignore last one.
-                            break;
-                        }
-                    }
-                }
-
-                if (ch <= 0) {
-                    break;
-                }
-
-                // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
-                byteCount++;
-                goto EncodeChar;
-            }
-
-            if (ch > 0) {
-                Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF,
-                    "[UTF8Encoding.GetBytes]expected high surrogate"); // , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture));
-
-                // use separate helper variables for local contexts so that the jit optimizations
-                // won't get confused about the variable lifetimes
-                int cha = *pSrc;
-
-                // count the pending surrogate
-                byteCount++;
-
-                // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
-                // if (IsLowSurrogate(cha)) {
-                if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
-                    // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
-                    ch = 0xfffd;
-                    //                        ch = cha + (ch << 10) +
-                    //                            (0x10000
-                    //                            - CharUnicodeInfo::LOW_SURROGATE_START
-                    //                            - (CharUnicodeInfo::HIGH_SURROGATE_START << 10) );
-
-                    // Use this next char
-                    pSrc++;
-                }
-                // else ch is still high surrogate and encoding will fail (so don't add count)
-
-                // attempt to encode the surrogate or partial surrogate
-                goto EncodeChar;
-            }
-
-            // If we've used a fallback, then we have to check for it
-            if (fallbackBuffer != nullptr)
-            {
-                ch = fallbackBuffer->InternalGetNextChar();
-                if (ch > 0)
-                {
-                    // We have an extra byte we weren't expecting.
-                    byteCount++;
-                    goto ProcessChar;
-                }
-            }
-
-            // read next char. The JIT optimization seems to be getting confused when
-            // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
-            ch = *pSrc;
-            pSrc++;
-
-        ProcessChar:
-            if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) {
-                // we will count this surrogate next time around
-                byteCount--;
-                continue;
-            }
-            // either good char or partial surrogate
-
-        EncodeChar:
-            // throw exception on partial surrogate if necessary
-            if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END))
-            {
-                // Lone surrogates aren't allowed
-                // Have to make a fallback buffer if we don't have one
-                if (fallbackBuffer == nullptr)
-                {
-                    // wait on fallbacks if we can
-                    // For fallback we may need a fallback buffer
-                    fallbackBuffer = encoderFallback->CreateFallbackBuffer();
-
-                    // Set our internal fallback interesting things.
-                    fallbackBuffer->InternalInitialize(chars, chars + count, false);
-                }
-
-                // Do our fallback.  Actually we already know its a mixed up surrogate,
-                // so the ref pSrc isn't gonna do anything.
-                fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc);
-
-                // Ignore it if we don't throw (we had preallocated this ch)
-                byteCount--;
-                ch = 0;
-                continue;
-            }
-
-            // Count them
-            if (ch > 0x7F) {
-                if (ch > 0x7FF) {
-                    // the extra surrogate byte was compensated by the second surrogate character
-                    // (2 surrogates make 4 bytes.  We've already counted 2 bytes, 1 per char)
-                    byteCount++;
-                }
-                byteCount++;
-            }
-
-#if WIN64
-            // check for overflow
-            if (byteCount < 0) {
-                break;
-            }
-#endif
-
-#ifdef FASTLOOP
-            // If still have fallback don't do fast loop
-            if (fallbackBuffer != nullptr && (ch = fallbackBuffer->InternalGetNextChar()) != 0)
-            {
-                // We're reserving 1 byte for each char by default
-                byteCount++;
-                goto ProcessChar;
-            }
-
-            int availableChars = PtrDiff(pEnd, pSrc);
-
-            // don't fall into the fast decoding loop if we don't have enough characters
-            if (availableChars <= 13) {
-                // try to get over the remainder of the ascii characters fast though
-                WCHAR* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
-                while (pSrc < pLocalEnd) {
-                    ch = *pSrc;
-                    pSrc++;
-                    if (ch > 0x7F)
-                        goto ProcessChar;
-                }
-
-                // we are done
-                break;
-            }
-
-#if WIN64
-            // make sure that we won't get a silent overflow inside the fast loop
-            // (Fall out to slow loop if we have this many characters)
-            availableChars &= 0x0FFFFFFF;
-#endif
-
-            // To compute the upper bound, assume that all characters are ASCII characters at this point,
-            //  the boundary will be decreased for every non-ASCII character we encounter
-            // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
-            WCHAR *pStop = pSrc + availableChars - (3 + 4);
-
-            while (pSrc < pStop) {
-                ch = *pSrc;
-                pSrc++;
-
-                if (ch > 0x7F)                                                  // Not ASCII
-                {
-                    if (ch > 0x7FF)                                             // Not 2 Byte
-                    {
-                        if ((ch & 0xF800) == 0xD800)                            // See if its a Surrogate
-                            goto LongCode;
-                        byteCount++;
-                    }
-                    byteCount++;
-                }
-
-                // get pSrc aligned
-                if (((size_t)pSrc & 0x2) != 0) {
-                    ch = *pSrc;
-                    pSrc++;
-                    if (ch > 0x7F)                                              // Not ASCII
-                    {
-                        if (ch > 0x7FF)                                         // Not 2 Byte
-                        {
-                            if ((ch & 0xF800) == 0xD800)                        // See if its a Surrogate
-                                goto LongCode;
-                            byteCount++;
-                        }
-                        byteCount++;
-                    }
-                }
-
-                // Run 2 * 4 characters at a time!
-                while (pSrc < pStop) {
-                    ch = *(int*)pSrc;
-                    int chc = *(int*)(pSrc + 2);
-                    if (((ch | chc) & (int)0xFF80FF80) != 0)         // See if not ASCII
-                    {
-                        if (((ch | chc) & (int)0xF800F800) != 0)     // See if not 2 Byte
-                        {
-                            goto LongCodeWithMask;
-                        }
-
-
-                        if ((ch & (int)0xFF800000) != 0)             // Actually 0x07800780 is all we care about (4 bits)
-                            byteCount++;
-                        if ((ch & (int)0xFF80) != 0)
-                            byteCount++;
-                        if ((chc & (int)0xFF800000) != 0)
-                            byteCount++;
-                        if ((chc & (int)0xFF80) != 0)
-                            byteCount++;
-                    }
-                    pSrc += 4;
-
-                    ch = *(int*)pSrc;
-                    chc = *(int*)(pSrc + 2);
-                    if (((ch | chc) & (int)0xFF80FF80) != 0)         // See if not ASCII
-                    {
-                        if (((ch | chc) & (int)0xF800F800) != 0)     // See if not 2 Byte
-                        {
-                            goto LongCodeWithMask;
-                        }
-
-                        if ((ch & (int)0xFF800000) != 0)
-                            byteCount++;
-                        if ((ch & (int)0xFF80) != 0)
-                            byteCount++;
-                        if ((chc & (int)0xFF800000) != 0)
-                            byteCount++;
-                        if ((chc & (int)0xFF80) != 0)
-                            byteCount++;
-                    }
-                    pSrc += 4;
-                }
-                break;
-
-            LongCodeWithMask:
-#if BIGENDIAN
-                // be careful about the sign extension
-                ch = (int)(((uint)ch) >> 16);
-#else // BIGENDIAN
-                ch = (WCHAR)ch;
-#endif // BIGENDIAN
-                pSrc++;
-
-                if (ch <= 0x7F) {
-                    continue;
-                }
-
-            LongCode:
-                // use separate helper variables for slow and fast loop so that the jit optimizations
-                // won't get confused about the variable lifetimes
-                if (ch > 0x7FF) {
-                    if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
-                        // 4 byte encoding - high surrogate + low surrogate
-
-                        int chd = *pSrc;
-                        if (
-                            ch > CharUnicodeInfo::HIGH_SURROGATE_END ||
-                            !InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END))
-                        {
-                            // Back up and drop out to slow loop to figure out error
-                            pSrc--;
-                            break;
-                        }
-                        pSrc++;
-
-                        // byteCount - this byte is compensated by the second surrogate character
-                    }
-                    byteCount++;
-                }
-                byteCount++;
-
-                // byteCount - the last byte is already included
-            }
-#endif // FASTLOOP
-
-            // no pending char at this point
-            ch = 0;
-        }
-
-#if WIN64
-        // check for overflow
-        if (byteCount < 0) {
-            throw ArgumentException("Conversion buffer overflow.");
-        }
-#endif
-
-        Contract::Assert(fallbackBuffer == nullptr || fallbackBuffer->GetRemaining() == 0,
-            "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer");
-
-        InternalDelete(fallbackBuffer);
-
-        return byteCount;
-    }
-
-};
-
-
-////////////////////////////////////////////////////////////////////////////
-//
-//  UTF8ToUnicode
-//
-//  Maps a UTF-8 character string to its wide character string counterpart.
-//
-////////////////////////////////////////////////////////////////////////////
-
-int UTF8ToUnicode(
-    LPCSTR lpSrcStr,
-    int cchSrc,
-    LPWSTR lpDestStr,
-    int cchDest,
-    DWORD dwFlags
-    )
-{
-    int ret;
-    UTF8Encoding enc(dwFlags & MB_ERR_INVALID_CHARS);
-    try {
-        ret = enc.GetCharCount((BYTE*)lpSrcStr, cchSrc);
-        if (cchDest){
-            if (ret > cchDest){
-                SetLastError(ERROR_INSUFFICIENT_BUFFER);
-                ret = 0;
-            }
-            enc.GetChars((BYTE*)lpSrcStr, cchSrc, (WCHAR*)lpDestStr, ret);
-        }
-    }
-    catch (const InsufficientBufferException& e){
-        SetLastError(ERROR_INSUFFICIENT_BUFFER);
-        return 0;
-    }
-    catch (const DecoderFallbackException& e){
-        SetLastError(ERROR_NO_UNICODE_TRANSLATION);
-        return 0;
-    }
-    catch (const ArgumentException& e){
-        SetLastError(ERROR_INVALID_PARAMETER);
-        return 0;
-    }
-    return ret;
-}
-
-////////////////////////////////////////////////////////////////////////////
-//
-//  UnicodeToUTF8
-//
-//  Maps a Unicode character string to its UTF-8 string counterpart.
-//
-////////////////////////////////////////////////////////////////////////////
-
-int UnicodeToUTF8(
-    LPCWSTR lpSrcStr,
-    int cchSrc,
-    LPSTR lpDestStr,
-    int cchDest)
-{
-    int ret;
-    UTF8Encoding enc(false);
-    try{
-        ret = enc.GetByteCount((WCHAR*)lpSrcStr, cchSrc);
-        if (cchDest){
-            if (ret > cchDest){
-                SetLastError(ERROR_INSUFFICIENT_BUFFER);
-                ret = 0;
-            }
-            enc.GetBytes((WCHAR*)lpSrcStr, cchSrc, (BYTE*)lpDestStr, ret);
-        }
-    }
-    catch (const InsufficientBufferException& e){
-        SetLastError(ERROR_INSUFFICIENT_BUFFER);
-        return 0;
-    }
-    catch (const EncoderFallbackException& e){
-        SetLastError(ERROR_NO_UNICODE_TRANSLATION);
-        return 0;
-    }
-    catch (const ArgumentException& e){
-        SetLastError(ERROR_INVALID_PARAMETER);
-        return 0;
-    }
-    return ret;
-}
diff --git a/src/coreclr/vm/rtlfunctions.cpp b/src/coreclr/vm/rtlfunctions.cpp

index 23f662b..f3f8033 100644 (file)
--- a/src/coreclr/vm/rtlfunctions.cpp
+++ b/src/coreclr/vm/rtlfunctions.cpp
@@ -103,7 +103,7 @@ VOID InstallEEFunctionTable (
          }
          else
          {
-            NewArrayHolder<WCHAR> wzTempName(DuplicateStringThrowing(ssTempName.GetUnicode()));
+            NewArrayHolder<WCHAR> wzTempName(ssTempName.GetCopyOfUnicodeString());
  
              // publish result
              if (InterlockedCompareExchangeT(&wszModuleName, (LPWSTR)wzTempName, nullptr) == nullptr)
diff --git a/src/mono/mono/eglib/CMakeLists.txt b/src/mono/mono/eglib/CMakeLists.txt

index 3de4a9c..09cf32e 100644 (file)
--- a/src/mono/mono/eglib/CMakeLists.txt
+++ b/src/mono/mono/eglib/CMakeLists.txt
@@ -33,7 +33,12 @@ set(eglib_common_sources
      gspawn.c
      gfile.c
      gfile-posix.c
-    gutf8.c)
+    gutf8.c
+    ${CLR_SRC_NATIVE_DIR}/minipal/utf8.c)
+
+if(IS_BIG_ENDIAN)
+  set_source_files_properties("${CLR_SRC_NATIVE_DIR}/minipal/utf8.c" PROPERTIES COMPILE_FLAGS "-DBIGENDIAN=1")
+endif()
  
  set(eglib_headers
    glib.h
@@ -41,7 +46,7 @@ set(eglib_headers
    gmodule.h)
  
  if(HAVE_CLOCK_NANOSLEEP)
-list(APPEND eglib_common_sources gclock-nanosleep.c)
+  list(APPEND eglib_common_sources gclock-nanosleep.c)
  endif()
  
  set(eglib_sources "${eglib_platform_sources};${eglib_common_sources}")
diff --git a/src/mono/mono/eglib/giconv.c b/src/mono/mono/eglib/giconv.c

index 664ad31..8ae955c 100644 (file)
--- a/src/mono/mono/eglib/giconv.c
+++ b/src/mono/mono/eglib/giconv.c
@@ -28,132 +28,20 @@
  #include <errno.h>
  #include "../utils/mono-errno.h"
  
+#include <minipal/utf8.h>
+
  #ifdef _MSC_VER
  #define FORCE_INLINE(RET_TYPE) __forceinline RET_TYPE
  #else
  #define FORCE_INLINE(RET_TYPE) inline RET_TYPE __attribute__((always_inline))
  #endif
  
-
-#define UNROLL_DECODE_UTF8 0
-#define UNROLL_ENCODE_UTF8 0
-
-static int decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar);
-static int encode_utf32be (gunichar c, char *outbuf, size_t outleft);
-
-static int decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar);
-static int encode_utf32le (gunichar c, char *outbuf, size_t outleft);
-
-static int decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar);
-static int encode_utf16be (gunichar c, char *outbuf, size_t outleft);
-
-static int decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar);
-static int encode_utf16le (gunichar c, char *outbuf, size_t outleft);
-
-static FORCE_INLINE (int) decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar);
-static int encode_utf8 (gunichar c, char *outbuf, size_t outleft);
-
-static int decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar);
-static int encode_latin1 (gunichar c, char *outbuf, size_t outleft);
-
  #if G_BYTE_ORDER == G_LITTLE_ENDIAN
-#define decode_utf32 decode_utf32le
-#define encode_utf32 encode_utf32le
  #define decode_utf16 decode_utf16le
-#define encode_utf16 encode_utf16le
  #else
-#define decode_utf32 decode_utf32be
-#define encode_utf32 encode_utf32be
  #define decode_utf16 decode_utf16be
-#define encode_utf16 encode_utf16be
  #endif
  
-/*
- * Unicode encoders and decoders
- */
-
-static FORCE_INLINE (uint32_t)
-read_uint32_endian (unsigned char *inptr, unsigned endian)
-{
-       if (endian == G_LITTLE_ENDIAN)
-               return (inptr[3] << 24) | (inptr[2] << 16) | (inptr[1] << 8) | inptr[0];
-       return (inptr[0] << 24) | (inptr[1] << 16) | (inptr[2] << 8) | inptr[3];
-}
-
-static int
-decode_utf32_endian (char *inbuf, size_t inleft, gunichar *outchar, unsigned endian)
-{
-       unsigned char *inptr = (unsigned char *) inbuf;
-       gunichar c;
-
-       if (inleft < 4) {
-               mono_set_errno (EINVAL);
-               return -1;
-       }
-
-       c = read_uint32_endian (inptr, endian);
-
-       if (c >= 0xd800 && c < 0xe000) {
-               mono_set_errno (EILSEQ);
-               return -1;
-       } else if (c >= 0x110000) {
-               mono_set_errno (EILSEQ);
-               return -1;
-       }
-
-       *outchar = c;
-
-       return 4;
-}
-
-static int
-decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar)
-{
-       return decode_utf32_endian (inbuf, inleft, outchar, G_BIG_ENDIAN);
-}
-
-static int
-decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar)
-{
-       return decode_utf32_endian (inbuf, inleft, outchar, G_LITTLE_ENDIAN);
-}
-
-static int
-encode_utf32be (gunichar c, char *outbuf, size_t outleft)
-{
-       unsigned char *outptr = (unsigned char *) outbuf;
-
-       if (outleft < 4) {
-               mono_set_errno (E2BIG);
-               return -1;
-       }
-
-       outptr[0] = (c >> 24) & 0xff;
-       outptr[1] = (c >> 16) & 0xff;
-       outptr[2] = (c >> 8) & 0xff;
-       outptr[3] = c & 0xff;
-
-       return 4;
-}
-
-static int
-encode_utf32le (gunichar c, char *outbuf, size_t outleft)
-{
-       unsigned char *outptr = (unsigned char *) outbuf;
-
-       if (outleft < 4) {
-               mono_set_errno (E2BIG);
-               return -1;
-       }
-
-       outptr[0] = c & 0xff;
-       outptr[1] = (c >> 8) & 0xff;
-       outptr[2] = (c >> 16) & 0xff;
-       outptr[3] = (c >> 24) & 0xff;
-
-       return 4;
-}
-
  static FORCE_INLINE (uint16_t)
  read_uint16_endian (unsigned char *inptr, unsigned endian)
  {
@@ -234,50 +122,6 @@ write_uint16_endian (unsigned char *outptr, uint16_t c, unsigned endian)
  }
  
  static FORCE_INLINE (int)
-encode_utf16_endian (gunichar c, char *outbuf, size_t outleft, unsigned endian)
-{
-       unsigned char *outptr = (unsigned char *) outbuf;
-       gunichar2 ch;
-       gunichar c2;
-
-       if (c < 0x10000) {
-               if (outleft < 2) {
-                       mono_set_errno (E2BIG);
-                       return -1;
-               }
-
-               write_uint16_endian (outptr, GUNICHAR_TO_UINT16 (c), endian);
-               return 2;
-       } else {
-               if (outleft < 4) {
-                       mono_set_errno (E2BIG);
-                       return -1;
-               }
-
-               c2 = c - 0x10000;
-
-               ch = (gunichar2) ((c2 >> 10) + 0xd800);
-               write_uint16_endian (outptr, ch, endian);
-
-               ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
-               write_uint16_endian (outptr + 2, ch, endian);
-               return 4;
-       }
-}
-
-static int
-encode_utf16be (gunichar c, char *outbuf, size_t outleft)
-{
-       return encode_utf16_endian (c, outbuf, outleft, G_BIG_ENDIAN);
-}
-
-static int
-encode_utf16le (gunichar c, char *outbuf, size_t outleft)
-{
-       return encode_utf16_endian (c, outbuf, outleft, G_LITTLE_ENDIAN);
-}
-
-static FORCE_INLINE (int)
  decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar)
  {
         unsigned char *inptr = (unsigned char *) inbuf;
@@ -336,89 +180,6 @@ decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar)
         return GSIZE_TO_INT(n);
  }
  
-static int
-encode_utf8 (gunichar c, char *outbuf, size_t outleft)
-{
-       unsigned char *outptr = (unsigned char *) outbuf;
-       int base;
-       size_t n;
-
-       if (c < 0x80) {
-               outptr[0] = GUNICHAR_TO_UINT8 (c);
-               return 1;
-       } else if (c < 0x800) {
-               base = 192;
-               n = 2;
-       } else if (c < 0x10000) {
-               base = 224;
-               n = 3;
-       } else if (c < 0x200000) {
-               base = 240;
-               n = 4;
-       } else if (c < 0x4000000) {
-               base = 248;
-               n = 5;
-       } else {
-               base = 252;
-               n = 6;
-       }
-
-       if (outleft < n) {
-               mono_set_errno (E2BIG);
-               return -1;
-       }
-
-#if UNROLL_ENCODE_UTF8
-       switch (n) {
-       case 6: outptr[5] = (c & 0x3f) | 0x80; c >>= 6;
-       case 5: outptr[4] = (c & 0x3f) | 0x80; c >>= 6;
-       case 4: outptr[3] = (c & 0x3f) | 0x80; c >>= 6;
-       case 3: outptr[2] = (c & 0x3f) | 0x80; c >>= 6;
-       case 2: outptr[1] = (c & 0x3f) | 0x80; c >>= 6;
-       case 1: outptr[0] = c | base;
-       }
-#else
-       for (size_t i = n - 1; i > 0; i--) {
-               outptr[i] = (c & 0x3f) | 0x80;
-               c >>= 6;
-       }
-
-       outptr[0] = GUNICHAR_TO_UINT8 (c | base);
-#endif
-
-       return GSIZE_TO_INT(n);
-}
-
-static int
-decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar)
-{
-       *outchar = (unsigned char) *inbuf;
-       return 1;
-}
-
-static int
-encode_latin1 (gunichar c, char *outbuf, size_t outleft)
-{
-       if (outleft < 1) {
-               mono_set_errno (E2BIG);
-               return -1;
-       }
-
-       if (c > 0xff) {
-               mono_set_errno (EILSEQ);
-               return -1;
-       }
-
-       *outbuf = (char) c;
-
-       return 1;
-}
-
-
-/*
- * Simple conversion API
- */
-
  static gpointer error_quark = (gpointer)"ConvertError";
  
  gpointer
@@ -426,9 +187,6 @@ g_convert_error_quark (void)
  {
         return error_quark;
  }
-/*
- * Unicode conversion
- */
  
  /**
   * An explanation of the conversion can be found at:
@@ -559,162 +317,114 @@ g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written)
         return outbuf;
  }
  
-static gunichar2 *
-eg_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, gboolean replace_invalid_codepoints, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, unsigned endian)
+static FORCE_INLINE (void)
+map_error(GError **err)
  {
-       gunichar2 *outbuf, *outptr;
-       size_t outlen = 0;
-       size_t inleft;
-       char *inptr;
-       gunichar c;
-       int u, n;
-
-       g_return_val_if_fail (str != NULL, NULL);
-
-       if (len < 0) {
-               if (include_nuls) {
-                       g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "Conversions with embedded nulls must pass the string length");
-                       return NULL;
-               }
-
-               len = (glong)strlen (str);
+       if (errno == MINIPAL_ERROR_INSUFFICIENT_BUFFER) {
+               g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY, "Allocation failed.");
+       } else if (errno == MINIPAL_ERROR_NO_UNICODE_TRANSLATION) {
+               g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "Illegal byte sequence encountered in the input.");
         }
+}
  
-       inptr = (char *) str;
-       inleft = len;
-
-       while (inleft > 0) {
-               if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
-                       goto error;
-
-               if (c == 0 && !include_nuls)
-                       break;
+static gunichar2 *
+g_utf8_to_utf16_impl (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err, int flags, bool treatAsLE)
+{
+       errno = 0;
+       gunichar2* lpDestStr = NULL;
+#if G_BYTE_ORDER == G_BIG_ENDIAN
+       if (treatAsLE)
+               flags |= MINIPAL_TREAT_AS_LITTLE_ENDIAN;
+#endif
  
-               if ((u = g_unichar_to_utf16_endian (c, NULL, endian)) < 0) {
-                       if (replace_invalid_codepoints) {
-                               u = 2;
-                       } else {
-                               mono_set_errno (EILSEQ);
-                               goto error;
-                       }
-               }
+       if (len < 0)
+               len = (glong)strlen(str) + 1;
  
-               outlen += u;
-               inleft -= n;
-               inptr += n;
-       }
+       glong ret = (glong)minipal_get_length_utf8_to_utf16 (str, len, flags);
  
-       if (items_read)
-               *items_read = GPTRDIFF_TO_LONG (inptr - str);
+       map_error(err);
  
         if (items_written)
-               *items_written = (glong)outlen;
+               *items_written = errno == 0 ? ret : 0;
  
-       if (G_LIKELY (!custom_alloc_func))
-               outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2));
-       else
-               outptr = outbuf = (gunichar2 *)custom_alloc_func ((outlen + 1) * sizeof (gunichar2), custom_alloc_data);
+       if (ret <= 0)
+               return NULL;
  
-       if (G_UNLIKELY (custom_alloc_func && !outbuf)) {
-               mono_set_errno (ENOMEM);
-               goto error;
-       }
+       lpDestStr = malloc((ret + 1) * sizeof(gunichar2));
+       ret = (glong)minipal_convert_utf8_to_utf16 (str, len, lpDestStr, ret, flags);
+       lpDestStr[ret] = '\0';
  
-       inptr = (char *) str;
-       inleft = len;
+       if (items_written)
+               *items_written = errno == 0 ? ret : 0;
  
-       while (inleft > 0) {
-               if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
-                       break;
+       map_error(err);
+       return lpDestStr;
+}
  
-               if (c == 0 && !include_nuls)
-                       break;
+static gunichar2 *
+g_utf8_to_utf16le_custom_alloc_impl (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, bool treatAsLE)
+{
+       guint flags = 0;
+       errno = 0;
+#if G_BYTE_ORDER == G_BIG_ENDIAN
+       if (treatAsLE)
+               flags = MINIPAL_TREAT_AS_LITTLE_ENDIAN;
+#endif
+       if (len < 0)
+               len = (glong)strlen(str) + 1;
  
-               u = g_unichar_to_utf16_endian (c, outptr, endian);
-               if ((u < 0) && replace_invalid_codepoints) {
-                       outptr[0] = 0xFFFD;
-                       outptr[1] = 0xFFFD;
-                       u = 2;
-               }
+       glong ret = (glong)minipal_get_length_utf8_to_utf16 (str, len, flags);
  
-               outptr += u;
-               inleft -= n;
-               inptr += n;
-       }
+       map_error(err);
  
-       *outptr = '\0';
+       if (items_written)
+               *items_written = errno == 0 ? ret : 0;
  
-       return outbuf;
+       if (ret <= 0)
+               return NULL;
  
-error:
-       if (errno == ENOMEM) {
-               g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY,
-                            "Allocation failed.");
-       } else if (errno == EILSEQ) {
-               g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
-                            "Illegal byte sequence encountered in the input.");
-       } else if (items_read) {
-               /* partial input is ok if we can let our caller know... */
-       } else {
-               g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
-                            "Partial byte sequence encountered in the input.");
+       gunichar2 *lpDestStr = custom_alloc_func((ret + 1) * sizeof(gunichar2), custom_alloc_data);
+       if (G_UNLIKELY (!lpDestStr)) {
+               g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY, "Allocation failed.");
+               return NULL;
         }
  
-       if (items_read)
-               *items_read = GPTRDIFF_TO_LONG (inptr - str);
-
-       if (items_written)
-               *items_written = 0;
+       flags |= MINIPAL_MB_NO_REPLACE_INVALID_CHARS;
+       ret = (glong)minipal_convert_utf8_to_utf16 (str, len, lpDestStr, ret, flags);
+       lpDestStr[ret] = '\0';
  
-       return NULL;
+       map_error(err);
+       return lpDestStr;
  }
  
  gunichar2 *
  g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
  {
-       return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, NULL, NULL, err, G_BYTE_ORDER);
-}
-
-gunichar2 *
-g_utf8_to_utf16be (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
-{
-       return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, NULL, NULL, err, G_BIG_ENDIAN);
+       return g_utf8_to_utf16_impl (str, len, items_read, items_written, err, MINIPAL_MB_NO_REPLACE_INVALID_CHARS, false);
  }
  
  gunichar2 *
  g_utf8_to_utf16le (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
  {
-       return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, NULL, NULL, err, G_LITTLE_ENDIAN);
+       return g_utf8_to_utf16_impl (str, len, items_read, items_written, err, MINIPAL_MB_NO_REPLACE_INVALID_CHARS, true);
  }
  
  gunichar2 *
-g_utf8_to_utf16_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err)
+eg_wtf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
  {
-       return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, custom_alloc_func, custom_alloc_data, err, G_BYTE_ORDER);
+       return g_utf8_to_utf16_impl (str, len, items_read, items_written, err, 0, false);
  }
  
  gunichar2 *
-g_utf8_to_utf16be_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err)
+g_utf8_to_utf16_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err)
  {
-       return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, custom_alloc_func, custom_alloc_data, err, G_BIG_ENDIAN);
+       return g_utf8_to_utf16le_custom_alloc_impl (str, len, items_read, items_written, custom_alloc_func, custom_alloc_data, err, false);
  }
  
  gunichar2 *
  g_utf8_to_utf16le_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err)
  {
-       return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, custom_alloc_func, custom_alloc_data, err, G_LITTLE_ENDIAN);
-}
-
-gunichar2 *
-eg_utf8_to_utf16_with_nuls (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
-{
-       return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, FALSE, NULL, NULL, err, G_BYTE_ORDER);
-}
-
-gunichar2 *
-eg_wtf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
-{
-       return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, TRUE, NULL, NULL, err, G_BYTE_ORDER);
+       return g_utf8_to_utf16le_custom_alloc_impl (str, len, items_read, items_written, custom_alloc_func, custom_alloc_data, err, true);
  }
  
  gunichar *
@@ -789,120 +499,89 @@ g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_wri
         return outbuf;
  }
  
-static
-gchar *
-eg_utf16_to_utf8_general (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, unsigned endian)
+static gchar *
+g_utf16_to_utf8_impl (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err, bool treatAsLE)
  {
-       char *inptr, *outbuf, *outptr;
-       size_t outlen = 0;
-       size_t inleft;
-       gunichar c;
-       int n;
-
-       g_return_val_if_fail (str != NULL, NULL);
-
+       guint flags = 0;
+       errno = 0;
+       gchar* lpDestStr = NULL;
+#if G_BYTE_ORDER == G_BIG_ENDIAN
+       if (treatAsLE)
+               flags |= MINIPAL_TREAT_AS_LITTLE_ENDIAN;
+#endif
         if (len < 0) {
                 len = 0;
                 while (str[len])
                         len++;
-       }
-
-       inptr = (char *) str;
-       inleft = len * 2;
-
-       while (inleft > 0) {
-               if ((n = decode_utf16_endian (inptr, inleft, &c, endian)) < 0) {
-                       if (n == -2 && inleft > 2) {
-                               /* This means that the first UTF-16 char was read, but second failed */
-                               inleft -= 2;
-                               inptr += 2;
-                       }
  
-                       if (errno == EILSEQ) {
-                               g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
-                                            "Illegal byte sequence encountered in the input.");
-                       } else if (items_read) {
-                               /* partial input is ok if we can let our caller know... */
-                               break;
-                       } else {
-                               g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
-                                            "Partial byte sequence encountered in the input.");
-                       }
-
-                       if (items_read)
-                               *items_read = GPTRDIFF_TO_LONG ((inptr - (char *) str) / 2);
-
-                       if (items_written)
-                               *items_written = 0;
-
-                       return NULL;
-               } else if (c == 0)
-                       break;
-
-               outlen += g_unichar_to_utf8 (c, NULL);
-               inleft -= n;
-               inptr += n;
+               len++;
         }
  
-       if (items_read)
-               *items_read = GPTRDIFF_TO_LONG ((inptr - (char *) str) / 2);
+       glong ret = (glong)minipal_get_length_utf16_to_utf8 (str, len, flags);
+       map_error(err);
  
         if (items_written)
-               *items_written = (glong)outlen;
-
-       if (G_LIKELY (!custom_alloc_func))
-               outptr = outbuf = g_malloc (outlen + 1);
-       else
-               outptr = outbuf = (char *)custom_alloc_func (outlen + 1, custom_alloc_data);
+               *items_written = errno == 0 ? ret : 0;
  
-       if (G_UNLIKELY (custom_alloc_func && !outbuf)) {
-               g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY, "Allocation failed.");
-               if (items_written)
-                       *items_written = 0;
+       if (ret <= 0)
                 return NULL;
-       }
-
-       inptr = (char *) str;
-       inleft = len * 2;
-
-       while (inleft > 0) {
-               if ((n = decode_utf16_endian (inptr, inleft, &c, endian)) < 0)
-                       break;
-               else if (c == 0)
-                       break;
  
-               outptr += g_unichar_to_utf8 (c, outptr);
-               inleft -= n;
-               inptr += n;
-       }
+       lpDestStr = (gchar *)g_malloc((ret + 1) * sizeof(gchar));
+       ret = (glong)minipal_convert_utf16_to_utf8 (str, len, lpDestStr, ret, flags);
+       lpDestStr[ret] = '\0';
  
-       *outptr = '\0';
+       if (items_written)
+               *items_written = errno == 0 ? ret : 0;
  
-       return outbuf;
+       map_error(err);
+       return lpDestStr;
  }
  
  gchar *
  g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
  {
-       return eg_utf16_to_utf8_general (str, len, items_read, items_written, NULL, NULL, err, G_BYTE_ORDER);
+       return g_utf16_to_utf8_impl (str, len, items_read, items_written, err, /* treatAsLE */ false);
  }
  
  gchar *
  g_utf16le_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
  {
-       return eg_utf16_to_utf8_general (str, len, items_read, items_written, NULL, NULL, err, G_LITTLE_ENDIAN);
-}
-
-gchar *
-g_utf16be_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
-{
-       return eg_utf16_to_utf8_general (str, len, items_read, items_written, NULL, NULL, err, G_BIG_ENDIAN);
+       return g_utf16_to_utf8_impl (str, len, items_read, items_written, err, /* treatAsLE */ true);
  }
  
  gchar *
  g_utf16_to_utf8_custom_alloc (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err)
  {
-       return eg_utf16_to_utf8_general (str, len, items_read, items_written, custom_alloc_func, custom_alloc_data, err, G_BYTE_ORDER);
+       errno = 0;
+
+       if (len < 0) {
+               len = 0;
+               while (str[len])
+                       len++;
+
+               len++;
+       }
+
+       glong ret = (glong)minipal_get_length_utf16_to_utf8 (str, len, 0);
+       map_error(err);
+
+       if (items_written)
+               *items_written = errno == 0 ? ret : 0;
+
+       if (ret <= 0)
+               return NULL;
+
+       gchar *lpDestStr = custom_alloc_func((ret + 1) * sizeof (gunichar2), custom_alloc_data);
+       if (G_UNLIKELY (!lpDestStr)) {
+               g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY, "Allocation failed.");
+               return NULL;
+       }
+
+       ret = (glong)minipal_convert_utf16_to_utf8 (str, len, lpDestStr, ret, 0);
+       lpDestStr[ret] = '\0';
+
+       map_error(err);
+       return lpDestStr;
  }
  
  gunichar *
diff --git a/src/mono/mono/eglib/glib.h b/src/mono/mono/eglib/glib.h

index e438c00..fcd8d2e 100644 (file)
--- a/src/mono/mono/eglib/glib.h
+++ b/src/mono/mono/eglib/glib.h
@@ -882,14 +882,11 @@ gunichar  *g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_writte
  gunichar  *g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err);
  G_EXTERN_C // Used by libtest, at least.
  gunichar2 *g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err);
-gunichar2 *g_utf8_to_utf16be (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err);
  gunichar2 *g_utf8_to_utf16le (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err);
-gunichar2 *eg_utf8_to_utf16_with_nuls (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err);
  gunichar2 *eg_wtf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err);
  G_EXTERN_C // Used by libtest, at least.
  gchar     *g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err);
  gchar     *g_utf16le_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err);
-gchar     *g_utf16be_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err);
  gunichar  *g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err);
  gchar     *g_ucs4_to_utf8  (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err);
  gunichar2 *g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err);
@@ -915,7 +912,6 @@ gpointer
  g_fixed_buffer_custom_allocator (gsize req_size, gpointer custom_alloc_data);
  
  gunichar2 *g_utf8_to_utf16_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err);
-gunichar2 *g_utf8_to_utf16be_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err);
  gunichar2 *g_utf8_to_utf16le_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err);
  gchar *g_utf16_to_utf8_custom_alloc (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err);
  
diff --git a/src/mono/mono/eglib/test/utf8.c b/src/mono/mono/eglib/test/utf8.c

index d36dbfa..5602bbc 100644 (file)
--- a/src/mono/mono/eglib/test/utf8.c
+++ b/src/mono/mono/eglib/test/utf8.c
@@ -155,7 +155,7 @@ compare_utf8_to_utf16_explicit (const gunichar2 *expected, const gchar *utf8, gl
  
         gerror = NULL;
         if (include_nuls)
-               ret = eg_utf8_to_utf16_with_nuls (utf8, size_spec, &in_read, &out_read, &gerror);
+               ret = g_utf8_to_utf16 (utf8, size_spec, &in_read, &out_read, &gerror);
         else
                 ret = g_utf8_to_utf16 (utf8, size_spec, &in_read, &out_read, &gerror);
  
@@ -271,7 +271,7 @@ test_utf8_to_utf16_with_nuls (void)
  #endif
  
         /* implicit length is forbidden */
-               if (eg_utf8_to_utf16_with_nuls (src1, -1, NULL, NULL, NULL) != NULL)
+               if (g_utf8_to_utf16 (src1, -1, NULL, NULL, NULL) != NULL)
                 return FAILED ("explicit nulls must fail with -1 length\n");
  
         /* empty string */
@@ -699,7 +699,7 @@ utf8_byteslen (const gchar *src)
  static Test utf8_tests [] = {
         {"g_utf16_to_utf8", test_utf16_to_utf8},
         {"g_utf8_to_utf16", test_utf8_to_utf16},
-       {"g_utf8_to_utf16_with_nuls", test_utf8_to_utf16_with_nuls},
+       {"g_utf8_to_utf16_nuls", test_utf8_to_utf16_with_nuls},
         {"g_utf8_seq", test_utf8_seq},
         {"g_ucs4_to_utf16", test_ucs4_to_utf16 },
         {"g_utf16_to_ucs4", test_utf16_to_ucs4 },
diff --git a/src/mono/mono/metadata/object.c b/src/mono/mono/metadata/object.c

index 8604114..b0289ce 100644 (file)
--- a/src/mono/mono/metadata/object.c
+++ b/src/mono/mono/metadata/object.c
@@ -327,7 +327,7 @@ get_type_init_exception_for_vtable (MonoVTable *vtable)
  
         mono_mem_manager_init_reflection_hashes (mem_manager);
  
-       /* 
+       /*
          * If the initializing thread was rudely aborted, the exception is not stored
          * in the hash.
          */
@@ -6361,7 +6361,7 @@ mono_string_new_utf8_len (const char *text, guint length, MonoError *error)
         gunichar2 *ut = NULL;
         glong items_written;
  
-       ut = eg_utf8_to_utf16_with_nuls (text, length, NULL, &items_written, &eg_error);
+       ut = g_utf8_to_utf16 (text, length, NULL, &items_written, &eg_error);
  
         if (eg_error) {
                 o = NULL_HANDLE_STRING;
diff --git a/src/mono/mono/mini/CMakeLists.txt b/src/mono/mono/mini/CMakeLists.txt

index 8e60bab..6f5e850 100644 (file)
--- a/src/mono/mono/mini/CMakeLists.txt
+++ b/src/mono/mono/mini/CMakeLists.txt
@@ -551,7 +551,7 @@ if(NOT DISABLE_EXECUTABLES)
    target_link_libraries(mono-sgen PRIVATE monoapi eglib_api monosgen-static)
    if(HAVE_ICU_SHIM)
      target_link_libraries(mono-sgen PRIVATE icu_shim_objects)
-  endif() 
+  endif()
    target_link_libraries(mono-sgen PRIVATE ${OS_LIBS} ${LLVM_LIBS} ${ICU_LIBS} ${Z_LIBS})
    # Alpine Linux implements ucontext in a different library
    if(CLR_CMAKE_HOST_ALPINE_LINUX AND TARGET_S390X)
diff --git a/src/native/minipal/utf8.c b/src/native/minipal/utf8.c

new file mode 100644 (file)

index 0000000..a54b805
--- /dev/null
+++ b/src/native/minipal/utf8.c
@@ -0,0 +1,2149 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+#include <minipal/utf8.h>
+
+#include <errno.h>
+#include <limits.h>
+#include <string.h>
+#include <assert.h>
+
+#define HIGH_SURROGATE_START 0xd800
+#define HIGH_SURROGATE_END 0xdbff
+#define LOW_SURROGATE_START 0xdc00
+#define LOW_SURROGATE_END 0xdfff
+
+// Test if the wide character is a high surrogate
+static bool IsHighSurrogate(const CHAR16_T c)
+{
+    return (c & 0xFC00) == HIGH_SURROGATE_START;
+}
+
+// Test if the wide character is a low surrogate
+static bool IsLowSurrogate(const CHAR16_T c)
+{
+    return (c & 0xFC00) == LOW_SURROGATE_START;
+}
+
+// Test if the wide character is a surrogate half
+static bool IsSurrogate(const CHAR16_T c)
+{
+    return (c & 0xF800) == HIGH_SURROGATE_START;
+}
+
+typedef struct
+{
+    // Store our default string
+    unsigned char* byteStart;
+    CHAR16_T* charEnd;
+    const CHAR16_T strDefault[2];
+    int strDefaultLength;
+    int fallbackCount;
+    int fallbackIndex;
+} DecoderBuffer;
+
+static CHAR16_T DecoderReplacementFallbackBuffer_GetNextChar(DecoderBuffer* self)
+{
+    // We want it to get < 0 because == 0 means that the current/last character is a fallback
+    // and we need to detect recursion.  We could have a flag but we already have this counter.
+    self->fallbackCount--;
+    self->fallbackIndex++;
+
+    // Do we have anything left? 0 is now last fallback char, negative is nothing left
+    if (self->fallbackCount < 0)
+        return '\0';
+
+    // Need to get it out of the buffer.
+    // Make sure it didn't wrap from the fast count-- path
+    if (self->fallbackCount == INT_MAX)
+    {
+        self->fallbackCount = -1;
+        return '\0';
+    }
+
+    // Now make sure it's in the expected range
+    assert(self->fallbackIndex < self->strDefaultLength && self->fallbackIndex >= 0);
+
+    return self->strDefault[self->fallbackIndex];
+}
+
+// Fallback Methods
+static bool DecoderReplacementFallbackBuffer_Fallback(DecoderBuffer* self)
+{
+    // We expect no previous fallback in our buffer
+    // We can't call recursively but others might (note, we don't test on last char!!!)
+    assert(self->fallbackCount < 1);
+
+    // Go ahead and get our fallback
+    if (self->strDefaultLength == 0)
+        return false;
+
+    self->fallbackCount = self->strDefaultLength;
+    self->fallbackIndex = -1;
+
+    return true;
+}
+
+// Fallback the current byte by sticking it into the remaining char buffer.
+// This can only be called by our encodings (others have to use the public fallback methods), so
+// we can use our DecoderNLS here too (except we don't).
+// Returns true if we are successful, false if we can't fall back the character (no buffer space),
+// so the caller needs to report insufficient buffer space if this returns false.
+// Right now this has both bytes and bytes[], since we might have extra bytes, hence the
+// array, and we might need the index, hence the byte*
+// Don't touch ref chars unless we succeed
+static bool DecoderReplacementFallbackBuffer_InternalFallback_Copy(DecoderBuffer* self, CHAR16_T** chars, CHAR16_T* pAllocatedBufferEnd)
+{
+    assert(self->byteStart != NULL);
+
+    bool fallbackResult = DecoderReplacementFallbackBuffer_Fallback(self);
+
+    // See if there's a fallback character and we have an output buffer then copy our string.
+    if (fallbackResult)
+    {
+        // Copy the chars to our output
+        CHAR16_T ch;
+        CHAR16_T* charTemp = *chars;
+        bool bHighSurrogate = false;
+        (void)bHighSurrogate; // unused in release build
+        while ((ch = DecoderReplacementFallbackBuffer_GetNextChar(self)) != 0)
+        {
+            // Make sure no mixed up surrogates
+            if (IsSurrogate(ch))
+            {
+                if (IsHighSurrogate(ch))
+                {
+                    // High Surrogate
+                    assert(!bHighSurrogate);
+                    bHighSurrogate = true;
+                }
+                else
+                {
+                    // Low surrogate
+                    assert(bHighSurrogate);
+                    bHighSurrogate = false;
+                }
+            }
+
+            if (charTemp >= self->charEnd)
+            {
+                // No buffer space
+                return false;
+            }
+
+            *(charTemp++) = ch;
+            if (charTemp > pAllocatedBufferEnd)
+            {
+                errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
+                return false;
+            }
+        }
+
+        // Need to make sure that bHighSurrogate isn't true
+        assert(!bHighSurrogate);
+
+        // Now we aren't going to return false, so it's OK to update chars
+        *chars = charTemp;
+    }
+
+    return true;
+}
+
+// Clear the buffer
+static void DecoderReplacementFallbackBuffer_Reset(DecoderBuffer* self)
+{
+    self->fallbackCount = -1;
+    self->fallbackIndex = -1;
+    self->byteStart = NULL;
+}
+
+typedef struct
+{
+    const CHAR16_T strDefault[3];
+    int strDefaultLength;
+    CHAR16_T* charStart;
+    CHAR16_T* charEnd;
+    bool setEncoder;
+    bool bUsedEncoder;
+    bool bFallingBack;
+    int iRecursionCount;
+    int fallbackCount;
+    int fallbackIndex;
+} EncoderBuffer;
+
+#define MAX_RECURSION 250
+
+// Set the above values
+// This can't be part of the constructor because EncoderFallbacks would have to know how to implement these.
+static void EncoderReplacementFallbackBuffer_InternalInitialize(EncoderBuffer* self, CHAR16_T* charStart, CHAR16_T* charEnd, bool setEncoder)
+{
+    self->charStart = charStart;
+    self->charEnd = charEnd;
+    self->setEncoder = setEncoder;
+    self->bUsedEncoder = false;
+    self->bFallingBack = false;
+    self->iRecursionCount = 0;
+}
+
+static CHAR16_T EncoderReplacementFallbackBuffer_InternalGetNextChar(EncoderBuffer* self)
+{
+    // We want it to get < 0 because == 0 means that the current/last character is a fallback
+    // and we need to detect recursion.  We could have a flag but we already have this counter.
+    self->fallbackCount--;
+    self->fallbackIndex++;
+
+    // Do we have anything left? 0 is now last fallback char, negative is nothing left
+    if (self->fallbackCount < 0)
+        return '\0';
+
+    // Need to get it out of the buffer.
+    // Make sure it didn't wrap from the fast count-- path
+    if (self->fallbackCount == INT_MAX)
+    {
+        self->fallbackCount = -1;
+        return '\0';
+    }
+
+    // Now make sure it's in the expected range
+    assert(self->fallbackIndex < self->strDefaultLength && self->fallbackIndex >= 0);
+
+    CHAR16_T ch = self->strDefault[self->fallbackIndex];
+    self->bFallingBack = (ch != 0);
+    if (ch == 0) self->iRecursionCount = 0;
+    return ch;
+}
+
+// Fallback Methods
+static bool EncoderReplacementFallbackBuffer_Fallback(EncoderBuffer* self)
+{
+    // If we had a buffer already we're being recursive, throw, it's probably at the suspect
+    // character in our array.
+    assert(self->fallbackCount < 1);
+
+    // Go ahead and get our fallback
+    // Divide by 2 because we aren't a surrogate pair
+    self->fallbackCount = self->strDefaultLength / 2;
+    self->fallbackIndex = -1;
+
+    return self->fallbackCount != 0;
+}
+
+static bool EncoderReplacementFallbackBuffer_Fallback_Unknown(EncoderBuffer* self)
+{
+    // If we had a buffer already we're being recursive, throw, it's probably at the suspect
+    // character in our array.
+    assert(self->fallbackCount < 1);
+
+    // Go ahead and get our fallback
+    self->fallbackCount = self->strDefaultLength;
+    self->fallbackIndex = -1;
+
+    return self->fallbackCount != 0;
+}
+
+// Fallback the current character using the remaining buffer and encoder if necessary
+// This can only be called by our encodings (others have to use the public fallback methods), so
+// we can use our EncoderNLS here too.
+// setEncoder is true if we're calling from a GetBytes method, false if we're calling from a GetByteCount
+//
+// Note that this could also change the contents of self->buffer.encoder, which is the same
+// object that the caller is using, so the caller could mess up the encoder for us
+// if they aren't careful.
+static bool EncoderReplacementFallbackBuffer_InternalFallback(EncoderBuffer* self, CHAR16_T ch, CHAR16_T** chars)
+{
+    // Shouldn't have null charStart
+    assert(self->charStart != NULL);
+
+    // See if it was a high surrogate
+    if (IsHighSurrogate(ch))
+    {
+        // See if there's a low surrogate to go with it
+        if (*chars >= self->charEnd)
+        {
+            // Nothing left in input buffer
+            // No input, return 0
+        }
+        else
+        {
+            // Might have a low surrogate
+            CHAR16_T cNext = **chars;
+            if (IsLowSurrogate(cNext))
+            {
+                // If already falling back then fail
+                assert(!self->bFallingBack || self->iRecursionCount++ <= MAX_RECURSION);
+
+                // Next is a surrogate, add it as surrogate pair, and increment chars
+                (*chars)++;
+                self->bFallingBack = EncoderReplacementFallbackBuffer_Fallback_Unknown(self);
+                return self->bFallingBack;
+            }
+
+            // Next isn't a low surrogate, just fallback the high surrogate
+        }
+    }
+
+    // If already falling back then fail
+    assert(!self->bFallingBack || self->iRecursionCount++ <= MAX_RECURSION);
+
+    // Fall back our char
+    self->bFallingBack = EncoderReplacementFallbackBuffer_Fallback(self);
+
+    return self->bFallingBack;
+}
+
+static bool EncoderReplacementFallbackBuffer_MovePrevious(EncoderBuffer* self)
+{
+    // Back up one, only if we just processed the last character (or earlier)
+    if (self->fallbackCount >= -1 && self->fallbackIndex >= 0)
+    {
+        self->fallbackIndex--;
+        self->fallbackCount++;
+        return true;
+    }
+
+    // Return false 'cause we couldn't do it.
+    return false;
+}
+
+typedef struct
+{
+    union
+    {
+        DecoderBuffer decoder;
+        EncoderBuffer encoder;
+    } buffer;
+
+    bool useFallback;
+
+#if BIGENDIAN
+    bool treatAsLE;
+#endif
+} UTF8Encoding;
+
+// These are bitmasks used to maintain the state in the decoder. They occupy the higher bits
+// while the actual character is being built in the lower bits. They are shifted together
+// with the actual bits of the character.
+
+// bits 30 & 31 are used for pending bits fixup
+// FinalByte: a lead byte plants this flag pre-shifted right by 6 bits per expected trailing byte,
+// so it lands on bit 29 exactly when the last trailing byte has been folded in.
+#define FinalByte (1 << 29)
+// SupplimentarySeq: planted at every 6-bit step by a 4-byte lead, so bit 28 stays set
+// for as long as a 4-byte (supplementary) sequence is being assembled.
+#define SupplimentarySeq (1 << 28)
+// ThreeByteSeq: same scheme as SupplimentarySeq, for 3-byte sequences.
+#define ThreeByteSeq (1 << 27)
+
+// Reports whether `c` lies in the closed interval [begin, end].
+static bool InRange(int c, int begin, int end)
+{
+    if (c < begin)
+    {
+        return false;
+    }
+
+    return c <= end;
+}
+
+// Called by GetChars when it meets an invalid byte sequence.
+// Asks the decoder fallback buffer to write its replacement through *pTarget (bounded by
+// pAllocatedBufferEnd). On failure *pSrc is put back to the value it had on entry and false
+// is returned; on success *pSrc is left alone and true is returned.
+static bool FallbackInvalidByteSequence_Copy(UTF8Encoding* self, unsigned char** pSrc, CHAR16_T** pTarget, CHAR16_T* pAllocatedBufferEnd)
+{
+    assert(self->useFallback);
+
+    // Remember where the caller's source cursor was before attempting the fallback
+    unsigned char* sequenceStart = *pSrc;
+
+    if (DecoderReplacementFallbackBuffer_InternalFallback_Copy(&self->buffer.decoder, pTarget, pAllocatedBufferEnd))
+    {
+        // Replacement was emitted
+        return true;
+    }
+
+    // Fallback could not be emitted, hand the original cursor back
+    *pSrc = sequenceStart;
+    return false;
+}
+
+// Counts the CHAR16_T units that decoding `count` UTF-8 bytes starting at `bytes` would need.
+//
+// The count starts at one unit per input byte and is decremented as multi-byte sequences are
+// recognised. For every invalid sequence:
+//   - if self->useFallback is false, errno is set to MINIPAL_ERROR_NO_UNICODE_TRANSLATION and 0 is returned;
+//   - otherwise self->buffer.decoder.strDefaultLength is added to the count.
+//
+// `ch` doubles as the decoder state: 0 means no sequence is pending; otherwise its low bits hold
+// the partially assembled code point and its high bits hold the FinalByte / SupplimentarySeq /
+// ThreeByteSeq flags plus the count fix-up kept in bits 30-31.
+//
+// NOTE(review): the adjustment applied after the loop for an input that ends in the middle of a
+// sequence adds strDefaultLength without consulting self->useFallback.
+static size_t GetCharCount(UTF8Encoding* self, unsigned char* bytes, size_t count)
+{
+    assert(bytes != NULL);
+    // count is a size_t, so this assertion can never fail
+    assert(count >= 0);
+
+    // Initialize stuff
+    unsigned char *pSrc = bytes;
+    unsigned char *pEnd = pSrc + count;
+    int availableBytes, chc;
+
+    // Start by assuming we have as many as count, charCount always includes the adjustment
+    // for the character being decoded
+    size_t charCount = count;
+    int ch = 0;
+    bool fallbackUsed = false;
+
+    while (true)
+    {
+        // SLOWLOOP: does all range checks, handles all special cases, but it is slow
+        if (pSrc >= pEnd) break;
+
+        // read next byte. The JIT optimization seems to be getting confused when
+        // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
+        int cha = *pSrc;
+
+        // no pending bits
+        if (ch == 0) goto ReadChar;
+
+        pSrc++;
+
+        // we are expecting to see trailing bytes like 10vvvvvv
+        if ((cha & 0xC0) != 0x80)
+        {
+            // This can be a valid starting byte for another UTF8 byte sequence, so let's put
+            // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
+            pSrc--;
+            // give back the fix-up carried in the top bits of the pending state
+            charCount += (ch >> 30);
+            goto InvalidByteSequence;
+        }
+
+        // fold in the new byte
+        ch = (ch << 6) | (cha & 0x3F);
+
+        if ((ch & FinalByte) == 0)
+        {
+            assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0);
+
+            if ((ch & SupplimentarySeq) != 0)
+            {
+                if ((ch & (FinalByte >> 6)) != 0)
+                {
+                    // this is 3rd byte (of 4 byte supplimentary) - nothing to do
+                    continue;
+                }
+
+                // 2nd byte, check for non-shortest form of supplimentary char and the valid
+                // supplimentary characters in range 0x010000 - 0x10FFFF at the same time
+                if (!InRange(ch & 0x1F0, 0x10, 0x100))
+                {
+                    goto InvalidByteSequence;
+                }
+            }
+            else
+            {
+                // Must be 2nd byte of a 3-byte sequence
+                // check for non-shortest form of 3 byte seq
+                if ((ch & (0x1F << 5)) == 0 ||                  // non-shortest form
+                    (ch & (0xF800 >> 6)) == (0xD800 >> 6))     // illegal individually encoded surrogate
+                {
+                    goto InvalidByteSequence;
+                }
+            }
+            continue;
+        }
+
+        // ready to punch
+
+        // adjust for surrogates in non-shortest form
+        if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq) charCount--;
+
+        goto EncodeChar;
+
+    InvalidByteSequence:
+        // Without a fallback an invalid sequence is a hard error for the whole call
+        if (!self->useFallback)
+        {
+            errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION;
+            return 0;
+        }
+
+        // First invalid sequence of this call: point the decoder fallback buffer at this input
+        if (!fallbackUsed)
+        {
+            fallbackUsed = true;
+            self->buffer.decoder.byteStart = bytes;
+            self->buffer.decoder.charEnd = NULL;
+        }
+        // Count the replacement that would be emitted for this sequence
+        charCount += self->buffer.decoder.strDefaultLength;
+
+        ch = 0;
+        continue;
+
+    ReadChar:
+        ch = *pSrc;
+        pSrc++;
+
+    ProcessChar:
+        if (ch > 0x7F)
+        {
+            // If its > 0x7F, its start of a new multi-byte sequence
+
+            // Long sequence, so unreserve our char.
+            charCount--;
+
+            // bit 6 has to be non-zero for start of multibyte chars.
+            if ((ch & 0x40) == 0) goto InvalidByteSequence;
+
+            // start a new long code
+            if ((ch & 0x20) != 0)
+            {
+                if ((ch & 0x10) != 0)
+                {
+                    // 4 byte encoding - supplimentary character (2 surrogates)
+
+                    ch &= 0x0F;
+
+                    // check that bit 4 is zero and the valid supplimentary character
+                    // range 0x000000 - 0x10FFFF at the same time
+                    if (ch > 0x04)
+                    {
+                        ch |= 0xf0;
+                        goto InvalidByteSequence;
+                    }
+
+                    // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
+                    // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag.
+                    ch |= (FinalByte >> 3 * 6) |  // Final byte is 3 more bytes from now
+                        (1 << 30) |           // If it dies on next byte we'll need an extra char
+                        (3 << (30 - 2 * 6)) |     // If it dies on last byte we'll need to subtract a char
+                        (SupplimentarySeq) | (SupplimentarySeq >> 6) |
+                        (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
+
+                    // Our character count will be 2 characters for these 4 bytes, so subtract another char
+                    charCount--;
+                }
+                else
+                {
+                    // 3 byte encoding
+                    // Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
+                    ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
+                        (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
+
+                    // We'll expect 1 character for these 3 bytes, so subtract another char.
+                    charCount--;
+                }
+            }
+            else
+            {
+                // 2 byte encoding
+
+                ch &= 0x1F;
+
+                // check for non-shortest form
+                if (ch <= 1)
+                {
+                    ch |= 0xc0;
+                    goto InvalidByteSequence;
+                }
+
+                // Add bit flags so we'll be flagged correctly
+                ch |= (FinalByte >> 6);
+            }
+            continue;
+        }
+
+    EncodeChar:
+
+        availableBytes = pEnd - pSrc;
+
+        // don't fall into the fast decoding loop if we don't have enough bytes
+        if (availableBytes <= 13)
+        {
+            // try to get over the remainder of the ascii characters fast though
+            unsigned char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
+            while (pSrc < pLocalEnd)
+            {
+                ch = *pSrc;
+                pSrc++;
+
+                if (ch > 0x7F)
+                    goto ProcessChar;
+            }
+            // we are done
+            ch = 0;
+            break;
+        }
+
+        // To compute the upper bound, assume that all characters are ASCII characters at this point,
+        //  the boundary will be decreased for every non-ASCII character we encounter
+        // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
+        unsigned char *pStop = pSrc + availableBytes - 7;
+
+        while (pSrc < pStop)
+        {
+            ch = *pSrc;
+            pSrc++;
+
+            if (ch > 0x7F)
+            {
+                goto LongCode;
+            }
+
+            // get pSrc 2-byte aligned
+            if (((size_t)pSrc & 0x1) != 0)
+            {
+                ch = *pSrc;
+                pSrc++;
+                if (ch > 0x7F)
+                {
+                    goto LongCode;
+                }
+            }
+
+            // get pSrc 4-byte aligned
+            if (((size_t)pSrc & 0x2) != 0)
+            {
+                ch = *(unsigned short*)pSrc;
+                if ((ch & 0x8080) != 0)
+                {
+                    goto LongCodeWithMask16;
+                }
+                pSrc += 2;
+            }
+
+
+            // Run 8 + 8 characters at a time!
+            while (pSrc < pStop)
+            {
+                ch = *(int*)pSrc;
+                int chb = *(int*)(pSrc + 4);
+                if (((ch | chb) & (int)0x80808080) != 0)
+                {
+                    goto LongCodeWithMask32;
+                }
+                pSrc += 8;
+
+                // This is a really small loop - unroll it
+                if (pSrc >= pStop)
+                    break;
+
+                ch = *(int*)pSrc;
+                chb = *(int*)(pSrc + 4);
+                if (((ch | chb) & (int)0x80808080) != 0)
+                {
+                    goto LongCodeWithMask32;
+                }
+                pSrc += 8;
+            }
+            break;
+
+        // A multi-byte load contained a non-ASCII byte: reduce ch to the first byte of that load
+        // (pSrc has not been advanced past the load) and resume one byte at a time.
+        LongCodeWithMask32 :
+#if BIGENDIAN
+        // be careful about the sign extension
+        if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16);
+        else
+#endif
+        ch &= 0xFF;
+
+        LongCodeWithMask16:
+#if BIGENDIAN
+        if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 8);
+        else
+#endif
+        ch &= 0xFF;
+
+        pSrc++;
+        if (ch <= 0x7F)
+        {
+            continue;
+        }
+
+        LongCode:
+            chc = *pSrc;
+            pSrc++;
+
+            if (
+                // bit 6 has to be zero
+                (ch & 0x40) == 0 ||
+                // we are expecting to see trailing bytes like 10vvvvvv
+                (chc & 0xC0) != 0x80)
+            {
+                goto BadLongCode;
+            }
+
+            chc &= 0x3F;
+
+            // start a new long code
+            if ((ch & 0x20) != 0)
+            {
+                // fold the first two bytes together
+                chc |= (ch & 0x0F) << 6;
+
+                if ((ch & 0x10) != 0)
+                {
+                    // 4 byte encoding - surrogate
+                    ch = *pSrc;
+                    if (
+                        // check that bit 4 is zero, the non-shortest form of surrogate
+                        // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
+                        !InRange(chc >> 4, 0x01, 0x10) ||
+                        // we are expecting to see trailing bytes like 10vvvvvv
+                        (ch & 0xC0) != 0x80)
+                    {
+                        goto BadLongCode;
+                    }
+
+                    chc = (chc << 6) | (ch & 0x3F);
+
+                    ch = *(pSrc + 1);
+                    // we are expecting to see trailing bytes like 10vvvvvv
+                    if ((ch & 0xC0) != 0x80)
+                    {
+                        goto BadLongCode;
+                    }
+                    pSrc += 2;
+
+                    // extra byte
+                    charCount--;
+                }
+                else
+                {
+                    // 3 byte encoding
+                    ch = *pSrc;
+                    if (
+                        // check for non-shortest form of 3 byte seq
+                        (chc & (0x1F << 5)) == 0 ||
+                        // Can't have surrogates here.
+                        (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
+                        // we are expecting to see trailing bytes like 10vvvvvv
+                        (ch & 0xC0) != 0x80)
+                    {
+                        goto BadLongCode;
+                    }
+                    pSrc++;
+
+                    // extra byte
+                    charCount--;
+                }
+            }
+            else
+            {
+                // 2 byte encoding
+
+                // check for non-shortest form
+                if ((ch & 0x1E) == 0) goto BadLongCode;
+            }
+
+            // extra byte
+            charCount--;
+        }
+
+        // no pending bits at this point
+        ch = 0;
+        continue;
+
+    BadLongCode:
+        // Every jump here happens with pSrc exactly two bytes past the lead byte (the lead byte and
+        // one trailing byte were consumed), so this rewinds to the lead byte and lets the slow loop
+        // re-decode the sequence and route it through InvalidByteSequence.
+        pSrc -= 2;
+        ch = 0;
+        continue;
+    }
+
+    // May have a problem if we have to flush
+    if (ch != 0)
+    {
+        // We were already adjusting for these, so need to unadjust
+        charCount += (ch >> 30);
+        charCount += self->buffer.decoder.strDefaultLength;
+    }
+
+    // Shouldn't have anything in fallback buffer for GetCharCount
+    // (don't have to check m_throwOnOverflow for count)
+    assert(!fallbackUsed || !self->useFallback || self->buffer.decoder.fallbackCount < 0);
+
+    return charCount;
+}
+
+// Advances pTarget by one element; if that moves it past pAllocatedBufferEnd, sets errno to
+// MINIPAL_ERROR_INSUFFICIENT_BUFFER and returns 0 from the enclosing function.
+// The macro relies on `pTarget` and `pAllocatedBufferEnd` being in scope where it is expanded.
+// It expands to bare statements (no do/while wrapper) and is written without a trailing
+// semicolon at its use sites, so it must not be used as the unbraced body of an if/else/loop.
+#define ENSURE_BUFFER_INC                          \
+    pTarget++;                                     \
+    if (pTarget > pAllocatedBufferEnd)             \
+    {                                              \
+        errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; \
+        return 0;                                  \
+    }
+
+// Decodes `byteCount` UTF-8 bytes starting at `bytes` into at most `charCount` CHAR16_T units at `chars`.
+//
+// Returns the number of CHAR16_T units written (pTarget - chars).
+// Returns 0 with errno set to:
+//   - MINIPAL_ERROR_NO_UNICODE_TRANSLATION when an invalid sequence is met and self->useFallback is false;
+//   - MINIPAL_ERROR_INSUFFICIENT_BUFFER when the output buffer cannot take the whole input.
+// An empty input also returns 0, so callers need errno to tell that apart from a failure.
+//
+// `ch` doubles as the decoder state: 0 means no sequence is pending; otherwise its low bits hold
+// the partially assembled code point and its high bits hold the FinalByte / SupplimentarySeq /
+// ThreeByteSeq flags.
+//
+// NOTE(review): when the input ends in the middle of a sequence (ch != 0 after the loop) the pending
+// bits are discarded without writing a replacement, whereas GetCharCount adds strDefaultLength
+// for that case - confirm the two are meant to differ.
+static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, CHAR16_T* chars, size_t charCount)
+{
+    assert(chars != NULL);
+    // byteCount and charCount are size_t, so the next two assertions can never fail
+    assert(byteCount >= 0);
+    assert(charCount >= 0);
+    assert(bytes != NULL);
+
+    unsigned char *pSrc = bytes;
+    CHAR16_T *pTarget = chars;
+
+    unsigned char *pEnd = pSrc + byteCount;
+    CHAR16_T *pAllocatedBufferEnd = pTarget + charCount;
+
+    int ch = 0;
+    int chc;
+
+    bool fallbackUsed = false;
+
+    while (true)
+    {
+        // SLOWLOOP: does all range checks, handles all special cases, but it is slow
+
+        if (pSrc >= pEnd) break;
+
+        // read next byte. The JIT optimization seems to be getting confused when
+        // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
+        int cha = *pSrc;
+
+        if (ch == 0)
+        {
+            // no pending bits
+            goto ReadChar;
+        }
+
+        pSrc++;
+
+        // we are expecting to see trailing bytes like 10vvvvvv
+        if ((cha & 0xC0) != 0x80)
+        {
+            // This can be a valid starting byte for another UTF8 byte sequence, so let's put
+            // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
+            pSrc--;
+            goto InvalidByteSequence;
+        }
+
+        // fold in the new byte
+        ch = (ch << 6) | (cha & 0x3F);
+
+        if ((ch & FinalByte) == 0)
+        {
+            // Not at last byte yet
+            assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0);
+
+            if ((ch & SupplimentarySeq) != 0)
+            {
+                // Its a 4-byte supplimentary sequence
+                if ((ch & (FinalByte >> 6)) != 0)
+                {
+                    // this is 3rd byte of 4 byte sequence - nothing to do
+                    continue;
+                }
+
+                // 2nd byte of 4 bytes
+                // check for non-shortest form of surrogate and the valid surrogate
+                // range 0x000000 - 0x10FFFF at the same time
+                if (!InRange(ch & 0x1F0, 0x10, 0x100))
+                {
+                    goto InvalidByteSequence;
+                }
+            }
+            else
+            {
+                // Must be 2nd byte of a 3-byte sequence
+                // check for non-shortest form of 3 byte seq
+                if ((ch & (0x1F << 5)) == 0 ||                  // non-shortest form
+                    (ch & (0xF800 >> 6)) == (0xD800 >> 6))     // illegal individually encoded surrogate
+                {
+                    goto InvalidByteSequence;
+                }
+            }
+            continue;
+        }
+
+        // ready to punch
+
+        // surrogate in shortest form?
+        // Might be possible to get rid of this?  Already did non-shortest check for 4-byte sequence when reading 2nd byte?
+        if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq)
+        {
+            // let the range check for the second char throw the exception
+            if (pTarget < pAllocatedBufferEnd)
+            {
+                // Write the high surrogate now; ch is turned into the low surrogate, which EncodeChar writes
+                *pTarget = (CHAR16_T)(((ch >> 10) & 0x7FF) +
+                    (HIGH_SURROGATE_START - (0x10000 >> 10)));
+
+                ENSURE_BUFFER_INC
+
+                ch = (ch & 0x3FF) +
+                    (int)(LOW_SURROGATE_START);
+            }
+        }
+
+        goto EncodeChar;
+
+    InvalidByteSequence:
+        // Without a fallback an invalid sequence is a hard error for the whole call
+        if (!self->useFallback)
+        {
+            errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION;
+            return 0;
+        }
+
+        // this code fragment should be close to the gotos referencing it
+        // Have to do fallback for invalid bytes
+        if (!fallbackUsed)
+        {
+            fallbackUsed = true;
+            self->buffer.decoder.byteStart = bytes;
+            self->buffer.decoder.charEnd = pAllocatedBufferEnd;
+        }
+
+        // That'll back us up the appropriate # of bytes if we didn't get anywhere
+        if (!FallbackInvalidByteSequence_Copy(self, &pSrc, &pTarget, pAllocatedBufferEnd))
+        {
+            if (errno == MINIPAL_ERROR_INSUFFICIENT_BUFFER) return 0;
+
+            // Check if we ran out of buffer space
+            assert(pSrc >= bytes);
+
+            DecoderReplacementFallbackBuffer_Reset(&self->buffer.decoder);
+            ch = 0;
+            break;
+        }
+
+        assert(pSrc >= bytes);
+
+        ch = 0;
+        continue;
+
+    ReadChar:
+        ch = *pSrc;
+        pSrc++;
+
+    ProcessChar:
+        if (ch > 0x7F)
+        {
+            // If its > 0x7F, its start of a new multi-byte sequence
+
+            // bit 6 has to be non-zero
+            if ((ch & 0x40) == 0) goto InvalidByteSequence;
+
+            // start a new long code
+            if ((ch & 0x20) != 0)
+            {
+                if ((ch & 0x10) != 0)
+                {
+                    // 4 byte encoding - supplimentary character (2 surrogates)
+
+                    ch &= 0x0F;
+
+                    // check that bit 4 is zero and the valid supplimentary character
+                    // range 0x000000 - 0x10FFFF at the same time
+                    if (ch > 0x04)
+                    {
+                        ch |= 0xf0;
+                        goto InvalidByteSequence;
+                    }
+
+                    ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) |
+                        (SupplimentarySeq) | (SupplimentarySeq >> 6) |
+                        (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
+                }
+                else
+                {
+                    // 3 byte encoding
+                    ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
+                        (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
+                }
+            }
+            else
+            {
+                // 2 byte encoding
+
+                ch &= 0x1F;
+
+                // check for non-shortest form
+                if (ch <= 1)
+                {
+                    ch |= 0xc0;
+                    goto InvalidByteSequence;
+                }
+
+                ch |= (FinalByte >> 6);
+            }
+            continue;
+        }
+
+    EncodeChar:
+        // write the pending character
+        if (pTarget >= pAllocatedBufferEnd)
+        {
+            // Fix chars so we make sure to throw if we didn't output anything
+            ch &= 0x1fffff;
+            // Back pSrc up by the encoded length of the character that did not fit
+            if (ch > 0x7f)
+            {
+                if (ch > 0x7ff)
+                {
+                    if (ch >= LOW_SURROGATE_START &&
+                        ch <= LOW_SURROGATE_END)
+                    {
+                        pSrc--;     // It was 4 bytes
+                        pTarget--;  // 1 was stored already, but we can't remember 1/2, so back up
+                    }
+                    else if (ch > 0xffff)
+                    {
+                        pSrc--;     // It was 4 bytes, nothing was stored
+                    }
+                    pSrc--;         // It was at least 3 bytes
+                }
+                pSrc--;             // It was at least 2 bytes
+            }
+            pSrc--;
+
+            assert(pSrc >= bytes);
+
+            // Don't store ch in decoder, we already backed up to its start
+            ch = 0;
+
+            // Didn't throw, just use this buffer size.
+            break;
+        }
+        *pTarget = (CHAR16_T)ch;
+        ENSURE_BUFFER_INC
+
+        int availableChars = pAllocatedBufferEnd - pTarget;
+        int availableBytes = pEnd - pSrc;
+
+        // don't fall into the fast decoding loop if we don't have enough bytes
+        // Test for availableChars is done because pStop would be <= pTarget.
+        if (availableBytes <= 13)
+        {
+            // we may need as many as 1 character per byte
+            if (availableChars < availableBytes)
+            {
+                // not enough output room.  no pending bits at this point
+                ch = 0;
+                continue;
+            }
+
+            // try to get over the remainder of the ascii characters fast though
+            unsigned char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
+            while (pSrc < pLocalEnd)
+            {
+                ch = *pSrc;
+                pSrc++;
+
+                if (ch > 0x7F) goto ProcessChar;
+
+                *pTarget = (CHAR16_T)ch;
+                ENSURE_BUFFER_INC
+            }
+            // we are done
+            ch = 0;
+            break;
+        }
+
+        // we may need as many as 1 character per byte, so reduce the byte count if necessary.
+        // If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
+        if (availableChars < availableBytes) availableBytes = availableChars;
+
+        // To compute the upper bound, assume that all characters are ASCII characters at this point,
+        //  the boundary will be decreased for every non-ASCII character we encounter
+        // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
+        CHAR16_T *pStop = pTarget + availableBytes - 7;
+
+        while (pTarget < pStop)
+        {
+            ch = *pSrc;
+            pSrc++;
+
+            if (ch > 0x7F) goto LongCode;
+
+            *pTarget = (CHAR16_T)ch;
+            ENSURE_BUFFER_INC
+
+            // get pSrc to be 2-byte aligned
+            if ((((size_t)pSrc) & 0x1) != 0)
+            {
+                ch = *pSrc;
+                pSrc++;
+                if (ch > 0x7F) goto LongCode;
+
+                *pTarget = (CHAR16_T)ch;
+                ENSURE_BUFFER_INC
+            }
+
+            // get pSrc to be 4-byte aligned
+            if ((((size_t)pSrc) & 0x2) != 0)
+            {
+                ch = *(unsigned short*)pSrc;
+                if ((ch & 0x8080) != 0) goto LongCodeWithMask16;
+
+
+                if (pTarget + 2 > pAllocatedBufferEnd)
+                {
+                    errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
+                    return 0;
+                }
+
+                // Unfortunately, this is endianness sensitive
+#if BIGENDIAN
+                if (!self->treatAsLE)
+                {
+                    *pTarget = (CHAR16_T)((ch >> 8) & 0x7F);
+                    pSrc += 2;
+                    *(pTarget + 1) = (CHAR16_T)(ch & 0x7F);
+                    pTarget += 2;
+                }
+                else
+#endif
+                {
+                    *pTarget = (CHAR16_T)(ch & 0x7F);
+                    pSrc += 2;
+                    *(pTarget + 1) = (CHAR16_T)((ch >> 8) & 0x7F);
+                    pTarget += 2;
+                }
+            }
+
+            // Run 8 characters at a time!
+            while (pTarget < pStop)
+            {
+                ch = *(int*)pSrc;
+                int chb = *(int*)(pSrc + 4);
+                if (((ch | chb) & (int)0x80808080) != 0) goto LongCodeWithMask32;
+
+                if (pTarget + 8 > pAllocatedBufferEnd)
+                {
+                    errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
+                    return 0;
+                }
+
+                // Unfortunately, this is endianness sensitive
+#if BIGENDIAN
+                if (!self->treatAsLE)
+                {
+                    *pTarget = (CHAR16_T)((ch >> 24) & 0x7F);
+                    *(pTarget + 1) = (CHAR16_T)((ch >> 16) & 0x7F);
+                    *(pTarget + 2) = (CHAR16_T)((ch >> 8) & 0x7F);
+                    *(pTarget + 3) = (CHAR16_T)(ch & 0x7F);
+                    pSrc += 8;
+                    *(pTarget + 4) = (CHAR16_T)((chb >> 24) & 0x7F);
+                    *(pTarget + 5) = (CHAR16_T)((chb >> 16) & 0x7F);
+                    *(pTarget + 6) = (CHAR16_T)((chb >> 8) & 0x7F);
+                    *(pTarget + 7) = (CHAR16_T)(chb & 0x7F);
+                    pTarget += 8;
+                }
+                else
+#endif
+                {
+                    *pTarget = (CHAR16_T)(ch & 0x7F);
+                    *(pTarget + 1) = (CHAR16_T)((ch >> 8) & 0x7F);
+                    *(pTarget + 2) = (CHAR16_T)((ch >> 16) & 0x7F);
+                    *(pTarget + 3) = (CHAR16_T)((ch >> 24) & 0x7F);
+                    pSrc += 8;
+                    *(pTarget + 4) = (CHAR16_T)(chb & 0x7F);
+                    *(pTarget + 5) = (CHAR16_T)((chb >> 8) & 0x7F);
+                    *(pTarget + 6) = (CHAR16_T)((chb >> 16) & 0x7F);
+                    *(pTarget + 7) = (CHAR16_T)((chb >> 24) & 0x7F);
+                    pTarget += 8;
+                }
+            }
+            break;
+
+            // A multi-byte load contained a non-ASCII byte: reduce ch to the first byte of that load
+            // (pSrc has not been advanced past the load) and resume one byte at a time.
+            LongCodeWithMask32 :
+#if BIGENDIAN
+            // be careful about the sign extension
+            if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16);
+            else
+#endif
+            ch &= 0xFF;
+
+            LongCodeWithMask16:
+#if BIGENDIAN
+            if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 8);
+            else
+#endif
+            ch &= 0xFF;
+
+            pSrc++;
+            if (ch <= 0x7F)
+            {
+                *pTarget = (CHAR16_T)ch;
+                ENSURE_BUFFER_INC
+                continue;
+            }
+
+        LongCode:
+            chc = *pSrc;
+            pSrc++;
+
+            if (
+                // bit 6 has to be zero
+                (ch & 0x40) == 0 ||
+                // we are expecting to see trailing bytes like 10vvvvvv
+                (chc & 0xC0) != 0x80)
+            {
+                goto BadLongCode;
+            }
+
+            chc &= 0x3F;
+
+            // start a new long code
+            if ((ch & 0x20) != 0)
+            {
+
+                // fold the first two bytes together
+                chc |= (ch & 0x0F) << 6;
+
+                if ((ch & 0x10) != 0)
+                {
+                    // 4 byte encoding - surrogate
+                    ch = *pSrc;
+                    if (
+                        // check that bit 4 is zero, the non-shortest form of surrogate
+                        // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
+                        !InRange(chc >> 4, 0x01, 0x10) ||
+                        // we are expecting to see trailing bytes like 10vvvvvv
+                        (ch & 0xC0) != 0x80)
+                    {
+                        goto BadLongCode;
+                    }
+
+                    chc = (chc << 6) | (ch & 0x3F);
+
+                    ch = *(pSrc + 1);
+                    // we are expecting to see trailing bytes like 10vvvvvv
+                    if ((ch & 0xC0) != 0x80) goto BadLongCode;
+
+                    pSrc += 2;
+
+                    ch = (chc << 6) | (ch & 0x3F);
+
+                    *pTarget = (CHAR16_T)(((ch >> 10) & 0x7FF) +
+                        (HIGH_SURROGATE_START - (0x10000 >> 10)));
+                    ENSURE_BUFFER_INC
+
+                    ch = (ch & 0x3FF) + (LOW_SURROGATE_START);
+
+                    // extra byte, we're already planning 2 chars for 2 of these bytes,
+                    // but the big loop is testing the target against pStop, so we need
+                    // to subtract 2 more or we risk overrunning the input.  Subtract
+                    // one here and one below.
+                    pStop--;
+                }
+                else
+                {
+                    // 3 byte encoding
+                    ch = *pSrc;
+                    if (
+                        // check for non-shortest form of 3 byte seq
+                        (chc & (0x1F << 5)) == 0 ||
+                        // Can't have surrogates here.
+                        (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
+                        // we are expecting to see trailing bytes like 10vvvvvv
+                        (ch & 0xC0) != 0x80)
+                    {
+                        goto BadLongCode;
+                    }
+                    pSrc++;
+
+                    ch = (chc << 6) | (ch & 0x3F);
+
+                    // extra byte, we're only expecting 1 char for each of these 3 bytes,
+                    // but the loop is testing the target (not source) against pStop, so
+                    // we need to subtract 2 more or we risk overrunning the input.
+                    // Subtract 1 here and one more below
+                    pStop--;
+                }
+            }
+            else
+            {
+                // 2 byte encoding
+
+                ch &= 0x1F;
+
+                // check for non-shortest form
+                if (ch <= 1) goto BadLongCode;
+
+                ch = (ch << 6) | chc;
+            }
+
+            *pTarget = (CHAR16_T)ch;
+            ENSURE_BUFFER_INC
+
+            // extra byte, we're only expecting 1 char for each of these 2 bytes,
+            // but the loop is testing the target (not source) against pStop.
+            // subtract an extra count from pStop so that we don't overrun the input.
+            pStop--;
+        }
+
+        assert(pTarget <= pAllocatedBufferEnd);
+
+        // no pending bits at this point
+        ch = 0;
+        continue;
+
+    BadLongCode:
+        // Every jump here happens with pSrc exactly two bytes past the lead byte (the lead byte and
+        // one trailing byte were consumed), so this rewinds to the lead byte and lets the slow loop
+        // re-decode the sequence and route it through InvalidByteSequence.
+        pSrc -= 2;
+        ch = 0;
+        continue;
+    }
+
+    // ch != 0 here means the input ended while a multi-byte sequence was still pending
+    if (ch != 0)
+    {
+        // This'll back us up the appropriate # of bytes if we didn't get anywhere
+        if (!self->useFallback)
+        {
+            assert(pSrc >= bytes || pTarget == chars);
+
+            // Ran out of buffer space
+            // Need to throw an exception?
+            if (pTarget == chars)
+            {
+                errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
+                return 0;
+            }
+        }
+        assert(pSrc >= bytes);
+        ch = 0;
+    }
+
+    // Shouldn't have anything in fallback buffer for GetChars
+    // (don't have to check m_throwOnOverflow for chars)
+    assert(!fallbackUsed || self->buffer.decoder.fallbackCount < 0);
+
+    // Any input left unconsumed means the output buffer was too small for it
+    if (pSrc < pEnd)
+    {
+        errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
+        return 0;
+    }
+
+    return pTarget - chars;
+}
+
+// Encodes `charCount` UTF-16 code units from `chars` as UTF-8 into `bytes`, writing at most
+// `byteCount` bytes.
+//
+// self      - encoder state; self->buffer.encoder is used as the replacement fallback buffer
+//             for unpaired surrogates (on BIGENDIAN builds self->treatAsLE selects the byte
+//             order assumed by the 4-chars-at-a-time fast path).
+// chars     - UTF-16 input, must not be NULL.
+// charCount - number of UTF-16 code units available in `chars`.
+// bytes     - output buffer, must not be NULL.
+// byteCount - capacity of `bytes` in bytes.
+//
+// Returns the number of bytes written. When the input could not be encoded completely into
+// the output buffer, errno is set to MINIPAL_ERROR_INSUFFICIENT_BUFFER and 0 is returned.
+//
+// NOTE(review): ENSURE_BUFFER_INC is defined outside this function; it presumably advances
+// pTarget and bails out with the same error when pAllocatedBufferEnd is passed - confirm.
+static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, unsigned char* bytes, size_t byteCount)
+{
+    assert(chars != NULL);
+    assert(byteCount >= 0);
+    assert(charCount >= 0);
+    assert(bytes != NULL);
+
+    // For fallback we may need a fallback buffer.
+    // We wait to initialize it though in case we don't have any broken input unicode
+    bool fallbackUsed = false;
+    CHAR16_T *pSrc = chars;
+    unsigned char *pTarget = bytes;
+
+    CHAR16_T *pEnd = pSrc + charCount;
+    unsigned char *pAllocatedBufferEnd = pTarget + byteCount;
+
+    // ch holds the code point being encoded, or a pending high surrogate between iterations;
+    // 0 means "nothing pending".
+    int ch = 0;
+    int chd;
+
+    // assume that JIT will enregister pSrc, pTarget and ch
+
+    while (true)
+    {
+        // SLOWLOOP: does all range checks, handles all special cases, but it is slow
+
+        if (pSrc >= pEnd)
+        {
+            if (ch == 0)
+            {
+                // Check if there's anything left to get out of the fallback buffer
+                ch = fallbackUsed ? EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder) : 0;
+                if (ch > 0) goto ProcessChar;
+            }
+            else
+            {
+                // Case of leftover surrogates in the fallback buffer
+                if (fallbackUsed && self->buffer.encoder.bFallingBack)
+                {
+                    assert(ch >= 0xD800 && ch <= 0xDBFF);
+
+                    int cha = ch;
+
+                    ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder);
+
+                    if (InRange(ch, LOW_SURROGATE_START, LOW_SURROGATE_END))
+                    {
+                        // combine the pending high surrogate with the low surrogate into one code point
+                        ch = ch + (cha << 10) + (0x10000 - LOW_SURROGATE_START - (HIGH_SURROGATE_START << 10));
+                        goto EncodeChar;
+                    }
+                    else if (ch > 0)
+                    {
+                        goto ProcessChar;
+                    }
+
+                    break;
+                }
+            }
+
+            // attempt to encode the partial surrogate (will fail or ignore)
+            if (ch > 0) goto EncodeChar;
+
+            // We're done
+            break;
+        }
+
+        if (ch > 0)
+        {
+            // We have a high surrogate left over from a previous loop.
+            assert(ch >= 0xD800 && ch <= 0xDBFF);
+
+            // use separate helper variables for local contexts so that the jit optimizations
+            // won't get confused about the variable lifetimes
+            int cha = *pSrc;
+
+            // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
+            if (InRange(cha, LOW_SURROGATE_START, LOW_SURROGATE_END))
+            {
+                ch = cha + (ch << 10) +
+                    (0x10000
+                    - LOW_SURROGATE_START
+                    - (HIGH_SURROGATE_START << 10));
+
+                pSrc++;
+            }
+            // else ch is still high surrogate and encoding will fail
+
+            // attempt to encode the surrogate or partial surrogate
+            goto EncodeChar;
+        }
+
+        // If we've used a fallback, then we have to check for it
+        if (fallbackUsed)
+        {
+            ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder);
+            if (ch > 0) goto ProcessChar;
+        }
+
+        // read next char. The JIT optimization seems to be getting confused when
+        // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
+        ch = *pSrc;
+        pSrc++;
+
+    ProcessChar:
+        // a high surrogate stays pending in ch; the next iteration looks for its low surrogate
+        if (InRange(ch, HIGH_SURROGATE_START, HIGH_SURROGATE_END)) continue;
+
+        // either good char or partial surrogate
+
+    EncodeChar:
+        // throw exception on partial surrogate if necessary
+        if (InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END))
+        {
+            // Lone surrogates aren't allowed, we have to do fallback for them
+            // Have to make a fallback buffer if we don't have one
+            if (!fallbackUsed)
+            {
+                // wait on fallbacks if we can
+                // For fallback we may need a fallback buffer
+                fallbackUsed = true;
+
+                // Set our internal fallback interesting things.
+                EncoderReplacementFallbackBuffer_InternalInitialize(&self->buffer.encoder, chars, pEnd, true);
+            }
+
+            // Do our fallback.  Actually we already know its a mixed up surrogate,
+            // so the ref pSrc isn't gonna do anything.
+            EncoderReplacementFallbackBuffer_InternalFallback(&self->buffer.encoder, (CHAR16_T)ch, &pSrc);
+
+            // Ignore it if we don't throw
+            ch = 0;
+            continue;
+        }
+
+        // Count bytes needed
+        int bytesNeeded = 1;
+        if (ch > 0x7F)
+        {
+            if (ch > 0x7FF)
+            {
+                if (ch > 0xFFFF)
+                {
+                    bytesNeeded++;  // 4 bytes (surrogate pair)
+                }
+                bytesNeeded++;      // 3 bytes (800-FFFF)
+            }
+            bytesNeeded++;          // 2 bytes (80-7FF)
+        }
+
+        if (pTarget > pAllocatedBufferEnd - bytesNeeded)
+        {
+            // Left over surrogate from last time will cause pSrc == chars, so we'll throw
+            if (fallbackUsed && self->buffer.encoder.bFallingBack)
+            {
+                EncoderReplacementFallbackBuffer_MovePrevious(&self->buffer.encoder);              // Didn't use this fallback char
+                if (ch > 0xFFFF)
+                    EncoderReplacementFallbackBuffer_MovePrevious(&self->buffer.encoder);          // Was surrogate, didn't use 2nd part either
+            }
+            else
+            {
+                pSrc--;                                     // Didn't use this char
+                if (ch > 0xFFFF)
+                    pSrc--;                                 // Was surrogate, didn't use 2nd part either
+            }
+
+            assert(pSrc >= chars || pTarget == bytes);
+
+            if (pTarget == bytes)  // Throw if we must
+            {
+                errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
+                return 0;
+            }
+            ch = 0;                                         // Nothing left over (we backed up to start of pair if supplementary)
+            break;
+        }
+
+        if (ch <= 0x7F)
+        {
+            *pTarget = (unsigned char)ch;
+        }
+        else
+        {
+            // use separate helper variables for local contexts so that the jit optimizations
+            // won't get confused about the variable lifetimes
+            int chb;
+            if (ch <= 0x7FF)
+            {
+                // 2 unsigned char encoding
+                chb = (unsigned char)(0xC0 | (ch >> 6));
+            }
+            else
+            {
+                if (ch <= 0xFFFF)
+                {
+                    chb = (unsigned char)(0xE0 | (ch >> 12));
+                }
+                else
+                {
+                    *pTarget = (unsigned char)(0xF0 | (ch >> 18));
+                    ENSURE_BUFFER_INC
+
+                    chb = 0x80 | ((ch >> 12) & 0x3F);
+                }
+                *pTarget = (unsigned char)chb;
+                ENSURE_BUFFER_INC
+
+                chb = 0x80 | ((ch >> 6) & 0x3F);
+            }
+            *pTarget = (unsigned char)chb;
+            ENSURE_BUFFER_INC
+
+            // trailing continuation byte; cast the whole expression, as the fast loop does
+            *pTarget = (unsigned char)(0x80 | (ch & 0x3F));
+        }
+
+        ENSURE_BUFFER_INC
+
+        // If still have fallback don't do fast loop
+        if (fallbackUsed && (ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder)) != 0)
+            goto ProcessChar;
+
+        int availableChars = pEnd - pSrc;
+        int availableBytes = pAllocatedBufferEnd - pTarget;
+
+        // don't fall into the fast decoding loop if we don't have enough characters
+        // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
+        if (availableChars <= 13)
+        {
+            // we are hoping for 1 unsigned char per char
+            if (availableBytes < availableChars)
+            {
+                // not enough output room.  no pending bits at this point
+                ch = 0;
+                continue;
+            }
+
+            // try to get over the remainder of the ascii characters fast though
+            CHAR16_T* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
+            while (pSrc < pLocalEnd)
+            {
+                ch = *pSrc;
+                pSrc++;
+
+                // Not ASCII, need more than 1 unsigned char per char
+                if (ch > 0x7F) goto ProcessChar;
+
+                *pTarget = (unsigned char)ch;
+                ENSURE_BUFFER_INC
+            }
+            // we are done, let ch be 0 to clear encoder
+            ch = 0;
+            break;
+        }
+
+        // we need at least 1 unsigned char per character, but Convert might allow us to convert
+        // only part of the input, so try as much as we can.  Reduce charCount if necessary
+        if (availableBytes < availableChars)
+        {
+            availableChars = availableBytes;
+        }
+
+        // FASTLOOP:
+        // - optimistic range checks
+        // - fallbacks to the slow loop for all special cases, exception throwing, etc.
+
+        // To compute the upper bound, assume that all characters are ASCII characters at this point,
+        //  the boundary will be decreased for every non-ASCII character we encounter
+        // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
+        // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
+        CHAR16_T *pStop = pSrc + availableChars - 5;
+
+        while (pSrc < pStop)
+        {
+            ch = *pSrc;
+            pSrc++;
+
+            if (ch > 0x7F) goto LongCode;
+
+            *pTarget = (unsigned char)ch;
+            ENSURE_BUFFER_INC
+
+            // get pSrc aligned
+            if (((size_t)pSrc & 0x2) != 0)
+            {
+                ch = *pSrc;
+                pSrc++;
+                if (ch > 0x7F) goto LongCode;
+
+                *pTarget = (unsigned char)ch;
+                ENSURE_BUFFER_INC
+            }
+
+            // Run 4 characters at a time!
+            while (pSrc < pStop)
+            {
+                // each int load picks up two adjacent UTF-16 code units
+                ch = *(int*)pSrc;
+                int chc = *(int*)(pSrc + 2);
+
+                // any bit outside the low 7 of either half means a non-ASCII code unit
+                if (((ch | chc) & (int)0xFF80FF80) != 0) goto LongCodeWithMask;
+
+                if (pTarget + 4 > pAllocatedBufferEnd)
+                {
+                    errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
+                    return 0;
+                }
+
+                // Unfortunately, this is endianness sensitive
+#if BIGENDIAN
+                if (!self->treatAsLE)
+                {
+                    *pTarget = (unsigned char)(ch >> 16);
+                    *(pTarget + 1) = (unsigned char)ch;
+                    pSrc += 4;
+                    *(pTarget + 2) = (unsigned char)(chc >> 16);
+                    *(pTarget + 3) = (unsigned char)chc;
+                    pTarget += 4;
+                }
+                else
+#endif
+                {
+                    *pTarget = (unsigned char)ch;
+                    *(pTarget + 1) = (unsigned char)(ch >> 16);
+                    pSrc += 4;
+                    *(pTarget + 2) = (unsigned char)chc;
+                    *(pTarget + 3) = (unsigned char)(chc >> 16);
+                    pTarget += 4;
+                }
+            }
+            continue;
+
+        LongCodeWithMask:
+        // extract the first code unit of the pair that was just loaded and consume it
+#if BIGENDIAN
+        // be careful about the sign extension
+        if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16);
+        else
+#endif
+        ch = (CHAR16_T)ch;
+        pSrc++;
+
+        if (ch > 0x7F) goto LongCode;
+
+        *pTarget = (unsigned char)ch;
+        ENSURE_BUFFER_INC
+        continue;
+
+        LongCode:
+            // use separate helper variables for slow and fast loop so that the jit optimizations
+            // won't get confused about the variable lifetimes
+            if (ch <= 0x7FF)
+            {
+                // 2 unsigned char encoding
+                chd = 0xC0 | (ch >> 6);
+            }
+            else
+            {
+                if (!InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END))
+                {
+                    // 3 unsigned char encoding
+                    chd = 0xE0 | (ch >> 12);
+                }
+                else
+                {
+                    // 4 unsigned char encoding - high surrogate + low surrogate
+                    if (ch > HIGH_SURROGATE_END)
+                    {
+                        // low without high -> bad, try again in slow loop
+                        pSrc -= 1;
+                        break;
+                    }
+
+                    chd = *pSrc;
+                    pSrc++;
+
+                    if (!InRange(chd, LOW_SURROGATE_START, LOW_SURROGATE_END))
+                    {
+                        // high not followed by low -> bad, try again in slow loop
+                        pSrc -= 2;
+                        break;
+                    }
+
+                    ch = chd + (ch << 10) +
+                        (0x10000
+                        - LOW_SURROGATE_START
+                        - (HIGH_SURROGATE_START << 10));
+
+                    *pTarget = (unsigned char)(0xF0 | (ch >> 18));
+                    // pStop - this unsigned char is compensated by the second surrogate character
+                    // 2 input chars require 4 output bytes.  2 have been anticipated already
+                    // and 2 more will be accounted for by the 2 pStop-- calls below.
+                    ENSURE_BUFFER_INC
+
+                    chd = 0x80 | ((ch >> 12) & 0x3F);
+                }
+                *pTarget = (unsigned char)chd;
+                pStop--;                    // 3 unsigned char sequence for 1 char, so need pStop-- and the one below too.
+                ENSURE_BUFFER_INC
+
+                chd = 0x80 | ((ch >> 6) & 0x3F);
+            }
+            *pTarget = (unsigned char)chd;
+            pStop--;                        // 2 unsigned char sequence for 1 char so need pStop--.
+            ENSURE_BUFFER_INC
+
+            *pTarget = (unsigned char)(0x80 | (ch & 0x3F));
+            // pStop - this unsigned char is already included
+            ENSURE_BUFFER_INC
+        }
+
+        assert(pTarget <= pAllocatedBufferEnd);
+
+        // no pending char at this point
+        ch = 0;
+    }
+
+    // Input left unconsumed means the output buffer was too small for the whole conversion.
+    if (pSrc < pEnd)
+    {
+        errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
+        return 0;
+    }
+
+    // The function returns size_t; do not narrow the pointer difference through int,
+    // which would corrupt counts above INT_MAX.
+    return (size_t)(pTarget - bytes);
+}
+
+// Computes how many UTF-8 bytes are needed to encode `count` UTF-16 code units from `chars`.
+//
+// The running total starts at `count` (one byte per code unit) and is adjusted for every
+// code unit that needs more: +1 for a 2-byte sequence, +2 for a 3-byte sequence, and a
+// valid surrogate pair ends up contributing 4 bytes in total. Unpaired surrogates are handed
+// to the encoder replacement fallback buffer in self->buffer.encoder and the characters it
+// yields are counted in their place.
+//
+// self  - encoder state providing the fallback buffer (and treatAsLE on BIGENDIAN builds).
+// chars - UTF-16 input.
+// count - number of UTF-16 code units in `chars`.
+//
+// Returns the computed byte count. No errno assignment appears in this function's own body.
+static size_t GetByteCount(UTF8Encoding* self, CHAR16_T *chars, size_t count)
+{
+    // For fallback we may need a fallback buffer.
+    // We wait to initialize it though in case we don't have any broken input unicode
+    bool fallbackUsed = false;
+    CHAR16_T *pSrc = chars;
+    CHAR16_T *pEnd = pSrc + count;
+
+    // Start by assuming we have as many as count
+    size_t byteCount = count;
+
+    // ch is the code unit being examined, or a pending high surrogate between iterations;
+    // 0 means "nothing pending".
+    int ch = 0;
+
+    while (true)
+    {
+        // SLOWLOOP: does all range checks, handles all special cases, but it is slow
+        if (pSrc >= pEnd)
+        {
+
+            if (ch == 0)
+            {
+                // Unroll any fallback that happens at the end
+                ch = fallbackUsed ? EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder) : 0;
+                if (ch > 0)
+                {
+                    // fallback chars are not part of the initial `count`, so reserve a byte for this one
+                    byteCount++;
+                    goto ProcessChar;
+                }
+            }
+            else
+            {
+                // Case of surrogates in the fallback.
+                if (fallbackUsed && self->buffer.encoder.bFallingBack)
+                {
+                    assert(ch >= 0xD800 && ch <= 0xDBFF);
+
+                    ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder);
+                    byteCount++;
+
+                    if (InRange(ch, LOW_SURROGATE_START, LOW_SURROGATE_END))
+                    {
+                        // any value above 0x7FF that is not a surrogate works here: only the size class matters
+                        ch = 0xfffd;
+                        byteCount++;
+                        goto EncodeChar;
+                    }
+                    else if (ch > 0)
+                    {
+                        goto ProcessChar;
+                    }
+                    else
+                    {
+                        byteCount--; // ignore last one.
+                        break;
+                    }
+                }
+            }
+
+            if (ch <= 0)
+            {
+                break;
+            }
+
+            // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
+            byteCount++;
+            goto EncodeChar;
+        }
+
+        if (ch > 0)
+        {
+            assert(ch >= 0xD800 && ch <= 0xDBFF);
+
+            // use separate helper variables for local contexts so that the jit optimizations
+            // won't get confused about the variable lifetimes
+            int cha = *pSrc;
+
+            // count the pending surrogate
+            byteCount++;
+
+            // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
+            if (InRange(cha, LOW_SURROGATE_START, LOW_SURROGATE_END))
+            {
+                // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
+                ch = 0xfffd;
+                //                        ch = cha + (ch << 10) +
+                //                            (0x10000
+                //                            - LOW_SURROGATE_START
+                //                            - (HIGH_SURROGATE_START << 10) );
+
+                // Use this next char
+                pSrc++;
+            }
+            // else ch is still high surrogate and encoding will fail (so don't add count)
+
+            // attempt to encode the surrogate or partial surrogate
+            goto EncodeChar;
+        }
+
+        // If we've used a fallback, then we have to check for it
+        if (fallbackUsed)
+        {
+            ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder);
+            if (ch > 0)
+            {
+                // We have an extra byte we weren't expecting.
+                byteCount++;
+                goto ProcessChar;
+            }
+        }
+
+        // read next char. The JIT optimization seems to be getting confused when
+        // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
+        ch = *pSrc;
+        pSrc++;
+
+    ProcessChar:
+        if (InRange(ch, HIGH_SURROGATE_START, HIGH_SURROGATE_END))
+        {
+            // we will count this surrogate next time around
+            byteCount--;
+            continue;
+        }
+        // either good char or partial surrogate
+
+    EncodeChar:
+        // throw exception on partial surrogate if necessary
+        if (InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END))
+        {
+            // Lone surrogates aren't allowed
+            // Have to make a fallback buffer if we don't have one
+            if (!fallbackUsed)
+            {
+                // wait on fallbacks if we can
+                // For fallback we may need a fallback buffer
+                fallbackUsed = true;
+
+                // Set our internal fallback interesting things.
+                EncoderReplacementFallbackBuffer_InternalInitialize(&self->buffer.encoder, chars, chars + count, false);
+            }
+
+            // Do our fallback.  Actually we already know its a mixed up surrogate,
+            // so the ref pSrc isn't gonna do anything.
+            EncoderReplacementFallbackBuffer_InternalFallback(&self->buffer.encoder, (CHAR16_T)ch, &pSrc);
+
+            // Ignore it if we don't throw (we had preallocated this ch)
+            byteCount--;
+            ch = 0;
+            continue;
+        }
+
+        // Count them
+        if (ch > 0x7F)
+        {
+            if (ch > 0x7FF)
+            {
+                // the extra surrogate byte was compensated by the second surrogate character
+                // (2 surrogates make 4 bytes.  We've already counted 2 bytes, 1 per char)
+                byteCount++;
+            }
+            byteCount++;
+        }
+
+#if WIN64
+        // check for overflow
+        // NOTE(review): byteCount is declared as size_t above, so this comparison can never
+        // be true as written.
+        if (byteCount < 0)
+        {
+            break;
+        }
+#endif
+
+        // If still have fallback don't do fast loop
+        if (fallbackUsed && (ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder)) != 0)
+        {
+            // We're reserving 1 byte for each char by default
+            byteCount++;
+            goto ProcessChar;
+        }
+
+        int availableChars = pEnd - pSrc;
+
+        // don't fall into the fast decoding loop if we don't have enough characters
+        if (availableChars <= 13)
+        {
+            // try to get over the remainder of the ascii characters fast though
+            // (ASCII needs no adjustment: its single byte is already part of the initial count)
+            CHAR16_T* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
+            while (pSrc < pLocalEnd)
+            {
+                ch = *pSrc;
+                pSrc++;
+                if (ch > 0x7F) goto ProcessChar;
+            }
+
+            // we are done
+            break;
+        }
+
+#if WIN64
+        // make sure that we won't get a silent overflow inside the fast loop
+        // (Fall out to slow loop if we have this many characters)
+        availableChars &= 0x0FFFFFFF;
+#endif
+
+        // To compute the upper bound, assume that all characters are ASCII characters at this point,
+        //  the boundary will be decreased for every non-ASCII character we encounter
+        // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
+        CHAR16_T *pStop = pSrc + availableChars - (3 + 4);
+
+        while (pSrc < pStop)
+        {
+            ch = *pSrc;
+            pSrc++;
+
+            if (ch > 0x7F)                                                  // Not ASCII
+            {
+                if (ch > 0x7FF)                                             // Not 2 Byte
+                {
+                    if ((ch & 0xF800) == 0xD800)                            // See if its a Surrogate
+                        goto LongCode;
+                    byteCount++;
+                }
+                byteCount++;
+            }
+
+            // get pSrc aligned
+            if (((size_t)pSrc & 0x2) != 0)
+            {
+                ch = *pSrc;
+                pSrc++;
+                if (ch > 0x7F)                                              // Not ASCII
+                {
+                    if (ch > 0x7FF)                                         // Not 2 Byte
+                    {
+                        if ((ch & 0xF800) == 0xD800)                        // See if its a Surrogate
+                            goto LongCode;
+                        byteCount++;
+                    }
+                    byteCount++;
+                }
+            }
+
+            // Run 2 * 4 characters at a time!
+            while (pSrc < pStop)
+            {
+                // each int load picks up two adjacent code units
+                // (assumes a 16-bit CHAR16_T and a 32-bit int - TODO confirm)
+                ch = *(int*)pSrc;
+                int chc = *(int*)(pSrc + 2);
+                if (((ch | chc) & (int)0xFF80FF80) != 0)         // See if not ASCII
+                {
+                    if (((ch | chc) & (int)0xF800F800) != 0)     // See if not 2 Byte
+                    {
+                        goto LongCodeWithMask;
+                    }
+
+
+                    // all four code units are below 0x800 here; add one byte for each non-ASCII one
+                    if ((ch & (int)0xFF800000) != 0)             // Actually 0x07800780 is all we care about (4 bits)
+                        byteCount++;
+                    if ((ch & (int)0xFF80) != 0)
+                        byteCount++;
+                    if ((chc & (int)0xFF800000) != 0)
+                        byteCount++;
+                    if ((chc & (int)0xFF80) != 0)
+                        byteCount++;
+                }
+                pSrc += 4;
+
+                // second group of four, same checks as above
+                ch = *(int*)pSrc;
+                chc = *(int*)(pSrc + 2);
+                if (((ch | chc) & (int)0xFF80FF80) != 0)         // See if not ASCII
+                {
+                    if (((ch | chc) & (int)0xF800F800) != 0)     // See if not 2 Byte
+                    {
+                        goto LongCodeWithMask;
+                    }
+
+                    if ((ch & (int)0xFF800000) != 0)
+                        byteCount++;
+                    if ((ch & (int)0xFF80) != 0)
+                        byteCount++;
+                    if ((chc & (int)0xFF800000) != 0)
+                        byteCount++;
+                    if ((chc & (int)0xFF80) != 0)
+                        byteCount++;
+                }
+                pSrc += 4;
+            }
+            break;
+
+        LongCodeWithMask:
+        // extract the first code unit of the pair that was just loaded and consume it
+#if BIGENDIAN
+        // be careful about the sign extension
+        if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16);
+        else
+#endif
+        ch = (CHAR16_T)ch;
+
+        pSrc++;
+
+        if (ch <= 0x7F)
+        {
+            continue;
+        }
+
+        LongCode:
+            // use separate helper variables for slow and fast loop so that the jit optimizations
+            // won't get confused about the variable lifetimes
+            if (ch > 0x7FF)
+            {
+                if (InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END))
+                {
+                    // 4 byte encoding - high surrogate + low surrogate
+
+                    int chd = *pSrc;
+                    if (
+                        ch > HIGH_SURROGATE_END ||
+                        !InRange(chd, LOW_SURROGATE_START, LOW_SURROGATE_END))
+                    {
+                        // Back up and drop out to slow loop to figure out error
+                        pSrc--;
+                        break;
+                    }
+                    pSrc++;
+
+                    // byteCount - this byte is compensated by the second surrogate character
+                }
+                byteCount++;
+            }
+            byteCount++;
+
+            // byteCount - the last byte is already included
+        }
+
+        // no pending char at this point
+        ch = 0;
+    }
+
+#if WIN64
+    // check for overflow
+    // NOTE(review): always true for the unsigned size_t byteCount.
+    assert(byteCount >= 0);
+#endif
+    assert(!fallbackUsed || self->buffer.encoder.fallbackCount < 0);
+
+    return byteCount;
+}
+
+// Reports the UTF-16 length needed to hold the conversion of a UTF-8 buffer.
+//
+// errno is reset to 0 on entry. An empty input yields 0 without touching the decoder.
+// MINIPAL_MB_NO_REPLACE_INVALID_CHARS in `flags` turns the decoder's fallback off; on
+// BIGENDIAN builds MINIPAL_TREAT_AS_LITTLE_ENDIAN is forwarded as treatAsLE.
+// The result is whatever GetCharCount computes for the input.
+size_t minipal_get_length_utf8_to_utf16(const char* source, size_t sourceLength, unsigned int flags)
+{
+    errno = 0;
+
+    // Nothing to measure.
+    if (sourceLength == 0)
+    {
+        return 0;
+    }
+
+    UTF8Encoding decoderState =
+    {
+        .buffer =
+        {
+            .decoder =
+            {
+                .fallbackCount = -1,
+                .fallbackIndex = -1,
+                .strDefault = { 0xFFFD, 0 },
+                .strDefaultLength = 1
+            }
+        },
+        .useFallback = (flags & MINIPAL_MB_NO_REPLACE_INVALID_CHARS) == 0,
+#if BIGENDIAN
+        .treatAsLE = (flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN)
+#endif
+    };
+
+    unsigned char* utf8Input = (unsigned char*)source;
+    return GetCharCount(&decoderState, utf8Input, sourceLength);
+}
+
+// Reports the UTF-8 length needed to hold the conversion of a UTF-16 buffer.
+//
+// errno is reset to 0 on entry. An empty input yields 0 without touching the encoder.
+// The encoder always runs with its fallback enabled; `flags` is only consulted on BIGENDIAN
+// builds, where MINIPAL_TREAT_AS_LITTLE_ENDIAN is forwarded as treatAsLE.
+// The result is whatever GetByteCount computes for the input.
+size_t minipal_get_length_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength, unsigned int flags)
+{
+#if !BIGENDIAN
+    (void)flags; // unused
+#endif
+
+    errno = 0;
+
+    // Nothing to measure.
+    if (sourceLength == 0)
+    {
+        return 0;
+    }
+
+    UTF8Encoding encoderState =
+    {
+        .buffer =
+        {
+            .encoder =
+            {
+                .fallbackCount = -1,
+                .fallbackIndex = -1,
+                // the replacement char (0xFFFD) appears twice so a surrogate pair can be covered
+                .strDefault = { 0xFFFD, 0xFFFD, 0 },
+                .strDefaultLength = 2
+            }
+        },
+        .useFallback = true,
+#if BIGENDIAN
+        .treatAsLE = (flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN)
+#endif
+    };
+
+    CHAR16_T* utf16Input = (CHAR16_T*)source;
+    return GetByteCount(&encoderState, utf16Input, sourceLength);
+}
+
+// Converts a UTF-8 buffer to UTF-16 into caller-provided storage.
+//
+// errno is reset to 0 on entry. An empty input yields 0 without touching the decoder.
+// MINIPAL_MB_NO_REPLACE_INVALID_CHARS in `flags` turns the decoder's fallback off; on
+// BIGENDIAN builds MINIPAL_TREAT_AS_LITTLE_ENDIAN is forwarded as treatAsLE.
+// Returns the count produced by GetChars, or 0 whenever errno is non-zero afterwards.
+size_t minipal_convert_utf8_to_utf16(const char* source, size_t sourceLength, CHAR16_T* destination, size_t destinationLength, unsigned int flags)
+{
+    errno = 0;
+
+    // Nothing to convert.
+    if (sourceLength == 0)
+    {
+        return 0;
+    }
+
+    UTF8Encoding decoderState =
+    {
+        .buffer =
+        {
+            .decoder =
+            {
+                .fallbackCount = -1,
+                .fallbackIndex = -1,
+                .strDefault = { 0xFFFD, 0 },
+                .strDefaultLength = 1
+            }
+        },
+        .useFallback = (flags & MINIPAL_MB_NO_REPLACE_INVALID_CHARS) == 0,
+#if BIGENDIAN
+        .treatAsLE = (flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN)
+#endif
+    };
+
+    size_t written = GetChars(&decoderState, (unsigned char*)source, sourceLength, destination, destinationLength);
+
+    // A failure is signalled through errno; never report a partial count alongside it.
+    return errno ? 0 : written;
+}
+
+// Converts a UTF-16 buffer to UTF-8 into caller-provided storage.
+//
+// errno is reset to 0 on entry. An empty input yields 0 without touching the encoder.
+// The encoder always runs with its fallback enabled; `flags` is only consulted on BIGENDIAN
+// builds, where MINIPAL_TREAT_AS_LITTLE_ENDIAN is forwarded as treatAsLE.
+// Returns the count produced by GetBytes, or 0 whenever errno is non-zero afterwards.
+size_t minipal_convert_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength, char* destination, size_t destinationLength, unsigned int flags)
+{
+#if !BIGENDIAN
+    (void)flags; // unused
+#endif
+
+    errno = 0;
+
+    // Nothing to convert.
+    if (sourceLength == 0)
+    {
+        return 0;
+    }
+
+    UTF8Encoding encoderState =
+    {
+        .buffer =
+        {
+            .encoder =
+            {
+                .fallbackCount = -1,
+                .fallbackIndex = -1,
+                // the replacement char (0xFFFD) appears twice so a surrogate pair can be covered
+                .strDefault = { 0xFFFD, 0xFFFD, 0 },
+                .strDefaultLength = 2
+            }
+        },
+        .useFallback = true,
+#if BIGENDIAN
+        .treatAsLE = (flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN)
+#endif
+    };
+
+    size_t written = GetBytes(&encoderState, (CHAR16_T*)source, sourceLength, (unsigned char*)destination, destinationLength);
+
+    // A failure is signalled through errno; never report a partial count alongside it.
+    return errno ? 0 : written;
+}
diff --git a/src/native/minipal/utf8.h b/src/native/minipal/utf8.h

new file mode 100644 (file)

index 0000000..bd648f1
--- /dev/null
+++ b/src/native/minipal/utf8.h
@@ -0,0 +1,75 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+#ifndef HAVE_MINIPAL_UTF8_H
+#define HAVE_MINIPAL_UTF8_H
+
+#include <minipal/utils.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+// Flag bits accepted in the `flags` argument of the minipal UTF-8/UTF-16 converters.
+// When set, minipal_convert_utf8_to_utf16 turns off its U+FFFD replacement fallback.
+#define MINIPAL_MB_NO_REPLACE_INVALID_CHARS 0x00000008
+// Consulted only in BIGENDIAN builds, where it populates the converter's treatAsLE field.
+// NOTE(review): 0x16 has three bits set (0x10 | 0x04 | 0x02), so a flags value carrying 0x02 or 0x04 also tests true;
+// if a single-bit flag was intended this was probably meant to be 0x00000010 - confirm before changing.
+#define MINIPAL_TREAT_AS_LITTLE_ENDIAN 0x00000016
+// Error codes; the numeric values equal Win32 ERROR_INSUFFICIENT_BUFFER (122) and ERROR_NO_UNICODE_TRANSLATION (1113).
+// NOTE(review): presumably stored in errno by the converters on failure - confirm in utf8.c.
+#define MINIPAL_ERROR_INSUFFICIENT_BUFFER 122L
+#define MINIPAL_ERROR_NO_UNICODE_TRANSLATION 1113L
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif // __cplusplus
+
+#ifdef TARGET_WINDOWS
+typedef wchar_t CHAR16_T; // UTF-16 code unit type when TARGET_WINDOWS is defined
+#else
+typedef unsigned short CHAR16_T; // UTF-16 code unit type for every other target
+#endif
+
+/**
+ * Get length of destination needed for UTF-8 to UTF-16 (UCS-2) conversion
+ *
+ * @param source The source string in UTF-8 format.
+ * @param sourceLength Length of the source string, counted in char elements.
+ * @param flags Bitwise combination of flags that alter the behavior of the converter. Supported flags are MINIPAL_MB_NO_REPLACE_INVALID_CHARS and MINIPAL_TREAT_AS_LITTLE_ENDIAN.
+ * @return Length of UTF-16 buffer required by the conversion (presumably counted in CHAR16_T elements - confirm against the implementation).
+ */
+size_t minipal_get_length_utf8_to_utf16(const char* source, size_t sourceLength, unsigned int flags);
+
+/**
+ * Get length of destination needed for UTF-16 (UCS-2) to UTF-8 conversion
+ *
+ * @param source The source string in UTF-16 format.
+ * @param sourceLength Length of the source string, counted in CHAR16_T elements.
+ * @param flags Flags to alter the behavior of the converter. MINIPAL_TREAT_AS_LITTLE_ENDIAN is the documented endianness flag. NOTE(review): the UTF-16 length routine in utf8.c appears to hard-code useFallback = true and to read flags only under BIGENDIAN, so MINIPAL_MB_NO_REPLACE_INVALID_CHARS looks to have no effect here - confirm.
+ * @return Length of UTF-8 buffer required by the conversion.
+ */
+size_t minipal_get_length_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength, unsigned int flags);
+
+/**
+ * Convert a string from UTF-8 to UTF-16 (UCS-2) with preallocated memory
+ *
+ * @param source The source string in UTF-8 format.
+ * @param sourceLength Length of the source string, counted in char elements.
+ * @param destination Pointer to the destination UTF-16 string. It can be NULL to query number of items required by the conversion.
+ * @param destinationLength Length of the destination string.
+ * @param flags Flags to alter the behavior of the converter. MINIPAL_MB_NO_REPLACE_INVALID_CHARS disables the U+FFFD replacement fallback; MINIPAL_TREAT_AS_LITTLE_ENDIAN is read only in BIGENDIAN builds.
+ * @return Number of items written by the conversion. Returns 0 when sourceLength is 0 or when errno is non-zero after the conversion; errno is reset to 0 on entry.
+ */
+size_t minipal_convert_utf8_to_utf16(const char* source, size_t sourceLength, CHAR16_T* destination, size_t destinationLength, unsigned int flags);
+
+/**
+ * Convert a string from UTF-16 (UCS-2) to UTF-8 with preallocated memory
+ *
+ * @param source The source string in UTF-16 format.
+ * @param sourceLength Length of the source string, counted in CHAR16_T elements.
+ * @param destination Pointer to the destination UTF-8 string. It can be NULL to query number of items required by the conversion.
+ * @param destinationLength Length of the destination string.
+ * @param flags Flags to alter the behavior of the converter. Only MINIPAL_TREAT_AS_LITTLE_ENDIAN is read, and only in BIGENDIAN builds; the implementation always enables the U+FFFD fallback and does not consult MINIPAL_MB_NO_REPLACE_INVALID_CHARS.
+ * @return Number of items written by the conversion. Returns 0 when sourceLength is 0 or when errno is non-zero after the conversion; errno is reset to 0 on entry.
+ */
+size_t minipal_convert_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength, char* destination, size_t destinationLength, unsigned int flags);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif /* HAVE_MINIPAL_UTF8_H */
author	Adeel Mujahid <3840695+am11@users.noreply.github.com>
	Thu, 22 Jun 2023 13:30:16 +0000 (16:30 +0300)
committer	GitHub <noreply@github.com>
	Thu, 22 Jun 2023 13:30:16 +0000 (06:30 -0700)
src/coreclr/inc/utilcode.h		patch \| blob \| history
src/coreclr/pal/src/CMakeLists.txt		patch \| blob \| history
src/coreclr/pal/src/include/pal/utf8.h	[deleted file]	patch \| blob \| history
src/coreclr/pal/src/locale/unicode.cpp		patch \| blob \| history
src/coreclr/pal/src/locale/utf8.cpp	[deleted file]	patch \| blob \| history
src/coreclr/vm/rtlfunctions.cpp		patch \| blob \| history
src/mono/mono/eglib/CMakeLists.txt		patch \| blob \| history
src/mono/mono/eglib/giconv.c		patch \| blob \| history
src/mono/mono/eglib/glib.h		patch \| blob \| history
src/mono/mono/eglib/test/utf8.c		patch \| blob \| history
src/mono/mono/metadata/object.c		patch \| blob \| history
src/mono/mono/mini/CMakeLists.txt		patch \| blob \| history
src/native/minipal/utf8.c	[new file with mode: 0644]	patch \| blob
src/native/minipal/utf8.h	[new file with mode: 0644]	patch \| blob