// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
-// See the LICENSE file in the project root for more information.
/*++
unicode/utf8.c
Abstract:
- Functions to encode and decode UTF-8 strings. This is a port of the C# version from mscorlib.
+ Functions to encode and decode UTF-8 strings. This is a port of the C# version from Utf8Encoding.cs.
Revision History:
#define FASTLOOP
+#ifndef COUNTOF
+#define COUNTOF(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
struct CharUnicodeInfo
{
static const WCHAR HIGH_SURROGATE_START = 0xd800;
return (c & 0xFC00) == CharUnicodeInfo::LOW_SURROGATE_START;
}
- // Test if the wide character is a low surrogate
+ // Test if the wide character is a surrogate half
static bool IsSurrogate(const WCHAR c)
{
return (c & 0xF800) == CharUnicodeInfo::HIGH_SURROGATE_START;
return IsLowSurrogate(s[index]);
}
- // Test if the wide character is a low surrogate
+ // Test if the wide character is a surrogate half
static bool IsSurrogate(const WCHAR* s, int index)
{
return IsSurrogate(s[index]);
if (bFoundHigh)
throw ArgumentException("String 'replacement' contains invalid Unicode code points.", "replacement");
- wcscpy_s(strDefault, sizeof(strDefault), replacement);
+ wcscpy_s(strDefault, COUNTOF(strDefault), replacement);
strDefaultLength = replacementLength;
}
else
{
// Low surrogate
- if (bHighSurrogate == false)
+ if (!bHighSurrogate)
throw ArgumentException("String 'chars' contains invalid Unicode code points.");
bHighSurrogate = false;
}
else
{
// Low surrogate
- if (bHighSurrogate == false)
+ if (!bHighSurrogate)
throw ArgumentException("String 'chars' contains invalid Unicode code points.");
bHighSurrogate = false;
}
// Construction
DecoderReplacementFallbackBuffer(DecoderReplacementFallback* fallback)
{
- wcscpy_s(strDefault, sizeof(strDefault), fallback->GetDefaultString());
+ wcscpy_s(strDefault, COUNTOF(strDefault), fallback->GetDefaultString());
strDefaultLength = PAL_wcslen((const WCHAR *)fallback->GetDefaultString());
}
WCHAR GetCharUnknownHigh()
{
return (charUnknownHigh);
- }
+ }
WCHAR GetCharUnknownLow()
{
if (bFoundHigh)
throw ArgumentException("String 'replacement' contains invalid Unicode code points.", "replacement");
- wcscpy_s(strDefault, sizeof(strDefault), replacement);
+ wcscpy_s(strDefault, COUNTOF(strDefault), replacement);
strDefaultLength = replacementLength;
}
EncoderReplacementFallbackBuffer(EncoderReplacementFallback* fallback)
{
// 2X in case we're a surrogate pair
- wcscpy_s(strDefault, sizeof(strDefault), fallback->GetDefaultString());
- wcscat_s(strDefault, sizeof(strDefault), fallback->GetDefaultString());
+ wcscpy_s(strDefault, COUNTOF(strDefault), fallback->GetDefaultString());
+ wcscat_s(strDefault, COUNTOF(strDefault), fallback->GetDefaultString());
strDefaultLength = 2 * PAL_wcslen((const WCHAR *)fallback->GetDefaultString());
}
{
// Get our byte[]
BYTE* pStart = *pSrc;
- BYTE* bytesUnknown;
- int size = GetBytesUnknown(pStart, ch, &bytesUnknown);
+ BYTE bytesUnknown[3];
+ int size = GetBytesUnknown(pStart, ch, bytesUnknown);
// Do the actual fallback
if (!fallback->InternalFallback(bytesUnknown, *pSrc, pTarget, size))
int FallbackInvalidByteSequence(BYTE* pSrc, int ch, DecoderFallbackBuffer *fallback)
{
// Get our byte[]
- BYTE *bytesUnknown;
- int size = GetBytesUnknown(pSrc, ch, &bytesUnknown);
+ BYTE bytesUnknown[3];
+ int size = GetBytesUnknown(pSrc, ch, bytesUnknown);
// Do the actual fallback
int count = fallback->InternalFallback(bytesUnknown, pSrc, size);
return count;
}
- int GetBytesUnknown(BYTE* pSrc, int ch, BYTE **bytesUnknown)
+ int GetBytesUnknown(BYTE* pSrc, int ch, BYTE* bytesUnknown)
{
int size;
- BYTE bytes[3];
// See if it was a plain char
// (have to check >= 0 because we have all sorts of weird bit flags)
if (ch < 0x100 && ch >= 0)
{
pSrc--;
- bytes[0] = (BYTE)ch;
+ bytesUnknown[0] = (BYTE)ch;
size = 1;
}
// See if it's an unfinished 2 byte sequence
else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0)
{
pSrc--;
- bytes[0] = (BYTE)((ch & 0x1F) | 0xc0);
+ bytesUnknown[0] = (BYTE)((ch & 0x1F) | 0xc0);
size = 1;
}
// So now we're either 2nd byte of 3 or 4 byte sequence or
{
// 3rd byte of 4 byte sequence
pSrc -= 3;
- bytes[0] = (BYTE)(((ch >> 12) & 0x07) | 0xF0);
- bytes[1] = (BYTE)(((ch >> 6) & 0x3F) | 0x80);
- bytes[2] = (BYTE)(((ch)& 0x3F) | 0x80);
+ bytesUnknown[0] = (BYTE)(((ch >> 12) & 0x07) | 0xF0);
+ bytesUnknown[1] = (BYTE)(((ch >> 6) & 0x3F) | 0x80);
+ bytesUnknown[2] = (BYTE)(((ch)& 0x3F) | 0x80);
size = 3;
}
else if ((ch & (FinalByte >> 12)) != 0)
{
// 2nd byte of a 4 byte sequence
pSrc -= 2;
- bytes[0] = (BYTE)(((ch >> 6) & 0x07) | 0xF0);
- bytes[1] = (BYTE)(((ch)& 0x3F) | 0x80);
+ bytesUnknown[0] = (BYTE)(((ch >> 6) & 0x07) | 0xF0);
+ bytesUnknown[1] = (BYTE)(((ch)& 0x3F) | 0x80);
size = 2;
}
else
{
// 4th byte of a 4 byte sequence
pSrc--;
- bytes[0] = (BYTE)(((ch)& 0x07) | 0xF0);
+ bytesUnknown[0] = (BYTE)(((ch)& 0x07) | 0xF0);
size = 1;
}
}
{
// So it's 2nd byte of a 3 byte sequence
pSrc -= 2;
- bytes[0] = (BYTE)(((ch >> 6) & 0x0F) | 0xE0);
- bytes[1] = (BYTE)(((ch)& 0x3F) | 0x80);
+ bytesUnknown[0] = (BYTE)(((ch >> 6) & 0x0F) | 0xE0);
+ bytesUnknown[1] = (BYTE)(((ch)& 0x3F) | 0x80);
size = 2;
}
else
{
// 1st byte of a 3 byte sequence
pSrc--;
- bytes[0] = (BYTE)(((ch)& 0x0F) | 0xE0);
+ bytesUnknown[0] = (BYTE)(((ch)& 0x0F) | 0xE0);
size = 1;
}
}
- *bytesUnknown = bytes;
return size;
}
int ch = 0;
DecoderFallbackBuffer *fallback = nullptr;
- for (;;)
+ while (true)
{
// SLOWLOOP: does all range checks, handles all special cases, but it is slow
if (pSrc >= pEnd) {
}
// get pSrc 2-byte aligned
- if (((int)pSrc & 0x1) != 0) {
+ if (((size_t)pSrc & 0x1) != 0) {
ch = *pSrc;
pSrc++;
if (ch > 0x7F) {
}
// get pSrc 4-byte aligned
- if (((int)pSrc & 0x2) != 0) {
+ if (((size_t)pSrc & 0x2) != 0) {
ch = *(USHORT*)pSrc;
if ((ch & 0x8080) != 0) {
goto LongCodeWithMask16;
DecoderFallbackBuffer *fallback = nullptr;
- for (;;)
+ while (true)
{
// SLOWLOOP: does all range checks, handles all special cases, but it is slow
fallback = decoderFallback->CreateFallbackBuffer();
fallback->InternalInitialize(bytes, pAllocatedBufferEnd);
}
-
+
// That'll back us up the appropriate # of bytes if we didn't get anywhere
if (!FallbackInvalidByteSequence(&pSrc, ch, fallback, &pTarget))
{
pSrc--;
// Throw that we don't have enough room (pSrc could be < bytes if we had started to process
- // a 4 byte sequence alredy)
+ // a 4 byte sequence already)
Contract::Assert(pSrc >= bytes || pTarget == chars,
"[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]");
ThrowCharsOverflow(pTarget == chars);
pTarget++;
// get pSrc to be 2-byte aligned
- if ((((int)pSrc) & 0x1) != 0) {
+ if ((((size_t)pSrc) & 0x1) != 0) {
ch = *pSrc;
pSrc++;
if (ch > 0x7F) {
}
// get pSrc to be 4-byte aligned
- if ((((int)pSrc) & 0x2) != 0) {
+ if ((((size_t)pSrc) & 0x2) != 0) {
ch = *(USHORT*)pSrc;
if ((ch & 0x8080) != 0) {
goto LongCodeWithMask16;
// extra byte, we're already planning 2 chars for 2 of these bytes,
// but the big loop is testing the target against pStop, so we need
- // to subtract 2 more or we risk overrunning the input. Subtract
+ // to subtract 2 more or we risk overrunning the input. Subtract
// one here and one below.
pStop--;
}
// assume that JIT will enregister pSrc, pTarget and ch
- for (;;) {
+ while (true) {
// SLOWLOOP: does all range checks, handles all special cases, but it is slow
if (pSrc >= pEnd) {
ch = 0;
}
- InternalDelete(fallbackBuffer);
+ InternalDelete(fallbackBuffer);
return (int)(pTarget - bytes);
}
int ch = 0;
- for (;;) {
+ while (true) {
// SLOWLOOP: does all range checks, handles all special cases, but it is slow
if (pSrc >= pEnd) {
}
// get pSrc aligned
- if (((int)pSrc & 0x2) != 0) {
+ if (((size_t)pSrc & 0x2) != 0) {
ch = *pSrc;
pSrc++;
if (ch > 0x7F) // Not ASCII