From 80800eb0ead17dc724275d426904755b3bc64139 Mon Sep 17 00:00:00 2001 From: stephentoub Date: Tue, 20 Oct 2015 15:34:38 -0400 Subject: [PATCH] Improve string.{Last}IndexOf perf on Unix for Ordinal/OrdinalIgnoreCase Our current implementation of IndexOfOrdinal for strings on Unix uses Substring to get the piece of the source string we care about; this results in an unnecessary allocation / string copy. When using OrdinalIgnoreCase, we also convert both the source and search strings to upper-case using ToUpperInvariant, resulting in more allocations. And our LastIndexOfOrdinal implementation delegates to IndexOfOrdinal repeatedly, incurring such allocations potentially multiple times. This change reimplements Ordinal searching in managed code to not use Substring, and it implements OrdinalIgnoreCase searching via new functions exposed in the native globalization shim, so as to use ICU without having to make managed/native transitions for each character. With the changes, {Last}IndexOf with Ordinal/OrdinalIgnoreCase are now allocateion-free (as you'd expect), and throughput when startIndex/count and/or OrdinalIgnoreCase are used is increased significantly, on my machine anywhere from 20% to 3x, depending on the inputs. --- .../System.Globalization.Native/collation.cpp | 78 +++++++++++++++++++++ .../Interop.Collation.cs | 6 ++ .../System/Globalization/CompareInfo.Unix.cs | 80 ++++++++++++++-------- 3 files changed, 135 insertions(+), 29 deletions(-) diff --git a/src/corefx/System.Globalization.Native/collation.cpp b/src/corefx/System.Globalization.Native/collation.cpp index 7cf32b9..d82d8d7 100644 --- a/src/corefx/System.Globalization.Native/collation.cpp +++ b/src/corefx/System.Globalization.Native/collation.cpp @@ -126,6 +126,84 @@ extern "C" int32_t LastIndexOf( } /* +Static Function: +AreEqualOrdinalIgnoreCase +*/ +static bool AreEqualOrdinalIgnoreCase(UChar one, UChar two) +{ + // Return whether the two characters are identical or would be identical if they were upper-cased. + + if (one == two) + { + return true; + } + + if (one == 0x0131 || two == 0x0131) + { + // On Windows with InvariantCulture, the LATIN SMALL LETTER DOTLESS I (U+0131) + // capitalizes to itself, whereas with ICU it capitalizes to LATIN CAPITAL LETTER I (U+0049). + // We special case it to match the Windows invariant behavior. + return false; + } + + return u_toupper(one) == u_toupper(two); +} + +/* +Function: +IndexOfOrdinalIgnoreCase +*/ +extern "C" int32_t +IndexOfOrdinalIgnoreCase(const UChar* lpTarget, int32_t cwTargetLength, const UChar* lpSource, int32_t cwSourceLength) +{ + int32_t endIndex = cwSourceLength - cwTargetLength; + assert(endIndex >= 0); + + for (int32_t i = 0; i <= endIndex; i++) + { + int32_t targetIdx = 0; + for (int32_t srcIdx = i; targetIdx < cwTargetLength; srcIdx++, targetIdx++) { + if (!AreEqualOrdinalIgnoreCase(lpSource[srcIdx], lpTarget[targetIdx])) { + break; + } + } + + if (targetIdx == cwTargetLength) { + return i; + } + } + + return -1; +} + +/* +Function: +LastIndexOfOrdinalIgnoreCase +*/ +extern "C" int32_t +LastIndexOfOrdinalIgnoreCase(const UChar* lpTarget, int32_t cwTargetLength, const UChar* lpSource, int32_t cwSourceLength) +{ + int32_t endIndex = cwSourceLength - cwTargetLength; + assert(endIndex >= 0); + + for (int32_t i = endIndex; i >= 0; i--) + { + int32_t targetIdx = 0; + for (int32_t srcIdx = i; targetIdx < cwTargetLength; srcIdx++, targetIdx++) { + if (!AreEqualOrdinalIgnoreCase(lpSource[srcIdx], lpTarget[targetIdx])) { + break; + } + } + + if (targetIdx == cwTargetLength) { + return i; + } + } + + return -1; +} + +/* Return value is a "Win32 BOOL" (1 = true, 0 = false) */ extern "C" int32_t StartsWith( diff --git a/src/mscorlib/corefx/Interop/Unix/System.Globalization.Native/Interop.Collation.cs b/src/mscorlib/corefx/Interop/Unix/System.Globalization.Native/Interop.Collation.cs index b600fa5..1a3654e 100644 --- a/src/mscorlib/corefx/Interop/Unix/System.Globalization.Native/Interop.Collation.cs +++ b/src/mscorlib/corefx/Interop/Unix/System.Globalization.Native/Interop.Collation.cs @@ -19,6 +19,12 @@ internal static partial class Interop internal unsafe static extern int LastIndexOf(byte[] localeName, string target, char* pSource, int cwSourceLength, CompareOptions options); [DllImport(Libraries.GlobalizationInterop, CharSet = CharSet.Unicode)] + internal unsafe static extern int IndexOfOrdinalIgnoreCase(string target, int cwTargetLength, char* pSource, int cwSourceLength); + + [DllImport(Libraries.GlobalizationInterop, CharSet = CharSet.Unicode)] + internal unsafe static extern int LastIndexOfOrdinalIgnoreCase(string target, int cwTargetLength, char* pSource, int cwSourceLength); + + [DllImport(Libraries.GlobalizationInterop, CharSet = CharSet.Unicode)] [return: MarshalAs(UnmanagedType.Bool)] internal unsafe static extern bool StartsWith(byte[] localeName, string target, string source, int cwSourceLength, CompareOptions options); diff --git a/src/mscorlib/corefx/System/Globalization/CompareInfo.Unix.cs b/src/mscorlib/corefx/System/Globalization/CompareInfo.Unix.cs index ba658d0..8df1d79 100644 --- a/src/mscorlib/corefx/System/Globalization/CompareInfo.Unix.cs +++ b/src/mscorlib/corefx/System/Globalization/CompareInfo.Unix.cs @@ -20,7 +20,7 @@ namespace System.Globalization m_sortNameAsUtf8 = System.Text.Encoding.UTF8.GetBytes(m_sortName); } - internal static int IndexOfOrdinal(string source, string value, int startIndex, int count, bool ignoreCase) + internal static unsafe int IndexOfOrdinal(string source, string value, int startIndex, int count, bool ignoreCase) { Contract.Assert(source != null); Contract.Assert(value != null); @@ -30,33 +30,41 @@ namespace System.Globalization return startIndex; } - // TODO (dotnet/corefx#3468): Move this into the shim so we don't have to do the ToUpper or call substring. + if (count < value.Length) + { + return -1; + } if (ignoreCase) { - source = source.ToUpper(CultureInfo.InvariantCulture); - value = value.ToUpper(CultureInfo.InvariantCulture); + fixed (char* pSource = source) + { + int index = Interop.GlobalizationInterop.IndexOfOrdinalIgnoreCase(value, value.Length, pSource + startIndex, count); + return index != -1 ? + startIndex + index : + -1; + } } - source = source.Substring(startIndex, count); - - for (int i = 0; i + value.Length <= source.Length; i++) + int endIndex = startIndex + (count - value.Length); + for (int i = startIndex; i <= endIndex; i++) { - for (int j = 0; j < value.Length; j++) { - if (source[i + j] != value[j]) { - break; - } + int valueIndex, sourceIndex; + + for (valueIndex = 0, sourceIndex = i; + valueIndex < value.Length && source[sourceIndex] == value[valueIndex]; + valueIndex++, sourceIndex++) ; - if (j == value.Length - 1) { - return i + startIndex; - } + if (valueIndex == value.Length) + { + return i; } } return -1; } - internal static int LastIndexOfOrdinal(string source, string value, int startIndex, int count, bool ignoreCase) + internal static unsafe int LastIndexOfOrdinal(string source, string value, int startIndex, int count, bool ignoreCase) { Contract.Assert(source != null); Contract.Assert(value != null); @@ -66,27 +74,41 @@ namespace System.Globalization return startIndex; } - // TODO (dotnet/corefx#3468): Move this into the shim so we don't have to do the ToUpper or call substring. + if (count < value.Length) + { + return -1; + } + + // startIndex is the index into source where we start search backwards from. + // leftStartIndex is the index into source of the start of the string that is + // count characters away from startIndex. + int leftStartIndex = startIndex - count + 1; if (ignoreCase) { - source = source.ToUpper(CultureInfo.InvariantCulture); - value = value.ToUpper(CultureInfo.InvariantCulture); + fixed (char* pSource = source) + { + int lastIndex = Interop.GlobalizationInterop.LastIndexOfOrdinalIgnoreCase(value, value.Length, pSource + leftStartIndex, count); + return lastIndex != -1 ? + leftStartIndex + lastIndex : + -1; + } } - source = source.Substring(startIndex - count + 1, count); + for (int i = startIndex - value.Length + 1; i >= leftStartIndex; i--) + { + int valueIndex, sourceIndex; - int last = -1; + for (valueIndex = 0, sourceIndex = i; + valueIndex < value.Length && source[sourceIndex] == value[valueIndex]; + valueIndex++, sourceIndex++) ; - int cur = 0; - while ((cur = IndexOfOrdinal(source, value, last + 1, source.Length - last - 1, false)) != -1) - { - last = cur; + if (valueIndex == value.Length) { + return i; + } } - return last >= 0 ? - last + startIndex - count + 1 : - -1; + return -1; } private unsafe int GetHashCodeOfStringCore(string source, CompareOptions options) @@ -138,9 +160,9 @@ namespace System.Globalization fixed (char* pSource = source) { - int lastIndex = Interop.GlobalizationInterop.IndexOf(m_sortNameAsUtf8, target, pSource + startIndex, count, options); + int index = Interop.GlobalizationInterop.IndexOf(m_sortNameAsUtf8, target, pSource + startIndex, count, options); - return lastIndex != -1 ? lastIndex + startIndex : -1; + return index != -1 ? index + startIndex : -1; } } -- 2.7.4