}
/// <summary>Adds a SearchValues instance declaration to the required helpers collection.</summary>
- private static string EmitSearchValues(char[] asciiChars, Dictionary<string, string[]> requiredHelpers)
+ private static string EmitSearchValues(char[] chars, Dictionary<string, string[]> requiredHelpers)
{
- Debug.Assert(RegexCharClass.IsAscii(asciiChars));
+ Array.Sort(chars);
- // The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key.
- byte[] bitmap = new byte[16];
- foreach (char c in asciiChars)
+ string fieldName;
+ if (RegexCharClass.IsAscii(chars))
{
- bitmap[c >> 3] |= (byte)(1 << (c & 7));
+ // The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key.
+ var bitmap = new byte[16];
+ foreach (char c in chars)
+ {
+ bitmap[c >> 3] |= (byte)(1 << (c & 7));
+ }
+
+ string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty);
+
+ fieldName = hexBitmap switch
+ {
+ "FFFFFFFF000000000000000000000080" => "s_asciiControl",
+ "000000000000FF030000000000000000" => "s_asciiDigits",
+ "0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters",
+ "000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits",
+ "000000000000FF037E0000007E000000" => "s_asciiHexDigits",
+ "000000000000FF03000000007E000000" => "s_asciiHexDigitsLower",
+ "000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper",
+ "00000000EEF7008C010000B800000028" => "s_asciiPunctuation",
+ "00000000010000000000000000000000" => "s_asciiSeparators",
+ "00000000100800700000004001000050" => "s_asciiSymbols",
+ "003E0000010000000000000000000000" => "s_asciiWhiteSpace",
+ "000000000000FF03FEFFFF87FEFFFF07" => "s_asciiWordChars",
+
+ "00000000FFFFFFFFFFFFFFFFFFFFFF7F" => "s_asciiExceptControl",
+ "FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF" => "s_asciiExceptDigits",
+ "FFFFFFFFFFFFFFFF010000F8010000F8" => "s_asciiExceptLetters",
+ "FFFFFFFFFFFF00FC010000F8010000F8" => "s_asciiExceptLettersAndDigits",
+ "FFFFFFFFFFFFFFFFFFFFFFFF010000F8" => "s_asciiExceptLower",
+ "FFFFFFFF1108FF73FEFFFF47FFFFFFD7" => "s_asciiExceptPunctuation",
+ "FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptSeparators",
+ "FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF" => "s_asciiExceptSymbols",
+ "FFFFFFFFFFFFFFFF010000F8FFFFFFFF" => "s_asciiExceptUpper",
+ "FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptWhiteSpace",
+ "FFFFFFFFFFFF00FC01000078010000F8" => "s_asciiExceptWordChars",
+
+ _ => $"s_ascii_{hexBitmap.TrimStart('0')}"
+ };
}
-
- string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty);
-
- string fieldName = hexBitmap switch
+ else
{
- "FFFFFFFF000000000000000000000080" => "s_asciiControl",
- "000000000000FF030000000000000000" => "s_asciiDigits",
- "0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters",
- "000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits",
- "000000000000FF037E0000007E000000" => "s_asciiHexDigits",
- "000000000000FF03000000007E000000" => "s_asciiHexDigitsLower",
- "000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper",
- "00000000EEF7008C010000B800000028" => "s_asciiPunctuation",
- "00000000010000000000000000000000" => "s_asciiSeparators",
- "00000000100800700000004001000050" => "s_asciiSymbols",
- "003E0000010000000000000000000000" => "s_asciiWhiteSpace",
- "000000000000FF03FEFFFF87FEFFFF07" => "s_asciiWordChars",
-
- "00000000FFFFFFFFFFFFFFFFFFFFFF7F" => "s_asciiExceptControl",
- "FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF" => "s_asciiExceptDigits",
- "FFFFFFFFFFFFFFFF010000F8010000F8" => "s_asciiExceptLetters",
- "FFFFFFFFFFFF00FC010000F8010000F8" => "s_asciiExceptLettersAndDigits",
- "FFFFFFFFFFFFFFFFFFFFFFFF010000F8" => "s_asciiExceptLower",
- "FFFFFFFF1108FF73FEFFFF47FFFFFFD7" => "s_asciiExceptPunctuation",
- "FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptSeparators",
- "FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF" => "s_asciiExceptSymbols",
- "FFFFFFFFFFFFFFFF010000F8FFFFFFFF" => "s_asciiExceptUpper",
- "FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptWhiteSpace",
- "FFFFFFFFFFFF00FC01000078010000F8" => "s_asciiExceptWordChars",
-
- _ => $"s_ascii_{hexBitmap.TrimStart('0')}"
- };
+ using (SHA256 sha = SHA256.Create())
+ {
+#pragma warning disable CA1850 // SHA256.HashData isn't available on netstandard2.0
+ fieldName = $"s_nonAscii_{BitConverter.ToString(sha.ComputeHash(Encoding.UTF8.GetBytes(chars))).Replace("-", "")}";
+#pragma warning restore CA1850
+ }
+ }
if (!requiredHelpers.ContainsKey(fieldName))
{
- Array.Sort(asciiChars);
-
- string setLiteral = Literal(new string(asciiChars));
+ string setLiteral = Literal(new string(chars));
requiredHelpers.Add(fieldName, new string[]
{
// a sequential walk). In order to do that search, we actually build up a set for all of the ASCII
// characters _not_ contained in the set, and then do a search for the inverse of that, which will be
// all of the target ASCII characters and all of non-ASCII.
- var asciiChars = new List<char>();
+ var excludedAsciiChars = new List<char>();
for (int i = 0; i < 128; i++)
{
if (!RegexCharClass.CharInClass((char)i, set))
{
- asciiChars.Add((char)i);
+ excludedAsciiChars.Add((char)i);
}
}
lines.Add($"internal static int {helperName}(this ReadOnlySpan<char> span)");
lines.Add($"{{");
int uncheckedStart = lines.Count;
- lines.Add(asciiChars.Count == 128 ?
+ lines.Add(excludedAsciiChars.Count == 128 ?
$" int i = span.IndexOfAnyExceptInRange('\0', '\u007f');" :
- $" int i = span.IndexOfAnyExcept({EmitSearchValues(asciiChars.ToArray(), requiredHelpers)});");
+ $" int i = span.IndexOfAnyExcept({EmitSearchValues(excludedAsciiChars.ToArray(), requiredHelpers)});");
lines.Add($" if ((uint)i < (uint)span.Length)");
lines.Add($" {{");
lines.Add($" if (char.IsAscii(span[i]))");
string indexOf;
if (primarySet.Chars is not null)
{
+ Debug.Assert(primarySet.Chars.Length > 0);
+
// We have a chars array, so we can use IndexOf{Any}{Except} to search for it. Choose the best overload.
string indexOfName = "IndexOf", indexOfAnyName = "IndexOfAny";
if (primarySet.Negated)
indexOf = primarySet.Chars.Length switch
{
+ // 1, 2, 3 have dedicated optimized IndexOfAny overloads
1 => $"{span}.{indexOfName}({Literal(primarySet.Chars[0])})",
2 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})",
3 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})",
- _ => $"{span}.{indexOfAnyName}({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",
+
+ // 4, 5 have dedicated optimized IndexOfAny overloads accessible via the ReadOnlySpan<char> overload,
+ // but can also be handled via SearchValues
+ 4 or 5 => $"{span}.{indexOfAnyName}({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",
+
+ // > 5 can only be handled efficiently via SearchValues
+ _ => $"{span}.{indexOfAnyName}({EmitSearchValues(primarySet.Chars, requiredHelpers)})",
};
}
- else if (primarySet.AsciiSet is not null)
- {
- // We have a set of ASCII chars, so we can use IndexOfAny(SearchValues) to search for it.
- Debug.Assert(!primarySet.Negated);
- indexOf = $"{span}.IndexOfAny({EmitSearchValues(primarySet.AsciiSet, requiredHelpers)})";
- }
else if (primarySet.Range is not null)
{
// We have a range, so we can use IndexOfAny{Except}InRange to search for it. In the corner case,
}
else
{
- // We have an arbitrary set of characters that includes at least one non-ASCII char. We use a custom IndexOfAny helper that
- // will perform the search as efficiently as possible.
+ // We have an arbitrary set of characters that's really large or otherwise not enumerable.
+ // We use a custom IndexOfAny helper that will perform the search as efficiently as possible.
indexOf = $"{span}.{EmitIndexOfAnyCustomHelper(primarySet.Set, requiredHelpers, checkOverflow)}()";
}
TryFindRawFixedSets(root, results, ref distance, thorough);
#if DEBUG
results.ForEach(r => Debug.Assert(
- !r.Negated && r.Chars is null && r.AsciiSet is null && r.Range is null,
+ !r.Negated && r.Chars is null && r.Range is null,
$"{nameof(TryFindRawFixedSets)} should have only populated {nameof(r.Set)} and {nameof(r.Distance)}"));
#endif
// For every entry, try to get the chars that make up the set, if there are few enough.
// For any for which we couldn't get the small chars list, see if we can get other useful info.
- Span<char> scratch = stackalloc char[5]; // max efficiently optimized by IndexOfAny today
+ Span<char> scratch = stackalloc char[128]; // limit based on what's currently efficiently handled by SearchValues
for (int i = 0; i < results.Count; i++)
{
RegexFindOptimizations.FixedDistanceSet result = results[i];
result.Negated = RegexCharClass.IsNegated(result.Set);
int count = RegexCharClass.GetSetChars(result.Set, scratch);
-
if (count > 0)
{
result.Chars = scratch.Slice(0, count).ToArray();
}
- if (thorough)
+ // Prefer IndexOfAnyInRange over IndexOfAny for sets of more than 2 values (or sets whose chars couldn't be enumerated) that fit in a single range.
+ if (thorough &&
+ (result.Chars is null || result.Chars.Length > 2) &&
+ RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive))
{
- // Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range.
- if ((result.Chars is null || count > 2) && RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive))
- {
- result.Chars = null;
- result.Range = (lowInclusive, highInclusive);
- }
- else if (result.Chars is null && !result.Negated && RegexCharClass.TryGetAsciiSetChars(result.Set, out char[]? asciiChars))
- {
- result.AsciiSet = asciiChars;
- }
+ result.Chars = null;
+ result.Range = (lowInclusive, highInclusive);
}
results[i] = result;
// for the fastest and that have the best chance of matching as few false positives as possible.
results.Sort(static (s1, s2) =>
{
- char[]? s1Chars = s1.Chars ?? s1.AsciiSet;
- char[]? s2Chars = s2.Chars ?? s2.AsciiSet;
+ char[]? s1Chars = s1.Chars;
+ char[]? s2Chars = s2.Chars;
int s1CharsLength = s1Chars?.Length ?? 0;
int s2CharsLength = s2Chars?.Length ?? 0;
bool s1Negated = s1.Negated;