Link="Common\DisableRuntimeMarshalling.cs" />
<Compile Include="$(CommonPath)System\Runtime\InteropServices\HandleRefMarshaller.cs"
Link="Common\System\Runtime\InteropServices\HandleRefMarshaller.cs" />
+ </ItemGroup>
+
+ <ItemGroup Condition="$([MSBuild]::IsTargetFrameworkCompatible('$(TargetFramework)', 'net8.0'))">
<ProjectReference Include="..\..\System.Text.RegularExpressions\gen\System.Text.RegularExpressions.Generator.csproj"
SetTargetFramework="TargetFramework=netstandard2.0"
OutputItemType="Analyzer"
<ProjectReference Include="$(LibrariesProjectRoot)System.Diagnostics.PerformanceCounter\src\System.Diagnostics.PerformanceCounter.csproj" />
</ItemGroup>
- <ItemGroup Condition="$([MSBuild]::IsTargetFrameworkCompatible('$(TargetFramework)', 'net7.0'))">
+ <ItemGroup Condition="$([MSBuild]::IsTargetFrameworkCompatible('$(TargetFramework)', 'net8.0'))">
<ProjectReference Include="..\..\System.Text.RegularExpressions\gen\System.Text.RegularExpressions.Generator.csproj"
SetTargetFramework="TargetFramework=netstandard2.0"
OutputItemType="Analyzer"
// characters _not_ contained in the set, and then do a search for the inverse of that, which will be
// all of the target ASCII characters and all of non-ASCII.
var asciiChars = new List<char>();
- for (int i = 0; i <= 0x7f; i++)
+ for (int i = 0; i < 128; i++)
{
if (!RegexCharClass.CharInClass((char)i, set))
{
}
}
+ // If this is a known set, use a predetermined simple name for the helper.
string? helperName = set switch
{
RegexCharClass.DigitClass => "IndexOfAnyDigit",
_ => null,
};
+
+ // If this set is just from a few Unicode categories, derive a name from the categories.
+ if (helperName is null)
+ {
+ Span<UnicodeCategory> categories = stackalloc UnicodeCategory[5]; // arbitrary limit to keep names from being too unwieldy
+ if (RegexCharClass.TryGetOnlyCategories(set, categories, out int numCategories, out bool negatedCategory))
+ {
+ helperName = $"IndexOfAny{(negatedCategory ? "Except" : "")}{string.Concat(categories.Slice(0, numCategories).ToArray().Select(c => c.ToString()))}";
+ }
+ }
+
+ // As a final fallback, manufacture a name unique to the full set description.
if (helperName is null)
{
using (SHA256 sha = SHA256.Create())
$" int i = span.IndexOfAnyExcept({EmitIndexOfAnyValues(asciiChars.ToArray(), requiredHelpers)});");
lines.Add($" if ((uint)i < (uint)span.Length)");
lines.Add($" {{");
- lines.Add($" if (span[i] <= 0x7f)");
+ lines.Add($" if (char.IsAscii(span[i]))");
lines.Add($" {{");
lines.Add($" return i;");
lines.Add($" }}");
// Use IndexOf{Any} to accelerate the skip loop via vectorization to match the first prefix.
// But we avoid using it for the relatively common case of the starting set being '.', aka anything other than
- // a newline, as it's very rare to have long, uninterrupted sequences of newlines.
+ // a newline, as it's very rare to have long, uninterrupted sequences of newlines. And we avoid using it
+ // for the case of the starting set being anything (e.g. '.' with SingleLine), as in that case it'll always match
+ // the first char.
int setIndex = 0;
- bool canUseIndexOf = primarySet.Set != RegexCharClass.NotNewLineClass;
+ bool canUseIndexOf =
+ primarySet.Set != RegexCharClass.NotNewLineClass &&
+ primarySet.Set != RegexCharClass.AnyClass;
bool needLoop = !canUseIndexOf || setsToUse > 1;
FinishEmitBlock loopBlock = default;
return false;
}
- return CharInCategory(ch, set, start, setLength, categoryLength);
+ return CharInCategory(ch, set.AsSpan(SetStartIndex + start + setLength, categoryLength));
}
- private static bool CharInCategory(char ch, string set, int start, int setLength, int categoryLength)
+ private static bool CharInCategory(char ch, ReadOnlySpan<char> categorySetSegment)
{
UnicodeCategory chcategory = char.GetUnicodeCategory(ch);
- int i = start + SetStartIndex + setLength;
- int end = i + categoryLength;
- while (i < end)
+ for (int i = 0; i < categorySetSegment.Length; i++)
{
- int curcat = (short)set[i];
+ int curcat = (short)categorySetSegment[i];
if (curcat == 0)
{
// zero is our marker for a group of categories - treated as a unit
- if (CharInCategoryGroup(chcategory, set, ref i))
+ if (CharInCategoryGroup(chcategory, categorySetSegment, ref i))
{
return true;
}
return true;
}
}
-
- i++;
}
return false;
/// This is used for categories which are composed of other categories - L, N, Z, W...
/// These groups need special treatment when they are negated
/// </summary>
- private static bool CharInCategoryGroup(UnicodeCategory chcategory, string category, ref int i)
+ private static bool CharInCategoryGroup(UnicodeCategory chcategory, ReadOnlySpan<char> category, ref int i)
{
int pos = i + 1;
int curcat = (short)category[pos];
}
}
}
+
+ // If the class now has a range that includes everything, and if it doesn't have subtraction,
+ // we can remove all of its categories, as they're duplicative (the set already includes everything).
+ if (!_negate &&
+ _subtractor is null &&
+ _categories?.Length > 0 &&
+ rangelist.Count == 1 && rangelist[0].First == 0 && rangelist[0].Last == LastChar)
+ {
+ _categories.Clear();
+ }
+
+ // If there's only a single character omitted from ranges, if there's no subtractor, and if there are categories,
+ // see if that character is in the categories. If it is, then we can replace whole thing with a complete "any" range.
+ // If it's not, then we can remove the categories, as they're only duplicating the rest of the range, turning the set
+ // into a "not one". This primarily helps in the case of a synthesized set from analysis that ends up combining '.' with
+ // categories, as we want to reduce that set down to either [^\n] or [\0-\uFFFF]. (This can be extrapolated to any number
+ // of missing characters; in fact, categories in general are superfluous and the entire set can be represented as ranges.
+ // But categories serve as a space optimization, and we strike a balance between testing many characters and the time/complexity
+ // it takes to do so. Thus, we limit this to the common case of a single missing character.)
+ if (!_negate &&
+ _subtractor is null &&
+ _categories?.Length > 0 &&
+ rangelist.Count == 2 && rangelist[0].First == 0 && rangelist[0].Last + 2 == rangelist[1].First && rangelist[1].Last == LastChar)
+ {
+ var vsb = new ValueStringBuilder(stackalloc char[256]);
+ foreach (ReadOnlyMemory<char> chunk in _categories!.GetChunks())
+ {
+ vsb.Append(chunk.Span);
+ }
+
+ if (CharInCategory((char)(rangelist[0].Last + 1), vsb.AsSpan()))
+ {
+ rangelist.RemoveAt(1);
+ rangelist[0] = ('\0', LastChar);
+ }
+ else
+ {
+ _negate = true;
+ rangelist.RemoveAt(1);
+ char notOne = (char)(rangelist[0].Last + 1);
+ rangelist[0] = (notOne, notOne);
+ }
+ _categories.Clear();
+
+ vsb.Dispose();
+ }
}
}
void RenderRanges()
{
- for (; index < SetStartIndex + set[SetLengthIndex]; index += 2)
+ int rangesEnd = SetStartIndex + set[SetLengthIndex];
+ while (index < rangesEnd)
{
ch1 = set[index];
- ch2 = index + 1 < set.Length ?
- (char)(set[index + 1] - 1) :
- LastChar;
+ if (index + 1 < rangesEnd)
+ {
+ ch2 = (char)(set[index + 1] - 1);
+ index += 2;
+ }
+ else
+ {
+ ch2 = LastChar;
+ index++;
+ }
desc.Append(DescribeChar(ch1));
using System.Globalization;
using System.Reflection;
using System.Reflection.Emit;
+using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading;
private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!;
private static readonly MethodInfo s_arrayResize = typeof(Array).GetMethod("Resize")!.MakeGenericMethod(typeof(int));
private static readonly MethodInfo s_mathMinIntInt = typeof(Math).GetMethod("Min", new Type[] { typeof(int), typeof(int) })!;
+ private static readonly MethodInfo s_memoryMarshalGetArrayDataReferenceIndexOfAnyValues = typeof(MemoryMarshal).GetMethod("GetArrayDataReference", new Type[] { Type.MakeGenericMethodParameter(0).MakeArrayType() })!.MakeGenericMethod(typeof(IndexOfAnyValues<char>))!;
// Note:
// Single-range helpers like IsAsciiLetterLower, IsAsciiLetterUpper, IsAsciiDigit, and IsBetween aren't used here, as the IL generated for those
// single-range checks is as cheap as the method call, and there's no readability issue as with the source generator.
// Use IndexOf{Any} to accelerate the skip loop via vectorization to match the first prefix.
// But we avoid using it for the relatively common case of the starting set being '.', aka anything other than
- // a newline, as it's very rare to have long, uninterrupted sequences of newlines.
+ // a newline, as it's very rare to have long, uninterrupted sequences of newlines. And we avoid using it
+ // for the case of the starting set being anything (e.g. '.' with SingleLine), as in that case it'll always match
+ // the first char.
int setIndex = 0;
- bool canUseIndexOf = primarySet.Set != RegexCharClass.NotNewLineClass;
+ bool canUseIndexOf =
+ primarySet.Set != RegexCharClass.NotNewLineClass &&
+ primarySet.Set != RegexCharClass.AnyClass;
bool needLoop = !canUseIndexOf || setsToUse > 1;
Label checkSpanLengthLabel = default;
int index = list.Count;
list.Add(IndexOfAnyValues.Create(chars));
- // this._indexOfAnyValues[index]
+ // Logically do _indexOfAnyValues[index], but avoid the bounds check on accessing the array,
+ // and cast to the known derived sealed type to enable devirtualization.
+
+ // DerivedIndexOfAnyValues d = Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(this._indexOfAnyValues), index);
+ // ... = d;
Ldthisfld(s_indexOfAnyValuesArrayField);
- Ldc(index);
- _ilg!.Emit(OpCodes.Ldelem_Ref);
+ Call(s_memoryMarshalGetArrayDataReferenceIndexOfAnyValues);
+ Ldc(index * IntPtr.Size);
+ Add();
+ _ilg!.Emit(OpCodes.Ldind_Ref);
+ LocalBuilder ioavLocal = _ilg!.DeclareLocal(list[index].GetType());
+ Stloc(ioavLocal);
+ Ldloc(ioavLocal);
}
}
}