libstdc++: Fix handling of invalid ranges in std::regex [PR102447]
authorJonathan Wakely <jwakely@redhat.com>
Tue, 14 Dec 2021 14:32:35 +0000 (14:32 +0000)
committerJonathan Wakely <jwakely@redhat.com>
Tue, 14 Dec 2021 21:45:46 +0000 (21:45 +0000)
std::regex currently allows invalid bracket ranges such as [\w-a] which
are only allowed by ECMAScript when in web browser compatibility mode.
It should be an error, because the start of the range is a character
class, not a single character. The current implementation of
_Compiler::_M_expression_term does not provide a way to reject this,
because we only remember a previous character, not whether we just
processed a character class (or collating symbol etc.)

This patch replaces the pair<bool, CharT> used to emulate
optional<CharT> with a custom class closer to pair<tribool,CharT>. That
allows us to track three states, so that we can tell when we've just
seen a character class.

With this additional state the code in _M_expression_term for processing
the _S_token_bracket_dash can be improved to correctly reject the [\w-a]
case, without regressing for valid cases such as [\w-] and [----].

libstdc++-v3/ChangeLog:

PR libstdc++/102447
* include/bits/regex_compiler.h (_Compiler::_BracketState): New
class.
(_Compiler::_BrackeyMatcher): New alias template.
(_Compiler::_M_expression_term): Change pair<bool, CharT>
parameter to _BracketState. Process first character for
ECMAScript syntax as well as POSIX.
* include/bits/regex_compiler.tcc
(_Compiler::_M_insert_bracket_matcher): Pass _BracketState.
(_Compiler::_M_expression_term): Use _BracketState to store
state between calls. Improve handling of dashes in ranges.
* testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc:
Add more tests for ranges containing dashes. Check invalid
ranges with character class at the beginning.

libstdc++-v3/include/bits/regex_compiler.h
libstdc++-v3/include/bits/regex_compiler.tcc
libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc

index 88c60c2..06cb48f 100644 (file)
@@ -121,13 +121,45 @@ namespace __detail
        void
        _M_insert_bracket_matcher(bool __neg);
 
-      // Returns true if successfully matched one term and should continue.
+      // Cache of the last atom seen in a bracketed range expression.
+      struct _BracketState
+      {
+       enum class _Type : char { _None, _Char, _Class } _M_type = _Type::_None;
+       _CharT _M_char;
+
+       void
+       set(_CharT __c) noexcept { _M_type = _Type::_Char; _M_char = __c; }
+
+       _GLIBCXX_NODISCARD _CharT
+       get() const noexcept { return _M_char; }
+
+       void
+       reset(_Type __t = _Type::_None) noexcept { _M_type = __t; }
+
+       explicit operator bool() const noexcept
+       { return _M_type != _Type::_None; }
+
+       // Previous token was a single character.
+       _GLIBCXX_NODISCARD bool
+       _M_is_char() const noexcept { return _M_type == _Type::_Char; }
+
+       // Previous token was a character class, equivalent class,
+       // collating symbol etc.
+       _GLIBCXX_NODISCARD bool
+       _M_is_class() const noexcept { return _M_type == _Type::_Class; }
+      };
+
+      template<bool __icase, bool __collate>
+       using _BracketMatcher
+         = std::__detail::_BracketMatcher<_TraitsT, __icase, __collate>;
+
+      // Returns true if successfully parsed one term and should continue
+      // compiling a bracket expression.
       // Returns false if the compiler should move on.
       template<bool __icase, bool __collate>
        bool
-       _M_expression_term(pair<bool, _CharT>& __last_char,
-                          _BracketMatcher<_TraitsT, __icase, __collate>&
-                          __matcher);
+       _M_expression_term(_BracketState& __last_char,
+                          _BracketMatcher<__icase, __collate>& __matcher);
 
       int
       _M_cur_int_value(int __radix);
index 0e2e132..9000aec 100644 (file)
@@ -403,7 +403,7 @@ namespace __detail
     _M_insert_character_class_matcher()
     {
       __glibcxx_assert(_M_value.size() == 1);
-      _BracketMatcher<_TraitsT, __icase, __collate> __matcher
+      _BracketMatcher<__icase, __collate> __matcher
        (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits);
       __matcher._M_add_character_class(_M_value, false);
       __matcher._M_ready();
@@ -417,26 +417,17 @@ namespace __detail
     _Compiler<_TraitsT>::
     _M_insert_bracket_matcher(bool __neg)
     {
-      _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits);
-      pair<bool, _CharT> __last_char; // Optional<_CharT>
-      __last_char.first = false;
-      if (!(_M_flags & regex_constants::ECMAScript))
-       {
-         if (_M_try_char())
-           {
-             __last_char.first = true;
-             __last_char.second = _M_value[0];
-           }
-         else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
-           {
-             __last_char.first = true;
-             __last_char.second = '-';
-           }
-       }
+      _BracketMatcher<__icase, __collate> __matcher(__neg, _M_traits);
+      _BracketState __last_char;
+      if (_M_try_char())
+       __last_char.set(_M_value[0]);
+      else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
+       // Dash as first character is a normal character.
+       __last_char.set('-');
       while (_M_expression_term(__last_char, __matcher))
        ;
-      if (__last_char.first)
-       __matcher._M_add_char(__last_char.second);
+      if (__last_char._M_is_char())
+       __matcher._M_add_char(__last_char.get());
       __matcher._M_ready();
       _M_stack.push(_StateSeqT(
                      *_M_nfa,
@@ -447,27 +438,27 @@ namespace __detail
   template<bool __icase, bool __collate>
     bool
     _Compiler<_TraitsT>::
-    _M_expression_term(pair<bool, _CharT>& __last_char,
-                      _BracketMatcher<_TraitsT, __icase, __collate>& __matcher)
+    _M_expression_term(_BracketState& __last_char,
+                      _BracketMatcher<__icase, __collate>& __matcher)
     {
       if (_M_match_token(_ScannerT::_S_token_bracket_end))
        return false;
 
+      // Add any previously cached char into the matcher and update cache.
       const auto __push_char = [&](_CharT __ch)
       {
-       if (__last_char.first)
-         __matcher._M_add_char(__last_char.second);
-       else
-         __last_char.first = true;
-       __last_char.second = __ch;
+       if (__last_char._M_is_char())
+         __matcher._M_add_char(__last_char.get());
+       __last_char.set(__ch);
       };
-      const auto __flush = [&]
+      // Add any previously cached char into the matcher and update cache.
+      const auto __push_class = [&]
       {
-       if (__last_char.first)
-         {
-           __matcher._M_add_char(__last_char.second);
-           __last_char.first = false;
-         }
+        if (__last_char._M_is_char())
+         __matcher._M_add_char(__last_char.get());
+       // We don't cache anything here, just record that the last thing
+       // processed was a character class (or similar).
+       __last_char.reset(_BracketState::_Type::_Class);
       };
 
       if (_M_match_token(_ScannerT::_S_token_collsymbol))
@@ -476,16 +467,16 @@ namespace __detail
          if (__symbol.size() == 1)
            __push_char(__symbol[0]);
          else
-           __flush();
+           __push_class();
        }
       else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
        {
-         __flush();
+         __push_class();
          __matcher._M_add_equivalence_class(_M_value);
        }
       else if (_M_match_token(_ScannerT::_S_token_char_class_name))
        {
-         __flush();
+         __push_class();
          __matcher._M_add_character_class(_M_value, false);
        }
       else if (_M_try_char())
@@ -502,49 +493,50 @@ namespace __detail
       // It turns out that no one reads BNFs ;)
       else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
        {
-         if (!__last_char.first)
+         if (_M_match_token(_ScannerT::_S_token_bracket_end))
            {
-             if (!(_M_flags & regex_constants::ECMAScript))
-               {
-                 if (_M_match_token(_ScannerT::_S_token_bracket_end))
-                   {
-                     __push_char('-');
-                     return false;
-                   }
-                 __throw_regex_error(
-                   regex_constants::error_range,
-                   "Unexpected dash in bracket expression. For POSIX syntax, "
-                   "a dash is not treated literally only when it is at "
-                   "beginning or end.");
-               }
+             // For "-]" the dash is a literal character.
              __push_char('-');
+             return false;
            }
-         else
+         else if (__last_char._M_is_class())
+           {
+             // "\\w-" is invalid, start of range must be a single char.
+             __throw_regex_error(regex_constants::error_range,
+                   "Invalid start of range in bracket expression.");
+           }
+         else if (__last_char._M_is_char())
            {
              if (_M_try_char())
                {
-                 __matcher._M_make_range(__last_char.second, _M_value[0]);
-                 __last_char.first = false;
+                 // "x-y"
+                 __matcher._M_make_range(__last_char.get(), _M_value[0]);
+                 __last_char.reset();
                }
              else if (_M_match_token(_ScannerT::_S_token_bracket_dash))
                {
-                 __matcher._M_make_range(__last_char.second, '-');
-                 __last_char.first = false;
+                 // "x--"
+                 __matcher._M_make_range(__last_char.get(), '-');
+                 __last_char.reset();
                }
              else
-               {
-                 if (_M_scanner._M_get_token()
-                     != _ScannerT::_S_token_bracket_end)
-                   __throw_regex_error(
-                     regex_constants::error_range,
-                     "Character is expected after a dash.");
-                 __push_char('-');
-               }
+               __throw_regex_error(regex_constants::error_range,
+                     "Invalid end of range in bracket expression.");
            }
+         else if (_M_flags & regex_constants::ECMAScript)
+           {
+             // A dash that is not part of an existing range. Might be the
+             // start of a new range, or might just be a literal '-' char.
+             // Only ECMAScript allows that in the middle of a bracket expr.
+             __push_char('-');
+           }
+         else
+           __throw_regex_error(regex_constants::error_range,
+                               "Invalid dash in bracket expression.");
        }
       else if (_M_match_token(_ScannerT::_S_token_quoted_class))
        {
-         __flush();
+         __push_class();
          __matcher._M_add_character_class(_M_value,
                                           _M_ctype.is(_CtypeT::upper,
                                                       _M_value[0]));
index 7df7060..0d76e63 100644 (file)
@@ -69,6 +69,16 @@ test01()
 void
 test02()
 {
+  VERIFY(regex_match("-", regex("[-]", regex_constants::ECMAScript)));
+  VERIFY(regex_match("-", regex("[--]", regex_constants::ECMAScript)));
+  VERIFY(regex_match("-", regex("[---]", regex_constants::ECMAScript)));
+  VERIFY(regex_match("-", regex("[----]", regex_constants::ECMAScript)));
+  VERIFY(regex_match("-", regex("[-----]", regex_constants::ECMAScript)));
+
+  VERIFY(regex_match("-", regex("[-]", regex_constants::extended)));
+  VERIFY(regex_match("-", regex("[--]", regex_constants::extended)));
+  VERIFY(regex_match("-", regex("[---]", regex_constants::extended)));
+  VERIFY(regex_match("-", regex("[----]", regex_constants::extended)));
   try
   {
     std::regex re("[-----]", std::regex::extended);
@@ -78,7 +88,6 @@ test02()
   {
     VERIFY(e.code() == std::regex_constants::error_range);
   }
-  std::regex re("[-----]", std::regex::ECMAScript);
 
   VERIFY(!regex_match("b", regex("[-ac]", regex_constants::extended)));
   VERIFY(!regex_match("b", regex("[ac-]", regex_constants::extended)));
@@ -93,7 +102,27 @@ test02()
   }
   catch (const std::regex_error& e)
   {
+    VERIFY(e.code() == std::regex_constants::error_range);
+  }
+  try
+  {
+    regex("[@--]", regex_constants::extended);
+    VERIFY(false);
   }
+  catch (const std::regex_error& e)
+  {
+    VERIFY(e.code() == std::regex_constants::error_range);
+  }
+  try
+  {
+    regex("[--%]", regex_constants::extended);
+    VERIFY(false);
+  }
+  catch (const std::regex_error& e)
+  {
+    VERIFY(e.code() == std::regex_constants::error_range);
+  }
+
   VERIFY(regex_match("].", regex("[][.hyphen.]-0]*", regex_constants::extended)));
 }
 
@@ -158,6 +187,36 @@ test06()
   VERIFY(regex_match("a-", debian_cron_namespace_ok));
 }
 
+// libstdc++/102447
+void
+test07()
+{
+  VERIFY(regex_match("-", std::regex("[\\w-]", std::regex::ECMAScript)));
+  VERIFY(regex_match("a", std::regex("[\\w-]", std::regex::ECMAScript)));
+  VERIFY(regex_match("-", std::regex("[a-]", std::regex::ECMAScript)));
+  VERIFY(regex_match("a", std::regex("[a-]", std::regex::ECMAScript)));
+
+  try
+  {
+    std::regex re("[\\w-a]", std::regex::ECMAScript);
+    VERIFY(false);
+  }
+  catch (const std::regex_error& e)
+  {
+    VERIFY(e.code() == std::regex_constants::error_range);
+  }
+
+  try
+  {
+    std::regex re("[\\w--]", std::regex::ECMAScript);
+    VERIFY(false);
+  }
+  catch (const std::regex_error& e)
+  {
+    VERIFY(e.code() == std::regex_constants::error_range);
+  }
+}
+
 int
 main()
 {
@@ -167,6 +226,7 @@ main()
   test04();
   test05();
   test06();
+  test07();
 
   return 0;
 }