[Support] Fix alternation support in backreferences (PR60073)
author Nikita Popov <npopov@redhat.com>
Mon, 16 Jan 2023 14:03:35 +0000 (15:03 +0100)
committer Nikita Popov <npopov@redhat.com>
Tue, 17 Jan 2023 08:58:10 +0000 (09:58 +0100)
backref() always performs a full match on the remaining string,
and as such also needs to be matched against the whole remaining
strip. For alternations, the match was performed against just the
sub-strip for one alternative, which would of course fail to match
the whole string.

Matching against the whole remaining strip can be done by skipping
the part of the strip between OOR1 and O_CH, so that only the first
alternative in the strip is matched, and the remaining ones are
skipped. Indeed, the necessary OOR1 skipping code was already
implemented in the easy-path of backref(), so this is clearly how it
was supposed to work.

However, there were two bugs: First, under this scheme we should
be passing the stop point of the original strip, not just the
alternative sub-strip. Second, while skipping for OOR1 was
implemented, handling for O_CH was missing. This would occur when
the last alternative matches, as O_CH is preceded by an implicit
OOR1 only.

Fixes https://github.com/llvm/llvm-project/issues/60073.

llvm/lib/Support/regengine.inc
llvm/unittests/Support/RegexTest.cpp

diff --git a/llvm/lib/Support/regengine.inc b/llvm/lib/Support/regengine.inc
index 3b7014a..b32392a 100644 (file)
@@ -590,6 +590,7 @@ backref(struct match *m, const char *start, const char *stop, sopno startst,
                                return(NULL);
                        break;
                case O_QUEST:
+               case O_CH:
                        break;
                case OOR1:      /* matches null but needs to skip */
                        ss++;
@@ -662,7 +663,7 @@ backref(struct match *m, const char *start, const char *stop, sopno startst,
                esub = ss + OPND(s) - 1;
                assert(OP(m->g->strip[esub]) == OOR1);
                for (;;) {      /* find first matching branch */
-                       dp = backref(m, sp, stop, ssub, esub, lev, rec);
+                       dp = backref(m, sp, stop, ssub, stopst, lev, rec);
                        if (dp != NULL)
                                return(dp);
                        /* that one missed, try next one */
diff --git a/llvm/unittests/Support/RegexTest.cpp b/llvm/unittests/Support/RegexTest.cpp
index eb8160e..78f37cd 100644 (file)
@@ -80,6 +80,24 @@ TEST_F(RegexTest, Backreferences) {
   EXPECT_EQ("z", Matches[2].str());
   EXPECT_FALSE(r3.match("a6zb6y"));
   EXPECT_FALSE(r3.match("a6zb7z"));
+
+  Regex r4("(abc|xyz|uvw)_\\1");
+  EXPECT_TRUE(r4.match("abc_abc", &Matches));
+  EXPECT_EQ(2u, Matches.size());
+  EXPECT_FALSE(r4.match("abc_ab", &Matches));
+  EXPECT_FALSE(r4.match("abc_xyz", &Matches));
+
+  Regex r5("(xyz|abc|uvw)_\\1");
+  EXPECT_TRUE(r5.match("abc_abc", &Matches));
+  EXPECT_EQ(2u, Matches.size());
+  EXPECT_FALSE(r5.match("abc_ab", &Matches));
+  EXPECT_FALSE(r5.match("abc_xyz", &Matches));
+
+  Regex r6("(xyz|uvw|abc)_\\1");
+  EXPECT_TRUE(r6.match("abc_abc", &Matches));
+  EXPECT_EQ(2u, Matches.size());
+  EXPECT_FALSE(r6.match("abc_ab", &Matches));
+  EXPECT_FALSE(r6.match("abc_xyz", &Matches));
 }
 
 TEST_F(RegexTest, Substitution) {