lib/regexprops.c

   1 /* regexprops.c -- document the properties of the regular expressions
   2    understood by gnulib.
   3
   4    Copyright 2005, 2007, 2010 Free Software Foundation, Inc.
   5
   6    This program is free software: you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation, either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20
  21 /*
  22   The output of this program is included in the GNU findutils source
  23   distribution.  The copying conditions for that file are generated
  24   by the copying() function below.
  25 */
  26
  27 /* Written by James Youngman, <jay@gnu.org>. */
  28
  29 #include <config.h>
  30
  31
  32 #include <stdio.h>
  33 #include <string.h>
  34 #include <unistd.h>
  35 #include <errno.h>
  36
  37 #include "regex.h"
  38 #include "regextype.h"
  39 #include "progname.h"
  40
  41
  42 static void
  43 output (const char *s, int escape)
  44 {
  45   (void) escape;
  46
  47   fputs (s, stdout);
  48 }
  49
  50
  51 static void
  52 newline (void)
  53 {
  54   output ("\n", 0);
  55 }
  56
  57 static void
  58 content (const char *s)
  59 {
  60   output (s, 1);
  61 }
  62
  63 static void
  64 literal (const char *s)
  65 {
  66   output (s, 0);
  67 }
  68
  69 static void
  70 directive (const char *s)
  71 {
  72   output (s, 0);
  73 }
  74
  75 static void
  76 comment (const char *s)
  77 {
  78   directive ("@c ");
  79   literal (s);
  80   newline ();
  81 }
  82
  83 static void
  84 enum_item (const char *s)
  85 {
  86   newline ();
  87   directive ("@item ");
  88   literal (s);
  89   newline ();
  90 }
  91
  92 static void
  93 begin_subsection (const char *name,
  94                   const char *next,
  95                   const char *prev,
  96                   const char *up)
  97 {
  98   (void) next;
  99   (void) prev;
 100   (void) up;
 101
 102   newline ();
 103
 104   directive ("@node ");
 105   content (name);
 106   content (" regular expression syntax");
 107   newline ();
 108
 109   directive ("@subsection ");
 110   output ("@samp{", 0);
 111   content (name);
 112   output ("}", 0);
 113   content (" regular expression syntax");
 114   newline ();
 115 }
 116
 117 static void
 118 begintable_markup (char const *markup)
 119 {
 120   newline ();
 121   directive ("@table ");
 122   literal (markup);
 123   newline ();
 124 }
 125
 126 static void
 127 endtable ()
 128 {
 129   newline ();
 130   directive ("@end table");
 131   newline ();
 132 }
 133
 134 static void
 135 beginenum ()
 136 {
 137   newline ();
 138   directive ("@enumerate");
 139   newline ();
 140 }
 141
 142 static void
 143 endenum ()
 144 {
 145   newline ();
 146   directive ("@end enumerate");
 147   newline ();
 148 }
 149
 150 static void
 151 newpara ()
 152 {
 153   content ("\n\n");
 154 }
 155
 156
 157 static void
 158 describe_regex_syntax (int options)
 159 {
 160   newpara ();
 161   content ("The character @samp{.} matches any single character");
 162   if ( (options & RE_DOT_NEWLINE)  == 0 )
 163     {
 164       content (" except newline");
 165     }
 166   if (options & RE_DOT_NOT_NULL)
 167     {
 168       if ( (options & RE_DOT_NEWLINE)  == 0 )
 169         content (" and");
 170       else
 171         content (" except");
 172
 173       content (" the null character");
 174     }
 175   content (".  ");
 176   newpara ();
 177
 178   if (!(options & RE_LIMITED_OPS))
 179     {
 180       begintable_markup ("@samp");
 181       if (options & RE_BK_PLUS_QM)
 182         {
 183           enum_item ("\\+");
 184           content ("indicates that the regular expression should match one"
 185                    " or more occurrences of the previous atom or regexp.  ");
 186           enum_item ("\\?");
 187           content ("indicates that the regular expression should match zero"
 188                    " or one occurrence of the previous atom or regexp.  ");
 189           enum_item ("+ and ? ");
 190           content ("match themselves.  ");
 191         }
 192       else
 193         {
 194           enum_item ("+");
 195           content ("indicates that the regular expression should match one"
 196                    " or more occurrences of the previous atom or regexp.  ");
 197           enum_item ("?");
 198           content ("indicates that the regular expression should match zero"
 199                    " or one occurrence of the previous atom or regexp.  ");
 200           enum_item ("\\+");
 201           literal ("matches a @samp{+}");
 202           enum_item ("\\?");
 203           literal ("matches a @samp{?}.  ");
 204         }
 205       endtable ();
 206     }
 207
 208   newpara ();
 209
 210   content ("Bracket expressions are used to match ranges of characters.  ");
 211   literal ("Bracket expressions where the range is backward, for example @samp{[z-a]}, are ");
 212   if (options & RE_NO_EMPTY_RANGES)
 213     content ("invalid");
 214   else
 215     content ("ignored");
 216   content (".  ");
 217
 218   if (options &  RE_BACKSLASH_ESCAPE_IN_LISTS)
 219     literal ("Within square brackets, @samp{\\} can be used to quote "
 220              "the following character.  ");
 221   else
 222     literal ("Within square brackets, @samp{\\} is taken literally.  ");
 223
 224   if (options & RE_CHAR_CLASSES)
 225     content ("Character classes are supported; for example "
 226              "@samp{[[:digit:]]} will match a single decimal digit.  ");
 227   else
 228     literal ("Character classes are not supported, so for example "
 229              "you would need to use @samp{[0-9]} "
 230              "instead of @samp{[[:digit:]]}.  ");
 231
 232   if (options & RE_HAT_LISTS_NOT_NEWLINE)
 233     {
 234       literal ("Non-matching lists @samp{[^@dots{}]} do not ever match newline.  ");
 235     }
 236   newpara ();
 237   if (options & RE_NO_GNU_OPS)
 238     {
 239       content ("GNU extensions are not supported and so "
 240                "@samp{\\w}, @samp{\\W}, @samp{\\<}, @samp{\\>}, @samp{\\b}, @samp{\\B}, @samp{\\`}, and @samp{\\'} "
 241                "match "
 242                "@samp{w}, @samp{W}, @samp{<}, @samp{>}, @samp{b}, @samp{B}, @samp{`}, and @samp{'} respectively.  ");
 243     }
 244   else
 245     {
 246       content ("GNU extensions are supported:");
 247       beginenum ();
 248       enum_item ("@samp{\\w} matches a character within a word");
 249       enum_item ("@samp{\\W} matches a character which is not within a word");
 250       enum_item ("@samp{\\<} matches the beginning of a word");
 251       enum_item ("@samp{\\>} matches the end of a word");
 252       enum_item ("@samp{\\b} matches a word boundary");
 253       enum_item ("@samp{\\B} matches characters which are not a word boundary");
 254       enum_item ("@samp{\\`} matches the beginning of the whole input");
 255       enum_item ("@samp{\\'} matches the end of the whole input");
 256       endenum ();
 257     }
 258
 259   newpara ();
 260
 261
 262   if (options & RE_NO_BK_PARENS)
 263     {
 264       literal ("Grouping is performed with parentheses @samp{()}.  ");
 265
 266       if (options & RE_UNMATCHED_RIGHT_PAREN_ORD)
 267         literal ("An unmatched @samp{)} matches just itself.  ");
 268     }
 269   else
 270     {
 271       literal ("Grouping is performed with backslashes followed by parentheses @samp{\\(}, @samp{\\)}.  ");
 272     }
 273
 274   if (options & RE_NO_BK_REFS)
 275     {
 276       content ("A backslash followed by a digit matches that digit.  ");
 277     }
 278   else
 279     {
 280       literal ("A backslash followed by a digit acts as a back-reference and matches the same thing as the previous grouped expression indicated by that number.  For example @samp{\\2} matches the second group expression.  The order of group expressions is determined by the position of their opening parenthesis ");
 281       if (options & RE_NO_BK_PARENS)
 282         literal ("@samp{(}");
 283       else
 284         literal ("@samp{\\(}");
 285       content (".  ");
 286     }
 287
 288
 289   newpara ();
 290   if (!(options & RE_LIMITED_OPS))
 291     {
 292       if (options & RE_NO_BK_VBAR)
 293         literal ("The alternation operator is @samp{|}.  ");
 294       else
 295         literal ("The alternation operator is @samp{\\|}. ");
 296     }
 297   newpara ();
 298
 299   if (options & RE_CONTEXT_INDEP_ANCHORS)
 300     {
 301       literal ("The characters @samp{^} and @samp{$} always represent the beginning and end of a string respectively, except within square brackets.  Within brackets, @samp{^} can be used to invert the membership of the character class being specified.  ");
 302     }
 303   else
 304     {
 305       literal ("The character @samp{^} only represents the beginning of a string when it appears:");
 306       beginenum ();
 307       enum_item ("\nAt the beginning of a regular expression");
 308       enum_item ("After an open-group, signified by ");
 309       if (options & RE_NO_BK_PARENS)
 310         {
 311           literal ("@samp{(}");
 312         }
 313       else
 314         {
 315           literal ("@samp{\\(}");
 316         }
 317       newline ();
 318       if (!(options & RE_LIMITED_OPS))
 319         {
 320           if (options & RE_NEWLINE_ALT)
 321             enum_item ("After a newline");
 322
 323           if (options & RE_NO_BK_VBAR )
 324             enum_item ("After the alternation operator @samp{|}");
 325           else
 326             enum_item ("After the alternation operator @samp{\\|}");
 327         }
 328       endenum ();
 329
 330       newpara ();
 331       literal ("The character @samp{$} only represents the end of a string when it appears:");
 332       beginenum ();
 333       enum_item ("At the end of a regular expression");
 334       enum_item ("Before a close-group, signified by ");
 335       if (options & RE_NO_BK_PARENS)
 336         {
 337           literal ("@samp{)}");
 338         }
 339       else
 340         {
 341           literal ("@samp{\\)}");
 342         }
 343       if (!(options & RE_LIMITED_OPS))
 344         {
 345           if (options & RE_NEWLINE_ALT)
 346             enum_item ("Before a newline");
 347
 348           if (options & RE_NO_BK_VBAR)
 349             enum_item ("Before the alternation operator @samp{|}");
 350           else
 351             enum_item ("Before the alternation operator @samp{\\|}");
 352         }
 353       endenum ();
 354     }
 355   newpara ();
 356   if (!(options & RE_LIMITED_OPS) )
 357     {
 358       if ((options & RE_CONTEXT_INDEP_OPS)
 359           && !(options & RE_CONTEXT_INVALID_OPS))
 360         {
 361           literal ("The characters @samp{*}, @samp{+} and @samp{?} are special anywhere in a regular expression.  ");
 362         }
 363       else
 364         {
 365           if (options & RE_BK_PLUS_QM)
 366             literal ("@samp{\\*}, @samp{\\+} and @samp{\\?} ");
 367           else
 368             literal ("@samp{*}, @samp{+} and @samp{?} ");
 369
 370           if (options & RE_CONTEXT_INVALID_OPS)
 371             {
 372               content ("are special at any point in a regular expression except the following places, where they are not allowed:");
 373             }
 374           else
 375             {
 376               content ("are special at any point in a regular expression except:");
 377             }
 378
 379           beginenum ();
 380           enum_item ("At the beginning of a regular expression");
 381           enum_item ("After an open-group, signified by ");
 382           if (options & RE_NO_BK_PARENS)
 383             {
 384               literal ("@samp{(}");
 385             }
 386           else
 387             {
 388               literal ("@samp{\\(}");
 389             }
 390           if (!(options & RE_LIMITED_OPS))
 391             {
 392               if (options & RE_NEWLINE_ALT)
 393                 enum_item ("After a newline");
 394
 395               if (options & RE_NO_BK_VBAR)
 396                 enum_item ("After the alternation operator @samp{|}");
 397               else
 398                 enum_item ("After the alternation operator @samp{\\|}");
 399             }
 400           endenum ();
 401         }
 402     }
 403
 404
 405   newpara ();
 406   if (options & RE_INTERVALS)
 407     {
 408       if (options & RE_NO_BK_BRACES)
 409         {
 410           literal ("Intervals are specified by @samp{@{} and @samp{@}}.  ");
 411           if (options & RE_INVALID_INTERVAL_ORD)
 412             {
 413               literal ("Invalid intervals are treated as literals, for example @samp{a@{1} is treated as @samp{a\\@{1}");
 414             }
 415           else
 416             {
 417               literal ("Invalid intervals such as @samp{a@{1z} are not accepted.  ");
 418             }
 419         }
 420       else
 421         {
 422           literal ("Intervals are specified by @samp{\\@{} and @samp{\\@}}.  ");
 423           if (options & RE_INVALID_INTERVAL_ORD)
 424             {
 425               literal ("Invalid intervals are treated as literals, for example @samp{a\\@{1} is treated as @samp{a@{1}");
 426             }
 427           else
 428             {
 429               literal ("Invalid intervals such as @samp{a\\@{1z} are not accepted.  ");
 430             }
 431         }
 432
 433     }
 434
 435   newpara ();
 436   if (options & RE_NO_POSIX_BACKTRACKING)
 437     {
 438       content ("Matching succeeds as soon as the whole pattern is matched, meaning that the result may not be the longest possible match.  ");
 439     }
 440   else
 441     {
 442       content ("The longest possible match is returned; this applies to the regular expression as a whole and (subject to this constraint) to subexpressions within groups.  ");
 443     }
 444   newpara ();
 445 }
 446
 447
 448 static void
 449 copying (void)
 450 {
 451   static const char *copy_para[]=
 452     {
 453       "Copyright (C) 1994, 1996, 1998, 2000, 2001, 2003, 2004, 2005, 2006,"
 454       ,"2007, 2009, 2010 Free Software Foundation, Inc."
 455       ,""
 456       ,"Permission is granted to copy, distribute and/or modify this document"
 457       ,"under the terms of the GNU Free Documentation License, Version 1.3 or"
 458       ,"any later version published by the Free Software Foundation; with no"
 459       ,"Invariant Sections, with no Front-Cover Texts, and with no Back-Cover"
 460       ,"Texts.  A copy of the license is included in the ``GNU Free"
 461       ,"Documentation License'' file as part of this distribution."
 462       ""
 463       ,NULL
 464     };
 465   const char **s = copy_para;
 466   while (*s)
 467     comment (*s++);
 468 }
 469
 470 static int
 471 ignore (int ix, const unsigned int context)
 472 {
 473   return 0 == (get_regex_type_context (ix) & context);
 474 }
 475
 476 static void
 477 menu (unsigned int context)
 478 {
 479   int i, options;
 480   const char *name;
 481
 482   output ("@menu\n", 0);
 483   for (i=0;
 484        options = get_regex_type_flags (i),
 485          name=get_regex_type_name (i);
 486        ++i)
 487     {
 488       if (!ignore (i, context))
 489         {
 490           output ("* ", 0);
 491           output (name, 0);
 492           content (" regular expression syntax");
 493           output ("::", 0);
 494           newline ();
 495         }
 496     }
 497   output ("@end menu\n", 0);
 498 }
 499
 500
 501
 502 static const char *
 503 get_next (unsigned int ix, unsigned int context)
 504 {
 505   const char *next;
 506   while (get_regex_type_name (ix))
 507     {
 508       if (!ignore (ix, context))
 509         {
 510           next = get_regex_type_name (ix);
 511           if (NULL == next)
 512             return "";
 513           else
 514             return next;
 515         }
 516       ++ix;
 517     }
 518   return "";
 519 }
 520
 521
 522 static void
 523 describe_all (const char *contextname,
 524               unsigned int context,
 525               const char *up)
 526 {
 527   const char *name, *next, *previous;
 528   int options;
 529   int i, parent;
 530
 531   copying ();
 532   newline ();
 533   literal ("@c this regular expression description is for: ");
 534   literal (contextname);
 535   newline ();
 536   newline ();
 537   menu (context);
 538
 539   previous = "";
 540
 541   for (i=0;
 542        options = get_regex_type_flags (i),
 543          name=get_regex_type_name (i);
 544        ++i)
 545     {
 546       if (ignore (i, context))
 547         {
 548           fprintf (stderr,
 549                    "Skipping regexp type %s for context %s\n",
 550                    name, contextname);
 551           name = previous;
 552           continue;
 553         }
 554
 555       next = get_next (i+1, context);
 556       if (NULL == next)
 557         next = "";
 558       begin_subsection (name, next, previous, up);
 559       parent = get_regex_type_synonym (i);
 560       if (parent >= 0)
 561         {
 562           content ("This is a synonym for ");
 563           content (get_regex_type_name (parent));
 564           content (".");
 565         }
 566       else
 567         {
 568           describe_regex_syntax (options);
 569         }
 570       previous = name;
 571     }
 572 }
 573
 574
 575
 576 int
 577 main (int argc, char *argv[])
 578 {
 579   const char *up = "";
 580   unsigned int context = CONTEXT_ALL;
 581   const char *contextname = "all";
 582
 583   if (argc)
 584     set_program_name (argv[0]);
 585   else
 586     set_program_name ("regexprops");
 587
 588   if (argc > 1)
 589     {
 590       up = argv[1];
 591     }
 592   if (argc > 2)
 593     {
 594       contextname = argv[2];
 595       if (0 == strcmp (contextname, "findutils"))
 596         context = CONTEXT_FINDUTILS;
 597       else if (0 == strcmp (contextname, "generic"))
 598         context = CONTEXT_GENERIC;
 599       else if (0 == strcmp (contextname, "all"))
 600         context = CONTEXT_ALL;
 601       else
 602         {
 603           fprintf (stderr, "Unexpected context %s",
 604                    contextname);
 605           return 1;
 606         }
 607     }
 608
 609   describe_all (contextname, context, up);
 610   return 0;
 611 }