src/ptx.c

   1 /* Permuted index for GNU, with keywords in their context.
   2    Copyright (C) 1990-2012 Free Software Foundation, Inc.
   3    François Pinard <pinard@iro.umontreal.ca>, 1988.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation, either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  17
  18    François Pinard <pinard@iro.umontreal.ca> */
  19
  20 #include <config.h>
  21
  22 #include <getopt.h>
  23 #include <sys/types.h>
  24 #include "system.h"
  25 #include <regex.h>
  26 #include "argmatch.h"
  27 #include "diacrit.h"
  28 #include "error.h"
  29 #include "fadvise.h"
  30 #include "quote.h"
  31 #include "quotearg.h"
  32 #include "read-file.h"
  33 #include "stdio--.h"
  34 #include "xstrtol.h"
  35
  36 /* The official name of this program (e.g., no 'g' prefix).  */
  37 #define PROGRAM_NAME "ptx"
  38
  39 /* TRANSLATORS: Please translate "F. Pinard" to "François Pinard"
  40    if "ç" (c-with-cedilla) is available in the translation's character
  41    set and encoding.  */
  42 #define AUTHORS proper_name_utf8 ("F. Pinard", "Fran\xc3\xa7ois Pinard")
  43
  44 /* Number of possible characters in a byte.  */
  45 #define CHAR_SET_SIZE 256
  46
  47 #define ISODIGIT(C) ((C) >= '0' && (C) <= '7')
  48 #define HEXTOBIN(C) ((C) >= 'a' && (C) <= 'f' ? (C)-'a'+10 \
  49                      : (C) >= 'A' && (C) <= 'F' ? (C)-'A'+10 : (C)-'0')
  50 #define OCTTOBIN(C) ((C) - '0')
  51
  52 /* Debugging the memory allocator.  */
  53
  54 #if WITH_DMALLOC
  55 # define MALLOC_FUNC_CHECK 1
  56 # include <dmalloc.h>
  57 #endif
  58 \f
  59 /* Global definitions.  */
  60
  61 /* FIXME: There are many unchecked integer overflows in this file,
  62    that will cause this command to misbehave given large inputs or
  63    options.  Many of the "int" values below should be "size_t" or
  64    something else like that.  */
  65
  66 /* Program options.  */
  67
  68 enum Format
  69 {
  70   UNKNOWN_FORMAT,               /* output format still unknown */
  71   DUMB_FORMAT,                  /* output for a dumb terminal */
  72   ROFF_FORMAT,                  /* output for 'troff' or 'nroff' */
  73   TEX_FORMAT                    /* output for 'TeX' or 'LaTeX' */
  74 };
  75
  76 static bool gnu_extensions = true;      /* trigger all GNU extensions */
  77 static bool auto_reference = false;     /* refs are 'file_name:line_number:' */
  78 static bool input_reference = false;    /* refs at beginning of input lines */
  79 static bool right_reference = false;    /* output refs after right context  */
  80 static int line_width = 72;     /* output line width in characters */
  81 static int gap_size = 3;        /* number of spaces between output fields */
  82 static const char *truncation_string = "/";
  83                                 /* string used to mark line truncations */
  84 static const char *macro_name = "xx";   /* macro name for roff or TeX output */
  85 static enum Format output_format = UNKNOWN_FORMAT;
  86                                 /* output format */
  87
  88 static bool ignore_case = false;        /* fold lower to upper for sorting */
  89 static const char *break_file = NULL;   /* name of the 'Break chars' file */
  90 static const char *only_file = NULL;    /* name of the 'Only words' file */
  91 static const char *ignore_file = NULL;  /* name of the 'Ignore words' file */
  92
  93 /* Options that use regular expressions.  */
  94 struct regex_data
  95 {
  96   /* The original regular expression, as a string.  */
  97   char const *string;
  98
  99   /* The compiled regular expression, and its fastmap.  */
 100   struct re_pattern_buffer pattern;
 101   char fastmap[UCHAR_MAX + 1];
 102 };
 103
 104 static struct regex_data context_regex; /* end of context */
 105 static struct regex_data word_regex;    /* keyword */
 106
 107 /* A BLOCK delimit a region in memory of arbitrary size, like the copy of a
 108    whole file.  A WORD is something smaller, its length should fit in a
 109    short integer.  A WORD_TABLE may contain several WORDs.  */
 110
 111 typedef struct
 112   {
 113     char *start;                /* pointer to beginning of region */
 114     char *end;                  /* pointer to end + 1 of region */
 115   }
 116 BLOCK;
 117
 118 typedef struct
 119   {
 120     char *start;                /* pointer to beginning of region */
 121     short int size;             /* length of the region */
 122   }
 123 WORD;
 124
 125 typedef struct
 126   {
 127     WORD *start;                /* array of WORDs */
 128     size_t alloc;               /* allocated length */
 129     size_t length;              /* number of used entries */
 130   }
 131 WORD_TABLE;
 132
 133 /* Pattern description tables.  */
 134
 135 /* For each character, provide its folded equivalent.  */
 136 static unsigned char folded_chars[CHAR_SET_SIZE];
 137
 138 /* End of context pattern register indices.  */
 139 static struct re_registers context_regs;
 140
 141 /* Keyword pattern register indices.  */
 142 static struct re_registers word_regs;
 143
 144 /* A word characters fastmap is used only when no word regexp has been
 145    provided.  A word is then made up of a sequence of one or more characters
 146    allowed by the fastmap.  Contains !0 if character allowed in word.  Not
 147    only this is faster in most cases, but it simplifies the implementation
 148    of the Break files.  */
 149 static char word_fastmap[CHAR_SET_SIZE];
 150
 151 /* Maximum length of any word read.  */
 152 static int maximum_word_length;
 153
 154 /* Maximum width of any reference used.  */
 155 static int reference_max_width;
 156
 157 /* Ignore and Only word tables.  */
 158
 159 static WORD_TABLE ignore_table; /* table of words to ignore */
 160 static WORD_TABLE only_table;           /* table of words to select */
 161
 162 /* Source text table, and scanning macros.  */
 163
 164 static int number_input_files;  /* number of text input files */
 165 static int total_line_count;    /* total number of lines seen so far */
 166 static const char **input_file_name;    /* array of text input file names */
 167 static int *file_line_count;    /* array of 'total_line_count' values at end */
 168
 169 static BLOCK text_buffer;       /* file to study */
 170
 171 /* SKIP_NON_WHITE used only for getting or skipping the reference.  */
 172
 173 #define SKIP_NON_WHITE(cursor, limit) \
 174   while (cursor < limit && ! isspace (to_uchar (*cursor)))              \
 175     cursor++
 176
 177 #define SKIP_WHITE(cursor, limit) \
 178   while (cursor < limit && isspace (to_uchar (*cursor)))                \
 179     cursor++
 180
 181 #define SKIP_WHITE_BACKWARDS(cursor, start) \
 182   while (cursor > start && isspace (to_uchar (cursor[-1])))             \
 183     cursor--
 184
 185 #define SKIP_SOMETHING(cursor, limit) \
 186   if (word_regex.string)                                                \
 187     {                                                                   \
 188       regoff_t count;                                                   \
 189       count = re_match (&word_regex.pattern, cursor, limit - cursor, 0, NULL); \
 190       if (count == -2)                                                  \
 191         matcher_error ();                                               \
 192       cursor += count == -1 ? 1 : count;                                \
 193     }                                                                   \
 194   else if (word_fastmap[to_uchar (*cursor)])                            \
 195     while (cursor < limit && word_fastmap[to_uchar (*cursor)])          \
 196       cursor++;                                                         \
 197   else                                                                  \
 198     cursor++
 199
 200 /* Occurrences table.
 201
 202    The 'keyword' pointer provides the central word, which is surrounded
 203    by a left context and a right context.  The 'keyword' and 'length'
 204    field allow full 8-bit characters keys, even including NULs.  At other
 205    places in this program, the name 'keyafter' refers to the keyword
 206    followed by its right context.
 207
 208    The left context does not extend, towards the beginning of the file,
 209    further than a distance given by the 'left' value.  This value is
 210    relative to the keyword beginning, it is usually negative.  This
 211    insures that, except for white space, we will never have to backward
 212    scan the source text, when it is time to generate the final output
 213    lines.
 214
 215    The right context, indirectly attainable through the keyword end, does
 216    not extend, towards the end of the file, further than a distance given
 217    by the 'right' value.  This value is relative to the keyword
 218    beginning, it is usually positive.
 219
 220    When automatic references are used, the 'reference' value is the
 221    overall line number in all input files read so far, in this case, it
 222    is of type (int).  When input references are used, the 'reference'
 223    value indicates the distance between the keyword beginning and the
 224    start of the reference field, it is of type (DELTA) and usually
 225    negative.  */
 226
 227 typedef short int DELTA;        /* to hold displacement within one context */
 228
 229 typedef struct
 230   {
 231     WORD key;                   /* description of the keyword */
 232     DELTA left;                 /* distance to left context start */
 233     DELTA right;                /* distance to right context end */
 234     int reference;              /* reference descriptor */
 235   }
 236 OCCURS;
 237
 238 /* The various OCCURS tables are indexed by the language.  But the time
 239    being, there is no such multiple language support.  */
 240
 241 static OCCURS *occurs_table[1]; /* all words retained from the read text */
 242 static size_t occurs_alloc[1];  /* allocated size of occurs_table */
 243 static size_t number_of_occurs[1]; /* number of used slots in occurs_table */
 244
 245
 246 /* Communication among output routines.  */
 247
 248 /* Indicate if special output processing is requested for each character.  */
 249 static char edited_flag[CHAR_SET_SIZE];
 250
 251 static int half_line_width;     /* half of line width, reference excluded */
 252 static int before_max_width;    /* maximum width of before field */
 253 static int keyafter_max_width;  /* maximum width of keyword-and-after field */
 254 static int truncation_string_length;/* length of string that flags truncation */
 255
 256 /* When context is limited by lines, wraparound may happen on final output:
 257    the 'head' pointer gives access to some supplementary left context which
 258    will be seen at the end of the output line, the 'tail' pointer gives
 259    access to some supplementary right context which will be seen at the
 260    beginning of the output line. */
 261
 262 static BLOCK tail;              /* tail field */
 263 static int tail_truncation;     /* flag truncation after the tail field */
 264
 265 static BLOCK before;            /* before field */
 266 static int before_truncation;   /* flag truncation before the before field */
 267
 268 static BLOCK keyafter;          /* keyword-and-after field */
 269 static int keyafter_truncation; /* flag truncation after the keyafter field */
 270
 271 static BLOCK head;              /* head field */
 272 static int head_truncation;     /* flag truncation before the head field */
 273
 274 static BLOCK reference;         /* reference field for input reference mode */
 275 \f
 276 /* Miscellaneous routines.  */
 277
 278 /* Diagnose an error in the regular expression matcher.  Then exit.  */
 279
 280 static void ATTRIBUTE_NORETURN
 281 matcher_error (void)
 282 {
 283   error (0, errno, _("error in regular expression matcher"));
 284   exit (EXIT_FAILURE);
 285 }
 286
 287 /*------------------------------------------------------.
 288 | Duplicate string STRING, while evaluating \-escapes.  |
 289 `------------------------------------------------------*/
 290
 291 /* Loosely adapted from GNU sh-utils printf.c code.  */
 292
 293 static char *
 294 copy_unescaped_string (const char *string)
 295 {
 296   char *result;                 /* allocated result */
 297   char *cursor;                 /* cursor in result */
 298   int value;                    /* value of \nnn escape */
 299   int length;                   /* length of \nnn escape */
 300
 301   result = xmalloc (strlen (string) + 1);
 302   cursor = result;
 303
 304   while (*string)
 305     {
 306       if (*string == '\\')
 307         {
 308           string++;
 309           switch (*string)
 310             {
 311             case 'x':           /* \xhhh escape, 3 chars maximum */
 312               value = 0;
 313               for (length = 0, string++;
 314                    length < 3 && isxdigit (to_uchar (*string));
 315                    length++, string++)
 316                 value = value * 16 + HEXTOBIN (*string);
 317               if (length == 0)
 318                 {
 319                   *cursor++ = '\\';
 320                   *cursor++ = 'x';
 321                 }
 322               else
 323                 *cursor++ = value;
 324               break;
 325
 326             case '0':           /* \0ooo escape, 3 chars maximum */
 327               value = 0;
 328               for (length = 0, string++;
 329                    length < 3 && ISODIGIT (*string);
 330                    length++, string++)
 331                 value = value * 8 + OCTTOBIN (*string);
 332               *cursor++ = value;
 333               break;
 334
 335             case 'a':           /* alert */
 336 #if __STDC__
 337               *cursor++ = '\a';
 338 #else
 339               *cursor++ = 7;
 340 #endif
 341               string++;
 342               break;
 343
 344             case 'b':           /* backspace */
 345               *cursor++ = '\b';
 346               string++;
 347               break;
 348
 349             case 'c':           /* cancel the rest of the output */
 350               while (*string)
 351                 string++;
 352               break;
 353
 354             case 'f':           /* form feed */
 355               *cursor++ = '\f';
 356               string++;
 357               break;
 358
 359             case 'n':           /* new line */
 360               *cursor++ = '\n';
 361               string++;
 362               break;
 363
 364             case 'r':           /* carriage return */
 365               *cursor++ = '\r';
 366               string++;
 367               break;
 368
 369             case 't':           /* horizontal tab */
 370               *cursor++ = '\t';
 371               string++;
 372               break;
 373
 374             case 'v':           /* vertical tab */
 375 #if __STDC__
 376               *cursor++ = '\v';
 377 #else
 378               *cursor++ = 11;
 379 #endif
 380               string++;
 381               break;
 382
 383             case '\0':          /* lone backslash at end of string */
 384               /* ignore it */
 385               break;
 386
 387             default:
 388               *cursor++ = '\\';
 389               *cursor++ = *string++;
 390               break;
 391             }
 392         }
 393       else
 394         *cursor++ = *string++;
 395     }
 396
 397   *cursor = '\0';
 398   return result;
 399 }
 400
 401 /*--------------------------------------------------------------------------.
 402 | Compile the regex represented by REGEX, diagnose and abort if any error.  |
 403 `--------------------------------------------------------------------------*/
 404
 405 static void
 406 compile_regex (struct regex_data *regex)
 407 {
 408   struct re_pattern_buffer *pattern = &regex->pattern;
 409   char const *string = regex->string;
 410   char const *message;
 411
 412   pattern->buffer = NULL;
 413   pattern->allocated = 0;
 414   pattern->fastmap = regex->fastmap;
 415   pattern->translate = ignore_case ? folded_chars : NULL;
 416
 417   message = re_compile_pattern (string, strlen (string), pattern);
 418   if (message)
 419     error (EXIT_FAILURE, 0, _("%s (for regexp %s)"), message, quote (string));
 420
 421   /* The fastmap should be compiled before 're_match'.  The following
 422      call is not mandatory, because 're_search' is always called sooner,
 423      and it compiles the fastmap if this has not been done yet.  */
 424
 425   re_compile_fastmap (pattern);
 426 }
 427
 428 /*------------------------------------------------------------------------.
 429 | This will initialize various tables for pattern match and compiles some |
 430 | regexps.                                                                |
 431 `------------------------------------------------------------------------*/
 432
 433 static void
 434 initialize_regex (void)
 435 {
 436   int character;                /* character value */
 437
 438   /* Initialize the case folding table.  */
 439
 440   if (ignore_case)
 441     for (character = 0; character < CHAR_SET_SIZE; character++)
 442       folded_chars[character] = toupper (character);
 443
 444   /* Unless the user already provided a description of the end of line or
 445      end of sentence sequence, select an end of line sequence to compile.
 446      If the user provided an empty definition, thus disabling end of line
 447      or sentence feature, make it NULL to speed up tests.  If GNU
 448      extensions are enabled, use end of sentence like in GNU emacs.  If
 449      disabled, use end of lines.  */
 450
 451   if (context_regex.string)
 452     {
 453       if (!*context_regex.string)
 454         context_regex.string = NULL;
 455     }
 456   else if (gnu_extensions && !input_reference)
 457     context_regex.string = "[.?!][]\"')}]*\\($\\|\t\\|  \\)[ \t\n]*";
 458   else
 459     context_regex.string = "\n";
 460
 461   if (context_regex.string)
 462     compile_regex (&context_regex);
 463
 464   /* If the user has already provided a non-empty regexp to describe
 465      words, compile it.  Else, unless this has already been done through
 466      a user provided Break character file, construct a fastmap of
 467      characters that may appear in a word.  If GNU extensions enabled,
 468      include only letters of the underlying character set.  If disabled,
 469      include almost everything, even punctuations; stop only on white
 470      space.  */
 471
 472   if (word_regex.string)
 473     compile_regex (&word_regex);
 474   else if (!break_file)
 475     {
 476       if (gnu_extensions)
 477         {
 478
 479           /* Simulate \w+.  */
 480
 481           for (character = 0; character < CHAR_SET_SIZE; character++)
 482             word_fastmap[character] = !! isalpha (character);
 483         }
 484       else
 485         {
 486
 487           /* Simulate [^ \t\n]+.  */
 488
 489           memset (word_fastmap, 1, CHAR_SET_SIZE);
 490           word_fastmap[' '] = 0;
 491           word_fastmap['\t'] = 0;
 492           word_fastmap['\n'] = 0;
 493         }
 494     }
 495 }
 496
 497 /*------------------------------------------------------------------------.
 498 | This routine will attempt to swallow a whole file name FILE_NAME into a |
 499 | contiguous region of memory and return a description of it into BLOCK.  |
 500 | Standard input is assumed whenever FILE_NAME is NULL, empty or "-".     |
 501 |                                                                         |
 502 | Previously, in some cases, white space compression was attempted while  |
 503 | inputting text.  This was defeating some regexps like default end of    |
 504 | sentence, which checks for two consecutive spaces.  If white space      |
 505 | compression is ever reinstated, it should be in output routines.        |
 506 `------------------------------------------------------------------------*/
 507
 508 static void
 509 swallow_file_in_memory (const char *file_name, BLOCK *block)
 510 {
 511   size_t used_length;           /* used length in memory buffer */
 512
 513   /* As special cases, a file name which is NULL or "-" indicates standard
 514      input, which is already opened.  In all other cases, open the file from
 515      its name.  */
 516   bool using_stdin = !file_name || !*file_name || STREQ (file_name, "-");
 517   if (using_stdin)
 518     block->start = fread_file (stdin, &used_length);
 519   else
 520     block->start = read_file (file_name, &used_length);
 521
 522   if (!block->start)
 523     error (EXIT_FAILURE, errno, "%s", quote (using_stdin ? "-" : file_name));
 524
 525   block->end = block->start + used_length;
 526 }
 527 \f
 528 /* Sort and search routines.  */
 529
 530 /*--------------------------------------------------------------------------.
 531 | Compare two words, FIRST and SECOND, and return 0 if they are identical.  |
 532 | Return less than 0 if the first word goes before the second; return       |
 533 | greater than 0 if the first word goes after the second.                   |
 534 |                                                                           |
 535 | If a word is indeed a prefix of the other, the shorter should go first.   |
 536 `--------------------------------------------------------------------------*/
 537
 538 static int
 539 compare_words (const void *void_first, const void *void_second)
 540 {
 541 #define first ((const WORD *) void_first)
 542 #define second ((const WORD *) void_second)
 543   int length;                   /* minimum of two lengths */
 544   int counter;                  /* cursor in words */
 545   int value;                    /* value of comparison */
 546
 547   length = first->size < second->size ? first->size : second->size;
 548
 549   if (ignore_case)
 550     {
 551       for (counter = 0; counter < length; counter++)
 552         {
 553           value = (folded_chars [to_uchar (first->start[counter])]
 554                    - folded_chars [to_uchar (second->start[counter])]);
 555           if (value != 0)
 556             return value;
 557         }
 558     }
 559   else
 560     {
 561       for (counter = 0; counter < length; counter++)
 562         {
 563           value = (to_uchar (first->start[counter])
 564                    - to_uchar (second->start[counter]));
 565           if (value != 0)
 566             return value;
 567         }
 568     }
 569
 570   return first->size - second->size;
 571 #undef first
 572 #undef second
 573 }
 574
 575 /*-----------------------------------------------------------------------.
 576 | Decides which of two OCCURS, FIRST or SECOND, should lexicographically |
 577 | go first.  In case of a tie, preserve the original order through a     |
 578 | pointer comparison.                                                    |
 579 `-----------------------------------------------------------------------*/
 580
 581 static int
 582 compare_occurs (const void *void_first, const void *void_second)
 583 {
 584 #define first ((const OCCURS *) void_first)
 585 #define second ((const OCCURS *) void_second)
 586   int value;
 587
 588   value = compare_words (&first->key, &second->key);
 589   return value == 0 ? first->key.start - second->key.start : value;
 590 #undef first
 591 #undef second
 592 }
 593
 594 /*------------------------------------------------------------.
 595 | Return !0 if WORD appears in TABLE.  Uses a binary search.  |
 596 `------------------------------------------------------------*/
 597
 598 static int _GL_ATTRIBUTE_PURE
 599 search_table (WORD *word, WORD_TABLE *table)
 600 {
 601   int lowest;                   /* current lowest possible index */
 602   int highest;                  /* current highest possible index */
 603   int middle;                   /* current middle index */
 604   int value;                    /* value from last comparison */
 605
 606   lowest = 0;
 607   highest = table->length - 1;
 608   while (lowest <= highest)
 609     {
 610       middle = (lowest + highest) / 2;
 611       value = compare_words (word, table->start + middle);
 612       if (value < 0)
 613         highest = middle - 1;
 614       else if (value > 0)
 615         lowest = middle + 1;
 616       else
 617         return 1;
 618     }
 619   return 0;
 620 }
 621
 622 /*---------------------------------------------------------------------.
 623 | Sort the whole occurs table in memory.  Presumably, 'qsort' does not |
 624 | take intermediate copies or table elements, so the sort will be      |
 625 | stabilized throughout the comparison routine.                        |
 626 `---------------------------------------------------------------------*/
 627
 628 static void
 629 sort_found_occurs (void)
 630 {
 631
 632   /* Only one language for the time being.  */
 633
 634   qsort (occurs_table[0], number_of_occurs[0], sizeof **occurs_table,
 635          compare_occurs);
 636 }
 637 \f
 638 /* Parameter files reading routines.  */
 639
 640 /*----------------------------------------------------------------------.
 641 | Read a file named FILE_NAME, containing a set of break characters.    |
 642 | Build a content to the array word_fastmap in which all characters are |
 643 | allowed except those found in the file.  Characters may be repeated.  |
 644 `----------------------------------------------------------------------*/
 645
 646 static void
 647 digest_break_file (const char *file_name)
 648 {
 649   BLOCK file_contents;          /* to receive a copy of the file */
 650   char *cursor;                 /* cursor in file copy */
 651
 652   swallow_file_in_memory (file_name, &file_contents);
 653
 654   /* Make the fastmap and record the file contents in it.  */
 655
 656   memset (word_fastmap, 1, CHAR_SET_SIZE);
 657   for (cursor = file_contents.start; cursor < file_contents.end; cursor++)
 658     word_fastmap[to_uchar (*cursor)] = 0;
 659
 660   if (!gnu_extensions)
 661     {
 662
 663       /* If GNU extensions are enabled, the only way to avoid newline as
 664          a break character is to write all the break characters in the
 665          file with no newline at all, not even at the end of the file.
 666          If disabled, spaces, tabs and newlines are always considered as
 667          break characters even if not included in the break file.  */
 668
 669       word_fastmap[' '] = 0;
 670       word_fastmap['\t'] = 0;
 671       word_fastmap['\n'] = 0;
 672     }
 673
 674   /* Return the space of the file, which is no more required.  */
 675
 676   free (file_contents.start);
 677 }
 678
 679 /*-----------------------------------------------------------------------.
 680 | Read a file named FILE_NAME, containing one word per line, then        |
 681 | construct in TABLE a table of WORD descriptors for them.  The routine  |
 682 | swallows the whole file in memory; this is at the expense of space     |
 683 | needed for newlines, which are useless; however, the reading is fast.  |
 684 `-----------------------------------------------------------------------*/
 685
 686 static void
 687 digest_word_file (const char *file_name, WORD_TABLE *table)
 688 {
 689   BLOCK file_contents;          /* to receive a copy of the file */
 690   char *cursor;                 /* cursor in file copy */
 691   char *word_start;             /* start of the current word */
 692
 693   swallow_file_in_memory (file_name, &file_contents);
 694
 695   table->start = NULL;
 696   table->alloc = 0;
 697   table->length = 0;
 698
 699   /* Read the whole file.  */
 700
 701   cursor = file_contents.start;
 702   while (cursor < file_contents.end)
 703     {
 704
 705       /* Read one line, and save the word in contains.  */
 706
 707       word_start = cursor;
 708       while (cursor < file_contents.end && *cursor != '\n')
 709         cursor++;
 710
 711       /* Record the word in table if it is not empty.  */
 712
 713       if (cursor > word_start)
 714         {
 715           if (table->length == table->alloc)
 716             {
 717               if ((SIZE_MAX / sizeof *table->start - 1) / 2 < table->alloc)
 718                 xalloc_die ();
 719               table->alloc = table->alloc * 2 + 1;
 720               table->start = xrealloc (table->start,
 721                                        table->alloc * sizeof *table->start);
 722             }
 723
 724           table->start[table->length].start = word_start;
 725           table->start[table->length].size = cursor - word_start;
 726           table->length++;
 727         }
 728
 729       /* This test allows for an incomplete line at end of file.  */
 730
 731       if (cursor < file_contents.end)
 732         cursor++;
 733     }
 734
 735   /* Finally, sort all the words read.  */
 736
 737   qsort (table->start, table->length, sizeof table->start[0], compare_words);
 738 }
 739 \f
 740 /* Keyword recognition and selection.  */
 741
 742 /*----------------------------------------------------------------------.
 743 | For each keyword in the source text, constructs an OCCURS structure.  |
 744 `----------------------------------------------------------------------*/
 745
 746 static void
 747 find_occurs_in_text (void)
 748 {
 749   char *cursor;                 /* for scanning the source text */
 750   char *scan;                   /* for scanning the source text also */
 751   char *line_start;             /* start of the current input line */
 752   char *line_scan;              /* newlines scanned until this point */
 753   int reference_length;         /* length of reference in input mode */
 754   WORD possible_key;            /* possible key, to ease searches */
 755   OCCURS *occurs_cursor;        /* current OCCURS under construction */
 756
 757   char *context_start;          /* start of left context */
 758   char *context_end;            /* end of right context */
 759   char *word_start;             /* start of word */
 760   char *word_end;               /* end of word */
 761   char *next_context_start;     /* next start of left context */
 762
 763   /* reference_length is always used within 'if (input_reference)'.
 764      However, GNU C diagnoses that it may be used uninitialized.  The
 765      following assignment is merely to shut it up.  */
 766
 767   reference_length = 0;
 768
 769   /* Tracking where lines start is helpful for reference processing.  In
 770      auto reference mode, this allows counting lines.  In input reference
 771      mode, this permits finding the beginning of the references.
 772
 773      The first line begins with the file, skip immediately this very first
 774      reference in input reference mode, to help further rejection any word
 775      found inside it.  Also, unconditionally assigning these variable has
 776      the happy effect of shutting up lint.  */
 777
 778   line_start = text_buffer.start;
 779   line_scan = line_start;
 780   if (input_reference)
 781     {
 782       SKIP_NON_WHITE (line_scan, text_buffer.end);
 783       reference_length = line_scan - line_start;
 784       SKIP_WHITE (line_scan, text_buffer.end);
 785     }
 786
 787   /* Process the whole buffer, one line or one sentence at a time.  */
 788
 789   for (cursor = text_buffer.start;
 790        cursor < text_buffer.end;
 791        cursor = next_context_start)
 792     {
 793
 794       /* 'context_start' gets initialized before the processing of each
 795          line, or once for the whole buffer if no end of line or sentence
 796          sequence separator.  */
 797
 798       context_start = cursor;
 799
 800       /* If an end of line or end of sentence sequence is defined and
 801          non-empty, 'next_context_start' will be recomputed to be the end of
 802          each line or sentence, before each one is processed.  If no such
 803          sequence, then 'next_context_start' is set at the end of the whole
 804          buffer, which is then considered to be a single line or sentence.
 805          This test also accounts for the case of an incomplete line or
 806          sentence at the end of the buffer.  */
 807
 808       next_context_start = text_buffer.end;
 809       if (context_regex.string)
 810         switch (re_search (&context_regex.pattern, cursor,
 811                            text_buffer.end - cursor,
 812                            0, text_buffer.end - cursor, &context_regs))
 813           {
 814           case -2:
 815             matcher_error ();
 816
 817           case -1:
 818             break;
 819
 820           default:
 821             next_context_start = cursor + context_regs.end[0];
 822             break;
 823           }
 824
 825       /* Include the separator into the right context, but not any suffix
 826          white space in this separator; this insures it will be seen in
 827          output and will not take more space than necessary.  */
 828
 829       context_end = next_context_start;
 830       SKIP_WHITE_BACKWARDS (context_end, context_start);
 831
 832       /* Read and process a single input line or sentence, one word at a
 833          time.  */
 834
 835       while (1)
 836         {
 837           if (word_regex.string)
 838
 839             /* If a word regexp has been compiled, use it to skip at the
 840                beginning of the next word.  If there is no such word, exit
 841                the loop.  */
 842
 843             {
 844               regoff_t r = re_search (&word_regex.pattern, cursor,
 845                                       context_end - cursor,
 846                                       0, context_end - cursor, &word_regs);
 847               if (r == -2)
 848                 matcher_error ();
 849               if (r == -1)
 850                 break;
 851               word_start = cursor + word_regs.start[0];
 852               word_end = cursor + word_regs.end[0];
 853             }
 854           else
 855
 856             /* Avoid re_search and use the fastmap to skip to the
 857                beginning of the next word.  If there is no more word in
 858                the buffer, exit the loop.  */
 859
 860             {
 861               scan = cursor;
 862               while (scan < context_end
 863                      && !word_fastmap[to_uchar (*scan)])
 864                 scan++;
 865
 866               if (scan == context_end)
 867                 break;
 868
 869               word_start = scan;
 870
 871               while (scan < context_end
 872                      && word_fastmap[to_uchar (*scan)])
 873                 scan++;
 874
 875               word_end = scan;
 876             }
 877
 878           /* Skip right to the beginning of the found word.  */
 879
 880           cursor = word_start;
 881
 882           /* Skip any zero length word.  Just advance a single position,
 883              then go fetch the next word.  */
 884
 885           if (word_end == word_start)
 886             {
 887               cursor++;
 888               continue;
 889             }
 890
 891           /* This is a genuine, non empty word, so save it as a possible
 892              key.  Then skip over it.  Also, maintain the maximum length of
 893              all words read so far.  It is mandatory to take the maximum
 894              length of all words in the file, without considering if they
 895              are actually kept or rejected, because backward jumps at output
 896              generation time may fall in *any* word.  */
 897
 898           possible_key.start = cursor;
 899           possible_key.size = word_end - word_start;
 900           cursor += possible_key.size;
 901
 902           if (possible_key.size > maximum_word_length)
 903             maximum_word_length = possible_key.size;
 904
 905           /* In input reference mode, update 'line_start' from its previous
 906              value.  Count the lines just in case auto reference mode is
 907              also selected. If it happens that the word just matched is
 908              indeed part of a reference; just ignore it.  */
 909
 910           if (input_reference)
 911             {
 912               while (line_scan < possible_key.start)
 913                 if (*line_scan == '\n')
 914                   {
 915                     total_line_count++;
 916                     line_scan++;
 917                     line_start = line_scan;
 918                     SKIP_NON_WHITE (line_scan, text_buffer.end);
 919                     reference_length = line_scan - line_start;
 920                   }
 921                 else
 922                   line_scan++;
 923               if (line_scan > possible_key.start)
 924                 continue;
 925             }
 926
 927           /* Ignore the word if an 'Ignore words' table exists and if it is
 928              part of it.  Also ignore the word if an 'Only words' table and
 929              if it is *not* part of it.
 930
 931              It is allowed that both tables be used at once, even if this
 932              may look strange for now.  Just ignore a word that would appear
 933              in both.  If regexps are eventually implemented for these
 934              tables, the Ignore table could then reject words that would
 935              have been previously accepted by the Only table.  */
 936
 937           if (ignore_file && search_table (&possible_key, &ignore_table))
 938             continue;
 939           if (only_file && !search_table (&possible_key, &only_table))
 940             continue;
 941
 942           /* A non-empty word has been found.  First of all, insure
 943              proper allocation of the next OCCURS, and make a pointer to
 944              where it will be constructed.  */
 945
 946           if (number_of_occurs[0] == occurs_alloc[0])
 947             {
 948               if ((SIZE_MAX / sizeof *occurs_table[0] - 1) / 2
 949                   < occurs_alloc[0])
 950                 xalloc_die ();
 951               occurs_alloc[0] = occurs_alloc[0] * 2 + 1;
 952               occurs_table[0] =
 953                 xrealloc (occurs_table[0],
 954                           occurs_alloc[0] * sizeof *occurs_table[0]);
 955             }
 956
 957           occurs_cursor = occurs_table[0] + number_of_occurs[0];
 958
 959           /* Define the reference field, if any.  */
 960
 961           if (auto_reference)
 962             {
 963
 964               /* While auto referencing, update 'line_start' from its
 965                  previous value, counting lines as we go.  If input
 966                  referencing at the same time, 'line_start' has been
 967                  advanced earlier, and the following loop is never really
 968                  executed.  */
 969
 970               while (line_scan < possible_key.start)
 971                 if (*line_scan == '\n')
 972                   {
 973                     total_line_count++;
 974                     line_scan++;
 975                     line_start = line_scan;
 976                     SKIP_NON_WHITE (line_scan, text_buffer.end);
 977                   }
 978                 else
 979                   line_scan++;
 980
 981               occurs_cursor->reference = total_line_count;
 982             }
 983           else if (input_reference)
 984             {
 985
 986               /* If only input referencing, 'line_start' has been computed
 987                  earlier to detect the case the word matched would be part
 988                  of the reference.  The reference position is simply the
 989                  value of 'line_start'.  */
 990
 991               occurs_cursor->reference
 992                 = (DELTA) (line_start - possible_key.start);
 993               if (reference_length > reference_max_width)
 994                 reference_max_width = reference_length;
 995             }
 996
 997           /* Exclude the reference from the context in simple cases.  */
 998
 999           if (input_reference && line_start == context_start)
1000             {
1001               SKIP_NON_WHITE (context_start, context_end);
1002               SKIP_WHITE (context_start, context_end);
1003             }
1004
1005           /* Completes the OCCURS structure.  */
1006
1007           occurs_cursor->key = possible_key;
1008           occurs_cursor->left = context_start - possible_key.start;
1009           occurs_cursor->right = context_end - possible_key.start;
1010
1011           number_of_occurs[0]++;
1012         }
1013     }
1014 }
1015 \f
1016 /* Formatting and actual output - service routines.  */
1017
/*-----------------------------------------.
| Write NUMBER space characters on stdout. |
| A NUMBER of zero or less writes nothing. |
`-----------------------------------------*/

static void
print_spaces (int number)
{
  int emitted = 0;              /* spaces written so far */

  while (emitted < number)
    {
      putchar (' ');
      emitted++;
    }
}
1030
1031 /*-------------------------------------.
1032 | Prints the field provided by FIELD.  |
1033 `-------------------------------------*/
1034
1035 static void
1036 print_field (BLOCK field)
1037 {
1038   char *cursor;                 /* Cursor in field to print */
1039   int base;                     /* Base character, without diacritic */
1040   int diacritic;                /* Diacritic code for the character */
1041
1042   /* Whitespace is not really compressed.  Instead, each white space
1043      character (tab, vt, ht etc.) is printed as one single space.  */
1044
1045   for (cursor = field.start; cursor < field.end; cursor++)
1046     {
1047       unsigned char character = *cursor;
1048       if (edited_flag[character])
1049         {
1050
1051           /* First check if this is a diacriticized character.
1052
1053              This works only for TeX.  I do not know how diacriticized
1054              letters work with 'roff'.  Please someone explain it to me!  */
1055
1056           diacritic = todiac (character);
1057           if (diacritic != 0 && output_format == TEX_FORMAT)
1058             {
1059               base = tobase (character);
1060               switch (diacritic)
1061                 {
1062
1063                 case 1:         /* Latin diphthongs */
1064                   switch (base)
1065                     {
1066                     case 'o':
1067                       fputs ("\\oe{}", stdout);
1068                       break;
1069
1070                     case 'O':
1071                       fputs ("\\OE{}", stdout);
1072                       break;
1073
1074                     case 'a':
1075                       fputs ("\\ae{}", stdout);
1076                       break;
1077
1078                     case 'A':
1079                       fputs ("\\AE{}", stdout);
1080                       break;
1081
1082                     default:
1083                       putchar (' ');
1084                     }
1085                   break;
1086
1087                 case 2:         /* Acute accent */
1088                   printf ("\\'%s%c", (base == 'i' ? "\\" : ""), base);
1089                   break;
1090
1091                 case 3:         /* Grave accent */
1092                   printf ("\\'%s%c", (base == 'i' ? "\\" : ""), base);
1093                   break;
1094
1095                 case 4:         /* Circumflex accent */
1096                   printf ("\\^%s%c", (base == 'i' ? "\\" : ""), base);
1097                   break;
1098
1099                 case 5:         /* Diaeresis */
1100                   printf ("\\\"%s%c", (base == 'i' ? "\\" : ""), base);
1101                   break;
1102
1103                 case 6:         /* Tilde accent */
1104                   printf ("\\~%s%c", (base == 'i' ? "\\" : ""), base);
1105                   break;
1106
1107                 case 7:         /* Cedilla */
1108                   printf ("\\c{%c}", base);
1109                   break;
1110
1111                 case 8:         /* Small circle beneath */
1112                   switch (base)
1113                     {
1114                     case 'a':
1115                       fputs ("\\aa{}", stdout);
1116                       break;
1117
1118                     case 'A':
1119                       fputs ("\\AA{}", stdout);
1120                       break;
1121
1122                     default:
1123                       putchar (' ');
1124                     }
1125                   break;
1126
1127                 case 9:         /* Strike through */
1128                   switch (base)
1129                     {
1130                     case 'o':
1131                       fputs ("\\o{}", stdout);
1132                       break;
1133
1134                     case 'O':
1135                       fputs ("\\O{}", stdout);
1136                       break;
1137
1138                     default:
1139                       putchar (' ');
1140                     }
1141                   break;
1142                 }
1143             }
1144           else
1145
1146             /* This is not a diacritic character, so handle cases which are
1147                really specific to 'roff' or TeX.  All white space processing
1148                is done as the default case of this switch.  */
1149
1150             switch (character)
1151               {
1152               case '"':
1153                 /* In roff output format, double any quote.  */
1154                 putchar ('"');
1155                 putchar ('"');
1156                 break;
1157
1158               case '$':
1159               case '%':
1160               case '&':
1161               case '#':
1162               case '_':
1163                 /* In TeX output format, precede these with a backslash.  */
1164                 putchar ('\\');
1165                 putchar (character);
1166                 break;
1167
1168               case '{':
1169               case '}':
1170                 /* In TeX output format, precede these with a backslash and
1171                    force mathematical mode.  */
1172                 printf ("$\\%c$", character);
1173                 break;
1174
1175               case '\\':
1176                 /* In TeX output mode, request production of a backslash.  */
1177                 fputs ("\\backslash{}", stdout);
1178                 break;
1179
1180               default:
1181                 /* Any other flagged character produces a single space.  */
1182                 putchar (' ');
1183               }
1184         }
1185       else
1186         putchar (*cursor);
1187     }
1188 }
1189 \f
1190 /* Formatting and actual output - planning routines.  */
1191
1192 /*--------------------------------------------------------------------.
1193 | From information collected from command line options and input file |
1194 | readings, compute and fix some output parameter values.             |
1195 `--------------------------------------------------------------------*/
1196
1197 static void
1198 fix_output_parameters (void)
1199 {
1200   int file_index;               /* index in text input file arrays */
1201   int line_ordinal;             /* line ordinal value for reference */
1202   char ordinal_string[12];      /* edited line ordinal for reference */
1203   int reference_width;          /* width for the whole reference */
1204   int character;                /* character ordinal */
1205   const char *cursor;           /* cursor in some constant strings */
1206
1207   /* In auto reference mode, the maximum width of this field is
1208      precomputed and subtracted from the overall line width.  Add one for
1209      the column which separate the file name from the line number.  */
1210
1211   if (auto_reference)
1212     {
1213       reference_max_width = 0;
1214       for (file_index = 0; file_index < number_input_files; file_index++)
1215         {
1216           line_ordinal = file_line_count[file_index] + 1;
1217           if (file_index > 0)
1218             line_ordinal -= file_line_count[file_index - 1];
1219           sprintf (ordinal_string, "%d", line_ordinal);
1220           reference_width = strlen (ordinal_string);
1221           if (input_file_name[file_index])
1222             reference_width += strlen (input_file_name[file_index]);
1223           if (reference_width > reference_max_width)
1224             reference_max_width = reference_width;
1225         }
1226       reference_max_width++;
1227       reference.start = xmalloc ((size_t) reference_max_width + 1);
1228     }
1229
1230   /* If the reference appears to the left of the output line, reserve some
1231      space for it right away, including one gap size.  */
1232
1233   if ((auto_reference || input_reference) && !right_reference)
1234     line_width -= reference_max_width + gap_size;
1235
1236   /* The output lines, minimally, will contain from left to right a left
1237      context, a gap, and a keyword followed by the right context with no
1238      special intervening gap.  Half of the line width is dedicated to the
1239      left context and the gap, the other half is dedicated to the keyword
1240      and the right context; these values are computed once and for all here.
1241      There also are tail and head wrap around fields, used when the keyword
1242      is near the beginning or the end of the line, or when some long word
1243      cannot fit in, but leave place from wrapped around shorter words.  The
1244      maximum width of these fields are recomputed separately for each line,
1245      on a case by case basis.  It is worth noting that it cannot happen that
1246      both the tail and head fields are used at once.  */
1247
1248   half_line_width = line_width / 2;
1249   before_max_width = half_line_width - gap_size;
1250   keyafter_max_width = half_line_width;
1251
1252   /* If truncation_string is the empty string, make it NULL to speed up
1253      tests.  In this case, truncation_string_length will never get used, so
1254      there is no need to set it.  */
1255
1256   if (truncation_string && *truncation_string)
1257     truncation_string_length = strlen (truncation_string);
1258   else
1259     truncation_string = NULL;
1260
1261   if (gnu_extensions)
1262     {
1263
1264       /* When flagging truncation at the left of the keyword, the
1265          truncation mark goes at the beginning of the before field,
1266          unless there is a head field, in which case the mark goes at the
1267          left of the head field.  When flagging truncation at the right
1268          of the keyword, the mark goes at the end of the keyafter field,
1269          unless there is a tail field, in which case the mark goes at the
1270          end of the tail field.  Only eight combination cases could arise
1271          for truncation marks:
1272
1273          . None.
1274          . One beginning the before field.
1275          . One beginning the head field.
1276          . One ending the keyafter field.
1277          . One ending the tail field.
1278          . One beginning the before field, another ending the keyafter field.
1279          . One ending the tail field, another beginning the before field.
1280          . One ending the keyafter field, another beginning the head field.
1281
1282          So, there is at most two truncation marks, which could appear both
1283          on the left side of the center of the output line, both on the
1284          right side, or one on either side.  */
1285
1286       before_max_width -= 2 * truncation_string_length;
1287       if (before_max_width < 0)
1288         before_max_width = 0;
1289       keyafter_max_width -= 2 * truncation_string_length;
1290     }
1291   else
1292     {
1293
1294       /* I never figured out exactly how UNIX' ptx plans the output width
1295          of its various fields.  If GNU extensions are disabled, do not
1296          try computing the field widths correctly; instead, use the
1297          following formula, which does not completely imitate UNIX' ptx,
1298          but almost.  */
1299
1300       keyafter_max_width -= 2 * truncation_string_length + 1;
1301     }
1302
1303   /* Compute which characters need special output processing.  Initialize
1304      by flagging any white space character.  Some systems do not consider
1305      form feed as a space character, but we do.  */
1306
1307   for (character = 0; character < CHAR_SET_SIZE; character++)
1308     edited_flag[character] = !! isspace (character);
1309   edited_flag['\f'] = 1;
1310
1311   /* Complete the special character flagging according to selected output
1312      format.  */
1313
1314   switch (output_format)
1315     {
1316     case UNKNOWN_FORMAT:
1317       /* Should never happen.  */
1318
1319     case DUMB_FORMAT:
1320       break;
1321
1322     case ROFF_FORMAT:
1323
1324       /* 'Quote' characters should be doubled.  */
1325
1326       edited_flag['"'] = 1;
1327       break;
1328
1329     case TEX_FORMAT:
1330
1331       /* Various characters need special processing.  */
1332
1333       for (cursor = "$%&#_{}\\"; *cursor; cursor++)
1334         edited_flag[to_uchar (*cursor)] = 1;
1335
1336       /* Any character with 8th bit set will print to a single space, unless
1337          it is diacriticized.  */
1338
1339       for (character = 0200; character < CHAR_SET_SIZE; character++)
1340         edited_flag[character] = todiac (character) != 0;
1341       break;
1342     }
1343 }
1344
1345 /*------------------------------------------------------------------.
1346 | Compute the position and length of all the output fields, given a |
1347 | pointer to some OCCURS.                                           |
1348 `------------------------------------------------------------------*/
1349
1350 static void
1351 define_all_fields (OCCURS *occurs)
1352 {
1353   int tail_max_width;           /* allowable width of tail field */
1354   int head_max_width;           /* allowable width of head field */
1355   char *cursor;                 /* running cursor in source text */
1356   char *left_context_start;     /* start of left context */
1357   char *right_context_end;      /* end of right context */
1358   char *left_field_start;       /* conservative start for 'head'/'before' */
1359   int file_index;               /* index in text input file arrays */
1360   const char *file_name;        /* file name for reference */
1361   int line_ordinal;             /* line ordinal for reference */
1362
1363   /* Define 'keyafter', start of left context and end of right context.
1364      'keyafter' starts at the saved position for keyword and extend to the
1365      right from the end of the keyword, eating separators or full words, but
1366      not beyond maximum allowed width for 'keyafter' field or limit for the
1367      right context.  Suffix spaces will be removed afterwards.  */
1368
1369   keyafter.start = occurs->key.start;
1370   keyafter.end = keyafter.start + occurs->key.size;
1371   left_context_start = keyafter.start + occurs->left;
1372   right_context_end = keyafter.start + occurs->right;
1373
1374   cursor = keyafter.end;
1375   while (cursor < right_context_end
1376          && cursor <= keyafter.start + keyafter_max_width)
1377     {
1378       keyafter.end = cursor;
1379       SKIP_SOMETHING (cursor, right_context_end);
1380     }
1381   if (cursor <= keyafter.start + keyafter_max_width)
1382     keyafter.end = cursor;
1383
1384   keyafter_truncation = truncation_string && keyafter.end < right_context_end;
1385
1386   SKIP_WHITE_BACKWARDS (keyafter.end, keyafter.start);
1387
1388   /* When the left context is wide, it might take some time to catch up from
1389      the left context boundary to the beginning of the 'head' or 'before'
1390      fields.  So, in this case, to speed the catchup, we jump back from the
1391      keyword, using some secure distance, possibly falling in the middle of
1392      a word.  A secure backward jump would be at least half the maximum
1393      width of a line, plus the size of the longest word met in the whole
1394      input.  We conclude this backward jump by a skip forward of at least
1395      one word.  In this manner, we should not inadvertently accept only part
1396      of a word.  From the reached point, when it will be time to fix the
1397      beginning of 'head' or 'before' fields, we will skip forward words or
1398      delimiters until we get sufficiently near.  */
1399
1400   if (-occurs->left > half_line_width + maximum_word_length)
1401     {
1402       left_field_start
1403         = keyafter.start - (half_line_width + maximum_word_length);
1404       SKIP_SOMETHING (left_field_start, keyafter.start);
1405     }
1406   else
1407     left_field_start = keyafter.start + occurs->left;
1408
1409   /* 'before' certainly ends at the keyword, but not including separating
1410      spaces.  It starts after than the saved value for the left context, by
1411      advancing it until it falls inside the maximum allowed width for the
1412      before field.  There will be no prefix spaces either.  'before' only
1413      advances by skipping single separators or whole words. */
1414
1415   before.start = left_field_start;
1416   before.end = keyafter.start;
1417   SKIP_WHITE_BACKWARDS (before.end, before.start);
1418
1419   while (before.start + before_max_width < before.end)
1420     SKIP_SOMETHING (before.start, before.end);
1421
1422   if (truncation_string)
1423     {
1424       cursor = before.start;
1425       SKIP_WHITE_BACKWARDS (cursor, text_buffer.start);
1426       before_truncation = cursor > left_context_start;
1427     }
1428   else
1429     before_truncation = 0;
1430
1431   SKIP_WHITE (before.start, text_buffer.end);
1432
1433   /* The tail could not take more columns than what has been left in the
1434      left context field, and a gap is mandatory.  It starts after the
1435      right context, and does not contain prefixed spaces.  It ends at
1436      the end of line, the end of buffer or when the tail field is full,
1437      whichever comes first.  It cannot contain only part of a word, and
1438      has no suffixed spaces.  */
1439
1440   tail_max_width
1441     = before_max_width - (before.end - before.start) - gap_size;
1442
1443   if (tail_max_width > 0)
1444     {
1445       tail.start = keyafter.end;
1446       SKIP_WHITE (tail.start, text_buffer.end);
1447
1448       tail.end = tail.start;
1449       cursor = tail.end;
1450       while (cursor < right_context_end
1451              && cursor < tail.start + tail_max_width)
1452         {
1453           tail.end = cursor;
1454           SKIP_SOMETHING (cursor, right_context_end);
1455         }
1456
1457       if (cursor < tail.start + tail_max_width)
1458         tail.end = cursor;
1459
1460       if (tail.end > tail.start)
1461         {
1462           keyafter_truncation = 0;
1463           tail_truncation = truncation_string && tail.end < right_context_end;
1464         }
1465       else
1466         tail_truncation = 0;
1467
1468       SKIP_WHITE_BACKWARDS (tail.end, tail.start);
1469     }
1470   else
1471     {
1472
1473       /* No place left for a tail field.  */
1474
1475       tail.start = NULL;
1476       tail.end = NULL;
1477       tail_truncation = 0;
1478     }
1479
1480   /* 'head' could not take more columns than what has been left in the right
1481      context field, and a gap is mandatory.  It ends before the left
1482      context, and does not contain suffixed spaces.  Its pointer is advanced
1483      until the head field has shrunk to its allowed width.  It cannot
1484      contain only part of a word, and has no suffixed spaces.  */
1485
1486   head_max_width
1487     = keyafter_max_width - (keyafter.end - keyafter.start) - gap_size;
1488
1489   if (head_max_width > 0)
1490     {
1491       head.end = before.start;
1492       SKIP_WHITE_BACKWARDS (head.end, text_buffer.start);
1493
1494       head.start = left_field_start;
1495       while (head.start + head_max_width < head.end)
1496         SKIP_SOMETHING (head.start, head.end);
1497
1498       if (head.end > head.start)
1499         {
1500           before_truncation = 0;
1501           head_truncation = (truncation_string
1502                              && head.start > left_context_start);
1503         }
1504       else
1505         head_truncation = 0;
1506
1507       SKIP_WHITE (head.start, head.end);
1508     }
1509   else
1510     {
1511
1512       /* No place left for a head field.  */
1513
1514       head.start = NULL;
1515       head.end = NULL;
1516       head_truncation = 0;
1517     }
1518
1519   if (auto_reference)
1520     {
1521
1522       /* Construct the reference text in preallocated space from the file
1523          name and the line number.  Find out in which file the reference
1524          occurred.  Standard input yields an empty file name.  Insure line
1525          numbers are one based, even if they are computed zero based.  */
1526
1527       file_index = 0;
1528       while (file_line_count[file_index] < occurs->reference)
1529         file_index++;
1530
1531       file_name = input_file_name[file_index];
1532       if (!file_name)
1533         file_name = "";
1534
1535       line_ordinal = occurs->reference + 1;
1536       if (file_index > 0)
1537         line_ordinal -= file_line_count[file_index - 1];
1538
1539       sprintf (reference.start, "%s:%d", file_name, line_ordinal);
1540       reference.end = reference.start + strlen (reference.start);
1541     }
1542   else if (input_reference)
1543     {
1544
1545       /* Reference starts at saved position for reference and extends right
1546          until some white space is met.  */
1547
1548       reference.start = keyafter.start + (DELTA) occurs->reference;
1549       reference.end = reference.start;
1550       SKIP_NON_WHITE (reference.end, right_context_end);
1551     }
1552 }
1553 \f
1554 /* Formatting and actual output - control routines.  */
1555
1556 /*----------------------------------------------------------------------.
1557 | Output the current output fields as one line for 'troff' or 'nroff'.  |
1558 `----------------------------------------------------------------------*/
1559
1560 static void
1561 output_one_roff_line (void)
1562 {
1563   /* Output the 'tail' field.  */
1564
1565   printf (".%s \"", macro_name);
1566   print_field (tail);
1567   if (tail_truncation)
1568     fputs (truncation_string, stdout);
1569   putchar ('"');
1570
1571   /* Output the 'before' field.  */
1572
1573   fputs (" \"", stdout);
1574   if (before_truncation)
1575     fputs (truncation_string, stdout);
1576   print_field (before);
1577   putchar ('"');
1578
1579   /* Output the 'keyafter' field.  */
1580
1581   fputs (" \"", stdout);
1582   print_field (keyafter);
1583   if (keyafter_truncation)
1584     fputs (truncation_string, stdout);
1585   putchar ('"');
1586
1587   /* Output the 'head' field.  */
1588
1589   fputs (" \"", stdout);
1590   if (head_truncation)
1591     fputs (truncation_string, stdout);
1592   print_field (head);
1593   putchar ('"');
1594
1595   /* Conditionally output the 'reference' field.  */
1596
1597   if (auto_reference || input_reference)
1598     {
1599       fputs (" \"", stdout);
1600       print_field (reference);
1601       putchar ('"');
1602     }
1603
1604   putchar ('\n');
1605 }
1606
1607 /*---------------------------------------------------------.
1608 | Output the current output fields as one line for 'TeX'.  |
1609 `---------------------------------------------------------*/
1610
1611 static void
1612 output_one_tex_line (void)
1613 {
1614   BLOCK key;                    /* key field, isolated */
1615   BLOCK after;                  /* after field, isolated */
1616   char *cursor;                 /* running cursor in source text */
1617
1618   printf ("\\%s ", macro_name);
1619   putchar ('{');
1620   print_field (tail);
1621   fputs ("}{", stdout);
1622   print_field (before);
1623   fputs ("}{", stdout);
1624   key.start = keyafter.start;
1625   after.end = keyafter.end;
1626   cursor = keyafter.start;
1627   SKIP_SOMETHING (cursor, keyafter.end);
1628   key.end = cursor;
1629   after.start = cursor;
1630   print_field (key);
1631   fputs ("}{", stdout);
1632   print_field (after);
1633   fputs ("}{", stdout);
1634   print_field (head);
1635   putchar ('}');
1636   if (auto_reference || input_reference)
1637     {
1638       putchar ('{');
1639       print_field (reference);
1640       putchar ('}');
1641     }
1642   putchar ('\n');
1643 }
1644
1645 /*-------------------------------------------------------------------.
1646 | Output the current output fields as one line for a dumb terminal.  |
1647 `-------------------------------------------------------------------*/
1648
1649 static void
1650 output_one_dumb_line (void)
1651 {
1652   if (!right_reference)
1653     {
1654       if (auto_reference)
1655         {
1656
1657           /* Output the 'reference' field, in such a way that GNU emacs
1658              next-error will handle it.  The ending colon is taken from the
1659              gap which follows.  */
1660
1661           print_field (reference);
1662           putchar (':');
1663           print_spaces (reference_max_width
1664                         + gap_size
1665                         - (reference.end - reference.start)
1666                         - 1);
1667         }
1668       else
1669         {
1670
1671           /* Output the 'reference' field and its following gap.  */
1672
1673           print_field (reference);
1674           print_spaces (reference_max_width
1675                         + gap_size
1676                         - (reference.end - reference.start));
1677         }
1678     }
1679
1680   if (tail.start < tail.end)
1681     {
1682       /* Output the 'tail' field.  */
1683
1684       print_field (tail);
1685       if (tail_truncation)
1686         fputs (truncation_string, stdout);
1687
1688       print_spaces (half_line_width - gap_size
1689                     - (before.end - before.start)
1690                     - (before_truncation ? truncation_string_length : 0)
1691                     - (tail.end - tail.start)
1692                     - (tail_truncation ? truncation_string_length : 0));
1693     }
1694   else
1695     print_spaces (half_line_width - gap_size
1696                   - (before.end - before.start)
1697                   - (before_truncation ? truncation_string_length : 0));
1698
1699   /* Output the 'before' field.  */
1700
1701   if (before_truncation)
1702     fputs (truncation_string, stdout);
1703   print_field (before);
1704
1705   print_spaces (gap_size);
1706
1707   /* Output the 'keyafter' field.  */
1708
1709   print_field (keyafter);
1710   if (keyafter_truncation)
1711     fputs (truncation_string, stdout);
1712
1713   if (head.start < head.end)
1714     {
1715       /* Output the 'head' field.  */
1716
1717       print_spaces (half_line_width
1718                     - (keyafter.end - keyafter.start)
1719                     - (keyafter_truncation ? truncation_string_length : 0)
1720                     - (head.end - head.start)
1721                     - (head_truncation ? truncation_string_length : 0));
1722       if (head_truncation)
1723         fputs (truncation_string, stdout);
1724       print_field (head);
1725     }
1726   else
1727
1728     if ((auto_reference || input_reference) && right_reference)
1729       print_spaces (half_line_width
1730                     - (keyafter.end - keyafter.start)
1731                     - (keyafter_truncation ? truncation_string_length : 0));
1732
1733   if ((auto_reference || input_reference) && right_reference)
1734     {
1735       /* Output the 'reference' field.  */
1736
1737       print_spaces (gap_size);
1738       print_field (reference);
1739     }
1740
1741   putchar ('\n');
1742 }
1743
1744 /*------------------------------------------------------------------------.
1745 | Scan the whole occurs table and, for each entry, output one line in the |
1746 | appropriate format.                                                     |
1747 `------------------------------------------------------------------------*/
1748
1749 static void
1750 generate_all_output (void)
1751 {
1752   size_t occurs_index;          /* index of keyword entry being processed */
1753   OCCURS *occurs_cursor;        /* current keyword entry being processed */
1754
1755   /* The following assignments are useful to provide default values in case
1756      line contexts or references are not used, in which case these variables
1757      would never be computed.  */
1758
1759   tail.start = NULL;
1760   tail.end = NULL;
1761   tail_truncation = 0;
1762
1763   head.start = NULL;
1764   head.end = NULL;
1765   head_truncation = 0;
1766
1767   /* Loop over all keyword occurrences.  */
1768
1769   occurs_cursor = occurs_table[0];
1770
1771   for (occurs_index = 0; occurs_index < number_of_occurs[0]; occurs_index++)
1772     {
1773       /* Compute the exact size of every field and whenever truncation flags
1774          are present or not.  */
1775
1776       define_all_fields (occurs_cursor);
1777
1778       /* Produce one output line according to selected format.  */
1779
1780       switch (output_format)
1781         {
1782         case UNKNOWN_FORMAT:
1783           /* Should never happen.  */
1784
1785         case DUMB_FORMAT:
1786           output_one_dumb_line ();
1787           break;
1788
1789         case ROFF_FORMAT:
1790           output_one_roff_line ();
1791           break;
1792
1793         case TEX_FORMAT:
1794           output_one_tex_line ();
1795           break;
1796         }
1797
1798       /* Advance the cursor into the occurs table.  */
1799
1800       occurs_cursor++;
1801     }
1802 }
1803 \f
1804 /* Option decoding and main program.  */
1805
1806 /*------------------------------------------------------.
1807 | Print program identification and options, then exit.  |
1808 `------------------------------------------------------*/
1809
1810 void
1811 usage (int status)
1812 {
1813   if (status != EXIT_SUCCESS)
1814     emit_try_help ();
1815   else
1816     {
1817       printf (_("\
1818 Usage: %s [OPTION]... [INPUT]...   (without -G)\n\
1819   or:  %s -G [OPTION]... [INPUT [OUTPUT]]\n"),
1820               program_name, program_name);
1821       fputs (_("\
1822 Output a permuted index, including context, of the words in the input files.\n\
1823 \n\
1824 "), stdout);
1825       fputs (_("\
1826 Mandatory arguments to long options are mandatory for short options too.\n\
1827 "), stdout);
1828       fputs (_("\
1829   -A, --auto-reference           output automatically generated references\n\
1830   -G, --traditional              behave more like System V 'ptx'\n\
1831   -F, --flag-truncation=STRING   use STRING for flagging line truncations\n\
1832 "), stdout);
1833       fputs (_("\
1834   -M, --macro-name=STRING        macro name to use instead of 'xx'\n\
1835   -O, --format=roff              generate output as roff directives\n\
1836   -R, --right-side-refs          put references at right, not counted in -w\n\
1837   -S, --sentence-regexp=REGEXP   for end of lines or end of sentences\n\
1838   -T, --format=tex               generate output as TeX directives\n\
1839 "), stdout);
1840       fputs (_("\
1841   -W, --word-regexp=REGEXP       use REGEXP to match each keyword\n\
1842   -b, --break-file=FILE          word break characters in this FILE\n\
1843   -f, --ignore-case              fold lower case to upper case for sorting\n\
1844   -g, --gap-size=NUMBER          gap size in columns between output fields\n\
1845   -i, --ignore-file=FILE         read ignore word list from FILE\n\
1846   -o, --only-file=FILE           read only word list from this FILE\n\
1847 "), stdout);
1848       fputs (_("\
1849   -r, --references               first field of each line is a reference\n\
1850   -t, --typeset-mode               - not implemented -\n\
1851   -w, --width=NUMBER             output width in columns, reference excluded\n\
1852 "), stdout);
1853       fputs (HELP_OPTION_DESCRIPTION, stdout);
1854       fputs (VERSION_OPTION_DESCRIPTION, stdout);
1855       fputs (_("\
1856 \n\
1857 With no FILE or if FILE is -, read Standard Input.  '-F /' by default.\n\
1858 "), stdout);
1859       emit_ancillary_info ();
1860     }
1861   exit (status);
1862 }
1863
1864 /*----------------------------------------------------------------------.
1865 | Main program.  Decode ARGC arguments passed through the ARGV array of |
1866 | strings, then launch execution.                                       |
1867 `----------------------------------------------------------------------*/
1868
1869 /* Long options equivalences.  */
1870 static struct option const long_options[] =
1871 {
1872   {"auto-reference", no_argument, NULL, 'A'},
1873   {"break-file", required_argument, NULL, 'b'},
1874   {"flag-truncation", required_argument, NULL, 'F'},
1875   {"ignore-case", no_argument, NULL, 'f'},
1876   {"gap-size", required_argument, NULL, 'g'},
1877   {"ignore-file", required_argument, NULL, 'i'},
1878   {"macro-name", required_argument, NULL, 'M'},
1879   {"only-file", required_argument, NULL, 'o'},
1880   {"references", no_argument, NULL, 'r'},
1881   {"right-side-refs", no_argument, NULL, 'R'},
1882   {"format", required_argument, NULL, 10},
1883   {"sentence-regexp", required_argument, NULL, 'S'},
1884   {"traditional", no_argument, NULL, 'G'},
1885   {"typeset-mode", no_argument, NULL, 't'},
1886   {"width", required_argument, NULL, 'w'},
1887   {"word-regexp", required_argument, NULL, 'W'},
1888   {GETOPT_HELP_OPTION_DECL},
1889   {GETOPT_VERSION_OPTION_DECL},
1890   {NULL, 0, NULL, 0},
1891 };
1892
1893 static char const* const format_args[] =
1894 {
1895   "roff", "tex", NULL
1896 };
1897
1898 static enum Format const format_vals[] =
1899 {
1900   ROFF_FORMAT, TEX_FORMAT
1901 };
1902
1903 int
1904 main (int argc, char **argv)
1905 {
1906   int optchar;                  /* argument character */
1907   int file_index;               /* index in text input file arrays */
1908
1909   /* Decode program options.  */
1910
1911   initialize_main (&argc, &argv);
1912   set_program_name (argv[0]);
1913   setlocale (LC_ALL, "");
1914   bindtextdomain (PACKAGE, LOCALEDIR);
1915   textdomain (PACKAGE);
1916
1917   atexit (close_stdout);
1918
1919 #if HAVE_SETCHRCLASS
1920   setchrclass (NULL);
1921 #endif
1922
1923   while (optchar = getopt_long (argc, argv, "AF:GM:ORS:TW:b:i:fg:o:trw:",
1924                                 long_options, NULL),
1925          optchar != EOF)
1926     {
1927       switch (optchar)
1928         {
1929         default:
1930           usage (EXIT_FAILURE);
1931
1932         case 'G':
1933           gnu_extensions = false;
1934           break;
1935
1936         case 'b':
1937           break_file = optarg;
1938           break;
1939
1940         case 'f':
1941           ignore_case = true;
1942           break;
1943
1944         case 'g':
1945           {
1946             unsigned long int tmp_ulong;
1947             if (xstrtoul (optarg, NULL, 0, &tmp_ulong, NULL) != LONGINT_OK
1948                 || ! (0 < tmp_ulong && tmp_ulong <= INT_MAX))
1949               error (EXIT_FAILURE, 0, _("invalid gap width: %s"),
1950                      quotearg (optarg));
1951             gap_size = tmp_ulong;
1952             break;
1953           }
1954
1955         case 'i':
1956           ignore_file = optarg;
1957           break;
1958
1959         case 'o':
1960           only_file = optarg;
1961           break;
1962
1963         case 'r':
1964           input_reference = true;
1965           break;
1966
1967         case 't':
1968           /* Yet to understand...  */
1969           break;
1970
1971         case 'w':
1972           {
1973             unsigned long int tmp_ulong;
1974             if (xstrtoul (optarg, NULL, 0, &tmp_ulong, NULL) != LONGINT_OK
1975                 || ! (0 < tmp_ulong && tmp_ulong <= INT_MAX))
1976               error (EXIT_FAILURE, 0, _("invalid line width: %s"),
1977                      quotearg (optarg));
1978             line_width = tmp_ulong;
1979             break;
1980           }
1981
1982         case 'A':
1983           auto_reference = true;
1984           break;
1985
1986         case 'F':
1987           truncation_string = copy_unescaped_string (optarg);
1988           break;
1989
1990         case 'M':
1991           macro_name = optarg;
1992           break;
1993
1994         case 'O':
1995           output_format = ROFF_FORMAT;
1996           break;
1997
1998         case 'R':
1999           right_reference = true;
2000           break;
2001
2002         case 'S':
2003           context_regex.string = copy_unescaped_string (optarg);
2004           break;
2005
2006         case 'T':
2007           output_format = TEX_FORMAT;
2008           break;
2009
2010         case 'W':
2011           word_regex.string = copy_unescaped_string (optarg);
2012           if (!*word_regex.string)
2013             word_regex.string = NULL;
2014           break;
2015
2016         case 10:
2017           output_format = XARGMATCH ("--format", optarg,
2018                                      format_args, format_vals);
2019         case_GETOPT_HELP_CHAR;
2020
2021         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2022         }
2023     }
2024
2025   /* Process remaining arguments.  If GNU extensions are enabled, process
2026      all arguments as input parameters.  If disabled, accept at most two
2027      arguments, the second of which is an output parameter.  */
2028
2029   if (optind == argc)
2030     {
2031
2032       /* No more argument simply means: read standard input.  */
2033
2034       input_file_name = xmalloc (sizeof *input_file_name);
2035       file_line_count = xmalloc (sizeof *file_line_count);
2036       number_input_files = 1;
2037       input_file_name[0] = NULL;
2038     }
2039   else if (gnu_extensions)
2040     {
2041       number_input_files = argc - optind;
2042       input_file_name = xmalloc (number_input_files * sizeof *input_file_name);
2043       file_line_count = xmalloc (number_input_files * sizeof *file_line_count);
2044
2045       for (file_index = 0; file_index < number_input_files; file_index++)
2046         {
2047           if (!*argv[optind] || STREQ (argv[optind], "-"))
2048             input_file_name[file_index] = NULL;
2049           else
2050             input_file_name[file_index] = argv[optind];
2051           optind++;
2052         }
2053     }
2054   else
2055     {
2056
2057       /* There is one necessary input file.  */
2058
2059       number_input_files = 1;
2060       input_file_name = xmalloc (sizeof *input_file_name);
2061       file_line_count = xmalloc (sizeof *file_line_count);
2062       if (!*argv[optind] || STREQ (argv[optind], "-"))
2063         input_file_name[0] = NULL;
2064       else
2065         input_file_name[0] = argv[optind];
2066       optind++;
2067
2068       /* Redirect standard output, only if requested.  */
2069
2070       if (optind < argc)
2071         {
2072           if (! freopen (argv[optind], "w", stdout))
2073             error (EXIT_FAILURE, errno, "%s", argv[optind]);
2074           optind++;
2075         }
2076
2077       /* Diagnose any other argument as an error.  */
2078
2079       if (optind < argc)
2080         {
2081           error (0, 0, _("extra operand %s"), quote (argv[optind]));
2082           usage (EXIT_FAILURE);
2083         }
2084     }
2085
2086   /* If the output format has not been explicitly selected, choose dumb
2087      terminal format if GNU extensions are enabled, else 'roff' format.  */
2088
2089   if (output_format == UNKNOWN_FORMAT)
2090     output_format = gnu_extensions ? DUMB_FORMAT : ROFF_FORMAT;
2091
2092   /* Initialize the main tables.  */
2093
2094   initialize_regex ();
2095
2096   /* Read 'Break character' file, if any.  */
2097
2098   if (break_file)
2099     digest_break_file (break_file);
2100
2101   /* Read 'Ignore words' file and 'Only words' files, if any.  If any of
2102      these files is empty, reset the name of the file to NULL, to avoid
2103      unnecessary calls to search_table. */
2104
2105   if (ignore_file)
2106     {
2107       digest_word_file (ignore_file, &ignore_table);
2108       if (ignore_table.length == 0)
2109         ignore_file = NULL;
2110     }
2111
2112   if (only_file)
2113     {
2114       digest_word_file (only_file, &only_table);
2115       if (only_table.length == 0)
2116         only_file = NULL;
2117     }
2118
2119   /* Prepare to study all the input files.  */
2120
2121   number_of_occurs[0] = 0;
2122   total_line_count = 0;
2123   maximum_word_length = 0;
2124   reference_max_width = 0;
2125
2126   for (file_index = 0; file_index < number_input_files; file_index++)
2127     {
2128
2129       /* Read the file in core, than study it.  */
2130
2131       swallow_file_in_memory (input_file_name[file_index], &text_buffer);
2132       find_occurs_in_text ();
2133
2134       /* Maintain for each file how many lines has been read so far when its
2135          end is reached.  Incrementing the count first is a simple kludge to
2136          handle a possible incomplete line at end of file.  */
2137
2138       total_line_count++;
2139       file_line_count[file_index] = total_line_count;
2140     }
2141
2142   /* Do the output process phase.  */
2143
2144   sort_found_occurs ();
2145   fix_output_parameters ();
2146   generate_all_output ();
2147
2148   /* All done.  */
2149
2150   exit (EXIT_SUCCESS);
2151 }