1 /* Permuted index for GNU, with keywords in their context.
2 Copyright (C) 1990, 1991, 1993, 1998-2006 Free Software Foundation, Inc.
3 François Pinard <pinard@iro.umontreal.ca>, 1988.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 François Pinard <pinard@iro.umontreal.ca> */
25 #include <sys/types.h>
35 /* The official name of this program (e.g., no `g' prefix). */
36 #define PROGRAM_NAME "ptx"
38 /* Note to translator: Please translate "F. Pinard" to "François
39 Pinard" if "ç" (c-with-cedilla) is available in the
40 translation's character set and encoding. */
41 #define AUTHORS _("F. Pinard")
43 /* Number of possible characters in a byte. */
44 #define CHAR_SET_SIZE 256
46 #define ISODIGIT(C) ((C) >= '0' && (C) <= '7')
47 #define HEXTOBIN(C) ((C) >= 'a' && (C) <= 'f' ? (C)-'a'+10 \
48 : (C) >= 'A' && (C) <= 'F' ? (C)-'A'+10 : (C)-'0')
49 #define OCTTOBIN(C) ((C) - '0')
51 /* Debugging the memory allocator. */
54 # define MALLOC_FUNC_CHECK 1
58 /* Global definitions. */
60 /* FIXME: There are many unchecked integer overflows in this file,
61 that will cause this command to misbehave given large inputs or
62 options. Many of the "int" values below should be "size_t" or
63 something else like that. */
65 /* Reallocation step when swallowing non regular files. The value is not
66 the actual reallocation step, but its base two logarithm. */
67 #define SWALLOW_REALLOC_LOG 12
69 /* Imported from "regex.c". */
72 /* The name this program was run with. */
75 /* Program options. */
79 UNKNOWN_FORMAT, /* output format still unknown */
80 DUMB_FORMAT, /* output for a dumb terminal */
81 ROFF_FORMAT, /* output for `troff' or `nroff' */
82 TEX_FORMAT /* output for `TeX' or `LaTeX' */
85 static bool gnu_extensions = true; /* trigger all GNU extensions */
86 static bool auto_reference = false; /* refs are `file_name:line_number:' */
87 static bool input_reference = false; /* refs at beginning of input lines */
88 static bool right_reference = false; /* output refs after right context */
89 static int line_width = 72; /* output line width in characters */
90 static int gap_size = 3; /* number of spaces between output fields */
91 static const char *truncation_string = "/";
92 /* string used to mark line truncations */
93 static const char *macro_name = "xx"; /* macro name for roff or TeX output */
94 static enum Format output_format = UNKNOWN_FORMAT;
97 static bool ignore_case = false; /* fold lower to upper for sorting */
98 static const char *break_file = NULL; /* name of the `Break characters' file */
99 static const char *only_file = NULL; /* name of the `Only words' file */
100 static const char *ignore_file = NULL; /* name of the `Ignore words' file */
102 /* Options that use regular expressions. */
105 /* The original regular expression, as a string. */
108 /* The compiled regular expression, and its fastmap. */
109 struct re_pattern_buffer pattern;
110 char fastmap[UCHAR_MAX + 1];
113 static struct regex_data context_regex; /* end of context */
114 static struct regex_data word_regex; /* keyword */
116 /* A BLOCK delimit a region in memory of arbitrary size, like the copy of a
117 whole file. A WORD is something smaller, its length should fit in a
118 short integer. A WORD_TABLE may contain several WORDs. */
122 char *start; /* pointer to beginning of region */
123 char *end; /* pointer to end + 1 of region */
129 char *start; /* pointer to beginning of region */
130 short int size; /* length of the region */
136 WORD *start; /* array of WORDs */
137 size_t alloc; /* allocated length */
138 size_t length; /* number of used entries */
142 /* Pattern description tables. */
144 /* For each character, provide its folded equivalent. */
145 static unsigned char folded_chars[CHAR_SET_SIZE];
147 /* End of context pattern register indices. */
148 static struct re_registers context_regs;
150 /* Keyword pattern register indices. */
151 static struct re_registers word_regs;
153 /* A word characters fastmap is used only when no word regexp has been
154 provided. A word is then made up of a sequence of one or more characters
155 allowed by the fastmap. Contains !0 if character allowed in word. Not
156 only this is faster in most cases, but it simplifies the implementation
157 of the Break files. */
158 static char word_fastmap[CHAR_SET_SIZE];
160 /* Maximum length of any word read. */
161 static int maximum_word_length;
163 /* Maximum width of any reference used. */
164 static int reference_max_width;
166 /* Ignore and Only word tables. */
168 static WORD_TABLE ignore_table; /* table of words to ignore */
169 static WORD_TABLE only_table; /* table of words to select */
171 /* Source text table, and scanning macros. */
173 static int number_input_files; /* number of text input files */
174 static int total_line_count; /* total number of lines seen so far */
175 static const char **input_file_name; /* array of text input file names */
176 static int *file_line_count; /* array of `total_line_count' values at end */
178 static BLOCK text_buffer; /* file to study */
180 /* SKIP_NON_WHITE used only for getting or skipping the reference. */
182 #define SKIP_NON_WHITE(cursor, limit) \
183 while (cursor < limit && !ISSPACE(*cursor)) \
186 #define SKIP_WHITE(cursor, limit) \
187 while (cursor < limit && ISSPACE(*cursor)) \
190 #define SKIP_WHITE_BACKWARDS(cursor, start) \
191 while (cursor > start && ISSPACE(cursor[-1])) \
194 #define SKIP_SOMETHING(cursor, limit) \
195 if (word_regex.string) \
198 count = re_match (&word_regex.pattern, cursor, limit - cursor, 0, NULL); \
201 cursor += count == -1 ? 1 : count; \
203 else if (word_fastmap[to_uchar (*cursor)]) \
204 while (cursor < limit && word_fastmap[to_uchar (*cursor)]) \
209 /* Occurrences table.
211 The `keyword' pointer provides the central word, which is surrounded
212 by a left context and a right context. The `keyword' and `length'
213 field allow full 8-bit characters keys, even including NULs. At other
214 places in this program, the name `keyafter' refers to the keyword
215 followed by its right context.
217 The left context does not extend, towards the beginning of the file,
218 further than a distance given by the `left' value. This value is
219 relative to the keyword beginning, it is usually negative. This
220 insures that, except for white space, we will never have to backward
221 scan the source text, when it is time to generate the final output
224 The right context, indirectly attainable through the keyword end, does
225 not extend, towards the end of the file, further than a distance given
226 by the `right' value. This value is relative to the keyword
227 beginning, it is usually positive.
229 When automatic references are used, the `reference' value is the
230 overall line number in all input files read so far, in this case, it
231 is of type (int). When input references are used, the `reference'
232 value indicates the distance between the keyword beginning and the
233 start of the reference field, it is of type (DELTA) and usually
236 typedef short int DELTA; /* to hold displacement within one context */
240 WORD key; /* description of the keyword */
241 DELTA left; /* distance to left context start */
242 DELTA right; /* distance to right context end */
243 int reference; /* reference descriptor */
247 /* The various OCCURS tables are indexed by the language. But the time
248 being, there is no such multiple language support. */
250 static OCCURS *occurs_table[1]; /* all words retained from the read text */
251 static size_t occurs_alloc[1]; /* allocated size of occurs_table */
252 static size_t number_of_occurs[1]; /* number of used slots in occurs_table */
255 /* Communication among output routines. */
257 /* Indicate if special output processing is requested for each character. */
258 static char edited_flag[CHAR_SET_SIZE];
260 static int half_line_width; /* half of line width, reference excluded */
261 static int before_max_width; /* maximum width of before field */
262 static int keyafter_max_width; /* maximum width of keyword-and-after field */
263 static int truncation_string_length;/* length of string used to flag truncation */
265 /* When context is limited by lines, wraparound may happen on final output:
266 the `head' pointer gives access to some supplementary left context which
267 will be seen at the end of the output line, the `tail' pointer gives
268 access to some supplementary right context which will be seen at the
269 beginning of the output line. */
271 static BLOCK tail; /* tail field */
272 static int tail_truncation; /* flag truncation after the tail field */
274 static BLOCK before; /* before field */
275 static int before_truncation; /* flag truncation before the before field */
277 static BLOCK keyafter; /* keyword-and-after field */
278 static int keyafter_truncation; /* flag truncation after the keyafter field */
280 static BLOCK head; /* head field */
281 static int head_truncation; /* flag truncation before the head field */
283 static BLOCK reference; /* reference field for input reference mode */
285 /* Miscellaneous routines. */
287 /* Diagnose an error in the regular expression matcher. Then exit. */
289 static void ATTRIBUTE_NORETURN
292 error (0, errno, _("error in regular expression matcher"));
296 /*------------------------------------------------------.
297 | Duplicate string STRING, while evaluating \-escapes. |
298 `------------------------------------------------------*/
300 /* Loosely adapted from GNU sh-utils printf.c code. */
303 copy_unescaped_string (const char *string)
305 char *result; /* allocated result */
306 char *cursor; /* cursor in result */
307 int value; /* value of \nnn escape */
308 int length; /* length of \nnn escape */
310 result = xmalloc (strlen (string) + 1);
319 case 'x': /* \xhhh escape, 3 chars maximum */
321 for (length = 0, string++;
322 length < 3 && ISXDIGIT (*string);
324 value = value * 16 + HEXTOBIN (*string);
334 case '0': /* \0ooo escape, 3 chars maximum */
336 for (length = 0, string++;
337 length < 3 && ISODIGIT (*string);
339 value = value * 8 + OCTTOBIN (*string);
343 case 'a': /* alert */
352 case 'b': /* backspace */
357 case 'c': /* cancel the rest of the output */
362 case 'f': /* form feed */
367 case 'n': /* new line */
372 case 'r': /* carriage return */
377 case 't': /* horizontal tab */
382 case 'v': /* vertical tab */
393 *cursor++ = *string++;
398 *cursor++ = *string++;
404 /*--------------------------------------------------------------------------.
405 | Compile the regex represented by REGEX, diagnose and abort if any error. |
406 `--------------------------------------------------------------------------*/
409 compile_regex (struct regex_data *regex)
411 struct re_pattern_buffer *pattern = ®ex->pattern;
412 char const *string = regex->string;
415 pattern->buffer = NULL;
416 pattern->allocated = 0;
417 pattern->fastmap = regex->fastmap;
418 pattern->translate = ignore_case ? folded_chars : NULL;
420 message = re_compile_pattern (string, strlen (string), pattern);
422 error (EXIT_FAILURE, 0, _("%s (for regexp %s)"), message, quote (string));
424 /* The fastmap should be compiled before `re_match'. The following
425 call is not mandatory, because `re_search' is always called sooner,
426 and it compiles the fastmap if this has not been done yet. */
428 re_compile_fastmap (pattern);
431 /*------------------------------------------------------------------------.
432 | This will initialize various tables for pattern match and compiles some |
434 `------------------------------------------------------------------------*/
437 initialize_regex (void)
439 int character; /* character value */
441 /* Initialize the case folding table. */
444 for (character = 0; character < CHAR_SET_SIZE; character++)
445 folded_chars[character] = TOUPPER (character);
447 /* Unless the user already provided a description of the end of line or
448 end of sentence sequence, select an end of line sequence to compile.
449 If the user provided an empty definition, thus disabling end of line
450 or sentence feature, make it NULL to speed up tests. If GNU
451 extensions are enabled, use end of sentence like in GNU emacs. If
452 disabled, use end of lines. */
454 if (context_regex.string)
456 if (!*context_regex.string)
457 context_regex.string = NULL;
459 else if (gnu_extensions & !input_reference)
460 context_regex.string = "[.?!][]\"')}]*\\($\\|\t\\| \\)[ \t\n]*";
462 context_regex.string = "\n";
464 if (context_regex.string)
465 compile_regex (&context_regex);
467 /* If the user has already provided a non-empty regexp to describe
468 words, compile it. Else, unless this has already been done through
469 a user provided Break character file, construct a fastmap of
470 characters that may appear in a word. If GNU extensions enabled,
471 include only letters of the underlying character set. If disabled,
472 include almost everything, even punctuations; stop only on white
475 if (word_regex.string)
476 compile_regex (&word_regex);
477 else if (!break_file)
484 for (character = 0; character < CHAR_SET_SIZE; character++)
485 word_fastmap[character] = ISALPHA (character) ? 1 : 0;
490 /* Simulate [^ \t\n]+. */
492 memset (word_fastmap, 1, CHAR_SET_SIZE);
493 word_fastmap[' '] = 0;
494 word_fastmap['\t'] = 0;
495 word_fastmap['\n'] = 0;
500 /*------------------------------------------------------------------------.
501 | This routine will attempt to swallow a whole file name FILE_NAME into a |
502 | contiguous region of memory and return a description of it into BLOCK. |
503 | Standard input is assumed whenever FILE_NAME is NULL, empty or "-". |
505 | Previously, in some cases, white space compression was attempted while |
506 | inputting text. This was defeating some regexps like default end of |
507 | sentence, which checks for two consecutive spaces. If white space |
508 | compression is ever reinstated, it should be in output routines. |
509 `------------------------------------------------------------------------*/
512 swallow_file_in_memory (const char *file_name, BLOCK *block)
514 int file_handle; /* file descriptor number */
515 struct stat stat_block; /* stat block for file */
516 size_t allocated_length; /* allocated length of memory buffer */
517 size_t used_length; /* used length in memory buffer */
518 int read_length; /* number of character gotten on last read */
520 /* As special cases, a file name which is NULL or "-" indicates standard
521 input, which is already opened. In all other cases, open the file from
523 bool using_stdin = !file_name || !*file_name || STREQ (file_name, "-");
525 file_handle = STDIN_FILENO;
527 if ((file_handle = open (file_name, O_RDONLY)) < 0)
528 error (EXIT_FAILURE, errno, "%s", file_name);
530 /* If the file is a plain, regular file, allocate the memory buffer all at
531 once and swallow the file in one blow. In other cases, read the file
532 repeatedly in smaller chunks until we have it all, reallocating memory
533 once in a while, as we go. */
535 if (fstat (file_handle, &stat_block) < 0)
536 error (EXIT_FAILURE, errno, "%s", file_name);
538 if (S_ISREG (stat_block.st_mode))
540 size_t in_memory_size;
542 block->start = xmalloc ((size_t) stat_block.st_size);
544 if ((in_memory_size = read (file_handle,
545 block->start, (size_t) stat_block.st_size))
546 != stat_block.st_size)
549 /* On MSDOS, in memory size may be smaller than the file
550 size, because of end of line conversions. But it can
551 never be smaller than half the file size, because the
552 minimum is when all lines are empty and terminated by
554 if (in_memory_size != (size_t)-1
555 && in_memory_size >= stat_block.st_size / 2)
556 block->start = xrealloc (block->start, in_memory_size);
558 #endif /* not MSDOS */
560 error (EXIT_FAILURE, errno, "%s", file_name);
562 block->end = block->start + in_memory_size;
566 block->start = xmalloc ((size_t) 1 << SWALLOW_REALLOC_LOG);
568 allocated_length = (1 << SWALLOW_REALLOC_LOG);
570 while (read_length = read (file_handle,
571 block->start + used_length,
572 allocated_length - used_length),
575 used_length += read_length;
576 if (used_length == allocated_length)
578 allocated_length += (1 << SWALLOW_REALLOC_LOG);
580 = xrealloc (block->start, allocated_length);
585 error (EXIT_FAILURE, errno, "%s", file_name);
587 block->end = block->start + used_length;
590 /* Close the file, but only if it was not the standard input. */
592 if (! using_stdin && close (file_handle) != 0)
593 error (EXIT_FAILURE, errno, "%s", file_name);
596 /* Sort and search routines. */
598 /*--------------------------------------------------------------------------.
599 | Compare two words, FIRST and SECOND, and return 0 if they are identical. |
600 | Return less than 0 if the first word goes before the second; return |
601 | greater than 0 if the first word goes after the second. |
603 | If a word is indeed a prefix of the other, the shorter should go first. |
604 `--------------------------------------------------------------------------*/
607 compare_words (const void *void_first, const void *void_second)
609 #define first ((const WORD *) void_first)
610 #define second ((const WORD *) void_second)
611 int length; /* minimum of two lengths */
612 int counter; /* cursor in words */
613 int value; /* value of comparison */
615 length = first->size < second->size ? first->size : second->size;
619 for (counter = 0; counter < length; counter++)
621 value = (folded_chars [to_uchar (first->start[counter])]
622 - folded_chars [to_uchar (second->start[counter])]);
629 for (counter = 0; counter < length; counter++)
631 value = (to_uchar (first->start[counter])
632 - to_uchar (second->start[counter]));
638 return first->size - second->size;
643 /*-----------------------------------------------------------------------.
644 | Decides which of two OCCURS, FIRST or SECOND, should lexicographically |
645 | go first. In case of a tie, preserve the original order through a |
646 | pointer comparison. |
647 `-----------------------------------------------------------------------*/
650 compare_occurs (const void *void_first, const void *void_second)
652 #define first ((const OCCURS *) void_first)
653 #define second ((const OCCURS *) void_second)
656 value = compare_words (&first->key, &second->key);
657 return value == 0 ? first->key.start - second->key.start : value;
662 /*------------------------------------------------------------.
663 | Return !0 if WORD appears in TABLE. Uses a binary search. |
664 `------------------------------------------------------------*/
667 search_table (WORD *word, WORD_TABLE *table)
669 int lowest; /* current lowest possible index */
670 int highest; /* current highest possible index */
671 int middle; /* current middle index */
672 int value; /* value from last comparison */
675 highest = table->length - 1;
676 while (lowest <= highest)
678 middle = (lowest + highest) / 2;
679 value = compare_words (word, table->start + middle);
681 highest = middle - 1;
690 /*---------------------------------------------------------------------.
691 | Sort the whole occurs table in memory. Presumably, `qsort' does not |
692 | take intermediate copies or table elements, so the sort will be |
693 | stabilized throughout the comparison routine. |
694 `---------------------------------------------------------------------*/
697 sort_found_occurs (void)
700 /* Only one language for the time being. */
702 qsort (occurs_table[0], number_of_occurs[0], sizeof **occurs_table,
706 /* Parameter files reading routines. */
708 /*----------------------------------------------------------------------.
709 | Read a file named FILE_NAME, containing a set of break characters. |
710 | Build a content to the array word_fastmap in which all characters are |
711 | allowed except those found in the file. Characters may be repeated. |
712 `----------------------------------------------------------------------*/
715 digest_break_file (const char *file_name)
717 BLOCK file_contents; /* to receive a copy of the file */
718 char *cursor; /* cursor in file copy */
720 swallow_file_in_memory (file_name, &file_contents);
722 /* Make the fastmap and record the file contents in it. */
724 memset (word_fastmap, 1, CHAR_SET_SIZE);
725 for (cursor = file_contents.start; cursor < file_contents.end; cursor++)
726 word_fastmap[to_uchar (*cursor)] = 0;
731 /* If GNU extensions are enabled, the only way to avoid newline as
732 a break character is to write all the break characters in the
733 file with no newline at all, not even at the end of the file.
734 If disabled, spaces, tabs and newlines are always considered as
735 break characters even if not included in the break file. */
737 word_fastmap[' '] = 0;
738 word_fastmap['\t'] = 0;
739 word_fastmap['\n'] = 0;
742 /* Return the space of the file, which is no more required. */
744 free (file_contents.start);
747 /*-----------------------------------------------------------------------.
748 | Read a file named FILE_NAME, containing one word per line, then |
749 | construct in TABLE a table of WORD descriptors for them. The routine |
750 | swallows the whole file in memory; this is at the expense of space |
751 | needed for newlines, which are useless; however, the reading is fast. |
752 `-----------------------------------------------------------------------*/
755 digest_word_file (const char *file_name, WORD_TABLE *table)
757 BLOCK file_contents; /* to receive a copy of the file */
758 char *cursor; /* cursor in file copy */
759 char *word_start; /* start of the current word */
761 swallow_file_in_memory (file_name, &file_contents);
767 /* Read the whole file. */
769 cursor = file_contents.start;
770 while (cursor < file_contents.end)
773 /* Read one line, and save the word in contains. */
776 while (cursor < file_contents.end && *cursor != '\n')
779 /* Record the word in table if it is not empty. */
781 if (cursor > word_start)
783 if (table->length == table->alloc)
785 if ((SIZE_MAX / sizeof *table->start - 1) / 2 < table->alloc)
787 table->alloc = table->alloc * 2 + 1;
788 table->start = xrealloc (table->start,
789 table->alloc * sizeof *table->start);
792 table->start[table->length].start = word_start;
793 table->start[table->length].size = cursor - word_start;
797 /* This test allows for an incomplete line at end of file. */
799 if (cursor < file_contents.end)
803 /* Finally, sort all the words read. */
805 qsort (table->start, table->length, sizeof table->start[0], compare_words);
808 /* Keyword recognition and selection. */
810 /*----------------------------------------------------------------------.
811 | For each keyword in the source text, constructs an OCCURS structure. |
812 `----------------------------------------------------------------------*/
815 find_occurs_in_text (void)
817 char *cursor; /* for scanning the source text */
818 char *scan; /* for scanning the source text also */
819 char *line_start; /* start of the current input line */
820 char *line_scan; /* newlines scanned until this point */
821 int reference_length; /* length of reference in input mode */
822 WORD possible_key; /* possible key, to ease searches */
823 OCCURS *occurs_cursor; /* current OCCURS under construction */
825 char *context_start; /* start of left context */
826 char *context_end; /* end of right context */
827 char *word_start; /* start of word */
828 char *word_end; /* end of word */
829 char *next_context_start; /* next start of left context */
831 /* reference_length is always used within `if (input_reference)'.
832 However, GNU C diagnoses that it may be used uninitialized. The
833 following assignment is merely to shut it up. */
835 reference_length = 0;
837 /* Tracking where lines start is helpful for reference processing. In
838 auto reference mode, this allows counting lines. In input reference
839 mode, this permits finding the beginning of the references.
841 The first line begins with the file, skip immediately this very first
842 reference in input reference mode, to help further rejection any word
843 found inside it. Also, unconditionally assigning these variable has
844 the happy effect of shutting up lint. */
846 line_start = text_buffer.start;
847 line_scan = line_start;
850 SKIP_NON_WHITE (line_scan, text_buffer.end);
851 reference_length = line_scan - line_start;
852 SKIP_WHITE (line_scan, text_buffer.end);
855 /* Process the whole buffer, one line or one sentence at a time. */
857 for (cursor = text_buffer.start;
858 cursor < text_buffer.end;
859 cursor = next_context_start)
862 /* `context_start' gets initialized before the processing of each
863 line, or once for the whole buffer if no end of line or sentence
864 sequence separator. */
866 context_start = cursor;
868 /* If a end of line or end of sentence sequence is defined and
869 non-empty, `next_context_start' will be recomputed to be the end of
870 each line or sentence, before each one is processed. If no such
871 sequence, then `next_context_start' is set at the end of the whole
872 buffer, which is then considered to be a single line or sentence.
873 This test also accounts for the case of an incomplete line or
874 sentence at the end of the buffer. */
876 next_context_start = text_buffer.end;
877 if (context_regex.string)
878 switch (re_search (&context_regex.pattern, cursor,
879 text_buffer.end - cursor,
880 0, text_buffer.end - cursor, &context_regs))
889 next_context_start = cursor + context_regs.end[0];
893 /* Include the separator into the right context, but not any suffix
894 white space in this separator; this insures it will be seen in
895 output and will not take more space than necessary. */
897 context_end = next_context_start;
898 SKIP_WHITE_BACKWARDS (context_end, context_start);
900 /* Read and process a single input line or sentence, one word at a
905 if (word_regex.string)
907 /* If a word regexp has been compiled, use it to skip at the
908 beginning of the next word. If there is no such word, exit
912 regoff_t r = re_search (&word_regex.pattern, cursor,
913 context_end - cursor,
914 0, context_end - cursor, &word_regs);
919 word_start = cursor + word_regs.start[0];
920 word_end = cursor + word_regs.end[0];
924 /* Avoid re_search and use the fastmap to skip to the
925 beginning of the next word. If there is no more word in
926 the buffer, exit the loop. */
930 while (scan < context_end
931 && !word_fastmap[to_uchar (*scan)])
934 if (scan == context_end)
939 while (scan < context_end
940 && word_fastmap[to_uchar (*scan)])
946 /* Skip right to the beginning of the found word. */
950 /* Skip any zero length word. Just advance a single position,
951 then go fetch the next word. */
953 if (word_end == word_start)
959 /* This is a genuine, non empty word, so save it as a possible
960 key. Then skip over it. Also, maintain the maximum length of
961 all words read so far. It is mandatory to take the maximum
962 length of all words in the file, without considering if they
963 are actually kept or rejected, because backward jumps at output
964 generation time may fall in *any* word. */
966 possible_key.start = cursor;
967 possible_key.size = word_end - word_start;
968 cursor += possible_key.size;
970 if (possible_key.size > maximum_word_length)
971 maximum_word_length = possible_key.size;
973 /* In input reference mode, update `line_start' from its previous
974 value. Count the lines just in case auto reference mode is
975 also selected. If it happens that the word just matched is
976 indeed part of a reference; just ignore it. */
980 while (line_scan < possible_key.start)
981 if (*line_scan == '\n')
985 line_start = line_scan;
986 SKIP_NON_WHITE (line_scan, text_buffer.end);
987 reference_length = line_scan - line_start;
991 if (line_scan > possible_key.start)
995 /* Ignore the word if an `Ignore words' table exists and if it is
996 part of it. Also ignore the word if an `Only words' table and
997 if it is *not* part of it.
999 It is allowed that both tables be used at once, even if this
1000 may look strange for now. Just ignore a word that would appear
1001 in both. If regexps are eventually implemented for these
1002 tables, the Ignore table could then reject words that would
1003 have been previously accepted by the Only table. */
1005 if (ignore_file && search_table (&possible_key, &ignore_table))
1007 if (only_file && !search_table (&possible_key, &only_table))
1010 /* A non-empty word has been found. First of all, insure
1011 proper allocation of the next OCCURS, and make a pointer to
1012 where it will be constructed. */
1014 if (number_of_occurs[0] == occurs_alloc[0])
1016 if ((SIZE_MAX / sizeof *occurs_table[0] - 1) / 2
1019 occurs_alloc[0] = occurs_alloc[0] * 2 + 1;
1020 occurs_table[0] = xrealloc (occurs_table[0],
1021 occurs_alloc[0] * sizeof *occurs_table[0]);
1024 occurs_cursor = occurs_table[0] + number_of_occurs[0];
1026 /* Define the refence field, if any. */
1031 /* While auto referencing, update `line_start' from its
1032 previous value, counting lines as we go. If input
1033 referencing at the same time, `line_start' has been
1034 advanced earlier, and the following loop is never really
1037 while (line_scan < possible_key.start)
1038 if (*line_scan == '\n')
1042 line_start = line_scan;
1043 SKIP_NON_WHITE (line_scan, text_buffer.end);
1048 occurs_cursor->reference = total_line_count;
1050 else if (input_reference)
1053 /* If only input referencing, `line_start' has been computed
1054 earlier to detect the case the word matched would be part
1055 of the reference. The reference position is simply the
1056 value of `line_start'. */
1058 occurs_cursor->reference
1059 = (DELTA) (line_start - possible_key.start);
1060 if (reference_length > reference_max_width)
1061 reference_max_width = reference_length;
1064 /* Exclude the reference from the context in simple cases. */
1066 if (input_reference && line_start == context_start)
1068 SKIP_NON_WHITE (context_start, context_end);
1069 SKIP_WHITE (context_start, context_end);
1072 /* Completes the OCCURS structure. */
1074 occurs_cursor->key = possible_key;
1075 occurs_cursor->left = context_start - possible_key.start;
1076 occurs_cursor->right = context_end - possible_key.start;
1078 number_of_occurs[0]++;
1083 /* Formatting and actual output - service routines. */
1085 /*-----------------------------------------.
1086 | Prints some NUMBER of spaces on stdout. |
1087 `-----------------------------------------*/
1090 print_spaces (int number)
1094 for (counter = number; counter > 0; counter--)
1098 /*-------------------------------------.
1099 | Prints the field provided by FIELD. |
1100 `-------------------------------------*/
1103 print_field (BLOCK field)
1105 char *cursor; /* Cursor in field to print */
1106 int base; /* Base character, without diacritic */
1107 int diacritic; /* Diacritic code for the character */
1109 /* Whitespace is not really compressed. Instead, each white space
1110 character (tab, vt, ht etc.) is printed as one single space. */
1112 for (cursor = field.start; cursor < field.end; cursor++)
1114 unsigned char character = *cursor;
1115 if (edited_flag[character])
1118 /* First check if this is a diacriticized character.
1120 This works only for TeX. I do not know how diacriticized
1121 letters work with `roff'. Please someone explain it to me! */
1123 diacritic = todiac (character);
1124 if (diacritic != 0 && output_format == TEX_FORMAT)
1126 base = tobase (character);
1130 case 1: /* Latin diphthongs */
1134 fputs ("\\oe{}", stdout);
1138 fputs ("\\OE{}", stdout);
1142 fputs ("\\ae{}", stdout);
1146 fputs ("\\AE{}", stdout);
1154 case 2: /* Acute accent */
1155 printf ("\\'%s%c", (base == 'i' ? "\\" : ""), base);
1158 case 3: /* Grave accent */
1159 printf ("\\`%s%c", (base == 'i' ? "\\" : ""), base);
1162 case 4: /* Circumflex accent */
1163 printf ("\\^%s%c", (base == 'i' ? "\\" : ""), base);
1166 case 5: /* Diaeresis */
1167 printf ("\\\"%s%c", (base == 'i' ? "\\" : ""), base);
1170 case 6: /* Tilde accent */
1171 printf ("\\~%s%c", (base == 'i' ? "\\" : ""), base);
1174 case 7: /* Cedilla */
1175 printf ("\\c{%c}", base);
1178 case 8: /* Small circle beneath */
1182 fputs ("\\aa{}", stdout);
1186 fputs ("\\AA{}", stdout);
1194 case 9: /* Strike through */
1198 fputs ("\\o{}", stdout);
1202 fputs ("\\O{}", stdout);
1213 /* This is not a diacritic character, so handle cases which are
1214 really specific to `roff' or TeX. All white space processing
1215 is done as the default case of this switch. */
1220 /* In roff output format, double any quote. */
1230 /* In TeX output format, precede these with a backslash. */
1232 putchar (character);
1237 /* In TeX output format, precede these with a backslash and
1238 force mathematical mode. */
1239 printf ("$\\%c$", character);
1243 /* In TeX output mode, request production of a backslash. */
1244 fputs ("\\backslash{}", stdout);
1248 /* Any other flagged character produces a single space. */
1257 /* Formatting and actual output - planning routines. */
1259 /*--------------------------------------------------------------------.
1260 | From information collected from command line options and input file |
1261 | readings, compute and fix some output parameter values. |
1262 `--------------------------------------------------------------------*/
1265 fix_output_parameters (void)
1267 int file_index; /* index in text input file arrays */
1268 int line_ordinal; /* line ordinal value for reference */
1269 char ordinal_string[12]; /* edited line ordinal for reference */
1270 int reference_width; /* width for the whole reference */
1271 int character; /* character ordinal */
1272 const char *cursor; /* cursor in some constant strings */
1274 /* In auto reference mode, the maximum width of this field is
1275 precomputed and subtracted from the overall line width. Add one for
1276 the column which separate the file name from the line number. */
1280 reference_max_width = 0;
1281 for (file_index = 0; file_index < number_input_files; file_index++)
1283 line_ordinal = file_line_count[file_index] + 1;
1285 line_ordinal -= file_line_count[file_index - 1];
1286 sprintf (ordinal_string, "%d", line_ordinal);
1287 reference_width = strlen (ordinal_string);
1288 if (input_file_name[file_index])
1289 reference_width += strlen (input_file_name[file_index]);
1290 if (reference_width > reference_max_width)
1291 reference_max_width = reference_width;
1293 reference_max_width++;
1294 reference.start = xmalloc ((size_t) reference_max_width + 1);
1297 /* If the reference appears to the left of the output line, reserve some
1298 space for it right away, including one gap size. */
1300 if ((auto_reference | input_reference) & !right_reference)
1301 line_width -= reference_max_width + gap_size;
1303 /* The output lines, minimally, will contain from left to right a left
1304 context, a gap, and a keyword followed by the right context with no
1305 special intervening gap. Half of the line width is dedicated to the
1306 left context and the gap, the other half is dedicated to the keyword
1307 and the right context; these values are computed once and for all here.
1308 There also are tail and head wrap around fields, used when the keyword
1309 is near the beginning or the end of the line, or when some long word
1310 cannot fit in, but leave place from wrapped around shorter words. The
1311 maximum width of these fields are recomputed separately for each line,
1312 on a case by case basis. It is worth noting that it cannot happen that
1313 both the tail and head fields are used at once. */
1315 half_line_width = line_width / 2;
1316 before_max_width = half_line_width - gap_size;
1317 keyafter_max_width = half_line_width;
1319 /* If truncation_string is the empty string, make it NULL to speed up
1320 tests. In this case, truncation_string_length will never get used, so
1321 there is no need to set it. */
1323 if (truncation_string && *truncation_string)
1324 truncation_string_length = strlen (truncation_string);
1326 truncation_string = NULL;
1331 /* When flagging truncation at the left of the keyword, the
1332 truncation mark goes at the beginning of the before field,
1333 unless there is a head field, in which case the mark goes at the
1334 left of the head field. When flagging truncation at the right
1335 of the keyword, the mark goes at the end of the keyafter field,
1336 unless there is a tail field, in which case the mark goes at the
1337 end of the tail field. Only eight combination cases could arise
1338 for truncation marks:
1341 . One beginning the before field.
1342 . One beginning the head field.
1343 . One ending the keyafter field.
1344 . One ending the tail field.
1345 . One beginning the before field, another ending the keyafter field.
1346 . One ending the tail field, another beginning the before field.
1347 . One ending the keyafter field, another beginning the head field.
1349 So, there is at most two truncation marks, which could appear both
1350 on the left side of the center of the output line, both on the
1351 right side, or one on either side. */
1353 before_max_width -= 2 * truncation_string_length;
1354 keyafter_max_width -= 2 * truncation_string_length;
1359 /* I never figured out exactly how UNIX' ptx plans the output width
1360 of its various fields. If GNU extensions are disabled, do not
1361 try computing the field widths correctly; instead, use the
1362 following formula, which does not completely imitate UNIX' ptx,
1365 keyafter_max_width -= 2 * truncation_string_length + 1;
1368 /* Compute which characters need special output processing. Initialize
1369 by flagging any white space character. Some systems do not consider
1370 form feed as a space character, but we do. */
1372 for (character = 0; character < CHAR_SET_SIZE; character++)
1373 edited_flag[character] = ISSPACE (character) != 0;
1374 edited_flag['\f'] = 1;
1376 /* Complete the special character flagging according to selected output
1379 switch (output_format)
1381 case UNKNOWN_FORMAT:
1382 /* Should never happen. */
1389 /* `Quote' characters should be doubled. */
1391 edited_flag['"'] = 1;
1396 /* Various characters need special processing. */
1398 for (cursor = "$%&#_{}\\"; *cursor; cursor++)
1399 edited_flag[to_uchar (*cursor)] = 1;
1401 /* Any character with 8th bit set will print to a single space, unless
1402 it is diacriticized. */
1404 for (character = 0200; character < CHAR_SET_SIZE; character++)
1405 edited_flag[character] = todiac (character) != 0;
1410 /*------------------------------------------------------------------.
1411 | Compute the position and length of all the output fields, given a |
1412 | pointer to some OCCURS. |
1413 `------------------------------------------------------------------*/
1416 define_all_fields (OCCURS *occurs)
1418 int tail_max_width; /* allowable width of tail field */
1419 int head_max_width; /* allowable width of head field */
1420 char *cursor; /* running cursor in source text */
1421 char *left_context_start; /* start of left context */
1422 char *right_context_end; /* end of right context */
1423 char *left_field_start; /* conservative start for `head'/`before' */
1424 int file_index; /* index in text input file arrays */
1425 const char *file_name; /* file name for reference */
1426 int line_ordinal; /* line ordinal for reference */
1428 /* Define `keyafter', start of left context and end of right context.
1429 `keyafter' starts at the saved position for keyword and extend to the
1430 right from the end of the keyword, eating separators or full words, but
1431 not beyond maximum allowed width for `keyafter' field or limit for the
1432 right context. Suffix spaces will be removed afterwards. */
1434 keyafter.start = occurs->key.start;
1435 keyafter.end = keyafter.start + occurs->key.size;
1436 left_context_start = keyafter.start + occurs->left;
1437 right_context_end = keyafter.start + occurs->right;
1439 cursor = keyafter.end;
1440 while (cursor < right_context_end
1441 && cursor <= keyafter.start + keyafter_max_width)
1443 keyafter.end = cursor;
1444 SKIP_SOMETHING (cursor, right_context_end);
1446 if (cursor <= keyafter.start + keyafter_max_width)
1447 keyafter.end = cursor;
1449 keyafter_truncation = truncation_string && keyafter.end < right_context_end;
1451 SKIP_WHITE_BACKWARDS (keyafter.end, keyafter.start);
1453 /* When the left context is wide, it might take some time to catch up from
1454 the left context boundary to the beginning of the `head' or `before'
1455 fields. So, in this case, to speed the catchup, we jump back from the
1456 keyword, using some secure distance, possibly falling in the middle of
1457 a word. A secure backward jump would be at least half the maximum
1458 width of a line, plus the size of the longest word met in the whole
1459 input. We conclude this backward jump by a skip forward of at least
1460 one word. In this manner, we should not inadvertently accept only part
1461 of a word. From the reached point, when it will be time to fix the
1462 beginning of `head' or `before' fields, we will skip forward words or
1463 delimiters until we get sufficiently near. */
1465 if (-occurs->left > half_line_width + maximum_word_length)
1468 = keyafter.start - (half_line_width + maximum_word_length);
1469 SKIP_SOMETHING (left_field_start, keyafter.start);
1472 left_field_start = keyafter.start + occurs->left;
1474 /* `before' certainly ends at the keyword, but not including separating
1475 spaces. It starts after than the saved value for the left context, by
1476 advancing it until it falls inside the maximum allowed width for the
1477 before field. There will be no prefix spaces either. `before' only
1478 advances by skipping single separators or whole words. */
1480 before.start = left_field_start;
1481 before.end = keyafter.start;
1482 SKIP_WHITE_BACKWARDS (before.end, before.start);
1484 while (before.start + before_max_width < before.end)
1485 SKIP_SOMETHING (before.start, before.end);
1487 if (truncation_string)
1489 cursor = before.start;
1490 SKIP_WHITE_BACKWARDS (cursor, text_buffer.start);
1491 before_truncation = cursor > left_context_start;
1494 before_truncation = 0;
1496 SKIP_WHITE (before.start, text_buffer.end);
1498 /* The tail could not take more columns than what has been left in the
1499 left context field, and a gap is mandatory. It starts after the
1500 right context, and does not contain prefixed spaces. It ends at
1501 the end of line, the end of buffer or when the tail field is full,
1502 whichever comes first. It cannot contain only part of a word, and
1503 has no suffixed spaces. */
1506 = before_max_width - (before.end - before.start) - gap_size;
1508 if (tail_max_width > 0)
1510 tail.start = keyafter.end;
1511 SKIP_WHITE (tail.start, text_buffer.end);
1513 tail.end = tail.start;
1515 while (cursor < right_context_end
1516 && cursor < tail.start + tail_max_width)
1519 SKIP_SOMETHING (cursor, right_context_end);
1522 if (cursor < tail.start + tail_max_width)
1525 if (tail.end > tail.start)
1527 keyafter_truncation = 0;
1528 tail_truncation = truncation_string && tail.end < right_context_end;
1531 tail_truncation = 0;
1533 SKIP_WHITE_BACKWARDS (tail.end, tail.start);
1538 /* No place left for a tail field. */
1542 tail_truncation = 0;
1545 /* `head' could not take more columns than what has been left in the right
1546 context field, and a gap is mandatory. It ends before the left
1547 context, and does not contain suffixed spaces. Its pointer is advanced
1548 until the head field has shrunk to its allowed width. It cannot
1549 contain only part of a word, and has no suffixed spaces. */
1552 = keyafter_max_width - (keyafter.end - keyafter.start) - gap_size;
1554 if (head_max_width > 0)
1556 head.end = before.start;
1557 SKIP_WHITE_BACKWARDS (head.end, text_buffer.start);
1559 head.start = left_field_start;
1560 while (head.start + head_max_width < head.end)
1561 SKIP_SOMETHING (head.start, head.end);
1563 if (head.end > head.start)
1565 before_truncation = 0;
1566 head_truncation = (truncation_string
1567 && head.start > left_context_start);
1570 head_truncation = 0;
1572 SKIP_WHITE (head.start, head.end);
1577 /* No place left for a head field. */
1581 head_truncation = 0;
1587 /* Construct the reference text in preallocated space from the file
1588 name and the line number. Find out in which file the reference
1589 occurred. Standard input yields an empty file name. Insure line
1590 numbers are one based, even if they are computed zero based. */
1593 while (file_line_count[file_index] < occurs->reference)
1596 file_name = input_file_name[file_index];
1600 line_ordinal = occurs->reference + 1;
1602 line_ordinal -= file_line_count[file_index - 1];
1604 sprintf (reference.start, "%s:%d", file_name, line_ordinal);
1605 reference.end = reference.start + strlen (reference.start);
1607 else if (input_reference)
1610 /* Reference starts at saved position for reference and extends right
1611 until some white space is met. */
1613 reference.start = keyafter.start + (DELTA) occurs->reference;
1614 reference.end = reference.start;
1615 SKIP_NON_WHITE (reference.end, right_context_end);
1619 /* Formatting and actual output - control routines. */
1621 /*----------------------------------------------------------------------.
1622 | Output the current output fields as one line for `troff' or `nroff'. |
1623 `----------------------------------------------------------------------*/
1626 output_one_roff_line (void)
1628 /* Output the `tail' field. */
1630 printf (".%s \"", macro_name);
1632 if (tail_truncation)
1633 fputs (truncation_string, stdout);
1636 /* Output the `before' field. */
1638 fputs (" \"", stdout);
1639 if (before_truncation)
1640 fputs (truncation_string, stdout);
1641 print_field (before);
1644 /* Output the `keyafter' field. */
1646 fputs (" \"", stdout);
1647 print_field (keyafter);
1648 if (keyafter_truncation)
1649 fputs (truncation_string, stdout);
1652 /* Output the `head' field. */
1654 fputs (" \"", stdout);
1655 if (head_truncation)
1656 fputs (truncation_string, stdout);
1660 /* Conditionally output the `reference' field. */
1662 if (auto_reference | input_reference)
1664 fputs (" \"", stdout);
1665 print_field (reference);
1672 /*---------------------------------------------------------.
1673 | Output the current output fields as one line for `TeX'. |
1674 `---------------------------------------------------------*/
1677 output_one_tex_line (void)
1679 BLOCK key; /* key field, isolated */
1680 BLOCK after; /* after field, isolated */
1681 char *cursor; /* running cursor in source text */
1683 printf ("\\%s ", macro_name);
1686 fputs ("}{", stdout);
1687 print_field (before);
1688 fputs ("}{", stdout);
1689 key.start = keyafter.start;
1690 after.end = keyafter.end;
1691 cursor = keyafter.start;
1692 SKIP_SOMETHING (cursor, keyafter.end);
1694 after.start = cursor;
1696 fputs ("}{", stdout);
1697 print_field (after);
1698 fputs ("}{", stdout);
1701 if (auto_reference | input_reference)
1704 print_field (reference);
1710 /*-------------------------------------------------------------------.
1711 | Output the current output fields as one line for a dumb terminal. |
1712 `-------------------------------------------------------------------*/
1715 output_one_dumb_line (void)
1717 if (!right_reference)
1722 /* Output the `reference' field, in such a way that GNU emacs
1723 next-error will handle it. The ending colon is taken from the
1724 gap which follows. */
1726 print_field (reference);
1728 print_spaces (reference_max_width
1730 - (reference.end - reference.start)
1736 /* Output the `reference' field and its following gap. */
1738 print_field (reference);
1739 print_spaces (reference_max_width
1741 - (reference.end - reference.start));
1745 if (tail.start < tail.end)
1747 /* Output the `tail' field. */
1750 if (tail_truncation)
1751 fputs (truncation_string, stdout);
1753 print_spaces (half_line_width - gap_size
1754 - (before.end - before.start)
1755 - (before_truncation ? truncation_string_length : 0)
1756 - (tail.end - tail.start)
1757 - (tail_truncation ? truncation_string_length : 0));
1760 print_spaces (half_line_width - gap_size
1761 - (before.end - before.start)
1762 - (before_truncation ? truncation_string_length : 0));
1764 /* Output the `before' field. */
1766 if (before_truncation)
1767 fputs (truncation_string, stdout);
1768 print_field (before);
1770 print_spaces (gap_size);
1772 /* Output the `keyafter' field. */
1774 print_field (keyafter);
1775 if (keyafter_truncation)
1776 fputs (truncation_string, stdout);
1778 if (head.start < head.end)
1780 /* Output the `head' field. */
1782 print_spaces (half_line_width
1783 - (keyafter.end - keyafter.start)
1784 - (keyafter_truncation ? truncation_string_length : 0)
1785 - (head.end - head.start)
1786 - (head_truncation ? truncation_string_length : 0));
1787 if (head_truncation)
1788 fputs (truncation_string, stdout);
1793 if ((auto_reference | input_reference) & right_reference)
1794 print_spaces (half_line_width
1795 - (keyafter.end - keyafter.start)
1796 - (keyafter_truncation ? truncation_string_length : 0));
1798 if ((auto_reference | input_reference) & right_reference)
1800 /* Output the `reference' field. */
1802 print_spaces (gap_size);
1803 print_field (reference);
1809 /*------------------------------------------------------------------------.
1810 | Scan the whole occurs table and, for each entry, output one line in the |
1811 | appropriate format. |
1812 `------------------------------------------------------------------------*/
1815 generate_all_output (void)
1817 size_t occurs_index; /* index of keyword entry being processed */
1818 OCCURS *occurs_cursor; /* current keyword entry being processed */
1820 /* The following assignments are useful to provide default values in case
1821 line contexts or references are not used, in which case these variables
1822 would never be computed. */
1826 tail_truncation = 0;
1830 head_truncation = 0;
1832 /* Loop over all keyword occurrences. */
1834 occurs_cursor = occurs_table[0];
1836 for (occurs_index = 0; occurs_index < number_of_occurs[0]; occurs_index++)
1838 /* Compute the exact size of every field and whenever truncation flags
1839 are present or not. */
1841 define_all_fields (occurs_cursor);
1843 /* Produce one output line according to selected format. */
1845 switch (output_format)
1847 case UNKNOWN_FORMAT:
1848 /* Should never happen. */
1851 output_one_dumb_line ();
1855 output_one_roff_line ();
1859 output_one_tex_line ();
1863 /* Advance the cursor into the occurs table. */
1869 /* Option decoding and main program. */
1871 /*------------------------------------------------------.
1872 | Print program identification and options, then exit. |
1873 `------------------------------------------------------*/
1878 if (status != EXIT_SUCCESS)
1879 fprintf (stderr, _("Try `%s --help' for more information.\n"),
1884 Usage: %s [OPTION]... [INPUT]... (without -G)\n\
1885 or: %s -G [OPTION]... [INPUT [OUTPUT]]\n"),
1886 program_name, program_name);
1888 Output a permuted index, including context, of the words in the input files.\n\
1892 Mandatory arguments to long options are mandatory for short options too.\n\
1895 -A, --auto-reference output automatically generated references\n\
1896 -G, --traditional behave more like System V `ptx'\n\
1897 -F, --flag-truncation=STRING use STRING for flagging line truncations\n\
1900 -M, --macro-name=STRING macro name to use instead of `xx'\n\
1901 -O, --format=roff generate output as roff directives\n\
1902 -R, --right-side-refs put references at right, not counted in -w\n\
1903 -S, --sentence-regexp=REGEXP for end of lines or end of sentences\n\
1904 -T, --format=tex generate output as TeX directives\n\
1907 -W, --word-regexp=REGEXP use REGEXP to match each keyword\n\
1908 -b, --break-file=FILE word break characters in this FILE\n\
1909 -f, --ignore-case fold lower case to upper case for sorting\n\
1910 -g, --gap-size=NUMBER gap size in columns between output fields\n\
1911 -i, --ignore-file=FILE read ignore word list from FILE\n\
1912 -o, --only-file=FILE read only word list from this FILE\n\
1915 -r, --references first field of each line is a reference\n\
1916 -t, --typeset-mode - not implemented -\n\
1917 -w, --width=NUMBER output width in columns, reference excluded\n\
1919 fputs (HELP_OPTION_DESCRIPTION, stdout);
1920 fputs (VERSION_OPTION_DESCRIPTION, stdout);
1923 With no FILE or if FILE is -, read Standard Input. `-F /' by default.\n\
1925 printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
1930 /*----------------------------------------------------------------------.
1931 | Main program. Decode ARGC arguments passed through the ARGV array of |
1932 | strings, then launch execution. |
1933 `----------------------------------------------------------------------*/
1935 /* Long options equivalences. */
1936 static const struct option long_options[] =
1938 {"auto-reference", no_argument, NULL, 'A'},
1939 {"break-file", required_argument, NULL, 'b'},
1940 {"copyright", no_argument, NULL, 'C'}, /* Deprecated, remove in 2007. */
1941 {"flag-truncation", required_argument, NULL, 'F'},
1942 {"ignore-case", no_argument, NULL, 'f'},
1943 {"gap-size", required_argument, NULL, 'g'},
1944 {"ignore-file", required_argument, NULL, 'i'},
1945 {"macro-name", required_argument, NULL, 'M'},
1946 {"only-file", required_argument, NULL, 'o'},
1947 {"references", no_argument, NULL, 'r'},
1948 {"right-side-refs", no_argument, NULL, 'R'},
1949 {"format", required_argument, NULL, 10},
1950 {"sentence-regexp", required_argument, NULL, 'S'},
1951 {"traditional", no_argument, NULL, 'G'},
1952 {"typeset-mode", no_argument, NULL, 't'},
1953 {"width", required_argument, NULL, 'w'},
1954 {"word-regexp", required_argument, NULL, 'W'},
1955 {GETOPT_HELP_OPTION_DECL},
1956 {GETOPT_VERSION_OPTION_DECL},
1960 static char const* const format_args[] =
1965 static enum Format const format_vals[] =
1967 ROFF_FORMAT, TEX_FORMAT
1971 main (int argc, char **argv)
1973 int optchar; /* argument character */
1974 int file_index; /* index in text input file arrays */
1976 /* Decode program options. */
1978 initialize_main (&argc, &argv);
1979 program_name = argv[0];
1980 setlocale (LC_ALL, "");
1981 bindtextdomain (PACKAGE, LOCALEDIR);
1982 textdomain (PACKAGE);
1984 atexit (close_stdout);
1986 #if HAVE_SETCHRCLASS
1990 while (optchar = getopt_long (argc, argv, "ACF:GM:ORS:TW:b:i:fg:o:trw:",
1991 long_options, NULL),
1997 usage (EXIT_FAILURE);
2000 gnu_extensions = false;
2004 break_file = optarg;
2013 unsigned long int tmp_ulong;
2014 if (xstrtoul (optarg, NULL, 0, &tmp_ulong, NULL) != LONGINT_OK
2015 || ! (0 < tmp_ulong && tmp_ulong <= INT_MAX))
2016 error (EXIT_FAILURE, 0, _("invalid gap width: %s"),
2018 gap_size = tmp_ulong;
2023 ignore_file = optarg;
2031 input_reference = true;
2035 /* Yet to understand... */
2040 unsigned long int tmp_ulong;
2041 if (xstrtoul (optarg, NULL, 0, &tmp_ulong, NULL) != LONGINT_OK
2042 || ! (0 < tmp_ulong && tmp_ulong <= INT_MAX))
2043 error (EXIT_FAILURE, 0, _("invalid line width: %s"),
2045 line_width = tmp_ulong;
2050 auto_reference = true;
2054 truncation_string = copy_unescaped_string (optarg);
2058 macro_name = optarg;
2062 output_format = ROFF_FORMAT;
2066 right_reference = true;
2070 context_regex.string = copy_unescaped_string (optarg);
2074 output_format = TEX_FORMAT;
2078 word_regex.string = copy_unescaped_string (optarg);
2079 if (!*word_regex.string)
2080 word_regex.string = NULL;
2084 output_format = XARGMATCH ("--format", optarg,
2085 format_args, format_vals);
2086 case_GETOPT_HELP_CHAR;
2090 the --copyright option is deprecated; use --version instead"));
2093 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2097 /* Process remaining arguments. If GNU extensions are enabled, process
2098 all arguments as input parameters. If disabled, accept at most two
2099 arguments, the second of which is an output parameter. */
2104 /* No more argument simply means: read standard input. */
2106 input_file_name = xmalloc (sizeof *input_file_name);
2107 file_line_count = xmalloc (sizeof *file_line_count);
2108 number_input_files = 1;
2109 input_file_name[0] = NULL;
2111 else if (gnu_extensions)
2113 number_input_files = argc - optind;
2114 input_file_name = xmalloc (number_input_files * sizeof *input_file_name);
2115 file_line_count = xmalloc (number_input_files * sizeof *file_line_count);
2117 for (file_index = 0; file_index < number_input_files; file_index++)
2119 input_file_name[file_index] = argv[optind];
2120 if (!*argv[optind] || STREQ (argv[optind], "-"))
2121 input_file_name[0] = NULL;
2123 input_file_name[0] = argv[optind];
2130 /* There is one necessary input file. */
2132 number_input_files = 1;
2133 input_file_name = xmalloc (sizeof *input_file_name);
2134 file_line_count = xmalloc (sizeof *file_line_count);
2135 if (!*argv[optind] || STREQ (argv[optind], "-"))
2136 input_file_name[0] = NULL;
2138 input_file_name[0] = argv[optind];
2141 /* Redirect standard output, only if requested. */
2145 if (! freopen (argv[optind], "w", stdout))
2146 error (EXIT_FAILURE, errno, "%s", argv[optind]);
2150 /* Diagnose any other argument as an error. */
2154 error (0, 0, _("extra operand %s"), quote (argv[optind]));
2155 usage (EXIT_FAILURE);
2159 /* If the output format has not been explicitly selected, choose dumb
2160 terminal format if GNU extensions are enabled, else `roff' format. */
2162 if (output_format == UNKNOWN_FORMAT)
2163 output_format = gnu_extensions ? DUMB_FORMAT : ROFF_FORMAT;
2165 /* Initialize the main tables. */
2167 initialize_regex ();
2169 /* Read `Break character' file, if any. */
2172 digest_break_file (break_file);
2174 /* Read `Ignore words' file and `Only words' files, if any. If any of
2175 these files is empty, reset the name of the file to NULL, to avoid
2176 unnecessary calls to search_table. */
2180 digest_word_file (ignore_file, &ignore_table);
2181 if (ignore_table.length == 0)
2187 digest_word_file (only_file, &only_table);
2188 if (only_table.length == 0)
2192 /* Prepare to study all the input files. */
2194 number_of_occurs[0] = 0;
2195 total_line_count = 0;
2196 maximum_word_length = 0;
2197 reference_max_width = 0;
2199 for (file_index = 0; file_index < number_input_files; file_index++)
2202 /* Read the file in core, than study it. */
2204 swallow_file_in_memory (input_file_name[file_index], &text_buffer);
2205 find_occurs_in_text ();
2207 /* Maintain for each file how many lines has been read so far when its
2208 end is reached. Incrementing the count first is a simple kludge to
2209 handle a possible incomplete line at end of file. */
2212 file_line_count[file_index] = total_line_count;
2215 /* Do the output process phase. */
2217 sort_found_occurs ();
2218 fix_output_parameters ();
2219 generate_all_output ();
2223 exit (EXIT_SUCCESS);