1 #!/usr/bin/perl -- # -*- Perl -*-
3 # $Id: collateindex.pl,v 1.10 2004/10/24 17:05:41 petere78 Exp $
7 collateindex.pl - generate DocBook index files
11 B<collateindex.pl> [B<-f>] [B<-g>] [B<-i> I<id>] [B<-I> I<scope>] [B<-N>]
12 [B<-o> F<file>] [B<-p>] [B<-P> F<file>] [B<-q>] [B<-s> I<name>]
13 [B<-S> I<scope>] [B<-t> I<name>] [B<-x>] F<file>
17 B<collateindex.pl> creates index data for DocBook XML or SGML files.
26 $usage = "Usage: $0 [options] file
27 Try \"perldoc $me\" for documentation.\n";
29 ( $version = '$Revision: 1.10 $' ) =~ s/^\$[R]evision:\s*([^ ]*)\s*\$$/$1/;
37 Force the output file to be written, even if it appears to have been
42 Group terms with IndexDiv based on the first letter of the term (or
43 its SortAs attribute). (This might not handle all language environments.)
47 The ID to use for the E<lt>indexE<gt> tag.
51 The implied scope, must be C<all>, C<local>, or C<global>. IndexTerms
52 which do not specify a scope will have the implied scope. If
53 unspecified, C<all> is assumed.
57 New index (generates an empty index file).
61 Output to F<file>. Defaults to F<stdout>.
65 Link to points in the document. The default is to link to the closest
70 Read a preamble from F<file>. The contents of F<file> will be
71 inserted before the E<lt>indexE<gt> tag.
79 Name the IndexDiv that contains symbols. The default is C<Symbols>.
80 Meaningless if B<-g> is not used.
84 Scope of the index, must be C<all>, C<local>, or C<global>. If
85 unspecified, C<all> is assumed.
97 Print version number and exit.
101 The file containing index data generated with the DocBook DSSSL
102 HTML stylesheet (usually called F<HTML.index>).
109 die $usage if ! getopts('Dfgi:NpP:s:o:S:I:t:xqV');
111 $linkpoints = $opt_p;
112 $lettergroups = $opt_g;
113 $symbolsname = $opt_s || "Symbols";
116 $outfile = $opt_o || '-';
118 $scope = uc($opt_S) || 'ALL';
119 $impliedscope = uc($opt_I) || 'ALL';
121 $forceoutput = $opt_f;
127 print "collateindex.pl $version\n";
131 $indextag = $setindex ? 'setindex' : 'index';
134 safe_open(*OUT, $outfile);
136 print OUT "<$indextag id='$indexid'>\n\n";
138 print OUT "<$indextag>\n\n";
141 print OUT "<!-- This file was produced by collateindex.pl. -->\n";
142 print OUT "<!-- Remove this comment if you edit this file by hand! -->\n";
144 print OUT "</$indextag>\n";
148 $dat = shift @ARGV || die $usage;
149 die "$me: file \"$dat\" does not exist\n" if ! -f $dat;
151 %legal_scopes = ('ALL' => 1, 'LOCAL' => 1, 'GLOBAL' => 1);
152 if ($scope && !$legal_scopes{$scope}) {
153 die "$me: invalid scope: $scope\n";
155 if ($impliedscope && !$legal_scopes{$impliedscope}) {
156 die "$me: invalid implied scope: $impliedscope\n";
164 $quiet || print STDERR "Processing $dat...\n";
166 # Read the index file, creating an array of objects. Each object
167 # represents and indexterm and has fields for the content of the
175 if (/^\/indexterm/i) {
180 if (/^indexterm (.*)$/i) {
185 $idx->{'count'} = $termcount;
186 $idx->{'scope'} = $impliedscope;
190 if (/^indexpoint (.*)$/i) {
191 $idx->{'hrefpoint'} = $1;
195 if (/^title (.*)$/i) {
196 $idx->{'title'} = $1;
200 if (/^primary[\[ ](.*)$/i) {
201 if (/^primary\[(.*?)\] (.*)$/i) {
202 $idx->{'psortas'} = &escape($1);
203 $idx->{'primary'} = &escape($2);
205 $idx->{'psortas'} = &escape($1);
206 $idx->{'primary'} = &escape($1);
211 if (/^secondary[\[ ](.*)$/i) {
212 if (/^secondary\[(.*?)\] (.*)$/i) {
213 $idx->{'ssortas'} = &escape($1);
214 $idx->{'secondary'} = &escape($2);
216 $idx->{'ssortas'} = &escape($1);
217 $idx->{'secondary'} = &escape($1);
222 if (/^tertiary[\[ ](.*)$/i) {
223 if (/^tertiary\[(.*?)\] (.*)$/i) {
224 $idx->{'tsortas'} = &escape($1);
225 $idx->{'tertiary'} = &escape($2);
227 $idx->{'tsortas'} = &escape($1);
228 $idx->{'tertiary'} = &escape($1);
234 $idx->{'see'} = &escape($1);
238 if (/^seealso (.*)$/i) {
239 $idx->{'seealso'} = &escape($1);
243 if (/^significance (.*)$/i) {
244 $idx->{'significance'} = &escape($1);
248 if (/^class (.*)$/i) {
249 $idx->{'class'} = &escape($1);
253 if (/^scope (.*)$/i) {
254 $idx->{'scope'} = &escape(uc($1));
258 if (/^startref (.*)$/i) {
259 $idx->{'startref'} = $1;
269 if (/^zone (.*)$/i) {
273 die "$me: invalid zone: $_\n" if !/^title (.*)$/i;
274 $idx->{'zone'}->{$href} = $1;
278 die "$me: unrecognized tag in input: $_\n";
282 $quiet || print STDERR "$termcount entries loaded...\n";
284 # Fixup the startrefs...
285 # In DocBook, STARTREF is a #CONREF attribute; support this by copying
286 # all of the fields from the indexterm with the id specified by STARTREF
287 # to the indexterm that has the STARTREF.
288 foreach $idx (@term) {
290 if ($idx->{'startref'}) {
291 $ididx = $id{$idx->{'startref'}};
292 foreach $field ('primary', 'secondary', 'tertiary', 'see', 'seealso',
293 'psortas', 'ssortas', 'tsortas', 'significance',
295 $idx->{$field} = $ididx->{$field};
300 # Sort the index terms
301 @term = sort termsort @term;
303 # Move all of the non-alphabetic entries to the front of the index.
304 @term = sortsymbols(@term);
306 safe_open(*OUT, $outfile);
310 print OUT "<$indextag id='$indexid'>\n\n";
312 print OUT "<$indextag>\n\n";
315 print OUT "<!-- This file was produced by collateindex.pl. -->\n";
316 print OUT "<!-- Remove this comment if you edit this file by hand! -->\n";
318 print OUT "<!-- ULINK is abused here.
320 The URL attribute holds the URL that points from the index entry
321 back to the appropriate place in the output produced by the HTML
322 stylesheet. (It's much easier to calculate this URL in the first
325 The Role attribute holds the ID (either real or manufactured) of
326 the corresponding INDEXTERM. This is used by the print backends
327 to produce page numbers.
329 The entries below are sorted and collated into the correct order.
330 Duplicates may be removed in the HTML backend, but in the print
331 backends, it is impossible to suppress duplicate pages or coalesce
332 sequences of pages into a range.
335 print OUT "<title>$title</title>\n\n" if $title;
337 $last = {}; # the last indexterm we processed
338 $first = 1; # this is the first one
339 $group = ""; # we're not in a group yet
340 $lastout = ""; # we've not put anything out yet
341 @seealsos = (); # See also stack.
343 foreach $idx (@term) {
344 next if $idx->{'startref'}; # no way to represent spans...
345 next if ($idx->{'scope'} eq 'LOCAL') && ($scope eq 'GLOBAL');
346 next if ($idx->{'scope'} eq 'GLOBAL') && ($scope eq 'LOCAL');
347 next if &same($idx, $last); # suppress duplicates
351 # If primary changes, output a whole new index term, otherwise just
352 # output another secondary or tertiary, as appropriate. We know from
353 # sorting that the terms will always be in the right order.
354 if (!&tsame($last, $idx, 'primary')) {
355 print "DIFF PRIM\n" if $debug;
356 &end_entry() if not $first;
359 # If we're grouping, make the right indexdivs
360 $letter = $idx->{'psortas'};
361 $letter = $idx->{'primary'} if !$letter;
362 $letter = uc(substr($letter, 0, 1));
364 # symbols are a special case
365 if (($letter lt 'A') || ($letter gt 'Z')) {
367 || (($group ge 'A') && ($group le 'Z'))) {
368 print OUT "</indexdiv>\n" if !$first;
369 print OUT "<indexdiv><title>$symbolsname</title>\n\n";
372 } elsif (($group eq '') || ($group ne $letter)) {
373 print OUT "</indexdiv>\n" if !$first;
374 print OUT "<indexdiv><title>$letter</title>\n\n";
379 $first = 0; # there can only be on first ;-)
381 print OUT "<indexentry>\n";
382 print OUT " <primaryie>", $idx->{'primary'};
383 $lastout = "primaryie";
385 if ($idx->{'secondary'}) {
386 print OUT "\n </primaryie>\n";
387 print OUT " <secondaryie>", $idx->{'secondary'};
388 $lastout = "secondaryie";
391 if ($idx->{'tertiary'}) {
392 print OUT "\n </secondaryie>\n";
393 print OUT " <tertiaryie>", $idx->{'tertiary'};
394 $lastout = "tertiaryie";
396 } elsif (!&tsame($last, $idx, 'secondary')) {
397 print "DIFF SEC\n" if $debug;
399 print OUT "\n </$lastout>\n" if $lastout;
401 foreach (@seealsos) {
402 # it'd be nice to make this a link...
403 print OUT $indent, " <seealsoie>", &escape($_), "</seealsoie>\n";
407 print OUT " <secondaryie>", $idx->{'secondary'};
408 $lastout = "secondaryie";
409 if ($idx->{'tertiary'}) {
410 print OUT "\n </secondaryie>\n";
411 print OUT " <tertiaryie>", $idx->{'tertiary'};
412 $lastout = "tertiaryie";
414 } elsif (!&tsame($last, $idx, 'tertiary')) {
415 print "DIFF TERT\n" if $debug;
417 print OUT "\n </$lastout>\n" if $lastout;
419 foreach (@seealsos) {
420 # it'd be nice to make this a link...
421 print OUT $indent, " <seealsoie>", &escape($_), "</seealsoie>\n";
425 if ($idx->{'tertiary'}) {
426 print OUT " <tertiaryie>", $idx->{'tertiary'};
427 $lastout = "tertiaryie";
436 # Termcount is > 0 iff some entries were skipped.
437 $quiet || print STDERR "$termcount entries ignored...\n";
441 print OUT "</indexdiv>\n" if $lettergroups;
442 print OUT "</$indextag>\n";
446 $quiet || print STDERR "Done.\n";
452 my($aP) = $a->{'psortas'} || $a->{'primary'};
453 my($aS) = $a->{'ssortas'} || $a->{'secondary'};
454 my($aT) = $a->{'tsortas'} || $a->{'tertiary'};
456 my($bP) = $b->{'psortas'} || $b->{'primary'};
457 my($bS) = $b->{'ssortas'} || $b->{'secondary'};
458 my($bT) = $b->{'tsortas'} || $b->{'tertiary'};
462 $aP =~ s/^\s*//; $aP =~ s/\s*$//; $aP = uc($aP);
463 $aS =~ s/^\s*//; $aS =~ s/\s*$//; $aS = uc($aS);
464 $aT =~ s/^\s*//; $aT =~ s/\s*$//; $aT = uc($aT);
465 $bP =~ s/^\s*//; $bP =~ s/\s*$//; $bP = uc($bP);
466 $bS =~ s/^\s*//; $bS =~ s/\s*$//; $bS = uc($bS);
467 $bT =~ s/^\s*//; $bT =~ s/\s*$//; $bT = uc($bT);
469 # print "[$aP]=[$bP]\n";
470 # print "[$aS]=[$bS]\n";
471 # print "[$aT]=[$bT]\n";
473 # Two index terms are the same if:
474 # 1. the primary, secondary, and tertiary entries are the same
475 # (or have the same SORTAS)
477 # 2. They occur in the same titled section
479 # 3. They point to the same place
481 # Notes: Scope is used to suppress some entries, but can't be used
482 # for comparing duplicates.
483 # Interpretation of "the same place" depends on whether or
484 # not $linkpoints is true.
486 $same = (($aP eq $bP)
489 && ($a->{'title'} eq $b->{'title'})
490 && ($a->{'href'} eq $b->{'href'}));
492 # If we're linking to points, they're only the same if they link
493 # to exactly the same spot.
494 $same = $same && ($a->{'hrefpoint'} eq $b->{'hrefpoint'})
498 warn "$me: duplicated index entry found: $aP $aS $aT\n";
505 # Unlike same(), tsame only compares a single term
509 my($sterm) = substr($term, 0, 1) . "sortas";
512 $A = $a->{$sterm} || $a->{$term};
513 $B = $b->{$sterm} || $b->{$term};
515 $A =~ s/^\s*//; $A =~ s/\s*$//; $A = uc($A);
516 $B =~ s/^\s*//; $B =~ s/\s*$//; $B = uc($B);
522 # End any open elements...
523 print OUT "\n </$lastout>\n" if $lastout;
525 foreach (@seealsos) {
526 # it'd be nice to make this a link...
527 print OUT $indent, " <seealsoie>", &escape($_), "</seealsoie>\n";
531 print OUT "</indexentry>\n\n";
536 # Print out the links for an indexterm. There can be more than
537 # one if the term has a ZONE that points to more than one place.
538 # (do we do the right thing in that case?)
540 my($key, $indent, @hrefs);
547 # it'd be nice to make this a link...
549 print OUT "\n </$lastout>\n";
552 print OUT $indent, "<seeie>", &escape($idx->{'see'}), "</seeie>\n";
556 if (keys %{$idx->{'zone'}}) {
557 foreach $key (keys %{$idx->{'zone'}}) {
558 $href{$key} = $idx->{'zone'}->{$key};
562 $href{$idx->{'href'}} = $idx->{'title'};
563 $phref{$idx->{'href'}} = $idx->{'hrefpoint'};
566 # We can't use <LINK> because we don't know the ID of the term in the
567 # original source (and, in fact, it might not have one).
575 $linkend = $phref{$key};
580 $role = $phref{$key};
581 $role = $1 if $role =~ /\#(.*)$/;
582 $role = $1 if $role =~ /(.*)\./;
585 print OUT "<ulink url=\"$linkend\" role=\"$role\">";
586 print OUT "<emphasis>" if ($idx->{'significance'} eq 'PREFERRED');
587 print OUT &escape($href{$key});
588 print OUT "</emphasis>" if ($idx->{'significance'} eq 'PREFERRED');
589 print OUT "</ulink>";
592 if ($idx->{'seealso'}) {
593 push @seealsos, $idx->{'seealso'};
598 my($aP) = $a->{'psortas'} || $a->{'primary'};
599 my($aS) = $a->{'ssortas'} || $a->{'secondary'};
600 my($aT) = $a->{'tsortas'} || $a->{'tertiary'};
601 my($ap) = $a->{'count'};
603 my($bP) = $b->{'psortas'} || $b->{'primary'};
604 my($bS) = $b->{'ssortas'} || $b->{'secondary'};
605 my($bT) = $b->{'tsortas'} || $b->{'tertiary'};
606 my($bp) = $b->{'count'};
608 $aP =~ s/^\s*//; $aP =~ s/\s*$//; $aP = uc($aP);
609 $aS =~ s/^\s*//; $aS =~ s/\s*$//; $aS = uc($aS);
610 $aT =~ s/^\s*//; $aT =~ s/\s*$//; $aT = uc($aT);
611 $bP =~ s/^\s*//; $bP =~ s/\s*$//; $bP = uc($bP);
612 $bS =~ s/^\s*//; $bS =~ s/\s*$//; $bS = uc($bS);
613 $bT =~ s/^\s*//; $bT =~ s/\s*$//; $bT = uc($bT);
618 # make sure seealso's always sort to the bottom
619 return 1 if ($a->{'seealso'});
620 return -1 if ($b->{'seealso'});
621 # if everything else is the same, keep these elements
622 # in document order (so the index links are in the right
643 # Move the non-letter things to the front. Should digits be thier
644 # own group? Maybe...
645 foreach $idx (@term) {
646 $letter = $idx->{'psortas'};
647 $letter = $idx->{'primary'} if !$letter;
648 $letter = uc(substr($letter, 0, 1));
650 if (($letter lt 'A') || ($letter gt 'Z')) {
664 if (($outfile ne '-') && (!$forceoutput)) {
666 if (open (OUT, $outfile)) {
668 if (/<!-- Remove this comment if you edit this file by hand! -->/){
679 print STDERR "$me: file \"$outfile\" appears to have been edited by hand\n";
680 print STDERR "Use the -f option or specify a different output file name.\n";
685 open (OUT, ">$outfile") || die "$me: could not open file \"$outfile\": $!\n";
689 if (open(F, $preamble)) {
695 warn "$me: could not open preamble file \"$preamble\": $!\n";
701 # make sure & and < don't show up in the index
705 s/>/>/sg; # what the heck
714 B<collateindex.pl> B<-o> F<index.sgml> F<HTML.index>
732 Norm Walsh E<lt>ndw@nwalsh.comE<gt>
734 Minor updates by Adam Di Carlo E<lt>adam@onshore.comE<gt> and Peter Eisentraut E<lt>peter_e@gmx.netE<gt>