doc/tools/html2texi.pl

   1 #! /usr/bin/env perl
   2 # html2texi.pl -- Convert HTML documentation to Texinfo format
   3 # Michael Ernst <mernst@cs.washington.edu>
   4 # Time-stamp: <1999-01-12 21:34:27 mernst>
   5
   6 # This program converts HTML documentation trees into Texinfo format.
   7 # Given the name of a main (or contents) HTML file, it processes that file,
   8 # and other files (transitively) referenced by it, into a Texinfo file
   9 # (whose name is chosen from the file or directory name of the argument).
  10 # For instance:
  11 #   html2texi.pl api/index.html
  12 # produces file "api.texi".
  13
  14 # Texinfo format can be easily converted to Info format (for browsing in
  15 # Emacs or the standalone Info browser), to a printed manual, or to HTML.
  16 # Thus, html2texi.pl permits conversion of HTML files to Info format, and
  17 # secondarily enables producing printed versions of Web page hierarchies.
  18
  19 # Unlike HTML, Info format is searchable.  Since Info is integrated into
  20 # Emacs, one can read documentation without starting a separate Web
  21 # browser.  Additionally, Info browsers (including Emacs) contain
  22 # convenient features missing from Web browsers, such as easy index lookup
  23 # and mouse-free browsing.
  24
  25 # Limitations:
  26 # html2texi.pl is currently tuned to latex2html output (and it corrects
  27 # several latex2html bugs), but should be extensible to arbitrary HTML
  28 # documents.  It will be most useful for HTML with a hierarchical structure
  29 # and an index, and it recognizes those features as created by latex2html
  30 # (and possibly by some other tools).  The HTML tree to be traversed must
  31 # be on local disk, rather than being accessed via HTTP.
  32 # This script requires the use of "checkargs.pm".  To eliminate that
  33 # dependence, replace calls to check_args* by @_ (which is always the last
  34 # argument to those functions).
  35 # Also see the "to do" section, below.
  36 # Comments, suggestions, bug fixes, and enhancements are welcome.
  37
  38 # Troubleshooting:
  39 # Malformed HTML can cause this program to abort, so
  40 # you should check your HTML files to make sure they are legal.
  41
  42
  43 ###
  44 ### Typical usage for the Python documentation:
  45 ###
  46
  47 # (Actually, most of this is in a Makefile instead.)
  48 # The resulting Info format Python documentation is currently available at
  49 # ftp://ftp.cs.washington.edu/homes/mernst/python-info.tar.gz
  50
  51 # Fix up HTML problems, eg <DT><DL COMPACT><DD> should be <DT><DL COMPACT><DD>.
  52
  53 # html2texi.pl /homes/fish/mernst/tmp/python-doc/html/api/index.html
  54 # html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ext/index.html
  55 # html2texi.pl /homes/fish/mernst/tmp/python-doc/html/lib/index.html
  56 # html2texi.pl /homes/fish/mernst/tmp/python-doc/html/mac/index.html
  57 # html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ref/index.html
  58 # html2texi.pl /homes/fish/mernst/tmp/python-doc/html/tut/index.html
  59
  60 # Edit the generated .texi files:
  61 #   * change @setfilename to prefix "python-"
  62 #   * fix up any sectioning, such as for Abstract
  63 #   * make Texinfo menus
  64 #   * perhaps remove the @detailmenu ... @end detailmenu
  65 # In Emacs, to do all this:
  66 #   (progn (goto-char (point-min)) (replace-regexp "\\(@setfilename \\)\\([-a-z]*\\)$" "\\1python-\\2.info") (replace-string "@node Front Matter\n@chapter Abstract\n" "@node Abstract\n@section Abstract\n") (progn (mark-whole-buffer) (texinfo-master-menu 'update-all-nodes)) (save-buffer))
  67
  68 # makeinfo api.texi
  69 # makeinfo ext.texi
  70 # makeinfo lib.texi
  71 # makeinfo mac.texi
  72 # makeinfo ref.texi
  73 # makeinfo tut.texi
  74
  75
  76 ###
  77 ### Structure of the code
  78 ###
  79
  80 # To be written...
  81
  82
  83 ###
  84 ### Design decisions
  85 ###
  86
  87 # Source and destination languages
  88 # --------------------------------
  89 #
  90 # The goal is Info files; I create Texinfo, so I don't have to worry about
  91 # the finer details of Info file creation.  (I'm not even sure of its exact
  92 # format.)
  93 #
  94 # Why not start from LaTeX rather than HTML?
  95 # I could hack latex2html itself to produce Texinfo instead, or fix up
  96 # partparse.py (which already translates LaTeX to Texinfo).
  97 #  Pros:
  98 #   * has high-level information such as index entries, original formatting
  99 #  Cons:
 100 #   * those programs are complicated to read and understand
 101 #   * those programs try to handle arbitrary LaTeX input, track catcodes,
 102 #     and more:  I don't want to go to that effort.  HTML isn't as powerful
 103 #     as LaTeX, so there are fewer subtleties.
 104 #   * the result wouldn't work for arbitrary HTML documents; it would be
 105 #     nice to eventually extend this program to HTML produced from Docbook,
 106 #     Frame, and more.
 107
 108 # Parsing
 109 # -------
 110 #
 111 # I don't want to view the text as a linear stream; I'd rather parse the
 112 # whole thing and then do pattern matching over the parsed representation (to
 113 # find idioms such as indices, lists of child nodes, etc.).
 114 #  * Perl provides HTML::TreeBuilder, which does just what I want.
 115 #     * libwww-perl: http://www.linpro.no/lwp/
 116 #     * TreeBuilder: HTML-Tree-0.51.tar.gz
 117 #  * Python Parsers, Formatters, and Writers don't really provide the right
 118 #    interface (and the version in Grail doesn't correspond to another
 119 #    distributed version, so I'm confused about which to be using).  I could
 120 #    write something in Python that creates a parse tree, but why bother?
 121
 122 # Other implementation language issues:
 123 #  * Python lacks variable declarations, reasonable scoping, and static
 124 #    checking tools.  I've written some of the latter for myself that make
 125 #    my Perl programming a lot safer than my Python programming will be until
 126 #    I have a similar suite for that language.
 127
 128
 129 ###########################################################################
 130 ### To do
 131 ###
 132
 133 # Section names:
 134 #   Fix the problem with multiple sections in a single file (eg, Abstract in
 135 #     Front Matter section).
 136 #   Deal with cross-references, as in /homes/fish/mernst/tmp/python-doc/html/ref/types.html:310
 137 # Index:
 138 #   Perhaps double-check that every tag mentioned in the index is found
 139 #     in the text.
 140 # Python:  email to python-docs@python.org, to get their feedback.
 141 #   Compare to existing lib/ Info manual
 142 #   Write the hooks into info-look; replace pyliblookup1-1.tar.gz.
 143 #   Postpass to remove extra quotation marks around typography already in
 144 #     a different font (to avoid double delimiters as in "`code'"); or
 145 #     perhaps consider using only font-based markup so that we don't get
 146 #     the extra *bold* and `code' markup in Info.
 147
 148 ## Perhaps don't rely on automatic means for adding up, next, prev; I have
 149 ## all that info available to me already, so it's not so much trouble to
 150 ## add it.  (Right?)  But it is *so* easy to use Emacs instead...
 151
 152
 153 ###########################################################################
 154 ### Strictures
 155 ###
 156
 157 # man HTML::TreeBuilder
 158 # man HTML::Parser
 159 # man HTML::Element
 160
 161 # require HTML::ParserWComment;
 162 require HTML::Parser;
 163 require HTML::TreeBuilder;
 164 require HTML::Element;
 165
 166 use File::Basename;
 167
 168 use strict;
 169 # use Carp;
 170
 171 use checkargs;
 172
 173
 174 ###########################################################################
 175 ### Variables
 176 ###
 177
 178 my @section_stack = ();         # elements are chapter/section/subsec nodetitles (I think)
 179 my $current_ref_tdf;            # for the file currently being processed;
 180                                 #  used in error messages
 181 my $html_directory;
 182 my %footnotes;
 183
 184 # First element should not be used.
 185 my @sectionmarker = ("manual", "chapter", "section", "subsection", "subsubsection");
 186
 187 my %inline_markup = ("b" => "strong",
 188                      "code" => "code",
 189                      "i" => "emph",
 190                      "kbd" => "kbd",
 191                      "samp" => "samp",
 192                      "strong" => "strong",
 193                      "tt" => "code",
 194                      "var" => "var");
 195
 196 my @deferred_index_entries = ();
 197
 198 my @index_titles = ();          # list of (filename, type) lists
 199 my %index_info = ("Index" => ["\@blindex", "bl"],
 200                   "Concept Index" => ["\@cindex", "cp"],
 201                   "Module Index" => ["\@mdindex", "md"]);
 202
 203
 204 ###########################################################################
 205 ### Main/contents page
 206 ###
 207
 208 # Process first-level page on its own, or just a contents page?  Well, I do
 209 # want the title, author, etc., and the front matter...  For now, just add
 210 # that by hand at the end.
 211
 212
 213 # data structure possibilities:
 214 #  * tree-like (need some kind of stack when processing (or parent pointers))
 215 #  * list of name and depth; remember old and new depths.
 216
 217 # Each element is a reference to a list of (nodetitle, depth, filename).
 218 my @contents_list = ();
 219
 220 # The problem with doing fixups on the fly is that some sections may have
 221 # already been processed (and no longer available) by the time we notice
 222 # others with the same name.  It's probably better to fully construct the
 223 # contents list (reading in all files of interest) upfront; that will also
 224 # let me do a better job with cross-references, because again, all files
 225 # will already be read in.
 226 my %contents_hash = ();
 227 my %contents_fixups = ();
 228
 229 my @current_contents_list = ();
 230
 231 # Merge @current_contents_list into @contents_list,
 232 # and set @current_contents_list to be empty.
 233 sub merge_contents_lists ( )
 234 { check_args(0, @_);
 235
 236   # Three possibilities:
 237   #  * @contents_list is empty: replace it by @current_contents_list.
 238   #  * prefixes of the two lists are identical: do nothing
 239   #  * @current_contents_list is all at lower level than $contents_list[0];
 240   #    prefix @contents_list by @current_contents_list
 241
 242   if (scalar(@current_contents_list) == 0)
 243     { die "empty current_contents_list"; }
 244
 245   #   if (scalar(@contents_list) == 0)
 246   #     { @contents_list = @current_contents_list;
 247   #       @current_contents_list = ();
 248   #       return; }
 249
 250   #   if (($ {$contents_list[0]}[1]) < ($ {$current_contents_list[0]}[1]))
 251   #     { unshift @contents_list, @current_contents_list;
 252   #       @current_contents_list = ();
 253   #       return; }
 254
 255   for (my $i=0; $i<scalar(@current_contents_list); $i++)
 256     { my $ref_c_tdf = $current_contents_list[$i];
 257       if ($i >= scalar(@contents_list))
 258         { push @contents_list, $ref_c_tdf;
 259           my $title = $ {$ref_c_tdf}[0];
 260           if (defined $contents_hash{$title})
 261             { $contents_fixups{$title} = 1; }
 262           else
 263             { $contents_hash{$title} = 1; }
 264           next; }
 265       my $ref_tdf = $contents_list[$i];
 266       my ($title, $depth, $file) = @{$ref_tdf};
 267       my ($c_title, $c_depth, $c_file) = @{$ref_c_tdf};
 268
 269       if (($title ne $c_title)
 270           && ($depth < $c_depth)
 271           && ($file ne $c_file))
 272         { splice @contents_list, $i, 0, $ref_c_tdf;
 273           if (defined $contents_hash{$c_title})
 274             { $contents_fixups{$c_title} = 1; }
 275           else
 276             { $contents_hash{$c_title} = 1; }
 277           next; }
 278
 279       if (($title ne $c_title)
 280           || ($depth != $c_depth)
 281           || ($file ne $c_file))
 282         { die ("while processing $ {$current_ref_tdf}[2] at depth $ {$current_ref_tdf}[1], mismatch at index $i:",
 283                "\n  main:  <<<$title>>> $depth $file",
 284                "\n  curr:  <<<$c_title>>> $c_depth $c_file"); }
 285     }
 286   @current_contents_list = ();
 287 }
 288
 289
 290
 291 # Set @current_contents_list to a list of (title, href, sectionlevel);
 292 #  then merge that list into @contents_list.
 293 # Maybe this function should also produce a map
 294 #  from title (or href) to sectionlevel (eg "chapter"?).
 295 sub process_child_links ( $ )
 296 { my ($he) = check_args(1, @_);
 297
 298   # $he->dump();
 299   if (scalar(@current_contents_list) != 0)
 300     { die "current_contents_list nonempty: @current_contents_list"; }
 301   $he->traverse(\&increment_current_contents_list, 'ignore text');
 302
 303   # Normalize the depths; for instance, convert 1,3,5 into 0,1,2.
 304   my %depths = ();
 305   for my $ref_tdf (@current_contents_list)
 306     { $depths{$ {$ref_tdf}[1]} = 1; }
 307   my @sorted_depths = sort keys %depths;
 308   my $current_depth = scalar(@section_stack)-1;
 309   my $current_depth_2 = $ {$current_ref_tdf}[1];
 310   if ($current_depth != $current_depth_2)
 311     { die "mismatch in current depths: $current_depth $current_depth_2; ", join(", ", @section_stack); }
 312   for (my $i=0; $i<scalar(@sorted_depths); $i++)
 313     { $depths{$sorted_depths[$i]} = $i + $current_depth+1; }
 314   for my $ref_tdf (@current_contents_list)
 315     { $ {$ref_tdf}[1] = $depths{$ {$ref_tdf}[1]}; }
 316
 317   # Eliminate uninteresting sections.  Hard-coded hack for now.
 318   if ($ {$current_contents_list[-1]}[0] eq "About this document ...")
 319     { pop @current_contents_list; }
 320   if ((scalar(@current_contents_list) > 1)
 321       && ($ {$current_contents_list[1]}[0] eq "Contents"))
 322     { my $ref_first_tdf = shift @current_contents_list;
 323       $current_contents_list[0] = $ref_first_tdf; }
 324
 325   for (my $i=0; $i<scalar(@current_contents_list); $i++)
 326     { my $ref_tdf = $current_contents_list[$i];
 327       my $title = $ {$ref_tdf}[0];
 328       if (exists $index_info{$title})
 329         { my $index_file = $ {$ref_tdf}[2];
 330           my ($indexing_command, $suffix) = @{$index_info{$title}};
 331           process_index_file($index_file, $indexing_command);
 332           print TEXI "\n\@defindex $suffix\n";
 333           push @index_titles, $title;
 334           splice @current_contents_list, $i, 1;
 335           $i--; }
 336       elsif ($title =~ /\bIndex$/)
 337         { print STDERR "Warning: \"$title\" might be an index; if so, edit \%index_info.\n"; } }
 338
 339   merge_contents_lists();
 340
 341   # print_contents_list();
 342   # print_index_info();
 343 }
 344
 345
 346 sub increment_current_contents_list ( $$$ )
 347 { my ($he, $startflag, $depth) = check_args(3, @_);
 348   if (!$startflag)
 349     { return; }
 350
 351   if ($he->tag eq "li")
 352     { my @li_content = @{$he->content};
 353       if ($li_content[0]->tag ne "a")
 354         { die "first element of <LI> should be <A>"; }
 355       my ($name, $href, @content) = anchor_info($li_content[0]);
 356       # unused $name
 357       my $title = join("", collect_texts($li_content[0]));
 358       $title = texi_remove_punctuation($title);
 359       # The problem with these is that they are formatted differently in
 360       # @menu and @node!
 361       $title =~ s/``/\"/g;
 362       $title =~ s/''/\"/g;
 363       $title =~ s/ -- / /g;
 364       push @current_contents_list, [ $title, $depth, $href ]; }
 365   return 1;
 366 }
 367
 368 # Simple version for section titles
 369 sub html_to_texi ( $ )
 370 { my ($he) = check_args(1, @_);
 371   if (!ref $he)
 372     { return $he; }
 373
 374   my $tag = $he->tag;
 375   if (exists $inline_markup{$tag})
 376     { my $result = "\@$inline_markup{$tag}\{";
 377       for my $elt (@{$he->content})
 378         { $result .= html_to_texi($elt); }
 379       $result .= "\}";
 380       return $result; }
 381   else
 382     { $he->dump();
 383       die "html_to_texi confused by <$tag>"; }
 384 }
 385
 386
 387
 388 sub print_contents_list ()
 389 { check_args(0, @_);
 390   print STDERR "Contents list:\n";
 391   for my $ref_tdf (@contents_list)
 392     { my ($title, $depth, $file) = @{$ref_tdf};
 393       print STDERR "$title $depth $file\n"; }
 394 }
 395
 396
 397
 398 ###########################################################################
 399 ### Index
 400 ###
 401
 402 my $l2h_broken_link_name = "l2h-";
 403
 404
 405 # map from file to (map from anchor name to (list of index texts))
 406 # (The list is needed when a single LaTeX command like \envvar
 407 # expands to multiple \index commands.)
 408 my %file_index_entries = ();
 409 my %this_index_entries;         # map from anchor name to (list of index texts)
 410
 411 my %file_index_entries_broken = (); # map from file to (list of index texts)
 412 my @this_index_entries_broken;
 413
 414 my $index_prefix = "";
 415 my @index_prefixes = ();
 416
 417 my $this_indexing_command;
 418
 419 sub print_index_info ()
 420 { check_args(0, @_);
 421   my ($key, $val);
 422   for my $file (sort keys %file_index_entries)
 423     { my %index_entries = %{$file_index_entries{$file}};
 424       print STDERR "file: $file\n";
 425       for my $aname (sort keys %index_entries)
 426         { my @entries = @{$index_entries{$aname}};
 427           if (scalar(@entries) == 1)
 428             { print STDERR "  $aname : $entries[0]\n"; }
 429           else
 430             { print STDERR "  $aname : ", join("\n     " . (" " x length($aname)), @entries), "\n"; } } }
 431   for my $file (sort keys %file_index_entries_broken)
 432     { my @entries = @{$file_index_entries_broken{$file}};
 433       print STDERR "file: $file\n";
 434       for my $entry (@entries)
 435         { print STDERR "  $entry\n"; }
 436     }
 437 }
 438
 439
 440 sub process_index_file ( $$ )
 441 { my ($file, $indexing_command) = check_args(2, @_);
 442   # print "process_index_file $file $indexing_command\n";
 443
 444   my $he = file_to_tree($html_directory . $file);
 445   # $he->dump();
 446
 447   $this_indexing_command = $indexing_command;
 448   $he->traverse(\&process_if_index_dl_compact, 'ignore text');
 449   undef $this_indexing_command;
 450   # print "process_index_file done\n";
 451 }
 452
 453
 454 sub process_if_index_dl_compact ( $$$ )
 455 { my ($he, $startflag) = (check_args(3, @_))[0,1]; #  ignore depth argument
 456   if (!$startflag)
 457     { return; }
 458
 459   if (($he->tag() eq "dl") && (defined $he->attr('compact')))
 460     { process_index_dl_compact($he);
 461       return 0; }
 462   else
 463     { return 1; }
 464 }
 465
 466
 467 # The elements of a <DL COMPACT> list from a LaTeX2HTML index:
 468 #  * a single space: text to be ignored
 469 #  * <DT> elements with an optional <DD> element following each one
 470 #    Two types of <DT> elements:
 471 #     * Followed by a <DD> element:  the <DT> contains a single
 472 #       string, and the <DD> contains a whitespace string to be ignored, a
 473 #       <DL COMPACT> to be recursively processed (with the <DT> string as a
 474 #       prefix), and a whitespace string to be ignored.
 475 #     * Not followed by a <DD> element:  contains a list of anchors
 476 #       and texts (ignore the texts, which are only whitespace and commas).
 477 #       Optionally contains a <DL COMPACT> to be recursively processed (with
 478 #       the <DT> string as a prefix)
 479 sub process_index_dl_compact ( $ )
 480 { my ($h) = check_args(1, @_);
 481   my @content = @{$h->content()};
 482   for (my $i = 0; $i < scalar(@content); $i++)
 483     { my $this_he = $content[$i];
 484       if ($this_he->tag ne "dt")
 485         { $this_he->dump();
 486           die "Expected <DT> tag: " . $this_he->tag; }
 487       if (($i < scalar(@content) - 1) && ($content[$i+1]->tag eq "dd"))
 488         { process_index_dt_and_dd($this_he, $content[$i+1]);
 489           $i++; }
 490       else
 491         { process_index_lone_dt($this_he); } } }
 492
 493
 494
 495 # Argument is a <DT> element.  If it contains more than one anchor, then
 496 # the texts of all subsequent ones are "[Link]".  Example:
 497 #       <DT>
 498 #         <A HREF="embedding.html#l2h-201">
 499 #           "$PATH"
 500 #         ", "
 501 #         <A HREF="embedding.html#l2h-205">
 502 #           "[Link]"
 503 # Optionally contains a <DL COMPACT> as well.  Example:
 504 # <DT>
 505 #   <A HREF="types.html#l2h-616">
 506 #     "attribute"
 507 #   <DL COMPACT>
 508 #     <DT>
 509 #       <A HREF="assignment.html#l2h-3074">
 510 #         "assignment"
 511 #       ", "
 512 #       <A HREF="assignment.html#l2h-3099">
 513 #         "[Link]"
 514 #     <DT>
 515 #       <A HREF="types.html#l2h-">
 516 #         "assignment, class"
 517
 518 sub process_index_lone_dt ( $ )
 519 { my ($dt) = check_args(1, @_);
 520   my @dtcontent = @{$dt->content()};
 521   my $acontent;
 522   my $acontent_suffix;
 523   for my $a (@dtcontent)
 524     { if ($a eq ", ")
 525         { next; }
 526       if (!ref $a)
 527         { $dt->dump;
 528           die "Unexpected <DT> string element: $a"; }
 529
 530       if ($a->tag eq "dl")
 531         { push @index_prefixes, $index_prefix;
 532           if (!defined $acontent_suffix)
 533             { die "acontent_suffix not yet defined"; }
 534           $index_prefix .= $acontent_suffix . ", ";
 535           process_index_dl_compact($a);
 536           $index_prefix = pop(@index_prefixes);
 537           return; }
 538
 539       if ($a->tag ne "a")
 540         { $dt->dump;
 541           $a->dump;
 542           die "Expected anchor in lone <DT>"; }
 543
 544       my ($aname, $ahref, @acontent) = anchor_info($a);
 545       # unused $aname
 546       if (scalar(@acontent) != 1)
 547         { die "Expected just one content of <A> in <DT>: @acontent"; }
 548       if (ref $acontent[0])
 549         { $acontent[0]->dump;
 550           die "Expected string content of <A> in <DT>: $acontent[0]"; }
 551       if (!defined($acontent))
 552         { $acontent = $index_prefix . $acontent[0];
 553           $acontent_suffix = $acontent[0]; }
 554       elsif (($acontent[0] ne "[Link]") && ($acontent ne ($index_prefix . $acontent[0])))
 555         { die "Differing content: <<<$acontent>>>, <<<$acontent[0]>>>"; }
 556
 557       if (!defined $ahref)
 558         { $dt->dump;
 559           die "no HREF in nachor in <DT>"; }
 560       my ($ahref_file, $ahref_name) = split(/\#/, $ahref);
 561       if (!defined $ahref_name)
 562         { # Reference to entire file
 563           $ahref_name = ""; }
 564
 565       if ($ahref_name eq $l2h_broken_link_name)
 566         { if (!exists $file_index_entries_broken{$ahref_file})
 567             { $file_index_entries_broken{$ahref_file} = []; }
 568           push @{$file_index_entries_broken{$ahref_file}}, "$this_indexing_command $acontent";
 569           next; }
 570
 571       if (!exists $file_index_entries{$ahref_file})
 572         { $file_index_entries{$ahref_file} = {}; }
 573       # Don't do this!  It appears to make a copy, which is not desired.
 574       # my %index_entries = %{$file_index_entries{$ahref_file}};
 575       if (!exists $ {$file_index_entries{$ahref_file}}{$ahref_name})
 576         { $ {$file_index_entries{$ahref_file}}{$ahref_name} = []; }
 577       #         { my $oldcontent = $ {$file_index_entries{$ahref_file}}{$ahref_name};
 578       #           if ($acontent eq $oldcontent)
 579       #             { die "Multiple identical index entries?"; }
 580       #           die "Trying to add $acontent, but already have index entry pointing at $ahref_file\#$ahref_name: ${$file_index_entries{$ahref_file}}{$ahref_name}"; }
 581
 582       push @{$ {$file_index_entries{$ahref_file}}{$ahref_name}}, "$this_indexing_command $acontent";
 583       # print STDERR "keys: ", keys %{$file_index_entries{$ahref_file}}, "\n";
 584     }
 585 }
 586
 587 sub process_index_dt_and_dd ( $$ )
 588 { my ($dt, $dd) = check_args(2, @_);
 589   my $dtcontent;
 590   { my @dtcontent = @{$dt->content()};
 591     if ((scalar(@dtcontent) != 1) || (ref $dtcontent[0]))
 592       { $dd->dump;
 593         $dt->dump;
 594         die "Expected single string (actual size = " . scalar(@dtcontent) . ") in content of <DT>: @dtcontent"; }
 595     $dtcontent = $dtcontent[0];
 596     $dtcontent =~ s/ +$//; }
 597   my $ddcontent;
 598   { my @ddcontent = @{$dd->content()};
 599     if (scalar(@ddcontent) != 1)
 600       { die "Expected single <DD> content, got ", scalar(@ddcontent), " elements:\n", join("\n", @ddcontent), "\n "; }
 601     $ddcontent = $ddcontent[0]; }
 602   if ($ddcontent->tag ne "dl")
 603     { die "Expected <DL> as content of <DD>, but saw: $ddcontent"; }
 604
 605   push @index_prefixes, $index_prefix;
 606   $index_prefix .= $dtcontent . ", ";
 607   process_index_dl_compact($ddcontent);
 608   $index_prefix = pop(@index_prefixes);
 609 }
 610
 611
 612 ###########################################################################
 613 ### Ordinary sections
 614 ###
 615
 616 sub process_section_file ( $$$ )
 617 { my ($file, $depth, $nodetitle) = check_args(3, @_);
 618   my $he = file_to_tree(($file =~ /^\//) ? $file : $html_directory . $file);
 619
 620   # print STDERR "process_section_file: $file $depth $nodetitle\n";
 621
 622   # Equivalently:
 623   #   while ($depth >= scalar(@section_stack)) { pop(@section_stack); }
 624   @section_stack = @section_stack[0..$depth-1];
 625
 626   # Not a great nodename fixup scheme; need a more global view
 627   if ((defined $contents_fixups{$nodetitle})
 628       && (scalar(@section_stack) > 0))
 629     { my $up_title = $section_stack[$#section_stack];
 630       # hack for Python Standard Library
 631       $up_title =~ s/^(Built-in|Standard) Module //g;
 632       my ($up_first_word) = split(/ /, $up_title);
 633       $nodetitle = "$up_first_word $nodetitle";
 634     }
 635
 636   push @section_stack, $nodetitle;
 637   # print STDERR "new section_stack: ", join(", ", @section_stack), "\n";
 638
 639   $he->traverse(\&process_if_child_links, 'ignore text');
 640   %footnotes = ();
 641   # $he->dump;
 642   $he->traverse(\&process_if_footnotes, 'ignore text');
 643
 644   # $he->dump;
 645
 646   if (exists $file_index_entries{$file})
 647     { %this_index_entries = %{$file_index_entries{$file}};
 648       # print STDERR "this_index_entries:\n ", join("\n ", keys %this_index_entries), "\n";
 649     }
 650   else
 651     { # print STDERR "Warning: no index entries for file $file\n";
 652       %this_index_entries = (); }
 653
 654   if (exists $file_index_entries_broken{$file})
 655     { @this_index_entries_broken = @{$file_index_entries_broken{$file}}; }
 656   else
 657     { # print STDERR "Warning: no index entries for file $file\n";
 658       @this_index_entries_broken = (); }
 659
 660
 661   if ($he->tag() ne "html")
 662     { die "Expected <HTML> at top level"; }
 663   my @content = @{$he->content()};
 664   if ((!ref $content[0]) or ($content[0]->tag ne "head"))
 665     { $he->dump;
 666       die "<HEAD> not first element of <HTML>"; }
 667   if ((!ref $content[1]) or ($content[1]->tag ne "body"))
 668     { $he->dump;
 669       die "<BODY> not second element of <HTML>"; }
 670
 671   $content[1]->traverse(\&output_body);
 672 }
 673
 674 # stack of things we're inside that are preventing indexing from occurring now.
 675 # These are "h1", "h2", "h3", "h4", "h5", "h6", "dt" (and possibly others?)
 676 my @index_deferrers = ();
 677
 678 sub push_or_pop_index_deferrers ( $$ )
 679 { my ($tag, $startflag) = check_args(2, @_);
 680   if ($startflag)
 681     { push @index_deferrers, $tag; }
 682   else
 683     { my $old_deferrer = pop @index_deferrers;
 684       if ($tag ne $old_deferrer)
 685         { die "Expected $tag at top of index_deferrers but saw $old_deferrer; remainder = ", join(" ", @index_deferrers); }
 686       do_deferred_index_entries(); }
 687 }
 688
 689
 690 sub label_add_index_entries ( $;$ )
 691 { my ($label, $he) = check_args_range(1, 2, @_);
 692   # print ((exists $this_index_entries{$label}) ? "*" : " "), " label_add_index_entries $label\n";
 693   # $he is the anchor element
 694   if (exists $this_index_entries{$label})
 695     { push @deferred_index_entries, @{$this_index_entries{$label}};
 696       return; }
 697
 698   if ($label eq $l2h_broken_link_name)
 699     { # Try to find some text to use in guessing which links should point here
 700       # I should probably only look at the previous element, or if that is
 701       # all punctuation, the one before it; collecting all the previous texts
 702       # is a bit of overkill.
 703       my @anchor_texts = collect_texts($he);
 704       my @previous_texts = collect_texts($he->parent, $he);
 705       # 4 elements is arbitrary; ought to filter out punctuation and small words
 706       # first, then perhaps keep fewer.  Perhaps also filter out formatting so
 707       # that we can see a larger chunk of text?  (Probably not.)
 708       # Also perhaps should do further chunking into words, in case the
 709       # index term isn't a chunk of its own (eg, was in <tt>...</tt>.
 710       my @candidate_texts = (@anchor_texts, (reverse(@previous_texts))[0..min(3,$#previous_texts)]);
 711
 712       my $guessed = 0;
 713       for my $text (@candidate_texts)
 714         { # my $orig_text = $text;
 715           if ($text =~ /^[\"\`\'().?! ]*$/)
 716             { next; }
 717           if (length($text) <= 2)
 718             { next; }
 719           # hack for Python manual; maybe defer until failure first time around?
 720           $text =~ s/^sys\.//g;
 721           for my $iterm (@this_index_entries_broken)
 722             { # I could test for zero:  LaTeX2HTML's failures in the Python
 723               # documentation are only for items of the form "... (built-in...)"
 724               if (index($iterm, $text) != -1)
 725                 { push @deferred_index_entries, $iterm;
 726                   # print STDERR "Guessing index term `$iterm' for text `$orig_text'\n";
 727                   $guessed = 1;
 728                 } } }
 729       if (!$guessed)
 730         { # print STDERR "No guess in `", join("'; `", @this_index_entries_broken), "' for texts:\n `", join("'\n `", @candidate_texts), "'\n";
 731         }
 732     }
 733 }
 734
 735
 736 # Need to add calls to this at various places.
 737 # Perhaps add HTML::Element argument and do the check for appropriateness
 738 # here (ie, no action if inside <H1>, etc.).
 739 sub do_deferred_index_entries ()
 740 { check_args(0, @_);
 741   if ((scalar(@deferred_index_entries) > 0)
 742       && (scalar(@index_deferrers) == 0))
 743     { print TEXI "\n", join("\n", @deferred_index_entries), "\n";
 744       @deferred_index_entries = (); }
 745 }
 746
 747 my $table_columns;              # undefined if not in a table
 748 my $table_first_column;         # boolean
 749
 750 sub output_body ( $$$ )
 751 { my ($he, $startflag) = (check_args(3, @_))[0,1]; #  ignore depth argument
 752
 753   if (!ref $he)
 754     { my $space_index = index($he, " ");
 755       if ($space_index != -1)
 756         { # Why does
 757           #   print TEXI texi_quote(substr($he, 0, $space_index+1));
 758           # give:  Can't locate object method "TEXI" via package "texi_quote"
 759           # (Because the definition texi_quote hasn't been seen yet.)
 760           print TEXI &texi_quote(substr($he, 0, $space_index+1));
 761           do_deferred_index_entries();
 762           print TEXI &texi_quote(substr($he, $space_index+1)); }
 763       else
 764         { print TEXI &texi_quote($he); }
 765       return; }
 766
 767   my $tag = $he->tag();
 768
 769   # Ordinary text markup first
 770   if (exists $inline_markup{$tag})
 771     { if ($startflag)
 772         { print TEXI "\@$inline_markup{$tag}\{"; }
 773       else
 774         { print TEXI "\}"; } }
 775   elsif ($tag eq "a")
 776     { my ($name, $href, @content) = anchor_info($he);
 777       if (!$href)
 778         { # This anchor is only here for indexing/cross referencing purposes.
 779           if ($startflag)
 780             { label_add_index_entries($name, $he); }
 781         }
 782       elsif ($href =~ "^(ftp|http|news):")
 783         { if ($startflag)
 784             { # Should avoid second argument if it's identical to the URL.
 785               print TEXI "\@uref\{$href, "; }
 786           else
 787             { print TEXI "\}"; }
 788         }
 789       elsif ($href =~ /^\#(foot[0-9]+)$/)
 790         { # Footnote
 791           if ($startflag)
 792             { # Could double-check name and content, but I'm not
 793               # currently storing that information.
 794               print TEXI "\@footnote\{";
 795               $footnotes{$1}->traverse(\&output_body);
 796               print TEXI "\}";
 797               return 0; } }
 798       else
 799         { if ($startflag)
 800             { # cross-references are not active Info links, but no text is lost
 801               print STDERR "Can't deal with internal HREF anchors yet:\n";
 802               $he->dump; }
 803         }
 804     }
 805   elsif ($tag eq "br")
 806     { print TEXI "\@\n"; }
 807   elsif ($tag eq "body")
 808     { }
 809   elsif ($tag eq "center")
 810     { if (has_single_content_string($he)
 811           && ($ {$he->content}[0] =~ /^ *$/))
 812         { return 0; }
 813       if ($startflag)
 814         { print TEXI "\n\@center\n"; }
 815       else
 816         { print TEXI "\n\@end center\n"; }
 817     }
 818   elsif ($tag eq "div")
 819     { my $align = $he->attr('align');
 820       if (defined($align) && ($align eq "center"))
 821         { if (has_single_content_string($he)
 822               && ($ {$he->content}[0] =~ /^ *$/))
 823             { return 0; }
 824           if ($startflag)
 825             { print TEXI "\n\@center\n"; }
 826           else
 827             { print TEXI "\n\@end center\n"; } }
 828     }
 829   elsif ($tag eq "dl")
 830     { # Recognize "<dl><dd><pre> ... </pre></dl>" paradigm for "@example"
 831       if (has_single_content_with_tag($he, "dd"))
 832         { my $he_dd = $ {$he->content}[0];
 833           if (has_single_content_with_tag($he_dd, "pre"))
 834             { my $he_pre = $ {$he_dd->content}[0];
 835               print_pre($he_pre);
 836               return 0; } }
 837       if ($startflag)
 838         { # Could examine the elements, to be cleverer about formatting.
 839           # (Also to use ftable, vtable...)
 840           print TEXI "\n\@table \@asis\n"; }
 841       else
 842         { print TEXI "\n\@end table\n"; }
 843     }
 844   elsif ($tag eq "dt")
 845     { push_or_pop_index_deferrers($tag, $startflag);
 846       if ($startflag)
 847         { print TEXI "\n\@item "; }
 848       else
 849         { } }
 850   elsif ($tag eq "dd")
 851     { if ($startflag)
 852         { print TEXI "\n"; }
 853       else
 854         { }
 855       if (scalar(@index_deferrers) != 0)
 856         { $he->dump;
 857           die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; }
 858       do_deferred_index_entries();
 859     }
 860   elsif ($tag =~ /^(font|big|small)$/)
 861     { # Do nothing for now.
 862     }
 863   elsif ($tag =~ /^h[1-6]$/)
 864     { # We don't need this because we never recursively enter the heading content.
 865       # push_or_pop_index_deferrers($tag, $startflag);
 866       my $secname = "";
 867       my @seclabels = ();
 868       for my $elt (@{$he->content})
 869         { if (!ref $elt)
 870             { $secname .= $elt; }
 871           elsif ($elt->tag eq "br")
 872             { }
 873           elsif ($elt->tag eq "a")
 874             { my ($name, $href, @acontent) = anchor_info($elt);
 875               if ($href)
 876                 { $he->dump;
 877                   $elt->dump;
 878                   die "Nonsimple anchor in <$tag>"; }
 879               if (!defined $name)
 880                 { die "No NAME for anchor in $tag"; }
 881               push @seclabels, $name;
 882               for my $subelt (@acontent)
 883                 { $secname .= html_to_texi($subelt); } }
 884           else
 885             { $secname .= html_to_texi($elt); } }
 886       if ($secname eq "")
 887         { die "No section name in <$tag>"; }
 888       if (scalar(@section_stack) == 1)
 889         { if ($section_stack[-1] ne "Top")
 890             { die "Not top? $section_stack[-1]"; }
 891           print TEXI "\@settitle $secname\n";
 892           print TEXI "\@c %**end of header\n";
 893           print TEXI "\n";
 894           print TEXI "\@node Top\n";
 895           print TEXI "\n"; }
 896       else
 897         { print TEXI "\n\@node $section_stack[-1]\n";
 898           print TEXI "\@$sectionmarker[scalar(@section_stack)-1] ", texi_remove_punctuation($secname), "\n"; }
 899       for my $seclabel (@seclabels)
 900         { label_add_index_entries($seclabel); }
 901       # This should only happen once per file.
 902       label_add_index_entries("");
 903       if (scalar(@index_deferrers) != 0)
 904         { $he->dump;
 905           die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; }
 906       do_deferred_index_entries();
 907       return 0;
 908     }
 909   elsif ($tag eq "hr")
 910     { }
 911   elsif ($tag eq "ignore")
 912     { # Hack for ignored elements
 913       return 0;
 914     }
 915   elsif ($tag eq "li")
 916     { if ($startflag)
 917         { print TEXI "\n\n\@item\n";
 918           do_deferred_index_entries(); } }
 919   elsif ($tag eq "ol")
 920     { if ($startflag)
 921         { print TEXI "\n\@enumerate \@bullet\n"; }
 922       else
 923         { print TEXI "\n\@end enumerate\n"; } }
 924   elsif ($tag eq "p")
 925     { if ($startflag)
 926         { print TEXI "\n\n"; }
 927       if (scalar(@index_deferrers) != 0)
 928         { $he->dump;
 929           die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; }
 930       do_deferred_index_entries(); }
 931   elsif ($tag eq "pre")
 932     { print_pre($he);
 933       return 0; }
 934   elsif ($tag eq "table")
 935     { # Could also indicate common formatting for first column, or
 936       # determine relative widths for columns (or determine a prototype row)
 937       if ($startflag)
 938         { if (defined $table_columns)
 939             { $he->dump;
 940               die "Can't deal with table nested inside $table_columns-column table"; }
 941           $table_columns = table_columns($he);
 942           if ($table_columns < 2)
 943             { $he->dump;
 944               die "Column with $table_columns columns?"; }
 945           elsif ($table_columns == 2)
 946             { print TEXI "\n\@table \@asis\n"; }
 947           else
 948             { print TEXI "\n\@multitable \@columnfractions";
 949               for (my $i=0; $i<$table_columns; $i++)
 950                 { print TEXI " ", 1.0/$table_columns; }
 951               print TEXI "\n"; } }
 952       else
 953         { if ($table_columns == 2)
 954             { print TEXI "\n\@end table\n"; }
 955           else
 956             { print TEXI "\n\@end multitable\n"; }
 957           undef $table_columns; } }
 958   elsif (($tag eq "td") || ($tag eq "th"))
 959     { if ($startflag)
 960         { if ($table_first_column)
 961             { print TEXI "\n\@item ";
 962               $table_first_column = 0; }
 963           elsif ($table_columns > 2)
 964             { print TEXI "\n\@tab "; } }
 965       else
 966         { print TEXI "\n"; } }
 967   elsif ($tag eq "tr")
 968     { if ($startflag)
 969         { $table_first_column = 1; } }
 970   elsif ($tag eq "ul")
 971     { if ($startflag)
 972         { print TEXI "\n\@itemize \@bullet\n"; }
 973       else
 974         { print TEXI "\n\@end itemize\n"; } }
 975   else
 976     { # I used to have a newline before "output_body" here.
 977       print STDERR "output_body: ignoring <$tag> tag\n";
 978       $he->dump;
 979       return 0; }
 980
 981   return 1;
 982 }
 983
 984 sub print_pre ( $ )
 985 { my ($he_pre) = check_args(1, @_);
 986   if (!has_single_content_string($he_pre))
 987     { die "Multiple or non-string content for <PRE>: ", @{$he_pre->content}; }
 988   my $pre_content = $ {$he_pre->content}[0];
 989   print TEXI "\n\@example";
 990   print TEXI &texi_quote($pre_content);
 991   print TEXI "\@end example\n";
 992 }
 993
 994 sub table_columns ( $ )
 995 { my ($table) = check_args(1, @_);
 996   my $result = 0;
 997   for my $row (@{$table->content})
 998     { if ($row->tag ne "tr")
 999         { $table->dump;
1000           $row->dump;
1001           die "Expected <TR> as table row."; }
1002       $result = max($result, scalar(@{$row->content})); }
1003   return $result;
1004 }
1005
1006
1007 ###########################################################################
1008 ### Utilities
1009 ###
1010
# Return the numerically smaller of the two arguments (the second one when
# they compare equal).
sub min ( $$ )
{ my ($x, $y) = check_args(2, @_);
  if ($x < $y)
    { return $x; }
  return $y;
}
1015
# Return the numerically larger of the two arguments (the second one when
# they compare equal).
sub max ( $$ )
{ my ($x, $y) = check_args(2, @_);
  if ($x > $y)
    { return $x; }
  return $y;
}
1020
# Parse the HTML file named $file and return its cleaned-up parse tree
# (see cleanup_parse_tree).  Unknown tags are ignored by the parser.
# Dies if the file cannot be parsed, instead of silently returning an
# empty tree.
sub file_to_tree ( $ )
{ my ($file) = check_args(1, @_);

  my $tree = HTML::TreeBuilder->new;
  $tree->ignore_unknown(1);
  # $tree->warn(1);
  # parse_file returns undef when the file cannot be opened.
  if (!defined $tree->parse_file($file))
    { die "Cannot parse HTML file `$file': $!"; }
  cleanup_parse_tree($tree);
  return $tree;
}
1031
1032
# Return 1 if the element has exactly one child (text or element), else 0.
# Dies when handed something that is not a reference.
sub has_single_content ( $ )
{ my ($he) = check_args(1, @_);
  die "Non-reference argument: $he" unless ref $he;
  my $children = $he->content;
  return ((defined $children) && (scalar(@{$children}) == 1)) ? 1 : 0;
}
1046
1047
# True when the element has exactly one child, that child is itself an
# element, and the child's tag equals $tag.
sub has_single_content_with_tag ( $$ )
{ my ($he, $tag) = check_args(2, @_);
  return 0 unless has_single_content($he);
  my $only_child = $he->content->[0];
  my $child_tag = (ref $only_child) ? $only_child->tag : undef;
  return 0 unless defined $child_tag;
  return $child_tag eq $tag;
}
1062
# Return 1 when the element's only child is a plain text segment, else 0.
sub has_single_content_string ( $ )
{ my ($he) = check_args(1, @_);
  return 0 unless has_single_content($he);
  my $only_child = $he->content->[0];
  return (ref $only_child) ? 0 : 1;
}
1072
1073
# Return (name, href, content...) for an <A> element.  NAME and HREF may be
# undefined; the content is flattened into the tail of the returned list
# (empty when the anchor has no content).  Dies for non-anchor elements.
# I don't see how to determine if there are more attributes.
sub anchor_info ( $ )
{ my ($he) = check_args(1, @_);
  unless ($he->tag eq "a")
    { $he->dump;
      die "passed non-anchor to anchor_info"; }
  my ($name, $href) = map { scalar $he->attr($_) } ('name', 'href');
  my $children = $he->content;
  my @content = (defined $children) ? @{$children} : ();
  return ($name, $href, @content);
}
1089
1090
# Escape text for Texinfo: prefix each "@", "{" and "}" with "@", and turn
# a space-delimited " -- " into " --- ".
sub texi_quote ( $ )
{ my ($text) = check_args(1, @_);
  # Insert the escape character in front of every special character.
  (my $quoted = $text) =~ s/(?=[\@\{\}])/\@/g;
  $quoted =~ s/ -- / --- /g;
  return $quoted;
}
1097
# Strip from a section title the punctuation that confuses Makeinfo or
# Info.  In order: leading blanks, trailing blanks and colons, a leading
# section number such as "1.2 ", every comma, and every remaining colon.
sub texi_remove_punctuation ( $ )
{ my ($text) = check_args(1, @_);

  for ($text)
    { s/^ +//;
      s/[ :]+$//;
      s/^[1-9][0-9.]* +//;
      tr/,//d;
      # Both embedded colons and " -- " confuse makeinfo.  (Perhaps " -- "
      # gets converted into " - ", just as "---" would be converted into
      # " -- ", so the names end up differing.)  Hence colons are dropped
      # rather than replaced by " -- ".
      tr/://d; }
  return $text;
}
1113
1114
## Do not use this inside `traverse':  it throws off the traversal.  Use
## html_replace_by_ignore or html_replace_by_meta instead.
# Unlink $he from the content of $parent (default: the element's own
# parent) and clear the element's parent link.
# Returns 1 on success; dies if $he is not among the parent's children.
sub html_remove ( $;$ )
{ my ($he, $parent) = check_args_range(1, 2, @_);
  $parent = $he->parent unless defined $parent;
  my $siblings = $parent->content;
  my $count = scalar(@{$siblings});
  for my $pos (0 .. $count - 1)
    { next unless $siblings->[$pos] eq $he;
      splice @{$siblings}, $pos, 1;
      $he->parent(undef);
      return 1; }
  die "Didn't find $he in $parent";
}
1131
1132
# Put $new in the slot that $orig occupies in the content of $parent
# (default: $orig's own parent), and fix up both parent links.
# Returns 1 on success; dies if $orig is not among the parent's children.
sub html_replace ( $$;$ )
{ my ($orig, $new, $parent) = check_args_range(2, 3, @_);
  $parent = $orig->parent unless defined $parent;
  my $siblings = $parent->content;
  my $count = scalar(@{$siblings});
  for my $pos (0 .. $count - 1)
    { next unless $siblings->[$pos] eq $orig;
      $siblings->[$pos] = $new;
      $new->parent($parent);
      $orig->parent(undef);
      return 1; }
  die "Didn't find $orig in $parent";
}
1147
# Replace $orig, in $parent (default: its own parent), by a fresh empty
# <meta> element.  Returns the result of html_replace.
sub html_replace_by_meta ( $;$ )
{ my ($orig, $parent) = check_args_range(1, 2, @_);
  my $placeholder = HTML::Element->new("meta");
  my $container = (defined $parent) ? $parent : $orig->parent;
  return html_replace($orig, $placeholder, $container);
}
1155
# Replace $orig, in $parent (default: its own parent), by a fresh empty
# <ignore> element.  Returns the result of html_replace.
sub html_replace_by_ignore ( $;$ )
{ my ($orig, $parent) = check_args_range(1, 2, @_);
  my $placeholder = HTML::Element->new("ignore");
  my $container = (defined $parent) ? $parent : $orig->parent;
  return html_replace($orig, $placeholder, $container);
}
1163
1164
1165
1166 ###
1167 ### Collect text elements
1168 ###
1169
# State shared between collect_texts and its traversal callback.
my @collected_texts;            # text segments gathered so far
my $collect_texts_stoppoint;    # element at which to stop, if any
my $done_collecting;            # true once the stop point was reached

# Return the text segments found while traversing $root, in traversal
# order.  When $stop is given, collection ends as soon as that element is
# reached.
sub collect_texts ( $;$ )
{ my ($root, $stop) = check_args_range(1, 2, @_);
  # print STDERR "collect_texts: $root $stop\n";
  ($collect_texts_stoppoint, $done_collecting) = ($stop, 0);
  @collected_texts = ();
  $root->traverse(\&collect_if_text); # process texts
  # print STDERR "collect_texts => ", join(";;;", @collected_texts), "\n";
  return @collected_texts;
}
1184
# Traversal callback for collect_texts: record text segments, and latch
# $done_collecting when the stop element is reached.  Returns 1 only for
# elements that should still be descended into.
sub collect_if_text ( $$$ )
{ my $he = (check_args(3, @_))[0]; #  ignore depth and startflag arguments
  return 0 if $done_collecting || !defined $he;
  unless (ref $he)
    { push @collected_texts, $he;
      return 0; }
  my $at_stoppoint = (defined $collect_texts_stoppoint)
                     && ($he eq $collect_texts_stoppoint);
  return 1 unless $at_stoppoint;
  $done_collecting = 1;
  return 0;
}
1199
1200
1201 ###########################################################################
1202 ### Clean up parse tree
1203 ###
1204
# Run the clean-up passes over the tree, in a fixed order, each as a
# separate element-only traversal.  Returns the (modified) tree.
sub cleanup_parse_tree ( $ )
{ my ($he) = check_args(1, @_);
  my @passes = (\&delete_if_navigation,
                \&delete_extra_spaces,
                \&merge_dl,
                \&reorder_dt_and_dl);
  for my $pass (@passes)
    { $he->traverse($pass, 'ignore text'); }
  return $he;
}
1213
1214
1215 ## Simpler version that deletes contents but not the element itself.
1216 # sub delete_if_navigation ( $$$ )
1217 # { my $he = (check_args(3, @_))[0]; # ignore startflag and depth
1218 #   if (($he->tag() eq "div") && ($he->attr('class') eq 'navigation'))
1219 #     { $he->delete();
1220 #       return 0; }
1221 #   else
1222 #     { return 1; }
1223 # }
1224
# Traversal callback: unlink every <div class="navigation"> from its
# parent and delete it.  Returns 0 for such an element and 1 for any other
# one; returns nothing when called for the end of an element.
sub delete_if_navigation ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; #  ignore depth argument
  return unless $startflag;

  my $class = ($he->tag() eq "div") ? $he->attr('class') : undef;
  return 1 unless (defined $class) && ($class eq 'navigation');

  # Work on the parent's content through the reference; a copy of the
  # array would leave the tree itself unchanged.
  my $siblings = $he->parent()->content();
  my $count = scalar(@{$siblings});
  for my $pos (0 .. $count - 1)
    { if ($siblings->[$pos] eq $he)
        { splice(@{$siblings}, $pos, 1);
          last; } }
  $he->delete();
  return 0;
}
1243
# Traversal callback: drop blank-only text children of structural elements
# (head, html, table, tr, ul), then trim blank text segments as done by
# delete_trailing_spaces.  Returns 1 so the traversal continues downwards;
# returns nothing when called for the end of an element.
sub delete_extra_spaces ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; #  ignore depth argument
  return unless $startflag;

  delete_child_spaces($he) if $he->tag =~ /^(head|html|table|tr|ul)$/;
  delete_trailing_spaces($he);
  return 1;
}
1255
1256
# Remove, in place, every child of $he that is a text segment made up only
# of spaces (possibly none).  Element children are always kept.
# An element without content is left alone, as in delete_trailing_spaces;
# previously the undefined content was dereferenced.
sub delete_child_spaces ( $ )
{ my ($he) = check_args(1, @_);
  my $ref_content = $he->content();
  if (! defined $ref_content)
    { return; }
  # Assign through the reference so the element's own array is updated;
  # one filtering pass replaces the repeated splicing.
  @{$ref_content} = grep { (ref $_) || ($_ !~ /^ *$/) } @{$ref_content};
  return;
}
1265
# Remove blank text segments that sit directly before a block-level child
# (br, dd, dl, dt, hr, p, ul) and, for dd/dt/h1-h6/li/p elements, a blank
# text segment at the very end of the content.  The content array is
# edited in place; an element without content is left alone.
sub delete_trailing_spaces ( $ )
{ my ($he) = check_args(1, @_);
  my $kids = $he->content();
  return unless defined $kids;

  # Could also check for previous element = /^h[1-6]$/.
  my $pos = 0;
  while ($pos < scalar(@{$kids}) - 1)
    { my ($this, $next) = @{$kids}[$pos, $pos + 1];
      if (($this =~ /^ *$/)
          && (ref $next) && ($next->tag =~ /^(br|dd|dl|dt|hr|p|ul)$/))
        { # After a removal the same position holds a new candidate.
          splice(@{$kids}, $pos, 1);
          next; }
      $pos++; }

  # (The "^" inside the alternation is redundant but harmless.)
  return unless $he->tag =~ /^(dd|dt|^h[1-6]|li|p)$/;
  my $last = $kids->[-1];
  pop @{$kids} if (defined $last) && ($last =~ /^ *$/);
}
1283
1284
1285 # LaTeX2HTML sometimes creates
1286 #   <DT>text
1287 #   <DL COMPACT><DD>text
1288 # which should actually be:
1289 #   <DL COMPACT>
1290 #   <DT>text
1291 #   <DD>text
1292 # Since a <DL> gets added, this ends up looking like
1293 # <P>
1294 #   <DL>
1295 #     <DT>
1296 #       text1...
1297 #       <DL COMPACT>
1298 #         <DD>
1299 #           text2...
1300 #         dt_or_dd1...
1301 #     dt_or_dd2...
1302 # which should become
1303 # <P>
1304 #   <DL COMPACT>
1305 #     <DT>
1306 #       text1...
1307 #     <DD>
1308 #       text2...
1309 #     dt_or_dd1...
1310 #     dt_or_dd2...
1311
sub reorder_dt_and_dl ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; #  ignore depth argument
  return unless $startflag;

  # Every test below that fails means "not the pattern": keep traversing.

  # <P> whose first child is an implicit <DL> ...
  return 1 unless $he->tag() eq "p";
  my $p_kids = $he->content();
  return 1 unless defined $p_kids;
  my $outer_dl = $p_kids->[0];
  return 1 unless (ref $outer_dl) && ($outer_dl->tag() eq "dl")
                  && $outer_dl->implicit();

  # ... whose first child is a <DT> ...
  my $outer_kids = $outer_dl->content();
  return 1 unless defined $outer_kids;
  my $dt = $outer_kids->[0];
  return 1 unless (ref $dt) && ($dt->tag() eq "dt");

  # ... whose last child is a <DL> ...
  my $dt_kids = $dt->content();
  return 1 unless defined $dt_kids;
  my $inner_dl = $dt_kids->[-1];
  return 1 unless (ref $inner_dl) && ($inner_dl->tag() eq "dl");

  # ... whose first child is a <DD>.
  my $inner_kids = $inner_dl->content();
  return 1 unless defined $inner_kids;
  my @moved = @{$inner_kids};
  return 1 unless (scalar(@moved) > 0)
                  && (ref $moved[0]) && ($moved[0]->tag() eq "dd");

  # Hoist the inner list's items into the outer list, right after the
  # <DT>, and leave an <ignore> placeholder where the inner list was.
  html_replace_by_ignore($inner_dl);
  splice(@{$outer_kids}, 1, 0, @moved);
  return 0; # don't traverse children
}
1355
1356
# Traversal callback.  A paragraph whose content is exactly
# <P>
#   <HR>
#   <UL>
# is passed to process_child_links and then deleted; 0 is returned for it.
# Any other element returns 1.  Calls for the end of an element return
# nothing.
sub process_if_child_links ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; #  ignore depth argument
  return unless $startflag;

  return 1 unless $he->tag() eq "p";
  my $kids = $he->content();
  return 1 unless defined $kids;

  my $is_link_list = (scalar(@{$kids}) == 2)
                     && (ref $kids->[0]) && ($kids->[0]->tag() eq "hr")
                     && (ref $kids->[1]) && ($kids->[1]->tag() eq "ul");
  return 1 unless $is_link_list;

  process_child_links($he);
  $he->delete();
  return 0;
}
1379
1380
1381 # If we find
1382 #     <H4>
1383 #       "Footnotes"
1384 #     <DL>
1385 #       <DT>
1386 #         <A NAME="foot560">
1387 #           "...borrow"
1388 #         <A HREF="refcountsInPython.html#tex2html2" NAME="foot560">
1389 #           "1.2"
1390 #       <DD>
1391 #         "The metaphor of ``borrowing'' a reference is not completely correct: the owner still has a copy of the reference. "
1392 #       ...
1393 # then record the footnote information and delete the section and list.
1394
# True between the "Footnotes" <H4> and the <DL> that must follow it.
my $process_if_footnotes_expect_dl_next = 0;

# Traversal callback that harvests a "Footnotes" section.
# The <H4> "Footnotes" heading is replaced by an <ignore> element.  In the
# <DL> that follows, each <DT>/<DD> pair is checked (the <DT> must hold
# exactly two anchors carrying the same NAME), the <DD> is stored in
# %footnotes under that name, and finally the whole <DL> is replaced by an
# <ignore> element.
# Returns 0 for the heading and the list, 1 for any other element, and
# nothing when called for the end of an element.
# Dies if the heading is not followed by a <DL>, or if the list does not
# have the expected shape.
sub process_if_footnotes ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; #  ignore depth argument
  if (!$startflag)
    { return; }

  if (($he->tag() eq "h4")
      && has_single_content_string($he)
      && ($ {$he->content}[0] eq "Footnotes"))
    { html_replace_by_ignore($he);
      $process_if_footnotes_expect_dl_next = 1;
      return 0; }

  if ($process_if_footnotes_expect_dl_next && ($he->tag() eq "dl"))
    { my $ref_content = $he->content();
      if (defined $ref_content)
        { $process_if_footnotes_expect_dl_next = 0;
          my @content = @{$ref_content};
          # Walk the list in <DT>, <DD> pairs.
          for (my $i=0; $i<$#content; $i+=2)
            { my $he_dt = $content[$i];
              my $he_dd = $content[$i+1];
              # Stray text is rejected with the same diagnostic as a wrong
              # tag, not with a failed method call.
              if (!(ref $he_dt) || !(ref $he_dd)
                  || ($he_dt->tag ne "dt") || ($he_dd->tag ne "dd"))
                { $he->dump;
                  die "expected <DT> and <DD> at positions $i and ", $i+1; }
              my $ref_dt_content = $he_dt->content();
              my @dt_content = (defined $ref_dt_content) ? @{$ref_dt_content} : ();
              if ((scalar(@dt_content) != 2)
                  || !(ref $dt_content[0]) || ($dt_content[0]->tag ne "a")
                  || !(ref $dt_content[1]) || ($dt_content[1]->tag ne "a"))
                { $he_dt->dump;
                  die "Expected 2 anchors as content of <DT>"; }
              # Only the NAME of each anchor is needed.  The second name
              # must come from the SECOND anchor; reading the first anchor
              # twice made the comparison below always succeed.
              my ($dt1_name) = anchor_info($dt_content[0]);
              my ($dt2_name) = anchor_info($dt_content[1]);
              if (!defined $dt1_name || !defined $dt2_name
                  || ($dt1_name ne $dt2_name))
                { $he_dt->dump;
                  die "Expected identical names for anchors"; }
              html_replace_by_ignore($he_dd);
              $he_dd->tag("div"); # has no effect
              $footnotes{$dt1_name} = $he_dd; }
          html_replace_by_ignore($he);
          return 0; } }

  if ($process_if_footnotes_expect_dl_next)
    { $he->dump;
      die "Expected <DL> for footnotes next"; }

  return 1;
}
1444
1445
1446
1447 ## Merge two adjacent paragraphs containing <DL> items, such as:
1448 #     <P>
1449 #       <DL>
1450 #         <DT>
1451 #           ...
1452 #         <DD>
1453 #           ...
1454 #     <P>
1455 #       <DL>
1456 #         <DT>
1457 #           ...
1458 #         <DD>
1459 #           ...
1460
sub merge_dl ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; #  ignore depth argument
  return unless $startflag;

  my $kids = $he->content;
  return unless defined $kids;

  # A merge candidate is a <P> element whose only child is a <DL>.
  my $is_dl_paragraph = sub
    { my ($node) = @_;
      return (ref $node) && ($node->tag eq "p")
             && has_single_content_with_tag($node, "dl"); };

  for (my $i = 0; $i < scalar(@{$kids})-1; $i++)
    { my $first = $kids->[$i];
      next unless $is_dl_paragraph->($first);
      my $target_dl = $first->content->[0];
      # Absorb every directly following candidate.  The list shrinks with
      # each merge, so the bound is re-evaluated rather than $i advanced.
      while (($i < scalar(@{$kids})-1) && $is_dl_paragraph->($kids->[$i+1]))
        { my $absorbed = splice(@{$kids}, $i+1, 1);
          my $absorbed_dl = $absorbed->content->[0];
          $target_dl->push_content(@{$absorbed_dl->content}); }
      # Skip the following element as well: it is known not to be a
      # candidate.
      $i++; }
  return 1;
}
1492
1493
1494
1495 ###########################################################################
1496 ### Testing
1497 ###
1498
# Developer entry point for exercising parts of the converter on one file.
# Arguments:
#   $action - "view" or "": parse $file, clean the tree up, and dump it
#             "raw":        parse $file and dump the tree without clean-up
#             "section":    run process_section_file on $file
#             "index":      run process_index_file on $file, then
#                           print_index_info
#   $file   - name of the HTML file to work on
# Dies on any other action.  The "view" and "raw" actions return nothing;
# the other two return whatever their last call returns.
sub test ( $$ )
{ my ($action, $file) = check_args(2, @_);

  # General testing
  if (($action eq "view") || ($action eq ""))
    { # # $file = "/homes/gws/mernst/www/links.html";
      # # $file = "/homes/gws/mernst/www/index.html";
      # # $file = "/homes/fish/mernst/java/gud/doc/manual.html";
      # # $file = "/projects/cecil/cecil/doc/manuals/stdlib-man/stdlib/stdlib.html";
      # # $file = "/homes/fish/mernst/tmp/python-doc/html/index.html";
      # $file = "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html";
      my $tree = file_to_tree($file);

      ## Testing
      # print STDERR $tree->as_HTML;
      $tree->dump();

      # print STDERR $tree->tag(), "\n";
      # print STDERR @{$tree->content()}, "\n";
      #
      # for (@{ $tree->extract_links(qw(a img)) }) {
      #   my ($link, $linkelem) = @$_;
      #   print STDERR "$link ", $linkelem->as_HTML;
      #   }
      #
      # print STDERR @{$tree->extract_links()}, "\n";

      # my @top_level_elts = @{$tree->content()};

      # if scalar(@{$tree->content()})
      return;
    }

  # Same parser set-up as file_to_tree, but without cleanup_parse_tree.
  elsif ($action eq "raw")
    { my $tree = new HTML::TreeBuilder;
      $tree->ignore_unknown(1);
      # $tree->warn(1);
      $tree->parse_file($file);

      $tree->dump();

      # cleanup_parse_tree($tree);
      # $tree->dump();
      return;
    }

  # Test dealing with a section.
  elsif ($action eq "section")
    { # my $file;
      # $file = "/homes/fish/mernst/tmp/python-doc/html/api/intro.html";
      # $file = "/homes/fish/mernst/tmp/python-doc/html/api/includes.html";
      # $file = "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html";
      process_section_file($file, 0, "Title");
    }

  # Test dealing with many sections
  # (Disabled: the condition is the constant 0, so this branch never runs.
  # The paths below are hard-coded.)
  elsif (0)
    { my @files = ("/homes/fish/mernst/tmp/python-doc/html/api/about.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/abstract.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/api.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/cObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/concrete.html",
                   # "/homes/fish/mernst/tmp/python-doc/html/api/contents.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/countingRefs.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/debugging.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/dictObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/embedding.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/exceptionHandling.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/exceptions.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/fileObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/floatObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/front.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/fundamental.html",
                   # "/homes/fish/mernst/tmp/python-doc/html/api/genindex.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/importing.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/includes.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/index.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/initialization.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/intObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/intro.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/listObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/longObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/mapObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/mapping.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/newTypes.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/node24.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/noneObject.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/number.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/numericObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/object.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/objects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/os.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/otherObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/processControl.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/refcountDetails.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/refcounts.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/sequence.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/sequenceObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/standardExceptions.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/stringObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/threads.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/tupleObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/typeObjects.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/types.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/utilities.html",
                   "/homes/fish/mernst/tmp/python-doc/html/api/veryhigh.html");
      for my $file (@files)
        { print STDERR "\n", "=" x 75, "\n", "$file:\n";
          process_section_file($file, 0, "Title");
        }
    }

  # Test dealing with index.
  elsif ($action eq "index")
    { # my $file;
      # $file = "/homes/fish/mernst/tmp/python-doc/html/api/genindex.html";

      process_index_file($file, "\@cindex");
      print_index_info();
    }

  else
    { die "Unrecognized action `$action'"; }
}
1624
1625
1626 ###########################################################################
1627 ### Main loop
1628 ###
1629
# process_contents_file(FILE)
#
# Top-level driver: convert the HTML tree whose main/contents page is FILE
# into a single Texinfo file written to the current directory.
#
#   * The output base name is FILE with a trailing ".html" (and a trailing
#     "index" path component, if any) removed and leading directories
#     stripped; if nothing is left (FILE was just "index.html"), the name
#     of the current working directory is used instead.  The Texinfo file
#     is "<base>.texi" and "<base>" is written as the @setfilename.
#   * Sets the global $html_directory to the directory part of FILE
#     (including the trailing slash; empty if FILE has no directory part).
#   * Processes FILE as the "Top" section, then drains the global
#     @contents_list, calling process_section_file on each entry.
#     (Presumably process_section_file is what appends to @contents_list;
#     that code is not in this part of the file -- verify.)
#   * Finally emits one node per entry of @index_titles, then @contents
#     and @bye.
#
# Dies if the Texinfo file cannot be opened for writing or cannot be
# closed cleanly.
sub process_contents_file ( $ )
{ my ($file) = check_args(1, @_);

  # could also use File::Basename
  my $info_file = $file;
  $info_file =~ s/(\/?index)?\.html$//;
  if ($info_file eq "")
    { chomp($info_file = `pwd`); }
  $info_file =~ s/^.*\///;      # not the most efficient way to remove dirs

  # Everything up to and including the last slash of FILE.
  $html_directory = $file;
  $html_directory =~ s/(\/|^)[^\/]+$/$1/;

  # TEXI stays a bareword (global) handle so that code outside this sub can
  # print to it.  Use three-argument open so that odd characters in the file
  # name cannot be mistaken for an open mode, and fail loudly rather than
  # silently producing no output.
  my $texi_file = "$info_file.texi";
  open(TEXI, '>', $texi_file)
    or die "Cannot open $texi_file for writing: $!";

  print TEXI "\\input texinfo   \@c -*-texinfo-*-\n";
  print TEXI "\@c %**start of header\n";
  print TEXI "\@setfilename $info_file\n";

  # The commented excerpts below are quoted from the Texinfo manual and
  # describe the standard segments of a Texinfo file.  This sub itself emits
  # nothing for segments 2 and 3.

  # 2. Summary Description and Copyright
  #      The "Summary Description and Copyright" segment describes the
  #      document and contains the copyright notice and copying permissions
  #      for the Info file.  The segment must be enclosed between `@ifinfo'
  #      and `@end ifinfo' commands so that the formatters place it only in
  #      the Info file.
  #
  # The summary description and copyright segment does not appear in the
  # printed document.
  #
  #      @ifinfo
  #      This is a short example of a complete Texinfo file.
  #
  #      Copyright @copyright{} 1990 Free Software Foundation, Inc.
  #      @end ifinfo


  # 3. Title and Copyright
  #      The "Title and Copyright" segment contains the title and copyright
  #      pages and copying permissions for the printed manual.  The segment
  #      must be enclosed between `@titlepage' and `@end titlepage'
  #      commands.  The title and copyright page appear only in the printed
  #      manual.
  #
  # The titlepage segment does not appear in the Info file.
  #
  #      @titlepage
  #      @sp 10
  #      @comment The title is printed in a large font.
  #      @center @titlefont{Sample Title}
  #
  #      @c The following two commands start the copyright page.
  #      @page
  #      @vskip 0pt plus 1filll
  #      Copyright @copyright{} 1990 Free Software Foundation, Inc.
  #      @end titlepage


  # 4. `Top' Node and Master Menu
  #      The "Master Menu" contains a complete menu of all the nodes in the
  #      whole Info file.  It appears only in the Info file, in the `Top'
  #      node.
  #
  # The `Top' node contains the master menu for the Info file.  Since a
  # printed manual uses a table of contents rather than a menu, the master
  # menu appears only in the Info file.
  #
  #      @node    Top,       First Chapter, ,         (dir)
  #      @comment node-name, next,          previous, up
  #
  #      @menu
  #      * First Chapter::    The first chapter is the
  #                           only chapter in this sample.
  #      * Concept Index::    This index has two entries.
  #      @end menu



  # Each work item is an array ref whose elements are passed to
  # process_section_file as (element 2, element 1, element 0); element 0 is
  # the node title and element 2 the HTML file.  Element 1 is presumably the
  # section depth -- confirm against process_section_file.
  # Seed the current item from $file (the page actually being processed)
  # rather than $ARGV[0], which is a different string whenever this sub is
  # not called with the first command-line argument (e.g. under "-test").
  $current_ref_tdf = [ "Top", 0, $file ];
  process_section_file($file, 0, "Top");
  while (scalar(@contents_list))
  { $current_ref_tdf = shift @contents_list;
    process_section_file($ {$current_ref_tdf}[2], $ {$current_ref_tdf}[1], $ {$current_ref_tdf}[0]);
  }

  # One unnumbered node per index; element 1 of the index's %index_info
  # entry is what gets passed to @printindex.
  print TEXI "\n";
  for my $indextitle (@index_titles)
    { print TEXI "\@node $indextitle\n";
      print TEXI "\@unnumbered $indextitle\n";
      print TEXI "\@printindex $ {$index_info{$indextitle}}[1]\n";
      print TEXI "\n"; }

  print TEXI "\@contents\n";
  print TEXI "\@bye\n";
  # Buffered write errors (e.g. disk full) only surface at close time.
  close(TEXI)
    or die "Error closing $texi_file: $!";
}
1726
# Command-line entry point.
# This needs to be last so global variable initializations are reached.
#
# Usage:
#   html2texi.pl FILE                    convert the tree rooted at FILE
#   html2texi.pl -test [[ACTION] FILE]   call test(ACTION, FILE) and exit;
#                                        ACTION defaults to "" and FILE to
#                                        "index.html"

die "No arguments supplied to html2texi.pl"
  unless (scalar(@ARGV) > 0);

if ($ARGV[0] eq "-test")
{ # Everything after the "-test" flag.
  my @test_args = @ARGV;
  shift @test_args;

  die "Too many test arguments passed to html2texi: ", join(" ", @ARGV)
    if (scalar(@test_args) > 2);

  # Arguments are optional from the left: a lone argument is the file,
  # and the action comes first only when both are given.
  my $test_file = scalar(@test_args) ? pop(@test_args) : "index.html";
  my $test_action = scalar(@test_args) ? pop(@test_args) : "";
  test($test_action, $test_file);
  exit();
}

die "Pass one argument, the main/contents page"
  unless (scalar(@ARGV) == 1);

process_contents_file($ARGV[0]);
1749
1750 # end of html2texi.pl