tools/auto_index/src/auto_index.cpp

   1 // Copyright 2008 John Maddock
   2 //
   3 // Use, modification and distribution are subject to the
   4 // Boost Software License, Version 1.0.
   5 // (See accompanying file LICENSE_1_0.txt
   6 // or copy at http://www.boost.org/LICENSE_1_0.txt)
   7
   8 #include <set>
   9 #include <cstring>
  10 #include <boost/array.hpp>
  11 #include <boost/exception/all.hpp>
  12 #include <boost/program_options.hpp>
  13 #include "auto_index.hpp"
  14
  15 std::string infile, outfile, prefix, last_primary, last_secondary, last_tertiary;
  16 std::set<index_info> index_terms;
  17 std::set<std::pair<std::string, std::string> > found_terms;
  18 bool no_duplicates = false;
  19 bool verbose = false;
  20 bool use_section_names = true;
  21 index_entry_set index_entries;
  22 boost::tiny_xml::element_list indexes;
  23 std::list<id_rewrite_rule> id_rewrite_list;
  24 bool internal_indexes = false;
  25 std::string internal_index_type = "section";
  26 boost::regex debug;
  27 file_scanner_set_type file_scanner_set;
  28
  29 int help()
  30 {
  31    std::cout << "Please refer to the documentation for the correct command line syntax" << std::endl;
  32    return 1;
  33 }
  34
  35 void eat_block(std::string& result, std::istream & is)
  36 {
  37    //
  38    // everything until we get to a closing '>':
  39    //
  40    char c;
  41    while(is.get(c) && c != '>')
  42    {
  43       result += c;
  44       if(c == '\\')
  45       {
  46          is.get(c);
  47          result += c;
  48       }
  49    }
  50    result += c;
  51 }
  52
  53 std::string get_header(std::istream & is)
  54 {
  55    //
  56    // We need to get any leading <? and <! elements:
  57    //
  58    std::string result;
  59    is >> std::ws;
  60    if(is.get() != '<')
  61       throw std::runtime_error("Invalid leading markup in XML file found");
  62    char c = is.peek();
  63    while((c == '?') || (c == '!'))
  64    {
  65       std::string temp;
  66       std::getline(is, temp, '>');
  67       result += '<' + temp + '>';
  68       is >> std::ws;
  69       if(is.get() != '<')
  70          throw std::runtime_error("Invalid leading markup in XML file found");
  71       c = is.peek();
  72       result += '\n';
  73    }
  74    return result;
  75 }
  76 //
  77 // Find attribute named "name" in node "node":
  78 //
  79 const std::string* find_attr(boost::tiny_xml::element_ptr node, const char* name)
  80 {
  81    for(boost::tiny_xml::attribute_list::const_iterator i = node->attributes.begin();
  82       i != node->attributes.end(); ++i)
  83    {
  84       if(i->name == name)
  85          return &(i->value);
  86    }
  87    return 0;
  88 }
  89 //
  90 // Get the ID of the current block scope, basically
  91 // move up the XML tree until we find a valid ID:
  92 //
  93 const std::string* get_current_block_id(node_id const* id)
  94 {
  95    while((id->id == 0) && (id->prev))
  96       id = id->prev;
  97    if(!id->id)
  98       BOOST_THROW_EXCEPTION(std::runtime_error("Current XML block has no enclosing ID: XML is not valid Boostbook?"));
  99    return id->id;
 100 }
 101 //
 102 // Get the title of the current block scope, basically
 103 // move up the XML tree until we find a valid title:
 104 //
 105 const std::string& get_current_block_title(title_info const* id)
 106 {
 107    while((id->title.size() == 0) && (id->prev))
 108       id = id->prev;
 109    return id->title;
 110 }
 111 //
 112 // Get all the content under this node, with any inline XML
 113 // stripped out:
 114 //
 115 std::string get_consolidated_content(boost::tiny_xml::element_ptr node)
 116 {
 117    std::string result(node->content);
 118    for(boost::tiny_xml::element_list::const_iterator i = node->elements.begin();
 119       i != node->elements.end(); ++i)
 120    {
 121       result += " ";
 122       result += get_consolidated_content(*i);
 123    }
 124    static const boost::regex e("(^[[:space:]]+)|([[:space:]]+)|([[:space:]]+$)");
 125    return regex_replace(result, e, "(?2 )", boost::regex_constants::format_all);
 126 }
 127 //
 128 // Rewrite a title based on any rewrite rules we may have:
 129 //
 130 std::string rewrite_title(const std::string& title, const std::string& id)
 131 {
 132    for(std::list<id_rewrite_rule>::const_iterator i = id_rewrite_list.begin(); i != id_rewrite_list.end(); ++i)
 133    {
 134       if(i->base_on_id)
 135       {
 136          if(regex_match(id, i->id))
 137             return i->new_name;
 138       }
 139       else
 140       {
 141          if(regex_match(title, i->id))
 142             return regex_replace(title, i->id, i->new_name);
 143       }
 144    }
 145    return title;
 146 }
 147
 148 struct string_cmp
 149 {
 150    bool operator()(const char* a, const char* b)const
 151    {
 152       return std::strcmp(a, b) < 0;
 153    }
 154 };
 155 //
 156 // Discover whether this node can contain a <title> or not, if not
 157 // we don't want to link to it, or the XSL HTML stylesheets may do strange
 158 // things, and at least emit copious messages.  See https://sourceforge.net/tracker/?func=detail&aid=3325153&group_id=21935&atid=373747
 159 //
 160 bool can_contain_title(const char* name)
 161 {
 162    static const boost::array<const char*, 103> names =
 163    { {
 164       "abstract", "appendix", "appendixinfo", "article", "articleinfo", "authorblurb", "bibliodiv", "biblioentry", "bibliography",
 165        "bibliographyinfo", "bibliolist", "bibliomixed", "bibliomset", "biblioset", "blockinfo", "blockquote", "book", "bookinfo",
 166        "calloutlist", "caution", "chapter", "chapterinfo", "colophon", "constraintdef", "dedication", "equation", "example", "figure",
 167        "formalpara", "glossary", "glossaryinfo", "glossdiv", "glosslist", "important", "index", "indexdiv", "indexinfo", "itemizedlist",
 168        "legalnotice", "lot", "msg", "msgexplan", "msgmain", "msgrel", "msgset", "msgsub", "note", "objectinfo", "orderedlist", "part",
 169        "partinfo", "partintro", "personblurb", "preface", "prefaceinfo", "procedure", "productionset", "qandadiv", "qandaset",
 170        "refentryinfo", "reference", "referenceinfo", "refsect1", "refsect1info", "refsect2", "refsect2info", "refsect3", "refsect3info",
 171        "refsection", "refsectioninfo", "refsynopsisdiv", "refsynopsisdivinfo", "sect1", "sect1info", "sect2", "sect2info", "sect3",
 172        "sect3info", "sect4", "sect4info", "sect5", "sect5info", "section", "sectioninfo", "segmentedlist", "set", "setindex",
 173        "setindexinfo", "setinfo", "sidebar", "sidebarinfo", "simplesect", "step", "table", "task", "taskprerequisites",
 174        "taskrelated", "tasksummary", "tip", "toc", "variablelist", "warning", "refentry"
 175    } };
 176    static std::set<const char*, string_cmp> permitted;
 177
 178    if(permitted.empty())
 179       permitted.insert(names.begin(), names.end());
 180
 181    return 0 != permitted.count(name);
 182 }
 183 //
 184 // Determine whether this node can contain an indexterm or not:
 185 //
 186 bool can_contain_indexterm(const char* name)
 187 {
 188    static const boost::array<const char*, 257> names =
 189    { {
 190       "abbrev", "accel", "ackno", "acronym", "action", "answer", "appendix", "appendixinfo", "application",
 191       "article", "articleinfo", "artpagenums", "attribution", "authorinitials", "bibliocoverage", "bibliodiv",
 192       "biblioentry", "bibliography", "bibliographyinfo", "biblioid", "bibliomisc", "bibliomixed", "bibliomset",
 193       "bibliorelation", "biblioset", "bibliosource", "blockinfo", "blockquote", "bookinfo", "bridgehead", "callout",
 194       "caution", "chapter", "chapterinfo", "citation", "citebiblioid", "citetitle", "city", "classname", "classsynopsisinfo",
 195       "code", "collabname", "command", "computeroutput", "confdates", "confnum", "confsponsor", "conftitle", "constant",
 196       "constraintdef", "contractnum", "contractsponsor", "contrib", "corpauthor", "corpcredit", "corpname", "country",
 197       "database", "date", "dedication", "edition", "email", "emphasis", "entry", "envar", "errorcode", "errorname", "errortext",
 198       "errortype", "example", "exceptionname", "fax", "figure", "filename", "firstname", "firstterm", "foreignphrase",
 199       "formalpara", "funcparams", "funcsynopsisinfo", "function", "glossary", "glossaryinfo", "glossdef", "glossdiv",
 200       "glossentry", "glosssee", "glossseealso", "glossterm", "guibutton", "guiicon", "guilabel", "guimenu", "guimenuitem",
 201       "guisubmenu", "hardware", "highlights", "holder", "honorific", "important", "index", "indexinfo", "informalexample",
 202       "informalfigure", "initializer", "interface", "interfacename", "invpartnumber", "isbn", "issn", "issuenum", "itemizedlist",
 203       "itermset", "jobtitle", "keycap", "keycode", "keysym", "label", "legalnotice", "lineage", "lineannotation",
 204       /*"link", */"listitem", "literal", "literallayout", "lotentry", "manvolnum", "markup", "medialabel", "member",
 205       "methodname", "modespec", "modifier", "mousebutton", "msgaud", "msgexplan", "msglevel", "msgorig", "msgtext", "note",
 206       "objectinfo", "olink", "option", "optional", "orderedlist", "orgdiv", "orgname", "otheraddr", "othername", "package",
 207       "pagenums", "para", "parameter", "partinfo", "partintro", "phone", "phrase", "pob", "postcode", "preface", "prefaceinfo",
 208       "procedure", "productname", "productnumber", "programlisting", "prompt", "property", "pubdate", "publishername",
 209       "pubsnumber", "qandadiv", "qandaset", "question", "quote", "refentry", "refentryinfo", "refentrytitle", "referenceinfo",
 210       "refmeta", "refmiscinfo", "refpurpose", "refsect1", "refsect1info", "refsect2", "refsect2info", "refsect3", "refsect3info",
 211       "refsection", "refsectioninfo", "refsynopsisdiv", "refsynopsisdivinfo", "releaseinfo", "remark", "returnvalue",
 212       "revdescription", "revnumber", "revremark", "screen", "screeninfo", "sect1", "sect1info", "sect2", "sect2info", "sect3",
 213       "sect3info", "sect4", "sect4info", "sect5", "sect5info", "section", "sectioninfo", "seg", "segtitle", "seriesvolnums",
 214       "setindex", "setindexinfo", "setinfo", "sgmltag", "shortaffil", "sidebar", "sidebarinfo", "simpara", "simplesect",
 215       "state", "step", "street", "structfield", "structname", "subtitle", "surname", "symbol", "synopsis", "systemitem",
 216       "table", "task", "taskprerequisites", "taskrelated", "tasksummary", "td", "term", "termdef", "th", "tip", /*"title",*/
 217       "titleabbrev", "tocback", "tocentry", "tocfront", "token", "type", "ulink", "uri", "userinput", "variablelist",
 218       "varname", "volumenum", "warning", "wordasword", "year"
 219    } };
 220    static std::set<const char*, string_cmp> permitted;
 221
 222    if(permitted.empty())
 223       permitted.insert(names.begin(), names.end());
 224
 225    return 0 != permitted.count(name);
 226 }
 227 //
 228 // Decide whether to flatten this node for searching purposes:
 229 //
 230 bool should_flatten_node(const char* name)
 231 {
 232    //
 233    // The list of nodes to flatten is basically the list of elements that
 234    // can appear inside a <section> - see http://www.docbook.org/tdg/en/html/section.html.
 235    // In other words basically anything at the level of a paragraph/table/listing etc.
 236    //
 237    static const boost::array<const char*, 57> names =
 238    { {
 239       "title", "subtitle", "titleabbrev",
 240       "toc", "lot", "glossary", "bibliography",
 241       /*"calloutlist", "glosslist", "bibliolist", "itemizedlist", "orderedlist",
 242       "segmentedlist", "simplelist", "variablelist",*/ "caution", "important", "note",
 243       "tip", "warning", "literallayout", "programlisting", "programlistingco",
 244       "screen", "screenco", "screenshot", "synopsis", "cmdsynopsis", "funcsynopsis",
 245       "classsynopsis", "fieldsynopsis", "constructorsynopsis",
 246       "destructorsynopsis", "methodsynopsis", "formalpara", "para", "simpara",
 247       "address", "blockquote", "graphic", "graphicco", "mediaobject",
 248       "mediaobjectco", "informalequation", "informalexample", "informalfigure",
 249       "informaltable", "equation", "example", "figure", "table", "msgset", "procedure",
 250       "sidebar", "qandaset", "task", "productionset", "constraintdef", "anchor",
 251       "bridgehead", "remark", "highlights", "abstract", "authorblurb", "epigraph"
 252       /*"biblioentry", "bibliomixed", "callout", "glossentry", "listitem", "seg", "seglistitem", "member",
 253       "term", */
 254    } };
 255    static std::set<const char*, string_cmp> terminals;
 256
 257    if(terminals.empty())
 258          terminals.insert(names.begin(), names.end());
 259    return 0 != terminals.count(name);
 260 }
 261 std::string unescape_xml(const std::string& s)
 262 {
 263    boost::regex e("&(?:(quot)|(amp)|(apos)|(lt)|(gt));");
 264    return regex_replace(s, e, "(?1\")(?2&)(?3\')(?4<)(?5>)", boost::regex_constants::format_all);
 265 }
 266 //
 267 // Exception classes to propagate processing instruction info:
 268 //
 269 struct ignore_section{};
 270 struct ignore_block{};
 271 //
 272 // Check if we're in a section (or chapter etc) or not:
 273 //
 274 bool is_section(const std::string& name)
 275 {
 276    static const boost::array<const char*, 19> data =
 277    {{
 278       "dedication", "toc", "lot", "glossary", "bibliography", "preface", "chapter",
 279       "reference", "part", "article", "appendix", "index", "setindex", "colophon",
 280       "sect1", "refentry", "simplesect", "section", "partintro"
 281    }};
 282    std::set<std::string> names;
 283    if(names.empty())
 284       names.insert(data.begin(), data.end());
 285    return 0 != names.count(name);
 286 }
 287 //
 288 // Check if we're in a block/paragraph or not:
 289 //
 290 bool is_block(const std::string& name)
 291 {
 292    static const boost::array<const char*, 58> data =
 293    {{
 294       "calloutlist", "glosslist", "bibliolist", "itemizedlist", "orderedlist",
 295       "segmentedlist", "simplelist", "variablelist", "caution", "important", "note",
 296       "tip", "warning", "literallayout", "programlisting", "programlistingco",
 297       "screen", "screenco", "screenshot", "synopsis", "cmdsynopsis", "funcsynopsis",
 298       "classsynopsis", "fieldsynopsis", "constructorsynopsis",
 299       "destructorsynopsis", "methodsynopsis", "formalpara", "para", "simpara",
 300       "address", "blockquote", "graphic", "graphicco", "mediaobject",
 301       "mediaobjectco", "informalequation", "informalexample", "informalfigure",
 302       "informaltable", "equation", "example", "figure", "table", "msgset", "procedure",
 303       "sidebar", "qandaset", "task", "productionset", "constraintdef", "anchor",
 304       "bridgehead", "remark", "highlights", "abstract", "authorblurb", "epigraph"
 305    }};
 306    std::set<std::string> names;
 307    if(names.empty())
 308       names.insert(data.begin(), data.end());
 309    return 0 != names.count(name);
 310 }
 311 //
 312 // Helper proc to recurse through children:
 313 //
 314 void process_node(boost::tiny_xml::element_ptr node, node_id* prev, title_info* pt, bool seen);
 315 bool recurse_through_children(boost::tiny_xml::element_ptr node, node_id* id, title_info* pt, bool seen)
 316 {
 317    try
 318    {
 319       for(boost::tiny_xml::element_list::const_iterator i = node->elements.begin();
 320          i != node->elements.end(); ++i)
 321       {
 322          process_node(*i, id, pt, seen);
 323       }
 324    }
 325    catch(const ignore_section&)
 326    {
 327       if(is_section(node->name))
 328          return false;
 329       else
 330          throw;
 331    }
 332    catch(const ignore_block&)
 333    {
 334       if(is_block(node->name) || is_section(node->name))
 335          return false;
 336       else
 337          throw;
 338    }
 339    return true;
 340 }
 341 //
 342 // This does most of the work: process the node pointed to, and any children
 343 // that it may have:
 344 //
 345 void process_node(boost::tiny_xml::element_ptr node, node_id* prev, title_info* pt, bool seen = false)
 346 {
 347    //
 348    // Store the current ID and title as nested scoped objects:
 349    //
 350    node_id id = { 0, prev };
 351    if(can_contain_title(node->name.c_str()))
 352    {
 353       // Only set the ID to link to if the block can contain a title, see
 354       // can_contain_title above for rationale.
 355       id.id = find_attr(node, "id");
 356    }
 357    title_info title = { "", pt};
 358    bool flatten = should_flatten_node(node->name.c_str());
 359
 360    if(node->name.size() && node->name[0] == '?')
 361    {
 362       if(node->name == "?BoostAutoIndex")
 363       {
 364          if(node->content == "IgnoreSection")
 365          {
 366             throw ignore_section();
 367          }
 368          else if(node->content == "IgnoreBlock")
 369          {
 370             throw ignore_block();
 371          }
 372       }
 373       return; // Ignore processing instructions
 374    }
 375    else if((node->name == "title") && (id.prev->id))
 376    {
 377       //
 378       // This actually sets the title of the enclosing scope,
 379       // not this tag itself:
 380       //
 381       title.prev->title = get_consolidated_content(node);
 382       if(verbose)
 383          std::cout << "Indexing section: " << title.prev->title << std::endl;
 384    }
 385    else if((node->name == "refentrytitle") && (id.prev->prev->id))
 386    {
 387       //
 388       // This actually sets the title of the enclosing refentry scope,
 389       // not this tag itself:
 390       //
 391       title.prev->prev->title = get_consolidated_content(node);
 392       if(verbose)
 393          std::cout << "Indexing refentry: " << title.prev->prev->title << std::endl;
 394    }
 395    if(node->name == "anchor")
 396    {
 397       if(node->parent.lock()->name == "title")
 398       {
 399          // We have a title with a nested anchor ID, change the ID of our parents parent to match:
 400          id.prev->prev->id = id.id;
 401       }
 402    }
 403    else if(node->name == "index")
 404    {
 405       // Keep track of all the indexes we see:
 406       indexes.push_back(node);
 407       if(node->parent.lock()->name == "para")
 408          node->parent.lock()->name = "";
 409    }
 410    else if(node->name == "primary")
 411    {
 412       last_primary = get_consolidated_content(node);
 413    }
 414    else if(node->name == "secondary")
 415    {
 416       last_secondary = get_consolidated_content(node);
 417    }
 418    else if(node->name == "tertiary")
 419    {
 420       last_tertiary = get_consolidated_content(node);
 421    }
 422    else if((node->name == "see") && internal_indexes)
 423    {
 424       std::cerr << "WARNING: <see> in XML source will be ignored for the index generation" << std::endl;
 425    }
 426    else if((node->name == "seealso") && internal_indexes)
 427    {
 428       std::cerr << "WARNING: <seealso> in XML source will be ignored for the index generation" << std::endl;
 429    }
 430
 431    std::string flattenned_text;
 432    const std::string* ptext;
 433    if(flatten)
 434    {
 435       flattenned_text = unescape_xml(get_consolidated_content(node));
 436       ptext = &flattenned_text;
 437       //
 438       // Recurse through children here if we're going to flatten the text, that way we see any processing instructions first:
 439       //
 440       if(!recurse_through_children(node, &id, &title, flatten || seen))
 441          return;
 442    }
 443    else
 444    {
 445       ptext = &(node->content);
 446    }
 447
 448    //
 449    // Search content for items: we only search if the content is not empty,
 450    // and the content is not whitespace alone, and we haven't already searched this
 451    // text in one of our parent nodes that got flattened.
 452    //
 453    static const boost::regex space_re("[[:space:]]+");
 454    if(!seen && ptext->size() && !regex_match(*ptext, space_re))
 455    {
 456       // Save block ID and title in case we find some hits:
 457       const std::string* pid = get_current_block_id(&id);
 458       const std::string& rtitle = get_current_block_title(&title);
 459       const std::string simple_title = rewrite_title(rtitle, *pid);
 460       // Scan for each index term:
 461       for(std::set<index_info>::const_iterator i = index_terms.begin();
 462             i != index_terms.end(); ++i)
 463       {
 464          if(regex_search(*ptext, i->search_text))
 465          {
 466             //
 467             // We need to check to see if this term has already been indexed
 468             // in this zone, in order to prevent duplicate entries, also check
 469             // that any constrait placed on the term's ID is satisfied:
 470             //
 471             std::pair<std::string, std::string> item_index(*pid, i->term);
 472             if(((no_duplicates == false) || (0 == found_terms.count(item_index)))
 473                && (i->search_id.empty() || regex_match(*pid, i->search_id)))
 474             {
 475                // We have something to index!
 476                found_terms.insert(item_index);
 477
 478                if(!debug.empty() && (regex_match(i->term, debug) || regex_match(rtitle, debug) || regex_match(simple_title, debug)))
 479                {
 480                   std::cout << "Debug term found, in block with ID: " << *pid << std::endl;
 481                   std::cout << "Current section title is: " << rtitle << std::endl;
 482                   std::cout << "The main index entry will be : " << simple_title << std::endl;
 483                   std::cout << "The indexed term is: " << i->term << std::endl;
 484                   std::cout << "The search regex is: " << i->search_text << std::endl;
 485                   std::cout << "The section constraint is: " << i->search_id << std::endl;
 486                   std::cout << "The index type for this entry is: " << i->category << std::endl;
 487                }
 488
 489                if(use_section_names && (simple_title != i->term))
 490                {
 491                   //
 492                   // First off insert index entry with primary term
 493                   // consisting of the section title, and secondary term the
 494                   // actual index term, this gets skipped if the title and index
 495                   // term are the same:
 496                   //
 497                   if(internal_indexes == false)
 498                   {
 499                      // Insert an <indexterm> into the XML:
 500                      boost::tiny_xml::element_ptr p(new boost::tiny_xml::element());
 501                      p->name = "indexterm";
 502                      boost::tiny_xml::element_ptr prim(new boost::tiny_xml::element());
 503                      prim->name = "primary";
 504                      prim->elements.push_front(boost::tiny_xml::element_ptr(new boost::tiny_xml::element()));
 505                      prim->elements.front()->content = simple_title;
 506                      p->elements.push_front(prim);
 507
 508                      boost::tiny_xml::element_ptr sec(new boost::tiny_xml::element());
 509                      sec->name = "secondary";
 510                      sec->elements.push_front(boost::tiny_xml::element_ptr(new boost::tiny_xml::element()));
 511                      sec->elements.front()->content = i->term;
 512                      p->elements.push_back(sec);
 513                      try{
 514                         // Insert the Indexterm:
 515                         boost::tiny_xml::element_ptr parent(node->parent);
 516                         while(!can_contain_indexterm(parent->name.c_str()))
 517                            parent = parent->parent.lock();
 518                         parent->elements.push_front(p);
 519                      }
 520                      catch(const std::exception&)
 521                      {
 522                         std::cerr << "Unable to find location to insert <indexterm>" << std::endl;
 523                      }
 524                   }
 525                   // Track the entry in our internal index:
 526                   index_entry_ptr item1(new index_entry(simple_title));
 527                   index_entry_ptr item2(new index_entry(i->term, *pid));
 528                   index_entry_set::iterator pos = index_entries.insert(item1).first;
 529                   (**pos).sub_keys.insert(item2);
 530                }
 531                //
 532                // Now insert another index entry with the index term
 533                // as the primary key, and the section title as the
 534                // secondary key, this one gets assigned to the
 535                // appropriate index category if there is one:
 536                //
 537                bool preferred_term = false;
 538                if(internal_indexes == false)
 539                {
 540                   // Insert <indexterm> into the XML:
 541                   boost::tiny_xml::element_ptr p2(new boost::tiny_xml::element());
 542                   p2->name = "indexterm";
 543                   if(i->category.size())
 544                   {
 545                      p2->attributes.push_back(boost::tiny_xml::attribute("type", i->category));
 546                   }
 547                   boost::tiny_xml::element_ptr prim2(new boost::tiny_xml::element());
 548                   prim2->name = "primary";
 549                   prim2->elements.push_front(boost::tiny_xml::element_ptr(new boost::tiny_xml::element()));
 550                   prim2->elements.front()->content = i->term;
 551                   p2->elements.push_front(prim2);
 552
 553                   boost::tiny_xml::element_ptr sec2(new boost::tiny_xml::element());
 554                   sec2->name = "secondary";
 555                   sec2->elements.push_front(boost::tiny_xml::element_ptr(new boost::tiny_xml::element()));
 556                   sec2->elements.front()->content = rtitle;
 557                   p2->elements.push_back(sec2);
 558                   try{
 559                      // Insert the Indexterm:
 560                      boost::tiny_xml::element_ptr parent(node->parent);
 561                      while(!can_contain_indexterm(parent->name.c_str()))
 562                      {
 563                         // If the search text was found in a title then make it a preferred term:
 564                         if(parent->name == "title")
 565                            preferred_term = true;
 566                         parent = parent->parent.lock();
 567                      }
 568                      if(preferred_term)
 569                      {
 570                         boost::tiny_xml::attribute a("significance", "preferred");
 571                         p2->attributes.push_back(a);
 572                      }
 573                      parent->elements.push_front(p2);
 574                   }
 575                   catch(const std::exception&)
 576                   {
 577                      std::cerr << "Unable to find location to insert <indexterm>" << std::endl;
 578                   }
 579                }
 580
 581                // Track the entry in our internal index:
 582                try{
 583                   // figure out if it's preferred or not:
 584                   boost::tiny_xml::element_ptr parent(node->parent);
 585                   while(!can_contain_indexterm(parent->name.c_str()))
 586                   {
 587                      // If the search text was found in a title then make it a preferred term:
 588                      if(parent->name == "title")
 589                      {
 590                         preferred_term = true;
 591                      }
 592                      parent = parent->parent.lock();
 593                      if(!parent)
 594                         break;
 595                   }
 596                }
 597                catch(const std::exception&){}
 598
 599                index_entry_ptr item3(new index_entry(i->term));
 600                if(i->category.size())
 601                   item3->category = i->category;
 602                index_entry_ptr item4(new index_entry(rtitle, *pid));
 603                item4->preferred = preferred_term;
 604                index_entry_set::iterator pos = index_entries.insert(item3).first;
 605                (**pos).sub_keys.insert(item4);
 606             }
 607          }
 608       }
 609    }
 610    //
 611    // Recurse through children, if not done already:
 612    //
 613    if(!flatten)
 614       recurse_through_children(node, &id, &title, flatten || seen);
 615    //
 616    // Process manual index entries last of all:
 617    //
 618    if(node->name == "indexterm")
 619    {
 620       // Track the entry in our internal index:
 621       const std::string* pid = get_current_block_id(&id);
 622       const std::string* attr = find_attr(node, "type");
 623       const std::string& rtitle = get_current_block_title(&title);
 624       const std::string simple_title = rewrite_title(rtitle, *pid);
 625       index_entry_ptr item1(new index_entry(last_primary, "", attr ? *attr : ""));
 626       index_entry_set* parent = &((*index_entries.insert(item1).first)->sub_keys);
 627
 628       if(last_secondary.size())
 629       {
 630          item1.reset(new index_entry(last_secondary, "", attr ? *attr : ""));
 631          parent = &((*parent->insert(item1).first)->sub_keys);
 632       }
 633       if(last_tertiary.size())
 634       {
 635          item1.reset(new index_entry(last_tertiary, "", attr ? *attr : ""));
 636          parent = &((*parent->insert(item1).first)->sub_keys);
 637       }
 638       item1.reset(new index_entry(simple_title, *pid, attr ? *attr : ""));
 639       parent->insert(item1);
 640
 641       last_primary = "";
 642       last_secondary = "";
 643       last_tertiary = "";
 644    }
 645 }
 646
 647 void process_nodes(boost::tiny_xml::element_ptr node)
 648 {
 649    node_id id = { 0, };
 650    title_info t = { "", 0 };
 651    process_node(node, &id, &t);
 652 }
 653
 654 int main(int argc, char* argv[])
 655 {
 656    try{
 657
 658    namespace po = boost::program_options;
 659    po::options_description desc("AutoIndex Allowed Options");
 660    desc.add_options()
 661       ("help", "Print help message")
 662       ("in", po::value<std::string>(), "Set the input XML file.")
 663       ("out", po::value<std::string>(), "Set output input XML file.")
 664       ("scan", po::value<std::string>(), "Scan the specified file for terms to try and index.")
 665       ("script", po::value<std::string>(), "Specifies the script file to use.")
 666       ("no-duplicates", "Prevents duplicate index entries within the same section.")
 667       ("no-section-names", "Suppresses use of section names as index entries.")
 668       ("internal-index", "Causes AutoIndex to generate the index itself, rather than relying on the XSL stylesheets.")
 669       ("verbose", "Turns on verbose mode.")
 670       ("prefix", po::value<std::string>(), "Sets the prefix to be prepended to all file names and paths in the script file.")
 671       ("index-type", po::value<std::string>(), "Sets the XML container type to use the index.")
 672    ;
 673
 674    po::variables_map vm;
 675    po::store(po::parse_command_line(argc, argv, desc), vm);
 676    po::notify(vm);
 677
 678    //
 679    // Process arguments:
 680    //
 681    if(vm.count("help"))
 682    {
 683       std::cout << desc;
 684       return 0;
 685    }
 686    if(vm.count("in"))
 687    {
 688       infile = vm["in"].as<std::string>();
 689    }
 690    else
 691    {
 692       std::cerr << "No input XML file specified" << std::endl;
 693       return 1;
 694    }
 695    if(vm.count("out"))
 696    {
 697       outfile = vm["out"].as<std::string>();
 698    }
 699    else
 700    {
 701       std::cerr << "No output XML file specified" << std::endl;
 702       return 1;
 703    }
 704    if(vm.count("verbose"))
 705    {
 706       verbose = true;
 707    }
 708    if(vm.count("prefix"))
 709    {
 710       prefix = vm["prefix"].as<std::string>();
 711    }
 712    if(vm.count("scan"))
 713    {
 714       std::string f = vm["scan"].as<std::string>();
 715       if(!exists(boost::filesystem::path(f)))
 716          throw std::runtime_error("Error the file requested for scanning does not exist: " + f);
 717       scan_file(f);
 718    }
 719    if(vm.count("script"))
 720    {
 721       process_script(vm["script"].as<std::string>());
 722    }
 723    if(vm.count("no-duplicates"))
 724    {
 725       no_duplicates = true;
 726    }
 727    if(vm.count("no-section-names"))
 728    {
 729       use_section_names = false;
 730    }
 731    if(vm.count("internal-index"))
 732    {
 733       internal_indexes = true;
 734    }
 735    if(vm.count("index-type"))
 736    {
 737       internal_index_type = vm["index-type"].as<std::string>();
 738    }
 739
 740    std::ifstream is(infile.c_str());
 741    if((0 == is.peek()) || !is.good())
 742    {
 743       std::cerr << "Unable to open XML data file " << argv[1] << std::endl;
 744       return 1;
 745    }
 746    //
 747    // We need to skip any leading <? and <! elements:
 748    //
 749    std::string header = get_header(is);
 750    boost::tiny_xml::element_ptr xml = boost::tiny_xml::parse(is, "");
 751    is.close();
 752
 753    std::cout << "Indexing " << index_terms.size() << " terms..." << std::endl;
 754
 755    process_nodes(xml);
 756
 757    if(internal_indexes)
 758       generate_indexes();
 759
 760    std::ofstream os(outfile.c_str());
 761    os << header << std::endl;
 762    boost::tiny_xml::write(*xml, os);
 763    std::cout << index_entries.size() << " Index entries were created." << std::endl;
 764
 765    }
 766    catch(boost::exception& e)
 767    {
 768       std::cerr << diagnostic_information(e);
 769       return 1;
 770    }
 771    catch(const std::exception& e)
 772    {
 773       std::cerr << e.what() << std::endl;
 774       return 1;
 775    }
 776    catch(const std::string& s)
 777    {
 778       std::cerr << s << std::endl;
 779       return 1;
 780    }
 781
 782    return 0;
 783 }