1 // Copyright 2008 John Maddock
3 // Use, modification and distribution are subject to the
4 // Boost Software License, Version 1.0.
5 // (See accompanying file LICENSE_1_0.txt
6 // or copy at http://www.boost.org/LICENSE_1_0.txt)
10 #include <boost/array.hpp>
11 #include <boost/exception/all.hpp>
12 #include <boost/program_options.hpp>
13 #include "auto_index.hpp"
// Program-wide state shared by the option handling in main() and the XML walk in process_node().
//
// infile / outfile / prefix are filled from the --in / --out / --prefix options in main();
// last_primary / last_secondary / last_tertiary hold the text of the most recent
// <primary> / <secondary> / <tertiary> elements seen by process_node():
15 std::string infile, outfile, prefix, last_primary, last_secondary, last_tertiary;
// The terms to look for: process_node() runs each entry's search_text regex over the node text:
16 std::set<index_info> index_terms;
// (block ID, term) pairs that have produced an index entry; a pair is inserted on every hit
// but only consulted when no_duplicates is set:
17 std::set<std::pair<std::string, std::string> > found_terms;
// Set to true by the --no-duplicates option:
18 bool no_duplicates = false;
// Set to false by the --no-section-names option:
20 bool use_section_names = true;
// The index entries tracked by process_node(); main() reports its size when it finishes:
21 index_entry_set index_entries;
// Every <index> element encountered by process_node():
22 boost::tiny_xml::element_list indexes;
// The rules consulted by rewrite_title():
23 std::list<id_rewrite_rule> id_rewrite_list;
// Set to true by the --internal-index option; when false process_node() inserts
// <indexterm> elements into the XML tree:
24 bool internal_indexes = false;
// Overridden by the --index-type option:
25 std::string internal_index_type = "section";
// NOTE(review): not referenced in the visible part of this file - presumably populated
// while processing the script file; confirm.
27 file_scanner_set_type file_scanner_set;
31 std::cout << "Please refer to the documentation for the correct command line syntax" << std::endl;
// Consume characters from the stream one at a time until a '>' has been read
// or the read fails (see the loop condition below).
// NOTE(review): the loop body is not visible in this excerpt; presumably each
// character read is appended to "result" - confirm.
35 void eat_block(std::string& result, std::istream & is)
38 // everything until we get to a closing '>':
41 while(is.get(c) && c != '>')
// Read the leading <? ... > and <! ... > elements from the stream and return them as text.
// Throws std::runtime_error("Invalid leading markup in XML file found") on malformed input;
// the exact checks that trigger the two throws are not visible in this excerpt.
53 std::string get_header(std::istream & is)
56 // We need to get any leading <? and <! elements:
61 throw std::runtime_error("Invalid leading markup in XML file found");
// Keep going for as long as the element starts with '?' or '!':
63 while((c == '?') || (c == '!'))
// Read up to (and discard) the closing '>', then append the element to the
// result with its angle brackets restored:
66 std::getline(is, temp, '>');
67 result += '<' + temp + '>';
70 throw std::runtime_error("Invalid leading markup in XML file found");
77 // Find attribute named "name" in node "node":
// Scans node->attributes in order. The result is a pointer to a string; callers in
// process_node() treat a null result as "attribute not present" (they test the pointer
// before dereferencing it).
// NOTE(review): the loop body and the return statements are not visible in this excerpt.
79 const std::string* find_attr(boost::tiny_xml::element_ptr node, const char* name)
81 for(boost::tiny_xml::attribute_list::const_iterator i = node->attributes.begin();
82 i != node->attributes.end(); ++i)
90 // Get the ID of the current block scope, basically
91 // move up the XML tree until we find a valid ID:
// "id" is the innermost scope; scopes are linked to their parents through "prev".
// Throws std::runtime_error (via BOOST_THROW_EXCEPTION) with the message below;
// presumably when the walk ends on a scope that still has no ID - the guarding
// condition is not visible in this excerpt.
93 const std::string* get_current_block_id(node_id const* id)
// Skip scopes without an ID for as long as there is a parent scope to move to:
95 while((id->id == 0) && (id->prev))
98 BOOST_THROW_EXCEPTION(std::runtime_error("Current XML block has no enclosing ID: XML is not valid Boostbook?"));
102 // Get the title of the current block scope, basically
103 // move up the XML tree until we find a valid title:
// "id" is the innermost title scope; scopes are linked to their parents through "prev".
// NOTE(review): the return statement is not visible here; presumably the title of the
// scope where the walk stops is returned, which may still be empty - confirm.
105 const std::string& get_current_block_title(title_info const* id)
// Skip scopes with an empty title for as long as there is a parent scope to move to:
107 while((id->title.size() == 0) && (id->prev))
112 // Get all the content under this node, with any inline XML
// Starts from the text content of the node itself, appends the consolidated content of
// every child element (recursively), then normalises whitespace in the combined text.
// NOTE(review): other statements in the loop body may be missing from this excerpt.
115 std::string get_consolidated_content(boost::tiny_xml::element_ptr node)
117 std::string result(node->content);
118 for(boost::tiny_xml::element_list::const_iterator i = node->elements.begin();
119 i != node->elements.end(); ++i)
122 result += get_consolidated_content(*i);
// Whitespace clean-up: the format string "(?2 )" emits a single space only when
// sub-expression 2 took part in the match and nothing otherwise, so leading
// whitespace (sub-expression 1) is dropped and interior runs become one space.
// NOTE(review): alternative 2 also matches trailing whitespace and is tried before
// alternative 3, so trailing whitespace appears to collapse to one space rather
// than being removed - confirm whether that is intended.
124 static const boost::regex e("(^[[:space:]]+)|([[:space:]]+)|([[:space:]]+$)");
125 return regex_replace(result, e, "(?2 )", boost::regex_constants::format_all);
128 // Rewrite a title based on any rewrite rules we may have:
// Walks id_rewrite_list in order. In the visible code the regex held in each rule (i->id)
// is matched either against the block ID or against the title itself.
// NOTE(review): the condition choosing between the two tests, the result of an ID match
// and the fallback return are not visible in this excerpt; process_node() stores the
// result as the "simple title" used for index entries.
130 std::string rewrite_title(const std::string& title, const std::string& id)
132 for(std::list<id_rewrite_rule>::const_iterator i = id_rewrite_list.begin(); i != id_rewrite_list.end(); ++i)
// Rule applied to the block ID (must match the whole ID):
136 if(regex_match(id, i->id))
// Rule applied to the title (must match the whole title); the replacement text is
// produced by using i->new_name as the regex format string:
141 if(regex_match(title, i->id))
142 return regex_replace(title, i->id, i->new_name);
// Ordering predicate for C strings: true when "a" sorts before "b" according to strcmp.
// This lets the std::set<const char*, string_cmp> lookup tables below compare by string
// content rather than by pointer value.
// NOTE(review): the enclosing struct header is not visible in this excerpt.
150 bool operator()(const char* a, const char* b)const
152 return std::strcmp(a, b) < 0;
156 // Discover whether this node can contain a <title> or not, if not
157 // we don't want to link to it, or the XSL HTML stylesheets may do strange
158 // things, and at least emit copious messages. See https://sourceforge.net/tracker/?func=detail&aid=3325153&group_id=21935&atid=373747
// Returns true when "name" is one of the 103 element names listed below.
// process_node() only records the "id" attribute of a node as a link target when
// this function returns true for the node name.
160 bool can_contain_title(const char* name)
162 static const boost::array<const char*, 103> names =
164 "abstract", "appendix", "appendixinfo", "article", "articleinfo", "authorblurb", "bibliodiv", "biblioentry", "bibliography",
165 "bibliographyinfo", "bibliolist", "bibliomixed", "bibliomset", "biblioset", "blockinfo", "blockquote", "book", "bookinfo",
166 "calloutlist", "caution", "chapter", "chapterinfo", "colophon", "constraintdef", "dedication", "equation", "example", "figure",
167 "formalpara", "glossary", "glossaryinfo", "glossdiv", "glosslist", "important", "index", "indexdiv", "indexinfo", "itemizedlist",
168 "legalnotice", "lot", "msg", "msgexplan", "msgmain", "msgrel", "msgset", "msgsub", "note", "objectinfo", "orderedlist", "part",
169 "partinfo", "partintro", "personblurb", "preface", "prefaceinfo", "procedure", "productionset", "qandadiv", "qandaset",
170 "refentryinfo", "reference", "referenceinfo", "refsect1", "refsect1info", "refsect2", "refsect2info", "refsect3", "refsect3info",
171 "refsection", "refsectioninfo", "refsynopsisdiv", "refsynopsisdivinfo", "sect1", "sect1info", "sect2", "sect2info", "sect3",
172 "sect3info", "sect4", "sect4info", "sect5", "sect5info", "section", "sectioninfo", "segmentedlist", "set", "setindex",
173 "setindexinfo", "setinfo", "sidebar", "sidebarinfo", "simplesect", "step", "table", "task", "taskprerequisites",
174 "taskrelated", "tasksummary", "tip", "toc", "variablelist", "warning", "refentry"
// Lookup table keyed on string content (string_cmp compares with strcmp); it is
// static and filled from "names" the first time the function finds it empty:
176 static std::set<const char*, string_cmp> permitted;
178 if(permitted.empty())
179 permitted.insert(names.begin(), names.end());
181 return 0 != permitted.count(name);
184 // Determine whether this node can contain an indexterm or not:
// Returns true when "name" is one of the element names listed below. process_node() uses
// this to climb from a node to the nearest ancestor that may hold an <indexterm>.
// "link" and "title" are deliberately commented out of the list.
186 bool can_contain_indexterm(const char* name)
188 static const boost::array<const char*, 257> names =
190 "abbrev", "accel", "ackno", "acronym", "action", "answer", "appendix", "appendixinfo", "application",
191 "article", "articleinfo", "artpagenums", "attribution", "authorinitials", "bibliocoverage", "bibliodiv",
192 "biblioentry", "bibliography", "bibliographyinfo", "biblioid", "bibliomisc", "bibliomixed", "bibliomset",
193 "bibliorelation", "biblioset", "bibliosource", "blockinfo", "blockquote", "bookinfo", "bridgehead", "callout",
194 "caution", "chapter", "chapterinfo", "citation", "citebiblioid", "citetitle", "city", "classname", "classsynopsisinfo",
195 "code", "collabname", "command", "computeroutput", "confdates", "confnum", "confsponsor", "conftitle", "constant",
196 "constraintdef", "contractnum", "contractsponsor", "contrib", "corpauthor", "corpcredit", "corpname", "country",
197 "database", "date", "dedication", "edition", "email", "emphasis", "entry", "envar", "errorcode", "errorname", "errortext",
198 "errortype", "example", "exceptionname", "fax", "figure", "filename", "firstname", "firstterm", "foreignphrase",
199 "formalpara", "funcparams", "funcsynopsisinfo", "function", "glossary", "glossaryinfo", "glossdef", "glossdiv",
200 "glossentry", "glosssee", "glossseealso", "glossterm", "guibutton", "guiicon", "guilabel", "guimenu", "guimenuitem",
201 "guisubmenu", "hardware", "highlights", "holder", "honorific", "important", "index", "indexinfo", "informalexample",
202 "informalfigure", "initializer", "interface", "interfacename", "invpartnumber", "isbn", "issn", "issuenum", "itemizedlist",
203 "itermset", "jobtitle", "keycap", "keycode", "keysym", "label", "legalnotice", "lineage", "lineannotation",
204 /*"link", */"listitem", "literal", "literallayout", "lotentry", "manvolnum", "markup", "medialabel", "member",
205 "methodname", "modespec", "modifier", "mousebutton", "msgaud", "msgexplan", "msglevel", "msgorig", "msgtext", "note",
206 "objectinfo", "olink", "option", "optional", "orderedlist", "orgdiv", "orgname", "otheraddr", "othername", "package",
207 "pagenums", "para", "parameter", "partinfo", "partintro", "phone", "phrase", "pob", "postcode", "preface", "prefaceinfo",
208 "procedure", "productname", "productnumber", "programlisting", "prompt", "property", "pubdate", "publishername",
209 "pubsnumber", "qandadiv", "qandaset", "question", "quote", "refentry", "refentryinfo", "refentrytitle", "referenceinfo",
210 "refmeta", "refmiscinfo", "refpurpose", "refsect1", "refsect1info", "refsect2", "refsect2info", "refsect3", "refsect3info",
211 "refsection", "refsectioninfo", "refsynopsisdiv", "refsynopsisdivinfo", "releaseinfo", "remark", "returnvalue",
212 "revdescription", "revnumber", "revremark", "screen", "screeninfo", "sect1", "sect1info", "sect2", "sect2info", "sect3",
213 "sect3info", "sect4", "sect4info", "sect5", "sect5info", "section", "sectioninfo", "seg", "segtitle", "seriesvolnums",
214 "setindex", "setindexinfo", "setinfo", "sgmltag", "shortaffil", "sidebar", "sidebarinfo", "simpara", "simplesect",
215 "state", "step", "street", "structfield", "structname", "subtitle", "surname", "symbol", "synopsis", "systemitem",
216 "table", "task", "taskprerequisites", "taskrelated", "tasksummary", "td", "term", "termdef", "th", "tip", /*"title",*/
217 "titleabbrev", "tocback", "tocentry", "tocfront", "token", "type", "ulink", "uri", "userinput", "variablelist",
218 "varname", "volumenum", "warning", "wordasword", "year"
// Lookup table keyed on string content (string_cmp compares with strcmp); it is
// static and filled from "names" the first time the function finds it empty:
220 static std::set<const char*, string_cmp> permitted;
222 if(permitted.empty())
223 permitted.insert(names.begin(), names.end());
225 return 0 != permitted.count(name);
228 // Decide whether to flatten this node for searching purposes:
// Returns true when "name" is one of the 57 element names in the active (uncommented)
// part of the list below. For such nodes process_node() searches the consolidated text
// of the whole subtree once, and passes seen == true to the children so the same text
// is not searched again.
// The list-like containers (calloutlist, itemizedlist, ...) are commented out of the list.
// The lookup table "terminals" is static and filled from "names" the first time the
// function finds it empty; string_cmp makes it compare by string content.
230 bool should_flatten_node(const char* name)
233 // The list of nodes to flatten is basically the list of elements that
234 // can appear inside a <section> - see http://www.docbook.org/tdg/en/html/section.html.
235 // In other words basically anything at the level of a paragraph/table/listing etc.
237 static const boost::array<const char*, 57> names =
239 "title", "subtitle", "titleabbrev",
240 "toc", "lot", "glossary", "bibliography",
241 /*"calloutlist", "glosslist", "bibliolist", "itemizedlist", "orderedlist",
242 "segmentedlist", "simplelist", "variablelist",*/ "caution", "important", "note",
243 "tip", "warning", "literallayout", "programlisting", "programlistingco",
244 "screen", "screenco", "screenshot", "synopsis", "cmdsynopsis", "funcsynopsis",
245 "classsynopsis", "fieldsynopsis", "constructorsynopsis",
246 "destructorsynopsis", "methodsynopsis", "formalpara", "para", "simpara",
247 "address", "blockquote", "graphic", "graphicco", "mediaobject",
248 "mediaobjectco", "informalequation", "informalexample", "informalfigure",
249 "informaltable", "equation", "example", "figure", "table", "msgset", "procedure",
250 "sidebar", "qandaset", "task", "productionset", "constraintdef", "anchor",
251 "bridgehead", "remark", "highlights", "abstract", "authorblurb", "epigraph"
252 /*"biblioentry", "bibliomixed", "callout", "glossentry", "listitem", "seg", "seglistitem", "member",
255 static std::set<const char*, string_cmp> terminals;
257 if(terminals.empty())
258 terminals.insert(names.begin(), names.end());
259 return 0 != terminals.count(name);
// Return a copy of "s" in which the five predefined XML entities
// &quot; &amp; &apos; &lt; &gt; are replaced by the characters they stand for.
// All other text, including any other entity, is copied through unchanged.
261 std::string unescape_xml(const std::string& s)
// One capture group per entity name, so the format string can tell which one matched.
// NOTE(review): this regex is constructed on every call, whereas other regexes in
// this file are static const - consider making it static const as well.
263 boost::regex e("&(?:(quot)|(amp)|(apos)|(lt)|(gt));");
// Conditional format: "(?N text)" emits text only when sub-expression N matched:
264 return regex_replace(s, e, "(?1\")(?2&)(?3\')(?4<)(?5>)", boost::regex_constants::format_all);
267 // Exception classes to propagate processing instruction info:
// Both are empty tag types. process_node() throws them when it meets a
// <?BoostAutoIndex ...?> instruction whose content is "IgnoreSection" or "IgnoreBlock",
// and recurse_through_children() catches them.
// Thrown for "IgnoreSection":
269 struct ignore_section{};
// Thrown for "IgnoreBlock":
270 struct ignore_block{};
272 // Check if we're in a section (or chapter etc) or not:
// Returns true when "name" is one of the 19 section-level element names listed in "data".
// NOTE(review): "sect1" is listed but "sect2" to "sect5" are not - confirm this is intended.
274 bool is_section(const std::string& name)
276 static const boost::array<const char*, 19> data =
278 "dedication", "toc", "lot", "glossary", "bibliography", "preface", "chapter",
279 "reference", "part", "article", "appendix", "index", "setindex", "colophon",
280 "sect1", "refentry", "simplesect", "section", "partintro"
// NOTE(review): unlike the lookup tables in can_contain_title() and friends, this set
// is not static, so it is rebuilt from "data" on every call - consider making it static.
282 std::set<std::string> names;
284 names.insert(data.begin(), data.end());
285 return 0 != names.count(name);
288 // Check if we're in a block/paragraph or not:
// Returns true when "name" is one of the 58 block-level element names listed in "data".
290 bool is_block(const std::string& name)
292 static const boost::array<const char*, 58> data =
294 "calloutlist", "glosslist", "bibliolist", "itemizedlist", "orderedlist",
295 "segmentedlist", "simplelist", "variablelist", "caution", "important", "note",
296 "tip", "warning", "literallayout", "programlisting", "programlistingco",
297 "screen", "screenco", "screenshot", "synopsis", "cmdsynopsis", "funcsynopsis",
298 "classsynopsis", "fieldsynopsis", "constructorsynopsis",
299 "destructorsynopsis", "methodsynopsis", "formalpara", "para", "simpara",
300 "address", "blockquote", "graphic", "graphicco", "mediaobject",
301 "mediaobjectco", "informalequation", "informalexample", "informalfigure",
302 "informaltable", "equation", "example", "figure", "table", "msgset", "procedure",
303 "sidebar", "qandaset", "task", "productionset", "constraintdef", "anchor",
304 "bridgehead", "remark", "highlights", "abstract", "authorblurb", "epigraph"
// NOTE(review): unlike the lookup tables in can_contain_title() and friends, this set
// is not static, so it is rebuilt from "data" on every call - consider making it static.
306 std::set<std::string> names;
308 names.insert(data.begin(), data.end());
309 return 0 != names.count(name);
312 // Helper proc to recurse through children:
// Calls process_node() on every child element of "node", passing down the current ID
// scope, title scope and "seen" flag, and handles the ignore_section / ignore_block
// exceptions that a child may raise.
// process_node() tests the bool result, so it presumably tells the caller whether to
// carry on with this node.
// NOTE(review): the return statements and the bodies of the two "if" tests below are
// not visible in this excerpt.
// Forward declaration, needed because the two functions call each other:
314 void process_node(boost::tiny_xml::element_ptr node, node_id* prev, title_info* pt, bool seen);
315 bool recurse_through_children(boost::tiny_xml::element_ptr node, node_id* id, title_info* pt, bool seen)
319 for(boost::tiny_xml::element_list::const_iterator i = node->elements.begin();
320 i != node->elements.end(); ++i)
322 process_node(*i, id, pt, seen);
// An IgnoreSection request: what happens next depends on whether this node is
// itself a section-level element:
325 catch(const ignore_section&)
327 if(is_section(node->name))
// An IgnoreBlock request: what happens next depends on whether this node is a
// block-level or section-level element:
332 catch(const ignore_block&)
334 if(is_block(node->name) || is_section(node->name))
342 // This does most of the work: process the node pointed to, and any children
//
// node: the element to process.
// prev: the ID scope of the enclosing element (scopes are chained through "prev").
// pt:   the title scope of the enclosing element (chained the same way).
// seen: true when an enclosing node was flattened and has already searched this text,
//       in which case this node is not searched again.
//
// Throws ignore_section / ignore_block when the node is a <?BoostAutoIndex ...?>
// processing instruction requesting that; these are caught in recurse_through_children().
// NOTE(review): several short lines (try, if/else heads, braces, returns) are missing
// from this excerpt, so the exact nesting of some statements below cannot be seen.
345 void process_node(boost::tiny_xml::element_ptr node, node_id* prev, title_info* pt, bool seen = false)
348 // Store the current ID and title as nested scoped objects:
350 node_id id = { 0, prev };
351 if(can_contain_title(node->name.c_str()))
353 // Only set the ID to link to if the block can contain a title, see
354 // can_contain_title above for rationale.
355 id.id = find_attr(node, "id");
357 title_info title = { "", pt};
358 bool flatten = should_flatten_node(node->name.c_str());
// Processing instructions have names starting with '?'. Only ?BoostAutoIndex with
// content IgnoreSection or IgnoreBlock has any effect:
360 if(node->name.size() && node->name[0] == '?')
362 if(node->name == "?BoostAutoIndex")
364 if(node->content == "IgnoreSection")
366 throw ignore_section();
368 else if(node->content == "IgnoreBlock")
370 throw ignore_block();
373 return; // Ignore processing instructions
// A <title> only counts when the enclosing scope has an ID to link to:
375 else if((node->name == "title") && (id.prev->id))
378 // This actually sets the title of the enclosing scope,
379 // not this tag itself:
381 title.prev->title = get_consolidated_content(node);
383 std::cout << "Indexing section: " << title.prev->title << std::endl;
// NOTE(review): id.prev->prev is dereferenced here with no visible null check - confirm
// that a <refentrytitle> always has at least two enclosing scopes.
385 else if((node->name == "refentrytitle") && (id.prev->prev->id))
388 // This actually sets the title of the enclosing refentry scope,
389 // not this tag itself:
391 title.prev->prev->title = get_consolidated_content(node);
393 std::cout << "Indexing refentry: " << title.prev->prev->title << std::endl;
395 if(node->name == "anchor")
397 if(node->parent.lock()->name == "title")
399 // We have a title with a nested anchor ID, change the ID of our parents parent to match:
// NOTE(review): id.id is only assigned above when can_contain_title(node->name) holds,
// and "anchor" is not in that list, so from the visible code this stores a null ID
// rather than the ID of the anchor - confirm.
400 id.prev->prev->id = id.id;
403 else if(node->name == "index")
405 // Keep track of all the indexes we see:
406 indexes.push_back(node);
// An <index> directly inside a <para>: the name of the parent is blanked out:
407 if(node->parent.lock()->name == "para")
408 node->parent.lock()->name = "";
// Remember the text of manual index term parts; they are consumed further down
// when the enclosing <indexterm> is handled:
410 else if(node->name == "primary")
412 last_primary = get_consolidated_content(node);
414 else if(node->name == "secondary")
416 last_secondary = get_consolidated_content(node);
418 else if(node->name == "tertiary")
420 last_tertiary = get_consolidated_content(node);
422 else if((node->name == "see") && internal_indexes)
424 std::cerr << "WARNING: <see> in XML source will be ignored for the index generation" << std::endl;
426 else if((node->name == "seealso") && internal_indexes)
428 std::cerr << "WARNING: <seealso> in XML source will be ignored for the index generation" << std::endl;
// Choose the text to search: either the unescaped, consolidated text of the whole
// subtree (presumably when "flatten" is set) or just the content of this node:
431 std::string flattenned_text;
432 const std::string* ptext;
435 flattenned_text = unescape_xml(get_consolidated_content(node));
436 ptext = &flattenned_text;
438 // Recurse through children here if we're going to flatten the text, that way we see any processing instructions first:
440 if(!recurse_through_children(node, &id, &title, flatten || seen))
445 ptext = &(node->content);
449 // Search content for items: we only search if the content is not empty,
450 // and the content is not whitespace alone, and we haven't already searched this
451 // text in one of our parent nodes that got flattened.
453 static const boost::regex space_re("[[:space:]]+");
454 if(!seen && ptext->size() && !regex_match(*ptext, space_re))
456 // Save block ID and title in case we find some hits:
457 const std::string* pid = get_current_block_id(&id);
458 const std::string& rtitle = get_current_block_title(&title);
459 const std::string simple_title = rewrite_title(rtitle, *pid);
460 // Scan for each index term:
461 for(std::set<index_info>::const_iterator i = index_terms.begin();
462 i != index_terms.end(); ++i)
464 if(regex_search(*ptext, i->search_text))
467 // We need to check to see if this term has already been indexed
468 // in this zone, in order to prevent duplicate entries, also check
469 // that any constraint placed on the term's ID is satisfied:
471 std::pair<std::string, std::string> item_index(*pid, i->term);
472 if(((no_duplicates == false) || (0 == found_terms.count(item_index)))
473 && (i->search_id.empty() || regex_match(*pid, i->search_id)))
475 // We have something to index!
476 found_terms.insert(item_index);
// Diagnostic dump for terms or titles matching the "debug" regex (declared outside
// the visible part of this file):
478 if(!debug.empty() && (regex_match(i->term, debug) || regex_match(rtitle, debug) || regex_match(simple_title, debug)))
480 std::cout << "Debug term found, in block with ID: " << *pid << std::endl;
481 std::cout << "Current section title is: " << rtitle << std::endl;
482 std::cout << "The main index entry will be : " << simple_title << std::endl;
483 std::cout << "The indexed term is: " << i->term << std::endl;
484 std::cout << "The search regex is: " << i->search_text << std::endl;
485 std::cout << "The section constraint is: " << i->search_id << std::endl;
486 std::cout << "The index type for this entry is: " << i->category << std::endl;
489 if(use_section_names && (simple_title != i->term))
492 // First off insert index entry with primary term
493 // consisting of the section title, and secondary term the
494 // actual index term, this gets skipped if the title and index
495 // term are the same:
497 if(internal_indexes == false)
499 // Insert an <indexterm> into the XML:
500 boost::tiny_xml::element_ptr p(new boost::tiny_xml::element());
501 p->name = "indexterm";
502 boost::tiny_xml::element_ptr prim(new boost::tiny_xml::element());
503 prim->name = "primary";
// The text is carried by a nameless child element holding only content:
504 prim->elements.push_front(boost::tiny_xml::element_ptr(new boost::tiny_xml::element()));
505 prim->elements.front()->content = simple_title;
506 p->elements.push_front(prim);
508 boost::tiny_xml::element_ptr sec(new boost::tiny_xml::element());
509 sec->name = "secondary";
510 sec->elements.push_front(boost::tiny_xml::element_ptr(new boost::tiny_xml::element()));
511 sec->elements.front()->content = i->term;
512 p->elements.push_back(sec);
514 // Insert the Indexterm:
// Climb to the nearest ancestor that may contain an <indexterm> and make the new
// element its first child:
515 boost::tiny_xml::element_ptr parent(node->parent);
516 while(!can_contain_indexterm(parent->name.c_str()))
517 parent = parent->parent.lock();
518 parent->elements.push_front(p);
520 catch(const std::exception&)
522 std::cerr << "Unable to find location to insert <indexterm>" << std::endl;
525 // Track the entry in our internal index:
526 index_entry_ptr item1(new index_entry(simple_title));
527 index_entry_ptr item2(new index_entry(i->term, *pid));
// insert() yields the existing entry when an equivalent one is already present, so
// the sub-key is attached to whichever entry ends up in the set:
528 index_entry_set::iterator pos = index_entries.insert(item1).first;
529 (**pos).sub_keys.insert(item2);
532 // Now insert another index entry with the index term
533 // as the primary key, and the section title as the
534 // secondary key, this one gets assigned to the
535 // appropriate index category if there is one:
537 bool preferred_term = false;
538 if(internal_indexes == false)
540 // Insert <indexterm> into the XML:
541 boost::tiny_xml::element_ptr p2(new boost::tiny_xml::element());
542 p2->name = "indexterm";
543 if(i->category.size())
545 p2->attributes.push_back(boost::tiny_xml::attribute("type", i->category));
547 boost::tiny_xml::element_ptr prim2(new boost::tiny_xml::element());
548 prim2->name = "primary";
549 prim2->elements.push_front(boost::tiny_xml::element_ptr(new boost::tiny_xml::element()));
550 prim2->elements.front()->content = i->term;
551 p2->elements.push_front(prim2);
553 boost::tiny_xml::element_ptr sec2(new boost::tiny_xml::element());
554 sec2->name = "secondary";
555 sec2->elements.push_front(boost::tiny_xml::element_ptr(new boost::tiny_xml::element()));
// Note that the unrewritten title is used here, not simple_title:
556 sec2->elements.front()->content = rtitle;
557 p2->elements.push_back(sec2);
559 // Insert the Indexterm:
560 boost::tiny_xml::element_ptr parent(node->parent);
561 while(!can_contain_indexterm(parent->name.c_str()))
563 // If the search text was found in a title then make it a preferred term:
564 if(parent->name == "title")
565 preferred_term = true;
566 parent = parent->parent.lock();
// Mark the entry as preferred (presumably only when preferred_term was set above):
570 boost::tiny_xml::attribute a("significance", "preferred");
571 p2->attributes.push_back(a);
573 parent->elements.push_front(p2);
575 catch(const std::exception&)
577 std::cerr << "Unable to find location to insert <indexterm>" << std::endl;
581 // Track the entry in our internal index:
583 // figure out if it's preferred or not:
// Same ancestor walk as above, but only to find out whether a <title> lies between
// this node and the nearest element that may contain an <indexterm>:
584 boost::tiny_xml::element_ptr parent(node->parent);
585 while(!can_contain_indexterm(parent->name.c_str()))
587 // If the search text was found in a title then make it a preferred term:
588 if(parent->name == "title")
590 preferred_term = true;
592 parent = parent->parent.lock();
// A failure here is deliberately ignored; preferred_term keeps the value reached so far:
597 catch(const std::exception&){}
599 index_entry_ptr item3(new index_entry(i->term));
600 if(i->category.size())
601 item3->category = i->category;
602 index_entry_ptr item4(new index_entry(rtitle, *pid));
603 item4->preferred = preferred_term;
604 index_entry_set::iterator pos = index_entries.insert(item3).first;
605 (**pos).sub_keys.insert(item4);
611 // Recurse through children, if not done already:
614 recurse_through_children(node, &id, &title, flatten || seen);
616 // Process manual index entries last of all:
// By this point the children have been processed, so last_primary / last_secondary /
// last_tertiary hold the text of the parts of this <indexterm>.
618 if(node->name == "indexterm")
620 // Track the entry in our internal index:
621 const std::string* pid = get_current_block_id(&id);
// Optional "type" attribute of the <indexterm>; null when absent:
622 const std::string* attr = find_attr(node, "type");
623 const std::string& rtitle = get_current_block_title(&title);
624 const std::string simple_title = rewrite_title(rtitle, *pid);
// Build (or reuse) the chain primary -> secondary -> tertiary; "parent" always points
// at the sub-key set of the deepest level reached so far:
625 index_entry_ptr item1(new index_entry(last_primary, "", attr ? *attr : ""));
626 index_entry_set* parent = &((*index_entries.insert(item1).first)->sub_keys);
628 if(last_secondary.size())
630 item1.reset(new index_entry(last_secondary, "", attr ? *attr : ""));
631 parent = &((*parent->insert(item1).first)->sub_keys);
633 if(last_tertiary.size())
635 item1.reset(new index_entry(last_tertiary, "", attr ? *attr : ""));
636 parent = &((*parent->insert(item1).first)->sub_keys);
// The leaf entry names the section (rewritten title) and links to its block ID:
638 item1.reset(new index_entry(simple_title, *pid, attr ? *attr : ""));
639 parent->insert(item1);
// Entry point for the tree walk: runs process_node() on "node" with a fresh root title
// scope (empty title, no parent scope) and a root ID scope.
// NOTE(review): the declaration of "id" is not visible in this excerpt.
647 void process_nodes(boost::tiny_xml::element_ptr node)
650 title_info t = { "", 0 };
651 process_node(node, &id, &t);
// Program entry point: parses the command line, loads the input XML, reports how many
// terms are being indexed, and writes the header plus the XML tree to the output file.
// Errors are reported on std::cerr by the catch handlers at the end.
// NOTE(review): several short lines (returns, try, braces, some conditions and calls)
// are missing from this excerpt, so exit codes and some control flow cannot be seen.
654 int main(int argc, char* argv[])
658 namespace po = boost::program_options;
659 po::options_description desc("AutoIndex Allowed Options");
661 ("help", "Print help message")
662 ("in", po::value<std::string>(), "Set the input XML file.")
// NOTE(review): the help text below reads "Set output input XML file." - probably meant
// to be "Set the output XML file."; left unchanged here because it is a runtime string.
663 ("out", po::value<std::string>(), "Set output input XML file.")
664 ("scan", po::value<std::string>(), "Scan the specified file for terms to try and index.")
665 ("script", po::value<std::string>(), "Specifies the script file to use.")
666 ("no-duplicates", "Prevents duplicate index entries within the same section.")
667 ("no-section-names", "Suppresses use of section names as index entries.")
668 ("internal-index", "Causes AutoIndex to generate the index itself, rather than relying on the XSL stylesheets.")
669 ("verbose", "Turns on verbose mode.")
670 ("prefix", po::value<std::string>(), "Sets the prefix to be prepended to all file names and paths in the script file.")
671 ("index-type", po::value<std::string>(), "Sets the XML container type to use the index.")
674 po::variables_map vm;
675 po::store(po::parse_command_line(argc, argv, desc), vm);
679 // Process arguments:
// Input and output file names; a message is printed when either is missing:
688 infile = vm["in"].as<std::string>();
692 std::cerr << "No input XML file specified" << std::endl;
697 outfile = vm["out"].as<std::string>();
701 std::cerr << "No output XML file specified" << std::endl;
704 if(vm.count("verbose"))
708 if(vm.count("prefix"))
710 prefix = vm["prefix"].as<std::string>();
// The file named by --scan must exist:
714 std::string f = vm["scan"].as<std::string>();
715 if(!exists(boost::filesystem::path(f)))
716 throw std::runtime_error("Error the file requested for scanning does not exist: " + f);
719 if(vm.count("script"))
721 process_script(vm["script"].as<std::string>());
723 if(vm.count("no-duplicates"))
725 no_duplicates = true;
727 if(vm.count("no-section-names"))
729 use_section_names = false;
731 if(vm.count("internal-index"))
733 internal_indexes = true;
735 if(vm.count("index-type"))
737 internal_index_type = vm["index-type"].as<std::string>();
740 std::ifstream is(infile.c_str());
// NOTE(review): "0 == is.peek()" only detects a file whose first byte is NUL; the
// "!is.good()" test is what catches a file that failed to open - confirm intent.
741 if((0 == is.peek()) || !is.good())
// NOTE(review): this prints argv[1], which need not be the input file name (the file
// comes from the --in option and is held in "infile") - confirm and consider printing infile.
743 std::cerr << "Unable to open XML data file " << argv[1] << std::endl;
747 // We need to skip any leading <? and <! elements:
// The header text is kept so that it can be written back in front of the output:
749 std::string header = get_header(is);
750 boost::tiny_xml::element_ptr xml = boost::tiny_xml::parse(is, "");
753 std::cout << "Indexing " << index_terms.size() << " terms..." << std::endl;
// Write the saved header followed by the XML tree:
760 std::ofstream os(outfile.c_str());
761 os << header << std::endl;
762 boost::tiny_xml::write(*xml, os);
763 std::cout << index_entries.size() << " Index entries were created." << std::endl;
// Error reporting: Boost exceptions first (with full diagnostic information), then
// standard exceptions, then plain strings:
766 catch(boost::exception& e)
768 std::cerr << diagnostic_information(e);
771 catch(const std::exception& e)
773 std::cerr << e.what() << std::endl;
776 catch(const std::string& s)
778 std::cerr << s << std::endl;