3 """Generate ESIS events based on a LaTeX source document and
6 The conversion is not strong enough to work with arbitrary LaTeX
7 documents; it has only been designed to work with the highly stylized
8 markup used in the standard Python documentation. A lot of
9 information about specific markup is encoded in the control table
10 passed to the convert() function; changing this table can allow this
11 tool to support additional LaTeX markups.
13 The format of the table is largely undocumented; see the commented
14 headers where the table is specified in main(). There is no provision
15 to load an alternate table from an external file.
25 import xml.sax.saxutils
27 from types import ListType, StringType, TupleType
30 from xml.parsers.xmllib import XMLParser
32 from xmllib import XMLParser
35 from esistools import encode
41 class LaTeXFormatError(Exception):
45 class LaTeXStackError(LaTeXFormatError):
46 def __init__(self, found, stack):
47 msg = "environment close for %s doesn't match;\n stack = %s" \
51 LaTeXFormatError.__init__(self, msg)
# Patterns used to tokenize the LaTeX input.  They are applied with
# .match(), i.e. anchored at the start of the remaining input text.

# "\begin{name}"; group 1 is the environment name.
_begin_env_rx = re.compile(r"[\\]begin{([^}]*)}")
# "\end{name}"; group 1 is the environment name.
_end_env_rx = re.compile(r"[\\]end{([^}]*)}")
# A backslash followed by a macro name made of letters with an optional
# trailing "*" (group 1), an optional single space, and then either an
# opening "{" or a (possibly empty) run of whitespace (group 2).
_begin_macro_rx = re.compile(r"[\\]([a-zA-Z]+[*]?) ?({|\s*\n?)")
# One or more "%", an optional space, and the rest of the line (group 1);
# the newline and any spaces/tabs that start the next line are consumed too.
_comment_rx = re.compile("%+ ?(.*)\n[ \t]*")
# A run of ordinary text: anything except "]", "~", "%", backslash, "{", "}".
_text_rx = re.compile(r"[^]~%\\{}]+")
# Optional whitespace followed by "[...]"; group 1 is the bracketed text,
# which may not itself contain "]".
_optional_rx = re.compile(r"\s*[[]([^]]*)[]]")
# _parameter_rx is this complicated to allow {...} inside a parameter;
# this is useful to match tabular layout specifications like {c|p{24pt}}
# Group 1 is the parameter text; only one level of nested braces is matched.
_parameter_rx = re.compile("[ \n]*{(([^{}}]|{[^}]*})*)}")
# A whole string consisting of a letter followed by letters, digits,
# "." or "-" (note the trailing "$").
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
# Spaces/newlines followed by an opening "{".
_start_group_rx = re.compile("[ \n]*{")
# Spaces/newlines followed by an opening "[".
_start_optional_rx = re.compile("[ \n]*[[]")
# Characters which, when they directly follow a backslash in the input,
# are written to the output as that literal character (the set includes
# the space character).
ESCAPED_CHARS = "$%#^ {}&~"
73 sys.stderr.write(msg + "\n")
def pushing(name, point, depth):
    """Send a debug message recording that element *name* is pushed.

    *point* is included in the message to identify where the push
    happened; *depth* is accepted but not used.
    """
    message = "pushing <%s> at %s" % (name, point)
    dbgmsg(message)
def popping(name, point, depth):
    """Send a debug message recording that element *name* is popped.

    *point* is included in the message to identify where the pop
    happened; *depth* is accepted but not used.
    """
    message = "popping </%s> at %s" % (name, point)
    dbgmsg(message)
82 class _Stack(UserList.UserList):
83 def append(self, entry):
84 if type(entry) is not StringType:
85 raise LaTeXFormatError("cannot push non-string on stack: "
87 #dbgmsg("%s<%s>" % (" "*len(self.data), entry))
88 self.data.append(entry)
90 def pop(self, index=-1):
91 entry = self.data[index]
93 #dbgmsg("%s</%s>" % (" "*len(self.data), entry))
95 def __delitem__(self, index):
96 entry = self.data[index]
98 #dbgmsg("%s</%s>" % (" "*len(self.data), entry))
108 def __init__(self, ifp, ofp, table):
109 self.write = ofp.write
112 self.line = string.join(map(string.rstrip, ifp.readlines()), "\n")
118 def subconvert(self, endchar=None, depth=0):
120 # Parses content, including sub-structures, until the character
121 # 'endchar' is found (with no open structures), or until the end
122 # of the input data if endchar is None.
127 if line[0] == endchar and not stack:
130 m = _comment_rx.match(line)
134 self.write("(COMMENT\n- %s \n)COMMENT\n-\\n\n"
136 line = line[m.end():]
138 m = _begin_env_rx.match(line)
141 entry = self.get_env_entry(name)
142 # re-write to use the macro handler
143 line = r"\%s %s" % (name, line[m.end():])
145 m = _end_env_rx.match(line)
149 entry = self.get_entry(envname)
150 while stack and envname != stack[-1] \
151 and stack[-1] in entry.endcloses:
152 self.write(")%s\n" % stack.pop())
153 if stack and envname == stack[-1]:
154 self.write(")%s\n" % entry.outputname)
157 raise LaTeXStackError(envname, stack)
158 line = line[m.end():]
160 m = _begin_macro_rx.match(line)
163 macroname = m.group(1)
165 # Ugh! This is a combining character...
167 self.combining_char("c", line[endpos])
168 line = line[endpos + 1:]
170 entry = self.get_entry(macroname)
173 pos = string.find(line, "\\end{%s}" % macroname)
174 text = line[m.end(1):pos]
175 stack.append(entry.name)
176 self.write("(%s\n" % entry.outputname)
177 self.write("-%s\n" % encode(text))
178 self.write(")%s\n" % entry.outputname)
180 line = line[pos + len("\\end{%s}" % macroname):]
182 while stack and stack[-1] in entry.closes:
184 topentry = self.get_entry(top)
185 if topentry.outputname:
186 self.write(")%s\n-\\n\n" % topentry.outputname)
192 params, optional, empty, environ = self.start_macro(macroname)
193 # rip off the macroname
195 line = line[m.end(1):]
197 line = line[m.end(1):]
199 line = line[m.end():]
203 # handle attribute mappings here:
204 for pentry in params:
205 if pentry.type == "attribute":
207 m = _optional_rx.match(line)
208 if m and entry.outputname:
209 line = line[m.end():]
210 self.dump_attr(pentry, m.group(1))
211 elif pentry.text and entry.outputname:
212 # value supplied by conversion spec:
213 self.dump_attr(pentry, pentry.text)
215 m = _parameter_rx.match(line)
217 raise LaTeXFormatError(
218 "could not extract parameter %s for %s: %s"
219 % (pentry.name, macroname, `line[:100]`))
221 self.dump_attr(pentry, m.group(1))
222 line = line[m.end():]
223 elif pentry.type == "child":
225 m = _optional_rx.match(line)
227 line = line[m.end():]
228 if entry.outputname and not opened:
230 self.write("(%s\n" % entry.outputname)
231 stack.append(macroname)
232 stack.append(pentry.name)
233 self.write("(%s\n" % pentry.name)
234 self.write("-%s\n" % encode(m.group(1)))
235 self.write(")%s\n" % pentry.name)
238 if entry.outputname and not opened:
240 self.write("(%s\n" % entry.outputname)
241 stack.append(entry.name)
242 self.write("(%s\n" % pentry.name)
243 stack.append(pentry.name)
244 self.line = skip_white(line)[1:]
245 line = self.subconvert(
246 "}", len(stack) + depth + 1)[1:]
247 self.write(")%s\n" % stack.pop())
248 elif pentry.type == "content":
252 if entry.outputname and not opened:
254 self.write("(%s\n" % entry.outputname)
255 stack.append(entry.name)
256 line = skip_white(line)
258 raise LaTeXFormatError(
259 "missing content for " + macroname)
261 line = self.subconvert("}", len(stack) + depth + 1)
262 if line and line[0] == "}":
264 elif pentry.type == "text" and pentry.text:
265 if entry.outputname and not opened:
267 stack.append(entry.name)
268 self.write("(%s\n" % entry.outputname)
269 #dbgmsg("--- text: %s" % `pentry.text`)
270 self.write("-%s\n" % encode(pentry.text))
271 elif pentry.type == "entityref":
272 self.write("&%s\n" % pentry.name)
275 self.write("(%s\n" % entry.outputname)
276 stack.append(entry.name)
277 if not implied_content:
278 self.write(")%s\n" % entry.outputname)
281 if line[0] == endchar and not stack:
285 # end of macro or group
286 macroname = stack[-1]
288 conversion = self.table[macroname]
289 if conversion.outputname:
290 # otherwise, it was just a bare group
291 self.write(")%s\n" % conversion.outputname)
296 # don't worry about the "tie" aspect of this command
304 if line[0] == "\\" and line[1] in ESCAPED_CHARS:
305 self.write("-%s\n" % encode(line[1]))
308 if line[:2] == r"\\":
309 self.write("(BREAK\n)BREAK\n")
312 if line[:2] == r"\_":
313 line = "_" + line[2:]
315 if line[:2] in (r"\'", r'\"'):
316 # combining characters...
317 self.combining_char(line[1], line[2])
320 m = _text_rx.match(line)
322 text = encode(m.group())
323 self.write("-%s\n" % text)
324 line = line[m.end():]
326 # special case because of \item[]
327 # XXX can we axe this???
332 # avoid infinite loops
336 raise LaTeXFormatError("could not identify markup: %s%s"
337 % (`line[:100]`, extra))
339 entry = self.get_entry(stack[-1])
341 self.write(")%s\n-%s\n" % (entry.outputname, encode("\n")))
346 raise LaTeXFormatError("elements remain on stack: "
347 + string.join(stack, ", "))
348 # otherwise we just ran out of input here...
350 # This is a really limited table of combinations, but it will have
def combining_char(self, prefix, char):
    """Write the escape for *char* combined with the accent *prefix*.

    The (prefix, char) pair is looked up in self._combinations and the
    resulting ordinal is written as a data line of the form "-\\%NNN;".
    A pair that is not in the table raises KeyError.
    """
    key = (prefix, char)
    code = self._combinations[key]
    self.write("-\\%%%d;\n" % code)
def start_macro(self, name):
    """Look up the conversion entry for *name* and summarize it.

    Returns a 4-tuple ``(parameters, optional, empty, environment)``:
    the entry's parameter list, the ``optional`` value of its first
    parameter, and the entry's ``empty`` and ``environment`` attributes.
    """
    entry = self.get_entry(name)
    params = entry.parameters
    if params:
        optional = params[0].optional
    else:
        # Same result as ``params and params[0].optional``: with no
        # parameters, the (false) parameter list itself is passed back.
        optional = params
    return (params, optional, entry.empty, entry.environment)
368 def get_entry(self, name):
369 entry = self.table.get(name)
371 dbgmsg("get_entry(%s) failing; building default entry!" % `name`)
372 # not defined; build a default entry:
373 entry = TableEntry(name)
374 entry.has_content = 1
375 entry.parameters.append(Parameter("content"))
376 self.table[name] = entry
379 def get_env_entry(self, name):
380 entry = self.table.get(name)
382 # not defined; build a default entry:
383 entry = TableEntry(name, 1)
384 entry.has_content = 1
385 entry.parameters.append(Parameter("content"))
386 entry.parameters[-1].implied = 1
387 self.table[name] = entry
388 elif not entry.environment:
389 raise LaTeXFormatError(
390 name + " is defined as a macro; expected environment")
393 def dump_attr(self, pentry, value):
394 if not (pentry.name and value):
396 if _token_rx.match(value):
400 self.write("A%s %s %s\n" % (pentry.name, dtype, encode(value)))
403 def convert(ifp, ofp, table):
404 c = Conversion(ifp, ofp, table)
407 except IOError, (err, msg):
408 if err != errno.EPIPE:
412 def skip_white(line):
413 while line and line[0] in " %\n\t\r":
414 line = string.lstrip(line[1:])
420 def __init__(self, name, environment=0):
422 self.outputname = name
423 self.environment = environment
424 self.empty = not environment
433 def __init__(self, type, name=None, optional=0):
436 self.optional = optional
441 class TableParser(XMLParser):
442 def __init__(self, table=None):
446 self.__current = None
448 XMLParser.__init__(self)
451 for entry in self.__table.values():
452 if entry.environment and not entry.has_content:
453 p = Parameter("content")
455 entry.parameters.append(p)
456 entry.has_content = 1
459 def start_environment(self, attrs):
461 self.__current = TableEntry(name, environment=1)
462 self.__current.verbatim = attrs.get("verbatim") == "yes"
463 if attrs.has_key("outputname"):
464 self.__current.outputname = attrs.get("outputname")
465 self.__current.endcloses = string.split(attrs.get("endcloses", ""))
466 def end_environment(self):
469 def start_macro(self, attrs):
471 self.__current = TableEntry(name)
472 self.__current.closes = string.split(attrs.get("closes", ""))
473 if attrs.has_key("outputname"):
474 self.__current.outputname = attrs.get("outputname")
476 self.__table[self.__current.name] = self.__current
477 self.__current = None
479 def start_attribute(self, attrs):
480 name = attrs.get("name")
481 optional = attrs.get("optional") == "yes"
483 p = Parameter("attribute", name, optional=optional)
485 p = Parameter("attribute", optional=optional)
486 self.__current.parameters.append(p)
def end_attribute(self):
    """Store the buffered text on the most recently added parameter.

    The accumulated character buffer becomes the ``text`` attribute of
    the last parameter of the entry currently being built.
    """
    buffered = self.__buffer
    latest = self.__current.parameters[-1]
    latest.text = buffered
491 def start_entityref(self, attrs):
493 p = Parameter("entityref", name)
494 self.__current.parameters.append(p)
496 def start_child(self, attrs):
498 p = Parameter("child", name, attrs.get("optional") == "yes")
499 self.__current.parameters.append(p)
500 self.__current.empty = 0
502 def start_content(self, attrs):
503 p = Parameter("content")
504 p.implied = attrs.get("implied") == "yes"
505 if self.__current.environment:
507 self.__current.parameters.append(p)
508 self.__current.has_content = 1
509 self.__current.empty = 0
511 def start_text(self, attrs):
512 self.__current.empty = 0
515 p = Parameter("text")
516 p.text = self.__buffer
517 self.__current.parameters.append(p)
def handle_data(self, data):
    """Append the character data *data* to the accumulated buffer."""
    combined = self.__buffer + data
    self.__buffer = combined
523 def load_table(fp, table=None):
524 parser = TableParser(table=table)
525 parser.feed(fp.read())
527 return parser.get_table()
533 opts, args = getopt.getopt(sys.argv[1:], "D", ["debug"])
534 for opt, arg in opts:
535 if opt in ("-D", "--debug"):
545 ofp = open(args[1], "w")
550 table = load_table(open(os.path.join(sys.path[0], 'conversion.xml')))
551 convert(ifp, ofp, table)
554 if __name__ == "__main__":