bindings/python/codegen/docextract.py

   1 # -*- Mode: Python; py-indent-offset: 4 -*-
   2 '''Simple module for extracting GNOME style doc comments from C
   3 sources, so I can use them for other purposes.'''
   4
   5 import sys, os, string, re
   6
# When True, the "Since: ..." portion of the gtkdoc function description is
# omitted.  This is useful for some C++ modules, such as gstreamermm, that
# wrap a C API which is still unstable, where including this information
# would not be useful.
# This variable is modified from docextract_to_xml based on the --no-since
# option being specified.
no_since = False

# Names exported by 'from docextract import *'.
__all__ = ['extract']
  16
  17 class GtkDoc:
  18     def __init__(self):
  19         self.name = None
  20         self.block_type = '' # The block type ('function', 'signal', 'property')
  21         self.params = []
  22         self.annotations = []
  23         self.description = ''
  24         self.ret = ('', []) # (return, annotations)
  25     def set_name(self, name):
  26         self.name = name
  27     def set_type(self, block_type):
  28         self.block_type = block_type
  29     def get_type(self):
  30         return self.block_type
  31     def add_param(self, name, description, annotations=[]):
  32         if name == '...':
  33             name = 'Varargs'
  34         self.params.append((name, description, annotations))
  35     def append_to_last_param(self, extra):
  36         self.params[-1] = (self.params[-1][0], self.params[-1][1] + extra,
  37             self.params[-1][2])
  38     def append_to_named_param(self, name, extra):
  39         for i in range(len(self.params)):
  40             if self.params[i][0] == name:
  41                 self.params[i] = (name, self.params[i][1] + extra,
  42                     self.params[i][2])
  43                 return
  44         # fall through to adding extra parameter ...
  45         self.add_param(name, extra)
  46     def add_annotation(self, annotation):
  47         self.annotations.append(annotation)
  48     def get_annotations(self):
  49         return self.annotations
  50     def append_to_description(self, extra):
  51         self.description = self.description + extra
  52     def get_description(self):
  53         return self.description
  54     def add_return(self, first_line, annotations=[]):
  55         self.ret = (first_line, annotations)
  56     def append_to_return(self, extra):
  57         self.ret = (self.ret[0] + extra, self.ret[1])
  58
# Opening line of a doc comment block: optional whitespace, '/**' and one
# whitespace character.
comment_start_pattern = re.compile(r'^\s*/\*\*\s')
# Closing line of a comment block: optional whitespace, one or more '*' and
# then '/'.
comment_end_pattern = re.compile(r'^\s*\*+/')
# The leading ' * ' of a line inside a comment block (removed before a line is
# examined or stored).
comment_line_lead_pattern = re.compile(r'^\s*\*\s*')
# A line that holds nothing but whitespace and '*' characters.
comment_empty_line_pattern = re.compile(r'^\s*\**\s*$')
# Identifier lines.  Group 1 is the identifier; group 2, when present, is the
# last '(...)' annotation group that follows it.
function_name_pattern = re.compile(r'^([a-z]\w*)\s*:?(\s*\(.*\)\s*){0,2}\s*$')
signal_name_pattern = re.compile(r'^([A-Z]\w+::[a-z0-9-]+)\s*:?(\s*\(.*\)\s*){0,2}\s*$')
property_name_pattern = re.compile(r'^([A-Z]\w+:[a-z0-9-]+)\s*:?(\s*\(.*\)\s*){0,2}\s*$')
# 'Returns:' / 'Return value:' (optionally preceded by '@'); group 2 is the
# rest of the line.
return_pattern = re.compile(r'^@?(returns:|return\s+value:)(.*\n?)$', re.IGNORECASE)
# A whole 'Deprecated: ...' line.
deprecated_pattern = re.compile(r'^(deprecated\s*:\s*.*\n?)$', re.IGNORECASE)
# 'Rename to: ...'; group 1 is the 'rename to' phrase and group 2 the value.
rename_to_pattern = re.compile(r'^(rename\s+to)\s*:\s*(.*\n?)$', re.IGNORECASE)
# '@name: text'; group 1 is the parameter name and group 2 the text after the
# colon.
param_pattern = re.compile(r'^@(\S+)\s*:(.*\n?)$')
# Used to extract the annotations in the parameter and return descriptions
# extracted using above [param|return]_pattern patterns.
annotations_pattern = re.compile(r'^(?:(\s*\(.*\)\s*)*:)')
# Used to construct the annotation lists.  Group 1 is the text between the
# first pair of parentheses, without surrounding whitespace.
annotation_lead_pattern = re.compile(r'^\s*\(\s*(.*?)\s*\)\s*')

# These patterns determine the identifier of the current comment block.  They
# are grouped in a list for easy determination of block identifiers (in
# skip_to_identifier).  The function_name_pattern is kept last, after the more
# specific signal and property patterns.
# NOTE(review): as written, function_name_pattern requires a lowercase first
# character while the signal and property patterns require an uppercase one,
# so the order does not appear to change the result -- confirm before
# reordering.
identifier_patterns = [ signal_name_pattern, property_name_pattern, function_name_pattern ]

# This pattern is to match return sections that forget to have a colon (':')
# after the initial 'Return' phrase.  It is not included by default in the list
# of final sections below because a lot of function descriptions begin with
# 'Returns ...' and the process_description() function would stop right at that
# first line, thinking it is a return section.
no_colon_return_pattern = re.compile(r'^@?(returns|return\s+value)\s*(.*\n?)$', re.IGNORECASE)
# A whole 'Since: ...' line.
since_pattern = re.compile(r'^(since\s*:\s*.*\n?)$', re.IGNORECASE)

# These patterns normally will be encountered after the description.  Knowing
# the order of their appearance is difficult so this list is used to test when
# one begins and the other ends when processing the rest of the sections after
# the description.
final_section_patterns = [ return_pattern, since_pattern, deprecated_pattern, rename_to_pattern ]
  95
  96 def parse_file(fp, doc_dict):
  97     line = fp.readline()
  98     while line:
  99         cur_doc = GtkDoc()
 100         line = skip_to_comment_block(fp, line)
 101         line = skip_to_identifier(fp, line, cur_doc)
 102         # See if the identifier is found (stored in the current GtkDoc by
 103         # skip_to_identifier).  If so, continue reading the rest of the comment
 104         # block.
 105         if cur_doc.name:
 106             line = process_params(fp, line, cur_doc)
 107             line = process_description(fp, line, cur_doc)
 108             line = process_final_sections(fp, line, cur_doc)
 109             # Add the current doc block to the dictionary of doc blocks.
 110             doc_dict[cur_doc.name] = cur_doc
 111
 112 # Given a list of annotations as string of the form
 113 # '(annotation1) (annotation2) ...' return a list of annotations of the form
 114 # [ (name1, value1), (name2, value2) ... ].  Not all annotations have values so
 115 # the values in the list of tuples could be empty ('').
 116 def get_annotation_list(annotations):
 117     annotation_list = []
 118     while annotations:
 119         match = annotation_lead_pattern.match(annotations)
 120         if match:
 121             annotation_contents = match.group(1)
 122             name, split, value = annotation_contents.strip().partition(' ')
 123             annotation_list.append((name, value))
 124             # Remove first occurrence to continue processing.
 125             annotations = annotation_lead_pattern.sub('', annotations)
 126         else:
 127             break
 128     return annotation_list
 129
 130 # Given a currently read line, test that line and continue reading until the
 131 # beginning of a comment block is found or eof is reached.  Return the last
 132 # read line.
 133 def skip_to_comment_block(fp, line):
 134     while line:
 135         if comment_start_pattern.match(line):
 136             break
 137         line = fp.readline()
 138     return line
 139
 140 # Given the current line in a comment block, continue skipping lines until a
 141 # non-blank line in the comment block is found or until the end of the block
 142 # (or eof) is reached.  Returns the line where reading stopped.
 143 def skip_to_nonblank(fp, line):
 144     while line:
 145         if not comment_empty_line_pattern.match(line):
 146             break
 147         line = fp.readline()
 148         # Stop processing if eof or end of comment block is reached.
 149         if not line or comment_end_pattern.match(line):
 150             break
 151     return line
 152
 153 # Given the first line of a comment block (the '/**'), see if the next
 154 # non-blank line is the identifier of the comment block.  Stop processing if
 155 # the end of the block or eof is reached.  Store the identifier (if there is
 156 # one) and its type ('function', 'signal' or 'property') in the given GtkDoc.
 157 # Return the line where the identifier is found or the line that stops the
 158 # processing (if eof or the end of the comment block is found first).
 159 def skip_to_identifier(fp, line, cur_doc):
 160     # Skip the initial comment block line ('/**') if not eof.
 161     if line: line = fp.readline()
 162
 163     # Now skip empty lines.
 164     line = skip_to_nonblank(fp, line)
 165
 166     # See if the first non-blank line is the identifier.
 167     if line and not comment_end_pattern.match(line):
 168         # Remove the initial ' * ' in comment block line and see if there is an
 169         # identifier.
 170         line = comment_line_lead_pattern.sub('', line)
 171         for pattern in identifier_patterns:
 172             match = pattern.match(line)
 173             if match:
 174                 # Set the GtkDoc name.
 175                 cur_doc.set_name(match.group(1))
 176                 # Get annotations and add them to the GtkDoc.
 177                 annotations = get_annotation_list(match.group(2))
 178                 for annotation in annotations:
 179                     cur_doc.add_annotation(annotation)
 180                 # Set the GtkDoc type.
 181                 if pattern == signal_name_pattern:
 182                     cur_doc.set_type('signal')
 183                 elif pattern == property_name_pattern:
 184                     cur_doc.set_type('property')
 185                 elif pattern == function_name_pattern:
 186                     cur_doc.set_type('function')
 187                 return line
 188     return line
 189
 190 # Given a currently read line (presumably the identifier line), read the next
 191 # lines, testing to see if the lines are part of parameter descriptions.  If
 192 # so, store the parameter descriptions in the given doc block.  Stop on eof and
 193 # return the last line that stops the processing.
 194 def process_params(fp, line, cur_doc):
 195     # Skip the identifier line if not eof.  Also skip any blank lines in the
 196     # comment block.  Return if eof or the end of the comment block are
 197     # encountered.
 198     if line: line = fp.readline()
 199     line = skip_to_nonblank(fp, line)
 200     if not line or comment_end_pattern.match(line):
 201         return line
 202
 203     # Remove initial ' * ' in first non-empty comment block line.
 204     line = comment_line_lead_pattern.sub('', line)
 205
 206     # Now process possible parameters as long as no eof or the end of the
 207     # param section is not reached (which could be triggered by anything that
 208     # doesn't match a '@param:..." line, even the end of the comment block).
 209     match = param_pattern.match(line)
 210     while line and match:
 211         description = match.group(2)
 212
 213         # First extract the annotations from the description and save them.
 214         annotations = []
 215         annotation_match = annotations_pattern.match(description)
 216         if annotation_match:
 217             annotations = get_annotation_list(annotation_match.group(1))
 218             # Remove the annotations from the description
 219             description = annotations_pattern.sub('', description)
 220
 221         # Default to appending lines to current parameter.
 222         append_func = cur_doc.append_to_last_param
 223
 224         # See if the return has been included as part of the parameter
 225         # section and make sure that lines are added to the GtkDoc return if
 226         # so.
 227         if match.group(1).lower() == "returns":
 228             cur_doc.add_return(description, annotations)
 229             append_func = cur_doc.append_to_return
 230         # If not, just add it as a regular parameter.
 231         else:
 232             cur_doc.add_param(match.group(1), description, annotations)
 233
 234         # Now read lines and append them until next parameter, beginning of
 235         # description (an empty line), the end of the comment block or eof.
 236         line = fp.readline()
 237         while line:
 238             # Stop processing if end of comment block or a blank comment line
 239             # is encountered.
 240             if comment_empty_line_pattern.match(line) or \
 241                     comment_end_pattern.match(line):
 242                 break
 243
 244             # Remove initial ' * ' in comment block line.
 245             line = comment_line_lead_pattern.sub('', line)
 246
 247             # Break from current param processing if a new one is
 248             # encountered.
 249             if param_pattern.match(line): break;
 250
 251             # Otherwise, just append the current line and get the next line.
 252             append_func(line)
 253             line = fp.readline()
 254
 255         # Re-evaluate match for while condition
 256         match = param_pattern.match(line)
 257
 258     # End by returning the current line.
 259     return line
 260
 261 # Having processed parameters, read the following lines into the description of
 262 # the current doc block until the end of the comment block, the end of file or
 263 # a return section is encountered.
 264 def process_description(fp, line, cur_doc):
 265     # First skip empty lines returning on eof or end of comment block.
 266     line = skip_to_nonblank(fp, line)
 267     if not line or comment_end_pattern.match(line):
 268         return line
 269
 270     # Remove initial ' * ' in non-empty comment block line.
 271     line = comment_line_lead_pattern.sub('', line)
 272
 273     # Also remove possible 'Description:' prefix.
 274     if line[:12] == 'Description:': line = line[12:]
 275
 276     # Used to tell if the previous line was blank and a return section
 277     # uncommonly marked with 'Returns ...' instead of 'Returns: ...'  has
 278     # started (assume it is non-empty to begin with).
 279     prev_line = 'non-empty'
 280
 281     # Now read lines until a new section (like a return or a since section) is
 282     # encountered.
 283     while line:
 284         # See if the description section has ended (if the line begins with
 285         # 'Returns ...' and the previous line was empty -- this loop replaces
 286         # empty lines with a newline).
 287         if no_colon_return_pattern.match(line) and prev_line == '\n':
 288             return line
 289         # Or if one of the patterns of the final sections match
 290         for pattern in final_section_patterns:
 291             if pattern.match(line):
 292                 return line
 293
 294         # If not, append lines to description in the doc comment block.
 295         cur_doc.append_to_description(line)
 296
 297         prev_line = line
 298         line = fp.readline()
 299
 300         # Stop processing on eof or at the end of comment block.
 301         if not line or comment_end_pattern.match(line):
 302             return line
 303
 304         # Remove initial ' * ' in line so that the text can be appended to the
 305         # description of the comment block and make sure that if the line is
 306         # empty it be interpreted as a newline.
 307         line = comment_line_lead_pattern.sub('', line)
 308         if not line: line = '\n'
 309
 310 # Given the line that ended the description (the first line of one of the final
 311 # sections) process the final sections ('Returns:', 'Since:', etc.) until the
 312 # end of the comment block or eof.  Return the line that ends the processing.
 313 def process_final_sections(fp, line, cur_doc):
 314     while line and not comment_end_pattern.match(line):
 315         # Remove leading ' * ' from current non-empty comment line.
 316         line = comment_line_lead_pattern.sub('', line)
 317         # Temporarily append the no colon return pattern to the final section
 318         # patterns now that the description has been processed.  It will be
 319         # removed after the for loop below executes so that future descriptions
 320         # that begin with 'Returns ...' are not interpreted as a return
 321         # section.
 322         final_section_patterns.append(no_colon_return_pattern)
 323         for pattern in final_section_patterns:
 324             match = pattern.match(line)
 325             if match:
 326                 if pattern == return_pattern or \
 327                         pattern == no_colon_return_pattern:
 328                     # Dealing with a 'Returns:' so first extract the
 329                     # annotations from the description and save them.
 330                     description = match.group(2)
 331                     annotations = []
 332                     annotation_match = \
 333                             annotations_pattern.match(description)
 334                     if annotation_match:
 335                         annotations = \
 336                                 get_annotation_list(annotation_match.group(1))
 337                         # Remove the annotations from the description
 338                         description = annotations_pattern.sub('', description)
 339
 340                     # Now add the return.
 341                     cur_doc.add_return(description, annotations)
 342                     # In case more lines need to be appended.
 343                     append_func = cur_doc.append_to_return
 344                 elif pattern == rename_to_pattern:
 345                     # Dealing with a 'Rename to:' section (GObjectIntrospection
 346                     # annotation) so no further lines will be appended but this
 347                     # single one (and only to the annotations).
 348                     append_func = None
 349                     cur_doc.add_annotation((match.group(1),
 350                             match.group(2)))
 351                 else:
 352                     # For all others ('Since:' and 'Deprecated:') just append
 353                     # the line to the description for now.
 354                     # But if --no-since is specified, don't append it.
 355                     if no_since and pattern == since_pattern:
 356                         pass
 357                     else:
 358                         cur_doc.append_to_description(line)
 359
 360                     # In case more lines need to be appended.
 361                     append_func = cur_doc.append_to_description
 362
 363                 # Stop final section pattern matching for loop since a match
 364                 # has already been found.
 365                 break
 366
 367         # Remove the no colon return pattern (which was temporarily added in
 368         # the just executed loop) from the list of final section patterns.
 369         final_section_patterns.pop()
 370
 371         line = fp.readline()
 372
 373         # Now continue appending lines to current section until a new one is
 374         # found or an eof or the end of the comment block is encountered.
 375         finished = False
 376         while not finished and line and \
 377                 not comment_end_pattern.match(line):
 378             # Remove leading ' * ' from line and make sure that if it is empty,
 379             # it be interpreted as a newline.
 380             line = comment_line_lead_pattern.sub('', line)
 381             if not line: line = '\n'
 382
 383             for pattern in final_section_patterns:
 384                 if pattern.match(line):
 385                     finished = True
 386                     break
 387
 388             # Break out of loop if a new section is found (determined in above
 389             # inner loop).
 390             if finished: break
 391
 392             # Now it's safe to append line.
 393             if append_func: append_func(line)
 394
 395             # Get the next line to continue processing.
 396             line = fp.readline()
 397
 398     return line
 399
 400 def parse_dir(dir, doc_dict):
 401     for file in os.listdir(dir):
 402         if file in ('.', '..'): continue
 403         path = os.path.join(dir, file)
 404         if os.path.isdir(path):
 405             parse_dir(path, doc_dict)
 406         if len(file) > 2 and file[-2:] == '.c':
 407             sys.stderr.write("Processing " + path + '\n')
 408             parse_file(open(path, 'r'), doc_dict)
 409
 410 def extract(dirs, doc_dict=None):
 411     if not doc_dict: doc_dict = {}
 412     for dir in dirs:
 413         parse_dir(dir, doc_dict)
 414     return doc_dict
 415
 416 tmpl_section_pattern = re.compile(r'^<!-- ##### (\w+) (\w+) ##### -->$')
 417 def parse_tmpl(fp, doc_dict):
 418     cur_doc = None
 419
 420     line = fp.readline()
 421     while line:
 422         match = tmpl_section_pattern.match(line)
 423         if match:
 424             cur_doc = None  # new input shouldn't affect the old doc dict
 425             sect_type = match.group(1)
 426             sect_name = match.group(2)
 427
 428             if sect_type == 'FUNCTION':
 429                 cur_doc = doc_dict.get(sect_name)
 430                 if not cur_doc:
 431                     cur_doc = GtkDoc()
 432                     cur_doc.set_name(sect_name)
 433                     doc_dict[sect_name] = cur_doc
 434         elif line == '<!-- # Unused Parameters # -->\n':
 435             cur_doc = None # don't worry about unused params.
 436         elif cur_doc:
 437             if line[:10] == '@Returns: ':
 438                 if string.strip(line[10:]):
 439                     cur_doc.append_to_return(line[10:])
 440             elif line[0] == '@':
 441                 pos = string.find(line, ':')
 442                 if pos >= 0:
 443                     cur_doc.append_to_named_param(line[1:pos], line[pos+1:])
 444                 else:
 445                     cur_doc.append_to_description(line)
 446             else:
 447                 cur_doc.append_to_description(line)
 448
 449         line = fp.readline()
 450
 451 def extract_tmpl(dirs, doc_dict=None):
 452     if not doc_dict: doc_dict = {}
 453     for dir in dirs:
 454         for file in os.listdir(dir):
 455             if file in ('.', '..'): continue
 456             path = os.path.join(dir, file)
 457             if os.path.isdir(path):
 458                 continue
 459             if len(file) > 2 and file[-2:] == '.sgml':
 460                 parse_tmpl(open(path, 'r'), doc_dict)
 461     return doc_dict