murphy: abnf.py script now uses both the flex and bison files to generate ABNF
author Janos Kovacs <jankovac503@gmail.com>
Sat, 21 Apr 2012 16:43:24 +0000 (19:43 +0300)
committer Janos Kovacs <jankovac503@gmail.com>
Sat, 21 Apr 2012 16:43:24 +0000 (19:43 +0300)
* flex support added for the automatic document generation, i.e. ABNF is
  generated from the regexps in the definition section
* a more correct ABNF description is generated
* the output is directly includable into a DocBook XML file

doc/scripts/abnf.py

index 74ca065..8691ef1 100755 (executable)
 
 import os, sys, re
 
-def canonic_optlist(olist):
-    canonic  = ""
-    sep = ""
-    if len(olist) > 0:
-        members = olist.split(' ')
-        nmember = len(members)
-        if nmember > 0:
-            for member in members:
-                tag = member.strip()
-                if len(tag) < 1:
+def regexp_to_abnf(string):
+    """Return the ABNF text for a regular expression.
+
+    Runs regexp_parse() from offset 0 and keeps only the text part of the
+    (text, index) pair it returns.
+    """
+    return regexp_parse(string, 0)[0]
+
+def regexp_parse(string, index):
+    """Translate a regular expression into ABNF text.
+
+    Scanning starts at string[index] and ends at the end of the string or
+    at the first ")" this call meets itself, which is what lets the
+    function recurse for "(...)" groups.
+
+    Returns a (result, index) tuple: the ABNF text and the position at
+    which scanning stopped.
+    """
+    # Pending fragments as (precedence, ABNF text) pairs; regexp_merge()
+    # folds them together whenever a new fragment is about to be added.
+    stack = []
+    precedence = 0
+    escape = False
+
+    while index < len(string):
+        c = string[index]
+
+        # Set for the repetition operators: their ABNF form is a prefix, so
+        # it is inserted in front of the previous fragment, not appended.
+        insert = False
+
+        if c == "\\":
+            escape = True
+        else:
+            if c.isalnum() or escape:
+                # Literal character (alphanumeric or backslash-escaped).
+                if c == "\"":
+                    new_string = "DQUOTE"
+                elif c == " ":
+                    new_string = "SP"
+                else:
+                    new_string = "\"%s\"" % c
+                new_precedence = 100
+                escape = False
+            elif c == ".":
+                new_precedence = 90
+                new_string = "VCHAR"
+            elif c == "{":
+                new_precedence = 100
+                if index+1 >= len(string) or string.find("}", index+1) < 0:
+                    # No closing brace: treat "{" as a literal character.
+                    new_string = "\"{\""
+                else:
+                    # "{name}": copy the text between the braces verbatim
+                    # and continue scanning after the closing brace.
+                    begin = index+1
+                    end = string.find("}", begin)
+                    new_string = string[begin:end]
+                    index = end
+            elif c == "(":
+                new_precedence = 80
+                # The nested call returns with index on the matching ")"
+                # (or at the end of the string).
+                result, index = regexp_parse(string, index+1)
+                if len(result) < 1:
+                    index += 1
                     continue
-                canonic += sep
-                sep = " "
-                if tag.startswith("TKN_"):
-                    canonic += tag[4:]
+                new_string = "(" + result + ")"
+            elif c == ")":
+                break 
+            elif c == "[":
+                new_precedence = 70
+                new_string, index = regexp_character_class(string, index)
+            elif c == "*":
+                new_precedence = 43
+                new_string = "*"
+                insert = True
+            elif c == "+":
+                new_precedence = 42
+                new_string = "1*"
+                insert = True
+            elif c == "?":
+                new_precedence = 40
+                new_string = "0*1"
+                insert = True
+            elif c == "|":
+                new_precedence = 30
+                new_string = "/"
+            else:
+                new_precedence = 100
+                if c == "\"":
+                    new_string = "DQUOTE"
+                elif ord(c) < ord(' '):
+                    # Control character: emit it as an ABNF hex value such
+                    # as %x9.  The literal percent sign has to be doubled;
+                    # with a single one the format string expects two
+                    # arguments and raises TypeError.
+                    new_string = "%%x%x" % ord(c)
                 else:
-                    canonic += tag
-    return canonic
+                    new_string = "\"" + c + "\""
+
+            if insert:
+                # Put the repetition prefix in front of the last fragment,
+                # giving it that fragment's precedence, then merge.
+                last = len(stack) - 1
+                if last >= 0:
+                    stack.insert(last, (stack[last][0], new_string))
+                    stack = regexp_merge(new_precedence, stack)
+            else:
+                stack = regexp_merge(new_precedence, stack)
+                stack.append((new_precedence, new_string))
 
+            precedence = new_precedence
+        index += 1
 
-def canonic_complist(clist): 
-    stripped = clist.strip()
-    canonic  = ""
+    if len(stack) < 1:
+        result = ""
+    else:
+        # Precedence -1 is below every fragment, so the whole stack is
+        # folded into a single entry.
+        result = regexp_merge(-1, stack)[0][1]
+
+
+    # Drop one pair of enclosing parentheses, but only when the leading "("
+    # really matches the trailing ")": the text in between must be balanced
+    # and must never close more parentheses than it has opened.
+    if result.startswith("(") and result.endswith(")"):
+        strip = True
+        balance = 0
+        for c in result[1:-1]:
+            if c == "(":
+                balance += 1
+            elif c == ")":
+                balance -= 1
+                if balance < 0:
+                    strip = False
+                    break
+        if strip and balance == 0:
+            result = result[1:-1]
+
+    return (result, index)
+
+def regexp_character_class(string, index):
+    """Translate a "[...]" character class into ABNF alternatives.
+
+    string[index] is expected to be the opening "[" (regexp_parse() calls
+    this when it meets one).  Members are joined with " / " and the whole
+    result is parenthesised when there is more than one member.
+
+    Returns a (result, index) tuple; index is the position of the closing
+    "]", or of the last character examined when there is none.
+    """
+    cnt = 0
+    result = ""
+    backslash = False
     sep = ""
-    if len(stripped) > 0:
-        options = stripped.split('|')
-        noption = len(options)
-        if noption == 1:
-            stripped_option = options[0].strip()
-            canonic += canonic_optlist(stripped_option)
-        elif noption > 1:
-            if len(options[0].strip()) < 1:
-                canonic = "["
-                close   = "]"
+
+    if string[index+1] == "^":
+        # Negated class: start from the character codes 32..126 and remove
+        # every listed character, splitting ranges as needed.
+        index += 1
+        ranges = [(32, 126)]
+        escape = False
+        while index+1 < len(string):
+            index += 1
+            char = string[index]
+
+            if char == "]":
+                break
+
+            if char == "\\":
+                escape = True
+                continue
+            
+            c = ord(char)
+
+            if escape:
+                # "\n" and "\t" stand for character codes 10 and 9.
+                if char == "n":
+                    c = 10
+                if char == "t":
+                    c = 9
+                escape = False
+
+            # Cut c out of the range that contains it.  Every branch leaves
+            # the loop right after the list has been modified.
+            for r in ranges:
+                if c >= r[0] and c <= r[1]:
+                    ranges.remove(r)
+                    if c == r[0]:
+                        if c != r[1]:
+                            ranges.append((r[0]+1, r[1]))
+                        break
+                    elif c == r[1]:
+                        if c != r[0]:
+                            ranges.append((r[0], r[1]-1))
+                        break
+                    else:
+                        ranges.append((r[0], c-1))
+                        ranges.append((c+1, r[1]))
+                        break
+
+        # Look for ranges that still cover all of A-Z (65..90), a-z
+        # (97..122) or 0-9 (48..57); the matching tuple doubles as a flag.
+        lower = False
+        upper = False
+        digit = False
+        for r in ranges:
+            if r[0] <= 65 and r[1] >= 90:
+                upper = r
+            if r[0] <= 97 and r[1] >= 122:
+                lower = r
+            if r[0] <= 48 and r[1] >= 57:
+                digit = r
+        # ALPHA is emitted only when both letter cases survived intact.
+        if lower and upper:
+            ranges = regexp_range_extract(ranges, 97,122)
+            ranges = regexp_range_extract(ranges, 65,90)
+            result += sep + "ALPHA"
+            sep = " / "
+            cnt += 1
+        if digit:
+            ranges = regexp_range_extract(ranges, 48,57)
+            result += sep + "DIGIT"
+            sep = " / "
+            cnt += 1
+        # Whatever is left is written as hex values in ascending order.
+        ranges.sort(regexp_range_sort)
+        for r in ranges:
+            if r[0] == r[1]:
+                # "%" binds tighter than "+": a literal "%x" followed by
+                # the value formatted in hex.
+                result += sep + "%x" + "%x" % r[0]
             else:
-                canonic = "("
-                close   = ")"
-            for option in options:
-                stripped_option = option.strip()
-                if len(stripped_option) > 0:
-                    canonic += sep + canonic_optlist(stripped_option)
-                    sep = "|"
-            canonic += close
-    return canonic
+                result += sep + "%x" + "%x-%x" % r
+            sep = " / "
+            cnt += 1
+    else:
+        # Plain class: translate the members one by one.
+        while index+1 < len(string):
+            index += 1
+            c = string[index]
+
+            if c == "]":
+                break
+
+            if c == "\\":
+                # The first backslash only arms the escape.  A backslash
+                # met while the escape is armed falls through and is
+                # appended unquoted, with the escape still armed.
+                if not backslash:
+                    backslash = True
+                    continue
+            else:
+                if backslash:
+                    backslash = False
+                    c = regexp_escape(c)
+                elif c == " ":
+                    c = "SP"
+                elif c == "\"":
+                    c = "DQUOTE"
+                else:
+                    if string[index:index+3] == "0-9":
+                        index += 2
+                        c = "DIGIT"
+                    elif string[index:index+6] == "a-zA-Z" or \
+                            string[index:index+6] == "A-Za-z":
+                        index += 5
+                        c = "ALPHA"
+                    elif index < len(string)-2 and string[index+1] == "-" and \
+                            ((string[index].isalpha() and \
+                              string[index+2].isalpha()) or \
+                             (string[index].isdigit() and \
+                              string[index+2].isdigit())):
+                        # Any other letter or digit range is expanded into
+                        # one quoted alternative per character.
+                        index += 2
+                        start = ord(c)
+                        end = ord(string[index])+1
+                        sep2 = ""
+                        c = ""
+
+                        for i in range(start, end):
+                            c += sep2 +  "\"" + chr(i) + "\""
+                            sep2 = " / "
+                    else:
+                        c = "\"" + c + "\""
+
+            result += sep + c
+            sep = " / "
+            cnt += 1
+
+    if cnt > 1:
+        result = "(" + result + ")"
+
+    return (result, index)
+
+def regexp_escape(char):
+    """Return the ABNF text for the character that followed a backslash.
+
+    "n", "t", space and the double quote map to the names CRLF, HTAB, SP
+    and DQUOTE; every other character is returned as a quoted literal.
+    """
+    # NOTE(review): "\n" is a bare line feed, yet it is mapped to CRLF here
+    # -- confirm that this is intended.
+    if char == "n":
+        return "CRLF"
+    elif char == "t":
+        return "HTAB"
+    elif char == " ":
+        return "SP"
+    elif char == "\"":
+        return "DQUOTE"
+    elif char == "\\":
+        return "\"\\\""
+    elif char == "^":
+        return "\"^\""
+
+    return "\"" + char + "\""
+
+
+def regexp_merge(new_precedence, stack):
+    """Fold the top entries of stack into a single entry.
+
+    stack holds (precedence, text) pairs.  The fold starts at the topmost
+    entry whose precedence does not exceed new_precedence (or at the
+    bottom of the stack when there is none) and covers everything above
+    it.  The folded entries are replaced by one entry that carries
+    new_precedence.
+
+    The list is modified in place and also returned; an empty stack is
+    returned untouched.
+    """
+    last = len(stack) - 1
+    precedence = 0
+
+    if last >= 0:
+        # Walk down from the top; merge ends up as the index of the first
+        # entry to fold.
+        for merge in range(last,-1,-1):
+            precedence = stack[merge][0]
+
+            if new_precedence >= precedence:
+                break
+
+        append = False
+        string = ""
+        sep = ""
+
+        for i in range(merge,last+1):
+            element = stack[i][1]
+            string += sep + element
+            # No blank after an element whose first "*" is its last
+            # character (the repetition prefixes "*" and "1*"), so the
+            # prefix sticks to the element that follows it.
+            if element.find("*") == len(element)-1:
+                sep = ""
+            else:
+                sep = " "
+            append = True
 
 
-def print_canonic_rules(toprules):
-    print terminals + "\n"
-    for rule in toprules:
-        print canonic_rule(toprules, rule, 0, " ") + "\n"
+        # Remove the folded entries from the top of the stack.
+        for i in range(last,merge-1,-1):
+            stack.pop()
 
+        if append:
+            stack.append((new_precedence, string))
 
-def canonic_rule(toprules, name, rdepth, gap):
-    canonic = ""
-    stripped_name = name.strip()
-    toplevel = name in toprules 
+    return stack
 
-    if stripped_name in rules:
-        rule = rules[stripped_name]
-        if toplevel:
-            canonic += "<" + name + "> ="
-        if rule.startswith("(") and rule.endswith(")"):
-            stripped_rule = rule[1:-1].strip()
+def regexp_range_extract(ranges, l,h):
+    """Remove the interval l..h from the first range that contains it.
+
+    ranges is a list of inclusive (low, high) tuples.  The containing
+    range is replaced by whatever is left of it below l and above h; the
+    leftovers are appended at the end of the list.  The list is modified
+    in place and also returned.
+    """
+    for r in ranges:
+        if r[0] <= l and r[1] >= h:
+            ranges.remove(r)
+            if l == r[0] and h < r[1]:
+                ranges.append((h+1, r[1]))
+            elif l > r[0] and h == r[1]:
+                ranges.append((r[0], l-1))
+            elif l > r[0] and h < r[1]:
+                ranges.append((r[0],l-1))
+                ranges.append((h+1,r[1])) 
+            # Stop right away: the list was changed while iterating over it.
+            break
+    return ranges
+
+def regexp_range_sort(a,b):
+    """Comparison function ordering ranges by their first element.
+
+    Returns -1, +1 or 0 in the style of a cmp() function.
+    """
+    if a[0] < b[0]:
+        return -1
+    elif a[0] > b[0]:
+        return +1
+    return 0
+
+
+def component_list(input_list):
+    """Normalise a blank-separated list of component names.
+
+    Empty items are dropped, a leading "TKN_" is removed from each name and
+    the names are joined again with single blanks.
+    """
+    output_list  = ""
+    sep = ""
+    if len(input_list) > 0:
+        components = input_list.split(' ')
+        ncomponent = len(components)
+        if ncomponent > 0:
+            for component in components:
+                name = component.strip()
+                if len(name) < 1:
+                    continue
+                output_list += sep
+                sep = " "
+                if name.startswith("TKN_"):
+                    output_list += name[4:]
+                else:
+                    output_list += name
+    return output_list
+
+
+def rule_list(rule_def): 
+    """Normalise a rule definition made of "|"-separated alternatives.
+
+    A single alternative is returned as its normalised component list.
+    Several alternatives are joined with "|" and wrapped in "[...]" when
+    the first alternative is empty, or in "(...)" otherwise; empty
+    alternatives are left out of the joined text.
+    """
+    stripped_rule_def = rule_def.strip()
+    rlist = ""
+    sep = ""
+    if len(stripped_rule_def) > 0:
+        rules = stripped_rule_def.split('|')
+        nrule = len(rules)
+        if nrule == 1:
+            stripped_rule = rules[0].strip()
+            rlist += component_list(stripped_rule)
+        elif nrule > 1:
+            if len(rules[0].strip()) < 1:
+                rlist = "["
+                close = "]"
+            else:
+                rlist = "("
+                close = ")"
+            for rule in rules:
+                stripped_rule = rule.strip()
+                if len(stripped_rule) > 0:
+                    rlist += sep + component_list(stripped_rule)
+                    sep = "|"
+            rlist += close
+    return rlist
+
+
+def print_abnf_rules():
+    """Print the collected ABNF rules inside a CDATA section.
+
+    Reads the module-level list abnf of (name, definition) pairs and the
+    module-level line_width.  Rule names are padded to the longest name and
+    definitions are wrapped word by word.
+    """
+    name_len = 0
+    prologue = "<![CDATA["
+    epilogue = "]]>"
+    extra_linefeed = ""
+
+    for rule in abnf:
+        name_len = max(len(rule[0]), name_len)
+
+    for rule in abnf:
+        line   = rule[0].ljust(name_len) + " ="
+        words  = rule[1].split(' ')
+        margin = len(line)
+        width  = margin
+
+        for word in words:
+            wl = len(word) + 1
+            if width + wl > line_width:
+                # Start a continuation line: a newline padded with blanks.
+                line += "\n".ljust(margin+2)
+                width = margin
+            line += " " + word
+            width += wl
+
+        # Rules with an all-uppercase name are printed back to back; any
+        # other rule is followed by an extra linefeed, and the first one
+        # after an uppercase rule (or at the start) is preceded by one too.
+        if rule[0].isupper():
+            extra_linefeed = ""
+        else:
+            if len(extra_linefeed) < 1:
+                print "\n"
+            extra_linefeed = "\n"
+        
+        # Only the first printed rule carries the CDATA opening.
+        print prologue + line + extra_linefeed
+        prologue = ""
+    print epilogue
+
+def make_abnf_rules(topresults):
+    """Append a (name, ABNF text) pair to the module-level abnf list for
+    every name in topresults, using abnf_rule() to build the text."""
+    for result in topresults:
+        abnf.append( (result, abnf_rule(topresults, result, 0, " ")) )
+
+
+def abnf_rule(topresults, result, rdepth, gap):
+    canonic   = ""
+    component = result.strip()
+    toplevel  = component in topresults 
+
+    if component in results:
+        rule_list = results[component]
+        if rule_list.startswith("(") and rule_list.endswith(")"):
+            stripped_rule_list = rule_list[1:-1].strip()
             if rdepth < 1:
                 close = ""
             else:
                 canonic += " ("
                 close = ")"
                 gap = ""
-        elif rule.startswith("[") and rule.endswith("]"):
-            stripped_rule = rule[1:-1].strip()
+        elif rule_list.startswith("[") and rule_list.endswith("]"):
+            stripped_rule_list = rule_list[1:-1].strip()
             canonic += " ["
             close = "]"
             gap = ""
         else:
-            stripped_rule = rule.strip()
+            stripped_rule_list = rule_list.strip()
             close = ""
 
-        escaped_rule = stripped_rule.replace("\"|\"","Ö")
-        members = escaped_rule.split("|")
+        escaped_rule_list = stripped_rule_list.replace("\"|\"","Ö")
+        rules = escaped_rule_list.split("|")
         sep = ""
-        for member in members:
-            tags = member.replace("Ö","\"|\"").split(' ')
+        for rule in rules:
+            components = rule.replace("Ö","\"|\"").split(' ')
 
-            if tags[0].strip() == name:
+            if components[0].strip() == result:
                 rept = ")"
             else:
                 rept = ""
@@ -102,46 +441,50 @@ def canonic_rule(toprules, name, rdepth, gap):
                 sep = " /"
 
             first = True
-            for tag in tags:
-                tagname = tag.strip()
-                if len(tagname) < 1:
+            for component in components:
+                component_name = component.strip()
+                if len(component_name) < 1:
                     continue
-                if first and tagname == name:
+                if first and component_name == result:
                     canonic += " *("
                     gap = ""
-                elif tagname in toprules:
-                    canonic += gap + "<" + tagname + ">"
+                elif component_name in topresults:
+                    canonic += gap + component_name
                     gap = " "
                 else:
-                    canonic += canonic_rule(toprules, tagname, rdepth+1, gap)
+                    canonic += abnf_rule(topresults, component_name, \
+                                         rdepth+1,gap)
                     gap = " "
                 first = False
             canonic += rept
 
         canonic += close
     else:
-        canonic += gap + stripped_name
+        canonic += gap + component
 
     return canonic
 
 
-lfile      = sys.argv[1]
-yfile      = sys.argv[2]
-inputdir   = os.path.dirname(yfile)
-scriptdir  = sys.path[0] 
-start      = ""
-grammar    = False
-prologue   = False
-comment    = False
-toplevel   = False
-depth      = 0
-result     = ""
-components = ""
-terminals  = ""
-rules      = {}
-top_rules  = []
-pos        = 0
-end        = 0
+lfile       = sys.argv[1]
+yfile       = sys.argv[2]
+line_width  = 78
+inputdir    = os.path.dirname(yfile)
+scriptdir   = sys.path[0] 
+lname       = os.path.basename(lfile)
+yname       = os.path.basename(yfile)
+start       = ""
+grammar     = False
+prologue    = False
+comment     = False
+toplevel    = False
+depth       = 0
+result      = ""
+result_def  = ""
+results     = {}
+top_results = []
+abnf        = []
+pos         = 0
+end         = 0
 
 
 
@@ -188,18 +531,18 @@ with open(lfile, "r") as l:
                 continue
 
             if value.isalpha():
-                terminals += key + " = \"" + value.upper() + "\"\n"
+                abnf.append( (key, " \"" + value.upper() + "\"") )
             else:
                 if value[0] == "\\" and len(value) == 2:
-                    rules[key] = "\"%s\"" % value[1]
+                    results[key] = "\"%s\"" % value[1]
                 elif len(value) == 1:
-                    rules[key] = "\"%s\"" % value[0]
+                    results[key] = "\"%s\"" % value[0]
                 else:
                     if value.find("[") < 0 and value.find("(") < 0:
-                        rules[key] = "\"%s\"" % value.replace("/","")
+                        results[key] = "\"%s\"" % value.replace("/","")
                     else:
-                        terminals += "<" + key.lower() + "> = " + value + "\n"
-                        rules[key] = "<" + key.lower() + ">"
+                        abnf.append( (key.lower(), " "+regexp_to_abnf(value)) )
+                        results[key] = key.lower()
 
     l.close()
 
@@ -221,7 +564,7 @@ with open(yfile, "r") as y:
                     slash_star = line.find("/*")
                     if colon > 0 and (slash_star < 0 or slash_star > colon):
                         result = line[:colon]
-                        components = ""
+                        result_def = ""
                         pos = colon + 1
                 while pos < end-1:
                     if comment:
@@ -246,7 +589,7 @@ with open(yfile, "r") as y:
                            (slash_star  < 0 or open_brace < slash_star ):
 
                             if depth == 0:
-                                components += line[pos: open_brace]
+                                result_def += line[pos: open_brace]
                             depth += 1
                             pos = open_brace + 1
                             continue
@@ -259,15 +602,15 @@ with open(yfile, "r") as y:
                         if depth == 0:
                             semicolon = line.find(";", pos)
                             if semicolon >= pos:
-                                components += line[pos: semicolon]
-                                rules[result] = canonic_complist(components)
+                                result_def += line[pos: semicolon]
+                                results[result] = rule_list(result_def)
                                 if toplevel:
-                                    top_rules.append(result)
+                                    top_results.append(result)
                                 result = ""
-                                components = ""
+                                result_def = ""
                                 toplevel = False
                             else:
-                                components += line[pos: -1]
+                                result_def += line[pos: -1]
                         break
         else:
             if prologue:
@@ -283,7 +626,15 @@ with open(yfile, "r") as y:
 
 
 
-print_canonic_rules(top_rules)
+make_abnf_rules(top_results)
+
+print "<!-- XML file was automatically generated from %s and from %s -->\n" % \
+    (lname, yname)
+
+print "<screen>"
+print_abnf_rules()
+print "</screen>"
+