From ce815229861795cfddcf9331a598c88606736706 Mon Sep 17 00:00:00 2001 From: DongHun Kwak Date: Mon, 18 Jul 2022 14:42:32 +0900 Subject: [PATCH] Imported Upstream version 2.4.2 --- CHANGES | 55 + CODE_OF_CONDUCT.rst | 90 + CONTRIBUTING.md | 113 + MANIFEST.in | 8 +- PKG-INFO | 2 +- README.rst | 44 +- docs/HowToUsePyparsing.rst | 1948 ++++++++--------- examples/javascript_grammar.g | 894 ++++++++ examples/sexpParser.py | 314 +-- examples/statemachine/documentSignoffDemo.py | 50 + .../statemachine/documentsignoffstate.pystate | 71 + examples/statemachine/libraryBookDemo.py | 70 + .../statemachine/librarybookstate.pystate | 19 + examples/statemachine/statemachine.py | 347 +++ examples/statemachine/trafficLightDemo.py | 26 + .../statemachine/trafficlightstate.pystate | 47 + examples/statemachine/vending_machine.py | 78 + examples/statemachine/video_demo.py | 48 + examples/statemachine/videostate.pystate | 32 + examples/wordsToNum.py | 220 +- pyparsing.egg-info/PKG-INFO | 2 +- pyparsing.egg-info/SOURCES.txt | 13 + pyparsing.py | 67 +- unitTests.py | 79 +- 24 files changed, 3349 insertions(+), 1288 deletions(-) create mode 100644 CODE_OF_CONDUCT.rst create mode 100644 CONTRIBUTING.md create mode 100644 examples/javascript_grammar.g create mode 100644 examples/statemachine/documentSignoffDemo.py create mode 100644 examples/statemachine/documentsignoffstate.pystate create mode 100644 examples/statemachine/libraryBookDemo.py create mode 100644 examples/statemachine/librarybookstate.pystate create mode 100644 examples/statemachine/statemachine.py create mode 100644 examples/statemachine/trafficLightDemo.py create mode 100644 examples/statemachine/trafficlightstate.pystate create mode 100644 examples/statemachine/vending_machine.py create mode 100644 examples/statemachine/video_demo.py create mode 100644 examples/statemachine/videostate.pystate diff --git a/CHANGES b/CHANGES index 397af9c..e1a9462 100644 --- a/CHANGES +++ b/CHANGES @@ -2,6 +2,61 @@ Change Log ========== 
+Version 2.4.2 - July, 2019 +-------------------------- +- Updated the shorthand notation that has been added for repetition + expressions: expr[min, max], with '...' valid as a min or max value: + - expr[...] and expr[0, ...] are equivalent to ZeroOrMore(expr) + - expr[1, ...] is equivalent to OneOrMore(expr) + - expr[n, ...] or expr[n,] is equivalent + to expr*n + ZeroOrMore(expr) + (read as "n or more instances of expr") + - expr[..., n] is equivalent to expr*(0, n) + - expr[m, n] is equivalent to expr*(m, n) + Note that expr[..., n] and expr[m, n] do not raise an exception + if more than n exprs exist in the input stream. If this + behavior is desired, then write expr[..., n] + ~expr. + + Better interpretation of [...] as ZeroOrMore raised by crowsonkb, + thanks for keeping me in line! + + If upgrading from 2.4.1 or 2.4.1.1 and you have used `expr[...]` + for `OneOrMore(expr)`, it must be updated to `expr[1, ...]`. + +- The defaults on all the `__diag__` switches have been set to False, + to avoid getting alarming warnings. To use these diagnostics, set + them to True after importing pyparsing. + + Example: + + import pyparsing as pp + pp.__diag__.warn_multiple_tokens_in_named_alternation = True + +- Fixed bug introduced by the use of __getitem__ for repetition, + overlooking Python's legacy implementation of iteration + by sequentially calling __getitem__ with increasing numbers until + getting an IndexError. Found during investigation of problem + reported by murlock, merci! + + +Version 2.4.2a1 - July, 2019 +---------------------------- +It turns out I got the meaning of `[...]` absolutely backwards, +so I've deleted 2.4.1 and am repushing this release as 2.4.2a1 +for people to give it a try before I can call it ready to go. + +The `expr[...]` notation was pushed out to be synonymous with +`OneOrMore(expr)`, but this is really counter to most Python +notations (and even other internal pyparsing notations as well). 
+It should have been defined to be equivalent to ZeroOrMore(expr). + +- Changed [...] to emit ZeroOrMore instead of OneOrMore. + +- Removed code that treats ParserElements like iterables. + +- Change all __diag__ switches to False. + + Version 2.4.1.1 - July 24, 2019 ------------------------------- This is a re-release of version 2.4.1 to restore the release history diff --git a/CODE_OF_CONDUCT.rst b/CODE_OF_CONDUCT.rst new file mode 100644 index 0000000..4f84d2c --- /dev/null +++ b/CODE_OF_CONDUCT.rst @@ -0,0 +1,90 @@ +Contributor Covenant Code of Conduct +==================================== + +Our Pledge +---------- + +In the interest of fostering an open and welcoming environment, +we as contributors and maintainers pledge to making participation +in our project and our community a harassment-free experience for +everyone, regardless of age, body size, disability, ethnicity, +sex characteristics, gender identity and expression, level of +experience, education, socio-economic status, nationality, +personal appearance, race, religion, or sexual identity and +orientation. 
+ +Our Standards +------------- + +Examples of behavior that contributes to creating a positive +environment include: + +- Using welcoming and inclusive language +- Being respectful of differing viewpoints and experiences +- Gracefully accepting constructive criticism +- Focusing on what is best for the community +- Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +- The use of sexualized language or imagery and unwelcome sexual + attention or advances +- Trolling, insulting/derogatory comments, and personal or political + attacks +- Public or private harassment +- Publishing others’ private information, such as a physical or + electronic address, without explicit permission +- Other conduct which could reasonably be considered + inappropriate in a professional setting + +Our Responsibilities +-------------------- + +Project maintainers are responsible for clarifying the standards +of acceptable behavior and are expected to take appropriate and +fair corrective action in response to any instances of +unacceptable behavior. + +Project maintainers have the right and responsibility to remove, +edit, or reject comments, commits, code, wiki edits, issues, and +other contributions that are not aligned to this Code of Conduct, +or to ban temporarily or permanently any contributor for other +behaviors that they deem inappropriate, threatening, offensive, +or harmful. + +Scope +----- + +This Code of Conduct applies both within project spaces and in +public spaces when an individual is representing the project or +its community. Examples of representing a project or community +include using an official project e-mail address, posting via an +official social media account, or acting as an appointed +representative at an online or offline event. Representation of +a project may be further defined and clarified by project +maintainers. 
+ +Enforcement +----------- + +Instances of abusive, harassing, or otherwise unacceptable +behavior may be reported by contacting the project team at +pyparsing@mail.com. All complaints will be reviewed and +investigated and will result in a response that is deemed +necessary and appropriate to the circumstances. The project team +is obligated to maintain confidentiality with regard to the +reporter of an incident. Further details of specific enforcement +policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of +Conduct in good faith may face temporary or permanent +repercussions as determined by other members of the project’s +leadership. + +Attribution +----------- + +This Code of Conduct is adapted from the `Contributor Covenant +`__, version 1.4, available +at +https://www.contributor-covenant.org/version/1/4/code-of-conduct.html diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..7b19d7a --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,113 @@ +# CONTRIBUTING + +Thank you for your interest in working on pyparsing! Pyparsing has become a popular module for creating simple +text parsing and data scraping applications. It has been incorporated in several widely-used packages, and is +often used by beginners as part of their first Python project. + +## Raising questions / asking for help + +If you have a question on using pyparsing, there are a number of resources available online. + +- [StackOverflow](https://stackoverflow.com/questions/tagged/pyparsing) - about 10 years of SO questions and answers + can be searched on StackOverflow, tagged with the `pyparsing` tag. Note that some of the older posts will refer + to features in Python 2, or to versions and coding practices for pyparsing that have been replaced by newer classes + and coding idioms. 
+ +- [pyparsing sub-reddit](https://www.reddit.com/r/pyparsing/) - still very lightly attended, but open to anyone + wishing to post questions or links related to pyparsing. An alternative channel to StackOverflow for asking + questions. + +- [online docs](https://pyparsing-docs.readthedocs.io/en/latest/index.html) and a separately maintained set of class + library docs [here](https://pyparsing-doc.neocities.org/) - These docs are auto-generated from the docstrings + embedded in the pyparsing classes, so they can also be viewed in the interactive Python console's and Jupyter + Notebook's `help` commands. + +- [the pyparsing Wikispaces archive](https://github.com/pyparsing/wikispaces_archive) - Before hosting on GitHub, + pyparsing had a separate wiki on the wikispaces.com website. In 2018 this page was discontinued. The discussion + content archive has been reformatted into Markdown and can be viewed by year at the GitHub repository. Just as + with some of the older questions on StackOverflow, some of these older posts may reflect out-of-date pyparsing + and Python features. + +- [submit an issue](https://github.com/pyparsing/pyparsing/issues) - If you have a problem with pyparsing that looks + like an actual bug, or have an idea for a feature to add to pyparsing, please submit an issue on GitHub. Some + pyparsing behavior may be counter-intuitive, so try to review some of the other resources first, or some of the + other open and closed issues. Or post your question on SO or reddit. But don't wait until you are desperate and + frustrated - just ask! :) + + +## Submitting changes + +If you are considering proposing updates to pyparsing, please bear in mind the following guidelines. 
+ +Please review [_The Zen of Pyparsing_ and _The Zen of Pyparsing +Development_](https://github.com/pyparsing/pyparsing/wiki/Zen) +article on the pyparsing wiki, to get a general feel for the historical and future approaches to pyparsing's +design, and intended developer experience as an embedded DSL. + +## Some design points + +- Minimize additions to the module namespace. Over time, pyparsing's namespace has acquired a *lot* of names. + New features have been encapsulated into namespace classes to try to hold back the name flooding when importing + pyparsing. + +- New operator overloads will need to show broad applicability. + +- Performance tuning should focus on parse time performance. Optimizing parser definition performance is secondary. + +- New external dependencies will require substantial justification, and if included, will need to be guarded for + `ImportError`s raised if the external module is not installed. + +## Some coding points + +These coding styles are encouraged whether submitting code for core pyparsing or for submitting an example. + +- PEP8 - at this time, pyparsing is very non-compliant with many PEP8 guidelines, especially those regarding + name casing. I had just finished several years of Java and Smalltalk development, and camel case seemed to be the + future trend in coding styles. There are plans to convert these names to PEP8-conformant snake case, but this will + be done over several releases to provide a migration path for current pyparsing-dependent applications. See more + information at the [PEP8 wiki page](https://github.com/pyparsing/pyparsing/wiki/PEP-8-planning). + + If you wish to submit a new example, please follow PEP8 name and coding guidelines. Example code must be available + for distribution with the rest of pyparsing under the MIT open source license. + +- No backslashes for line continuations. 
+ Continuation lines for expressions in ()'s should start with the continuing operator: + + really_long_line = (something + + some_other_long_thing + + even_another_long_thing) + +- Changes to core pyparsing must be compatible back to Py3.5 without conditionalizing. Later Py3 features may be + used in examples by way of illustration. + +- str.format() statements should use named format arguments (unless this proves to be a slowdown at parse time). + +- List, tuple, and dict literals should include a trailing comma after the last element, which reduces changeset + clutter when another element gets added to the end. + +- Examples should import pyparsing and the common namespace classes as: + + import pyparsing as pp + # if necessary + ppc = pp.pyparsing_common + ppu = pp.pyparsing_unicode + +- Where possible use operators to create composite parse expressions: + + expr = expr_a + expr_b | expr_c + + instead of: + + expr = pp.MatchFirst([pp.And([expr_a, expr_b]), expr_c]) + + Exception: if using a generator to create an expression: + + import keyword + python_keywords = keyword.kwlist + any_keyword = pp.MatchFirst(pp.Keyword(kw) + for kw in python_keywords) + +- Learn [The Classic Blunders](https://github.com/pyparsing/pyparsing/wiki/The-Classic-Blunders) and + how to avoid them when developing new examples. + +- New features should be accompanied with updates to unitTests.py and a bullet in the CHANGES file. 
diff --git a/MANIFEST.in b/MANIFEST.in index a13fe7f..48d9e1a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,8 +1,8 @@ include pyparsing.py -include HowToUsePyparsing.html pyparsingClassDiagram.* -include README.md CODE_OF_CONDUCT.md CHANGES LICENSE -include examples/*.py examples/Setup.ini examples/*.dfm examples/*.ics examples/*.html examples/*.h +include HowToUsePyparsing.rst pyparsingClassDiagram.* +include README.md CODE_OF_CONDUCT.rst CHANGES LICENSE CONTRIBUTING.md modules.rst +include examples/*.py examples/Setup.ini examples/*.dfm examples/*.ics examples/*.html examples/*.h examples/*.g examples/statemachine/* recursive-include docs * prune docs/_build/* recursive-include test * -include simple_unit_tests.py unitTests.py +include setup.py simple_unit_tests.py unitTests.py diff --git a/PKG-INFO b/PKG-INFO index 113698f..8bfd5eb 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 1.2 Name: pyparsing -Version: 2.4.1.1 +Version: 2.4.2 Summary: Python parsing module Home-page: https://github.com/pyparsing/pyparsing/ Author: Paul McGuire diff --git a/README.rst b/README.rst index 0d702d7..dca0a71 100644 --- a/README.rst +++ b/README.rst @@ -1,5 +1,5 @@ -PyParsing – A Python Parsing Module -=================================== +PyParsing -- A Python Parsing Module +==================================== |Build Status| @@ -12,45 +12,63 @@ use of regular expressions. The pyparsing module provides a library of classes that client code uses to construct the grammar directly in Python code. -Here is a program to parse “Hello, World!” (or any greeting of the form -“salutation, addressee!”): +*[Since first writing this description of pyparsing in late 2003, this +technique for developing parsers has become more widespread, under the +name Parsing Expression Grammars - PEGs. 
See more information on PEGs at* +https://en.wikipedia.org/wiki/Parsing_expression_grammar *.]* + +Here is a program to parse ``"Hello, World!"`` (or any greeting of the form +``"salutation, addressee!"``): .. code:: python from pyparsing import Word, alphas - greet = Word( alphas ) + "," + Word( alphas ) + "!" + greet = Word(alphas) + "," + Word(alphas) + "!" hello = "Hello, World!" - print(hello, "->", greet.parseString( hello )) + print(hello, "->", greet.parseString(hello)) The program outputs the following:: Hello, World! -> ['Hello', ',', 'World', '!'] The Python representation of the grammar is quite readable, owing to the -self-explanatory class names, and the use of ‘+’, ‘\|’ and ‘^’ operator +self-explanatory class names, and the use of '+', '|' and '^' operator definitions. -The parsed results returned from parseString() can be accessed as a +The parsed results returned from ``parseString()`` can be accessed as a nested list, a dictionary, or an object with named attributes. The pyparsing module handles some of the problems that are typically -vexing when writing text parsers: - extra or missing whitespace (the -above program will also handle “Hello,World!”, “Hello , World !”, etc.) -- quoted strings - embedded comments +vexing when writing text parsers: + +- extra or missing whitespace (the above program will also handle ``"Hello,World!"``, ``"Hello , World !"``, etc.) +- quoted strings +- embedded comments The examples directory includes a simple SQL parser, simple CORBA IDL parser, a config file parser, a chemical formula parser, and a four- function algebraic notation parser, among many others. +Documentation +============= + +There are many examples in the online docstrings of the classes +and methods in pyparsing. You can find them compiled into online docs +at https://pyparsing-docs.readthedocs.io/en/latest/. Additional +documentation resources and project info are listed in the online +GitHub wiki, at https://github.com/pyparsing/pyparsing/wiki. 
An +entire directory of examples is at +https://github.com/pyparsing/pyparsing/tree/master/examples. + License ======= - MIT License. See header of pyparsing.py +MIT License. See header of pyparsing.py History ======= - See CHANGES file. +See CHANGES file. .. |Build Status| image:: https://travis-ci.org/pyparsing/pyparsing.svg?branch=master :target: https://travis-ci.org/pyparsing/pyparsing diff --git a/docs/HowToUsePyparsing.rst b/docs/HowToUsePyparsing.rst index 62dc677..dd75443 100644 --- a/docs/HowToUsePyparsing.rst +++ b/docs/HowToUsePyparsing.rst @@ -1,975 +1,973 @@ -========================== -Using the pyparsing module -========================== - -:author: Paul McGuire -:address: ptmcg@users.sourceforge.net - -:revision: 2.0.1a -:date: July, 2013 (minor update August, 2018) - -:copyright: Copyright |copy| 2003-2013 Paul McGuire. - -.. |copy| unicode:: 0xA9 - -:abstract: This document provides how-to instructions for the - pyparsing library, an easy-to-use Python module for constructing - and executing basic text parsers. The pyparsing module is useful - for evaluating user-definable - expressions, processing custom application language commands, or - extracting data from formatted reports. - -.. sectnum:: :depth: 4 - -.. contents:: :depth: 4 - -Note: While this content is still valid, there are more detailed -descriptions and examples at the online doc server at -https://pythonhosted.org/pyparsing/pyparsing-module.html - -Steps to follow -=============== - -To parse an incoming data string, the client code must follow these steps: - -1. First define the tokens and patterns to be matched, and assign - this to a program variable. Optional results names or parsing - actions can also be defined at this time. - -2. Call ``parseString()`` or ``scanString()`` on this variable, passing in - the string to - be parsed. During the matching process, whitespace between - tokens is skipped by default (although this can be changed). 
- When token matches occur, any defined parse action methods are - called. - -3. Process the parsed results, returned as a list of strings. - Matching results may also be accessed as named attributes of - the returned results, if names are defined in the definition of - the token pattern, using ``setResultsName()``. - - -Hello, World! -------------- - -The following complete Python program will parse the greeting "Hello, World!", -or any other greeting of the form ", !":: - - from pyparsing import Word, alphas - - greet = Word( alphas ) + "," + Word( alphas ) + "!" - greeting = greet.parseString( "Hello, World!" ) - print greeting - -The parsed tokens are returned in the following form:: - - ['Hello', ',', 'World', '!'] - - -Usage notes ------------ - -- The pyparsing module can be used to interpret simple command - strings or algebraic expressions, or can be used to extract data - from text reports with complicated format and structure ("screen - or report scraping"). However, it is possible that your defined - matching patterns may accept invalid inputs. Use pyparsing to - extract data from strings assumed to be well-formatted. - -- To keep up the readability of your code, use operators_ such as ``+``, ``|``, - ``^``, and ``~`` to combine expressions. You can also combine - string literals with ParseExpressions - they will be - automatically converted to Literal objects. For example:: - - integer = Word( nums ) # simple unsigned integer - variable = Word( alphas, max=1 ) # single letter variable, such as x, z, m, etc. - arithOp = Word( "+-*/", max=1 ) # arithmetic operators - equation = variable + "=" + integer + arithOp + integer # will match "x=2+2", etc. - - In the definition of ``equation``, the string ``"="`` will get added as - a ``Literal("=")``, but in a more readable way. - -- The pyparsing module's default behavior is to ignore whitespace. This is the - case for 99% of all parsers ever written. 
This allows you to write simple, clean, - grammars, such as the above ``equation``, without having to clutter it up with - extraneous ``ws`` markers. The ``equation`` grammar will successfully parse all of the - following statements:: - - x=2+2 - x = 2+2 - a = 10 * 4 - r= 1234/ 100000 - - Of course, it is quite simple to extend this example to support more elaborate expressions, with - nesting with parentheses, floating point numbers, scientific notation, and named constants - (such as ``e`` or ``pi``). See ``fourFn.py``, included in the examples directory. - -- To modify pyparsing's default whitespace skipping, you can use one or - more of the following methods: - - - use the static method ``ParserElement.setDefaultWhitespaceChars`` - to override the normal set of whitespace chars (' \t\n'). For instance - when defining a grammar in which newlines are significant, you should - call ``ParserElement.setDefaultWhitespaceChars(' \t')`` to remove - newline from the set of skippable whitespace characters. Calling - this method will affect all pyparsing expressions defined afterward. - - - call ``leaveWhitespace()`` on individual expressions, to suppress the - skipping of whitespace before trying to match the expression - - - use ``Combine`` to require that successive expressions must be - adjacent in the input string. For instance, this expression:: - - real = Word(nums) + '.' + Word(nums) - - will match "3.14159", but will also match "3 . 12". It will also - return the matched results as ['3', '.', '14159']. By changing this - expression to:: - - real = Combine( Word(nums) + '.' + Word(nums) ) - - it will not match numbers with embedded spaces, and it will return a - single concatenated string '3.14159' as the parsed token. - -- Repetition of expressions can be indicated using the '*' operator. 
An - expression may be multiplied by an integer value (to indicate an exact - repetition count), or by a tuple containing - two integers, or None and an integer, representing min and max repetitions - (with None representing no min or no max, depending whether it is the first or - second tuple element). See the following examples, where n is used to - indicate an integer value: - - - ``expr*3`` is equivalent to ``expr + expr + expr`` - - - ``expr*(2,3)`` is equivalent to ``expr + expr + Optional(expr)`` - - - ``expr*(n,None)`` or ``expr*(n,)`` is equivalent - to ``expr*n + ZeroOrMore(expr)`` (read as "at least n instances of expr") - - - ``expr*(None,n)`` is equivalent to ``expr*(0,n)`` - (read as "0 to n instances of expr") - - - ``expr*(None,None)`` is equivalent to ``ZeroOrMore(expr)`` - - - ``expr*(1,None)`` is equivalent to ``OneOrMore(expr)`` - - Note that ``expr*(None,n)`` does not raise an exception if - more than n exprs exist in the input stream; that is, - ``expr*(None,n)`` does not enforce a maximum number of expr - occurrences. If this behavior is desired, then write - ``expr*(None,n) + ~expr``. - -- ``MatchFirst`` expressions are matched left-to-right, and the first - match found will skip all later expressions within, so be sure - to define less-specific patterns after more-specific patterns. - If you are not sure which expressions are most specific, use Or - expressions (defined using the ``^`` operator) - they will always - match the longest expression, although they are more - compute-intensive. - -- ``Or`` expressions will evaluate all of the specified subexpressions - to determine which is the "best" match, that is, which matches - the longest string in the input data. In case of a tie, the - left-most expression in the ``Or`` list will win. 
- -- If parsing the contents of an entire file, pass it to the - ``parseFile`` method using:: - - expr.parseFile( sourceFile ) - -- ``ParseExceptions`` will report the location where an expected token - or expression failed to match. For example, if we tried to use our - "Hello, World!" parser to parse "Hello World!" (leaving out the separating - comma), we would get an exception, with the message:: - - pyparsing.ParseException: Expected "," (6), (1,7) - - In the case of complex - expressions, the reported location may not be exactly where you - would expect. See more information under ParseException_ . - -- Use the ``Group`` class to enclose logical groups of tokens within a - sublist. This will help organize your results into more - hierarchical form (the default behavior is to return matching - tokens as a flat list of matching input strings). - -- Punctuation may be significant for matching, but is rarely of - much interest in the parsed results. Use the ``suppress()`` method - to keep these tokens from cluttering up your returned lists of - tokens. For example, ``delimitedList()`` matches a succession of - one or more expressions, separated by delimiters (commas by - default), but only returns a list of the actual expressions - - the delimiters are used for parsing, but are suppressed from the - returned output. - -- Parse actions can be used to convert values from strings to - other data types (ints, floats, booleans, etc.). - -- Results names are recommended for retrieving tokens from complex - expressions. It is much easier to access a token using its field - name than using a positional index, especially if the expression - contains optional elements. 
You can also shortcut - the ``setResultsName`` call:: - - stats = "AVE:" + realNum.setResultsName("average") + \ - "MIN:" + realNum.setResultsName("min") + \ - "MAX:" + realNum.setResultsName("max") - - can now be written as this:: - - stats = "AVE:" + realNum("average") + \ - "MIN:" + realNum("min") + \ - "MAX:" + realNum("max") - -- Be careful when defining parse actions that modify global variables or - data structures (as in ``fourFn.py``), especially for low level tokens - or expressions that may occur within an ``And`` expression; an early element - of an ``And`` may match, but the overall expression may fail. - - -Classes -======= - -Classes in the pyparsing module -------------------------------- - -``ParserElement`` - abstract base class for all pyparsing classes; -methods for code to use are: - -- ``parseString( sourceString, parseAll=False )`` - only called once, on the overall - matching pattern; returns a ParseResults_ object that makes the - matched tokens available as a list, and optionally as a dictionary, - or as an object with named attributes; if parseAll is set to True, then - parseString will raise a ParseException if the grammar does not process - the complete input string. - -- ``parseFile( sourceFile )`` - a convenience function, that accepts an - input file object or filename. The file contents are passed as a - string to ``parseString()``. ``parseFile`` also supports the ``parseAll`` argument. - -- ``scanString( sourceString )`` - generator function, used to find and - extract matching text in the given source string; for each matched text, - returns a tuple of: - - - matched tokens (packaged as a ParseResults_ object) - - - start location of the matched text in the given source string - - - end location in the given source string - - ``scanString`` allows you to scan through the input source string for - random matches, instead of exhaustively defining the grammar for the entire - source text (as would be required with ``parseString``). 
- -- ``transformString( sourceString )`` - convenience wrapper function for - ``scanString``, to process the input source string, and replace matching - text with the tokens returned from parse actions defined in the grammar - (see setParseAction_). - -- ``searchString( sourceString )`` - another convenience wrapper function for - ``scanString``, returns a list of the matching tokens returned from each - call to ``scanString``. - -- ``setName( name )`` - associate a short descriptive name for this - element, useful in displaying exceptions and trace information - -- ``setResultsName( string, listAllMatches=False )`` - name to be given - to tokens matching - the element; if multiple tokens within - a repetition group (such as ``ZeroOrMore`` or ``delimitedList``) the - default is to return only the last matching token - if listAllMatches - is set to True, then a list of all the matching tokens is returned. - (New in 1.5.6 - a results name with a trailing '*' character will be - interpreted as setting listAllMatches to True.) - Note: - ``setResultsName`` returns a *copy* of the element so that a single - basic element can be referenced multiple times and given - different names within a complex grammar. - -.. _setParseAction: - -- ``setParseAction( *fn )`` - specify one or more functions to call after successful - matching of the element; each function is defined as ``fn( s, - loc, toks )``, where: - - - ``s`` is the original parse string - - - ``loc`` is the location in the string where matching started - - - ``toks`` is the list of the matched tokens, packaged as a ParseResults_ object - - Multiple functions can be attached to a ParserElement by specifying multiple - arguments to setParseAction, or by calling setParseAction multiple times. - - Each parse action function can return a modified ``toks`` list, to perform conversion, or - string modifications. 
For brevity, ``fn`` may also be a - lambda - here is an example of using a parse action to convert matched - integer tokens from strings to integers:: - - intNumber = Word(nums).setParseAction( lambda s,l,t: [ int(t[0]) ] ) - - If ``fn`` does not modify the ``toks`` list, it does not need to return - anything at all. - -- ``setBreak( breakFlag=True )`` - if breakFlag is True, calls pdb.set_break() - as this expression is about to be parsed - -- ``copy()`` - returns a copy of a ParserElement; can be used to use the same - parse expression in different places in a grammar, with different parse actions - attached to each - -- ``leaveWhitespace()`` - change default behavior of skipping - whitespace before starting matching (mostly used internally to the - pyparsing module, rarely used by client code) - -- ``setWhitespaceChars( chars )`` - define the set of chars to be ignored - as whitespace before trying to match a specific ParserElement, in place of the - default set of whitespace (space, tab, newline, and return) - -- ``setDefaultWhitespaceChars( chars )`` - class-level method to override - the default set of whitespace chars for all subsequently created ParserElements - (including copies); useful when defining grammars that treat one or more of the - default whitespace characters as significant (such as a line-sensitive grammar, to - omit newline from the list of ignorable whitespace) - -- ``suppress()`` - convenience function to suppress the output of the - given element, instead of wrapping it with a Suppress object. 
- -- ``ignore( expr )`` - function to specify parse expression to be - ignored while matching defined patterns; can be called - repeatedly to specify multiple expressions; useful to specify - patterns of comment syntax, for example - -- ``setDebug( dbgFlag=True )`` - function to enable/disable tracing output - when trying to match this element - -- ``validate()`` - function to verify that the defined grammar does not - contain infinitely recursive constructs - -.. _parseWithTabs: - -- ``parseWithTabs()`` - function to override default behavior of converting - tabs to spaces before parsing the input string; rarely used, except when - specifying whitespace-significant grammars using the White_ class. - -- ``enablePackrat()`` - a class-level static method to enable a memoizing - performance enhancement, known as "packrat parsing". packrat parsing is - disabled by default, since it may conflict with some user programs that use - parse actions. To activate the packrat feature, your - program must call the class method ParserElement.enablePackrat(). For best - results, call enablePackrat() immediately after importing pyparsing. - - -Basic ParserElement subclasses ------------------------------- - -- ``Literal`` - construct with a string to be matched exactly - -- ``CaselessLiteral`` - construct with a string to be matched, but - without case checking; results are always returned as the - defining literal, NOT as they are found in the input string - -- ``Keyword`` - similar to Literal, but must be immediately followed by - whitespace, punctuation, or other non-keyword characters; prevents - accidental matching of a non-keyword that happens to begin with a - defined keyword - -- ``CaselessKeyword`` - similar to Keyword, but with caseless matching - behavior - -.. 
_Word: - -- ``Word`` - one or more contiguous characters; construct with a - string containing the set of allowed initial characters, and an - optional second string of allowed body characters; for instance, - a common Word construct is to match a code identifier - in C, a - valid identifier must start with an alphabetic character or an - underscore ('_'), followed by a body that can also include numeric - digits. That is, ``a``, ``i``, ``MAX_LENGTH``, ``_a1``, ``b_109_``, and - ``plan9FromOuterSpace`` - are all valid identifiers; ``9b7z``, ``$a``, ``.section``, and ``0debug`` - are not. To - define an identifier using a Word, use either of the following:: - - - Word( alphas+"_", alphanums+"_" ) - - Word( srange("[a-zA-Z_]"), srange("[a-zA-Z0-9_]") ) - - If only one - string given, it specifies that the same character set defined - for the initial character is used for the word body; for instance, to - define an identifier that can only be composed of capital letters and - underscores, use:: - - - Word( "ABCDEFGHIJKLMNOPQRSTUVWXYZ_" ) - - Word( srange("[A-Z_]") ) - - A Word may - also be constructed with any of the following optional parameters: - - - ``min`` - indicating a minimum length of matching characters - - - ``max`` - indicating a maximum length of matching characters - - - ``exact`` - indicating an exact length of matching characters - - If ``exact`` is specified, it will override any values for ``min`` or ``max``. - - New in 1.5.6 - Sometimes you want to define a word using all - characters in a range except for one or two of them; you can do this - with the new ``excludeChars`` argument. This is helpful if you want to define - a word with all printables except for a single delimiter character, such - as '.'. Previously, you would have to create a custom string to pass to Word. - With this change, you can just create ``Word(printables, excludeChars='.')``. 
- -- ``CharsNotIn`` - similar to Word_, but matches characters not - in the given constructor string (accepts only one string for both - initial and body characters); also supports ``min``, ``max``, and ``exact`` - optional parameters. - -- ``Regex`` - a powerful construct, that accepts a regular expression - to be matched at the current parse position; accepts an optional - ``flags`` parameter, corresponding to the flags parameter in the re.compile - method; if the expression includes named sub-fields, they will be - represented in the returned ParseResults_ - -- ``QuotedString`` - supports the definition of custom quoted string - formats, in addition to pyparsing's built-in ``dblQuotedString`` and - ``sglQuotedString``. ``QuotedString`` allows you to specify the following - parameters: - - - ``quoteChar`` - string of one or more characters defining the quote delimiting string - - - ``escChar`` - character to escape quotes, typically backslash (default=None) - - - ``escQuote`` - special quote sequence to escape an embedded quote string (such as SQL's "" to escape an embedded ") (default=None) - - - ``multiline`` - boolean indicating whether quotes can span multiple lines (default=False) - - - ``unquoteResults`` - boolean indicating whether the matched text should be unquoted (default=True) - - - ``endQuoteChar`` - string of one or more characters defining the end of the quote delimited string (default=None => same as quoteChar) - -- ``SkipTo`` - skips ahead in the input string, accepting any - characters up to the specified pattern; may be constructed with - the following optional parameters: - - - ``include`` - if set to true, also consumes the match expression - (default is false) - - - ``ignore`` - allows the user to specify patterns to not be matched, - to prevent false matches - - - ``failOn`` - if a literal string or expression is given for this argument, it defines an expression that - should cause the ``SkipTo`` expression to fail, and not skip over that 
expression - -.. _White: - -- ``White`` - also similar to Word_, but matches whitespace - characters. Not usually needed, as whitespace is implicitly - ignored by pyparsing. However, some grammars are whitespace-sensitive, - such as those that use leading tabs or spaces to indicating grouping - or hierarchy. (If matching on tab characters, be sure to call - parseWithTabs_ on the top-level parse element.) - -- ``Empty`` - a null expression, requiring no characters - will always - match; useful for debugging and for specialized grammars - -- ``NoMatch`` - opposite of Empty, will never match; useful for debugging - and for specialized grammars - - -Expression subclasses ---------------------- - -- ``And`` - construct with a list of ParserElements, all of which must - match for And to match; can also be created using the '+' - operator; multiple expressions can be Anded together using the '*' - operator as in:: - - ipAddress = Word(nums) + ('.'+Word(nums))*3 - - A tuple can be used as the multiplier, indicating a min/max:: - - usPhoneNumber = Word(nums) + ('-'+Word(nums))*(1,2) - - A special form of ``And`` is created if the '-' operator is used - instead of the '+' operator. In the ipAddress example above, if - no trailing '.' and Word(nums) are found after matching the initial - Word(nums), then pyparsing will back up in the grammar and try other - alternatives to ipAddress. However, if ipAddress is defined as:: - - strictIpAddress = Word(nums) - ('.'+Word(nums))*3 - - then no backing up is done. If the first Word(nums) of strictIpAddress - is matched, then any mismatch after that will raise a ParseSyntaxException, - which will halt the parsing process immediately. By careful use of the - '-' operator, grammars can provide meaningful error messages close to - the location where the incoming text does not match the specified - grammar. 
- -- ``Or`` - construct with a list of ParserElements, any of which must - match for Or to match; if more than one expression matches, the - expression that makes the longest match will be used; can also - be created using the '^' operator - -- ``MatchFirst`` - construct with a list of ParserElements, any of - which must match for MatchFirst to match; matching is done - left-to-right, taking the first expression that matches; can - also be created using the '|' operator - -- ``Each`` - similar to And, in that all of the provided expressions - must match; however, Each permits matching to be done in any order; - can also be created using the '&' operator - -- ``Optional`` - construct with a ParserElement, but this element is - not required to match; can be constructed with an optional ``default`` argument, - containing a default string or object to be supplied if the given optional - parse element is not found in the input string; parse action will only - be called if a match is found, or if a default is specified - -- ``ZeroOrMore`` - similar to Optional, but can be repeated - -- ``OneOrMore`` - similar to ZeroOrMore, but at least one match must - be present - -- ``FollowedBy`` - a lookahead expression, requires matching of the given - expressions, but does not advance the parsing position within the input string - -- ``NotAny`` - a negative lookahead expression, prevents matching of named - expressions, does not advance the parsing position within the input string; - can also be created using the unary '~' operator - - -.. 
_operators: - -Expression operators --------------------- - -- ``~`` - creates NotAny using the expression after the operator - -- ``+`` - creates And using the expressions before and after the operator - -- ``|`` - creates MatchFirst (first left-to-right match) using the expressions before and after the operator - -- ``^`` - creates Or (longest match) using the expressions before and after the operator - -- ``&`` - creates Each using the expressions before and after the operator - -- ``*`` - creates And by multiplying the expression by the integer operand; if - expression is multiplied by a 2-tuple, creates an And of (min,max) - expressions (similar to "{min,max}" form in regular expressions); if - min is None, intepret as (0,max); if max is None, interpret as - expr*min + ZeroOrMore(expr) - -- ``-`` - like ``+`` but with no backup and retry of alternatives - -- ``*`` - repetition of expression - -- ``==`` - matching expression to string; returns True if the string matches the given expression - -- ``<<=`` - inserts the expression following the operator as the body of the - Forward expression before the operator - - - -Positional subclasses ---------------------- - -- ``StringStart`` - matches beginning of the text - -- ``StringEnd`` - matches the end of the text - -- ``LineStart`` - matches beginning of a line (lines delimited by ``\n`` characters) - -- ``LineEnd`` - matches the end of a line - -- ``WordStart`` - matches a leading word boundary - -- ``WordEnd`` - matches a trailing word boundary - - - -Converter subclasses --------------------- - -- ``Combine`` - joins all matched tokens into a single string, using - specified joinString (default ``joinString=""``); expects - all matching tokens to be adjacent, with no intervening - whitespace (can be overridden by specifying ``adjacent=False`` in constructor) - -- ``Suppress`` - clears matched tokens; useful to keep returned - results from being cluttered with required but uninteresting - tokens (such as list 
delimiters) - - -Special subclasses ------------------- - -- ``Group`` - causes the matched tokens to be enclosed in a list; - useful in repeated elements like ``ZeroOrMore`` and ``OneOrMore`` to - break up matched tokens into groups for each repeated pattern - -- ``Dict`` - like ``Group``, but also constructs a dictionary, using the - [0]'th elements of all enclosed token lists as the keys, and - each token list as the value - -- ``SkipTo`` - catch-all matching expression that accepts all characters - up until the given pattern is found to match; useful for specifying - incomplete grammars - -- ``Forward`` - placeholder token used to define recursive token - patterns; when defining the actual expression later in the - program, insert it into the ``Forward`` object using the ``<<`` - operator (see ``fourFn.py`` for an example). - - -Other classes -------------- -.. _ParseResults: - -- ``ParseResults`` - class used to contain and manage the lists of tokens - created from parsing the input using the user-defined parse - expression. ParseResults can be accessed in a number of ways: - - - as a list - - - total list of elements can be found using len() - - - individual elements can be found using [0], [1], [-1], etc. - - - elements can be deleted using ``del`` - - - the -1th element can be extracted and removed in a single operation - using ``pop()``, or any element can be extracted and removed - using ``pop(n)`` - - - as a dictionary - - - if ``setResultsName()`` is used to name elements within the - overall parse expression, then these fields can be referenced - as dictionary elements or as attributes - - - the Dict class generates dictionary entries using the data of the - input text - in addition to ParseResults listed as ``[ [ a1, b1, c1, ...], [ a2, b2, c2, ...] ]`` - it also acts as a dictionary with entries defined as ``{ a1 : [ b1, c1, ... ] }, { a2 : [ b2, c2, ... 
] }``; - this is especially useful when processing tabular data where the first column contains a key - value for that line of data - - - list elements that are deleted using ``del`` will still be accessible by their - dictionary keys - - - supports ``get()``, ``items()`` and ``keys()`` methods, similar to a dictionary - - - a keyed item can be extracted and removed using ``pop(key)``. Here - key must be non-numeric (such as a string), in order to use dict - extraction instead of list extraction. - - - new named elements can be added (in a parse action, for instance), using the same - syntax as adding an item to a dict (``parseResults["X"]="new item"``); named elements can be removed using ``del parseResults["X"]`` - - - as a nested list - - - results returned from the Group class are encapsulated within their - own list structure, so that the tokens can be handled as a hierarchical - tree - - ParseResults can also be converted to an ordinary list of strings - by calling ``asList()``. Note that this will strip the results of any - field names that have been defined for any embedded parse elements. - (The ``pprint`` module is especially good at printing out the nested contents - given by ``asList()``.) - - Finally, ParseResults can be viewed by calling ``dump()``. ``dump()` will first show - the ``asList()`` output, followed by an indented structure listing parsed tokens that - have been assigned results names. - - -Exception classes and Troubleshooting -------------------------------------- - -.. 
_ParseException: - -- ``ParseException`` - exception returned when a grammar parse fails; - ParseExceptions have attributes loc, msg, line, lineno, and column; to view the - text line and location where the reported ParseException occurs, use:: - - except ParseException, err: - print err.line - print " "*(err.column-1) + "^" - print err - -- ``RecursiveGrammarException`` - exception returned by ``validate()`` if - the grammar contains a recursive infinite loop, such as:: - - badGrammar = Forward() - goodToken = Literal("A") - badGrammar <<= Optional(goodToken) + badGrammar - -- ``ParseFatalException`` - exception that parse actions can raise to stop parsing - immediately. Should be used when a semantic error is found in the input text, such - as a mismatched XML tag. - -- ``ParseSyntaxException`` - subclass of ``ParseFatalException`` raised when a - syntax error is found, based on the use of the '-' operator when defining - a sequence of expressions in an ``And`` expression. - -You can also get some insights into the parsing logic using diagnostic parse actions, -and setDebug(), or test the matching of expression fragments by testing them using -scanString(). - - -Miscellaneous attributes and methods -==================================== - -Helper methods --------------- - -- ``delimitedList( expr, delim=',')`` - convenience function for - matching one or more occurrences of expr, separated by delim. - By default, the delimiters are suppressed, so the returned results contain - only the separate list elements. Can optionally specify ``combine=True``, - indicating that the expressions and delimiters should be returned as one - combined value (useful for scoped variables, such as ``"a.b.c"``, or - ``"a::b::c"``, or paths such as ``"a/b/c"``). - -- ``countedArray( expr )`` - convenience function for a pattern where an list of - instances of the given expression are preceded by an integer giving the count of - elements in the list. 
Returns an expression that parses the leading integer, - reads exactly that many expressions, and returns the array of expressions in the - parse results - the leading integer is suppressed from the results (although it - is easily reconstructed by using len on the returned array). - -- ``oneOf( string, caseless=False )`` - convenience function for quickly declaring an - alternative set of ``Literal`` tokens, by splitting the given string on - whitespace boundaries. The tokens are sorted so that longer - matches are attempted first; this ensures that a short token does - not mask a longer one that starts with the same characters. If ``caseless=True``, - will create an alternative set of CaselessLiteral tokens. - -- ``dictOf( key, value )`` - convenience function for quickly declaring a - dictionary pattern of ``Dict( ZeroOrMore( Group( key + value ) ) )``. - -- ``makeHTMLTags( tagName )`` and ``makeXMLTags( tagName )`` - convenience - functions to create definitions of opening and closing tag expressions. Returns - a pair of expressions, for the corresponding and strings. Includes - support for attributes in the opening tag, such as - attributes - are returned as keyed tokens in the returned ParseResults. ``makeHTMLTags`` is less - restrictive than ``makeXMLTags``, especially with respect to case sensitivity. - -- ``infixNotation(baseOperand, operatorList)`` - (formerly named ``operatorPrecedence``) convenience function to define a - grammar for parsing infix notation - expressions with a hierarchical precedence of operators. To use the ``infixNotation`` - helper: - - 1. Define the base "atom" operand term of the grammar. - For this simple grammar, the smallest operand is either - and integer or a variable. This will be the first argument - to the ``infixNotation`` method. - - 2. Define a list of tuples for each level of operator - precendence. 
Each tuple is of the form - ``(opExpr, numTerms, rightLeftAssoc, parseAction)``, where: - - - ``opExpr`` - the pyparsing expression for the operator; - may also be a string, which will be converted to a Literal; if - None, indicates an empty operator, such as the implied - multiplication operation between 'm' and 'x' in "y = mx + b". - - - ``numTerms`` - the number of terms for this operator (must - be 1, 2, or 3) - - - ``rightLeftAssoc`` is the indicator whether the operator is - right or left associative, using the pyparsing-defined - constants ``opAssoc.RIGHT`` and ``opAssoc.LEFT``. - - - ``parseAction`` is the parse action to be associated with - expressions matching this operator expression (the - ``parseAction`` tuple member may be omitted) - - 3. Call ``infixNotation`` passing the operand expression and - the operator precedence list, and save the returned value - as the generated pyparsing expression. You can then use - this expression to parse input strings, or incorporate it - into a larger, more complex grammar. - -- ``matchPreviousLiteral`` and ``matchPreviousExpr`` - function to define and - expression that matches the same content - as was parsed in a previous parse expression. For instance:: - - first = Word(nums) - matchExpr = first + ":" + matchPreviousLiteral(first) - - will match "1:1", but not "1:2". Since this matches at the literal - level, this will also match the leading "1:1" in "1:10". - - In contrast:: - - first = Word(nums) - matchExpr = first + ":" + matchPreviousExpr(first) - - will *not* match the leading "1:1" in "1:10"; the expressions are - evaluated first, and then compared, so "1" is compared with "10". - -- ``nestedExpr(opener, closer, content=None, ignoreExpr=quotedString)`` - method for defining nested - lists enclosed in opening and closing delimiters. 
- - - ``opener`` - opening character for a nested list (default="("); can also be a pyparsing expression - - - ``closer`` - closing character for a nested list (default=")"); can also be a pyparsing expression - - - ``content`` - expression for items within the nested lists (default=None) - - - ``ignoreExpr`` - expression for ignoring opening and closing delimiters (default=quotedString) - - If an expression is not provided for the content argument, the nested - expression will capture all whitespace-delimited content between delimiters - as a list of separate values. - - Use the ignoreExpr argument to define expressions that may contain - opening or closing characters that should not be treated as opening - or closing characters for nesting, such as quotedString or a comment - expression. Specify multiple expressions using an Or or MatchFirst. - The default is quotedString, but if no expressions are to be ignored, - then pass None for this argument. - - -- ``indentedBlock( statementExpr, indentationStackVar, indent=True)`` - - function to define an indented block of statements, similar to - indentation-based blocking in Python source code: - - - ``statementExpr`` - the expression defining a statement that - will be found in the indented block; a valid ``indentedBlock`` - must contain at least 1 matching ``statementExpr`` - - - ``indentationStackVar`` - a Python list variable; this variable - should be common to all ``indentedBlock`` expressions defined - within the same grammar, and should be reinitialized to [1] - each time the grammar is to be used - - - ``indent`` - a boolean flag indicating whether the expressions - within the block must be indented from the current parse - location; if using ``indentedBlock`` to define the left-most - statements (all starting in column 1), set ``indent`` to False - -.. 
_originalTextFor: - -- ``originalTextFor( expr )`` - helper function to preserve the originally parsed text, regardless of any - token processing or conversion done by the contained expression. For instance, the following expression:: - - fullName = Word(alphas) + Word(alphas) - - will return the parse of "John Smith" as ['John', 'Smith']. In some applications, the actual name as it - was given in the input string is what is desired. To do this, use ``originalTextFor``:: - - fullName = originalTextFor(Word(alphas) + Word(alphas)) - -- ``ungroup( expr )`` - function to "ungroup" returned tokens; useful - to undo the default behavior of And to always group the returned tokens, even - if there is only one in the list. (New in 1.5.6) - -- ``lineno( loc, string )`` - function to give the line number of the - location within the string; the first line is line 1, newlines - start new rows - -- ``col( loc, string )`` - function to give the column number of the - location within the string; the first column is column 1, - newlines reset the column number to 1 - -- ``line( loc, string )`` - function to retrieve the line of text - representing ``lineno( loc, string )``; useful when printing out diagnostic - messages for exceptions - -- ``srange( rangeSpec )`` - function to define a string of characters, - given a string of the form used by regexp string ranges, such as ``"[0-9]"`` for - all numeric digits, ``"[A-Z_]"`` for uppercase characters plus underscore, and - so on (note that rangeSpec does not include support for generic regular - expressions, just string range specs) - -- ``getTokensEndLoc()`` - function to call from within a parse action to get - the ending location for the matched tokens - -- ``traceParseAction(fn)`` - decorator function to debug parse actions. 
Lists - each call, called arguments, and return value or exception - - - -Helper parse actions --------------------- - -- ``removeQuotes`` - removes the first and last characters of a quoted string; - useful to remove the delimiting quotes from quoted strings - -- ``replaceWith(replString)`` - returns a parse action that simply returns the - replString; useful when using transformString, or converting HTML entities, as in:: - - nbsp = Literal(" ").setParseAction( replaceWith("") ) - -- ``keepOriginalText``- (deprecated, use originalTextFor_ instead) restores any internal whitespace or suppressed - text within the tokens for a matched parse - expression. This is especially useful when defining expressions - for scanString or transformString applications. - -- ``withAttribute( *args, **kwargs )`` - helper to create a validating parse action to be used with start tags created - with ``makeXMLTags`` or ``makeHTMLTags``. Use ``withAttribute`` to qualify a starting tag - with a required attribute value, to avoid false matches on common tags such as - ```` or ``
``. - - ``withAttribute`` can be called with: - - - keyword arguments, as in ``(class="Customer",align="right")``, or - - - a list of name-value tuples, as in ``( ("ns1:class", "Customer"), ("ns2:align","right") )`` - - An attribute can be specified to have the special value - ``withAttribute.ANY_VALUE``, which will match any value - use this to - ensure that an attribute is present but any attribute value is - acceptable. - -- ``downcaseTokens`` - converts all matched tokens to lowercase - -- ``upcaseTokens`` - converts all matched tokens to uppercase - -- ``matchOnlyAtCol( columnNumber )`` - a parse action that verifies that - an expression was matched at a particular column, raising a - ParseException if matching at a different column number; useful when parsing - tabular data - - - -Common string and token constants ---------------------------------- - -- ``alphas`` - same as ``string.letters`` - -- ``nums`` - same as ``string.digits`` - -- ``alphanums`` - a string containing ``alphas + nums`` - -- ``alphas8bit`` - a string containing alphabetic 8-bit characters:: - - ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ - -- ``printables`` - same as ``string.printable``, minus the space (``' '``) character - -- ``empty`` - a global ``Empty()``; will always match - -- ``sglQuotedString`` - a string of characters enclosed in 's; may - include whitespace, but not newlines - -- ``dblQuotedString`` - a string of characters enclosed in "s; may - include whitespace, but not newlines - -- ``quotedString`` - ``sglQuotedString | dblQuotedString`` - -- ``cStyleComment`` - a comment block delimited by ``'/*'`` and ``'*/'`` sequences; can span - multiple lines, but does not support nesting of comments - -- ``htmlComment`` - a comment block delimited by ``''`` sequences; can span - multiple lines, but does not support nesting of comments - -- ``commaSeparatedList`` - similar to ``delimitedList``, except that the - list expressions can be any text value, or a quoted 
string; quoted strings can - safely include commas without incorrectly breaking the string into two tokens - -- ``restOfLine`` - all remaining printable characters up to but not including the next - newline +========================== +Using the pyparsing module +========================== + +:author: Paul McGuire +:address: ptmcg@users.sourceforge.net + +:revision: 2.0.1a +:date: July, 2013 (minor update August, 2018) + +:copyright: Copyright |copy| 2003-2013 Paul McGuire. + +.. |copy| unicode:: 0xA9 + +:abstract: This document provides how-to instructions for the + pyparsing library, an easy-to-use Python module for constructing + and executing basic text parsers. The pyparsing module is useful + for evaluating user-definable + expressions, processing custom application language commands, or + extracting data from formatted reports. + +.. sectnum:: :depth: 4 + +.. contents:: :depth: 4 + +Note: While this content is still valid, there are more detailed +descriptions and examples at the online doc server at +https://pythonhosted.org/pyparsing/pyparsing-module.html + +Steps to follow +=============== + +To parse an incoming data string, the client code must follow these steps: + +1. First define the tokens and patterns to be matched, and assign + this to a program variable. Optional results names or parsing + actions can also be defined at this time. + +2. Call ``parseString()`` or ``scanString()`` on this variable, passing in + the string to + be parsed. During the matching process, whitespace between + tokens is skipped by default (although this can be changed). + When token matches occur, any defined parse action methods are + called. + +3. Process the parsed results, returned as a list of strings. + Matching results may also be accessed as named attributes of + the returned results, if names are defined in the definition of + the token pattern, using ``setResultsName()``. + + +Hello, World! 
+------------- + +The following complete Python program will parse the greeting "Hello, World!", +or any other greeting of the form "<salutation>, <addressee>!":: + + from pyparsing import Word, alphas + + greet = Word(alphas) + "," + Word(alphas) + "!" + greeting = greet.parseString("Hello, World!") + print(greeting) + +The parsed tokens are returned in the following form:: + + ['Hello', ',', 'World', '!'] + + +Usage notes +----------- + +- The pyparsing module can be used to interpret simple command + strings or algebraic expressions, or can be used to extract data + from text reports with complicated format and structure ("screen + or report scraping"). However, it is possible that your defined + matching patterns may accept invalid inputs. Use pyparsing to + extract data from strings assumed to be well-formatted. + +- To keep up the readability of your code, use operators_ such as ``+``, ``|``, + ``^``, and ``~`` to combine expressions. You can also combine + string literals with ParseExpressions - they will be + automatically converted to Literal objects. For example:: + + integer = Word(nums) # simple unsigned integer + variable = Word(alphas, max=1) # single letter variable, such as x, z, m, etc. + arithOp = Word("+-*/", max=1) # arithmetic operators + equation = variable + "=" + integer + arithOp + integer # will match "x=2+2", etc. + + In the definition of ``equation``, the string ``"="`` will get added as + a ``Literal("=")``, but in a more readable way. + +- The pyparsing module's default behavior is to ignore whitespace. This is the + case for 99% of all parsers ever written. This allows you to write simple, clean, + grammars, such as the above ``equation``, without having to clutter it up with + extraneous ``ws`` markers.
The ``equation`` grammar will successfully parse all of the + following statements:: + + x=2+2 + x = 2+2 + a = 10 * 4 + r= 1234/ 100000 + + Of course, it is quite simple to extend this example to support more elaborate expressions, with + nesting with parentheses, floating point numbers, scientific notation, and named constants + (such as ``e`` or ``pi``). See ``fourFn.py``, included in the examples directory. + +- To modify pyparsing's default whitespace skipping, you can use one or + more of the following methods: + + - use the static method ``ParserElement.setDefaultWhitespaceChars`` + to override the normal set of whitespace chars (' \t\n'). For instance + when defining a grammar in which newlines are significant, you should + call ``ParserElement.setDefaultWhitespaceChars(' \t')`` to remove + newline from the set of skippable whitespace characters. Calling + this method will affect all pyparsing expressions defined afterward. + + - call ``leaveWhitespace()`` on individual expressions, to suppress the + skipping of whitespace before trying to match the expression + + - use ``Combine`` to require that successive expressions must be + adjacent in the input string. For instance, this expression:: + + real = Word(nums) + '.' + Word(nums) + + will match "3.14159", but will also match "3 . 12". It will also + return the matched results as ['3', '.', '14159']. By changing this + expression to:: + + real = Combine(Word(nums) + '.' + Word(nums)) + + it will not match numbers with embedded spaces, and it will return a + single concatenated string '3.14159' as the parsed token. + +- Repetition of expressions can be indicated using ``*`` or ``[]`` notation. An + expression may be multiplied by an integer value (to indicate an exact + repetition count), or indexed with a tuple, representing min and max repetitions + (with ``...`` representing no min or no max, depending whether it is the first or + second tuple element). 
See the following examples, where n is used to + indicate an integer value: + + - ``expr*3`` is equivalent to ``expr + expr + expr`` + + - ``expr[2, 3]`` is equivalent to ``expr + expr + Optional(expr)`` + + - ``expr[n, ...]`` or ``expr[n,]`` is equivalent + to ``expr*n + ZeroOrMore(expr)`` (read as "at least n instances of expr") + + - ``expr[... ,n]`` is equivalent to ``expr*(0, n)`` + (read as "0 to n instances of expr") + + - ``expr[...]`` and ``expr[0, ...]`` are equivalent to ``ZeroOrMore(expr)`` + + - ``expr[1, ...]`` is equivalent to ``OneOrMore(expr)`` + + Note that ``expr[..., n]`` does not raise an exception if + more than n exprs exist in the input stream; that is, + ``expr[..., n]`` does not enforce a maximum number of expr + occurrences. If this behavior is desired, then write + ``expr[..., n] + ~expr``. + +- ``MatchFirst`` expressions are matched left-to-right, and the first + match found will skip all later expressions within, so be sure + to define less-specific patterns after more-specific patterns. + If you are not sure which expressions are most specific, use Or + expressions (defined using the ``^`` operator) - they will always + match the longest expression, although they are more + compute-intensive. + +- ``Or`` expressions will evaluate all of the specified subexpressions + to determine which is the "best" match, that is, which matches + the longest string in the input data. In case of a tie, the + left-most expression in the ``Or`` list will win. + +- If parsing the contents of an entire file, pass it to the + ``parseFile`` method using:: + + expr.parseFile(sourceFile) + +- ``ParseExceptions`` will report the location where an expected token + or expression failed to match. For example, if we tried to use our + "Hello, World!" parser to parse "Hello World!" 
(leaving out the separating + comma), we would get an exception, with the message:: + + pyparsing.ParseException: Expected "," (6), (1,7) + + In the case of complex + expressions, the reported location may not be exactly where you + would expect. See more information under ParseException_ . + +- Use the ``Group`` class to enclose logical groups of tokens within a + sublist. This will help organize your results into more + hierarchical form (the default behavior is to return matching + tokens as a flat list of matching input strings). + +- Punctuation may be significant for matching, but is rarely of + much interest in the parsed results. Use the ``suppress()`` method + to keep these tokens from cluttering up your returned lists of + tokens. For example, ``delimitedList()`` matches a succession of + one or more expressions, separated by delimiters (commas by + default), but only returns a list of the actual expressions - + the delimiters are used for parsing, but are suppressed from the + returned output. + +- Parse actions can be used to convert values from strings to + other data types (ints, floats, booleans, etc.). + +- Results names are recommended for retrieving tokens from complex + expressions. It is much easier to access a token using its field + name than using a positional index, especially if the expression + contains optional elements. You can also shortcut + the ``setResultsName`` call:: + + stats = ("AVE:" + realNum.setResultsName("average") + + "MIN:" + realNum.setResultsName("min") + + "MAX:" + realNum.setResultsName("max")) + + can now be written as this:: + + stats = ("AVE:" + realNum("average") + + "MIN:" + realNum("min") + + "MAX:" + realNum("max")) + +- Be careful when defining parse actions that modify global variables or + data structures (as in ``fourFn.py``), especially for low level tokens + or expressions that may occur within an ``And`` expression; an early element + of an ``And`` may match, but the overall expression may fail. 
+ + +Classes +======= + +Classes in the pyparsing module +------------------------------- + +``ParserElement`` - abstract base class for all pyparsing classes; +methods for code to use are: + +- ``parseString(sourceString, parseAll=False)`` - only called once, on the overall + matching pattern; returns a ParseResults_ object that makes the + matched tokens available as a list, and optionally as a dictionary, + or as an object with named attributes; if parseAll is set to True, then + parseString will raise a ParseException if the grammar does not process + the complete input string. + +- ``parseFile(sourceFile)`` - a convenience function, that accepts an + input file object or filename. The file contents are passed as a + string to ``parseString()``. ``parseFile`` also supports the ``parseAll`` argument. + +- ``scanString(sourceString)`` - generator function, used to find and + extract matching text in the given source string; for each matched text, + returns a tuple of: + + - matched tokens (packaged as a ParseResults_ object) + + - start location of the matched text in the given source string + + - end location in the given source string + + ``scanString`` allows you to scan through the input source string for + random matches, instead of exhaustively defining the grammar for the entire + source text (as would be required with ``parseString``). + +- ``transformString(sourceString)`` - convenience wrapper function for + ``scanString``, to process the input source string, and replace matching + text with the tokens returned from parse actions defined in the grammar + (see setParseAction_). + +- ``searchString(sourceString)`` - another convenience wrapper function for + ``scanString``, returns a list of the matching tokens returned from each + call to ``scanString``. 
+ +- ``setName(name)`` - associate a short descriptive name for this + element, useful in displaying exceptions and trace information + +- ``setResultsName(string, listAllMatches=False)`` - name to be given + to tokens matching + the element; if multiple tokens within + a repetition group (such as ``ZeroOrMore`` or ``delimitedList``) the + default is to return only the last matching token - if listAllMatches + is set to True, then a list of all the matching tokens is returned. + (New in 1.5.6 - a results name with a trailing '*' character will be + interpreted as setting listAllMatches to True.) + Note: + ``setResultsName`` returns a *copy* of the element so that a single + basic element can be referenced multiple times and given + different names within a complex grammar. + +.. _setParseAction: + +- ``setParseAction(*fn)`` - specify one or more functions to call after successful + matching of the element; each function is defined as ``fn(s, loc, toks)``, where: + + - ``s`` is the original parse string + + - ``loc`` is the location in the string where matching started + + - ``toks`` is the list of the matched tokens, packaged as a ParseResults_ object + + Multiple functions can be attached to a ParserElement by specifying multiple + arguments to setParseAction, or by calling setParseAction multiple times. + + Each parse action function can return a modified ``toks`` list, to perform conversion, or + string modifications. For brevity, ``fn`` may also be a + lambda - here is an example of using a parse action to convert matched + integer tokens from strings to integers:: + + intNumber = Word(nums).setParseAction(lambda s,l,t: [int(t[0])]) + + If ``fn`` does not modify the ``toks`` list, it does not need to return + anything at all. 
+ +- ``setBreak(breakFlag=True)`` - if breakFlag is True, calls pdb.set_break() + as this expression is about to be parsed + +- ``copy()`` - returns a copy of a ParserElement; can be used to use the same + parse expression in different places in a grammar, with different parse actions + attached to each + +- ``leaveWhitespace()`` - change default behavior of skipping + whitespace before starting matching (mostly used internally to the + pyparsing module, rarely used by client code) + +- ``setWhitespaceChars(chars)`` - define the set of chars to be ignored + as whitespace before trying to match a specific ParserElement, in place of the + default set of whitespace (space, tab, newline, and return) + +- ``setDefaultWhitespaceChars(chars)`` - class-level method to override + the default set of whitespace chars for all subsequently created ParserElements + (including copies); useful when defining grammars that treat one or more of the + default whitespace characters as significant (such as a line-sensitive grammar, to + omit newline from the list of ignorable whitespace) + +- ``suppress()`` - convenience function to suppress the output of the + given element, instead of wrapping it with a Suppress object. + +- ``ignore(expr)`` - function to specify parse expression to be + ignored while matching defined patterns; can be called + repeatedly to specify multiple expressions; useful to specify + patterns of comment syntax, for example + +- ``setDebug(dbgFlag=True)`` - function to enable/disable tracing output + when trying to match this element + +- ``validate()`` - function to verify that the defined grammar does not + contain infinitely recursive constructs + +.. _parseWithTabs: + +- ``parseWithTabs()`` - function to override default behavior of converting + tabs to spaces before parsing the input string; rarely used, except when + specifying whitespace-significant grammars using the White_ class. 
+ +- ``enablePackrat()`` - a class-level static method to enable a memoizing + performance enhancement, known as "packrat parsing". packrat parsing is + disabled by default, since it may conflict with some user programs that use + parse actions. To activate the packrat feature, your + program must call the class method ParserElement.enablePackrat(). For best + results, call enablePackrat() immediately after importing pyparsing. + + +Basic ParserElement subclasses +------------------------------ + +- ``Literal`` - construct with a string to be matched exactly + +- ``CaselessLiteral`` - construct with a string to be matched, but + without case checking; results are always returned as the + defining literal, NOT as they are found in the input string + +- ``Keyword`` - similar to Literal, but must be immediately followed by + whitespace, punctuation, or other non-keyword characters; prevents + accidental matching of a non-keyword that happens to begin with a + defined keyword + +- ``CaselessKeyword`` - similar to Keyword, but with caseless matching + behavior + +.. _Word: + +- ``Word`` - one or more contiguous characters; construct with a + string containing the set of allowed initial characters, and an + optional second string of allowed body characters; for instance, + a common Word construct is to match a code identifier - in C, a + valid identifier must start with an alphabetic character or an + underscore ('_'), followed by a body that can also include numeric + digits. That is, ``a``, ``i``, ``MAX_LENGTH``, ``_a1``, ``b_109_``, and + ``plan9FromOuterSpace`` + are all valid identifiers; ``9b7z``, ``$a``, ``.section``, and ``0debug`` + are not. 
To + define an identifier using a Word, use either of the following:: + + - Word(alphas+"_", alphanums+"_") + - Word(srange("[a-zA-Z_]"), srange("[a-zA-Z0-9_]")) + + If only one + string given, it specifies that the same character set defined + for the initial character is used for the word body; for instance, to + define an identifier that can only be composed of capital letters and + underscores, use:: + + - Word("ABCDEFGHIJKLMNOPQRSTUVWXYZ_") + - Word(srange("[A-Z_]")) + + A Word may + also be constructed with any of the following optional parameters: + + - ``min`` - indicating a minimum length of matching characters + + - ``max`` - indicating a maximum length of matching characters + + - ``exact`` - indicating an exact length of matching characters + + If ``exact`` is specified, it will override any values for ``min`` or ``max``. + + New in 1.5.6 - Sometimes you want to define a word using all + characters in a range except for one or two of them; you can do this + with the new ``excludeChars`` argument. This is helpful if you want to define + a word with all printables except for a single delimiter character, such + as '.'. Previously, you would have to create a custom string to pass to Word. + With this change, you can just create ``Word(printables, excludeChars='.')``. + +- ``CharsNotIn`` - similar to Word_, but matches characters not + in the given constructor string (accepts only one string for both + initial and body characters); also supports ``min``, ``max``, and ``exact`` + optional parameters. 
+ +- ``Regex`` - a powerful construct, that accepts a regular expression + to be matched at the current parse position; accepts an optional + ``flags`` parameter, corresponding to the flags parameter in the re.compile + method; if the expression includes named sub-fields, they will be + represented in the returned ParseResults_ + +- ``QuotedString`` - supports the definition of custom quoted string + formats, in addition to pyparsing's built-in ``dblQuotedString`` and + ``sglQuotedString``. ``QuotedString`` allows you to specify the following + parameters: + + - ``quoteChar`` - string of one or more characters defining the quote delimiting string + + - ``escChar`` - character to escape quotes, typically backslash (default=None) + + - ``escQuote`` - special quote sequence to escape an embedded quote string (such as SQL's "" to escape an embedded ") (default=None) + + - ``multiline`` - boolean indicating whether quotes can span multiple lines (default=False) + + - ``unquoteResults`` - boolean indicating whether the matched text should be unquoted (default=True) + + - ``endQuoteChar`` - string of one or more characters defining the end of the quote delimited string (default=None => same as quoteChar) + +- ``SkipTo`` - skips ahead in the input string, accepting any + characters up to the specified pattern; may be constructed with + the following optional parameters: + + - ``include`` - if set to true, also consumes the match expression + (default is false) + + - ``ignore`` - allows the user to specify patterns to not be matched, + to prevent false matches + + - ``failOn`` - if a literal string or expression is given for this argument, it defines an expression that + should cause the ``SkipTo`` expression to fail, and not skip over that expression + +.. _White: + +- ``White`` - also similar to Word_, but matches whitespace + characters. Not usually needed, as whitespace is implicitly + ignored by pyparsing. 
However, some grammars are whitespace-sensitive, + such as those that use leading tabs or spaces to indicating grouping + or hierarchy. (If matching on tab characters, be sure to call + parseWithTabs_ on the top-level parse element.) + +- ``Empty`` - a null expression, requiring no characters - will always + match; useful for debugging and for specialized grammars + +- ``NoMatch`` - opposite of Empty, will never match; useful for debugging + and for specialized grammars + + +Expression subclasses +--------------------- + +- ``And`` - construct with a list of ParserElements, all of which must + match for And to match; can also be created using the '+' + operator; multiple expressions can be Anded together using the '*' + operator as in:: + + ipAddress = Word(nums) + ('.' + Word(nums)) * 3 + + A tuple can be used as the multiplier, indicating a min/max:: + + usPhoneNumber = Word(nums) + ('-' + Word(nums)) * (1,2) + + A special form of ``And`` is created if the '-' operator is used + instead of the '+' operator. In the ipAddress example above, if + no trailing '.' and Word(nums) are found after matching the initial + Word(nums), then pyparsing will back up in the grammar and try other + alternatives to ipAddress. However, if ipAddress is defined as:: + + strictIpAddress = Word(nums) - ('.'+Word(nums))*3 + + then no backing up is done. If the first Word(nums) of strictIpAddress + is matched, then any mismatch after that will raise a ParseSyntaxException, + which will halt the parsing process immediately. By careful use of the + '-' operator, grammars can provide meaningful error messages close to + the location where the incoming text does not match the specified + grammar. 
+ +- ``Or`` - construct with a list of ParserElements, any of which must + match for Or to match; if more than one expression matches, the + expression that makes the longest match will be used; can also + be created using the '^' operator + +- ``MatchFirst`` - construct with a list of ParserElements, any of + which must match for MatchFirst to match; matching is done + left-to-right, taking the first expression that matches; can + also be created using the '|' operator + +- ``Each`` - similar to And, in that all of the provided expressions + must match; however, Each permits matching to be done in any order; + can also be created using the '&' operator + +- ``Optional`` - construct with a ParserElement, but this element is + not required to match; can be constructed with an optional ``default`` argument, + containing a default string or object to be supplied if the given optional + parse element is not found in the input string; parse action will only + be called if a match is found, or if a default is specified + +- ``ZeroOrMore`` - similar to Optional, but can be repeated + +- ``OneOrMore`` - similar to ZeroOrMore, but at least one match must + be present + +- ``FollowedBy`` - a lookahead expression, requires matching of the given + expressions, but does not advance the parsing position within the input string + +- ``NotAny`` - a negative lookahead expression, prevents matching of named + expressions, does not advance the parsing position within the input string; + can also be created using the unary '~' operator + + +.. 
_operators:
+
+Expression operators
+--------------------
+
+- ``~`` - creates NotAny using the expression after the operator
+
+- ``+`` - creates And using the expressions before and after the operator
+
+- ``|`` - creates MatchFirst (first left-to-right match) using the expressions before and after the operator
+
+- ``^`` - creates Or (longest match) using the expressions before and after the operator
+
+- ``&`` - creates Each using the expressions before and after the operator
+
+- ``*`` - creates And by multiplying the expression by the integer operand; if
+  expression is multiplied by a 2-tuple, creates an And of (min,max)
+  expressions (similar to "{min,max}" form in regular expressions); if
+  min is None, interpret as (0,max); if max is None, interpret as
+  expr*min + ZeroOrMore(expr)
+
+- ``-`` - like ``+`` but with no backup and retry of alternatives
+
+- ``*`` - repetition of expression
+
+- ``==`` - matching expression to string; returns True if the string matches the given expression
+
+- ``<<=`` - inserts the expression following the operator as the body of the
+  Forward expression before the operator
+
+
+
+Positional subclasses
+---------------------
+
+- ``StringStart`` - matches beginning of the text
+
+- ``StringEnd`` - matches the end of the text
+
+- ``LineStart`` - matches beginning of a line (lines delimited by ``\n`` characters)
+
+- ``LineEnd`` - matches the end of a line
+
+- ``WordStart`` - matches a leading word boundary
+
+- ``WordEnd`` - matches a trailing word boundary
+
+
+
+Converter subclasses
+--------------------
+
+- ``Combine`` - joins all matched tokens into a single string, using
+  specified joinString (default ``joinString=""``); expects
+  all matching tokens to be adjacent, with no intervening
+  whitespace (can be overridden by specifying ``adjacent=False`` in constructor)
+
+- ``Suppress`` - clears matched tokens; useful to keep returned
+  results from being cluttered with required but uninteresting
+  tokens (such as list
delimiters) + + +Special subclasses +------------------ + +- ``Group`` - causes the matched tokens to be enclosed in a list; + useful in repeated elements like ``ZeroOrMore`` and ``OneOrMore`` to + break up matched tokens into groups for each repeated pattern + +- ``Dict`` - like ``Group``, but also constructs a dictionary, using the + [0]'th elements of all enclosed token lists as the keys, and + each token list as the value + +- ``SkipTo`` - catch-all matching expression that accepts all characters + up until the given pattern is found to match; useful for specifying + incomplete grammars + +- ``Forward`` - placeholder token used to define recursive token + patterns; when defining the actual expression later in the + program, insert it into the ``Forward`` object using the ``<<`` + operator (see ``fourFn.py`` for an example). + + +Other classes +------------- +.. _ParseResults: + +- ``ParseResults`` - class used to contain and manage the lists of tokens + created from parsing the input using the user-defined parse + expression. ParseResults can be accessed in a number of ways: + + - as a list + + - total list of elements can be found using len() + + - individual elements can be found using [0], [1], [-1], etc. + + - elements can be deleted using ``del`` + + - the -1th element can be extracted and removed in a single operation + using ``pop()``, or any element can be extracted and removed + using ``pop(n)`` + + - as a dictionary + + - if ``setResultsName()`` is used to name elements within the + overall parse expression, then these fields can be referenced + as dictionary elements or as attributes + + - the Dict class generates dictionary entries using the data of the + input text - in addition to ParseResults listed as ``[ [ a1, b1, c1, ...], [ a2, b2, c2, ...] ]`` + it also acts as a dictionary with entries defined as ``{ a1 : [ b1, c1, ... ] }, { a2 : [ b2, c2, ... 
] }``;
+      this is especially useful when processing tabular data where the first column contains a key
+      value for that line of data
+
+    - list elements that are deleted using ``del`` will still be accessible by their
+      dictionary keys
+
+    - supports ``get()``, ``items()`` and ``keys()`` methods, similar to a dictionary
+
+    - a keyed item can be extracted and removed using ``pop(key)``.  Here
+      key must be non-numeric (such as a string), in order to use dict
+      extraction instead of list extraction.
+
+    - new named elements can be added (in a parse action, for instance), using the same
+      syntax as adding an item to a dict (``parseResults["X"] = "new item"``); named elements can be removed using ``del parseResults["X"]``
+
+  - as a nested list
+
+    - results returned from the Group class are encapsulated within their
+      own list structure, so that the tokens can be handled as a hierarchical
+      tree
+
+  ParseResults can also be converted to an ordinary list of strings
+  by calling ``asList()``.  Note that this will strip the results of any
+  field names that have been defined for any embedded parse elements.
+  (The ``pprint`` module is especially good at printing out the nested contents
+  given by ``asList()``.)
+
+  Finally, ParseResults can be viewed by calling ``dump()``.  ``dump()`` will first show
+  the ``asList()`` output, followed by an indented structure listing parsed tokens that
+  have been assigned results names.
+
+
+Exception classes and Troubleshooting
+-------------------------------------
+
+..
_ParseException:
+
+- ``ParseException`` - exception returned when a grammar parse fails;
+  ParseExceptions have attributes loc, msg, line, lineno, and column; to view the
+  text line and location where the reported ParseException occurs, use::
+
+    except ParseException as err:
+        print(err.line)
+        print(" " * (err.column - 1) + "^")
+        print(err)
+
+- ``RecursiveGrammarException`` - exception returned by ``validate()`` if
+  the grammar contains a recursive infinite loop, such as::
+
+    badGrammar = Forward()
+    goodToken = Literal("A")
+    badGrammar <<= Optional(goodToken) + badGrammar
+
+- ``ParseFatalException`` - exception that parse actions can raise to stop parsing
+  immediately.  Should be used when a semantic error is found in the input text, such
+  as a mismatched XML tag.
+
+- ``ParseSyntaxException`` - subclass of ``ParseFatalException`` raised when a
+  syntax error is found, based on the use of the '-' operator when defining
+  a sequence of expressions in an ``And`` expression.
+
+You can also get some insights into the parsing logic using diagnostic parse actions,
+and setDebug(), or test the matching of expression fragments by testing them using
+scanString().
+
+
+Miscellaneous attributes and methods
+====================================
+
+Helper methods
+--------------
+
+- ``delimitedList(expr, delim=',')`` - convenience function for
+  matching one or more occurrences of expr, separated by delim.
+  By default, the delimiters are suppressed, so the returned results contain
+  only the separate list elements.  Can optionally specify ``combine=True``,
+  indicating that the expressions and delimiters should be returned as one
+  combined value (useful for scoped variables, such as ``"a.b.c"``, or
+  ``"a::b::c"``, or paths such as ``"a/b/c"``).
+
+- ``countedArray(expr)`` - convenience function for a pattern where a list of
+  instances of the given expression are preceded by an integer giving the count of
+  elements in the list.
Returns an expression that parses the leading integer,
+  reads exactly that many expressions, and returns the array of expressions in the
+  parse results - the leading integer is suppressed from the results (although it
+  is easily reconstructed by using len on the returned array).
+
+- ``oneOf(string, caseless=False)`` - convenience function for quickly declaring an
+  alternative set of ``Literal`` tokens, by splitting the given string on
+  whitespace boundaries.  The tokens are sorted so that longer
+  matches are attempted first; this ensures that a short token does
+  not mask a longer one that starts with the same characters.  If ``caseless=True``,
+  will create an alternative set of CaselessLiteral tokens.
+
+- ``dictOf(key, value)`` - convenience function for quickly declaring a
+  dictionary pattern of ``Dict(ZeroOrMore(Group(key + value)))``.
+
+- ``makeHTMLTags(tagName)`` and ``makeXMLTags(tagName)`` - convenience
+  functions to create definitions of opening and closing tag expressions.  Returns
+  a pair of expressions, for the corresponding ``<tagName>`` and ``</tagName>``
+  strings.  Includes support for attributes in the opening tag, such as
+  ``<tagName attr1="xyz">`` - attributes
+  are returned as keyed tokens in the returned ParseResults.  ``makeHTMLTags`` is less
+  restrictive than ``makeXMLTags``, especially with respect to case sensitivity.
+
+- ``infixNotation(baseOperand, operatorList)`` - (formerly named ``operatorPrecedence``)
+  convenience function to define a grammar for parsing infix notation
+  expressions with a hierarchical precedence of operators.  To use the ``infixNotation``
+  helper:
+
+  1. Define the base "atom" operand term of the grammar.
+     For this simple grammar, the smallest operand is either
+     an integer or a variable.  This will be the first argument
+     to the ``infixNotation`` method.
+
+  2. Define a list of tuples for each level of operator
+     precedence.
Each tuple is of the form
+     ``(opExpr, numTerms, rightLeftAssoc, parseAction)``, where:
+
+     - ``opExpr`` - the pyparsing expression for the operator;
+       may also be a string, which will be converted to a Literal; if
+       None, indicates an empty operator, such as the implied
+       multiplication operation between 'm' and 'x' in "y = mx + b".
+
+     - ``numTerms`` - the number of terms for this operator (must
+       be 1, 2, or 3)
+
+     - ``rightLeftAssoc`` is the indicator whether the operator is
+       right or left associative, using the pyparsing-defined
+       constants ``opAssoc.RIGHT`` and ``opAssoc.LEFT``.
+
+     - ``parseAction`` is the parse action to be associated with
+       expressions matching this operator expression (the
+       ``parseAction`` tuple member may be omitted)
+
+  3. Call ``infixNotation`` passing the operand expression and
+     the operator precedence list, and save the returned value
+     as the generated pyparsing expression.  You can then use
+     this expression to parse input strings, or incorporate it
+     into a larger, more complex grammar.
+
+- ``matchPreviousLiteral`` and ``matchPreviousExpr`` - functions to define an
+  expression that matches the same content
+  as was parsed in a previous parse expression.  For instance::
+
+    first = Word(nums)
+    matchExpr = first + ":" + matchPreviousLiteral(first)
+
+  will match "1:1", but not "1:2".  Since this matches at the literal
+  level, this will also match the leading "1:1" in "1:10".
+
+  In contrast::
+
+    first = Word(nums)
+    matchExpr = first + ":" + matchPreviousExpr(first)
+
+  will *not* match the leading "1:1" in "1:10"; the expressions are
+  evaluated first, and then compared, so "1" is compared with "10".
+
+- ``nestedExpr(opener, closer, content=None, ignoreExpr=quotedString)`` - method for defining nested
+  lists enclosed in opening and closing delimiters.
+
+  - ``opener`` - opening character for a nested list (default="("); can also be a pyparsing expression
+
+  - ``closer`` - closing character for a nested list (default=")"); can also be a pyparsing expression
+
+  - ``content`` - expression for items within the nested lists (default=None)
+
+  - ``ignoreExpr`` - expression for ignoring opening and closing delimiters (default=quotedString)
+
+  If an expression is not provided for the content argument, the nested
+  expression will capture all whitespace-delimited content between delimiters
+  as a list of separate values.
+
+  Use the ignoreExpr argument to define expressions that may contain
+  opening or closing characters that should not be treated as opening
+  or closing characters for nesting, such as quotedString or a comment
+  expression.  Specify multiple expressions using an Or or MatchFirst.
+  The default is quotedString, but if no expressions are to be ignored,
+  then pass None for this argument.
+
+
+- ``indentedBlock(statementExpr, indentationStackVar, indent=True)`` -
+  function to define an indented block of statements, similar to
+  indentation-based blocking in Python source code:
+
+  - ``statementExpr`` - the expression defining a statement that
+    will be found in the indented block; a valid ``indentedBlock``
+    must contain at least 1 matching ``statementExpr``
+
+  - ``indentationStackVar`` - a Python list variable; this variable
+    should be common to all ``indentedBlock`` expressions defined
+    within the same grammar, and should be reinitialized to [1]
+    each time the grammar is to be used
+
+  - ``indent`` - a boolean flag indicating whether the expressions
+    within the block must be indented from the current parse
+    location; if using ``indentedBlock`` to define the left-most
+    statements (all starting in column 1), set ``indent`` to False
+
+..
_originalTextFor: + +- ``originalTextFor(expr)`` - helper function to preserve the originally parsed text, regardless of any + token processing or conversion done by the contained expression. For instance, the following expression:: + + fullName = Word(alphas) + Word(alphas) + + will return the parse of "John Smith" as ['John', 'Smith']. In some applications, the actual name as it + was given in the input string is what is desired. To do this, use ``originalTextFor``:: + + fullName = originalTextFor(Word(alphas) + Word(alphas)) + +- ``ungroup(expr)`` - function to "ungroup" returned tokens; useful + to undo the default behavior of And to always group the returned tokens, even + if there is only one in the list. (New in 1.5.6) + +- ``lineno(loc, string)`` - function to give the line number of the + location within the string; the first line is line 1, newlines + start new rows + +- ``col(loc, string)`` - function to give the column number of the + location within the string; the first column is column 1, + newlines reset the column number to 1 + +- ``line(loc, string)`` - function to retrieve the line of text + representing ``lineno(loc, string)``; useful when printing out diagnostic + messages for exceptions + +- ``srange(rangeSpec)`` - function to define a string of characters, + given a string of the form used by regexp string ranges, such as ``"[0-9]"`` for + all numeric digits, ``"[A-Z_]"`` for uppercase characters plus underscore, and + so on (note that rangeSpec does not include support for generic regular + expressions, just string range specs) + +- ``getTokensEndLoc()`` - function to call from within a parse action to get + the ending location for the matched tokens + +- ``traceParseAction(fn)`` - decorator function to debug parse actions. 
Lists + each call, called arguments, and return value or exception + + + +Helper parse actions +-------------------- + +- ``removeQuotes`` - removes the first and last characters of a quoted string; + useful to remove the delimiting quotes from quoted strings + +- ``replaceWith(replString)`` - returns a parse action that simply returns the + replString; useful when using transformString, or converting HTML entities, as in:: + + nbsp = Literal(" ").setParseAction(replaceWith("")) + +- ``keepOriginalText``- (deprecated, use originalTextFor_ instead) restores any internal whitespace or suppressed + text within the tokens for a matched parse + expression. This is especially useful when defining expressions + for scanString or transformString applications. + +- ``withAttribute(*args, **kwargs)`` - helper to create a validating parse action to be used with start tags created + with ``makeXMLTags`` or ``makeHTMLTags``. Use ``withAttribute`` to qualify a starting tag + with a required attribute value, to avoid false matches on common tags such as + ```` or ``
``. + + ``withAttribute`` can be called with: + + - keyword arguments, as in ``(class="Customer", align="right")``, or + + - a list of name-value tuples, as in ``(("ns1:class", "Customer"), ("ns2:align", "right"))`` + + An attribute can be specified to have the special value + ``withAttribute.ANY_VALUE``, which will match any value - use this to + ensure that an attribute is present but any attribute value is + acceptable. + +- ``downcaseTokens`` - converts all matched tokens to lowercase + +- ``upcaseTokens`` - converts all matched tokens to uppercase + +- ``matchOnlyAtCol(columnNumber)`` - a parse action that verifies that + an expression was matched at a particular column, raising a + ParseException if matching at a different column number; useful when parsing + tabular data + + + +Common string and token constants +--------------------------------- + +- ``alphas`` - same as ``string.letters`` + +- ``nums`` - same as ``string.digits`` + +- ``alphanums`` - a string containing ``alphas + nums`` + +- ``alphas8bit`` - a string containing alphabetic 8-bit characters:: + + ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ + +- ``printables`` - same as ``string.printable``, minus the space (``' '``) character + +- ``empty`` - a global ``Empty()``; will always match + +- ``sglQuotedString`` - a string of characters enclosed in 's; may + include whitespace, but not newlines + +- ``dblQuotedString`` - a string of characters enclosed in "s; may + include whitespace, but not newlines + +- ``quotedString`` - ``sglQuotedString | dblQuotedString`` + +- ``cStyleComment`` - a comment block delimited by ``'/*'`` and ``'*/'`` sequences; can span + multiple lines, but does not support nesting of comments + +- ``htmlComment`` - a comment block delimited by ``''`` sequences; can span + multiple lines, but does not support nesting of comments + +- ``commaSeparatedList`` - similar to ``delimitedList``, except that the + list expressions can be any text value, or a quoted 
string; quoted strings can + safely include commas without incorrectly breaking the string into two tokens + +- ``restOfLine`` - all remaining printable characters up to but not including the next + newline diff --git a/examples/javascript_grammar.g b/examples/javascript_grammar.g new file mode 100644 index 0000000..49fc238 --- /dev/null +++ b/examples/javascript_grammar.g @@ -0,0 +1,894 @@ +/* + Copyright 2008 Chris Lambrou. + All rights reserved. +*/ + +grammar JavaScript; + +options +{ + output=AST; + backtrack=true; + memoize=true; +} + +program + : LT!* sourceElements LT!* EOF! + ; + +sourceElements + : sourceElement (LT!* sourceElement)* + ; + +sourceElement + : functionDeclaration + | statement + ; + +// functions +functionDeclaration + : 'function' LT!* Identifier LT!* formalParameterList LT!* functionBody + ; + +functionExpression + : 'function' LT!* Identifier? LT!* formalParameterList LT!* functionBody + ; + +formalParameterList + : '(' (LT!* Identifier (LT!* ',' LT!* Identifier)*)? LT!* ')' + ; + +functionBody + : '{' LT!* sourceElements LT!* '}' + ; + +// statements +statement + : statementBlock + | variableStatement + | emptyStatement + | expressionStatement + | ifStatement + | iterationStatement + | continueStatement + | breakStatement + | returnStatement + | withStatement + | labelledStatement + | switchStatement + | throwStatement + | tryStatement + ; + +statementBlock + : '{' LT!* statementList? LT!* '}' + ; + +statementList + : statement (LT!* statement)* + ; + +variableStatement + : 'var' LT!* variableDeclarationList (LT | ';')! + ; + +variableDeclarationList + : variableDeclaration (LT!* ',' LT!* variableDeclaration)* + ; + +variableDeclarationListNoIn + : variableDeclarationNoIn (LT!* ',' LT!* variableDeclarationNoIn)* + ; + +variableDeclaration + : Identifier LT!* initialiser? + ; + +variableDeclarationNoIn + : Identifier LT!* initialiserNoIn? 
+ ; + +initialiser + : '=' LT!* assignmentExpression + ; + +initialiserNoIn + : '=' LT!* assignmentExpressionNoIn + ; + +emptyStatement + : ';' + ; + +expressionStatement + : expression (LT | ';')! + ; + +ifStatement + : 'if' LT!* '(' LT!* expression LT!* ')' LT!* statement (LT!* 'else' LT!* statement)? + ; + +iterationStatement + : doWhileStatement + | whileStatement + | forStatement + | forInStatement + ; + +doWhileStatement + : 'do' LT!* statement LT!* 'while' LT!* '(' expression ')' (LT | ';')! + ; + +whileStatement + : 'while' LT!* '(' LT!* expression LT!* ')' LT!* statement + ; + +forStatement + : 'for' LT!* '(' (LT!* forStatementInitialiserPart)? LT!* ';' (LT!* expression)? LT!* ';' (LT!* expression)? LT!* ')' LT!* statement + ; + +forStatementInitialiserPart + : expressionNoIn + | 'var' LT!* variableDeclarationListNoIn + ; + +forInStatement + : 'for' LT!* '(' LT!* forInStatementInitialiserPart LT!* 'in' LT!* expression LT!* ')' LT!* statement + ; + +forInStatementInitialiserPart + : leftHandSideExpression + | 'var' LT!* variableDeclarationNoIn + ; + +continueStatement + : 'continue' Identifier? (LT | ';')! + ; + +breakStatement + : 'break' Identifier? (LT | ';')! + ; + +returnStatement + : 'return' expression? (LT | ';')! + ; + +withStatement + : 'with' LT!* '(' LT!* expression LT!* ')' LT!* statement + ; + +labelledStatement + : Identifier LT!* ':' LT!* statement + ; + +switchStatement + : 'switch' LT!* '(' LT!* expression LT!* ')' LT!* caseBlock + ; + +caseBlock + : '{' (LT!* caseClause)* (LT!* defaultClause (LT!* caseClause)*)? LT!* '}' + ; + +caseClause + : 'case' LT!* expression LT!* ':' LT!* statementList? + ; + +defaultClause + : 'default' LT!* ':' LT!* statementList? + ; + +throwStatement + : 'throw' expression (LT | ';')! + ; + +tryStatement + : 'try' LT!* statementBlock LT!* (finallyClause | catchClause (LT!* finallyClause)?) 
+ ; + +catchClause + : 'catch' LT!* '(' LT!* Identifier LT!* ')' LT!* statementBlock + ; + +finallyClause + : 'finally' LT!* statementBlock + ; + +// expressions +expression + : assignmentExpression (LT!* ',' LT!* assignmentExpression)* + ; + +expressionNoIn + : assignmentExpressionNoIn (LT!* ',' LT!* assignmentExpressionNoIn)* + ; + +assignmentExpression + : conditionalExpression + | leftHandSideExpression LT!* assignmentOperator LT!* assignmentExpression + ; + +assignmentExpressionNoIn + : conditionalExpressionNoIn + | leftHandSideExpression LT!* assignmentOperator LT!* assignmentExpressionNoIn + ; + +leftHandSideExpression + : callExpression + | newExpression + ; + +newExpression + : memberExpression + | 'new' LT!* newExpression + ; + +memberExpression + : (primaryExpression | functionExpression | 'new' LT!* memberExpression LT!* arguments) (LT!* memberExpressionSuffix)* + ; + +memberExpressionSuffix + : indexSuffix + | propertyReferenceSuffix + ; + +callExpression + : memberExpression LT!* arguments (LT!* callExpressionSuffix)* + ; + +callExpressionSuffix + : arguments + | indexSuffix + | propertyReferenceSuffix + ; + +arguments + : '(' (LT!* assignmentExpression (LT!* ',' LT!* assignmentExpression)*)? LT!* ')' + ; + +indexSuffix + : '[' LT!* expression LT!* ']' + ; + +propertyReferenceSuffix + : '.' LT!* Identifier + ; + +assignmentOperator + : '=' | '*=' | '/=' | '%=' | '+=' | '-=' | '<<=' | '>>=' | '>>>=' | '&=' | '^=' | '|=' + ; + +conditionalExpression + : logicalORExpression (LT!* '?' LT!* assignmentExpression LT!* ':' LT!* assignmentExpression)? + ; + +conditionalExpressionNoIn + : logicalORExpressionNoIn (LT!* '?' LT!* assignmentExpressionNoIn LT!* ':' LT!* assignmentExpressionNoIn)? 
+ ; + +logicalORExpression + : logicalANDExpression (LT!* '||' LT!* logicalANDExpression)* + ; + +logicalORExpressionNoIn + : logicalANDExpressionNoIn (LT!* '||' LT!* logicalANDExpressionNoIn)* + ; + +logicalANDExpression + : bitwiseORExpression (LT!* '&&' LT!* bitwiseORExpression)* + ; + +logicalANDExpressionNoIn + : bitwiseORExpressionNoIn (LT!* '&&' LT!* bitwiseORExpressionNoIn)* + ; + +bitwiseORExpression + : bitwiseXORExpression (LT!* '|' LT!* bitwiseXORExpression)* + ; + +bitwiseORExpressionNoIn + : bitwiseXORExpressionNoIn (LT!* '|' LT!* bitwiseXORExpressionNoIn)* + ; + +bitwiseXORExpression + : bitwiseANDExpression (LT!* '^' LT!* bitwiseANDExpression)* + ; + +bitwiseXORExpressionNoIn + : bitwiseANDExpressionNoIn (LT!* '^' LT!* bitwiseANDExpressionNoIn)* + ; + +bitwiseANDExpression + : equalityExpression (LT!* '&' LT!* equalityExpression)* + ; + +bitwiseANDExpressionNoIn + : equalityExpressionNoIn (LT!* '&' LT!* equalityExpressionNoIn)* + ; + +equalityExpression + : relationalExpression (LT!* ('==' | '!=' | '===' | '!==') LT!* relationalExpression)* + ; + +equalityExpressionNoIn + : relationalExpressionNoIn (LT!* ('==' | '!=' | '===' | '!==') LT!* relationalExpressionNoIn)* + ; + +relationalExpression + : shiftExpression (LT!* ('<' | '>' | '<=' | '>=' | 'instanceof' | 'in') LT!* shiftExpression)* + ; + +relationalExpressionNoIn + : shiftExpression (LT!* ('<' | '>' | '<=' | '>=' | 'instanceof') LT!* shiftExpression)* + ; + +shiftExpression + : additiveExpression (LT!* ('<<' | '>>' | '>>>') LT!* additiveExpression)* + ; + +additiveExpression + : multiplicativeExpression (LT!* ('+' | '-') LT!* multiplicativeExpression)* + ; + +multiplicativeExpression + : unaryExpression (LT!* ('*' | '/' | '%') LT!* unaryExpression)* + ; + +unaryExpression + : postfixExpression + | ('delete' | 'void' | 'typeof' | '++' | '--' | '+' | '-' | '~' | '!') unaryExpression + ; + +postfixExpression + : leftHandSideExpression ('++' | '--')? 
+ ; + +primaryExpression + : 'this' + | Identifier + | literal + | arrayLiteral + | objectLiteral + | '(' LT!* expression LT!* ')' + ; + +// arrayLiteral definition. +arrayLiteral + : '[' LT!* assignmentExpression? (LT!* ',' (LT!* assignmentExpression)?)* LT!* ']' + ; + +// objectLiteral definition. +objectLiteral + : '{' LT!* propertyNameAndValue (LT!* ',' LT!* propertyNameAndValue)* LT!* '}' + ; + +propertyNameAndValue + : propertyName LT!* ':' LT!* assignmentExpression + ; + +propertyName + : Identifier + | StringLiteral + | NumericLiteral + ; + +// primitive literal definition. +literal + : 'null' + | 'true' + | 'false' + | StringLiteral + | NumericLiteral + ; + +// lexer rules. +StringLiteral + : '"' DoubleStringCharacter* '"' + | '\'' SingleStringCharacter* '\'' + ; + +fragment DoubleStringCharacter + : ~('"' | '\\' | LT) + | '\\' EscapeSequence + ; + +fragment SingleStringCharacter + : ~('\'' | '\\' | LT) + | '\\' EscapeSequence + ; + +fragment EscapeSequence + : CharacterEscapeSequence + | '0' + | HexEscapeSequence + | UnicodeEscapeSequence + ; + +fragment CharacterEscapeSequence + : SingleEscapeCharacter + | NonEscapeCharacter + ; + +fragment NonEscapeCharacter + : ~(EscapeCharacter | LT) + ; + +fragment SingleEscapeCharacter + : '\'' | '"' | '\\' | 'b' | 'f' | 'n' | 'r' | 't' | 'v' + ; + +fragment EscapeCharacter + : SingleEscapeCharacter + | DecimalDigit + | 'x' + | 'u' + ; + +fragment HexEscapeSequence + : 'x' HexDigit HexDigit + ; + +fragment UnicodeEscapeSequence + : 'u' HexDigit HexDigit HexDigit HexDigit + ; + +NumericLiteral + : DecimalLiteral + | HexIntegerLiteral + ; + +fragment HexIntegerLiteral + : '0' ('x' | 'X') HexDigit+ + ; + +fragment HexDigit + : DecimalDigit | ('a'..'f') | ('A'..'F') + ; + +fragment DecimalLiteral + : DecimalDigit+ '.' DecimalDigit* ExponentPart? + | '.'? DecimalDigit+ ExponentPart? + ; + +fragment DecimalDigit + : ('0'..'9') + ; + +fragment ExponentPart + : ('e' | 'E') ('+' | '-') ? 
DecimalDigit+ + ; + +Identifier + : IdentifierStart IdentifierPart* + ; + +fragment IdentifierStart + : UnicodeLetter + | '$' + | '_' + | '\\' UnicodeEscapeSequence + ; + +fragment IdentifierPart + : (IdentifierStart) => IdentifierStart // Avoids ambiguity, as some IdentifierStart chars also match following alternatives. + | UnicodeDigit + | UnicodeConnectorPunctuation + ; + +fragment UnicodeLetter // Any character in the Unicode categories "Uppercase letter (Lu)", + : '\u0041'..'\u005A' // "Lowercase letter (Ll)", "Titlecase letter (Lt)", + | '\u0061'..'\u007A' // "Modifier letter (Lm)", "Other letter (Lo)", or "Letter number (Nl)". + | '\u00AA' + | '\u00B5' + | '\u00BA' + | '\u00C0'..'\u00D6' + | '\u00D8'..'\u00F6' + | '\u00F8'..'\u021F' + | '\u0222'..'\u0233' + | '\u0250'..'\u02AD' + | '\u02B0'..'\u02B8' + | '\u02BB'..'\u02C1' + | '\u02D0'..'\u02D1' + | '\u02E0'..'\u02E4' + | '\u02EE' + | '\u037A' + | '\u0386' + | '\u0388'..'\u038A' + | '\u038C' + | '\u038E'..'\u03A1' + | '\u03A3'..'\u03CE' + | '\u03D0'..'\u03D7' + | '\u03DA'..'\u03F3' + | '\u0400'..'\u0481' + | '\u048C'..'\u04C4' + | '\u04C7'..'\u04C8' + | '\u04CB'..'\u04CC' + | '\u04D0'..'\u04F5' + | '\u04F8'..'\u04F9' + | '\u0531'..'\u0556' + | '\u0559' + | '\u0561'..'\u0587' + | '\u05D0'..'\u05EA' + | '\u05F0'..'\u05F2' + | '\u0621'..'\u063A' + | '\u0640'..'\u064A' + | '\u0671'..'\u06D3' + | '\u06D5' + | '\u06E5'..'\u06E6' + | '\u06FA'..'\u06FC' + | '\u0710' + | '\u0712'..'\u072C' + | '\u0780'..'\u07A5' + | '\u0905'..'\u0939' + | '\u093D' + | '\u0950' + | '\u0958'..'\u0961' + | '\u0985'..'\u098C' + | '\u098F'..'\u0990' + | '\u0993'..'\u09A8' + | '\u09AA'..'\u09B0' + | '\u09B2' + | '\u09B6'..'\u09B9' + | '\u09DC'..'\u09DD' + | '\u09DF'..'\u09E1' + | '\u09F0'..'\u09F1' + | '\u0A05'..'\u0A0A' + | '\u0A0F'..'\u0A10' + | '\u0A13'..'\u0A28' + | '\u0A2A'..'\u0A30' + | '\u0A32'..'\u0A33' + | '\u0A35'..'\u0A36' + | '\u0A38'..'\u0A39' + | '\u0A59'..'\u0A5C' + | '\u0A5E' + | '\u0A72'..'\u0A74' + | '\u0A85'..'\u0A8B' + 
| '\u0A8D' + | '\u0A8F'..'\u0A91' + | '\u0A93'..'\u0AA8' + | '\u0AAA'..'\u0AB0' + | '\u0AB2'..'\u0AB3' + | '\u0AB5'..'\u0AB9' + | '\u0ABD' + | '\u0AD0' + | '\u0AE0' + | '\u0B05'..'\u0B0C' + | '\u0B0F'..'\u0B10' + | '\u0B13'..'\u0B28' + | '\u0B2A'..'\u0B30' + | '\u0B32'..'\u0B33' + | '\u0B36'..'\u0B39' + | '\u0B3D' + | '\u0B5C'..'\u0B5D' + | '\u0B5F'..'\u0B61' + | '\u0B85'..'\u0B8A' + | '\u0B8E'..'\u0B90' + | '\u0B92'..'\u0B95' + | '\u0B99'..'\u0B9A' + | '\u0B9C' + | '\u0B9E'..'\u0B9F' + | '\u0BA3'..'\u0BA4' + | '\u0BA8'..'\u0BAA' + | '\u0BAE'..'\u0BB5' + | '\u0BB7'..'\u0BB9' + | '\u0C05'..'\u0C0C' + | '\u0C0E'..'\u0C10' + | '\u0C12'..'\u0C28' + | '\u0C2A'..'\u0C33' + | '\u0C35'..'\u0C39' + | '\u0C60'..'\u0C61' + | '\u0C85'..'\u0C8C' + | '\u0C8E'..'\u0C90' + | '\u0C92'..'\u0CA8' + | '\u0CAA'..'\u0CB3' + | '\u0CB5'..'\u0CB9' + | '\u0CDE' + | '\u0CE0'..'\u0CE1' + | '\u0D05'..'\u0D0C' + | '\u0D0E'..'\u0D10' + | '\u0D12'..'\u0D28' + | '\u0D2A'..'\u0D39' + | '\u0D60'..'\u0D61' + | '\u0D85'..'\u0D96' + | '\u0D9A'..'\u0DB1' + | '\u0DB3'..'\u0DBB' + | '\u0DBD' + | '\u0DC0'..'\u0DC6' + | '\u0E01'..'\u0E30' + | '\u0E32'..'\u0E33' + | '\u0E40'..'\u0E46' + | '\u0E81'..'\u0E82' + | '\u0E84' + | '\u0E87'..'\u0E88' + | '\u0E8A' + | '\u0E8D' + | '\u0E94'..'\u0E97' + | '\u0E99'..'\u0E9F' + | '\u0EA1'..'\u0EA3' + | '\u0EA5' + | '\u0EA7' + | '\u0EAA'..'\u0EAB' + | '\u0EAD'..'\u0EB0' + | '\u0EB2'..'\u0EB3' + | '\u0EBD'..'\u0EC4' + | '\u0EC6' + | '\u0EDC'..'\u0EDD' + | '\u0F00' + | '\u0F40'..'\u0F6A' + | '\u0F88'..'\u0F8B' + | '\u1000'..'\u1021' + | '\u1023'..'\u1027' + | '\u1029'..'\u102A' + | '\u1050'..'\u1055' + | '\u10A0'..'\u10C5' + | '\u10D0'..'\u10F6' + | '\u1100'..'\u1159' + | '\u115F'..'\u11A2' + | '\u11A8'..'\u11F9' + | '\u1200'..'\u1206' + | '\u1208'..'\u1246' + | '\u1248' + | '\u124A'..'\u124D' + | '\u1250'..'\u1256' + | '\u1258' + | '\u125A'..'\u125D' + | '\u1260'..'\u1286' + | '\u1288' + | '\u128A'..'\u128D' + | '\u1290'..'\u12AE' + | '\u12B0' + | '\u12B2'..'\u12B5' + | 
'\u12B8'..'\u12BE' + | '\u12C0' + | '\u12C2'..'\u12C5' + | '\u12C8'..'\u12CE' + | '\u12D0'..'\u12D6' + | '\u12D8'..'\u12EE' + | '\u12F0'..'\u130E' + | '\u1310' + | '\u1312'..'\u1315' + | '\u1318'..'\u131E' + | '\u1320'..'\u1346' + | '\u1348'..'\u135A' + | '\u13A0'..'\u13B0' + | '\u13B1'..'\u13F4' + | '\u1401'..'\u1676' + | '\u1681'..'\u169A' + | '\u16A0'..'\u16EA' + | '\u1780'..'\u17B3' + | '\u1820'..'\u1877' + | '\u1880'..'\u18A8' + | '\u1E00'..'\u1E9B' + | '\u1EA0'..'\u1EE0' + | '\u1EE1'..'\u1EF9' + | '\u1F00'..'\u1F15' + | '\u1F18'..'\u1F1D' + | '\u1F20'..'\u1F39' + | '\u1F3A'..'\u1F45' + | '\u1F48'..'\u1F4D' + | '\u1F50'..'\u1F57' + | '\u1F59' + | '\u1F5B' + | '\u1F5D' + | '\u1F5F'..'\u1F7D' + | '\u1F80'..'\u1FB4' + | '\u1FB6'..'\u1FBC' + | '\u1FBE' + | '\u1FC2'..'\u1FC4' + | '\u1FC6'..'\u1FCC' + | '\u1FD0'..'\u1FD3' + | '\u1FD6'..'\u1FDB' + | '\u1FE0'..'\u1FEC' + | '\u1FF2'..'\u1FF4' + | '\u1FF6'..'\u1FFC' + | '\u207F' + | '\u2102' + | '\u2107' + | '\u210A'..'\u2113' + | '\u2115' + | '\u2119'..'\u211D' + | '\u2124' + | '\u2126' + | '\u2128' + | '\u212A'..'\u212D' + | '\u212F'..'\u2131' + | '\u2133'..'\u2139' + | '\u2160'..'\u2183' + | '\u3005'..'\u3007' + | '\u3021'..'\u3029' + | '\u3031'..'\u3035' + | '\u3038'..'\u303A' + | '\u3041'..'\u3094' + | '\u309D'..'\u309E' + | '\u30A1'..'\u30FA' + | '\u30FC'..'\u30FE' + | '\u3105'..'\u312C' + | '\u3131'..'\u318E' + | '\u31A0'..'\u31B7' + | '\u3400' + | '\u4DB5' + | '\u4E00' + | '\u9FA5' + | '\uA000'..'\uA48C' + | '\uAC00' + | '\uD7A3' + | '\uF900'..'\uFA2D' + | '\uFB00'..'\uFB06' + | '\uFB13'..'\uFB17' + | '\uFB1D' + | '\uFB1F'..'\uFB28' + | '\uFB2A'..'\uFB36' + | '\uFB38'..'\uFB3C' + | '\uFB3E' + | '\uFB40'..'\uFB41' + | '\uFB43'..'\uFB44' + | '\uFB46'..'\uFBB1' + | '\uFBD3'..'\uFD3D' + | '\uFD50'..'\uFD8F' + | '\uFD92'..'\uFDC7' + | '\uFDF0'..'\uFDFB' + | '\uFE70'..'\uFE72' + | '\uFE74' + | '\uFE76'..'\uFEFC' + | '\uFF21'..'\uFF3A' + | '\uFF41'..'\uFF5A' + | '\uFF66'..'\uFFBE' + | '\uFFC2'..'\uFFC7' + | 
'\uFFCA'..'\uFFCF' + | '\uFFD2'..'\uFFD7' + | '\uFFDA'..'\uFFDC' + ; + +fragment UnicodeCombiningMark // Any character in the Unicode categories "Non-spacing mark (Mn)" + : '\u0300'..'\u034E' // or "Combining spacing mark (Mc)". + | '\u0360'..'\u0362' + | '\u0483'..'\u0486' + | '\u0591'..'\u05A1' + | '\u05A3'..'\u05B9' + | '\u05BB'..'\u05BD' + | '\u05BF' + | '\u05C1'..'\u05C2' + | '\u05C4' + | '\u064B'..'\u0655' + | '\u0670' + | '\u06D6'..'\u06DC' + | '\u06DF'..'\u06E4' + | '\u06E7'..'\u06E8' + | '\u06EA'..'\u06ED' + | '\u0711' + | '\u0730'..'\u074A' + | '\u07A6'..'\u07B0' + | '\u0901'..'\u0903' + | '\u093C' + | '\u093E'..'\u094D' + | '\u0951'..'\u0954' + | '\u0962'..'\u0963' + | '\u0981'..'\u0983' + | '\u09BC'..'\u09C4' + | '\u09C7'..'\u09C8' + | '\u09CB'..'\u09CD' + | '\u09D7' + | '\u09E2'..'\u09E3' + | '\u0A02' + | '\u0A3C' + | '\u0A3E'..'\u0A42' + | '\u0A47'..'\u0A48' + | '\u0A4B'..'\u0A4D' + | '\u0A70'..'\u0A71' + | '\u0A81'..'\u0A83' + | '\u0ABC' + | '\u0ABE'..'\u0AC5' + | '\u0AC7'..'\u0AC9' + | '\u0ACB'..'\u0ACD' + | '\u0B01'..'\u0B03' + | '\u0B3C' + | '\u0B3E'..'\u0B43' + | '\u0B47'..'\u0B48' + | '\u0B4B'..'\u0B4D' + | '\u0B56'..'\u0B57' + | '\u0B82'..'\u0B83' + | '\u0BBE'..'\u0BC2' + | '\u0BC6'..'\u0BC8' + | '\u0BCA'..'\u0BCD' + | '\u0BD7' + | '\u0C01'..'\u0C03' + | '\u0C3E'..'\u0C44' + | '\u0C46'..'\u0C48' + | '\u0C4A'..'\u0C4D' + | '\u0C55'..'\u0C56' + | '\u0C82'..'\u0C83' + | '\u0CBE'..'\u0CC4' + | '\u0CC6'..'\u0CC8' + | '\u0CCA'..'\u0CCD' + | '\u0CD5'..'\u0CD6' + | '\u0D02'..'\u0D03' + | '\u0D3E'..'\u0D43' + | '\u0D46'..'\u0D48' + | '\u0D4A'..'\u0D4D' + | '\u0D57' + | '\u0D82'..'\u0D83' + | '\u0DCA' + | '\u0DCF'..'\u0DD4' + | '\u0DD6' + | '\u0DD8'..'\u0DDF' + | '\u0DF2'..'\u0DF3' + | '\u0E31' + | '\u0E34'..'\u0E3A' + | '\u0E47'..'\u0E4E' + | '\u0EB1' + | '\u0EB4'..'\u0EB9' + | '\u0EBB'..'\u0EBC' + | '\u0EC8'..'\u0ECD' + | '\u0F18'..'\u0F19' + | '\u0F35' + | '\u0F37' + | '\u0F39' + | '\u0F3E'..'\u0F3F' + | '\u0F71'..'\u0F84' + | '\u0F86'..'\u0F87' + | 
'\u0F90'..'\u0F97' + | '\u0F99'..'\u0FBC' + | '\u0FC6' + | '\u102C'..'\u1032' + | '\u1036'..'\u1039' + | '\u1056'..'\u1059' + | '\u17B4'..'\u17D3' + | '\u18A9' + | '\u20D0'..'\u20DC' + | '\u20E1' + | '\u302A'..'\u302F' + | '\u3099'..'\u309A' + | '\uFB1E' + | '\uFE20'..'\uFE23' + ; + +fragment UnicodeDigit // Any character in the Unicode category "Decimal number (Nd)". + : '\u0030'..'\u0039' + | '\u0660'..'\u0669' + | '\u06F0'..'\u06F9' + | '\u0966'..'\u096F' + | '\u09E6'..'\u09EF' + | '\u0A66'..'\u0A6F' + | '\u0AE6'..'\u0AEF' + | '\u0B66'..'\u0B6F' + | '\u0BE7'..'\u0BEF' + | '\u0C66'..'\u0C6F' + | '\u0CE6'..'\u0CEF' + | '\u0D66'..'\u0D6F' + | '\u0E50'..'\u0E59' + | '\u0ED0'..'\u0ED9' + | '\u0F20'..'\u0F29' + | '\u1040'..'\u1049' + | '\u1369'..'\u1371' + | '\u17E0'..'\u17E9' + | '\u1810'..'\u1819' + | '\uFF10'..'\uFF19' + ; + +fragment UnicodeConnectorPunctuation // Any character in the Unicode category "Connector punctuation (Pc)". + : '\u005F' + | '\u203F'..'\u2040' + | '\u30FB' + | '\uFE33'..'\uFE34' + | '\uFE4D'..'\uFE4F' + | '\uFF3F' + | '\uFF65' + ; + +Comment + : '/*' (options {greedy=false;} : .)* '*/' {$channel=HIDDEN;} + ; + +LineComment + : '//' ~(LT)* {$channel=HIDDEN;} + ; + +LT + : '\n' // Line feed. + | '\r' // Carriage return. + | '\u2028' // Line separator. + | '\u2029' // Paragraph separator. + ; + +WhiteSpace // Tab, vertical tab, form feed, space, non-breaking space and any other unicode "space separator". + : ('\t' | '\v' | '\f' | ' ' | '\u00A0') {$channel=HIDDEN;} + ; diff --git a/examples/sexpParser.py b/examples/sexpParser.py index fd8ffd3..0d006d2 100644 --- a/examples/sexpParser.py +++ b/examples/sexpParser.py @@ -1,157 +1,157 @@ -# sexpParser.py -# -# Demonstration of the pyparsing module, implementing a simple S-expression -# parser. 
-# -# Updates: -# November, 2011 - fixed errors in precedence of alternatives in simpleString; -# fixed exception raised in verifyLen to properly signal the input string -# and exception location so that markInputline works correctly; fixed -# definition of decimal to accept a single '0' and optional leading '-' -# sign; updated tests to improve parser coverage -# -# Copyright 2007-2011, by Paul McGuire -# -""" -BNF reference: http://theory.lcs.mit.edu/~rivest/sexp.txt - - :: | - :: ? ; - :: | | | | - ; - :: "[" "]" ; - :: ":" ; - :: + ; - -- decimal numbers should have no unnecessary leading zeros - -- any string of bytes, of the indicated length - :: + ; - :: ? "|" ( | )* "|" ; - :: "#" ( | )* "#" ; - :: ? - :: "\"" "\"" - :: "(" ( | )* ")" ; - :: * ; - :: | | ; - :: | | ; - :: "a" | ... | "z" ; - :: "A" | ... | "Z" ; - :: "0" | ... | "9" ; - :: | "A" | ... | "F" | "a" | ... | "f" ; - :: "-" | "." | "/" | "_" | ":" | "*" | "+" | "=" ; - :: " " | "\t" | "\r" | "\n" ; - :: | | "+" | "/" | "=" ; - :: "" ; -""" - -import pyparsing as pp -from base64 import b64decode -import pprint - - -def verify_length(s, l, t): - t = t[0] - if t.len is not None: - t1len = len(t[1]) - if t1len != t.len: - raise pp.ParseFatalException(s, l, "invalid data of length {0}, expected {1}".format(t1len, t.len)) - return t[1] - - -# define punctuation literals -LPAR, RPAR, LBRK, RBRK, LBRC, RBRC, VBAR, COLON = (pp.Suppress(c).setName(c) for c in "()[]{}|:") - -decimal = pp.Regex(r'-?0|[1-9]\d*').setParseAction(lambda t: int(t[0])) -hexadecimal = ("#" + pp.Word(pp.hexnums)[...] 
+ "#").setParseAction(lambda t: int("".join(t[1:-1]), 16)) -bytes = pp.Word(pp.printables) -raw = pp.Group(decimal("len") + COLON + bytes).setParseAction(verify_length) -base64_ = pp.Group(pp.Optional(decimal | hexadecimal, default=None)("len") - + VBAR - + pp.Word(pp.alphanums + "+/=")[...].setParseAction(lambda t: b64decode("".join(t))) - + VBAR - ).setParseAction(verify_length) - -real = pp.Regex(r"[+-]?\d+\.\d*([eE][+-]?\d+)?").setParseAction(lambda tokens: float(tokens[0])) -token = pp.Word(pp.alphanums + "-./_:*+=!<>") -qString = pp.Group(pp.Optional(decimal, default=None)("len") - + pp.dblQuotedString.setParseAction(pp.removeQuotes) - ).setParseAction(verify_length) - -simpleString = real | base64_ | raw | decimal | token | hexadecimal | qString - -display = LBRK + simpleString + RBRK -string_ = pp.Optional(display) + simpleString - -sexp = pp.Forward() -sexpList = pp.Group(LPAR + sexp[0, ...] + RPAR) -sexp <<= string_ | sexpList - - -# Test data - -test00 = """(snicker "abc" (#03# |YWJj|))""" -test01 = """(certificate - (issuer - (name - (public-key - rsa-with-md5 - (e 15 |NFGq/E3wh9f4rJIQVXhS|) - (n |d738/4ghP9rFZ0gAIYZ5q9y6iskDJwASi5rEQpEQq8ZyMZeIZzIAR2I5iGE=|)) - aid-committee)) - (subject - (ref - (public-key - rsa-with-md5 - (e |NFGq/E3wh9f4rJIQVXhS|) - (n |d738/4ghP9rFZ0gAIYZ5q9y6iskDJwASi5rEQpEQq8ZyMZeIZzIAR2I5iGE=|)) - tom - mother)) - (not-before "1997-01-01_09:00:00") - (not-after "1998-01-01_09:00:00") - (tag - (spend (account "12345678") (* numeric range "1" "1000")))) -""" -test02 = """(lambda (x) (* x x))""" -test03 = """(def length - (lambda (x) - (cond - ((not x) 0) - ( t (+ 1 (length (cdr x)))) - ) - ) -) -""" -test04 = """(2:XX "abc" (#03# |YWJj|))""" -test05 = """(if (is (window_name) "XMMS") (set_workspace 2))""" -test06 = """(if - (and - (is (application_name) "Firefox") - (or - (contains (window_name) "Enter name of file to save to") - (contains (window_name) "Save As") - (contains (window_name) "Save Image") - () - ) - ) - (geometry 
"+140+122") -) -""" -test07 = """(defun factorial (x) - (if (zerop x) 1 - (* x (factorial (- x 1))))) - """ -test51 = """(2:XX "abc" (#03# |YWJj|))""" -test51error = """(3:XX "abc" (#03# |YWJj|))""" - -test52 = """ - (and - (or (> uid 1000) - (!= gid 20) - ) - (> quota 5.0e+03) - ) - """ - -# Run tests -alltests = [globals()[testname] for testname in sorted(locals()) if testname.startswith("test")] - -sexp.runTests(alltests, fullDump=False) +# sexpParser.py +# +# Demonstration of the pyparsing module, implementing a simple S-expression +# parser. +# +# Updates: +# November, 2011 - fixed errors in precedence of alternatives in simpleString; +# fixed exception raised in verifyLen to properly signal the input string +# and exception location so that markInputline works correctly; fixed +# definition of decimal to accept a single '0' and optional leading '-' +# sign; updated tests to improve parser coverage +# +# Copyright 2007-2011, by Paul McGuire +# +""" +BNF reference: http://theory.lcs.mit.edu/~rivest/sexp.txt + + :: | + :: ? ; + :: | | | | + ; + :: "[" "]" ; + :: ":" ; + :: + ; + -- decimal numbers should have no unnecessary leading zeros + -- any string of bytes, of the indicated length + :: + ; + :: ? "|" ( | )* "|" ; + :: "#" ( | )* "#" ; + :: ? + :: "\"" "\"" + :: "(" ( | )* ")" ; + :: * ; + :: | | ; + :: | | ; + :: "a" | ... | "z" ; + :: "A" | ... | "Z" ; + :: "0" | ... | "9" ; + :: | "A" | ... | "F" | "a" | ... | "f" ; + :: "-" | "." 
| "/" | "_" | ":" | "*" | "+" | "=" ; + :: " " | "\t" | "\r" | "\n" ; + :: | | "+" | "/" | "=" ; + :: "" ; +""" + +import pyparsing as pp +from base64 import b64decode +import pprint + + +def verify_length(s, l, t): + t = t[0] + if t.len is not None: + t1len = len(t[1]) + if t1len != t.len: + raise pp.ParseFatalException(s, l, "invalid data of length {0}, expected {1}".format(t1len, t.len)) + return t[1] + + +# define punctuation literals +LPAR, RPAR, LBRK, RBRK, LBRC, RBRC, VBAR, COLON = (pp.Suppress(c).setName(c) for c in "()[]{}|:") + +decimal = pp.Regex(r'-?0|[1-9]\d*').setParseAction(lambda t: int(t[0])) +hexadecimal = ("#" + pp.Word(pp.hexnums)[1, ...] + "#").setParseAction(lambda t: int("".join(t[1:-1]), 16)) +bytes = pp.Word(pp.printables) +raw = pp.Group(decimal("len") + COLON + bytes).setParseAction(verify_length) +base64_ = pp.Group(pp.Optional(decimal | hexadecimal, default=None)("len") + + VBAR + + pp.Word(pp.alphanums + "+/=")[1, ...].setParseAction(lambda t: b64decode("".join(t))) + + VBAR + ).setParseAction(verify_length) + +real = pp.Regex(r"[+-]?\d+\.\d*([eE][+-]?\d+)?").setParseAction(lambda tokens: float(tokens[0])) +token = pp.Word(pp.alphanums + "-./_:*+=!<>") +qString = pp.Group(pp.Optional(decimal, default=None)("len") + + pp.dblQuotedString.setParseAction(pp.removeQuotes) + ).setParseAction(verify_length) + +simpleString = real | base64_ | raw | decimal | token | hexadecimal | qString + +display = LBRK + simpleString + RBRK +string_ = pp.Optional(display) + simpleString + +sexp = pp.Forward() +sexpList = pp.Group(LPAR + sexp[...] 
+ RPAR) +sexp <<= string_ | sexpList + + +# Test data + +test00 = """(snicker "abc" (#03# |YWJj|))""" +test01 = """(certificate + (issuer + (name + (public-key + rsa-with-md5 + (e 15 |NFGq/E3wh9f4rJIQVXhS|) + (n |d738/4ghP9rFZ0gAIYZ5q9y6iskDJwASi5rEQpEQq8ZyMZeIZzIAR2I5iGE=|)) + aid-committee)) + (subject + (ref + (public-key + rsa-with-md5 + (e |NFGq/E3wh9f4rJIQVXhS|) + (n |d738/4ghP9rFZ0gAIYZ5q9y6iskDJwASi5rEQpEQq8ZyMZeIZzIAR2I5iGE=|)) + tom + mother)) + (not-before "1997-01-01_09:00:00") + (not-after "1998-01-01_09:00:00") + (tag + (spend (account "12345678") (* numeric range "1" "1000")))) +""" +test02 = """(lambda (x) (* x x))""" +test03 = """(def length + (lambda (x) + (cond + ((not x) 0) + ( t (+ 1 (length (cdr x)))) + ) + ) +) +""" +test04 = """(2:XX "abc" (#03# |YWJj|))""" +test05 = """(if (is (window_name) "XMMS") (set_workspace 2))""" +test06 = """(if + (and + (is (application_name) "Firefox") + (or + (contains (window_name) "Enter name of file to save to") + (contains (window_name) "Save As") + (contains (window_name) "Save Image") + () + ) + ) + (geometry "+140+122") +) +""" +test07 = """(defun factorial (x) + (if (zerop x) 1 + (* x (factorial (- x 1))))) + """ +test51 = """(2:XX "abc" (#03# |YWJj|))""" +test51error = """(3:XX "abc" (#03# |YWJj|))""" + +test52 = """ + (and + (or (> uid 1000) + (!= gid 20) + ) + (> quota 5.0e+03) + ) + """ + +# Run tests +alltests = [globals()[testname] for testname in sorted(locals()) if testname.startswith("test")] + +sexp.runTests(alltests, fullDump=False) diff --git a/examples/statemachine/documentSignoffDemo.py b/examples/statemachine/documentSignoffDemo.py new file mode 100644 index 0000000..2ca38c8 --- /dev/null +++ b/examples/statemachine/documentSignoffDemo.py @@ -0,0 +1,50 @@ +# +# documentSignoffDemo.py +# +# Example of a state machine modeling the state of a document in a document +# control system, using named state transitions +# +import statemachine +import documentsignoffstate + 
+print('\n'.join(t.__name__ for t in documentsignoffstate.DocumentRevisionState.transitions())) + +class Document(documentsignoffstate.DocumentRevisionStateMixin): + def __init__(self): + self.initialize_state(documentsignoffstate.New) + + +def run_demo(): + import random + + doc = Document() + print(doc) + + # begin editing document + doc.create() + print(doc) + print(doc.state.description) + + while not isinstance(doc._state, documentsignoffstate.Approved): + + print('...submit') + doc.submit() + print(doc) + print(doc.state.description) + + if random.randint(1,10) > 3: + print('...reject') + doc.reject() + else: + print('...approve') + doc.approve() + + print(doc) + print(doc.state.description) + + doc.activate() + print(doc) + print(doc.state.description) + +if __name__ == '__main__': + run_demo() diff --git a/examples/statemachine/documentsignoffstate.pystate b/examples/statemachine/documentsignoffstate.pystate new file mode 100644 index 0000000..04df274 --- /dev/null +++ b/examples/statemachine/documentsignoffstate.pystate @@ -0,0 +1,71 @@ +# +# documentsignoffstate.pystate +# +# state machine model of the states and associated behaviors and properties for each +# different state of a document in a document control system +# +# example using named state transitions + +# This implements a state model for submitting, +# approving, activating, and purging document +# revisions in a document management system. 
+# +# The state model looks like: +# +# New +# | +# | (create) +# | +# v +# Editing ----------------------------------------------+ +# | ^ | +# | | | +# | +----------+ | +# | | | +# | (submit) | | (cancel) +# | | (reject) | +# v | | +# PendingApproval-+ | +# | | +# | (approve) | +# | | +# v | +# Approved <--------------------------+ (deactivate) | +# | | | | +# | +--------------+ | | +# | | (activate) | | +# | v | | +# | (retire) Active ----------+ | +# | | +# v | +# Retired | +# | | +# | (purge) | +# | | +# v | +# Deleted <---------------------------------------------+ +# +# +# There is no behavior attached to these states, this is +# just an example of a state machine with named transitions. +# + + +statemachine DocumentRevisionState: + New -( create )-> Editing + Editing -( cancel )-> Deleted + Editing -( submit )-> PendingApproval + PendingApproval -( reject )-> Editing + PendingApproval -( approve )-> Approved + Approved -( activate )-> Active + Active -( deactivate )-> Approved + Approved -( retire )-> Retired + Retired -( purge )-> Deleted + +New.description = 'creating...' +Editing.description = 'editing...' +PendingApproval.description = 'reviewing...' +Approved.description = 'approved/inactive...' +Active.description = 'approved/active...' +Deleted.description = 'deleted...' +Retired.description = 'retired...' 
\ No newline at end of file diff --git a/examples/statemachine/libraryBookDemo.py b/examples/statemachine/libraryBookDemo.py new file mode 100644 index 0000000..a5e018d --- /dev/null +++ b/examples/statemachine/libraryBookDemo.py @@ -0,0 +1,70 @@ +# +# libraryBookDemo.py +# +# Simple statemachine demo, based on the state transitions given in librarybookstate.pystate +# + +import statemachine +import librarybookstate + + +class Book(librarybookstate.BookStateMixin): + def __init__(self): + self.initialize_state(librarybookstate.New) + + +class RestrictedBook(Book): + def __init__(self): + super(RestrictedBook, self).__init__() + self._authorized_users = [] + + def authorize(self, name): + self._authorized_users.append(name) + + # specialized checkout to check permission of user first + def checkout(self, user=None): + if user in self._authorized_users: + super().checkout() + else: + raise Exception("{0} could not check out restricted book".format(user if user is not None else "anonymous")) + + +def run_demo(): + book = Book() + book.shelve() + print(book) + book.checkout() + print(book) + book.checkin() + print(book) + book.reserve() + print(book) + try: + book.checkout() + except Exception as e: # statemachine.InvalidTransitionException: + print(e) + print('..cannot check out reserved book') + book.release() + print(book) + book.checkout() + print(book) + print() + + restricted_book = RestrictedBook() + restricted_book.authorize("BOB") + restricted_book.restrict() + print(restricted_book) + for name in [None, "BILL", "BOB"]: + try: + restricted_book.checkout(name) + except Exception as e: + print('..' 
+ str(e)) + else: + print('checkout to', name) + print(restricted_book) + restricted_book.checkin() + print(restricted_book) + + +if __name__ == '__main__': + run_demo() diff --git a/examples/statemachine/librarybookstate.pystate b/examples/statemachine/librarybookstate.pystate new file mode 100644 index 0000000..24f07ed --- /dev/null +++ b/examples/statemachine/librarybookstate.pystate @@ -0,0 +1,19 @@ +# +# librarybookstate.pystate +# +# This state machine models the state of books in a library. +# + +statemachine BookState: + New -(shelve)-> Available + Available -(reserve)-> OnHold + OnHold -(release)-> Available + Available -(checkout)-> CheckedOut + CheckedOut -(checkin)-> Available + + # add states for restricted books + New -(restrict)-> Restricted + Available -(restrict)-> Restricted + Restricted -(release)-> Available + Restricted -(checkout)-> CheckedOutRestricted + CheckedOutRestricted -(checkin)-> Restricted diff --git a/examples/statemachine/statemachine.py b/examples/statemachine/statemachine.py new file mode 100644 index 0000000..44f64d2 --- /dev/null +++ b/examples/statemachine/statemachine.py @@ -0,0 +1,347 @@ +# stateMachine.py +# +# module to define .pystate import handler +# +# import imputil +import keyword +import sys +import os +import types +import importlib +try: + import urllib.parse + url_parse = urllib.parse.urlparse +except ImportError: + print("import error, Python 2 not supported") + raise + import urllib + url_parse = urllib.parse + + +DEBUG = False + + +import pyparsing as pp + +# define basic exception for invalid state transitions - state machine classes will subclass to +# define their own specific exception type +class InvalidTransitionException(Exception): pass + + +ident = pp.Word(pp.alphas + "_", pp.alphanums + "_$") + +# add parse-time condition to make sure we do not allow any Python keywords to be used as +# statemachine identifiers +def no_keywords_allowed(s, l, t): + wd = t[0] + return not keyword.iskeyword(wd) 
+ident.addCondition(no_keywords_allowed, message="cannot use a Python keyword for state or transition identifier") + +stateTransition = ident("from_state") + "->" + ident("to_state") +stateMachine = (pp.Keyword("statemachine") + ident("name") + ":" + + pp.OneOrMore(pp.Group(stateTransition))("transitions")) + +namedStateTransition = (ident("from_state") + + "-(" + ident("transition") + ")->" + + ident("to_state")) +namedStateMachine = (pp.Keyword("statemachine") + ident("name") + ":" + + pp.OneOrMore(pp.Group(namedStateTransition))("transitions")) + + +def expand_state_definition(source, loc, tokens): + """ + Parse action to convert statemachine to corresponding Python classes and methods + """ + indent = " " * (pp.col(loc, source) - 1) + statedef = [] + + # build list of states + states = set() + fromTo = {} + for tn in tokens.transitions: + states.add(tn.from_state) + states.add(tn.to_state) + fromTo[tn.from_state] = tn.to_state + + # define base class for state classes + baseStateClass = tokens.name + statedef.extend([ + "class %s(object):" % baseStateClass, + " def __str__(self):", + " return self.__class__.__name__", + + " @classmethod", + " def states(cls):", + " return list(cls.__subclasses__())", + + " def next_state(self):", + " return self._next_state_class()", + ]) + + # define all state classes + statedef.extend("class {0}({1}): pass".format(s, baseStateClass) for s in states) + + # define state->state transitions + statedef.extend("{0}._next_state_class = {1}".format(s, fromTo[s]) for s in states if s in fromTo) + + statedef.extend([ + "class {baseStateClass}Mixin:".format(baseStateClass=baseStateClass), + " def __init__(self):", + " self._state = None", + + " def initialize_state(self, init_state):", + " if issubclass(init_state, {baseStateClass}):".format(baseStateClass=baseStateClass), + " init_state = init_state()", + " self._state = init_state", + + " @property", + " def state(self):", + " return self._state", + + " # get behavior/properties from 
current state", + " def __getattr__(self, attrname):", + " attr = getattr(self._state, attrname)", + " return attr", + + " def __str__(self):", + " return '{0}: {1}'.format(self.__class__.__name__, self._state)", + ]) + + return ("\n" + indent).join(statedef) + "\n" + +stateMachine.setParseAction(expand_state_definition) + + +def expand_named_state_definition(source, loc, tokens): + """ + Parse action to convert statemachine with named transitions to corresponding Python + classes and methods + """ + indent = " " * (pp.col(loc, source) - 1) + statedef = [] + # build list of states and transitions + states = set() + transitions = set() + + baseStateClass = tokens.name + + fromTo = {} + for tn in tokens.transitions: + states.add(tn.from_state) + states.add(tn.to_state) + transitions.add(tn.transition) + if tn.from_state in fromTo: + fromTo[tn.from_state][tn.transition] = tn.to_state + else: + fromTo[tn.from_state] = {tn.transition: tn.to_state} + + # add entries for terminal states + for s in states: + if s not in fromTo: + fromTo[s] = {} + + # define state transition class + statedef.extend([ + "class {baseStateClass}Transition:".format(baseStateClass=baseStateClass), + " def __str__(self):", + " return self.transitionName", + ]) + statedef.extend( + "{tn_name} = {baseStateClass}Transition()".format(tn_name=tn, + baseStateClass=baseStateClass) + for tn in transitions) + statedef.extend("{tn_name}.transitionName = '{tn_name}'".format(tn_name=tn) + for tn in transitions) + + # define base class for state classes + statedef.extend([ + "class %s(object):" % baseStateClass, + " from statemachine import InvalidTransitionException as BaseTransitionException", + " class InvalidTransitionException(BaseTransitionException): pass", + " def __str__(self):", + " return self.__class__.__name__", + + " @classmethod", + " def states(cls):", + " return list(cls.__subclasses__())", + + " @classmethod", + " def next_state(cls, name):", + " try:", + " return cls.tnmap[name]()", + " 
except KeyError:", + " raise cls.InvalidTransitionException('%s does not support transition %r'% (cls.__name__, name))", + + " def __bad_tn(name):", + " def _fn(cls):", + " raise cls.InvalidTransitionException('%s does not support transition %r'% (cls.__name__, name))", + " _fn.__name__ = name", + " return _fn", + ]) + + # define default 'invalid transition' methods in base class, valid transitions will be implemented in subclasses + statedef.extend( + " {tn_name} = classmethod(__bad_tn({tn_name!r}))".format(tn_name=tn) + for tn in transitions) + + # define all state classes + statedef.extend("class %s(%s): pass" % (s, baseStateClass) + for s in states) + + # define state transition methods for valid transitions from each state + for s in states: + trns = list(fromTo[s].items()) + # statedef.append("%s.tnmap = {%s}" % (s, ", ".join("%s:%s" % tn for tn in trns))) + statedef.extend("%s.%s = classmethod(lambda cls: %s())" % (s, tn_, to_) + for tn_, to_ in trns) + + statedef.extend([ + "{baseStateClass}.transitions = classmethod(lambda cls: [{transition_class_list}])".format( + baseStateClass=baseStateClass, + transition_class_list = ', '.join("cls.{0}".format(tn) for tn in transitions) + ), + "{baseStateClass}.transition_names = [tn.__name__ for tn in {baseStateClass}.transitions()]".format( + baseStateClass=baseStateClass + ) + ]) + + # define Mixin class for application classes that delegate to the state + statedef.extend([ + "class {baseStateClass}Mixin:".format(baseStateClass=baseStateClass), + " def __init__(self):", + " self._state = None", + + " def initialize_state(self, init_state):", + " if issubclass(init_state, {baseStateClass}):".format(baseStateClass=baseStateClass), + " init_state = init_state()", + " self._state = init_state", + + " @property", + " def state(self):", + " return self._state", + + " # get behavior/properties from current state", + " def __getattr__(self, attrname):", + " attr = getattr(self._state, attrname)", + " return attr", + + " def 
__str__(self):", + " return '{0}: {1}'.format(self.__class__.__name__, self._state)", + + ]) + + # define transition methods to be delegated to the _state instance variable + statedef.extend( + " def {tn_name}(self): self._state = self._state.{tn_name}()".format(tn_name=tn) + for tn in transitions + ) + return ("\n" + indent).join(statedef) + "\n" + +namedStateMachine.setParseAction(expand_named_state_definition) + + +# ====================================================================== +# NEW STUFF - Matt Anderson, 2009-11-26 +# ====================================================================== +class SuffixImporter(object): + """An importer designed using the mechanism defined in :pep:`302`. I read + the PEP, and also used Doug Hellmann's PyMOTW article `Modules and + Imports`_, as a pattern. + + .. _`Modules and Imports`: http://www.doughellmann.com/PyMOTW/sys/imports.html + + Define a subclass that specifies a :attr:`suffix` attribute, and + implements a :meth:`process_filedata` method. Then call the classmethod + :meth:`register` on your class to actually install it in the appropriate + places in :mod:`sys`. 
""" + + scheme = 'suffix' + suffix = None + path_entry = None + + @classmethod + def trigger_url(cls): + if cls.suffix is None: + raise ValueError('%s.suffix is not set' % cls.__name__) + return 'suffix:%s' % cls.suffix + + @classmethod + def register(cls): + sys.path_hooks.append(cls) + sys.path.append(cls.trigger_url()) + + def __init__(self, path_entry): + pr = url_parse(str(path_entry)) + if pr.scheme != self.scheme or pr.path != self.suffix: + raise ImportError() + self.path_entry = path_entry + self._found = {} + + def checkpath_iter(self, fullname): + for dirpath in sys.path: + # if the value in sys.path_importer_cache is None, then this + # path *should* be imported by the builtin mechanism, and the + # entry is thus a path to a directory on the filesystem; + # if it's not None, then some other importer is in charge, and + # it probably isn't even a filesystem path + finder = sys.path_importer_cache.get(dirpath) + if isinstance(finder, (type(None), importlib.machinery.FileFinder)): + checkpath = os.path.join(dirpath, '{0}.{1}'.format(fullname, self.suffix)) + yield checkpath + + def find_module(self, fullname, path=None): + for checkpath in self.checkpath_iter(fullname): + if os.path.isfile(checkpath): + self._found[fullname] = checkpath + return self + return None + + def load_module(self, fullname): + assert fullname in self._found + if fullname in sys.modules: + module = sys.modules[fullname] + else: + sys.modules[fullname] = module = types.ModuleType(fullname) + data = None + with open(self._found[fullname]) as f: + data = f.read() + + module.__dict__.clear() + module.__file__ = self._found[fullname] + module.__name__ = fullname + module.__loader__ = self + self.process_filedata(module, data) + return module + + def process_filedata(self, module, data): + pass + + +class PystateImporter(SuffixImporter): + suffix = 'pystate' + + def process_filedata(self, module, data): + # MATT-NOTE: re-worked :func:`get_state_machine` + + # convert any statemachine 
expressions + stateMachineExpr = (stateMachine | namedStateMachine).ignore(pp.pythonStyleComment) + generated_code = stateMachineExpr.transformString(data) + + if DEBUG: print(generated_code) + + # compile code object from generated code + # (strip trailing spaces and tabs, compile doesn't like + # dangling whitespace) + COMPILE_MODE = 'exec' + + codeobj = compile(generated_code.rstrip(" \t"), + module.__file__, + COMPILE_MODE) + + exec(codeobj, module.__dict__) + + +PystateImporter.register() + +if DEBUG: + print("registered {0!r} importer".format(PystateImporter.suffix)) diff --git a/examples/statemachine/trafficLightDemo.py b/examples/statemachine/trafficLightDemo.py new file mode 100644 index 0000000..a8fac8c --- /dev/null +++ b/examples/statemachine/trafficLightDemo.py @@ -0,0 +1,26 @@ +# +# trafficLightDemo.py +# +# Example of a simple state machine modeling the state of a traffic light +# + +import statemachine +import trafficlightstate + + +class TrafficLight(trafficlightstate.TrafficLightStateMixin): + def __init__(self): + self.initialize_state(trafficlightstate.Red) + + def change(self): + self._state = self._state.next_state() + + +light = TrafficLight() +for i in range(10): + print("{0} {1}".format(light, ("STOP", "GO")[light.cars_can_go])) + light.crossing_signal() + light.delay() + print() + + light.change() diff --git a/examples/statemachine/trafficlightstate.pystate b/examples/statemachine/trafficlightstate.pystate new file mode 100644 index 0000000..8790189 --- /dev/null +++ b/examples/statemachine/trafficlightstate.pystate @@ -0,0 +1,47 @@ +# +# trafficlightstate.pystate +# +# state machine model of the states and associated behaviors and properties for each +# different state of a traffic light + + +# define state machine with transitions +# (states will be implemented as Python classes, so use name case appropriate for class names) +statemachine TrafficLightState: + Red -> Green + Green -> Yellow + Yellow -> Red + + +# statemachine only defines 
the state->state transitions - actual behavior and properties +# must be added separately + + +# define some class level constants +Red.cars_can_go = False +Yellow.cars_can_go = True +Green.cars_can_go = True + + +# setup some class level methods +def flash_crosswalk(s): + def flash(): + print("%s...%s...%s" % (s, s, s)) + + return flash + +Red.crossing_signal = staticmethod(flash_crosswalk("WALK")) +Yellow.crossing_signal = staticmethod(flash_crosswalk("DONT WALK")) +Green.crossing_signal = staticmethod(flash_crosswalk("DONT WALK")) + + +# setup some instance methods +def wait(nSeconds): + def waitFn(self): + print("" % nSeconds) + + return waitFn + +Red.delay = wait(20) +Yellow.delay = wait(3) +Green.delay = wait(15) diff --git a/examples/statemachine/vending_machine.py b/examples/statemachine/vending_machine.py new file mode 100644 index 0000000..f48d2f9 --- /dev/null +++ b/examples/statemachine/vending_machine.py @@ -0,0 +1,78 @@ +# +# vending_machine.py +# +# Example of using the statemachine parser without importing a .pystate module. +# +# A vending machine that dispenses candy and chips in a 4x4 grid, A1 thru D4. +# To dispense a product, you must press an alpha button, then a digit button. 
+# + +import statemachine + +# Vending machine buttons: +# A, B, C, D +# 1, 2, 3, 4 +# +vending_machine_state_description = """\ +statemachine VendingMachineState: + Idle-(press_alpha_button)->WaitingOnDigit + WaitingOnDigit-(press_alpha_button)->WaitingOnDigit + WaitingOnDigit-(press_digit_button)->DispenseProduct + DispenseProduct-(dispense)->Idle +""" + +# convert state machine text to state classes +generated = statemachine.namedStateMachine.transformString(vending_machine_state_description) +# print(generated) +# exec generated code to define state classes and state mixin +exec(generated) + +class VendingMachine(VendingMachineStateMixin): + def __init__(self): + self.initialize_state(Idle) + self._pressed = None + self._alpha_pressed = None + self._digit_pressed = None + + def press_button(self, button): + if button in "ABCD": + self._pressed = button + self.press_alpha_button() + elif button in "1234": + self._pressed = button + self.press_digit_button() + else: + print('Did not recognize button {!r}'.format(str(button))) + + def press_alpha_button(self): + try: + super(VendingMachine, self).press_alpha_button() + except VendingMachineState.InvalidTransitionException as ite: + print(ite) + else: + self._alpha_pressed = self._pressed + + def press_digit_button(self): + try: + super(VendingMachine, self).press_digit_button() + except VendingMachineState.InvalidTransitionException as ite: + print(ite) + else: + self._digit_pressed = self._pressed + self.dispense() + + def dispense(self): + try: + super(VendingMachine, self).dispense() + except VendingMachineState.InvalidTransitionException as ite: + print(ite) + else: + print("Dispensing at {}{}".format(self._alpha_pressed, self._digit_pressed)) + self._alpha_pressed = self._digit_pressed = None + + +vm = VendingMachine() +for button in "1 A B 1".split(): + print(">> pressing {!r}".format(button)) + vm.press_button(button) + print("Vending machine is now in {} state".format(vm.state)) diff --git 
a/examples/statemachine/video_demo.py b/examples/statemachine/video_demo.py new file mode 100644 index 0000000..fadfb9d --- /dev/null +++ b/examples/statemachine/video_demo.py @@ -0,0 +1,48 @@ +# +# video_demo.py +# +# Simple statemachine demo, based on the state transitions given in videostate.pystate +# + +import statemachine +import videostate + + +class Video(videostate.VideoStateMixin): + def __init__(self, title): + self.initialize_state(videostate.Stopped) + self.title = title + + +# ==== main loop - a REPL ==== + +v = Video("Die Hard.mp4") + +while True: + print(v.state) + cmd = input("Command ({})> ".format('/'.join(videostate.VideoState.transition_names))).lower().strip() + if not cmd: + continue + + if cmd in ('?', 'h', 'help'): + print('enter a transition {!r}'.format(videostate.VideoState.transition_names)) + print(' q - quit') + print(' ?, h, help - this message') + continue + + # quitting out + if cmd.startswith('q'): + break + + # get transition function for given command + state_transition_fn = getattr(v, cmd, None) + + if state_transition_fn is None: + print('???') + continue + + # invoke the input transition, handle invalid commands + try: + state_transition_fn() + except videostate.VideoState.InvalidTransitionException as e: + print(e) diff --git a/examples/statemachine/videostate.pystate b/examples/statemachine/videostate.pystate new file mode 100644 index 0000000..874001c --- /dev/null +++ b/examples/statemachine/videostate.pystate @@ -0,0 +1,32 @@ +# +# videostate.pystate +# +# Statemachine describing the playing of a video +# [] = stop +# > = play +# || = pause +# >> = fast forward +# << = rewind + +statemachine VideoState: + # basic >, [], and || controls + Stopped-(play)->Playing + Playing-(pause)-> Paused + Playing-(stop)-> Stopped + Paused-(stop)-> Stopped + Paused-(play)->Playing + + # add >> and << controls - different meanings if occur while playing or stopped + Playing-(fast_forward)->FastForward + FastForward-(play)->Playing + 
FastForward-(pause)->Paused + FastForward-(stop)->Stopped + Stopped-(fast_forward)->Forwardwinding + Forwardwinding-(stop)->Stopped + + Playing-(rewind)->ReversePlaying + ReversePlaying-(play)->Playing + ReversePlaying-(pause)->Paused + ReversePlaying-(stop)->Stopped + Stopped-(rewind)->Rewinding + Rewinding-(stop)->Stopped diff --git a/examples/wordsToNum.py b/examples/wordsToNum.py index d9511da..71538ba 100644 --- a/examples/wordsToNum.py +++ b/examples/wordsToNum.py @@ -1,110 +1,110 @@ -# wordsToNum.py -# Copyright 2006, Paul McGuire -# -# Sample parser grammar to read a number given in words, and return the numeric value. -# -import pyparsing as pp -from operator import mul -from functools import reduce - -def makeLit(s, val): - ret = pp.CaselessLiteral(s) - return ret.setParseAction(pp.replaceWith(val)) - -unitDefinitions = [ - ("zero", 0), - ("oh", 0), - ("zip", 0), - ("zilch", 0), - ("nada", 0), - ("bupkis", 0), - ("one", 1), - ("two", 2), - ("three", 3), - ("four", 4), - ("five", 5), - ("six", 6), - ("seven", 7), - ("eight", 8), - ("nine", 9), - ("ten", 10), - ("eleven", 11), - ("twelve", 12), - ("thirteen", 13), - ("fourteen", 14), - ("fifteen", 15), - ("sixteen", 16), - ("seventeen", 17), - ("eighteen", 18), - ("nineteen", 19), - ] -units = pp.MatchFirst(makeLit(s,v) for s,v in sorted(unitDefinitions, key=lambda d: -len(d[0]))) - -tensDefinitions = [ - ("ten", 10), - ("twenty", 20), - ("thirty", 30), - ("forty", 40), - ("fourty", 40), # for the spelling-challenged... 
- ("fifty", 50), - ("sixty", 60), - ("seventy", 70), - ("eighty", 80), - ("ninety", 90), - ] -tens = pp.MatchFirst(makeLit(s,v) for s,v in tensDefinitions) - -hundreds = makeLit("hundred", 100) - -majorDefinitions = [ - ("thousand", int(1e3)), - ("million", int(1e6)), - ("billion", int(1e9)), - ("trillion", int(1e12)), - ("quadrillion", int(1e15)), - ("quintillion", int(1e18)), - ] -mag = pp.MatchFirst(makeLit(s,v) for s,v in majorDefinitions) - -wordprod = lambda t: reduce(mul,t) -numPart = ((((units + pp.Optional(hundreds)).setParseAction(wordprod) - + pp.Optional(tens) - ).setParseAction(sum) - ^ tens) - + pp.Optional(units) - ).setParseAction(sum) -numWords = ((numPart + pp.Optional(mag)).setParseAction(wordprod)[...]).setParseAction(sum) -numWords.setName("num word parser") - -numWords.ignore(pp.Literal("-")) -numWords.ignore(pp.CaselessLiteral("and")) - -tests = """ - one hundred twenty hundred, None - one hundred and twennty, None - one hundred and twenty, 120 - one hundred and three, 103 - one hundred twenty-three, 123 - one hundred and twenty three, 123 - one hundred twenty three million, 123000000 - one hundred and twenty three million, 123000000 - one hundred twenty three million and three, 123000003 - fifteen hundred and sixty five, 1565 - seventy-seven thousand eight hundred and nineteen, 77819 - seven hundred seventy-seven thousand seven hundred and seventy-seven, 777777 - zero, 0 - forty two, 42 - fourty two, 42 -""" - -# use '| ...' to indicate "if omitted, skip to next" logic -test_expr = (numWords('result') | ...) + ',' + (pp.pyparsing_common.integer('expected') | 'None') - -def verify_result(t): - if '_skipped' in t: - t['pass'] = False - elif 'expected' in t: - t['pass'] = t.result == t.expected -test_expr.addParseAction(verify_result) - -test_expr.runTests(tests) +# wordsToNum.py +# Copyright 2006, Paul McGuire +# +# Sample parser grammar to read a number given in words, and return the numeric value. 
+# +import pyparsing as pp +from operator import mul +from functools import reduce + +def makeLit(s, val): + ret = pp.CaselessLiteral(s) + return ret.setParseAction(pp.replaceWith(val)) + +unitDefinitions = [ + ("zero", 0), + ("oh", 0), + ("zip", 0), + ("zilch", 0), + ("nada", 0), + ("bupkis", 0), + ("one", 1), + ("two", 2), + ("three", 3), + ("four", 4), + ("five", 5), + ("six", 6), + ("seven", 7), + ("eight", 8), + ("nine", 9), + ("ten", 10), + ("eleven", 11), + ("twelve", 12), + ("thirteen", 13), + ("fourteen", 14), + ("fifteen", 15), + ("sixteen", 16), + ("seventeen", 17), + ("eighteen", 18), + ("nineteen", 19), + ] +units = pp.MatchFirst(makeLit(s,v) for s,v in sorted(unitDefinitions, key=lambda d: -len(d[0]))) + +tensDefinitions = [ + ("ten", 10), + ("twenty", 20), + ("thirty", 30), + ("forty", 40), + ("fourty", 40), # for the spelling-challenged... + ("fifty", 50), + ("sixty", 60), + ("seventy", 70), + ("eighty", 80), + ("ninety", 90), + ] +tens = pp.MatchFirst(makeLit(s,v) for s,v in tensDefinitions) + +hundreds = makeLit("hundred", 100) + +majorDefinitions = [ + ("thousand", int(1e3)), + ("million", int(1e6)), + ("billion", int(1e9)), + ("trillion", int(1e12)), + ("quadrillion", int(1e15)), + ("quintillion", int(1e18)), + ] +mag = pp.MatchFirst(makeLit(s,v) for s,v in majorDefinitions) + +wordprod = lambda t: reduce(mul,t) +numPart = ((((units + pp.Optional(hundreds)).setParseAction(wordprod) + + pp.Optional(tens) + ).setParseAction(sum) + ^ tens) + + pp.Optional(units) + ).setParseAction(sum) +numWords = ((numPart + pp.Optional(mag)).setParseAction(wordprod)[1, ...]).setParseAction(sum) +numWords.setName("num word parser") + +numWords.ignore(pp.Literal("-")) +numWords.ignore(pp.CaselessLiteral("and")) + +tests = """ + one hundred twenty hundred, None + one hundred and twennty, None + one hundred and twenty, 120 + one hundred and three, 103 + one hundred twenty-three, 123 + one hundred and twenty three, 123 + one hundred twenty three million, 123000000 + 
one hundred and twenty three million, 123000000 + one hundred twenty three million and three, 123000003 + fifteen hundred and sixty five, 1565 + seventy-seven thousand eight hundred and nineteen, 77819 + seven hundred seventy-seven thousand seven hundred and seventy-seven, 777777 + zero, 0 + forty two, 42 + fourty two, 42 +""" + +# use '| ...' to indicate "if omitted, skip to next" logic +test_expr = (numWords('result') | ...) + ',' + (pp.pyparsing_common.integer('expected') | 'None') + +def verify_result(t): + if '_skipped' in t: + t['pass'] = False + elif 'expected' in t: + t['pass'] = t.result == t.expected +test_expr.addParseAction(verify_result) + +test_expr.runTests(tests) diff --git a/pyparsing.egg-info/PKG-INFO b/pyparsing.egg-info/PKG-INFO index 113698f..8bfd5eb 100644 --- a/pyparsing.egg-info/PKG-INFO +++ b/pyparsing.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 1.2 Name: pyparsing -Version: 2.4.1.1 +Version: 2.4.2 Summary: Python parsing module Home-page: https://github.com/pyparsing/pyparsing/ Author: Paul McGuire diff --git a/pyparsing.egg-info/SOURCES.txt b/pyparsing.egg-info/SOURCES.txt index b767e5a..44e029d 100644 --- a/pyparsing.egg-info/SOURCES.txt +++ b/pyparsing.egg-info/SOURCES.txt @@ -1,4 +1,6 @@ CHANGES +CODE_OF_CONDUCT.rst +CONTRIBUTING.md LICENSE MANIFEST.in README.rst @@ -61,6 +63,7 @@ examples/idlParse.py examples/include_preprocessor.py examples/indentedGrammarExample.py examples/invRegex.py +examples/javascript_grammar.g examples/jsonParser.py examples/linenoExample.py examples/list1.py @@ -109,6 +112,16 @@ examples/urlExtractorNew.py examples/verilogParse.py examples/withAttribute.py examples/wordsToNum.py +examples/statemachine/documentSignoffDemo.py +examples/statemachine/documentsignoffstate.pystate +examples/statemachine/libraryBookDemo.py +examples/statemachine/librarybookstate.pystate +examples/statemachine/statemachine.py +examples/statemachine/trafficLightDemo.py +examples/statemachine/trafficlightstate.pystate 
+examples/statemachine/vending_machine.py +examples/statemachine/video_demo.py +examples/statemachine/videostate.pystate pyparsing.egg-info/PKG-INFO pyparsing.egg-info/SOURCES.txt pyparsing.egg-info/dependency_links.txt diff --git a/pyparsing.py b/pyparsing.py index fb277fd..3854210 100644 --- a/pyparsing.py +++ b/pyparsing.py @@ -95,8 +95,8 @@ classes inherit from. Use the docstrings for examples of how to: namespace class """ -__version__ = "2.4.1.1" -__versionTime__ = "25 Jul 2019 01:03 UTC" +__version__ = "2.4.2" +__versionTime__ = "29 Jul 2019 02:58 UTC" __author__ = "Paul McGuire " import string @@ -165,24 +165,24 @@ __compat__.collect_all_And_tokens = True __diag__ = SimpleNamespace() __diag__.__doc__ = """ -Diagnostic configuration +Diagnostic configuration (all default to False) - warn_multiple_tokens_in_named_alternation - flag to enable warnings when a results name is defined on a MatchFirst or Or expression with one or more And subexpressions - (default=True) (only warns if __compat__.collect_all_And_tokens is False) + (only warns if __compat__.collect_all_And_tokens is False) - warn_ungrouped_named_tokens_in_collection - flag to enable warnings when a results name is defined on a containing expression with ungrouped subexpressions that also - have results names (default=True) + have results names - warn_name_set_on_empty_Forward - flag to enable warnings whan a Forward is defined - with a results name, but has no contents defined (default=False) + with a results name, but has no contents defined - warn_on_multiple_string_args_to_oneof - flag to enable warnings whan oneOf is - incorrectly called with multiple str arguments (default=True) + incorrectly called with multiple str arguments - enable_debug_on_named_expressions - flag to auto-enable debug on all subsequent - calls to ParserElement.setName() (default=False) + calls to ParserElement.setName() """ -__diag__.warn_multiple_tokens_in_named_alternation = True 
-__diag__.warn_ungrouped_named_tokens_in_collection = True +__diag__.warn_multiple_tokens_in_named_alternation = False +__diag__.warn_ungrouped_named_tokens_in_collection = False __diag__.warn_name_set_on_empty_Forward = False -__diag__.warn_on_multiple_string_args_to_oneof = True +__diag__.warn_on_multiple_string_args_to_oneof = False __diag__.enable_debug_on_named_expressions = False # ~ sys.stderr.write("testing pyparsing module, version %s, %s\n" % (__version__, __versionTime__)) @@ -2210,8 +2210,11 @@ class ParserElement(object): occurrences. If this behavior is desired, then write ``expr*(None, n) + ~expr`` """ - if other is Ellipsis or other == (Ellipsis,): - other = (1, None) + if other is Ellipsis: + other = (0, None) + elif isinstance(other, tuple) and other[:1] == (Ellipsis,): + other = ((0, ) + other[1:] + (None,))[:2] + if isinstance(other, int): minElements, optElements = other, 0 elif isinstance(other, tuple): @@ -2345,6 +2348,11 @@ class ParserElement(object): """ return NotAny(self) + def __iter__(self): + # must implement __iter__ to override legacy use of sequential access to __getitem__ to + # iterate over a sequence + raise TypeError('%r object is not iterable' % self.__class__.__name__) + def __getitem__(self, key): """ use ``[]`` indexing notation as a short form for expression repetition: @@ -2355,9 +2363,8 @@ class ParserElement(object): (read as "at least n instances of ``expr``") - ``expr[..., n]`` is equivalent to ``expr*(0, n)`` (read as "0 to n instances of ``expr``") - - ``expr[0, ...]`` is equivalent to ``ZeroOrMore(expr)`` + - ``expr[...]`` and ``expr[0, ...]`` are equivalent to ``ZeroOrMore(expr)`` - ``expr[1, ...]`` is equivalent to ``OneOrMore(expr)`` - - ``expr[...]`` is equivalent to ``OneOrMore(expr)`` ``None`` may be used in place of ``...``. 
Note that ``expr[..., n]`` and ``expr[m, n]``do not raise an exception @@ -2371,7 +2378,7 @@ class ParserElement(object): key = (key,) iter(key) except TypeError: - key = (key,) + key = (key, key) if len(key) > 2: warnings.warn("only 1 or 2 index arguments supported ({0}{1})".format(key[:5], @@ -3836,6 +3843,8 @@ class ParseExpression(ParserElement): if isinstance(exprs, basestring): self.exprs = [self._literalStringClass(exprs)] + elif isinstance(exprs, ParserElement): + self.exprs = [exprs] elif isinstance(exprs, Iterable): exprs = list(exprs) # if sequence of strings provided, wrap with Literal @@ -3989,15 +3998,17 @@ class And(ParseExpression): def streamline(self): # collapse any _PendingSkip's - if any(isinstance(e, ParseExpression) and isinstance(e.exprs[-1], _PendingSkip) for e in self.exprs[:-1]): - for i, e in enumerate(self.exprs[:-1]): - if e is None: - continue - if (isinstance(e, ParseExpression) - and isinstance(e.exprs[-1], _PendingSkip)): - e.exprs[-1] = e.exprs[-1] + self.exprs[i + 1] - self.exprs[i + 1] = None - self.exprs = [e for e in self.exprs if e is not None] + if self.exprs: + if any(isinstance(e, ParseExpression) and e.exprs and isinstance(e.exprs[-1], _PendingSkip) + for e in self.exprs[:-1]): + for i, e in enumerate(self.exprs[:-1]): + if e is None: + continue + if (isinstance(e, ParseExpression) + and e.exprs and isinstance(e.exprs[-1], _PendingSkip)): + e.exprs[-1] = e.exprs[-1] + self.exprs[i + 1] + self.exprs[i + 1] = None + self.exprs = [e for e in self.exprs if e is not None] super(And, self).streamline() self.mayReturnEmpty = all(e.mayReturnEmpty for e in self.exprs) @@ -4105,6 +4116,12 @@ class Or(ParseExpression): # might change whether or how much they match of the input. 
matches.sort(key=itemgetter(0), reverse=True) + if not doActions: + # no further conditions or parse actions to change the selection of + # alternative, so the first match will be the best match + best_expr = matches[0][1] + return best_expr._parse(instring, loc, doActions) + longest = -1, None for loc1, expr1 in matches: if loc1 <= longest[0]: diff --git a/unitTests.py b/unitTests.py index 7bfbe52..90e3344 100644 --- a/unitTests.py +++ b/unitTests.py @@ -1171,7 +1171,7 @@ class SkipToParserTests(ParseTestCase): # e = define_expr('"start" + (num_word | ...)("inner") + "end"') # test(e, "start 456 end", ['start', '456', 'end'], {'inner': '456'}) - e = define_expr('"start" + (alpha_word[0, ...] & num_word[0, ...] | ...) + "end"') + e = define_expr('"start" + (alpha_word[...] & num_word[...] | ...) + "end"') test(e, "start 456 red end", ['start', '456', 'red', 'end'], {}) test(e, "start red 456 end", ['start', 'red', '456', 'end'], {}) test(e, "start 456 red + end", ['start', '456', 'red', '+ ', 'end'], {'_skipped': ['+ ']}) @@ -1180,7 +1180,7 @@ class SkipToParserTests(ParseTestCase): test(e, "start end", ['start', 'end'], {}) test(e, "start 456 + end", ['start', '456', '+ ', 'end'], {'_skipped': ['+ ']}) - e = define_expr('"start" + (alpha_word[...] & num_word[...] | ...) + "end"') + e = define_expr('"start" + (alpha_word[1, ...] & num_word[1, ...] | ...) + "end"') test(e, "start 456 red end", ['start', '456', 'red', 'end'], {}) test(e, "start red 456 end", ['start', 'red', '456', 'end'], {}) test(e, "start 456 red + end", ['start', '456', 'red', '+ ', 'end'], {'_skipped': ['+ ']}) @@ -1197,6 +1197,53 @@ class SkipToParserTests(ParseTestCase): e = define_expr('Literal("start") + ... + "+" + ... 
+ "end"') test(e, "start red + 456 end", ['start', 'red ', '+', '456 ', 'end'], {'_skipped': ['red ', '456 ']}) +class EllipsisRepetionTest(ParseTestCase): + def runTest(self): + import pyparsing as pp + import re + + word = pp.Word(pp.alphas).setName("word") + num = pp.Word(pp.nums).setName("num") + + exprs = [ + word[...] + num, + word[0, ...] + num, + word[1, ...] + num, + word[2, ...] + num, + word[..., 3] + num, + word[2] + num, + ] + + expected_res = [ + r"([abcd]+ )*\d+", + r"([abcd]+ )*\d+", + r"([abcd]+ )+\d+", + r"([abcd]+ ){2,}\d+", + r"([abcd]+ ){0,3}\d+", + r"([abcd]+ ){2}\d+", + ] + + tests = [ + "aa bb cc dd 123", + "bb cc dd 123", + "cc dd 123", + "dd 123", + "123", + ] + + all_success = True + for expr, expected_re in zip(exprs, expected_res): + successful_tests = [t for t in tests if re.match(expected_re, t)] + failure_tests = [t for t in tests if not re.match(expected_re, t)] + success1, _ = expr.runTests(successful_tests) + success2, _ = expr.runTests(failure_tests, failureTests=True) + all_success = all_success and success1 and success2 + if not all_success: + print_("Failed expression:", expr) + break + + self.assertTrue(all_success, "failed getItem_ellipsis test") + class CustomQuotesTest(ParseTestCase): def runTest(self): @@ -4623,6 +4670,34 @@ class EnableDebugOnNamedExpressionsTest(ParseTestCase): "using enable_debug_on_named_expressions") +class UndesirableButCommonPracticesTest(ParseTestCase): + def runTest(self): + import pyparsing as pp + ppc = pp.pyparsing_common + + # While these are valid constructs, and they are not encouraged + # there is apparently a lot of code out there using these + # coding styles. + # + # Even though they are not encouraged, we shouldn't break them. 
+ + # Create an And using a list of expressions instead of using '+' operator + expr = pp.And([pp.Word('abc'), pp.Word('123')]) + expr.runTests(""" + aaa 333 + b 1 + ababab 32123 + """) + + # Passing a single expression to a ParseExpression, when it really wants a sequence + expr = pp.Or(pp.Or(ppc.integer)) + expr.runTests(""" + 123 + 456 + abc + """) + + class MiscellaneousParserTests(ParseTestCase): def runTest(self): -- 2.34.1